{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 6279, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004778687055731438, "grad_norm": 6.597299531195726, "learning_rate": 0.0, "loss": 0.9182, "step": 1 }, { "epoch": 0.0009557374111462876, "grad_norm": 6.798268123961526, "learning_rate": 6.369426751592358e-08, "loss": 0.9348, "step": 2 }, { "epoch": 0.0014336061167194312, "grad_norm": 6.547375642053667, "learning_rate": 1.2738853503184715e-07, "loss": 0.9002, "step": 3 }, { "epoch": 0.0019114748222925752, "grad_norm": 6.714041140675801, "learning_rate": 1.9108280254777072e-07, "loss": 0.9221, "step": 4 }, { "epoch": 0.002389343527865719, "grad_norm": 6.482638024201559, "learning_rate": 2.547770700636943e-07, "loss": 0.8989, "step": 5 }, { "epoch": 0.0028672122334388625, "grad_norm": 6.628677060072881, "learning_rate": 3.1847133757961787e-07, "loss": 0.9246, "step": 6 }, { "epoch": 0.0033450809390120064, "grad_norm": 6.54934804614698, "learning_rate": 3.8216560509554143e-07, "loss": 0.9118, "step": 7 }, { "epoch": 0.0038229496445851504, "grad_norm": 6.822862171641388, "learning_rate": 4.45859872611465e-07, "loss": 0.9453, "step": 8 }, { "epoch": 0.004300818350158294, "grad_norm": 6.436429750787215, "learning_rate": 5.095541401273886e-07, "loss": 0.8919, "step": 9 }, { "epoch": 0.004778687055731438, "grad_norm": 6.20546872502828, "learning_rate": 5.732484076433121e-07, "loss": 0.9081, "step": 10 }, { "epoch": 0.0052565557613045814, "grad_norm": 6.1990164932210305, "learning_rate": 6.369426751592357e-07, "loss": 0.9152, "step": 11 }, { "epoch": 0.005734424466877725, "grad_norm": 6.213474019663624, "learning_rate": 7.006369426751592e-07, "loss": 0.9127, "step": 12 }, { "epoch": 0.006212293172450869, "grad_norm": 4.774163980815126, "learning_rate": 7.643312101910829e-07, "loss": 0.8724, "step": 13 }, { "epoch": 0.006690161878024013, "grad_norm": 4.635856494922676, "learning_rate": 8.280254777070064e-07, "loss": 0.8794, "step": 14 }, { "epoch": 0.007168030583597156, "grad_norm": 4.4643552624491925, "learning_rate": 8.9171974522293e-07, "loss": 0.8574, "step": 15 }, { "epoch": 0.007645899289170301, "grad_norm": 4.746112409024538, "learning_rate": 9.554140127388537e-07, "loss": 0.9037, "step": 16 }, { "epoch": 0.008123767994743444, "grad_norm": 3.6663226655371237, "learning_rate": 1.0191082802547772e-06, "loss": 0.8572, "step": 17 }, { "epoch": 0.008601636700316589, "grad_norm": 3.161655587285928, "learning_rate": 1.0828025477707007e-06, "loss": 0.8504, "step": 18 }, { "epoch": 0.009079505405889731, "grad_norm": 3.023372031668388, "learning_rate": 1.1464968152866242e-06, "loss": 0.8404, "step": 19 }, { "epoch": 0.009557374111462876, "grad_norm": 2.8123940051382466, "learning_rate": 1.210191082802548e-06, "loss": 0.8233, "step": 20 }, { "epoch": 0.01003524281703602, "grad_norm": 2.3319495260663037, "learning_rate": 1.2738853503184715e-06, "loss": 0.8007, "step": 21 }, { "epoch": 0.010513111522609163, "grad_norm": 2.1046299310129926, "learning_rate": 1.337579617834395e-06, "loss": 0.7893, "step": 22 }, { "epoch": 0.010990980228182307, "grad_norm": 1.970451770951646, "learning_rate": 1.4012738853503185e-06, "loss": 0.8091, "step": 23 }, { "epoch": 0.01146884893375545, "grad_norm": 2.2852013132038262, "learning_rate": 1.4649681528662422e-06, "loss": 0.761, "step": 24 }, { "epoch": 0.011946717639328594, "grad_norm": 2.3125203117243527, "learning_rate": 1.5286624203821657e-06, "loss": 0.7576, "step": 25 }, { "epoch": 0.012424586344901739, "grad_norm": 2.485516736089154, "learning_rate": 1.5923566878980892e-06, "loss": 0.7298, "step": 26 }, { "epoch": 0.012902455050474881, "grad_norm": 2.328181944771006, "learning_rate": 1.6560509554140127e-06, "loss": 0.7241, "step": 27 }, { "epoch": 0.013380323756048026, "grad_norm": 2.1332096308422592, "learning_rate": 1.7197452229299363e-06, "loss": 0.7561, "step": 28 }, { "epoch": 0.01385819246162117, "grad_norm": 1.9124393579077286, "learning_rate": 1.78343949044586e-06, "loss": 0.7421, "step": 29 }, { "epoch": 0.014336061167194313, "grad_norm": 1.868955513961223, "learning_rate": 1.8471337579617835e-06, "loss": 0.6917, "step": 30 }, { "epoch": 0.014813929872767457, "grad_norm": 1.5698888685595818, "learning_rate": 1.9108280254777074e-06, "loss": 0.7297, "step": 31 }, { "epoch": 0.015291798578340602, "grad_norm": 1.2265156599136022, "learning_rate": 1.974522292993631e-06, "loss": 0.698, "step": 32 }, { "epoch": 0.015769667283913744, "grad_norm": 1.3091022856057957, "learning_rate": 2.0382165605095544e-06, "loss": 0.6542, "step": 33 }, { "epoch": 0.01624753598948689, "grad_norm": 1.45790978867409, "learning_rate": 2.101910828025478e-06, "loss": 0.6759, "step": 34 }, { "epoch": 0.016725404695060033, "grad_norm": 1.482461582005205, "learning_rate": 2.1656050955414015e-06, "loss": 0.6654, "step": 35 }, { "epoch": 0.017203273400633178, "grad_norm": 1.3245355619344652, "learning_rate": 2.229299363057325e-06, "loss": 0.6541, "step": 36 }, { "epoch": 0.01768114210620632, "grad_norm": 1.1472782080031219, "learning_rate": 2.2929936305732485e-06, "loss": 0.6515, "step": 37 }, { "epoch": 0.018159010811779463, "grad_norm": 1.0289175415890455, "learning_rate": 2.356687898089172e-06, "loss": 0.6299, "step": 38 }, { "epoch": 0.018636879517352607, "grad_norm": 0.9574361352811592, "learning_rate": 2.420382165605096e-06, "loss": 0.6258, "step": 39 }, { "epoch": 0.01911474822292575, "grad_norm": 0.9277275080285016, "learning_rate": 2.4840764331210194e-06, "loss": 0.6337, "step": 40 }, { "epoch": 0.019592616928498896, "grad_norm": 0.8945726217645557, "learning_rate": 2.547770700636943e-06, "loss": 0.6198, "step": 41 }, { "epoch": 0.02007048563407204, "grad_norm": 0.8774284132253958, "learning_rate": 2.6114649681528665e-06, "loss": 0.6102, "step": 42 }, { "epoch": 0.02054835433964518, "grad_norm": 0.9050997026733555, "learning_rate": 2.67515923566879e-06, "loss": 0.6221, "step": 43 }, { "epoch": 0.021026223045218326, "grad_norm": 0.8632323727229961, "learning_rate": 2.7388535031847135e-06, "loss": 0.622, "step": 44 }, { "epoch": 0.02150409175079147, "grad_norm": 0.813517779384421, "learning_rate": 2.802547770700637e-06, "loss": 0.6089, "step": 45 }, { "epoch": 0.021981960456364615, "grad_norm": 0.8186427508339532, "learning_rate": 2.8662420382165605e-06, "loss": 0.6016, "step": 46 }, { "epoch": 0.02245982916193776, "grad_norm": 0.8174387059211783, "learning_rate": 2.9299363057324844e-06, "loss": 0.5931, "step": 47 }, { "epoch": 0.0229376978675109, "grad_norm": 1.305617651951218, "learning_rate": 2.993630573248408e-06, "loss": 0.5692, "step": 48 }, { "epoch": 0.023415566573084044, "grad_norm": 0.8380237368547312, "learning_rate": 3.0573248407643314e-06, "loss": 0.5887, "step": 49 }, { "epoch": 0.02389343527865719, "grad_norm": 0.8200238530137682, "learning_rate": 3.121019108280255e-06, "loss": 0.5929, "step": 50 }, { "epoch": 0.024371303984230333, "grad_norm": 0.8403778738693355, "learning_rate": 3.1847133757961785e-06, "loss": 0.6014, "step": 51 }, { "epoch": 0.024849172689803477, "grad_norm": 0.9908747877734703, "learning_rate": 3.248407643312102e-06, "loss": 0.5735, "step": 52 }, { "epoch": 0.025327041395376622, "grad_norm": 0.8075484398827105, "learning_rate": 3.3121019108280255e-06, "loss": 0.5593, "step": 53 }, { "epoch": 0.025804910100949763, "grad_norm": 0.7626574710843891, "learning_rate": 3.375796178343949e-06, "loss": 0.5806, "step": 54 }, { "epoch": 0.026282778806522907, "grad_norm": 0.836758410598703, "learning_rate": 3.4394904458598725e-06, "loss": 0.5768, "step": 55 }, { "epoch": 0.02676064751209605, "grad_norm": 0.7496247956447741, "learning_rate": 3.5031847133757964e-06, "loss": 0.5815, "step": 56 }, { "epoch": 0.027238516217669196, "grad_norm": 0.7950751813645308, "learning_rate": 3.56687898089172e-06, "loss": 0.5893, "step": 57 }, { "epoch": 0.02771638492324234, "grad_norm": 0.7934523889506495, "learning_rate": 3.6305732484076435e-06, "loss": 0.5675, "step": 58 }, { "epoch": 0.02819425362881548, "grad_norm": 0.7469734117900833, "learning_rate": 3.694267515923567e-06, "loss": 0.5679, "step": 59 }, { "epoch": 0.028672122334388626, "grad_norm": 0.7390161251771473, "learning_rate": 3.757961783439491e-06, "loss": 0.5464, "step": 60 }, { "epoch": 0.02914999103996177, "grad_norm": 0.756427918659596, "learning_rate": 3.821656050955415e-06, "loss": 0.5422, "step": 61 }, { "epoch": 0.029627859745534914, "grad_norm": 0.7389039375217277, "learning_rate": 3.885350318471338e-06, "loss": 0.5551, "step": 62 }, { "epoch": 0.03010572845110806, "grad_norm": 0.7367851829841032, "learning_rate": 3.949044585987262e-06, "loss": 0.5449, "step": 63 }, { "epoch": 0.030583597156681203, "grad_norm": 0.7123685102640923, "learning_rate": 4.012738853503185e-06, "loss": 0.5458, "step": 64 }, { "epoch": 0.031061465862254344, "grad_norm": 0.7357352408857197, "learning_rate": 4.076433121019109e-06, "loss": 0.5341, "step": 65 }, { "epoch": 0.03153933456782749, "grad_norm": 0.7214843181688534, "learning_rate": 4.140127388535032e-06, "loss": 0.5459, "step": 66 }, { "epoch": 0.032017203273400636, "grad_norm": 0.7467241250442314, "learning_rate": 4.203821656050956e-06, "loss": 0.5396, "step": 67 }, { "epoch": 0.03249507197897378, "grad_norm": 0.7727746910699702, "learning_rate": 4.26751592356688e-06, "loss": 0.5472, "step": 68 }, { "epoch": 0.03297294068454692, "grad_norm": 0.7251863973183906, "learning_rate": 4.331210191082803e-06, "loss": 0.5597, "step": 69 }, { "epoch": 0.033450809390120066, "grad_norm": 0.7272882874909277, "learning_rate": 4.394904458598727e-06, "loss": 0.5556, "step": 70 }, { "epoch": 0.03392867809569321, "grad_norm": 0.8391711398530735, "learning_rate": 4.45859872611465e-06, "loss": 0.5496, "step": 71 }, { "epoch": 0.034406546801266355, "grad_norm": 0.9360566101078907, "learning_rate": 4.522292993630574e-06, "loss": 0.5432, "step": 72 }, { "epoch": 0.034884415506839496, "grad_norm": 0.7797140366402956, "learning_rate": 4.585987261146497e-06, "loss": 0.5395, "step": 73 }, { "epoch": 0.03536228421241264, "grad_norm": 0.707051554509993, "learning_rate": 4.649681528662421e-06, "loss": 0.5311, "step": 74 }, { "epoch": 0.035840152917985785, "grad_norm": 0.7834053957637365, "learning_rate": 4.713375796178344e-06, "loss": 0.5487, "step": 75 }, { "epoch": 0.036318021623558926, "grad_norm": 0.7935586931488725, "learning_rate": 4.777070063694268e-06, "loss": 0.5327, "step": 76 }, { "epoch": 0.036795890329132074, "grad_norm": 0.770263498050658, "learning_rate": 4.840764331210192e-06, "loss": 0.5146, "step": 77 }, { "epoch": 0.037273759034705214, "grad_norm": 0.9115020793198642, "learning_rate": 4.904458598726115e-06, "loss": 0.5438, "step": 78 }, { "epoch": 0.037751627740278355, "grad_norm": 0.8472805895573531, "learning_rate": 4.968152866242039e-06, "loss": 0.549, "step": 79 }, { "epoch": 0.0382294964458515, "grad_norm": 0.7542037051233978, "learning_rate": 5.031847133757962e-06, "loss": 0.5351, "step": 80 }, { "epoch": 0.038707365151424644, "grad_norm": 0.747781469023545, "learning_rate": 5.095541401273886e-06, "loss": 0.5084, "step": 81 }, { "epoch": 0.03918523385699779, "grad_norm": 0.7476744681713057, "learning_rate": 5.159235668789809e-06, "loss": 0.5245, "step": 82 }, { "epoch": 0.03966310256257093, "grad_norm": 0.7768862389201611, "learning_rate": 5.222929936305733e-06, "loss": 0.5271, "step": 83 }, { "epoch": 0.04014097126814408, "grad_norm": 0.7111007495704235, "learning_rate": 5.286624203821657e-06, "loss": 0.5266, "step": 84 }, { "epoch": 0.04061883997371722, "grad_norm": 0.732731140034443, "learning_rate": 5.35031847133758e-06, "loss": 0.5263, "step": 85 }, { "epoch": 0.04109670867929036, "grad_norm": 0.8652091302250842, "learning_rate": 5.414012738853504e-06, "loss": 0.5273, "step": 86 }, { "epoch": 0.04157457738486351, "grad_norm": 0.737707359165254, "learning_rate": 5.477707006369427e-06, "loss": 0.5061, "step": 87 }, { "epoch": 0.04205244609043665, "grad_norm": 0.8065815895775212, "learning_rate": 5.541401273885351e-06, "loss": 0.5098, "step": 88 }, { "epoch": 0.0425303147960098, "grad_norm": 0.7266312168372544, "learning_rate": 5.605095541401274e-06, "loss": 0.4999, "step": 89 }, { "epoch": 0.04300818350158294, "grad_norm": 0.7293962461070471, "learning_rate": 5.668789808917198e-06, "loss": 0.5213, "step": 90 }, { "epoch": 0.04348605220715608, "grad_norm": 0.7774805752991096, "learning_rate": 5.732484076433121e-06, "loss": 0.4995, "step": 91 }, { "epoch": 0.04396392091272923, "grad_norm": 0.7530825142450961, "learning_rate": 5.796178343949045e-06, "loss": 0.5365, "step": 92 }, { "epoch": 0.04444178961830237, "grad_norm": 0.6739843506427442, "learning_rate": 5.859872611464969e-06, "loss": 0.5146, "step": 93 }, { "epoch": 0.04491965832387552, "grad_norm": 0.7872225186802001, "learning_rate": 5.923566878980892e-06, "loss": 0.5171, "step": 94 }, { "epoch": 0.04539752702944866, "grad_norm": 0.7593936458726057, "learning_rate": 5.987261146496816e-06, "loss": 0.5075, "step": 95 }, { "epoch": 0.0458753957350218, "grad_norm": 0.7402231844428684, "learning_rate": 6.050955414012739e-06, "loss": 0.5219, "step": 96 }, { "epoch": 0.04635326444059495, "grad_norm": 0.8456741711717691, "learning_rate": 6.114649681528663e-06, "loss": 0.5179, "step": 97 }, { "epoch": 0.04683113314616809, "grad_norm": 0.7479830972685164, "learning_rate": 6.178343949044586e-06, "loss": 0.516, "step": 98 }, { "epoch": 0.047309001851741236, "grad_norm": 0.7472523172623668, "learning_rate": 6.24203821656051e-06, "loss": 0.498, "step": 99 }, { "epoch": 0.04778687055731438, "grad_norm": 0.8971442569968484, "learning_rate": 6.305732484076433e-06, "loss": 0.51, "step": 100 }, { "epoch": 0.04826473926288752, "grad_norm": 0.7564195148457464, "learning_rate": 6.369426751592357e-06, "loss": 0.5198, "step": 101 }, { "epoch": 0.048742607968460666, "grad_norm": 1.0620219552470769, "learning_rate": 6.433121019108281e-06, "loss": 0.5, "step": 102 }, { "epoch": 0.04922047667403381, "grad_norm": 0.7898571941764354, "learning_rate": 6.496815286624204e-06, "loss": 0.5154, "step": 103 }, { "epoch": 0.049698345379606955, "grad_norm": 0.7334402709211295, "learning_rate": 6.560509554140128e-06, "loss": 0.499, "step": 104 }, { "epoch": 0.050176214085180096, "grad_norm": 0.7398643150657089, "learning_rate": 6.624203821656051e-06, "loss": 0.506, "step": 105 }, { "epoch": 0.050654082790753244, "grad_norm": 0.7563474458560936, "learning_rate": 6.687898089171975e-06, "loss": 0.5153, "step": 106 }, { "epoch": 0.051131951496326385, "grad_norm": 0.7767148986192806, "learning_rate": 6.751592356687898e-06, "loss": 0.4994, "step": 107 }, { "epoch": 0.051609820201899526, "grad_norm": 0.7746503252727361, "learning_rate": 6.815286624203822e-06, "loss": 0.4994, "step": 108 }, { "epoch": 0.05208768890747267, "grad_norm": 0.8270879600000823, "learning_rate": 6.878980891719745e-06, "loss": 0.4912, "step": 109 }, { "epoch": 0.052565557613045814, "grad_norm": 0.7271341530708423, "learning_rate": 6.942675159235669e-06, "loss": 0.5038, "step": 110 }, { "epoch": 0.05304342631861896, "grad_norm": 0.7909333113850586, "learning_rate": 7.006369426751593e-06, "loss": 0.4965, "step": 111 }, { "epoch": 0.0535212950241921, "grad_norm": 1.1920708554598451, "learning_rate": 7.070063694267516e-06, "loss": 0.4835, "step": 112 }, { "epoch": 0.053999163729765244, "grad_norm": 1.1437207443713215, "learning_rate": 7.13375796178344e-06, "loss": 0.4871, "step": 113 }, { "epoch": 0.05447703243533839, "grad_norm": 0.7995131778249069, "learning_rate": 7.197452229299363e-06, "loss": 0.5125, "step": 114 }, { "epoch": 0.05495490114091153, "grad_norm": 0.7658107271492185, "learning_rate": 7.261146496815287e-06, "loss": 0.5043, "step": 115 }, { "epoch": 0.05543276984648468, "grad_norm": 0.922334411856167, "learning_rate": 7.32484076433121e-06, "loss": 0.4989, "step": 116 }, { "epoch": 0.05591063855205782, "grad_norm": 0.7836377405415712, "learning_rate": 7.388535031847134e-06, "loss": 0.4899, "step": 117 }, { "epoch": 0.05638850725763096, "grad_norm": 0.8764338235424465, "learning_rate": 7.452229299363057e-06, "loss": 0.5001, "step": 118 }, { "epoch": 0.05686637596320411, "grad_norm": 0.8139065283378535, "learning_rate": 7.515923566878982e-06, "loss": 0.5079, "step": 119 }, { "epoch": 0.05734424466877725, "grad_norm": 0.8340053526116425, "learning_rate": 7.579617834394906e-06, "loss": 0.5087, "step": 120 }, { "epoch": 0.0578221133743504, "grad_norm": 0.9688304321284679, "learning_rate": 7.64331210191083e-06, "loss": 0.4945, "step": 121 }, { "epoch": 0.05829998207992354, "grad_norm": 0.8501127062676569, "learning_rate": 7.707006369426753e-06, "loss": 0.4736, "step": 122 }, { "epoch": 0.05877785078549669, "grad_norm": 0.8146384000193851, "learning_rate": 7.770700636942676e-06, "loss": 0.4917, "step": 123 }, { "epoch": 0.05925571949106983, "grad_norm": 0.8905411656723172, "learning_rate": 7.8343949044586e-06, "loss": 0.4931, "step": 124 }, { "epoch": 0.05973358819664297, "grad_norm": 0.8610319305755783, "learning_rate": 7.898089171974524e-06, "loss": 0.5019, "step": 125 }, { "epoch": 0.06021145690221612, "grad_norm": 0.8674477387963704, "learning_rate": 7.961783439490447e-06, "loss": 0.4896, "step": 126 }, { "epoch": 0.06068932560778926, "grad_norm": 0.8577218357744752, "learning_rate": 8.02547770700637e-06, "loss": 0.4811, "step": 127 }, { "epoch": 0.06116719431336241, "grad_norm": 0.8448292497264104, "learning_rate": 8.089171974522295e-06, "loss": 0.504, "step": 128 }, { "epoch": 0.06164506301893555, "grad_norm": 0.937833097972482, "learning_rate": 8.152866242038218e-06, "loss": 0.4962, "step": 129 }, { "epoch": 0.06212293172450869, "grad_norm": 0.7840157899279157, "learning_rate": 8.21656050955414e-06, "loss": 0.4985, "step": 130 }, { "epoch": 0.06260080043008183, "grad_norm": 0.8889842008188884, "learning_rate": 8.280254777070064e-06, "loss": 0.4917, "step": 131 }, { "epoch": 0.06307866913565498, "grad_norm": 0.8118606083274249, "learning_rate": 8.343949044585989e-06, "loss": 0.5121, "step": 132 }, { "epoch": 0.06355653784122813, "grad_norm": 0.8160111895836885, "learning_rate": 8.407643312101912e-06, "loss": 0.4975, "step": 133 }, { "epoch": 0.06403440654680127, "grad_norm": 0.8257529923932002, "learning_rate": 8.471337579617835e-06, "loss": 0.4905, "step": 134 }, { "epoch": 0.06451227525237441, "grad_norm": 0.9316496549034691, "learning_rate": 8.53503184713376e-06, "loss": 0.4922, "step": 135 }, { "epoch": 0.06499014395794755, "grad_norm": 3.6885784250742777, "learning_rate": 8.598726114649683e-06, "loss": 0.5088, "step": 136 }, { "epoch": 0.0654680126635207, "grad_norm": 1.0431096223978598, "learning_rate": 8.662420382165606e-06, "loss": 0.4851, "step": 137 }, { "epoch": 0.06594588136909384, "grad_norm": 0.9301103008954675, "learning_rate": 8.726114649681529e-06, "loss": 0.5035, "step": 138 }, { "epoch": 0.06642375007466698, "grad_norm": 0.961097839927529, "learning_rate": 8.789808917197454e-06, "loss": 0.5035, "step": 139 }, { "epoch": 0.06690161878024013, "grad_norm": 0.8559608590796796, "learning_rate": 8.853503184713377e-06, "loss": 0.4786, "step": 140 }, { "epoch": 0.06737948748581327, "grad_norm": 0.8702110473814613, "learning_rate": 8.9171974522293e-06, "loss": 0.4769, "step": 141 }, { "epoch": 0.06785735619138641, "grad_norm": 0.89503054543084, "learning_rate": 8.980891719745225e-06, "loss": 0.4936, "step": 142 }, { "epoch": 0.06833522489695956, "grad_norm": 0.9405145862374199, "learning_rate": 9.044585987261148e-06, "loss": 0.4994, "step": 143 }, { "epoch": 0.06881309360253271, "grad_norm": 0.8359505048050953, "learning_rate": 9.10828025477707e-06, "loss": 0.4888, "step": 144 }, { "epoch": 0.06929096230810584, "grad_norm": 0.8335744765307723, "learning_rate": 9.171974522292994e-06, "loss": 0.486, "step": 145 }, { "epoch": 0.06976883101367899, "grad_norm": 0.9191064552692454, "learning_rate": 9.235668789808919e-06, "loss": 0.4936, "step": 146 }, { "epoch": 0.07024669971925214, "grad_norm": 0.8613976280696265, "learning_rate": 9.299363057324842e-06, "loss": 0.4892, "step": 147 }, { "epoch": 0.07072456842482527, "grad_norm": 0.9085851257000545, "learning_rate": 9.363057324840765e-06, "loss": 0.4959, "step": 148 }, { "epoch": 0.07120243713039842, "grad_norm": 0.8498504776265091, "learning_rate": 9.426751592356688e-06, "loss": 0.4799, "step": 149 }, { "epoch": 0.07168030583597157, "grad_norm": 0.981381083456667, "learning_rate": 9.490445859872613e-06, "loss": 0.4759, "step": 150 }, { "epoch": 0.07215817454154472, "grad_norm": 0.956769830775876, "learning_rate": 9.554140127388536e-06, "loss": 0.4898, "step": 151 }, { "epoch": 0.07263604324711785, "grad_norm": 1.0038255875882836, "learning_rate": 9.617834394904459e-06, "loss": 0.4775, "step": 152 }, { "epoch": 0.073113911952691, "grad_norm": 0.8713370885352425, "learning_rate": 9.681528662420384e-06, "loss": 0.4969, "step": 153 }, { "epoch": 0.07359178065826415, "grad_norm": 0.8141213363595519, "learning_rate": 9.745222929936307e-06, "loss": 0.4705, "step": 154 }, { "epoch": 0.07406964936383728, "grad_norm": 3.751367127935264, "learning_rate": 9.80891719745223e-06, "loss": 0.4864, "step": 155 }, { "epoch": 0.07454751806941043, "grad_norm": 1.1148603281454004, "learning_rate": 9.872611464968153e-06, "loss": 0.4822, "step": 156 }, { "epoch": 0.07502538677498358, "grad_norm": 0.8287232413246658, "learning_rate": 9.936305732484078e-06, "loss": 0.4642, "step": 157 }, { "epoch": 0.07550325548055671, "grad_norm": 0.8915041269681552, "learning_rate": 1e-05, "loss": 0.4768, "step": 158 }, { "epoch": 0.07598112418612986, "grad_norm": 0.873722784678586, "learning_rate": 1.0063694267515924e-05, "loss": 0.4808, "step": 159 }, { "epoch": 0.076458992891703, "grad_norm": 0.8676339202348703, "learning_rate": 1.0127388535031849e-05, "loss": 0.4916, "step": 160 }, { "epoch": 0.07693686159727615, "grad_norm": 0.9305589175210003, "learning_rate": 1.0191082802547772e-05, "loss": 0.4747, "step": 161 }, { "epoch": 0.07741473030284929, "grad_norm": 0.8519987127043529, "learning_rate": 1.0254777070063695e-05, "loss": 0.4756, "step": 162 }, { "epoch": 0.07789259900842244, "grad_norm": 0.8823350335352326, "learning_rate": 1.0318471337579618e-05, "loss": 0.4662, "step": 163 }, { "epoch": 0.07837046771399558, "grad_norm": 0.7500687237850792, "learning_rate": 1.0382165605095543e-05, "loss": 0.485, "step": 164 }, { "epoch": 0.07884833641956872, "grad_norm": 0.8517746565202567, "learning_rate": 1.0445859872611466e-05, "loss": 0.4863, "step": 165 }, { "epoch": 0.07932620512514187, "grad_norm": 1.2439834788261996, "learning_rate": 1.0509554140127389e-05, "loss": 0.4818, "step": 166 }, { "epoch": 0.07980407383071501, "grad_norm": 0.9673894115074996, "learning_rate": 1.0573248407643314e-05, "loss": 0.4916, "step": 167 }, { "epoch": 0.08028194253628816, "grad_norm": 0.8778733640888571, "learning_rate": 1.0636942675159237e-05, "loss": 0.4786, "step": 168 }, { "epoch": 0.0807598112418613, "grad_norm": 0.9193416256035223, "learning_rate": 1.070063694267516e-05, "loss": 0.4806, "step": 169 }, { "epoch": 0.08123767994743444, "grad_norm": 1.0391078840034496, "learning_rate": 1.0764331210191083e-05, "loss": 0.4601, "step": 170 }, { "epoch": 0.08171554865300759, "grad_norm": 0.8696302636427024, "learning_rate": 1.0828025477707008e-05, "loss": 0.4766, "step": 171 }, { "epoch": 0.08219341735858073, "grad_norm": 0.8538992156431188, "learning_rate": 1.089171974522293e-05, "loss": 0.4929, "step": 172 }, { "epoch": 0.08267128606415387, "grad_norm": 0.8784469546200593, "learning_rate": 1.0955414012738854e-05, "loss": 0.4827, "step": 173 }, { "epoch": 0.08314915476972702, "grad_norm": 0.7996342900390987, "learning_rate": 1.1019108280254777e-05, "loss": 0.4768, "step": 174 }, { "epoch": 0.08362702347530016, "grad_norm": 0.9283558874133578, "learning_rate": 1.1082802547770702e-05, "loss": 0.4654, "step": 175 }, { "epoch": 0.0841048921808733, "grad_norm": 1.1381555316283323, "learning_rate": 1.1146496815286625e-05, "loss": 0.4741, "step": 176 }, { "epoch": 0.08458276088644645, "grad_norm": 0.9965684448163404, "learning_rate": 1.1210191082802548e-05, "loss": 0.4836, "step": 177 }, { "epoch": 0.0850606295920196, "grad_norm": 0.8998507742131849, "learning_rate": 1.1273885350318473e-05, "loss": 0.4996, "step": 178 }, { "epoch": 0.08553849829759273, "grad_norm": 1.0467069720974493, "learning_rate": 1.1337579617834396e-05, "loss": 0.4824, "step": 179 }, { "epoch": 0.08601636700316588, "grad_norm": 0.865608733409604, "learning_rate": 1.1401273885350319e-05, "loss": 0.4701, "step": 180 }, { "epoch": 0.08649423570873903, "grad_norm": 0.8930941148308225, "learning_rate": 1.1464968152866242e-05, "loss": 0.4887, "step": 181 }, { "epoch": 0.08697210441431216, "grad_norm": 0.9632697993858005, "learning_rate": 1.1528662420382167e-05, "loss": 0.4592, "step": 182 }, { "epoch": 0.08744997311988531, "grad_norm": 0.8648798405759216, "learning_rate": 1.159235668789809e-05, "loss": 0.461, "step": 183 }, { "epoch": 0.08792784182545846, "grad_norm": 1.1724901516943187, "learning_rate": 1.1656050955414013e-05, "loss": 0.4701, "step": 184 }, { "epoch": 0.08840571053103159, "grad_norm": 1.428382716298115, "learning_rate": 1.1719745222929938e-05, "loss": 0.4774, "step": 185 }, { "epoch": 0.08888357923660474, "grad_norm": 0.9014718070520222, "learning_rate": 1.178343949044586e-05, "loss": 0.4825, "step": 186 }, { "epoch": 0.08936144794217789, "grad_norm": 0.9006881000139725, "learning_rate": 1.1847133757961784e-05, "loss": 0.4729, "step": 187 }, { "epoch": 0.08983931664775104, "grad_norm": 0.8927655043358175, "learning_rate": 1.1910828025477707e-05, "loss": 0.4725, "step": 188 }, { "epoch": 0.09031718535332417, "grad_norm": 0.8765220142542697, "learning_rate": 1.1974522292993632e-05, "loss": 0.4739, "step": 189 }, { "epoch": 0.09079505405889732, "grad_norm": 0.8335515837070985, "learning_rate": 1.2038216560509555e-05, "loss": 0.4721, "step": 190 }, { "epoch": 0.09127292276447047, "grad_norm": 0.8307106452230681, "learning_rate": 1.2101910828025478e-05, "loss": 0.4846, "step": 191 }, { "epoch": 0.0917507914700436, "grad_norm": 0.8681247868696225, "learning_rate": 1.2165605095541401e-05, "loss": 0.4728, "step": 192 }, { "epoch": 0.09222866017561675, "grad_norm": 0.8869954220873089, "learning_rate": 1.2229299363057326e-05, "loss": 0.4721, "step": 193 }, { "epoch": 0.0927065288811899, "grad_norm": 0.8726364899218871, "learning_rate": 1.2292993630573249e-05, "loss": 0.4676, "step": 194 }, { "epoch": 0.09318439758676304, "grad_norm": 0.9315747844747434, "learning_rate": 1.2356687898089172e-05, "loss": 0.4662, "step": 195 }, { "epoch": 0.09366226629233618, "grad_norm": 0.8894305822680878, "learning_rate": 1.2420382165605097e-05, "loss": 0.4593, "step": 196 }, { "epoch": 0.09414013499790932, "grad_norm": 0.8882716488428526, "learning_rate": 1.248407643312102e-05, "loss": 0.4805, "step": 197 }, { "epoch": 0.09461800370348247, "grad_norm": 0.9004982374479936, "learning_rate": 1.2547770700636943e-05, "loss": 0.481, "step": 198 }, { "epoch": 0.0950958724090556, "grad_norm": 0.8961471264918875, "learning_rate": 1.2611464968152866e-05, "loss": 0.4805, "step": 199 }, { "epoch": 0.09557374111462875, "grad_norm": 0.883909235141162, "learning_rate": 1.267515923566879e-05, "loss": 0.4617, "step": 200 }, { "epoch": 0.0960516098202019, "grad_norm": 0.8490947466546838, "learning_rate": 1.2738853503184714e-05, "loss": 0.4643, "step": 201 }, { "epoch": 0.09652947852577504, "grad_norm": 1.0723263514380847, "learning_rate": 1.2802547770700637e-05, "loss": 0.4792, "step": 202 }, { "epoch": 0.09700734723134818, "grad_norm": 0.8029199282504613, "learning_rate": 1.2866242038216562e-05, "loss": 0.4698, "step": 203 }, { "epoch": 0.09748521593692133, "grad_norm": 0.8834995304331348, "learning_rate": 1.2929936305732485e-05, "loss": 0.4699, "step": 204 }, { "epoch": 0.09796308464249448, "grad_norm": 0.8189162195122099, "learning_rate": 1.2993630573248408e-05, "loss": 0.4398, "step": 205 }, { "epoch": 0.09844095334806761, "grad_norm": 0.9229854446430572, "learning_rate": 1.3057324840764331e-05, "loss": 0.454, "step": 206 }, { "epoch": 0.09891882205364076, "grad_norm": 0.8530294598476205, "learning_rate": 1.3121019108280256e-05, "loss": 0.4644, "step": 207 }, { "epoch": 0.09939669075921391, "grad_norm": 1.0909086978949185, "learning_rate": 1.3184713375796179e-05, "loss": 0.4736, "step": 208 }, { "epoch": 0.09987455946478704, "grad_norm": 0.9272994665983898, "learning_rate": 1.3248407643312102e-05, "loss": 0.4515, "step": 209 }, { "epoch": 0.10035242817036019, "grad_norm": 1.1625605063974396, "learning_rate": 1.3312101910828025e-05, "loss": 0.4477, "step": 210 }, { "epoch": 0.10083029687593334, "grad_norm": 0.9352271125871605, "learning_rate": 1.337579617834395e-05, "loss": 0.4878, "step": 211 }, { "epoch": 0.10130816558150649, "grad_norm": 1.2595391048195885, "learning_rate": 1.3439490445859873e-05, "loss": 0.4694, "step": 212 }, { "epoch": 0.10178603428707962, "grad_norm": 0.8996354926203881, "learning_rate": 1.3503184713375796e-05, "loss": 0.4564, "step": 213 }, { "epoch": 0.10226390299265277, "grad_norm": 0.9358758715512773, "learning_rate": 1.356687898089172e-05, "loss": 0.4866, "step": 214 }, { "epoch": 0.10274177169822592, "grad_norm": 0.8988782843133788, "learning_rate": 1.3630573248407644e-05, "loss": 0.4657, "step": 215 }, { "epoch": 0.10321964040379905, "grad_norm": 0.9416537008791075, "learning_rate": 1.3694267515923567e-05, "loss": 0.4563, "step": 216 }, { "epoch": 0.1036975091093722, "grad_norm": 0.8472207027263581, "learning_rate": 1.375796178343949e-05, "loss": 0.4606, "step": 217 }, { "epoch": 0.10417537781494535, "grad_norm": 0.8585181561986107, "learning_rate": 1.3821656050955415e-05, "loss": 0.4825, "step": 218 }, { "epoch": 0.10465324652051848, "grad_norm": 1.015742170592993, "learning_rate": 1.3885350318471338e-05, "loss": 0.4576, "step": 219 }, { "epoch": 0.10513111522609163, "grad_norm": 0.8766329989197548, "learning_rate": 1.3949044585987261e-05, "loss": 0.4648, "step": 220 }, { "epoch": 0.10560898393166478, "grad_norm": 0.8917767684612441, "learning_rate": 1.4012738853503186e-05, "loss": 0.4713, "step": 221 }, { "epoch": 0.10608685263723792, "grad_norm": 0.8758511266694687, "learning_rate": 1.4076433121019109e-05, "loss": 0.4741, "step": 222 }, { "epoch": 0.10656472134281106, "grad_norm": 0.8916775950330903, "learning_rate": 1.4140127388535032e-05, "loss": 0.4544, "step": 223 }, { "epoch": 0.1070425900483842, "grad_norm": 1.4793552859878714, "learning_rate": 1.4203821656050955e-05, "loss": 0.4497, "step": 224 }, { "epoch": 0.10752045875395735, "grad_norm": 1.1872856996535994, "learning_rate": 1.426751592356688e-05, "loss": 0.4749, "step": 225 }, { "epoch": 0.10799832745953049, "grad_norm": 0.8700025220590151, "learning_rate": 1.4331210191082803e-05, "loss": 0.4751, "step": 226 }, { "epoch": 0.10847619616510364, "grad_norm": 0.9905036123666954, "learning_rate": 1.4394904458598726e-05, "loss": 0.4734, "step": 227 }, { "epoch": 0.10895406487067678, "grad_norm": 0.9544401508765005, "learning_rate": 1.4458598726114649e-05, "loss": 0.4662, "step": 228 }, { "epoch": 0.10943193357624993, "grad_norm": 0.8769851114803372, "learning_rate": 1.4522292993630574e-05, "loss": 0.4669, "step": 229 }, { "epoch": 0.10990980228182307, "grad_norm": 0.8733018885948226, "learning_rate": 1.4585987261146497e-05, "loss": 0.4696, "step": 230 }, { "epoch": 0.11038767098739621, "grad_norm": 0.9441368673354331, "learning_rate": 1.464968152866242e-05, "loss": 0.4546, "step": 231 }, { "epoch": 0.11086553969296936, "grad_norm": 0.789316515274095, "learning_rate": 1.4713375796178345e-05, "loss": 0.4927, "step": 232 }, { "epoch": 0.1113434083985425, "grad_norm": 0.8994006543929829, "learning_rate": 1.4777070063694268e-05, "loss": 0.4768, "step": 233 }, { "epoch": 0.11182127710411564, "grad_norm": 0.8060765297549664, "learning_rate": 1.4840764331210191e-05, "loss": 0.4663, "step": 234 }, { "epoch": 0.11229914580968879, "grad_norm": 0.8826274187881293, "learning_rate": 1.4904458598726114e-05, "loss": 0.4635, "step": 235 }, { "epoch": 0.11277701451526193, "grad_norm": 0.9233679523033971, "learning_rate": 1.4968152866242039e-05, "loss": 0.469, "step": 236 }, { "epoch": 0.11325488322083507, "grad_norm": 0.9109628801781714, "learning_rate": 1.5031847133757964e-05, "loss": 0.455, "step": 237 }, { "epoch": 0.11373275192640822, "grad_norm": 0.8186652665707146, "learning_rate": 1.5095541401273888e-05, "loss": 0.4581, "step": 238 }, { "epoch": 0.11421062063198137, "grad_norm": 0.9309662613156999, "learning_rate": 1.5159235668789811e-05, "loss": 0.4412, "step": 239 }, { "epoch": 0.1146884893375545, "grad_norm": 3.931134407255136, "learning_rate": 1.5222929936305735e-05, "loss": 0.4509, "step": 240 }, { "epoch": 0.11516635804312765, "grad_norm": 1.1827238868781336, "learning_rate": 1.528662420382166e-05, "loss": 0.4574, "step": 241 }, { "epoch": 0.1156442267487008, "grad_norm": 0.8016143281537751, "learning_rate": 1.5350318471337582e-05, "loss": 0.4493, "step": 242 }, { "epoch": 0.11612209545427393, "grad_norm": 0.968479721862784, "learning_rate": 1.5414012738853506e-05, "loss": 0.4698, "step": 243 }, { "epoch": 0.11659996415984708, "grad_norm": 2.2168571117681246, "learning_rate": 1.547770700636943e-05, "loss": 0.4564, "step": 244 }, { "epoch": 0.11707783286542023, "grad_norm": 0.9881765074402735, "learning_rate": 1.5541401273885352e-05, "loss": 0.4639, "step": 245 }, { "epoch": 0.11755570157099338, "grad_norm": 0.941404090505303, "learning_rate": 1.5605095541401275e-05, "loss": 0.4631, "step": 246 }, { "epoch": 0.11803357027656651, "grad_norm": 0.9698016606767922, "learning_rate": 1.56687898089172e-05, "loss": 0.444, "step": 247 }, { "epoch": 0.11851143898213966, "grad_norm": 0.915195330989377, "learning_rate": 1.5732484076433124e-05, "loss": 0.4581, "step": 248 }, { "epoch": 0.1189893076877128, "grad_norm": 0.8642148011456234, "learning_rate": 1.5796178343949047e-05, "loss": 0.4506, "step": 249 }, { "epoch": 0.11946717639328594, "grad_norm": 0.935450575737815, "learning_rate": 1.585987261146497e-05, "loss": 0.461, "step": 250 }, { "epoch": 0.11994504509885909, "grad_norm": 0.892113620702839, "learning_rate": 1.5923566878980894e-05, "loss": 0.4761, "step": 251 }, { "epoch": 0.12042291380443224, "grad_norm": 0.9066524856260676, "learning_rate": 1.5987261146496817e-05, "loss": 0.4625, "step": 252 }, { "epoch": 0.12090078251000537, "grad_norm": 1.1306712148067306, "learning_rate": 1.605095541401274e-05, "loss": 0.4496, "step": 253 }, { "epoch": 0.12137865121557852, "grad_norm": 1.0415733798385562, "learning_rate": 1.6114649681528666e-05, "loss": 0.448, "step": 254 }, { "epoch": 0.12185651992115167, "grad_norm": 0.8142111756751625, "learning_rate": 1.617834394904459e-05, "loss": 0.4602, "step": 255 }, { "epoch": 0.12233438862672481, "grad_norm": 0.9011444127512608, "learning_rate": 1.6242038216560512e-05, "loss": 0.4743, "step": 256 }, { "epoch": 0.12281225733229795, "grad_norm": 0.8423920386335364, "learning_rate": 1.6305732484076436e-05, "loss": 0.4584, "step": 257 }, { "epoch": 0.1232901260378711, "grad_norm": 0.8878027469639385, "learning_rate": 1.636942675159236e-05, "loss": 0.4586, "step": 258 }, { "epoch": 0.12376799474344424, "grad_norm": 0.9181872747662326, "learning_rate": 1.643312101910828e-05, "loss": 0.4559, "step": 259 }, { "epoch": 0.12424586344901738, "grad_norm": 0.8298428319861421, "learning_rate": 1.6496815286624205e-05, "loss": 0.4719, "step": 260 }, { "epoch": 0.12472373215459052, "grad_norm": 0.8689700122227426, "learning_rate": 1.6560509554140128e-05, "loss": 0.4837, "step": 261 }, { "epoch": 0.12520160086016366, "grad_norm": 0.8387758000094565, "learning_rate": 1.6624203821656054e-05, "loss": 0.4727, "step": 262 }, { "epoch": 0.1256794695657368, "grad_norm": 0.7571078182093177, "learning_rate": 1.6687898089171977e-05, "loss": 0.47, "step": 263 }, { "epoch": 0.12615733827130995, "grad_norm": 0.9152714319886641, "learning_rate": 1.67515923566879e-05, "loss": 0.4498, "step": 264 }, { "epoch": 0.1266352069768831, "grad_norm": 0.7676184859264714, "learning_rate": 1.6815286624203824e-05, "loss": 0.452, "step": 265 }, { "epoch": 0.12711307568245625, "grad_norm": 0.893887271149616, "learning_rate": 1.6878980891719747e-05, "loss": 0.4571, "step": 266 }, { "epoch": 0.1275909443880294, "grad_norm": 0.7664207431197599, "learning_rate": 1.694267515923567e-05, "loss": 0.4605, "step": 267 }, { "epoch": 0.12806881309360255, "grad_norm": 0.8462357846724339, "learning_rate": 1.7006369426751593e-05, "loss": 0.4627, "step": 268 }, { "epoch": 0.12854668179917567, "grad_norm": 0.7917183376178123, "learning_rate": 1.707006369426752e-05, "loss": 0.4587, "step": 269 }, { "epoch": 0.12902455050474881, "grad_norm": 0.8751109922497367, "learning_rate": 1.7133757961783442e-05, "loss": 0.4593, "step": 270 }, { "epoch": 0.12950241921032196, "grad_norm": 0.8869857964850081, "learning_rate": 1.7197452229299365e-05, "loss": 0.4455, "step": 271 }, { "epoch": 0.1299802879158951, "grad_norm": 0.8652417097770274, "learning_rate": 1.726114649681529e-05, "loss": 0.4532, "step": 272 }, { "epoch": 0.13045815662146826, "grad_norm": 0.8319161291537053, "learning_rate": 1.732484076433121e-05, "loss": 0.4453, "step": 273 }, { "epoch": 0.1309360253270414, "grad_norm": 0.816357574302165, "learning_rate": 1.7388535031847135e-05, "loss": 0.446, "step": 274 }, { "epoch": 0.13141389403261453, "grad_norm": 0.8885905160698827, "learning_rate": 1.7452229299363058e-05, "loss": 0.4695, "step": 275 }, { "epoch": 0.13189176273818767, "grad_norm": 1.0861739881800032, "learning_rate": 1.7515923566878984e-05, "loss": 0.4567, "step": 276 }, { "epoch": 0.13236963144376082, "grad_norm": 0.7809548635639034, "learning_rate": 1.7579617834394907e-05, "loss": 0.459, "step": 277 }, { "epoch": 0.13284750014933397, "grad_norm": 0.7635231358009456, "learning_rate": 1.764331210191083e-05, "loss": 0.4589, "step": 278 }, { "epoch": 0.13332536885490712, "grad_norm": 0.7560665612792802, "learning_rate": 1.7707006369426754e-05, "loss": 0.4528, "step": 279 }, { "epoch": 0.13380323756048026, "grad_norm": 0.8086543758881466, "learning_rate": 1.7770700636942677e-05, "loss": 0.472, "step": 280 }, { "epoch": 0.1342811062660534, "grad_norm": 0.7550004521071806, "learning_rate": 1.78343949044586e-05, "loss": 0.4551, "step": 281 }, { "epoch": 0.13475897497162653, "grad_norm": 0.7788235365516403, "learning_rate": 1.7898089171974523e-05, "loss": 0.4759, "step": 282 }, { "epoch": 0.13523684367719968, "grad_norm": 0.7296889721103695, "learning_rate": 1.796178343949045e-05, "loss": 0.4504, "step": 283 }, { "epoch": 0.13571471238277283, "grad_norm": 0.8658688096621077, "learning_rate": 1.8025477707006372e-05, "loss": 0.4645, "step": 284 }, { "epoch": 0.13619258108834598, "grad_norm": 0.7398631780173166, "learning_rate": 1.8089171974522295e-05, "loss": 0.4476, "step": 285 }, { "epoch": 0.13667044979391912, "grad_norm": 0.8865227853224859, "learning_rate": 1.815286624203822e-05, "loss": 0.465, "step": 286 }, { "epoch": 0.13714831849949227, "grad_norm": 0.8311750620796008, "learning_rate": 1.821656050955414e-05, "loss": 0.4652, "step": 287 }, { "epoch": 0.13762618720506542, "grad_norm": 0.8878939015819043, "learning_rate": 1.8280254777070065e-05, "loss": 0.4714, "step": 288 }, { "epoch": 0.13810405591063854, "grad_norm": 0.8154300177125134, "learning_rate": 1.8343949044585988e-05, "loss": 0.4458, "step": 289 }, { "epoch": 0.1385819246162117, "grad_norm": 0.8769838209673224, "learning_rate": 1.8407643312101914e-05, "loss": 0.4425, "step": 290 }, { "epoch": 0.13905979332178484, "grad_norm": 0.8218246714550715, "learning_rate": 1.8471337579617837e-05, "loss": 0.4555, "step": 291 }, { "epoch": 0.13953766202735798, "grad_norm": 0.9038739220509564, "learning_rate": 1.853503184713376e-05, "loss": 0.4542, "step": 292 }, { "epoch": 0.14001553073293113, "grad_norm": 0.9290101054130957, "learning_rate": 1.8598726114649684e-05, "loss": 0.4573, "step": 293 }, { "epoch": 0.14049339943850428, "grad_norm": 0.8231788424091275, "learning_rate": 1.8662420382165607e-05, "loss": 0.454, "step": 294 }, { "epoch": 0.14097126814407743, "grad_norm": 0.8068528909020721, "learning_rate": 1.872611464968153e-05, "loss": 0.4703, "step": 295 }, { "epoch": 0.14144913684965055, "grad_norm": 0.8841324441539453, "learning_rate": 1.8789808917197453e-05, "loss": 0.4576, "step": 296 }, { "epoch": 0.1419270055552237, "grad_norm": 0.8103277577643994, "learning_rate": 1.8853503184713376e-05, "loss": 0.4624, "step": 297 }, { "epoch": 0.14240487426079684, "grad_norm": 0.7407111914182409, "learning_rate": 1.8917197452229302e-05, "loss": 0.4539, "step": 298 }, { "epoch": 0.14288274296637, "grad_norm": 0.8494222480318973, "learning_rate": 1.8980891719745225e-05, "loss": 0.4633, "step": 299 }, { "epoch": 0.14336061167194314, "grad_norm": 0.760718892931761, "learning_rate": 1.904458598726115e-05, "loss": 0.4491, "step": 300 }, { "epoch": 0.1438384803775163, "grad_norm": 0.883575953116712, "learning_rate": 1.910828025477707e-05, "loss": 0.4579, "step": 301 }, { "epoch": 0.14431634908308943, "grad_norm": 0.8706151177176045, "learning_rate": 1.9171974522292995e-05, "loss": 0.4747, "step": 302 }, { "epoch": 0.14479421778866255, "grad_norm": 0.8387944329782351, "learning_rate": 1.9235668789808918e-05, "loss": 0.4484, "step": 303 }, { "epoch": 0.1452720864942357, "grad_norm": 1.1446786646678901, "learning_rate": 1.929936305732484e-05, "loss": 0.4694, "step": 304 }, { "epoch": 0.14574995519980885, "grad_norm": 0.895347496956798, "learning_rate": 1.9363057324840767e-05, "loss": 0.4488, "step": 305 }, { "epoch": 0.146227823905382, "grad_norm": 1.5869264897706326, "learning_rate": 1.942675159235669e-05, "loss": 0.4478, "step": 306 }, { "epoch": 0.14670569261095515, "grad_norm": 0.9876414045670644, "learning_rate": 1.9490445859872614e-05, "loss": 0.4593, "step": 307 }, { "epoch": 0.1471835613165283, "grad_norm": 0.8428176446808334, "learning_rate": 1.9554140127388537e-05, "loss": 0.4603, "step": 308 }, { "epoch": 0.14766143002210141, "grad_norm": 0.9091407347525033, "learning_rate": 1.961783439490446e-05, "loss": 0.4635, "step": 309 }, { "epoch": 0.14813929872767456, "grad_norm": 0.8541069012050329, "learning_rate": 1.9681528662420383e-05, "loss": 0.4556, "step": 310 }, { "epoch": 0.1486171674332477, "grad_norm": 0.8866008034684022, "learning_rate": 1.9745222929936306e-05, "loss": 0.4595, "step": 311 }, { "epoch": 0.14909503613882086, "grad_norm": 1.0041418570948004, "learning_rate": 1.9808917197452232e-05, "loss": 0.4575, "step": 312 }, { "epoch": 0.149572904844394, "grad_norm": 0.8608989524709532, "learning_rate": 1.9872611464968155e-05, "loss": 0.4592, "step": 313 }, { "epoch": 0.15005077354996715, "grad_norm": 0.9193782324674579, "learning_rate": 1.993630573248408e-05, "loss": 0.4702, "step": 314 }, { "epoch": 0.1505286422555403, "grad_norm": 0.8953098442701494, "learning_rate": 2e-05, "loss": 0.4372, "step": 315 }, { "epoch": 0.15100651096111342, "grad_norm": 0.9315345476094137, "learning_rate": 1.99999986130882e-05, "loss": 0.4561, "step": 316 }, { "epoch": 0.15148437966668657, "grad_norm": 0.8193614925815302, "learning_rate": 1.999999445235318e-05, "loss": 0.4419, "step": 317 }, { "epoch": 0.15196224837225972, "grad_norm": 0.8580436146437501, "learning_rate": 1.9999987517796095e-05, "loss": 0.4517, "step": 318 }, { "epoch": 0.15244011707783287, "grad_norm": 0.823969742063536, "learning_rate": 1.9999977809418872e-05, "loss": 0.465, "step": 319 }, { "epoch": 0.152917985783406, "grad_norm": 0.878097134652308, "learning_rate": 1.99999653272242e-05, "loss": 0.4515, "step": 320 }, { "epoch": 0.15339585448897916, "grad_norm": 0.8475324958361699, "learning_rate": 1.999995007121554e-05, "loss": 0.4491, "step": 321 }, { "epoch": 0.1538737231945523, "grad_norm": 0.8481861989745667, "learning_rate": 1.9999932041397128e-05, "loss": 0.4623, "step": 322 }, { "epoch": 0.15435159190012543, "grad_norm": 0.7962447154311731, "learning_rate": 1.9999911237773964e-05, "loss": 0.4525, "step": 323 }, { "epoch": 0.15482946060569858, "grad_norm": 0.7407282750796474, "learning_rate": 1.999988766035182e-05, "loss": 0.4465, "step": 324 }, { "epoch": 0.15530732931127172, "grad_norm": 0.8298386043292453, "learning_rate": 1.9999861309137232e-05, "loss": 0.4542, "step": 325 }, { "epoch": 0.15578519801684487, "grad_norm": 0.7485540031873075, "learning_rate": 1.999983218413751e-05, "loss": 0.4413, "step": 326 }, { "epoch": 0.15626306672241802, "grad_norm": 1.5089153257896015, "learning_rate": 1.9999800285360736e-05, "loss": 0.4631, "step": 327 }, { "epoch": 0.15674093542799117, "grad_norm": 0.8158254057514103, "learning_rate": 1.999976561281576e-05, "loss": 0.4548, "step": 328 }, { "epoch": 0.15721880413356432, "grad_norm": 0.8034746528062102, "learning_rate": 1.9999728166512187e-05, "loss": 0.4399, "step": 329 }, { "epoch": 0.15769667283913744, "grad_norm": 0.7519006657356639, "learning_rate": 1.999968794646042e-05, "loss": 0.4503, "step": 330 }, { "epoch": 0.15817454154471058, "grad_norm": 0.8336919761973133, "learning_rate": 1.9999644952671604e-05, "loss": 0.4574, "step": 331 }, { "epoch": 0.15865241025028373, "grad_norm": 0.7895923256720561, "learning_rate": 1.9999599185157673e-05, "loss": 0.445, "step": 332 }, { "epoch": 0.15913027895585688, "grad_norm": 0.8990492990399171, "learning_rate": 1.9999550643931313e-05, "loss": 0.4542, "step": 333 }, { "epoch": 0.15960814766143003, "grad_norm": 0.7540142922122045, "learning_rate": 1.9999499329005995e-05, "loss": 0.4701, "step": 334 }, { "epoch": 0.16008601636700318, "grad_norm": 0.8744480175459001, "learning_rate": 1.9999445240395953e-05, "loss": 0.4587, "step": 335 }, { "epoch": 0.16056388507257632, "grad_norm": 0.7760610520617973, "learning_rate": 1.9999388378116186e-05, "loss": 0.4431, "step": 336 }, { "epoch": 0.16104175377814944, "grad_norm": 0.9367580126034849, "learning_rate": 1.9999328742182472e-05, "loss": 0.4471, "step": 337 }, { "epoch": 0.1615196224837226, "grad_norm": 0.8695840825168545, "learning_rate": 1.999926633261135e-05, "loss": 0.4369, "step": 338 }, { "epoch": 0.16199749118929574, "grad_norm": 0.8085802780374626, "learning_rate": 1.9999201149420128e-05, "loss": 0.4412, "step": 339 }, { "epoch": 0.1624753598948689, "grad_norm": 0.9521212177197098, "learning_rate": 1.9999133192626893e-05, "loss": 0.434, "step": 340 }, { "epoch": 0.16295322860044203, "grad_norm": 0.821086659719905, "learning_rate": 1.999906246225049e-05, "loss": 0.4601, "step": 341 }, { "epoch": 0.16343109730601518, "grad_norm": 0.8520130760092065, "learning_rate": 1.9998988958310542e-05, "loss": 0.4604, "step": 342 }, { "epoch": 0.1639089660115883, "grad_norm": 0.90728532792054, "learning_rate": 1.9998912680827436e-05, "loss": 0.44, "step": 343 }, { "epoch": 0.16438683471716145, "grad_norm": 0.8144812444652371, "learning_rate": 1.9998833629822328e-05, "loss": 0.4398, "step": 344 }, { "epoch": 0.1648647034227346, "grad_norm": 0.9394455637761423, "learning_rate": 1.9998751805317152e-05, "loss": 0.4472, "step": 345 }, { "epoch": 0.16534257212830775, "grad_norm": 0.7942127753027973, "learning_rate": 1.9998667207334596e-05, "loss": 0.4518, "step": 346 }, { "epoch": 0.1658204408338809, "grad_norm": 0.8729986142728988, "learning_rate": 1.9998579835898135e-05, "loss": 0.4671, "step": 347 }, { "epoch": 0.16629830953945404, "grad_norm": 0.8038525219516176, "learning_rate": 1.9998489691031994e-05, "loss": 0.4475, "step": 348 }, { "epoch": 0.1667761782450272, "grad_norm": 0.7793008423025405, "learning_rate": 1.9998396772761186e-05, "loss": 0.4397, "step": 349 }, { "epoch": 0.1672540469506003, "grad_norm": 0.8347706044801682, "learning_rate": 1.999830108111148e-05, "loss": 0.4471, "step": 350 }, { "epoch": 0.16773191565617346, "grad_norm": 0.775692619220453, "learning_rate": 1.999820261610942e-05, "loss": 0.4539, "step": 351 }, { "epoch": 0.1682097843617466, "grad_norm": 0.823420887104832, "learning_rate": 1.9998101377782322e-05, "loss": 0.4544, "step": 352 }, { "epoch": 0.16868765306731975, "grad_norm": 0.81678869462245, "learning_rate": 1.9997997366158264e-05, "loss": 0.4365, "step": 353 }, { "epoch": 0.1691655217728929, "grad_norm": 0.7743470553025751, "learning_rate": 1.99978905812661e-05, "loss": 0.4554, "step": 354 }, { "epoch": 0.16964339047846605, "grad_norm": 0.7641953458649339, "learning_rate": 1.999778102313545e-05, "loss": 0.4578, "step": 355 }, { "epoch": 0.1701212591840392, "grad_norm": 1.0296472768958496, "learning_rate": 1.99976686917967e-05, "loss": 0.4389, "step": 356 }, { "epoch": 0.17059912788961232, "grad_norm": 0.7672656783937326, "learning_rate": 1.9997553587281012e-05, "loss": 0.4253, "step": 357 }, { "epoch": 0.17107699659518547, "grad_norm": 0.7273787482171606, "learning_rate": 1.999743570962031e-05, "loss": 0.4458, "step": 358 }, { "epoch": 0.1715548653007586, "grad_norm": 0.8352123781017939, "learning_rate": 1.9997315058847296e-05, "loss": 0.4449, "step": 359 }, { "epoch": 0.17203273400633176, "grad_norm": 0.8661207476943021, "learning_rate": 1.999719163499543e-05, "loss": 0.4416, "step": 360 }, { "epoch": 0.1725106027119049, "grad_norm": 0.814549911316121, "learning_rate": 1.999706543809896e-05, "loss": 0.4512, "step": 361 }, { "epoch": 0.17298847141747806, "grad_norm": 0.783161823300539, "learning_rate": 1.9996936468192874e-05, "loss": 0.4426, "step": 362 }, { "epoch": 0.1734663401230512, "grad_norm": 0.8535445094255457, "learning_rate": 1.9996804725312963e-05, "loss": 0.4492, "step": 363 }, { "epoch": 0.17394420882862432, "grad_norm": 0.8668749659680126, "learning_rate": 1.9996670209495757e-05, "loss": 0.4574, "step": 364 }, { "epoch": 0.17442207753419747, "grad_norm": 0.9384280918032981, "learning_rate": 1.999653292077857e-05, "loss": 0.4308, "step": 365 }, { "epoch": 0.17489994623977062, "grad_norm": 0.8148125307985487, "learning_rate": 1.999639285919949e-05, "loss": 0.4501, "step": 366 }, { "epoch": 0.17537781494534377, "grad_norm": 0.8327358415612538, "learning_rate": 1.9996250024797364e-05, "loss": 0.4318, "step": 367 }, { "epoch": 0.17585568365091692, "grad_norm": 0.8013196691254347, "learning_rate": 1.9996104417611815e-05, "loss": 0.4501, "step": 368 }, { "epoch": 0.17633355235649006, "grad_norm": 0.7247950871190025, "learning_rate": 1.9995956037683225e-05, "loss": 0.4371, "step": 369 }, { "epoch": 0.17681142106206318, "grad_norm": 0.8177374937697399, "learning_rate": 1.999580488505276e-05, "loss": 0.4561, "step": 370 }, { "epoch": 0.17728928976763633, "grad_norm": 0.7929393091666411, "learning_rate": 1.999565095976234e-05, "loss": 0.454, "step": 371 }, { "epoch": 0.17776715847320948, "grad_norm": 0.7843978974842676, "learning_rate": 1.999549426185466e-05, "loss": 0.4514, "step": 372 }, { "epoch": 0.17824502717878263, "grad_norm": 0.7290779188482008, "learning_rate": 1.9995334791373194e-05, "loss": 0.456, "step": 373 }, { "epoch": 0.17872289588435578, "grad_norm": 0.7580582343922024, "learning_rate": 1.9995172548362172e-05, "loss": 0.4519, "step": 374 }, { "epoch": 0.17920076458992892, "grad_norm": 0.8860454919596451, "learning_rate": 1.9995007532866594e-05, "loss": 0.4356, "step": 375 }, { "epoch": 0.17967863329550207, "grad_norm": 0.9228385090291951, "learning_rate": 1.9994839744932237e-05, "loss": 0.4411, "step": 376 }, { "epoch": 0.1801565020010752, "grad_norm": 0.7361256362322965, "learning_rate": 1.9994669184605642e-05, "loss": 0.4644, "step": 377 }, { "epoch": 0.18063437070664834, "grad_norm": 0.7285467924733723, "learning_rate": 1.9994495851934116e-05, "loss": 0.4504, "step": 378 }, { "epoch": 0.1811122394122215, "grad_norm": 0.6852384080647346, "learning_rate": 1.9994319746965743e-05, "loss": 0.4367, "step": 379 }, { "epoch": 0.18159010811779464, "grad_norm": 0.7016915322810489, "learning_rate": 1.9994140869749366e-05, "loss": 0.4423, "step": 380 }, { "epoch": 0.18206797682336778, "grad_norm": 0.7419915375299211, "learning_rate": 1.9993959220334608e-05, "loss": 0.4319, "step": 381 }, { "epoch": 0.18254584552894093, "grad_norm": 0.691366860088611, "learning_rate": 1.999377479877185e-05, "loss": 0.4396, "step": 382 }, { "epoch": 0.18302371423451408, "grad_norm": 0.8101619011356389, "learning_rate": 1.9993587605112252e-05, "loss": 0.437, "step": 383 }, { "epoch": 0.1835015829400872, "grad_norm": 0.7680909891808453, "learning_rate": 1.9993397639407736e-05, "loss": 0.4415, "step": 384 }, { "epoch": 0.18397945164566035, "grad_norm": 0.7391098105965442, "learning_rate": 1.9993204901710995e-05, "loss": 0.4519, "step": 385 }, { "epoch": 0.1844573203512335, "grad_norm": 0.7022079848845771, "learning_rate": 1.999300939207549e-05, "loss": 0.4378, "step": 386 }, { "epoch": 0.18493518905680664, "grad_norm": 0.8413020737595335, "learning_rate": 1.999281111055545e-05, "loss": 0.4259, "step": 387 }, { "epoch": 0.1854130577623798, "grad_norm": 0.675966589527395, "learning_rate": 1.9992610057205888e-05, "loss": 0.4401, "step": 388 }, { "epoch": 0.18589092646795294, "grad_norm": 0.7936827487639535, "learning_rate": 1.9992406232082557e-05, "loss": 0.4364, "step": 389 }, { "epoch": 0.1863687951735261, "grad_norm": 0.6543232226417673, "learning_rate": 1.9992199635241997e-05, "loss": 0.4505, "step": 390 }, { "epoch": 0.1868466638790992, "grad_norm": 0.8239352462676031, "learning_rate": 1.9991990266741524e-05, "loss": 0.4405, "step": 391 }, { "epoch": 0.18732453258467235, "grad_norm": 0.7573116961718065, "learning_rate": 1.9991778126639202e-05, "loss": 0.4219, "step": 392 }, { "epoch": 0.1878024012902455, "grad_norm": 0.7835516359179223, "learning_rate": 1.9991563214993885e-05, "loss": 0.4374, "step": 393 }, { "epoch": 0.18828026999581865, "grad_norm": 0.7604499069077229, "learning_rate": 1.9991345531865173e-05, "loss": 0.4589, "step": 394 }, { "epoch": 0.1887581387013918, "grad_norm": 0.76870199204744, "learning_rate": 1.999112507731346e-05, "loss": 0.4508, "step": 395 }, { "epoch": 0.18923600740696495, "grad_norm": 0.6786887636756039, "learning_rate": 1.999090185139989e-05, "loss": 0.4342, "step": 396 }, { "epoch": 0.1897138761125381, "grad_norm": 0.6999332563120491, "learning_rate": 1.9990675854186384e-05, "loss": 0.4407, "step": 397 }, { "epoch": 0.1901917448181112, "grad_norm": 0.8334575931899786, "learning_rate": 1.9990447085735624e-05, "loss": 0.4429, "step": 398 }, { "epoch": 0.19066961352368436, "grad_norm": 0.7685745229409217, "learning_rate": 1.9990215546111074e-05, "loss": 0.4398, "step": 399 }, { "epoch": 0.1911474822292575, "grad_norm": 0.7005717268603726, "learning_rate": 1.9989981235376956e-05, "loss": 0.4486, "step": 400 }, { "epoch": 0.19162535093483066, "grad_norm": 0.6603349007597211, "learning_rate": 1.9989744153598264e-05, "loss": 0.4581, "step": 401 }, { "epoch": 0.1921032196404038, "grad_norm": 0.7880156496415623, "learning_rate": 1.998950430084076e-05, "loss": 0.4381, "step": 402 }, { "epoch": 0.19258108834597695, "grad_norm": 0.7736124963496666, "learning_rate": 1.998926167717097e-05, "loss": 0.4202, "step": 403 }, { "epoch": 0.19305895705155007, "grad_norm": 0.7008884273159984, "learning_rate": 1.9989016282656207e-05, "loss": 0.4539, "step": 404 }, { "epoch": 0.19353682575712322, "grad_norm": 0.7575001466030076, "learning_rate": 1.9988768117364526e-05, "loss": 0.4367, "step": 405 }, { "epoch": 0.19401469446269637, "grad_norm": 0.7446911712881855, "learning_rate": 1.9988517181364767e-05, "loss": 0.4587, "step": 406 }, { "epoch": 0.19449256316826952, "grad_norm": 0.7076619424650085, "learning_rate": 1.9988263474726536e-05, "loss": 0.4322, "step": 407 }, { "epoch": 0.19497043187384266, "grad_norm": 0.7386357724353199, "learning_rate": 1.9988006997520208e-05, "loss": 0.446, "step": 408 }, { "epoch": 0.1954483005794158, "grad_norm": 0.7180682823947508, "learning_rate": 1.9987747749816923e-05, "loss": 0.4367, "step": 409 }, { "epoch": 0.19592616928498896, "grad_norm": 0.8350524472024324, "learning_rate": 1.9987485731688595e-05, "loss": 0.4691, "step": 410 }, { "epoch": 0.19640403799056208, "grad_norm": 0.7262068323613972, "learning_rate": 1.9987220943207903e-05, "loss": 0.4412, "step": 411 }, { "epoch": 0.19688190669613523, "grad_norm": 0.7033003276566966, "learning_rate": 1.998695338444829e-05, "loss": 0.4516, "step": 412 }, { "epoch": 0.19735977540170838, "grad_norm": 0.740747812864168, "learning_rate": 1.9986683055483975e-05, "loss": 0.4444, "step": 413 }, { "epoch": 0.19783764410728152, "grad_norm": 0.8106322648081262, "learning_rate": 1.9986409956389946e-05, "loss": 0.421, "step": 414 }, { "epoch": 0.19831551281285467, "grad_norm": 1.343586386680477, "learning_rate": 1.998613408724195e-05, "loss": 0.4443, "step": 415 }, { "epoch": 0.19879338151842782, "grad_norm": 0.8243575543930032, "learning_rate": 1.9985855448116507e-05, "loss": 0.4256, "step": 416 }, { "epoch": 0.19927125022400097, "grad_norm": 0.7616433208806905, "learning_rate": 1.9985574039090912e-05, "loss": 0.4333, "step": 417 }, { "epoch": 0.1997491189295741, "grad_norm": 0.8197988566665843, "learning_rate": 1.9985289860243222e-05, "loss": 0.4292, "step": 418 }, { "epoch": 0.20022698763514724, "grad_norm": 0.6780749889189807, "learning_rate": 1.9985002911652262e-05, "loss": 0.4454, "step": 419 }, { "epoch": 0.20070485634072038, "grad_norm": 0.8323686570996995, "learning_rate": 1.998471319339763e-05, "loss": 0.4357, "step": 420 }, { "epoch": 0.20118272504629353, "grad_norm": 0.7320142206171304, "learning_rate": 1.998442070555968e-05, "loss": 0.4145, "step": 421 }, { "epoch": 0.20166059375186668, "grad_norm": 0.782381732360279, "learning_rate": 1.9984125448219555e-05, "loss": 0.4424, "step": 422 }, { "epoch": 0.20213846245743983, "grad_norm": 0.748058238639162, "learning_rate": 1.998382742145914e-05, "loss": 0.4426, "step": 423 }, { "epoch": 0.20261633116301297, "grad_norm": 0.7799942137521412, "learning_rate": 1.9983526625361115e-05, "loss": 0.4427, "step": 424 }, { "epoch": 0.2030941998685861, "grad_norm": 0.703037954101251, "learning_rate": 1.9983223060008908e-05, "loss": 0.4298, "step": 425 }, { "epoch": 0.20357206857415924, "grad_norm": 0.7344901897075886, "learning_rate": 1.998291672548673e-05, "loss": 0.4369, "step": 426 }, { "epoch": 0.2040499372797324, "grad_norm": 0.743895441020455, "learning_rate": 1.9982607621879545e-05, "loss": 0.4235, "step": 427 }, { "epoch": 0.20452780598530554, "grad_norm": 0.6646898981475607, "learning_rate": 1.9982295749273093e-05, "loss": 0.4378, "step": 428 }, { "epoch": 0.2050056746908787, "grad_norm": 0.7093024591727115, "learning_rate": 1.998198110775389e-05, "loss": 0.428, "step": 429 }, { "epoch": 0.20548354339645183, "grad_norm": 0.7212954419357257, "learning_rate": 1.9981663697409203e-05, "loss": 0.4356, "step": 430 }, { "epoch": 0.20596141210202498, "grad_norm": 0.723396603925273, "learning_rate": 1.998134351832708e-05, "loss": 0.4309, "step": 431 }, { "epoch": 0.2064392808075981, "grad_norm": 0.7369743637876824, "learning_rate": 1.9981020570596334e-05, "loss": 0.446, "step": 432 }, { "epoch": 0.20691714951317125, "grad_norm": 0.7097781588794618, "learning_rate": 1.9980694854306545e-05, "loss": 0.4302, "step": 433 }, { "epoch": 0.2073950182187444, "grad_norm": 0.6844965007718748, "learning_rate": 1.998036636954806e-05, "loss": 0.4293, "step": 434 }, { "epoch": 0.20787288692431755, "grad_norm": 0.7056310983568724, "learning_rate": 1.998003511641199e-05, "loss": 0.4541, "step": 435 }, { "epoch": 0.2083507556298907, "grad_norm": 0.6540869976167216, "learning_rate": 1.9979701094990226e-05, "loss": 0.4187, "step": 436 }, { "epoch": 0.20882862433546384, "grad_norm": 0.7375068029416245, "learning_rate": 1.997936430537542e-05, "loss": 0.4377, "step": 437 }, { "epoch": 0.20930649304103696, "grad_norm": 0.6814790625160986, "learning_rate": 1.9979024747660985e-05, "loss": 0.4445, "step": 438 }, { "epoch": 0.2097843617466101, "grad_norm": 0.7103604757794988, "learning_rate": 1.9978682421941114e-05, "loss": 0.4313, "step": 439 }, { "epoch": 0.21026223045218326, "grad_norm": 0.7165788798454129, "learning_rate": 1.997833732831076e-05, "loss": 0.4419, "step": 440 }, { "epoch": 0.2107400991577564, "grad_norm": 0.7673501597057077, "learning_rate": 1.9977989466865645e-05, "loss": 0.435, "step": 441 }, { "epoch": 0.21121796786332955, "grad_norm": 0.6931708197147421, "learning_rate": 1.9977638837702263e-05, "loss": 0.4317, "step": 442 }, { "epoch": 0.2116958365689027, "grad_norm": 0.6687440502755292, "learning_rate": 1.997728544091787e-05, "loss": 0.4575, "step": 443 }, { "epoch": 0.21217370527447585, "grad_norm": 0.7234453514272886, "learning_rate": 1.997692927661049e-05, "loss": 0.4186, "step": 444 }, { "epoch": 0.21265157398004897, "grad_norm": 0.8989542148130306, "learning_rate": 1.9976570344878916e-05, "loss": 0.4277, "step": 445 }, { "epoch": 0.21312944268562212, "grad_norm": 0.6798823076977999, "learning_rate": 1.9976208645822716e-05, "loss": 0.4203, "step": 446 }, { "epoch": 0.21360731139119526, "grad_norm": 0.7297558856068362, "learning_rate": 1.9975844179542213e-05, "loss": 0.4361, "step": 447 }, { "epoch": 0.2140851800967684, "grad_norm": 0.7211293256955031, "learning_rate": 1.9975476946138506e-05, "loss": 0.4296, "step": 448 }, { "epoch": 0.21456304880234156, "grad_norm": 0.7882118710161115, "learning_rate": 1.997510694571346e-05, "loss": 0.4462, "step": 449 }, { "epoch": 0.2150409175079147, "grad_norm": 0.7179853617013857, "learning_rate": 1.9974734178369702e-05, "loss": 0.4296, "step": 450 }, { "epoch": 0.21551878621348786, "grad_norm": 0.7095782872691374, "learning_rate": 1.9974358644210635e-05, "loss": 0.4411, "step": 451 }, { "epoch": 0.21599665491906098, "grad_norm": 0.6772035963394542, "learning_rate": 1.997398034334043e-05, "loss": 0.4317, "step": 452 }, { "epoch": 0.21647452362463412, "grad_norm": 0.6743627379910052, "learning_rate": 1.997359927586401e-05, "loss": 0.4401, "step": 453 }, { "epoch": 0.21695239233020727, "grad_norm": 0.7837750007770807, "learning_rate": 1.9973215441887085e-05, "loss": 0.4448, "step": 454 }, { "epoch": 0.21743026103578042, "grad_norm": 0.7047243533444423, "learning_rate": 1.997282884151612e-05, "loss": 0.4473, "step": 455 }, { "epoch": 0.21790812974135357, "grad_norm": 1.4379264367718632, "learning_rate": 1.9972439474858348e-05, "loss": 0.454, "step": 456 }, { "epoch": 0.21838599844692672, "grad_norm": 0.8191228771916826, "learning_rate": 1.9972047342021784e-05, "loss": 0.4322, "step": 457 }, { "epoch": 0.21886386715249986, "grad_norm": 0.7133604179165115, "learning_rate": 1.9971652443115186e-05, "loss": 0.4153, "step": 458 }, { "epoch": 0.21934173585807298, "grad_norm": 0.6701729667469203, "learning_rate": 1.99712547782481e-05, "loss": 0.4286, "step": 459 }, { "epoch": 0.21981960456364613, "grad_norm": 0.7794564597547953, "learning_rate": 1.9970854347530828e-05, "loss": 0.4385, "step": 460 }, { "epoch": 0.22029747326921928, "grad_norm": 0.8024059316138707, "learning_rate": 1.9970451151074442e-05, "loss": 0.4374, "step": 461 }, { "epoch": 0.22077534197479243, "grad_norm": 0.6631388176963441, "learning_rate": 1.997004518899078e-05, "loss": 0.4368, "step": 462 }, { "epoch": 0.22125321068036558, "grad_norm": 0.7042713011158211, "learning_rate": 1.9969636461392454e-05, "loss": 0.4398, "step": 463 }, { "epoch": 0.22173107938593872, "grad_norm": 0.7010671119076932, "learning_rate": 1.9969224968392837e-05, "loss": 0.4541, "step": 464 }, { "epoch": 0.22220894809151187, "grad_norm": 0.7354792559659475, "learning_rate": 1.9968810710106065e-05, "loss": 0.4397, "step": 465 }, { "epoch": 0.222686816797085, "grad_norm": 0.7095812173991619, "learning_rate": 1.9968393686647046e-05, "loss": 0.4487, "step": 466 }, { "epoch": 0.22316468550265814, "grad_norm": 0.6677575026918666, "learning_rate": 1.9967973898131462e-05, "loss": 0.4468, "step": 467 }, { "epoch": 0.2236425542082313, "grad_norm": 0.7939463667123086, "learning_rate": 1.9967551344675752e-05, "loss": 0.4281, "step": 468 }, { "epoch": 0.22412042291380443, "grad_norm": 0.7618631702940217, "learning_rate": 1.996712602639712e-05, "loss": 0.4283, "step": 469 }, { "epoch": 0.22459829161937758, "grad_norm": 0.7333732414541805, "learning_rate": 1.9966697943413548e-05, "loss": 0.4305, "step": 470 }, { "epoch": 0.22507616032495073, "grad_norm": 0.7779594408406364, "learning_rate": 1.9966267095843776e-05, "loss": 0.4052, "step": 471 }, { "epoch": 0.22555402903052385, "grad_norm": 0.6757227788709603, "learning_rate": 1.996583348380731e-05, "loss": 0.4196, "step": 472 }, { "epoch": 0.226031897736097, "grad_norm": 0.7580164038914531, "learning_rate": 1.9965397107424434e-05, "loss": 0.404, "step": 473 }, { "epoch": 0.22650976644167015, "grad_norm": 0.7558024725157709, "learning_rate": 1.9964957966816184e-05, "loss": 0.4426, "step": 474 }, { "epoch": 0.2269876351472433, "grad_norm": 0.780681584126318, "learning_rate": 1.9964516062104377e-05, "loss": 0.4283, "step": 475 }, { "epoch": 0.22746550385281644, "grad_norm": 0.9899203517841794, "learning_rate": 1.996407139341158e-05, "loss": 0.4241, "step": 476 }, { "epoch": 0.2279433725583896, "grad_norm": 0.6521695714753446, "learning_rate": 1.9963623960861144e-05, "loss": 0.4198, "step": 477 }, { "epoch": 0.22842124126396274, "grad_norm": 0.725943514393016, "learning_rate": 1.9963173764577178e-05, "loss": 0.4447, "step": 478 }, { "epoch": 0.22889910996953586, "grad_norm": 0.6805293528269828, "learning_rate": 1.9962720804684555e-05, "loss": 0.4276, "step": 479 }, { "epoch": 0.229376978675109, "grad_norm": 0.7471831254807846, "learning_rate": 1.996226508130892e-05, "loss": 0.4209, "step": 480 }, { "epoch": 0.22985484738068215, "grad_norm": 0.6443809058619084, "learning_rate": 1.9961806594576684e-05, "loss": 0.4351, "step": 481 }, { "epoch": 0.2303327160862553, "grad_norm": 1.3415000224918492, "learning_rate": 1.996134534461502e-05, "loss": 0.459, "step": 482 }, { "epoch": 0.23081058479182845, "grad_norm": 0.7134420590449075, "learning_rate": 1.996088133155188e-05, "loss": 0.4387, "step": 483 }, { "epoch": 0.2312884534974016, "grad_norm": 0.7813274776781214, "learning_rate": 1.9960414555515958e-05, "loss": 0.4126, "step": 484 }, { "epoch": 0.23176632220297474, "grad_norm": 0.6898643605397596, "learning_rate": 1.995994501663674e-05, "loss": 0.4246, "step": 485 }, { "epoch": 0.23224419090854787, "grad_norm": 0.9121587741249505, "learning_rate": 1.9959472715044463e-05, "loss": 0.4244, "step": 486 }, { "epoch": 0.232722059614121, "grad_norm": 0.7147251675369962, "learning_rate": 1.9958997650870137e-05, "loss": 0.4224, "step": 487 }, { "epoch": 0.23319992831969416, "grad_norm": 0.7899586618920265, "learning_rate": 1.9958519824245536e-05, "loss": 0.4112, "step": 488 }, { "epoch": 0.2336777970252673, "grad_norm": 0.7904708831006497, "learning_rate": 1.99580392353032e-05, "loss": 0.4389, "step": 489 }, { "epoch": 0.23415566573084046, "grad_norm": 0.6741342579240754, "learning_rate": 1.995755588417644e-05, "loss": 0.4243, "step": 490 }, { "epoch": 0.2346335344364136, "grad_norm": 0.738629652421803, "learning_rate": 1.9957069770999324e-05, "loss": 0.4437, "step": 491 }, { "epoch": 0.23511140314198675, "grad_norm": 0.6924217432219697, "learning_rate": 1.9956580895906694e-05, "loss": 0.43, "step": 492 }, { "epoch": 0.23558927184755987, "grad_norm": 0.7369180221087179, "learning_rate": 1.9956089259034154e-05, "loss": 0.4339, "step": 493 }, { "epoch": 0.23606714055313302, "grad_norm": 0.805559465873891, "learning_rate": 1.995559486051808e-05, "loss": 0.4361, "step": 494 }, { "epoch": 0.23654500925870617, "grad_norm": 0.7823248018398075, "learning_rate": 1.99550977004956e-05, "loss": 0.4359, "step": 495 }, { "epoch": 0.23702287796427932, "grad_norm": 0.7554148112351713, "learning_rate": 1.9954597779104624e-05, "loss": 0.4242, "step": 496 }, { "epoch": 0.23750074666985246, "grad_norm": 0.7027609576658066, "learning_rate": 1.995409509648382e-05, "loss": 0.4309, "step": 497 }, { "epoch": 0.2379786153754256, "grad_norm": 0.705129151371866, "learning_rate": 1.9953589652772627e-05, "loss": 0.4464, "step": 498 }, { "epoch": 0.23845648408099873, "grad_norm": 0.7214815482253628, "learning_rate": 1.995308144811124e-05, "loss": 0.4427, "step": 499 }, { "epoch": 0.23893435278657188, "grad_norm": 0.7183066127583475, "learning_rate": 1.9952570482640628e-05, "loss": 0.4301, "step": 500 }, { "epoch": 0.23941222149214503, "grad_norm": 0.797831280147801, "learning_rate": 1.9952056756502525e-05, "loss": 0.4367, "step": 501 }, { "epoch": 0.23989009019771818, "grad_norm": 0.7379315395749698, "learning_rate": 1.9951540269839428e-05, "loss": 0.4399, "step": 502 }, { "epoch": 0.24036795890329132, "grad_norm": 0.7945392228374598, "learning_rate": 1.9951021022794602e-05, "loss": 0.4234, "step": 503 }, { "epoch": 0.24084582760886447, "grad_norm": 0.7264140524348444, "learning_rate": 1.995049901551208e-05, "loss": 0.4264, "step": 504 }, { "epoch": 0.24132369631443762, "grad_norm": 0.668632865034176, "learning_rate": 1.9949974248136655e-05, "loss": 0.4288, "step": 505 }, { "epoch": 0.24180156502001074, "grad_norm": 0.8309640004970509, "learning_rate": 1.9949446720813886e-05, "loss": 0.4438, "step": 506 }, { "epoch": 0.2422794337255839, "grad_norm": 0.6904435675318091, "learning_rate": 1.9948916433690103e-05, "loss": 0.4439, "step": 507 }, { "epoch": 0.24275730243115703, "grad_norm": 0.7354434100825048, "learning_rate": 1.99483833869124e-05, "loss": 0.4283, "step": 508 }, { "epoch": 0.24323517113673018, "grad_norm": 0.6765398629661438, "learning_rate": 1.9947847580628625e-05, "loss": 0.4387, "step": 509 }, { "epoch": 0.24371303984230333, "grad_norm": 0.7239562081327908, "learning_rate": 1.9947309014987414e-05, "loss": 0.4375, "step": 510 }, { "epoch": 0.24419090854787648, "grad_norm": 0.6978300565774074, "learning_rate": 1.9946767690138146e-05, "loss": 0.4356, "step": 511 }, { "epoch": 0.24466877725344963, "grad_norm": 0.7117480219425797, "learning_rate": 1.994622360623098e-05, "loss": 0.4403, "step": 512 }, { "epoch": 0.24514664595902275, "grad_norm": 0.7477545746890273, "learning_rate": 1.994567676341683e-05, "loss": 0.4245, "step": 513 }, { "epoch": 0.2456245146645959, "grad_norm": 0.7374013791361971, "learning_rate": 1.9945127161847393e-05, "loss": 0.4223, "step": 514 }, { "epoch": 0.24610238337016904, "grad_norm": 0.6642173278949521, "learning_rate": 1.9944574801675106e-05, "loss": 0.4341, "step": 515 }, { "epoch": 0.2465802520757422, "grad_norm": 0.7259774358658097, "learning_rate": 1.994401968305319e-05, "loss": 0.4252, "step": 516 }, { "epoch": 0.24705812078131534, "grad_norm": 0.7180794651515113, "learning_rate": 1.994346180613562e-05, "loss": 0.4348, "step": 517 }, { "epoch": 0.24753598948688849, "grad_norm": 0.6955289749178415, "learning_rate": 1.9942901171077146e-05, "loss": 0.4375, "step": 518 }, { "epoch": 0.24801385819246163, "grad_norm": 0.7245432909819279, "learning_rate": 1.994233777803328e-05, "loss": 0.4399, "step": 519 }, { "epoch": 0.24849172689803475, "grad_norm": 0.6067174282133342, "learning_rate": 1.9941771627160287e-05, "loss": 0.4217, "step": 520 }, { "epoch": 0.2489695956036079, "grad_norm": 0.703988857071276, "learning_rate": 1.994120271861522e-05, "loss": 0.4412, "step": 521 }, { "epoch": 0.24944746430918105, "grad_norm": 0.8991621677494505, "learning_rate": 1.9940631052555882e-05, "loss": 0.3995, "step": 522 }, { "epoch": 0.2499253330147542, "grad_norm": 0.7976604424556334, "learning_rate": 1.9940056629140835e-05, "loss": 0.4105, "step": 523 }, { "epoch": 0.2504032017203273, "grad_norm": 0.6019680965372577, "learning_rate": 1.9939479448529418e-05, "loss": 0.4192, "step": 524 }, { "epoch": 0.25088107042590047, "grad_norm": 0.704506878051367, "learning_rate": 1.9938899510881732e-05, "loss": 0.4196, "step": 525 }, { "epoch": 0.2513589391314736, "grad_norm": 0.6554375963262496, "learning_rate": 1.9938316816358644e-05, "loss": 0.4343, "step": 526 }, { "epoch": 0.25183680783704676, "grad_norm": 0.992457986922531, "learning_rate": 1.9937731365121777e-05, "loss": 0.4328, "step": 527 }, { "epoch": 0.2523146765426199, "grad_norm": 0.6658083513417471, "learning_rate": 1.9937143157333528e-05, "loss": 0.4298, "step": 528 }, { "epoch": 0.25279254524819306, "grad_norm": 0.7222172239047646, "learning_rate": 1.9936552193157055e-05, "loss": 0.428, "step": 529 }, { "epoch": 0.2532704139537662, "grad_norm": 0.6390063477285883, "learning_rate": 1.9935958472756283e-05, "loss": 0.4348, "step": 530 }, { "epoch": 0.25374828265933935, "grad_norm": 0.749202476640269, "learning_rate": 1.9935361996295896e-05, "loss": 0.4424, "step": 531 }, { "epoch": 0.2542261513649125, "grad_norm": 0.7192147324773318, "learning_rate": 1.993476276394135e-05, "loss": 0.447, "step": 532 }, { "epoch": 0.25470402007048565, "grad_norm": 0.7498159675525952, "learning_rate": 1.9934160775858856e-05, "loss": 0.416, "step": 533 }, { "epoch": 0.2551818887760588, "grad_norm": 0.7187313720282974, "learning_rate": 1.9933556032215402e-05, "loss": 0.4033, "step": 534 }, { "epoch": 0.25565975748163194, "grad_norm": 0.7075911635545219, "learning_rate": 1.993294853317873e-05, "loss": 0.4233, "step": 535 }, { "epoch": 0.2561376261872051, "grad_norm": 0.7116488897227271, "learning_rate": 1.9932338278917348e-05, "loss": 0.4469, "step": 536 }, { "epoch": 0.2566154948927782, "grad_norm": 0.6870926308030396, "learning_rate": 1.993172526960053e-05, "loss": 0.4429, "step": 537 }, { "epoch": 0.25709336359835133, "grad_norm": 0.7109693256013045, "learning_rate": 1.9931109505398316e-05, "loss": 0.4263, "step": 538 }, { "epoch": 0.2575712323039245, "grad_norm": 0.6545214091784688, "learning_rate": 1.9930490986481507e-05, "loss": 0.4346, "step": 539 }, { "epoch": 0.25804910100949763, "grad_norm": 0.8547757226945168, "learning_rate": 1.9929869713021668e-05, "loss": 0.4203, "step": 540 }, { "epoch": 0.2585269697150708, "grad_norm": 0.6813604542557925, "learning_rate": 1.9929245685191133e-05, "loss": 0.4267, "step": 541 }, { "epoch": 0.2590048384206439, "grad_norm": 0.6641936751299076, "learning_rate": 1.9928618903162992e-05, "loss": 0.4339, "step": 542 }, { "epoch": 0.25948270712621707, "grad_norm": 0.9793051618462918, "learning_rate": 1.9927989367111102e-05, "loss": 0.4389, "step": 543 }, { "epoch": 0.2599605758317902, "grad_norm": 0.5970607719650806, "learning_rate": 1.9927357077210093e-05, "loss": 0.4285, "step": 544 }, { "epoch": 0.26043844453736337, "grad_norm": 0.6677945714629177, "learning_rate": 1.9926722033635343e-05, "loss": 0.4185, "step": 545 }, { "epoch": 0.2609163132429365, "grad_norm": 0.7032868326166338, "learning_rate": 1.992608423656301e-05, "loss": 0.4248, "step": 546 }, { "epoch": 0.26139418194850966, "grad_norm": 0.675972367560421, "learning_rate": 1.9925443686169998e-05, "loss": 0.4358, "step": 547 }, { "epoch": 0.2618720506540828, "grad_norm": 0.6648018975438753, "learning_rate": 1.992480038263399e-05, "loss": 0.4243, "step": 548 }, { "epoch": 0.26234991935965596, "grad_norm": 0.7300070842008606, "learning_rate": 1.992415432613343e-05, "loss": 0.426, "step": 549 }, { "epoch": 0.26282778806522905, "grad_norm": 0.6966820393669727, "learning_rate": 1.9923505516847514e-05, "loss": 0.4462, "step": 550 }, { "epoch": 0.2633056567708022, "grad_norm": 0.6799521833338832, "learning_rate": 1.9922853954956217e-05, "loss": 0.4165, "step": 551 }, { "epoch": 0.26378352547637535, "grad_norm": 0.6842858880612487, "learning_rate": 1.9922199640640272e-05, "loss": 0.4215, "step": 552 }, { "epoch": 0.2642613941819485, "grad_norm": 0.684498742912379, "learning_rate": 1.9921542574081168e-05, "loss": 0.4376, "step": 553 }, { "epoch": 0.26473926288752164, "grad_norm": 0.7472555717042567, "learning_rate": 1.992088275546117e-05, "loss": 0.4281, "step": 554 }, { "epoch": 0.2652171315930948, "grad_norm": 0.6526006080700162, "learning_rate": 1.9920220184963296e-05, "loss": 0.4203, "step": 555 }, { "epoch": 0.26569500029866794, "grad_norm": 0.6864252975558336, "learning_rate": 1.991955486277133e-05, "loss": 0.4253, "step": 556 }, { "epoch": 0.2661728690042411, "grad_norm": 0.7010911608893313, "learning_rate": 1.9918886789069824e-05, "loss": 0.4268, "step": 557 }, { "epoch": 0.26665073770981423, "grad_norm": 0.6432530703558913, "learning_rate": 1.991821596404409e-05, "loss": 0.4095, "step": 558 }, { "epoch": 0.2671286064153874, "grad_norm": 0.6714812788316195, "learning_rate": 1.99175423878802e-05, "loss": 0.439, "step": 559 }, { "epoch": 0.26760647512096053, "grad_norm": 0.7313943296188351, "learning_rate": 1.9916866060764994e-05, "loss": 0.4207, "step": 560 }, { "epoch": 0.2680843438265337, "grad_norm": 0.6753023749397835, "learning_rate": 1.9916186982886074e-05, "loss": 0.4232, "step": 561 }, { "epoch": 0.2685622125321068, "grad_norm": 0.6812854395562149, "learning_rate": 1.9915505154431806e-05, "loss": 0.4312, "step": 562 }, { "epoch": 0.26904008123768, "grad_norm": 0.7326468942961418, "learning_rate": 1.9914820575591315e-05, "loss": 0.4106, "step": 563 }, { "epoch": 0.26951794994325307, "grad_norm": 0.710879982047124, "learning_rate": 1.9914133246554486e-05, "loss": 0.4312, "step": 564 }, { "epoch": 0.2699958186488262, "grad_norm": 0.6098533019354313, "learning_rate": 1.991344316751198e-05, "loss": 0.4269, "step": 565 }, { "epoch": 0.27047368735439936, "grad_norm": 0.6527319247618629, "learning_rate": 1.9912750338655207e-05, "loss": 0.4253, "step": 566 }, { "epoch": 0.2709515560599725, "grad_norm": 0.6554170109236619, "learning_rate": 1.9912054760176352e-05, "loss": 0.4175, "step": 567 }, { "epoch": 0.27142942476554566, "grad_norm": 0.6487793205519652, "learning_rate": 1.991135643226835e-05, "loss": 0.4306, "step": 568 }, { "epoch": 0.2719072934711188, "grad_norm": 0.6830594174977985, "learning_rate": 1.9910655355124905e-05, "loss": 0.4062, "step": 569 }, { "epoch": 0.27238516217669195, "grad_norm": 0.7070510514831456, "learning_rate": 1.9909951528940485e-05, "loss": 0.4244, "step": 570 }, { "epoch": 0.2728630308822651, "grad_norm": 0.6919752747886392, "learning_rate": 1.9909244953910324e-05, "loss": 0.4374, "step": 571 }, { "epoch": 0.27334089958783825, "grad_norm": 0.627083497227152, "learning_rate": 1.990853563023041e-05, "loss": 0.4293, "step": 572 }, { "epoch": 0.2738187682934114, "grad_norm": 0.6811946382742817, "learning_rate": 1.990782355809749e-05, "loss": 0.4314, "step": 573 }, { "epoch": 0.27429663699898454, "grad_norm": 0.8011136341064639, "learning_rate": 1.9907108737709088e-05, "loss": 0.4313, "step": 574 }, { "epoch": 0.2747745057045577, "grad_norm": 0.7022098540651683, "learning_rate": 1.990639116926348e-05, "loss": 0.4102, "step": 575 }, { "epoch": 0.27525237441013084, "grad_norm": 0.7139794448818395, "learning_rate": 1.9905670852959707e-05, "loss": 0.4131, "step": 576 }, { "epoch": 0.275730243115704, "grad_norm": 0.6566170557292315, "learning_rate": 1.9904947788997572e-05, "loss": 0.4492, "step": 577 }, { "epoch": 0.2762081118212771, "grad_norm": 0.6960200627247798, "learning_rate": 1.9904221977577644e-05, "loss": 0.4236, "step": 578 }, { "epoch": 0.27668598052685023, "grad_norm": 0.7038180733792689, "learning_rate": 1.9903493418901246e-05, "loss": 0.4156, "step": 579 }, { "epoch": 0.2771638492324234, "grad_norm": 0.6995395704949278, "learning_rate": 1.9902762113170467e-05, "loss": 0.427, "step": 580 }, { "epoch": 0.2776417179379965, "grad_norm": 0.7141523375442532, "learning_rate": 1.990202806058816e-05, "loss": 0.4075, "step": 581 }, { "epoch": 0.27811958664356967, "grad_norm": 0.6927565201255096, "learning_rate": 1.990129126135794e-05, "loss": 0.4238, "step": 582 }, { "epoch": 0.2785974553491428, "grad_norm": 0.6417220798020897, "learning_rate": 1.9900551715684175e-05, "loss": 0.4144, "step": 583 }, { "epoch": 0.27907532405471597, "grad_norm": 0.6392690865600442, "learning_rate": 1.989980942377201e-05, "loss": 0.415, "step": 584 }, { "epoch": 0.2795531927602891, "grad_norm": 0.639620744803394, "learning_rate": 1.989906438582734e-05, "loss": 0.4251, "step": 585 }, { "epoch": 0.28003106146586226, "grad_norm": 0.6707471902399823, "learning_rate": 1.9898316602056825e-05, "loss": 0.4225, "step": 586 }, { "epoch": 0.2805089301714354, "grad_norm": 0.6292145878798392, "learning_rate": 1.989756607266789e-05, "loss": 0.4281, "step": 587 }, { "epoch": 0.28098679887700856, "grad_norm": 0.6650049148813777, "learning_rate": 1.9896812797868714e-05, "loss": 0.4273, "step": 588 }, { "epoch": 0.2814646675825817, "grad_norm": 0.6989760949516968, "learning_rate": 1.9896056777868245e-05, "loss": 0.4133, "step": 589 }, { "epoch": 0.28194253628815485, "grad_norm": 0.6273420717769904, "learning_rate": 1.9895298012876192e-05, "loss": 0.4117, "step": 590 }, { "epoch": 0.28242040499372795, "grad_norm": 0.6513446444776289, "learning_rate": 1.9894536503103018e-05, "loss": 0.4243, "step": 591 }, { "epoch": 0.2828982736993011, "grad_norm": 0.7257067616669561, "learning_rate": 1.9893772248759956e-05, "loss": 0.4377, "step": 592 }, { "epoch": 0.28337614240487424, "grad_norm": 0.6449292794210849, "learning_rate": 1.9893005250058994e-05, "loss": 0.4262, "step": 593 }, { "epoch": 0.2838540111104474, "grad_norm": 0.7101669933858333, "learning_rate": 1.9892235507212885e-05, "loss": 0.4166, "step": 594 }, { "epoch": 0.28433187981602054, "grad_norm": 0.9532443107189144, "learning_rate": 1.9891463020435144e-05, "loss": 0.412, "step": 595 }, { "epoch": 0.2848097485215937, "grad_norm": 0.6763594227901946, "learning_rate": 1.9890687789940044e-05, "loss": 0.4305, "step": 596 }, { "epoch": 0.28528761722716683, "grad_norm": 0.666645047214093, "learning_rate": 1.9889909815942615e-05, "loss": 0.4262, "step": 597 }, { "epoch": 0.28576548593274, "grad_norm": 0.7187479088271995, "learning_rate": 1.9889129098658662e-05, "loss": 0.4127, "step": 598 }, { "epoch": 0.28624335463831313, "grad_norm": 0.7191161821405772, "learning_rate": 1.9888345638304737e-05, "loss": 0.4345, "step": 599 }, { "epoch": 0.2867212233438863, "grad_norm": 0.706994011714395, "learning_rate": 1.9887559435098162e-05, "loss": 0.41, "step": 600 }, { "epoch": 0.2871990920494594, "grad_norm": 0.7497992817297052, "learning_rate": 1.988677048925701e-05, "loss": 0.4285, "step": 601 }, { "epoch": 0.2876769607550326, "grad_norm": 0.652784327493937, "learning_rate": 1.9885978801000124e-05, "loss": 0.4256, "step": 602 }, { "epoch": 0.2881548294606057, "grad_norm": 0.776621607360754, "learning_rate": 1.988518437054711e-05, "loss": 0.4322, "step": 603 }, { "epoch": 0.28863269816617887, "grad_norm": 0.5839322380726024, "learning_rate": 1.9884387198118316e-05, "loss": 0.4167, "step": 604 }, { "epoch": 0.28911056687175196, "grad_norm": 0.7907030900607616, "learning_rate": 1.9883587283934875e-05, "loss": 0.406, "step": 605 }, { "epoch": 0.2895884355773251, "grad_norm": 1.0014227126496111, "learning_rate": 1.988278462821866e-05, "loss": 0.4151, "step": 606 }, { "epoch": 0.29006630428289826, "grad_norm": 0.66994522088662, "learning_rate": 1.9881979231192323e-05, "loss": 0.4392, "step": 607 }, { "epoch": 0.2905441729884714, "grad_norm": 0.5824214225034886, "learning_rate": 1.9881171093079264e-05, "loss": 0.4267, "step": 608 }, { "epoch": 0.29102204169404455, "grad_norm": 0.6495953436821396, "learning_rate": 1.988036021410364e-05, "loss": 0.4334, "step": 609 }, { "epoch": 0.2914999103996177, "grad_norm": 0.6878938109775274, "learning_rate": 1.9879546594490383e-05, "loss": 0.41, "step": 610 }, { "epoch": 0.29197777910519085, "grad_norm": 0.7296069094771699, "learning_rate": 1.987873023446517e-05, "loss": 0.4134, "step": 611 }, { "epoch": 0.292455647810764, "grad_norm": 0.7455591671402041, "learning_rate": 1.987791113425445e-05, "loss": 0.424, "step": 612 }, { "epoch": 0.29293351651633714, "grad_norm": 0.6695483498248275, "learning_rate": 1.9877089294085424e-05, "loss": 0.425, "step": 613 }, { "epoch": 0.2934113852219103, "grad_norm": 0.6796642781672113, "learning_rate": 1.9876264714186054e-05, "loss": 0.3983, "step": 614 }, { "epoch": 0.29388925392748344, "grad_norm": 0.6095965308498806, "learning_rate": 1.987543739478507e-05, "loss": 0.4225, "step": 615 }, { "epoch": 0.2943671226330566, "grad_norm": 0.6305677380212332, "learning_rate": 1.987460733611195e-05, "loss": 0.4356, "step": 616 }, { "epoch": 0.29484499133862974, "grad_norm": 0.7050343310604908, "learning_rate": 1.9873774538396945e-05, "loss": 0.4283, "step": 617 }, { "epoch": 0.29532286004420283, "grad_norm": 0.6530927062141915, "learning_rate": 1.987293900187105e-05, "loss": 0.4316, "step": 618 }, { "epoch": 0.295800728749776, "grad_norm": 0.6928306836676819, "learning_rate": 1.9872100726766028e-05, "loss": 0.4066, "step": 619 }, { "epoch": 0.2962785974553491, "grad_norm": 0.7451946223590281, "learning_rate": 1.987125971331441e-05, "loss": 0.4227, "step": 620 }, { "epoch": 0.29675646616092227, "grad_norm": 0.7569155679553317, "learning_rate": 1.9870415961749472e-05, "loss": 0.4171, "step": 621 }, { "epoch": 0.2972343348664954, "grad_norm": 0.7729456988114033, "learning_rate": 1.986956947230526e-05, "loss": 0.4374, "step": 622 }, { "epoch": 0.29771220357206857, "grad_norm": 0.7222068350990688, "learning_rate": 1.986872024521657e-05, "loss": 0.4055, "step": 623 }, { "epoch": 0.2981900722776417, "grad_norm": 0.777255093182333, "learning_rate": 1.9867868280718966e-05, "loss": 0.4184, "step": 624 }, { "epoch": 0.29866794098321486, "grad_norm": 0.707058786281402, "learning_rate": 1.9867013579048765e-05, "loss": 0.4278, "step": 625 }, { "epoch": 0.299145809688788, "grad_norm": 0.6978123671395932, "learning_rate": 1.986615614044305e-05, "loss": 0.3976, "step": 626 }, { "epoch": 0.29962367839436116, "grad_norm": 0.6497844128498012, "learning_rate": 1.9865295965139654e-05, "loss": 0.4233, "step": 627 }, { "epoch": 0.3001015470999343, "grad_norm": 0.7279868951567827, "learning_rate": 1.9864433053377183e-05, "loss": 0.421, "step": 628 }, { "epoch": 0.30057941580550745, "grad_norm": 0.6228075334334027, "learning_rate": 1.9863567405394987e-05, "loss": 0.4182, "step": 629 }, { "epoch": 0.3010572845110806, "grad_norm": 1.2081410007291293, "learning_rate": 1.9862699021433186e-05, "loss": 0.43, "step": 630 }, { "epoch": 0.30153515321665375, "grad_norm": 0.6584806729516838, "learning_rate": 1.9861827901732645e-05, "loss": 0.4243, "step": 631 }, { "epoch": 0.30201302192222684, "grad_norm": 0.7145104561793435, "learning_rate": 1.986095404653501e-05, "loss": 0.4149, "step": 632 }, { "epoch": 0.3024908906278, "grad_norm": 0.6293456214888333, "learning_rate": 1.986007745608266e-05, "loss": 0.4059, "step": 633 }, { "epoch": 0.30296875933337314, "grad_norm": 0.6105778855320209, "learning_rate": 1.985919813061876e-05, "loss": 0.4075, "step": 634 }, { "epoch": 0.3034466280389463, "grad_norm": 0.5850039212779303, "learning_rate": 1.9858316070387208e-05, "loss": 0.4265, "step": 635 }, { "epoch": 0.30392449674451943, "grad_norm": 0.6552242491777841, "learning_rate": 1.985743127563268e-05, "loss": 0.4211, "step": 636 }, { "epoch": 0.3044023654500926, "grad_norm": 0.6302333591257354, "learning_rate": 1.9856543746600593e-05, "loss": 0.4223, "step": 637 }, { "epoch": 0.30488023415566573, "grad_norm": 0.6537304777031439, "learning_rate": 1.985565348353714e-05, "loss": 0.4278, "step": 638 }, { "epoch": 0.3053581028612389, "grad_norm": 0.6730389531773375, "learning_rate": 1.9854760486689257e-05, "loss": 0.4222, "step": 639 }, { "epoch": 0.305835971566812, "grad_norm": 0.64240760182782, "learning_rate": 1.9853864756304654e-05, "loss": 0.4157, "step": 640 }, { "epoch": 0.3063138402723852, "grad_norm": 0.6417456096649262, "learning_rate": 1.9852966292631785e-05, "loss": 0.3962, "step": 641 }, { "epoch": 0.3067917089779583, "grad_norm": 0.6690827386602772, "learning_rate": 1.985206509591987e-05, "loss": 0.4155, "step": 642 }, { "epoch": 0.30726957768353147, "grad_norm": 0.6670159655708577, "learning_rate": 1.9851161166418888e-05, "loss": 0.4209, "step": 643 }, { "epoch": 0.3077474463891046, "grad_norm": 0.6837870559936386, "learning_rate": 1.9850254504379568e-05, "loss": 0.4239, "step": 644 }, { "epoch": 0.3082253150946777, "grad_norm": 0.635771654798046, "learning_rate": 1.9849345110053405e-05, "loss": 0.4115, "step": 645 }, { "epoch": 0.30870318380025086, "grad_norm": 0.6317438803320911, "learning_rate": 1.9848432983692642e-05, "loss": 0.4167, "step": 646 }, { "epoch": 0.309181052505824, "grad_norm": 0.7355553259854325, "learning_rate": 1.98475181255503e-05, "loss": 0.4191, "step": 647 }, { "epoch": 0.30965892121139715, "grad_norm": 0.6759418052818299, "learning_rate": 1.9846600535880135e-05, "loss": 0.4287, "step": 648 }, { "epoch": 0.3101367899169703, "grad_norm": 0.8036676894573562, "learning_rate": 1.9845680214936668e-05, "loss": 0.4001, "step": 649 }, { "epoch": 0.31061465862254345, "grad_norm": 0.6256985874959373, "learning_rate": 1.984475716297519e-05, "loss": 0.4126, "step": 650 }, { "epoch": 0.3110925273281166, "grad_norm": 0.6830281771469799, "learning_rate": 1.984383138025173e-05, "loss": 0.4133, "step": 651 }, { "epoch": 0.31157039603368974, "grad_norm": 0.7108602267936108, "learning_rate": 1.984290286702309e-05, "loss": 0.4056, "step": 652 }, { "epoch": 0.3120482647392629, "grad_norm": 0.6539475916242298, "learning_rate": 1.984197162354682e-05, "loss": 0.4332, "step": 653 }, { "epoch": 0.31252613344483604, "grad_norm": 0.7003890411182644, "learning_rate": 1.984103765008123e-05, "loss": 0.4152, "step": 654 }, { "epoch": 0.3130040021504092, "grad_norm": 0.8734062183276717, "learning_rate": 1.984010094688539e-05, "loss": 0.4308, "step": 655 }, { "epoch": 0.31348187085598234, "grad_norm": 0.6483836791788339, "learning_rate": 1.9839161514219125e-05, "loss": 0.4335, "step": 656 }, { "epoch": 0.3139597395615555, "grad_norm": 0.7224253461656356, "learning_rate": 1.983821935234301e-05, "loss": 0.4079, "step": 657 }, { "epoch": 0.31443760826712863, "grad_norm": 0.6023125747576976, "learning_rate": 1.9837274461518393e-05, "loss": 0.4208, "step": 658 }, { "epoch": 0.3149154769727017, "grad_norm": 0.6840565183544368, "learning_rate": 1.9836326842007368e-05, "loss": 0.4179, "step": 659 }, { "epoch": 0.3153933456782749, "grad_norm": 0.6070851699714096, "learning_rate": 1.9835376494072788e-05, "loss": 0.421, "step": 660 }, { "epoch": 0.315871214383848, "grad_norm": 0.7883725816487038, "learning_rate": 1.9834423417978258e-05, "loss": 0.4161, "step": 661 }, { "epoch": 0.31634908308942117, "grad_norm": 0.7058787543409192, "learning_rate": 1.983346761398815e-05, "loss": 0.4077, "step": 662 }, { "epoch": 0.3168269517949943, "grad_norm": 0.6615940212203109, "learning_rate": 1.983250908236759e-05, "loss": 0.4149, "step": 663 }, { "epoch": 0.31730482050056746, "grad_norm": 0.7006736509156779, "learning_rate": 1.9831547823382446e-05, "loss": 0.4088, "step": 664 }, { "epoch": 0.3177826892061406, "grad_norm": 0.6646570566525254, "learning_rate": 1.9830583837299363e-05, "loss": 0.4131, "step": 665 }, { "epoch": 0.31826055791171376, "grad_norm": 0.6530528056282681, "learning_rate": 1.9829617124385732e-05, "loss": 0.396, "step": 666 }, { "epoch": 0.3187384266172869, "grad_norm": 0.9328303194521024, "learning_rate": 1.9828647684909703e-05, "loss": 0.3951, "step": 667 }, { "epoch": 0.31921629532286006, "grad_norm": 0.6959860169532506, "learning_rate": 1.9827675519140183e-05, "loss": 0.427, "step": 668 }, { "epoch": 0.3196941640284332, "grad_norm": 0.6996213590788233, "learning_rate": 1.9826700627346825e-05, "loss": 0.4142, "step": 669 }, { "epoch": 0.32017203273400635, "grad_norm": 0.6591077715112674, "learning_rate": 1.9825723009800058e-05, "loss": 0.4163, "step": 670 }, { "epoch": 0.3206499014395795, "grad_norm": 0.6696332483194858, "learning_rate": 1.982474266677105e-05, "loss": 0.407, "step": 671 }, { "epoch": 0.32112777014515265, "grad_norm": 0.792144854122436, "learning_rate": 1.9823759598531732e-05, "loss": 0.4204, "step": 672 }, { "epoch": 0.32160563885072574, "grad_norm": 0.7610226047172992, "learning_rate": 1.9822773805354788e-05, "loss": 0.412, "step": 673 }, { "epoch": 0.3220835075562989, "grad_norm": 0.7518454562539055, "learning_rate": 1.982178528751366e-05, "loss": 0.4263, "step": 674 }, { "epoch": 0.32256137626187203, "grad_norm": 0.6802817311894305, "learning_rate": 1.9820794045282553e-05, "loss": 0.4107, "step": 675 }, { "epoch": 0.3230392449674452, "grad_norm": 0.7023162206342487, "learning_rate": 1.9819800078936406e-05, "loss": 0.4114, "step": 676 }, { "epoch": 0.32351711367301833, "grad_norm": 0.6685874705533328, "learning_rate": 1.981880338875094e-05, "loss": 0.4185, "step": 677 }, { "epoch": 0.3239949823785915, "grad_norm": 0.6244222422392075, "learning_rate": 1.9817803975002614e-05, "loss": 0.4241, "step": 678 }, { "epoch": 0.3244728510841646, "grad_norm": 0.6522844807838616, "learning_rate": 1.9816801837968647e-05, "loss": 0.4089, "step": 679 }, { "epoch": 0.3249507197897378, "grad_norm": 0.7102809988759002, "learning_rate": 1.9815796977927015e-05, "loss": 0.4094, "step": 680 }, { "epoch": 0.3254285884953109, "grad_norm": 0.9736649325340913, "learning_rate": 1.981478939515645e-05, "loss": 0.3995, "step": 681 }, { "epoch": 0.32590645720088407, "grad_norm": 0.6342355412436739, "learning_rate": 1.981377908993644e-05, "loss": 0.4086, "step": 682 }, { "epoch": 0.3263843259064572, "grad_norm": 0.6557900154548946, "learning_rate": 1.9812766062547218e-05, "loss": 0.4167, "step": 683 }, { "epoch": 0.32686219461203037, "grad_norm": 0.7089582460412668, "learning_rate": 1.9811750313269785e-05, "loss": 0.4184, "step": 684 }, { "epoch": 0.3273400633176035, "grad_norm": 0.643622934429116, "learning_rate": 1.9810731842385892e-05, "loss": 0.4198, "step": 685 }, { "epoch": 0.3278179320231766, "grad_norm": 0.6544401734455819, "learning_rate": 1.9809710650178043e-05, "loss": 0.4297, "step": 686 }, { "epoch": 0.32829580072874975, "grad_norm": 0.6350037232509057, "learning_rate": 1.9808686736929507e-05, "loss": 0.4139, "step": 687 }, { "epoch": 0.3287736694343229, "grad_norm": 0.6453998273855003, "learning_rate": 1.9807660102924285e-05, "loss": 0.4104, "step": 688 }, { "epoch": 0.32925153813989605, "grad_norm": 0.6118435089195414, "learning_rate": 1.980663074844716e-05, "loss": 0.4439, "step": 689 }, { "epoch": 0.3297294068454692, "grad_norm": 0.7137050965412601, "learning_rate": 1.9805598673783644e-05, "loss": 0.4051, "step": 690 }, { "epoch": 0.33020727555104235, "grad_norm": 0.660701532438893, "learning_rate": 1.980456387922003e-05, "loss": 0.4247, "step": 691 }, { "epoch": 0.3306851442566155, "grad_norm": 0.6219578421483081, "learning_rate": 1.9803526365043342e-05, "loss": 0.4311, "step": 692 }, { "epoch": 0.33116301296218864, "grad_norm": 0.7398002604532392, "learning_rate": 1.980248613154137e-05, "loss": 0.4162, "step": 693 }, { "epoch": 0.3316408816677618, "grad_norm": 0.594653754515913, "learning_rate": 1.9801443179002664e-05, "loss": 0.4291, "step": 694 }, { "epoch": 0.33211875037333494, "grad_norm": 0.7039036720424287, "learning_rate": 1.980039750771651e-05, "loss": 0.4154, "step": 695 }, { "epoch": 0.3325966190789081, "grad_norm": 0.6360396889801934, "learning_rate": 1.9799349117972966e-05, "loss": 0.4097, "step": 696 }, { "epoch": 0.33307448778448123, "grad_norm": 0.650673130557327, "learning_rate": 1.9798298010062834e-05, "loss": 0.408, "step": 697 }, { "epoch": 0.3335523564900544, "grad_norm": 0.6449814851092306, "learning_rate": 1.979724418427767e-05, "loss": 0.4237, "step": 698 }, { "epoch": 0.33403022519562753, "grad_norm": 0.6952694171904437, "learning_rate": 1.9796187640909793e-05, "loss": 0.414, "step": 699 }, { "epoch": 0.3345080939012006, "grad_norm": 0.6255490632329522, "learning_rate": 1.9795128380252263e-05, "loss": 0.4128, "step": 700 }, { "epoch": 0.33498596260677377, "grad_norm": 0.7726068650204944, "learning_rate": 1.9794066402598905e-05, "loss": 0.3941, "step": 701 }, { "epoch": 0.3354638313123469, "grad_norm": 0.7260875044062942, "learning_rate": 1.9793001708244293e-05, "loss": 0.4278, "step": 702 }, { "epoch": 0.33594170001792006, "grad_norm": 0.6812812508438351, "learning_rate": 1.9791934297483754e-05, "loss": 0.4121, "step": 703 }, { "epoch": 0.3364195687234932, "grad_norm": 0.6382204766381031, "learning_rate": 1.9790864170613363e-05, "loss": 0.4128, "step": 704 }, { "epoch": 0.33689743742906636, "grad_norm": 0.7007245066237098, "learning_rate": 1.978979132792996e-05, "loss": 0.4226, "step": 705 }, { "epoch": 0.3373753061346395, "grad_norm": 0.6498578233016445, "learning_rate": 1.978871576973113e-05, "loss": 0.4003, "step": 706 }, { "epoch": 0.33785317484021266, "grad_norm": 0.8549420085188715, "learning_rate": 1.9787637496315223e-05, "loss": 0.4117, "step": 707 }, { "epoch": 0.3383310435457858, "grad_norm": 0.7414633332196651, "learning_rate": 1.978655650798132e-05, "loss": 0.3992, "step": 708 }, { "epoch": 0.33880891225135895, "grad_norm": 0.6427548458668187, "learning_rate": 1.9785472805029274e-05, "loss": 0.4097, "step": 709 }, { "epoch": 0.3392867809569321, "grad_norm": 0.6277498179900414, "learning_rate": 1.9784386387759684e-05, "loss": 0.419, "step": 710 }, { "epoch": 0.33976464966250525, "grad_norm": 0.7227930165728, "learning_rate": 1.9783297256473905e-05, "loss": 0.3995, "step": 711 }, { "epoch": 0.3402425183680784, "grad_norm": 0.6041073332058529, "learning_rate": 1.9782205411474042e-05, "loss": 0.4028, "step": 712 }, { "epoch": 0.3407203870736515, "grad_norm": 0.697676563953654, "learning_rate": 1.978111085306295e-05, "loss": 0.4131, "step": 713 }, { "epoch": 0.34119825577922464, "grad_norm": 0.6626931153008405, "learning_rate": 1.9780013581544245e-05, "loss": 0.4364, "step": 714 }, { "epoch": 0.3416761244847978, "grad_norm": 0.6841487019463591, "learning_rate": 1.977891359722229e-05, "loss": 0.423, "step": 715 }, { "epoch": 0.34215399319037093, "grad_norm": 0.7564619275323727, "learning_rate": 1.9777810900402203e-05, "loss": 0.4161, "step": 716 }, { "epoch": 0.3426318618959441, "grad_norm": 1.3658709278718753, "learning_rate": 1.9776705491389844e-05, "loss": 0.4109, "step": 717 }, { "epoch": 0.3431097306015172, "grad_norm": 0.7907397360768815, "learning_rate": 1.9775597370491838e-05, "loss": 0.4051, "step": 718 }, { "epoch": 0.3435875993070904, "grad_norm": 0.6514561615066782, "learning_rate": 1.977448653801557e-05, "loss": 0.4034, "step": 719 }, { "epoch": 0.3440654680126635, "grad_norm": 0.6824032819742455, "learning_rate": 1.9773372994269147e-05, "loss": 0.4276, "step": 720 }, { "epoch": 0.34454333671823667, "grad_norm": 0.6883595228191173, "learning_rate": 1.9772256739561454e-05, "loss": 0.4267, "step": 721 }, { "epoch": 0.3450212054238098, "grad_norm": 0.7266627480718902, "learning_rate": 1.9771137774202126e-05, "loss": 0.4171, "step": 722 }, { "epoch": 0.34549907412938297, "grad_norm": 0.6857743582824786, "learning_rate": 1.9770016098501535e-05, "loss": 0.4133, "step": 723 }, { "epoch": 0.3459769428349561, "grad_norm": 0.8685391109702237, "learning_rate": 1.9768891712770824e-05, "loss": 0.394, "step": 724 }, { "epoch": 0.34645481154052926, "grad_norm": 0.6842643433044535, "learning_rate": 1.976776461732187e-05, "loss": 0.4152, "step": 725 }, { "epoch": 0.3469326802461024, "grad_norm": 0.7671231877298523, "learning_rate": 1.976663481246731e-05, "loss": 0.4238, "step": 726 }, { "epoch": 0.3474105489516755, "grad_norm": 0.6577325703350719, "learning_rate": 1.9765502298520534e-05, "loss": 0.412, "step": 727 }, { "epoch": 0.34788841765724865, "grad_norm": 0.6791922311294679, "learning_rate": 1.976436707579568e-05, "loss": 0.4071, "step": 728 }, { "epoch": 0.3483662863628218, "grad_norm": 0.7164375380905718, "learning_rate": 1.9763229144607643e-05, "loss": 0.4213, "step": 729 }, { "epoch": 0.34884415506839495, "grad_norm": 0.6448079596578629, "learning_rate": 1.976208850527206e-05, "loss": 0.4264, "step": 730 }, { "epoch": 0.3493220237739681, "grad_norm": 0.6810860347020636, "learning_rate": 1.9760945158105326e-05, "loss": 0.4401, "step": 731 }, { "epoch": 0.34979989247954124, "grad_norm": 0.6508629347760229, "learning_rate": 1.975979910342458e-05, "loss": 0.404, "step": 732 }, { "epoch": 0.3502777611851144, "grad_norm": 0.667860785115438, "learning_rate": 1.975865034154773e-05, "loss": 0.4189, "step": 733 }, { "epoch": 0.35075562989068754, "grad_norm": 0.5960633232464225, "learning_rate": 1.975749887279341e-05, "loss": 0.4063, "step": 734 }, { "epoch": 0.3512334985962607, "grad_norm": 0.6676835117003757, "learning_rate": 1.9756344697481027e-05, "loss": 0.4168, "step": 735 }, { "epoch": 0.35171136730183383, "grad_norm": 0.6027086678736178, "learning_rate": 1.975518781593072e-05, "loss": 0.3967, "step": 736 }, { "epoch": 0.352189236007407, "grad_norm": 0.7011001055447835, "learning_rate": 1.975402822846339e-05, "loss": 0.409, "step": 737 }, { "epoch": 0.35266710471298013, "grad_norm": 0.6173498609221076, "learning_rate": 1.9752865935400692e-05, "loss": 0.4069, "step": 738 }, { "epoch": 0.3531449734185533, "grad_norm": 0.7500007957464343, "learning_rate": 1.975170093706502e-05, "loss": 0.4039, "step": 739 }, { "epoch": 0.35362284212412637, "grad_norm": 0.586011297625854, "learning_rate": 1.975053323377952e-05, "loss": 0.4071, "step": 740 }, { "epoch": 0.3541007108296995, "grad_norm": 0.749881324377058, "learning_rate": 1.9749362825868105e-05, "loss": 0.4236, "step": 741 }, { "epoch": 0.35457857953527266, "grad_norm": 0.6097150903798377, "learning_rate": 1.9748189713655414e-05, "loss": 0.4343, "step": 742 }, { "epoch": 0.3550564482408458, "grad_norm": 0.5888296839949839, "learning_rate": 1.9747013897466852e-05, "loss": 0.413, "step": 743 }, { "epoch": 0.35553431694641896, "grad_norm": 0.6218972588587672, "learning_rate": 1.974583537762857e-05, "loss": 0.4002, "step": 744 }, { "epoch": 0.3560121856519921, "grad_norm": 0.6244729217199203, "learning_rate": 1.9744654154467468e-05, "loss": 0.4249, "step": 745 }, { "epoch": 0.35649005435756526, "grad_norm": 0.6019630528796382, "learning_rate": 1.9743470228311195e-05, "loss": 0.4108, "step": 746 }, { "epoch": 0.3569679230631384, "grad_norm": 0.6053335199202433, "learning_rate": 1.9742283599488154e-05, "loss": 0.4164, "step": 747 }, { "epoch": 0.35744579176871155, "grad_norm": 0.5390394035508936, "learning_rate": 1.974109426832749e-05, "loss": 0.4269, "step": 748 }, { "epoch": 0.3579236604742847, "grad_norm": 0.6864448137729777, "learning_rate": 1.973990223515911e-05, "loss": 0.4151, "step": 749 }, { "epoch": 0.35840152917985785, "grad_norm": 0.6117555520134601, "learning_rate": 1.9738707500313655e-05, "loss": 0.4127, "step": 750 }, { "epoch": 0.358879397885431, "grad_norm": 0.6231362351908816, "learning_rate": 1.973751006412253e-05, "loss": 0.4107, "step": 751 }, { "epoch": 0.35935726659100414, "grad_norm": 0.624929534918908, "learning_rate": 1.973630992691788e-05, "loss": 0.4006, "step": 752 }, { "epoch": 0.3598351352965773, "grad_norm": 0.6397155829395167, "learning_rate": 1.97351070890326e-05, "loss": 0.4036, "step": 753 }, { "epoch": 0.3603130040021504, "grad_norm": 0.6568169413470653, "learning_rate": 1.9733901550800342e-05, "loss": 0.3832, "step": 754 }, { "epoch": 0.36079087270772353, "grad_norm": 0.6058849776764632, "learning_rate": 1.9732693312555492e-05, "loss": 0.4093, "step": 755 }, { "epoch": 0.3612687414132967, "grad_norm": 0.6417564281631485, "learning_rate": 1.9731482374633203e-05, "loss": 0.4261, "step": 756 }, { "epoch": 0.3617466101188698, "grad_norm": 0.6139727827460876, "learning_rate": 1.973026873736936e-05, "loss": 0.4099, "step": 757 }, { "epoch": 0.362224478824443, "grad_norm": 0.6510466515537121, "learning_rate": 1.972905240110061e-05, "loss": 0.4054, "step": 758 }, { "epoch": 0.3627023475300161, "grad_norm": 0.6367387723523807, "learning_rate": 1.9727833366164342e-05, "loss": 0.4288, "step": 759 }, { "epoch": 0.36318021623558927, "grad_norm": 0.7047833872183114, "learning_rate": 1.9726611632898693e-05, "loss": 0.4184, "step": 760 }, { "epoch": 0.3636580849411624, "grad_norm": 0.6625607174095874, "learning_rate": 1.9725387201642553e-05, "loss": 0.3984, "step": 761 }, { "epoch": 0.36413595364673557, "grad_norm": 0.7543210649256787, "learning_rate": 1.9724160072735553e-05, "loss": 0.4157, "step": 762 }, { "epoch": 0.3646138223523087, "grad_norm": 0.8409356525795596, "learning_rate": 1.9722930246518083e-05, "loss": 0.3997, "step": 763 }, { "epoch": 0.36509169105788186, "grad_norm": 0.6419317088471457, "learning_rate": 1.9721697723331273e-05, "loss": 0.4059, "step": 764 }, { "epoch": 0.365569559763455, "grad_norm": 0.739982349078809, "learning_rate": 1.9720462503517e-05, "loss": 0.4118, "step": 765 }, { "epoch": 0.36604742846902816, "grad_norm": 0.6898565679952424, "learning_rate": 1.9719224587417896e-05, "loss": 0.4039, "step": 766 }, { "epoch": 0.3665252971746013, "grad_norm": 0.7612996038765697, "learning_rate": 1.971798397537733e-05, "loss": 0.4153, "step": 767 }, { "epoch": 0.3670031658801744, "grad_norm": 0.6660225921682122, "learning_rate": 1.971674066773944e-05, "loss": 0.4148, "step": 768 }, { "epoch": 0.36748103458574755, "grad_norm": 0.7220884614015122, "learning_rate": 1.9715494664849088e-05, "loss": 0.4055, "step": 769 }, { "epoch": 0.3679589032913207, "grad_norm": 0.677877244256832, "learning_rate": 1.971424596705189e-05, "loss": 0.4119, "step": 770 }, { "epoch": 0.36843677199689384, "grad_norm": 0.7526211652684072, "learning_rate": 1.971299457469422e-05, "loss": 0.4177, "step": 771 }, { "epoch": 0.368914640702467, "grad_norm": 0.6114869523814213, "learning_rate": 1.971174048812319e-05, "loss": 0.4101, "step": 772 }, { "epoch": 0.36939250940804014, "grad_norm": 0.6899914780670549, "learning_rate": 1.971048370768666e-05, "loss": 0.4351, "step": 773 }, { "epoch": 0.3698703781136133, "grad_norm": 0.6192089565424979, "learning_rate": 1.9709224233733236e-05, "loss": 0.4042, "step": 774 }, { "epoch": 0.37034824681918643, "grad_norm": 0.7355290743652367, "learning_rate": 1.9707962066612278e-05, "loss": 0.4027, "step": 775 }, { "epoch": 0.3708261155247596, "grad_norm": 0.7403645775826277, "learning_rate": 1.9706697206673892e-05, "loss": 0.4023, "step": 776 }, { "epoch": 0.37130398423033273, "grad_norm": 0.6999584425787945, "learning_rate": 1.9705429654268925e-05, "loss": 0.4097, "step": 777 }, { "epoch": 0.3717818529359059, "grad_norm": 0.6998503016090031, "learning_rate": 1.9704159409748967e-05, "loss": 0.4081, "step": 778 }, { "epoch": 0.372259721641479, "grad_norm": 0.6443494078717972, "learning_rate": 1.9702886473466368e-05, "loss": 0.4077, "step": 779 }, { "epoch": 0.3727375903470522, "grad_norm": 0.6852044402539894, "learning_rate": 1.970161084577422e-05, "loss": 0.4193, "step": 780 }, { "epoch": 0.37321545905262526, "grad_norm": 0.6236384893553105, "learning_rate": 1.970033252702636e-05, "loss": 0.4031, "step": 781 }, { "epoch": 0.3736933277581984, "grad_norm": 0.6469866739392119, "learning_rate": 1.969905151757736e-05, "loss": 0.4099, "step": 782 }, { "epoch": 0.37417119646377156, "grad_norm": 0.6504567142773998, "learning_rate": 1.9697767817782565e-05, "loss": 0.4075, "step": 783 }, { "epoch": 0.3746490651693447, "grad_norm": 0.7743161467373921, "learning_rate": 1.969648142799804e-05, "loss": 0.4241, "step": 784 }, { "epoch": 0.37512693387491786, "grad_norm": 0.61791736904947, "learning_rate": 1.9695192348580606e-05, "loss": 0.4073, "step": 785 }, { "epoch": 0.375604802580491, "grad_norm": 0.6551482938668043, "learning_rate": 1.969390057988784e-05, "loss": 0.4157, "step": 786 }, { "epoch": 0.37608267128606415, "grad_norm": 0.6241776375892095, "learning_rate": 1.9692606122278047e-05, "loss": 0.3991, "step": 787 }, { "epoch": 0.3765605399916373, "grad_norm": 0.8697779384559257, "learning_rate": 1.9691308976110293e-05, "loss": 0.3943, "step": 788 }, { "epoch": 0.37703840869721045, "grad_norm": 0.6646525801024487, "learning_rate": 1.969000914174438e-05, "loss": 0.4011, "step": 789 }, { "epoch": 0.3775162774027836, "grad_norm": 0.6842360997446302, "learning_rate": 1.9688706619540863e-05, "loss": 0.4229, "step": 790 }, { "epoch": 0.37799414610835674, "grad_norm": 0.6269010992340172, "learning_rate": 1.9687401409861032e-05, "loss": 0.4165, "step": 791 }, { "epoch": 0.3784720148139299, "grad_norm": 0.6948929157285494, "learning_rate": 1.9686093513066933e-05, "loss": 0.4189, "step": 792 }, { "epoch": 0.37894988351950304, "grad_norm": 0.6938250597089114, "learning_rate": 1.9684782929521355e-05, "loss": 0.4382, "step": 793 }, { "epoch": 0.3794277522250762, "grad_norm": 0.710536040353905, "learning_rate": 1.9683469659587826e-05, "loss": 0.4018, "step": 794 }, { "epoch": 0.3799056209306493, "grad_norm": 0.7765930152534725, "learning_rate": 1.968215370363063e-05, "loss": 0.4025, "step": 795 }, { "epoch": 0.3803834896362224, "grad_norm": 0.673786846379775, "learning_rate": 1.9680835062014784e-05, "loss": 0.4072, "step": 796 }, { "epoch": 0.3808613583417956, "grad_norm": 0.6624205844915216, "learning_rate": 1.967951373510606e-05, "loss": 0.4029, "step": 797 }, { "epoch": 0.3813392270473687, "grad_norm": 0.6702031183000383, "learning_rate": 1.967818972327097e-05, "loss": 0.3933, "step": 798 }, { "epoch": 0.38181709575294187, "grad_norm": 0.6643830669329299, "learning_rate": 1.967686302687677e-05, "loss": 0.4078, "step": 799 }, { "epoch": 0.382294964458515, "grad_norm": 0.6820114775894023, "learning_rate": 1.9675533646291463e-05, "loss": 0.4019, "step": 800 }, { "epoch": 0.38277283316408817, "grad_norm": 0.6654337293955677, "learning_rate": 1.9674201581883796e-05, "loss": 0.4143, "step": 801 }, { "epoch": 0.3832507018696613, "grad_norm": 0.6277257073491326, "learning_rate": 1.9672866834023263e-05, "loss": 0.3954, "step": 802 }, { "epoch": 0.38372857057523446, "grad_norm": 1.181246701089062, "learning_rate": 1.9671529403080095e-05, "loss": 0.4115, "step": 803 }, { "epoch": 0.3842064392808076, "grad_norm": 0.6999373807721258, "learning_rate": 1.9670189289425273e-05, "loss": 0.4085, "step": 804 }, { "epoch": 0.38468430798638076, "grad_norm": 0.6351836651328577, "learning_rate": 1.9668846493430522e-05, "loss": 0.4024, "step": 805 }, { "epoch": 0.3851621766919539, "grad_norm": 0.6200352215300432, "learning_rate": 1.966750101546831e-05, "loss": 0.4217, "step": 806 }, { "epoch": 0.38564004539752705, "grad_norm": 0.6312607230160627, "learning_rate": 1.9666152855911845e-05, "loss": 0.4148, "step": 807 }, { "epoch": 0.38611791410310015, "grad_norm": 0.6247739745978539, "learning_rate": 1.966480201513509e-05, "loss": 0.4129, "step": 808 }, { "epoch": 0.3865957828086733, "grad_norm": 23.521606298262835, "learning_rate": 1.966344849351274e-05, "loss": 0.3985, "step": 809 }, { "epoch": 0.38707365151424644, "grad_norm": 0.7307789504862988, "learning_rate": 1.9662092291420233e-05, "loss": 0.4355, "step": 810 }, { "epoch": 0.3875515202198196, "grad_norm": 0.6338895925736729, "learning_rate": 1.9660733409233763e-05, "loss": 0.4114, "step": 811 }, { "epoch": 0.38802938892539274, "grad_norm": 1.022414001164045, "learning_rate": 1.965937184733026e-05, "loss": 0.4177, "step": 812 }, { "epoch": 0.3885072576309659, "grad_norm": 0.6257154243428295, "learning_rate": 1.965800760608739e-05, "loss": 0.4238, "step": 813 }, { "epoch": 0.38898512633653903, "grad_norm": 0.6578526200723673, "learning_rate": 1.965664068588358e-05, "loss": 0.4027, "step": 814 }, { "epoch": 0.3894629950421122, "grad_norm": 0.6394160956438139, "learning_rate": 1.965527108709798e-05, "loss": 0.4047, "step": 815 }, { "epoch": 0.38994086374768533, "grad_norm": 0.6469488408202481, "learning_rate": 1.96538988101105e-05, "loss": 0.4035, "step": 816 }, { "epoch": 0.3904187324532585, "grad_norm": 0.6625645754583935, "learning_rate": 1.9652523855301783e-05, "loss": 0.4144, "step": 817 }, { "epoch": 0.3908966011588316, "grad_norm": 0.6841818131947363, "learning_rate": 1.9651146223053213e-05, "loss": 0.389, "step": 818 }, { "epoch": 0.3913744698644048, "grad_norm": 0.6438625312526988, "learning_rate": 1.9649765913746923e-05, "loss": 0.4111, "step": 819 }, { "epoch": 0.3918523385699779, "grad_norm": 1.01601163370014, "learning_rate": 1.964838292776579e-05, "loss": 0.4031, "step": 820 }, { "epoch": 0.39233020727555107, "grad_norm": 0.677050070767038, "learning_rate": 1.9646997265493426e-05, "loss": 0.4149, "step": 821 }, { "epoch": 0.39280807598112416, "grad_norm": 0.6738615859070755, "learning_rate": 1.9645608927314194e-05, "loss": 0.4123, "step": 822 }, { "epoch": 0.3932859446866973, "grad_norm": 0.6651159143963696, "learning_rate": 1.9644217913613187e-05, "loss": 0.4057, "step": 823 }, { "epoch": 0.39376381339227046, "grad_norm": 0.7691000084987812, "learning_rate": 1.9642824224776252e-05, "loss": 0.4046, "step": 824 }, { "epoch": 0.3942416820978436, "grad_norm": 0.7211243084062602, "learning_rate": 1.9641427861189973e-05, "loss": 0.4089, "step": 825 }, { "epoch": 0.39471955080341675, "grad_norm": 0.63694129075116, "learning_rate": 1.964002882324168e-05, "loss": 0.3879, "step": 826 }, { "epoch": 0.3951974195089899, "grad_norm": 0.5886615227252643, "learning_rate": 1.9638627111319437e-05, "loss": 0.4098, "step": 827 }, { "epoch": 0.39567528821456305, "grad_norm": 0.6720674176320514, "learning_rate": 1.963722272581206e-05, "loss": 0.4014, "step": 828 }, { "epoch": 0.3961531569201362, "grad_norm": 0.5791735119683044, "learning_rate": 1.963581566710909e-05, "loss": 0.4008, "step": 829 }, { "epoch": 0.39663102562570934, "grad_norm": 0.6464775042145513, "learning_rate": 1.963440593560083e-05, "loss": 0.4164, "step": 830 }, { "epoch": 0.3971088943312825, "grad_norm": 0.6144881514988998, "learning_rate": 1.963299353167831e-05, "loss": 0.411, "step": 831 }, { "epoch": 0.39758676303685564, "grad_norm": 0.670072169690497, "learning_rate": 1.9631578455733307e-05, "loss": 0.4089, "step": 832 }, { "epoch": 0.3980646317424288, "grad_norm": 0.6989267984529208, "learning_rate": 1.9630160708158343e-05, "loss": 0.406, "step": 833 }, { "epoch": 0.39854250044800194, "grad_norm": 1.073909712530086, "learning_rate": 1.9628740289346668e-05, "loss": 0.395, "step": 834 }, { "epoch": 0.3990203691535751, "grad_norm": 0.9121845119973022, "learning_rate": 1.9627317199692287e-05, "loss": 0.4055, "step": 835 }, { "epoch": 0.3994982378591482, "grad_norm": 0.6116187864851871, "learning_rate": 1.9625891439589933e-05, "loss": 0.3942, "step": 836 }, { "epoch": 0.3999761065647213, "grad_norm": 0.6622950577654838, "learning_rate": 1.9624463009435097e-05, "loss": 0.4037, "step": 837 }, { "epoch": 0.40045397527029447, "grad_norm": 0.7953106318152663, "learning_rate": 1.9623031909623993e-05, "loss": 0.4165, "step": 838 }, { "epoch": 0.4009318439758676, "grad_norm": 0.6653838350357102, "learning_rate": 1.962159814055358e-05, "loss": 0.4084, "step": 839 }, { "epoch": 0.40140971268144077, "grad_norm": 0.6890324725741804, "learning_rate": 1.962016170262157e-05, "loss": 0.4236, "step": 840 }, { "epoch": 0.4018875813870139, "grad_norm": 0.6550797312828021, "learning_rate": 1.96187225962264e-05, "loss": 0.4096, "step": 841 }, { "epoch": 0.40236545009258706, "grad_norm": 0.7180548191723333, "learning_rate": 1.9617280821767253e-05, "loss": 0.3889, "step": 842 }, { "epoch": 0.4028433187981602, "grad_norm": 0.6990218911792896, "learning_rate": 1.9615836379644054e-05, "loss": 0.3876, "step": 843 }, { "epoch": 0.40332118750373336, "grad_norm": 0.8178032329800812, "learning_rate": 1.961438927025746e-05, "loss": 0.3978, "step": 844 }, { "epoch": 0.4037990562093065, "grad_norm": 0.5792010743662441, "learning_rate": 1.961293949400888e-05, "loss": 0.4065, "step": 845 }, { "epoch": 0.40427692491487965, "grad_norm": 0.6641398130174404, "learning_rate": 1.9611487051300454e-05, "loss": 0.3914, "step": 846 }, { "epoch": 0.4047547936204528, "grad_norm": 0.6195936771890139, "learning_rate": 1.961003194253506e-05, "loss": 0.402, "step": 847 }, { "epoch": 0.40523266232602595, "grad_norm": 0.6768052082314968, "learning_rate": 1.9608574168116324e-05, "loss": 0.4199, "step": 848 }, { "epoch": 0.40571053103159904, "grad_norm": 0.6082693604746662, "learning_rate": 1.960711372844861e-05, "loss": 0.4074, "step": 849 }, { "epoch": 0.4061883997371722, "grad_norm": 0.6268219384593703, "learning_rate": 1.960565062393701e-05, "loss": 0.4157, "step": 850 }, { "epoch": 0.40666626844274534, "grad_norm": 0.6158932362492403, "learning_rate": 1.960418485498737e-05, "loss": 0.3942, "step": 851 }, { "epoch": 0.4071441371483185, "grad_norm": 0.7217988468496267, "learning_rate": 1.9602716422006266e-05, "loss": 0.4123, "step": 852 }, { "epoch": 0.40762200585389163, "grad_norm": 0.6549013187083295, "learning_rate": 1.9601245325401016e-05, "loss": 0.3881, "step": 853 }, { "epoch": 0.4080998745594648, "grad_norm": 0.6163590041751829, "learning_rate": 1.9599771565579673e-05, "loss": 0.4224, "step": 854 }, { "epoch": 0.40857774326503793, "grad_norm": 0.8712876541825809, "learning_rate": 1.9598295142951035e-05, "loss": 0.4042, "step": 855 }, { "epoch": 0.4090556119706111, "grad_norm": 0.7278398539493856, "learning_rate": 1.959681605792464e-05, "loss": 0.4164, "step": 856 }, { "epoch": 0.4095334806761842, "grad_norm": 0.600386439964583, "learning_rate": 1.9595334310910753e-05, "loss": 0.4135, "step": 857 }, { "epoch": 0.4100113493817574, "grad_norm": 0.7081480085926267, "learning_rate": 1.9593849902320386e-05, "loss": 0.4027, "step": 858 }, { "epoch": 0.4104892180873305, "grad_norm": 0.6188620046952817, "learning_rate": 1.9592362832565287e-05, "loss": 0.4016, "step": 859 }, { "epoch": 0.41096708679290367, "grad_norm": 0.682019844909097, "learning_rate": 1.9590873102057948e-05, "loss": 0.4071, "step": 860 }, { "epoch": 0.4114449554984768, "grad_norm": 0.6603131181430578, "learning_rate": 1.9589380711211588e-05, "loss": 0.3946, "step": 861 }, { "epoch": 0.41192282420404996, "grad_norm": 0.6577106617903441, "learning_rate": 1.9587885660440176e-05, "loss": 0.4133, "step": 862 }, { "epoch": 0.41240069290962306, "grad_norm": 0.6628545122914132, "learning_rate": 1.9586387950158406e-05, "loss": 0.4131, "step": 863 }, { "epoch": 0.4128785616151962, "grad_norm": 0.6190751337142328, "learning_rate": 1.958488758078172e-05, "loss": 0.4191, "step": 864 }, { "epoch": 0.41335643032076935, "grad_norm": 0.7460256010176625, "learning_rate": 1.9583384552726294e-05, "loss": 0.3936, "step": 865 }, { "epoch": 0.4138342990263425, "grad_norm": 0.5886477110046239, "learning_rate": 1.9581878866409042e-05, "loss": 0.4069, "step": 866 }, { "epoch": 0.41431216773191565, "grad_norm": 0.6886448680334144, "learning_rate": 1.9580370522247614e-05, "loss": 0.397, "step": 867 }, { "epoch": 0.4147900364374888, "grad_norm": 0.6232904922457612, "learning_rate": 1.9578859520660396e-05, "loss": 0.4029, "step": 868 }, { "epoch": 0.41526790514306194, "grad_norm": 0.5941334358676716, "learning_rate": 1.9577345862066518e-05, "loss": 0.3878, "step": 869 }, { "epoch": 0.4157457738486351, "grad_norm": 1.0139675059105102, "learning_rate": 1.957582954688584e-05, "loss": 0.3952, "step": 870 }, { "epoch": 0.41622364255420824, "grad_norm": 3.476711388368822, "learning_rate": 1.9574310575538956e-05, "loss": 0.4103, "step": 871 }, { "epoch": 0.4167015112597814, "grad_norm": 0.7611765590691459, "learning_rate": 1.9572788948447206e-05, "loss": 0.4179, "step": 872 }, { "epoch": 0.41717937996535454, "grad_norm": 0.600202395984294, "learning_rate": 1.9571264666032667e-05, "loss": 0.4074, "step": 873 }, { "epoch": 0.4176572486709277, "grad_norm": 0.651033950877546, "learning_rate": 1.9569737728718143e-05, "loss": 0.4007, "step": 874 }, { "epoch": 0.41813511737650083, "grad_norm": 0.6518303558888998, "learning_rate": 1.9568208136927177e-05, "loss": 0.4078, "step": 875 }, { "epoch": 0.4186129860820739, "grad_norm": 0.6107727192896184, "learning_rate": 1.956667589108406e-05, "loss": 0.4205, "step": 876 }, { "epoch": 0.41909085478764707, "grad_norm": 0.5977988275186865, "learning_rate": 1.95651409916138e-05, "loss": 0.3981, "step": 877 }, { "epoch": 0.4195687234932202, "grad_norm": 0.6053736216356521, "learning_rate": 1.9563603438942155e-05, "loss": 0.41, "step": 878 }, { "epoch": 0.42004659219879337, "grad_norm": 0.7201616837003876, "learning_rate": 1.9562063233495615e-05, "loss": 0.4086, "step": 879 }, { "epoch": 0.4205244609043665, "grad_norm": 0.6109033092609298, "learning_rate": 1.9560520375701408e-05, "loss": 0.4061, "step": 880 }, { "epoch": 0.42100232960993966, "grad_norm": 0.6186670618629603, "learning_rate": 1.9558974865987494e-05, "loss": 0.4207, "step": 881 }, { "epoch": 0.4214801983155128, "grad_norm": 0.6147640108712225, "learning_rate": 1.9557426704782564e-05, "loss": 0.4166, "step": 882 }, { "epoch": 0.42195806702108596, "grad_norm": 0.6202320300282839, "learning_rate": 1.9555875892516064e-05, "loss": 0.4055, "step": 883 }, { "epoch": 0.4224359357266591, "grad_norm": 0.6438580742369829, "learning_rate": 1.955432242961815e-05, "loss": 0.4077, "step": 884 }, { "epoch": 0.42291380443223225, "grad_norm": 0.6294856559072706, "learning_rate": 1.9552766316519726e-05, "loss": 0.4149, "step": 885 }, { "epoch": 0.4233916731378054, "grad_norm": 0.6161283088521784, "learning_rate": 1.955120755365244e-05, "loss": 0.411, "step": 886 }, { "epoch": 0.42386954184337855, "grad_norm": 0.6014577535152139, "learning_rate": 1.9549646141448657e-05, "loss": 0.3914, "step": 887 }, { "epoch": 0.4243474105489517, "grad_norm": 0.6848333020026872, "learning_rate": 1.9548082080341486e-05, "loss": 0.4001, "step": 888 }, { "epoch": 0.42482527925452485, "grad_norm": 0.6720870979122917, "learning_rate": 1.954651537076477e-05, "loss": 0.4238, "step": 889 }, { "epoch": 0.42530314796009794, "grad_norm": 0.5818304388573133, "learning_rate": 1.9544946013153093e-05, "loss": 0.4042, "step": 890 }, { "epoch": 0.4257810166656711, "grad_norm": 0.7744562017315486, "learning_rate": 1.9543374007941756e-05, "loss": 0.4238, "step": 891 }, { "epoch": 0.42625888537124423, "grad_norm": 0.7762559725025642, "learning_rate": 1.9541799355566813e-05, "loss": 0.4111, "step": 892 }, { "epoch": 0.4267367540768174, "grad_norm": 0.6599610950235643, "learning_rate": 1.9540222056465046e-05, "loss": 0.411, "step": 893 }, { "epoch": 0.42721462278239053, "grad_norm": 0.7413734398856531, "learning_rate": 1.9538642111073966e-05, "loss": 0.417, "step": 894 }, { "epoch": 0.4276924914879637, "grad_norm": 0.6502451354585309, "learning_rate": 1.9537059519831822e-05, "loss": 0.416, "step": 895 }, { "epoch": 0.4281703601935368, "grad_norm": 0.8797654195920014, "learning_rate": 1.9535474283177597e-05, "loss": 0.4156, "step": 896 }, { "epoch": 0.42864822889911, "grad_norm": 0.5595634974263763, "learning_rate": 1.953388640155101e-05, "loss": 0.4082, "step": 897 }, { "epoch": 0.4291260976046831, "grad_norm": 0.6530429716085568, "learning_rate": 1.953229587539251e-05, "loss": 0.4025, "step": 898 }, { "epoch": 0.42960396631025627, "grad_norm": 0.6657647909949589, "learning_rate": 1.953070270514328e-05, "loss": 0.4143, "step": 899 }, { "epoch": 0.4300818350158294, "grad_norm": 0.7463671068027646, "learning_rate": 1.9529106891245244e-05, "loss": 0.3887, "step": 900 }, { "epoch": 0.43055970372140256, "grad_norm": 0.6059170836222347, "learning_rate": 1.952750843414104e-05, "loss": 0.4028, "step": 901 }, { "epoch": 0.4310375724269757, "grad_norm": 0.6074415817250135, "learning_rate": 1.9525907334274063e-05, "loss": 0.3867, "step": 902 }, { "epoch": 0.4315154411325488, "grad_norm": 0.6460281093432895, "learning_rate": 1.9524303592088424e-05, "loss": 0.403, "step": 903 }, { "epoch": 0.43199330983812195, "grad_norm": 0.6192867647004487, "learning_rate": 1.9522697208028975e-05, "loss": 0.4004, "step": 904 }, { "epoch": 0.4324711785436951, "grad_norm": 0.7247569688470655, "learning_rate": 1.9521088182541298e-05, "loss": 0.4167, "step": 905 }, { "epoch": 0.43294904724926825, "grad_norm": 0.7590859276414997, "learning_rate": 1.9519476516071706e-05, "loss": 0.4002, "step": 906 }, { "epoch": 0.4334269159548414, "grad_norm": 0.5841622232228564, "learning_rate": 1.951786220906725e-05, "loss": 0.4182, "step": 907 }, { "epoch": 0.43390478466041454, "grad_norm": 0.6692111403784917, "learning_rate": 1.951624526197571e-05, "loss": 0.401, "step": 908 }, { "epoch": 0.4343826533659877, "grad_norm": 0.6097916498450394, "learning_rate": 1.95146256752456e-05, "loss": 0.4001, "step": 909 }, { "epoch": 0.43486052207156084, "grad_norm": 0.7357451300421103, "learning_rate": 1.951300344932616e-05, "loss": 0.4279, "step": 910 }, { "epoch": 0.435338390777134, "grad_norm": 1.4472772781586367, "learning_rate": 1.9511378584667372e-05, "loss": 0.409, "step": 911 }, { "epoch": 0.43581625948270714, "grad_norm": 0.6412845231251528, "learning_rate": 1.950975108171994e-05, "loss": 0.4232, "step": 912 }, { "epoch": 0.4362941281882803, "grad_norm": 0.6815096104930907, "learning_rate": 1.950812094093531e-05, "loss": 0.3989, "step": 913 }, { "epoch": 0.43677199689385343, "grad_norm": 0.6287441394449275, "learning_rate": 1.950648816276565e-05, "loss": 0.4141, "step": 914 }, { "epoch": 0.4372498655994266, "grad_norm": 0.6475760769332336, "learning_rate": 1.9504852747663862e-05, "loss": 0.4116, "step": 915 }, { "epoch": 0.4377277343049997, "grad_norm": 0.5697529534279385, "learning_rate": 1.9503214696083587e-05, "loss": 0.3941, "step": 916 }, { "epoch": 0.4382056030105728, "grad_norm": 0.6214698860437475, "learning_rate": 1.9501574008479188e-05, "loss": 0.4199, "step": 917 }, { "epoch": 0.43868347171614597, "grad_norm": 0.5978885664378845, "learning_rate": 1.9499930685305767e-05, "loss": 0.3966, "step": 918 }, { "epoch": 0.4391613404217191, "grad_norm": 0.6051257671018444, "learning_rate": 1.949828472701915e-05, "loss": 0.4008, "step": 919 }, { "epoch": 0.43963920912729226, "grad_norm": 0.6920182979659302, "learning_rate": 1.9496636134075894e-05, "loss": 0.3959, "step": 920 }, { "epoch": 0.4401170778328654, "grad_norm": 0.650235126033297, "learning_rate": 1.9494984906933293e-05, "loss": 0.3951, "step": 921 }, { "epoch": 0.44059494653843856, "grad_norm": 0.683430884067445, "learning_rate": 1.9493331046049366e-05, "loss": 0.398, "step": 922 }, { "epoch": 0.4410728152440117, "grad_norm": 0.6452456126774382, "learning_rate": 1.9491674551882867e-05, "loss": 0.389, "step": 923 }, { "epoch": 0.44155068394958485, "grad_norm": 0.695330236916925, "learning_rate": 1.9490015424893277e-05, "loss": 0.4016, "step": 924 }, { "epoch": 0.442028552655158, "grad_norm": 0.6316596603137281, "learning_rate": 1.9488353665540813e-05, "loss": 0.3978, "step": 925 }, { "epoch": 0.44250642136073115, "grad_norm": 0.8094826470406035, "learning_rate": 1.9486689274286413e-05, "loss": 0.3837, "step": 926 }, { "epoch": 0.4429842900663043, "grad_norm": 0.8597357950448963, "learning_rate": 1.9485022251591744e-05, "loss": 0.4175, "step": 927 }, { "epoch": 0.44346215877187745, "grad_norm": 0.6850435114555503, "learning_rate": 1.9483352597919222e-05, "loss": 0.416, "step": 928 }, { "epoch": 0.4439400274774506, "grad_norm": 0.6105257300932792, "learning_rate": 1.9481680313731973e-05, "loss": 0.4087, "step": 929 }, { "epoch": 0.44441789618302374, "grad_norm": 0.7783073049281799, "learning_rate": 1.9480005399493857e-05, "loss": 0.4084, "step": 930 }, { "epoch": 0.44489576488859683, "grad_norm": 0.8160008351900309, "learning_rate": 1.9478327855669468e-05, "loss": 0.4102, "step": 931 }, { "epoch": 0.44537363359417, "grad_norm": 0.6251532938868944, "learning_rate": 1.9476647682724125e-05, "loss": 0.4028, "step": 932 }, { "epoch": 0.44585150229974313, "grad_norm": 0.7279316882881499, "learning_rate": 1.9474964881123883e-05, "loss": 0.4085, "step": 933 }, { "epoch": 0.4463293710053163, "grad_norm": 0.6648782797147017, "learning_rate": 1.9473279451335517e-05, "loss": 0.412, "step": 934 }, { "epoch": 0.4468072397108894, "grad_norm": 0.7245218627960338, "learning_rate": 1.9471591393826536e-05, "loss": 0.3952, "step": 935 }, { "epoch": 0.4472851084164626, "grad_norm": 0.7468126769642631, "learning_rate": 1.9469900709065176e-05, "loss": 0.4059, "step": 936 }, { "epoch": 0.4477629771220357, "grad_norm": 0.7157504712908894, "learning_rate": 1.9468207397520413e-05, "loss": 0.3994, "step": 937 }, { "epoch": 0.44824084582760887, "grad_norm": 0.6310072705677632, "learning_rate": 1.946651145966193e-05, "loss": 0.406, "step": 938 }, { "epoch": 0.448718714533182, "grad_norm": 0.6784353372400441, "learning_rate": 1.9464812895960152e-05, "loss": 0.4009, "step": 939 }, { "epoch": 0.44919658323875516, "grad_norm": 0.6479984452905685, "learning_rate": 1.9463111706886234e-05, "loss": 0.4168, "step": 940 }, { "epoch": 0.4496744519443283, "grad_norm": 0.6287167548848303, "learning_rate": 1.9461407892912055e-05, "loss": 0.4014, "step": 941 }, { "epoch": 0.45015232064990146, "grad_norm": 0.7024078893645669, "learning_rate": 1.9459701454510228e-05, "loss": 0.4097, "step": 942 }, { "epoch": 0.4506301893554746, "grad_norm": 0.6977676751624544, "learning_rate": 1.945799239215408e-05, "loss": 0.4026, "step": 943 }, { "epoch": 0.4511080580610477, "grad_norm": 0.64820079330831, "learning_rate": 1.945628070631768e-05, "loss": 0.3986, "step": 944 }, { "epoch": 0.45158592676662085, "grad_norm": 0.7286318274900389, "learning_rate": 1.9454566397475813e-05, "loss": 0.3888, "step": 945 }, { "epoch": 0.452063795472194, "grad_norm": 1.4877125746903945, "learning_rate": 1.9452849466104008e-05, "loss": 0.4151, "step": 946 }, { "epoch": 0.45254166417776714, "grad_norm": 0.6879684254386736, "learning_rate": 1.9451129912678506e-05, "loss": 0.3934, "step": 947 }, { "epoch": 0.4530195328833403, "grad_norm": 0.6050674990436092, "learning_rate": 1.9449407737676277e-05, "loss": 0.4019, "step": 948 }, { "epoch": 0.45349740158891344, "grad_norm": 0.6424207582798632, "learning_rate": 1.9447682941575032e-05, "loss": 0.4142, "step": 949 }, { "epoch": 0.4539752702944866, "grad_norm": 0.6584464274725602, "learning_rate": 1.944595552485319e-05, "loss": 0.4112, "step": 950 }, { "epoch": 0.45445313900005974, "grad_norm": 1.8201606803835755, "learning_rate": 1.9444225487989912e-05, "loss": 0.3986, "step": 951 }, { "epoch": 0.4549310077056329, "grad_norm": 0.6649041164921152, "learning_rate": 1.9442492831465075e-05, "loss": 0.4124, "step": 952 }, { "epoch": 0.45540887641120603, "grad_norm": 0.6103343936848789, "learning_rate": 1.944075755575929e-05, "loss": 0.402, "step": 953 }, { "epoch": 0.4558867451167792, "grad_norm": 1.1183419241012187, "learning_rate": 1.943901966135389e-05, "loss": 0.4032, "step": 954 }, { "epoch": 0.4563646138223523, "grad_norm": 0.6278256875792309, "learning_rate": 1.943727914873094e-05, "loss": 0.397, "step": 955 }, { "epoch": 0.4568424825279255, "grad_norm": 0.5725976955845966, "learning_rate": 1.943553601837322e-05, "loss": 0.4117, "step": 956 }, { "epoch": 0.4573203512334986, "grad_norm": 0.7341816058812604, "learning_rate": 1.943379027076425e-05, "loss": 0.389, "step": 957 }, { "epoch": 0.4577982199390717, "grad_norm": 0.6161118906924941, "learning_rate": 1.943204190638827e-05, "loss": 0.402, "step": 958 }, { "epoch": 0.45827608864464486, "grad_norm": 0.5931536750814665, "learning_rate": 1.9430290925730245e-05, "loss": 0.4064, "step": 959 }, { "epoch": 0.458753957350218, "grad_norm": 0.6397321208831805, "learning_rate": 1.9428537329275862e-05, "loss": 0.4029, "step": 960 }, { "epoch": 0.45923182605579116, "grad_norm": 0.5770485075045017, "learning_rate": 1.942678111751154e-05, "loss": 0.4154, "step": 961 }, { "epoch": 0.4597096947613643, "grad_norm": 0.6983859402707672, "learning_rate": 1.942502229092442e-05, "loss": 0.3865, "step": 962 }, { "epoch": 0.46018756346693745, "grad_norm": 0.566499169480957, "learning_rate": 1.9423260850002375e-05, "loss": 0.391, "step": 963 }, { "epoch": 0.4606654321725106, "grad_norm": 0.6674856926358066, "learning_rate": 1.9421496795233995e-05, "loss": 0.4177, "step": 964 }, { "epoch": 0.46114330087808375, "grad_norm": 0.6077827552367149, "learning_rate": 1.941973012710859e-05, "loss": 0.4064, "step": 965 }, { "epoch": 0.4616211695836569, "grad_norm": 0.6393677236982209, "learning_rate": 1.9417960846116214e-05, "loss": 0.4079, "step": 966 }, { "epoch": 0.46209903828923005, "grad_norm": 0.6681026939421527, "learning_rate": 1.941618895274763e-05, "loss": 0.3953, "step": 967 }, { "epoch": 0.4625769069948032, "grad_norm": 0.6460174077865963, "learning_rate": 1.9414414447494326e-05, "loss": 0.4012, "step": 968 }, { "epoch": 0.46305477570037634, "grad_norm": 0.6582432277442615, "learning_rate": 1.9412637330848524e-05, "loss": 0.4101, "step": 969 }, { "epoch": 0.4635326444059495, "grad_norm": 0.6318462060701575, "learning_rate": 1.941085760330316e-05, "loss": 0.4059, "step": 970 }, { "epoch": 0.4640105131115226, "grad_norm": 0.7203162939351494, "learning_rate": 1.9409075265351904e-05, "loss": 0.4137, "step": 971 }, { "epoch": 0.46448838181709573, "grad_norm": 0.6894708541476223, "learning_rate": 1.940729031748914e-05, "loss": 0.409, "step": 972 }, { "epoch": 0.4649662505226689, "grad_norm": 0.6082868118061796, "learning_rate": 1.9405502760209988e-05, "loss": 0.3811, "step": 973 }, { "epoch": 0.465444119228242, "grad_norm": 0.575259488569313, "learning_rate": 1.9403712594010275e-05, "loss": 0.3959, "step": 974 }, { "epoch": 0.4659219879338152, "grad_norm": 0.6187113385107887, "learning_rate": 1.940191981938657e-05, "loss": 0.3927, "step": 975 }, { "epoch": 0.4663998566393883, "grad_norm": 0.5980885678751732, "learning_rate": 1.9400124436836155e-05, "loss": 0.403, "step": 976 }, { "epoch": 0.46687772534496147, "grad_norm": 0.59607435141776, "learning_rate": 1.9398326446857034e-05, "loss": 0.4037, "step": 977 }, { "epoch": 0.4673555940505346, "grad_norm": 0.6121952890654038, "learning_rate": 1.939652584994794e-05, "loss": 0.3928, "step": 978 }, { "epoch": 0.46783346275610777, "grad_norm": 0.6519511555849895, "learning_rate": 1.9394722646608332e-05, "loss": 0.3984, "step": 979 }, { "epoch": 0.4683113314616809, "grad_norm": 0.5807245005292517, "learning_rate": 1.9392916837338376e-05, "loss": 0.4097, "step": 980 }, { "epoch": 0.46878920016725406, "grad_norm": 0.5679167625106275, "learning_rate": 1.939110842263898e-05, "loss": 0.3963, "step": 981 }, { "epoch": 0.4692670688728272, "grad_norm": 0.6227411181855099, "learning_rate": 1.9389297403011767e-05, "loss": 0.4104, "step": 982 }, { "epoch": 0.46974493757840036, "grad_norm": 0.5888976376216906, "learning_rate": 1.9387483778959075e-05, "loss": 0.3936, "step": 983 }, { "epoch": 0.4702228062839735, "grad_norm": 1.09297261340797, "learning_rate": 1.9385667550983974e-05, "loss": 0.4219, "step": 984 }, { "epoch": 0.4707006749895466, "grad_norm": 0.5838151733918109, "learning_rate": 1.9383848719590257e-05, "loss": 0.4059, "step": 985 }, { "epoch": 0.47117854369511974, "grad_norm": 0.5684738684896777, "learning_rate": 1.9382027285282437e-05, "loss": 0.386, "step": 986 }, { "epoch": 0.4716564124006929, "grad_norm": 0.5797945596171794, "learning_rate": 1.9380203248565738e-05, "loss": 0.406, "step": 987 }, { "epoch": 0.47213428110626604, "grad_norm": 0.5735206647369371, "learning_rate": 1.9378376609946126e-05, "loss": 0.4109, "step": 988 }, { "epoch": 0.4726121498118392, "grad_norm": 0.5591211466804421, "learning_rate": 1.937654736993027e-05, "loss": 0.4157, "step": 989 }, { "epoch": 0.47309001851741234, "grad_norm": 0.5955292703363935, "learning_rate": 1.9374715529025575e-05, "loss": 0.3973, "step": 990 }, { "epoch": 0.4735678872229855, "grad_norm": 0.967343371666392, "learning_rate": 1.9372881087740162e-05, "loss": 0.3997, "step": 991 }, { "epoch": 0.47404575592855863, "grad_norm": 0.5801781650436966, "learning_rate": 1.9371044046582867e-05, "loss": 0.4082, "step": 992 }, { "epoch": 0.4745236246341318, "grad_norm": 0.6145894286769649, "learning_rate": 1.9369204406063257e-05, "loss": 0.4004, "step": 993 }, { "epoch": 0.4750014933397049, "grad_norm": 0.6330844181843237, "learning_rate": 1.936736216669161e-05, "loss": 0.3982, "step": 994 }, { "epoch": 0.4754793620452781, "grad_norm": 0.5704791125251828, "learning_rate": 1.9365517328978943e-05, "loss": 0.4129, "step": 995 }, { "epoch": 0.4759572307508512, "grad_norm": 0.5887093260184034, "learning_rate": 1.936366989343697e-05, "loss": 0.4071, "step": 996 }, { "epoch": 0.47643509945642437, "grad_norm": 0.6293352936706972, "learning_rate": 1.9361819860578143e-05, "loss": 0.4105, "step": 997 }, { "epoch": 0.47691296816199746, "grad_norm": 0.5368324375592416, "learning_rate": 1.9359967230915622e-05, "loss": 0.3939, "step": 998 }, { "epoch": 0.4773908368675706, "grad_norm": 0.6637884556032649, "learning_rate": 1.9358112004963304e-05, "loss": 0.4003, "step": 999 }, { "epoch": 0.47786870557314376, "grad_norm": 0.6269569640161438, "learning_rate": 1.9356254183235785e-05, "loss": 0.3977, "step": 1000 }, { "epoch": 0.4783465742787169, "grad_norm": 0.5907094927745844, "learning_rate": 1.93543937662484e-05, "loss": 0.3888, "step": 1001 }, { "epoch": 0.47882444298429006, "grad_norm": 0.6356082307658898, "learning_rate": 1.935253075451719e-05, "loss": 0.4009, "step": 1002 }, { "epoch": 0.4793023116898632, "grad_norm": 0.598063943812148, "learning_rate": 1.935066514855893e-05, "loss": 0.3996, "step": 1003 }, { "epoch": 0.47978018039543635, "grad_norm": 0.6878231195525895, "learning_rate": 1.9348796948891094e-05, "loss": 0.3917, "step": 1004 }, { "epoch": 0.4802580491010095, "grad_norm": 0.5939172442059919, "learning_rate": 1.93469261560319e-05, "loss": 0.3907, "step": 1005 }, { "epoch": 0.48073591780658265, "grad_norm": 0.7585300644554811, "learning_rate": 1.9345052770500262e-05, "loss": 0.3896, "step": 1006 }, { "epoch": 0.4812137865121558, "grad_norm": 0.5717895671680949, "learning_rate": 1.934317679281583e-05, "loss": 0.4091, "step": 1007 }, { "epoch": 0.48169165521772894, "grad_norm": 0.6208962871977249, "learning_rate": 1.9341298223498973e-05, "loss": 0.3867, "step": 1008 }, { "epoch": 0.4821695239233021, "grad_norm": 0.66712603907002, "learning_rate": 1.933941706307076e-05, "loss": 0.3796, "step": 1009 }, { "epoch": 0.48264739262887524, "grad_norm": 0.586042042111701, "learning_rate": 1.9337533312053002e-05, "loss": 0.4176, "step": 1010 }, { "epoch": 0.4831252613344484, "grad_norm": 0.6269719997423575, "learning_rate": 1.9335646970968214e-05, "loss": 0.3991, "step": 1011 }, { "epoch": 0.4836031300400215, "grad_norm": 0.55742694480261, "learning_rate": 1.933375804033963e-05, "loss": 0.3899, "step": 1012 }, { "epoch": 0.4840809987455946, "grad_norm": 0.6669718179555711, "learning_rate": 1.9331866520691214e-05, "loss": 0.397, "step": 1013 }, { "epoch": 0.4845588674511678, "grad_norm": 0.6257739428321304, "learning_rate": 1.9329972412547637e-05, "loss": 0.3954, "step": 1014 }, { "epoch": 0.4850367361567409, "grad_norm": 0.6256701257508585, "learning_rate": 1.9328075716434287e-05, "loss": 0.3933, "step": 1015 }, { "epoch": 0.48551460486231407, "grad_norm": 0.6720722566805102, "learning_rate": 1.932617643287728e-05, "loss": 0.3918, "step": 1016 }, { "epoch": 0.4859924735678872, "grad_norm": 0.6950543974048602, "learning_rate": 1.9324274562403446e-05, "loss": 0.4021, "step": 1017 }, { "epoch": 0.48647034227346037, "grad_norm": 0.6220688444979767, "learning_rate": 1.9322370105540317e-05, "loss": 0.4057, "step": 1018 }, { "epoch": 0.4869482109790335, "grad_norm": 0.6165966442005212, "learning_rate": 1.932046306281617e-05, "loss": 0.3905, "step": 1019 }, { "epoch": 0.48742607968460666, "grad_norm": 1.0211125046724039, "learning_rate": 1.931855343475998e-05, "loss": 0.4144, "step": 1020 }, { "epoch": 0.4879039483901798, "grad_norm": 0.5733822946288798, "learning_rate": 1.931664122190144e-05, "loss": 0.3917, "step": 1021 }, { "epoch": 0.48838181709575296, "grad_norm": 0.6130027608133938, "learning_rate": 1.931472642477097e-05, "loss": 0.413, "step": 1022 }, { "epoch": 0.4888596858013261, "grad_norm": 0.6073794838231226, "learning_rate": 1.93128090438997e-05, "loss": 0.3926, "step": 1023 }, { "epoch": 0.48933755450689925, "grad_norm": 0.6699051309493238, "learning_rate": 1.9310889079819474e-05, "loss": 0.3945, "step": 1024 }, { "epoch": 0.4898154232124724, "grad_norm": 0.6983896244728203, "learning_rate": 1.930896653306286e-05, "loss": 0.4017, "step": 1025 }, { "epoch": 0.4902932919180455, "grad_norm": 0.5963071167388689, "learning_rate": 1.9307041404163135e-05, "loss": 0.4059, "step": 1026 }, { "epoch": 0.49077116062361864, "grad_norm": 0.6671617092464391, "learning_rate": 1.93051136936543e-05, "loss": 0.3939, "step": 1027 }, { "epoch": 0.4912490293291918, "grad_norm": 0.6222549383683285, "learning_rate": 1.9303183402071065e-05, "loss": 0.4081, "step": 1028 }, { "epoch": 0.49172689803476494, "grad_norm": 0.6266179396807892, "learning_rate": 1.930125052994886e-05, "loss": 0.4013, "step": 1029 }, { "epoch": 0.4922047667403381, "grad_norm": 0.5868185951865348, "learning_rate": 1.929931507782383e-05, "loss": 0.3842, "step": 1030 }, { "epoch": 0.49268263544591123, "grad_norm": 0.5854933919657626, "learning_rate": 1.9297377046232833e-05, "loss": 0.3961, "step": 1031 }, { "epoch": 0.4931605041514844, "grad_norm": 0.5729556005202394, "learning_rate": 1.9295436435713447e-05, "loss": 0.3947, "step": 1032 }, { "epoch": 0.49363837285705753, "grad_norm": 0.6238098390423348, "learning_rate": 1.9293493246803962e-05, "loss": 0.3816, "step": 1033 }, { "epoch": 0.4941162415626307, "grad_norm": 0.5629873685546706, "learning_rate": 1.9291547480043385e-05, "loss": 0.4014, "step": 1034 }, { "epoch": 0.4945941102682038, "grad_norm": 0.5648249030788314, "learning_rate": 1.9289599135971437e-05, "loss": 0.4241, "step": 1035 }, { "epoch": 0.49507197897377697, "grad_norm": 0.6247491915063801, "learning_rate": 1.9287648215128553e-05, "loss": 0.4027, "step": 1036 }, { "epoch": 0.4955498476793501, "grad_norm": 0.5182781354785916, "learning_rate": 1.928569471805589e-05, "loss": 0.3816, "step": 1037 }, { "epoch": 0.49602771638492327, "grad_norm": 0.5842774907201587, "learning_rate": 1.9283738645295304e-05, "loss": 0.405, "step": 1038 }, { "epoch": 0.49650558509049636, "grad_norm": 0.7006381584927244, "learning_rate": 1.928177999738938e-05, "loss": 0.3895, "step": 1039 }, { "epoch": 0.4969834537960695, "grad_norm": 0.6100647649213121, "learning_rate": 1.9279818774881418e-05, "loss": 0.388, "step": 1040 }, { "epoch": 0.49746132250164266, "grad_norm": 0.5485271123253042, "learning_rate": 1.9277854978315415e-05, "loss": 0.3972, "step": 1041 }, { "epoch": 0.4979391912072158, "grad_norm": 0.5710883012225493, "learning_rate": 1.9275888608236104e-05, "loss": 0.4059, "step": 1042 }, { "epoch": 0.49841705991278895, "grad_norm": 0.5630532049128263, "learning_rate": 1.9273919665188913e-05, "loss": 0.3959, "step": 1043 }, { "epoch": 0.4988949286183621, "grad_norm": 0.5595415721679393, "learning_rate": 1.9271948149719998e-05, "loss": 0.4058, "step": 1044 }, { "epoch": 0.49937279732393525, "grad_norm": 0.5890600892979772, "learning_rate": 1.9269974062376224e-05, "loss": 0.3893, "step": 1045 }, { "epoch": 0.4998506660295084, "grad_norm": 0.5984090472815659, "learning_rate": 1.926799740370516e-05, "loss": 0.3982, "step": 1046 }, { "epoch": 0.5003285347350815, "grad_norm": 0.6502431881696009, "learning_rate": 1.92660181742551e-05, "loss": 0.3988, "step": 1047 }, { "epoch": 0.5008064034406546, "grad_norm": 0.5794688514925903, "learning_rate": 1.926403637457505e-05, "loss": 0.3978, "step": 1048 }, { "epoch": 0.5012842721462278, "grad_norm": 0.5486583842322683, "learning_rate": 1.926205200521473e-05, "loss": 0.405, "step": 1049 }, { "epoch": 0.5017621408518009, "grad_norm": 0.5964484082934959, "learning_rate": 1.926006506672456e-05, "loss": 0.4075, "step": 1050 }, { "epoch": 0.5022400095573741, "grad_norm": 0.6015432115221289, "learning_rate": 1.925807555965568e-05, "loss": 0.4011, "step": 1051 }, { "epoch": 0.5027178782629472, "grad_norm": 0.594525307065842, "learning_rate": 1.9256083484559953e-05, "loss": 0.3979, "step": 1052 }, { "epoch": 0.5031957469685204, "grad_norm": 0.6660777792568852, "learning_rate": 1.9254088841989943e-05, "loss": 0.3944, "step": 1053 }, { "epoch": 0.5036736156740935, "grad_norm": 0.589208348377751, "learning_rate": 1.9252091632498926e-05, "loss": 0.3863, "step": 1054 }, { "epoch": 0.5041514843796667, "grad_norm": 0.635977217231809, "learning_rate": 1.9250091856640895e-05, "loss": 0.4113, "step": 1055 }, { "epoch": 0.5046293530852398, "grad_norm": 0.6828960314162973, "learning_rate": 1.9248089514970553e-05, "loss": 0.3817, "step": 1056 }, { "epoch": 0.505107221790813, "grad_norm": 0.5707103289605037, "learning_rate": 1.924608460804331e-05, "loss": 0.3892, "step": 1057 }, { "epoch": 0.5055850904963861, "grad_norm": 0.5594636089450963, "learning_rate": 1.9244077136415298e-05, "loss": 0.3878, "step": 1058 }, { "epoch": 0.5060629592019593, "grad_norm": 0.566732821838709, "learning_rate": 1.924206710064335e-05, "loss": 0.4012, "step": 1059 }, { "epoch": 0.5065408279075324, "grad_norm": 0.5934091175624526, "learning_rate": 1.9240054501285015e-05, "loss": 0.3814, "step": 1060 }, { "epoch": 0.5070186966131055, "grad_norm": 0.5563172693307783, "learning_rate": 1.9238039338898554e-05, "loss": 0.4011, "step": 1061 }, { "epoch": 0.5074965653186787, "grad_norm": 0.5998159022961965, "learning_rate": 1.9236021614042936e-05, "loss": 0.3854, "step": 1062 }, { "epoch": 0.5079744340242518, "grad_norm": 0.5778247983953256, "learning_rate": 1.9234001327277842e-05, "loss": 0.3855, "step": 1063 }, { "epoch": 0.508452302729825, "grad_norm": 0.5761650392879268, "learning_rate": 1.9231978479163666e-05, "loss": 0.4016, "step": 1064 }, { "epoch": 0.5089301714353981, "grad_norm": 0.5599768828396271, "learning_rate": 1.922995307026151e-05, "loss": 0.3971, "step": 1065 }, { "epoch": 0.5094080401409713, "grad_norm": 0.8720261103107927, "learning_rate": 1.9227925101133184e-05, "loss": 0.4097, "step": 1066 }, { "epoch": 0.5098859088465444, "grad_norm": 0.5924340309412149, "learning_rate": 1.922589457234121e-05, "loss": 0.3871, "step": 1067 }, { "epoch": 0.5103637775521176, "grad_norm": 0.6004985435021857, "learning_rate": 1.9223861484448833e-05, "loss": 0.3951, "step": 1068 }, { "epoch": 0.5108416462576907, "grad_norm": 0.6146759569604939, "learning_rate": 1.922182583801998e-05, "loss": 0.3922, "step": 1069 }, { "epoch": 0.5113195149632639, "grad_norm": 0.5821995968641629, "learning_rate": 1.921978763361931e-05, "loss": 0.4043, "step": 1070 }, { "epoch": 0.511797383668837, "grad_norm": 0.6337816029347101, "learning_rate": 1.9217746871812184e-05, "loss": 0.4012, "step": 1071 }, { "epoch": 0.5122752523744102, "grad_norm": 0.5637983730800908, "learning_rate": 1.9215703553164676e-05, "loss": 0.4098, "step": 1072 }, { "epoch": 0.5127531210799833, "grad_norm": 0.6159837626313385, "learning_rate": 1.921365767824356e-05, "loss": 0.402, "step": 1073 }, { "epoch": 0.5132309897855564, "grad_norm": 0.6617255676511058, "learning_rate": 1.9211609247616335e-05, "loss": 0.4129, "step": 1074 }, { "epoch": 0.5137088584911296, "grad_norm": 0.5934568209062026, "learning_rate": 1.9209558261851194e-05, "loss": 0.3833, "step": 1075 }, { "epoch": 0.5141867271967027, "grad_norm": 0.6221409035389256, "learning_rate": 1.9207504721517047e-05, "loss": 0.3989, "step": 1076 }, { "epoch": 0.5146645959022759, "grad_norm": 0.5989406482919697, "learning_rate": 1.92054486271835e-05, "loss": 0.3982, "step": 1077 }, { "epoch": 0.515142464607849, "grad_norm": 0.6476320490359937, "learning_rate": 1.9203389979420893e-05, "loss": 0.3723, "step": 1078 }, { "epoch": 0.5156203333134222, "grad_norm": 0.5853567195202979, "learning_rate": 1.9201328778800247e-05, "loss": 0.4077, "step": 1079 }, { "epoch": 0.5160982020189953, "grad_norm": 0.5900737762201892, "learning_rate": 1.919926502589331e-05, "loss": 0.3866, "step": 1080 }, { "epoch": 0.5165760707245685, "grad_norm": 0.6051998816584814, "learning_rate": 1.9197198721272527e-05, "loss": 0.4011, "step": 1081 }, { "epoch": 0.5170539394301416, "grad_norm": 0.5901233973103801, "learning_rate": 1.919512986551105e-05, "loss": 0.3873, "step": 1082 }, { "epoch": 0.5175318081357148, "grad_norm": 0.5723558837242273, "learning_rate": 1.9193058459182747e-05, "loss": 0.4008, "step": 1083 }, { "epoch": 0.5180096768412878, "grad_norm": 0.586853172304618, "learning_rate": 1.9190984502862196e-05, "loss": 0.372, "step": 1084 }, { "epoch": 0.518487545546861, "grad_norm": 0.5970057412271904, "learning_rate": 1.9188907997124666e-05, "loss": 0.4002, "step": 1085 }, { "epoch": 0.5189654142524341, "grad_norm": 0.5890213702570649, "learning_rate": 1.9186828942546148e-05, "loss": 0.3988, "step": 1086 }, { "epoch": 0.5194432829580072, "grad_norm": 0.5345329489947781, "learning_rate": 1.9184747339703334e-05, "loss": 0.3914, "step": 1087 }, { "epoch": 0.5199211516635804, "grad_norm": 0.6791735135926099, "learning_rate": 1.9182663189173625e-05, "loss": 0.3988, "step": 1088 }, { "epoch": 0.5203990203691535, "grad_norm": 0.5596329309086331, "learning_rate": 1.9180576491535125e-05, "loss": 0.3962, "step": 1089 }, { "epoch": 0.5208768890747267, "grad_norm": 0.5628392832054393, "learning_rate": 1.9178487247366652e-05, "loss": 0.3747, "step": 1090 }, { "epoch": 0.5213547577802998, "grad_norm": 0.569936759080529, "learning_rate": 1.9176395457247722e-05, "loss": 0.4074, "step": 1091 }, { "epoch": 0.521832626485873, "grad_norm": 0.5528738761628226, "learning_rate": 1.917430112175856e-05, "loss": 0.3992, "step": 1092 }, { "epoch": 0.5223104951914461, "grad_norm": 0.6109126673803167, "learning_rate": 1.9172204241480098e-05, "loss": 0.4046, "step": 1093 }, { "epoch": 0.5227883638970193, "grad_norm": 0.5513494264708154, "learning_rate": 1.9170104816993973e-05, "loss": 0.388, "step": 1094 }, { "epoch": 0.5232662326025924, "grad_norm": 0.5523842737345893, "learning_rate": 1.916800284888253e-05, "loss": 0.3811, "step": 1095 }, { "epoch": 0.5237441013081656, "grad_norm": 0.5443431872996419, "learning_rate": 1.9165898337728818e-05, "loss": 0.4142, "step": 1096 }, { "epoch": 0.5242219700137387, "grad_norm": 0.5903680509886946, "learning_rate": 1.9163791284116592e-05, "loss": 0.3954, "step": 1097 }, { "epoch": 0.5246998387193119, "grad_norm": 0.7018103685542808, "learning_rate": 1.916168168863031e-05, "loss": 0.4088, "step": 1098 }, { "epoch": 0.525177707424885, "grad_norm": 0.5915389193430727, "learning_rate": 1.915956955185514e-05, "loss": 0.4055, "step": 1099 }, { "epoch": 0.5256555761304581, "grad_norm": 0.6388748207474322, "learning_rate": 1.915745487437694e-05, "loss": 0.4055, "step": 1100 }, { "epoch": 0.5261334448360313, "grad_norm": 0.5374359808793951, "learning_rate": 1.91553376567823e-05, "loss": 0.4095, "step": 1101 }, { "epoch": 0.5266113135416044, "grad_norm": 0.6253661283286657, "learning_rate": 1.915321789965849e-05, "loss": 0.3993, "step": 1102 }, { "epoch": 0.5270891822471776, "grad_norm": 0.6007927062737137, "learning_rate": 1.915109560359349e-05, "loss": 0.3789, "step": 1103 }, { "epoch": 0.5275670509527507, "grad_norm": 0.5773114426599819, "learning_rate": 1.9148970769176e-05, "loss": 0.4031, "step": 1104 }, { "epoch": 0.5280449196583239, "grad_norm": 1.5644927863876732, "learning_rate": 1.9146843396995396e-05, "loss": 0.3807, "step": 1105 }, { "epoch": 0.528522788363897, "grad_norm": 0.7068271199032458, "learning_rate": 1.9144713487641786e-05, "loss": 0.404, "step": 1106 }, { "epoch": 0.5290006570694702, "grad_norm": 0.5842159117368475, "learning_rate": 1.9142581041705964e-05, "loss": 0.3733, "step": 1107 }, { "epoch": 0.5294785257750433, "grad_norm": 0.629512416750542, "learning_rate": 1.914044605977943e-05, "loss": 0.4029, "step": 1108 }, { "epoch": 0.5299563944806165, "grad_norm": 0.5591127726288794, "learning_rate": 1.91383085424544e-05, "loss": 0.3975, "step": 1109 }, { "epoch": 0.5304342631861896, "grad_norm": 0.5508233170263688, "learning_rate": 1.9136168490323772e-05, "loss": 0.3975, "step": 1110 }, { "epoch": 0.5309121318917628, "grad_norm": 0.6177805307841441, "learning_rate": 1.9134025903981163e-05, "loss": 0.3835, "step": 1111 }, { "epoch": 0.5313900005973359, "grad_norm": 0.5833143529773639, "learning_rate": 1.9131880784020893e-05, "loss": 0.4139, "step": 1112 }, { "epoch": 0.5318678693029091, "grad_norm": 0.5967595237608566, "learning_rate": 1.9129733131037977e-05, "loss": 0.4106, "step": 1113 }, { "epoch": 0.5323457380084822, "grad_norm": 0.6172221950353266, "learning_rate": 1.9127582945628135e-05, "loss": 0.3902, "step": 1114 }, { "epoch": 0.5328236067140553, "grad_norm": 0.5682174098963437, "learning_rate": 1.9125430228387794e-05, "loss": 0.3874, "step": 1115 }, { "epoch": 0.5333014754196285, "grad_norm": 0.6193518812671628, "learning_rate": 1.9123274979914076e-05, "loss": 0.4007, "step": 1116 }, { "epoch": 0.5337793441252016, "grad_norm": 0.604016269024644, "learning_rate": 1.9121117200804812e-05, "loss": 0.3932, "step": 1117 }, { "epoch": 0.5342572128307748, "grad_norm": 0.6209761683524013, "learning_rate": 1.9118956891658526e-05, "loss": 0.3821, "step": 1118 }, { "epoch": 0.5347350815363479, "grad_norm": 0.6931704696111478, "learning_rate": 1.9116794053074458e-05, "loss": 0.4071, "step": 1119 }, { "epoch": 0.5352129502419211, "grad_norm": 0.632080163475699, "learning_rate": 1.9114628685652535e-05, "loss": 0.3839, "step": 1120 }, { "epoch": 0.5356908189474942, "grad_norm": 0.5748984989211147, "learning_rate": 1.9112460789993394e-05, "loss": 0.3929, "step": 1121 }, { "epoch": 0.5361686876530674, "grad_norm": 0.5635604927408805, "learning_rate": 1.9110290366698373e-05, "loss": 0.4027, "step": 1122 }, { "epoch": 0.5366465563586404, "grad_norm": 0.7558151194826072, "learning_rate": 1.9108117416369502e-05, "loss": 0.3986, "step": 1123 }, { "epoch": 0.5371244250642137, "grad_norm": 0.5980206275007937, "learning_rate": 1.910594193960953e-05, "loss": 0.4006, "step": 1124 }, { "epoch": 0.5376022937697867, "grad_norm": 0.6780188301974273, "learning_rate": 1.9103763937021887e-05, "loss": 0.3826, "step": 1125 }, { "epoch": 0.53808016247536, "grad_norm": 0.5790597345181424, "learning_rate": 1.9101583409210714e-05, "loss": 0.3835, "step": 1126 }, { "epoch": 0.538558031180933, "grad_norm": 0.6487135859049837, "learning_rate": 1.9099400356780857e-05, "loss": 0.3995, "step": 1127 }, { "epoch": 0.5390358998865061, "grad_norm": 0.5780178648506041, "learning_rate": 1.909721478033785e-05, "loss": 0.3925, "step": 1128 }, { "epoch": 0.5395137685920793, "grad_norm": 0.642292787021177, "learning_rate": 1.909502668048793e-05, "loss": 0.4016, "step": 1129 }, { "epoch": 0.5399916372976524, "grad_norm": 0.5939162000913498, "learning_rate": 1.909283605783805e-05, "loss": 0.4057, "step": 1130 }, { "epoch": 0.5404695060032256, "grad_norm": 0.6390737833949265, "learning_rate": 1.9090642912995836e-05, "loss": 0.3997, "step": 1131 }, { "epoch": 0.5409473747087987, "grad_norm": 0.5629045570791384, "learning_rate": 1.9088447246569638e-05, "loss": 0.3948, "step": 1132 }, { "epoch": 0.5414252434143719, "grad_norm": 0.7058159449700776, "learning_rate": 1.908624905916849e-05, "loss": 0.4006, "step": 1133 }, { "epoch": 0.541903112119945, "grad_norm": 0.5821061957075071, "learning_rate": 1.9084048351402135e-05, "loss": 0.4029, "step": 1134 }, { "epoch": 0.5423809808255182, "grad_norm": 0.5745676634359547, "learning_rate": 1.9081845123881002e-05, "loss": 0.4067, "step": 1135 }, { "epoch": 0.5428588495310913, "grad_norm": 0.5799046046456168, "learning_rate": 1.9079639377216236e-05, "loss": 0.3895, "step": 1136 }, { "epoch": 0.5433367182366645, "grad_norm": 0.5423826236018965, "learning_rate": 1.9077431112019666e-05, "loss": 0.392, "step": 1137 }, { "epoch": 0.5438145869422376, "grad_norm": 0.5614547514964748, "learning_rate": 1.9075220328903833e-05, "loss": 0.419, "step": 1138 }, { "epoch": 0.5442924556478108, "grad_norm": 0.7294948459972235, "learning_rate": 1.907300702848196e-05, "loss": 0.3725, "step": 1139 }, { "epoch": 0.5447703243533839, "grad_norm": 0.7211253855583865, "learning_rate": 1.9070791211367984e-05, "loss": 0.3958, "step": 1140 }, { "epoch": 0.545248193058957, "grad_norm": 0.7264564481802851, "learning_rate": 1.9068572878176535e-05, "loss": 0.3969, "step": 1141 }, { "epoch": 0.5457260617645302, "grad_norm": 0.5947660314640929, "learning_rate": 1.9066352029522933e-05, "loss": 0.4076, "step": 1142 }, { "epoch": 0.5462039304701033, "grad_norm": 0.6646952552501058, "learning_rate": 1.906412866602321e-05, "loss": 0.4096, "step": 1143 }, { "epoch": 0.5466817991756765, "grad_norm": 0.6440292968735994, "learning_rate": 1.906190278829408e-05, "loss": 0.3947, "step": 1144 }, { "epoch": 0.5471596678812496, "grad_norm": 0.6531777155361128, "learning_rate": 1.9059674396952963e-05, "loss": 0.3921, "step": 1145 }, { "epoch": 0.5476375365868228, "grad_norm": 0.6263069312765026, "learning_rate": 1.9057443492617983e-05, "loss": 0.3908, "step": 1146 }, { "epoch": 0.5481154052923959, "grad_norm": 0.6009448091108315, "learning_rate": 1.9055210075907946e-05, "loss": 0.3872, "step": 1147 }, { "epoch": 0.5485932739979691, "grad_norm": 0.7151497490060705, "learning_rate": 1.9052974147442365e-05, "loss": 0.4047, "step": 1148 }, { "epoch": 0.5490711427035422, "grad_norm": 0.6210354172889246, "learning_rate": 1.905073570784145e-05, "loss": 0.3875, "step": 1149 }, { "epoch": 0.5495490114091154, "grad_norm": 0.6071292868798474, "learning_rate": 1.90484947577261e-05, "loss": 0.3786, "step": 1150 }, { "epoch": 0.5500268801146885, "grad_norm": 0.5393751375916709, "learning_rate": 1.9046251297717915e-05, "loss": 0.3815, "step": 1151 }, { "epoch": 0.5505047488202617, "grad_norm": 0.549917579146435, "learning_rate": 1.9044005328439197e-05, "loss": 0.3955, "step": 1152 }, { "epoch": 0.5509826175258348, "grad_norm": 0.7395520846508221, "learning_rate": 1.9041756850512932e-05, "loss": 0.4056, "step": 1153 }, { "epoch": 0.551460486231408, "grad_norm": 0.5862288750441804, "learning_rate": 1.9039505864562812e-05, "loss": 0.3972, "step": 1154 }, { "epoch": 0.5519383549369811, "grad_norm": 0.5831709652721829, "learning_rate": 1.903725237121322e-05, "loss": 0.3919, "step": 1155 }, { "epoch": 0.5524162236425542, "grad_norm": 0.5916849238602208, "learning_rate": 1.9034996371089233e-05, "loss": 0.3856, "step": 1156 }, { "epoch": 0.5528940923481274, "grad_norm": 0.6411973294376119, "learning_rate": 1.9032737864816627e-05, "loss": 0.4067, "step": 1157 }, { "epoch": 0.5533719610537005, "grad_norm": 0.5832416797946984, "learning_rate": 1.9030476853021875e-05, "loss": 0.389, "step": 1158 }, { "epoch": 0.5538498297592737, "grad_norm": 0.622158136439798, "learning_rate": 1.9028213336332135e-05, "loss": 0.3893, "step": 1159 }, { "epoch": 0.5543276984648468, "grad_norm": 0.5762766222613097, "learning_rate": 1.902594731537527e-05, "loss": 0.394, "step": 1160 }, { "epoch": 0.55480556717042, "grad_norm": 0.550291838034785, "learning_rate": 1.9023678790779838e-05, "loss": 0.3888, "step": 1161 }, { "epoch": 0.555283435875993, "grad_norm": 0.5662654245608815, "learning_rate": 1.9021407763175083e-05, "loss": 0.3926, "step": 1162 }, { "epoch": 0.5557613045815663, "grad_norm": 0.6334933581251789, "learning_rate": 1.901913423319095e-05, "loss": 0.4011, "step": 1163 }, { "epoch": 0.5562391732871393, "grad_norm": 0.5575895045734839, "learning_rate": 1.9016858201458075e-05, "loss": 0.3972, "step": 1164 }, { "epoch": 0.5567170419927125, "grad_norm": 0.5966619103161227, "learning_rate": 1.901457966860779e-05, "loss": 0.3824, "step": 1165 }, { "epoch": 0.5571949106982856, "grad_norm": 0.5870555991442512, "learning_rate": 1.9012298635272117e-05, "loss": 0.4003, "step": 1166 }, { "epoch": 0.5576727794038588, "grad_norm": 0.569548018108478, "learning_rate": 1.9010015102083778e-05, "loss": 0.3992, "step": 1167 }, { "epoch": 0.5581506481094319, "grad_norm": 0.5537323419722724, "learning_rate": 1.9007729069676185e-05, "loss": 0.4126, "step": 1168 }, { "epoch": 0.558628516815005, "grad_norm": 0.6214624827198821, "learning_rate": 1.9005440538683443e-05, "loss": 0.4034, "step": 1169 }, { "epoch": 0.5591063855205782, "grad_norm": 0.5794512137484054, "learning_rate": 1.9003149509740347e-05, "loss": 0.4042, "step": 1170 }, { "epoch": 0.5595842542261513, "grad_norm": 0.6471077379542854, "learning_rate": 1.9000855983482386e-05, "loss": 0.3955, "step": 1171 }, { "epoch": 0.5600621229317245, "grad_norm": 0.7713823924796754, "learning_rate": 1.8998559960545753e-05, "loss": 0.3894, "step": 1172 }, { "epoch": 0.5605399916372976, "grad_norm": 0.669563924210895, "learning_rate": 1.8996261441567318e-05, "loss": 0.4061, "step": 1173 }, { "epoch": 0.5610178603428708, "grad_norm": 0.6358401107367022, "learning_rate": 1.8993960427184647e-05, "loss": 0.4049, "step": 1174 }, { "epoch": 0.5614957290484439, "grad_norm": 0.632834423434781, "learning_rate": 1.899165691803601e-05, "loss": 0.3974, "step": 1175 }, { "epoch": 0.5619735977540171, "grad_norm": 0.7647633906590625, "learning_rate": 1.8989350914760348e-05, "loss": 0.3967, "step": 1176 }, { "epoch": 0.5624514664595902, "grad_norm": 0.6692881107694713, "learning_rate": 1.8987042417997313e-05, "loss": 0.4027, "step": 1177 }, { "epoch": 0.5629293351651634, "grad_norm": 0.831620122043209, "learning_rate": 1.898473142838724e-05, "loss": 0.4012, "step": 1178 }, { "epoch": 0.5634072038707365, "grad_norm": 1.1686193693749636, "learning_rate": 1.898241794657116e-05, "loss": 0.405, "step": 1179 }, { "epoch": 0.5638850725763097, "grad_norm": 0.6351278759311869, "learning_rate": 1.8980101973190787e-05, "loss": 0.3754, "step": 1180 }, { "epoch": 0.5643629412818828, "grad_norm": 0.5873130279029877, "learning_rate": 1.8977783508888535e-05, "loss": 0.3965, "step": 1181 }, { "epoch": 0.5648408099874559, "grad_norm": 0.5989794788052755, "learning_rate": 1.89754625543075e-05, "loss": 0.399, "step": 1182 }, { "epoch": 0.5653186786930291, "grad_norm": 0.5698375975862137, "learning_rate": 1.8973139110091477e-05, "loss": 0.4015, "step": 1183 }, { "epoch": 0.5657965473986022, "grad_norm": 0.6737645692540567, "learning_rate": 1.897081317688495e-05, "loss": 0.4223, "step": 1184 }, { "epoch": 0.5662744161041754, "grad_norm": 0.593327748620439, "learning_rate": 1.896848475533309e-05, "loss": 0.4066, "step": 1185 }, { "epoch": 0.5667522848097485, "grad_norm": 0.5883337985827269, "learning_rate": 1.896615384608176e-05, "loss": 0.3736, "step": 1186 }, { "epoch": 0.5672301535153217, "grad_norm": 0.9321510339697601, "learning_rate": 1.896382044977751e-05, "loss": 0.4119, "step": 1187 }, { "epoch": 0.5677080222208948, "grad_norm": 0.5793456890053796, "learning_rate": 1.896148456706759e-05, "loss": 0.3928, "step": 1188 }, { "epoch": 0.568185890926468, "grad_norm": 0.8624241078357229, "learning_rate": 1.8959146198599928e-05, "loss": 0.4138, "step": 1189 }, { "epoch": 0.5686637596320411, "grad_norm": 0.6540030367738001, "learning_rate": 1.8956805345023145e-05, "loss": 0.3981, "step": 1190 }, { "epoch": 0.5691416283376143, "grad_norm": 0.6327764655932094, "learning_rate": 1.8954462006986557e-05, "loss": 0.4082, "step": 1191 }, { "epoch": 0.5696194970431874, "grad_norm": 0.6018338777252428, "learning_rate": 1.8952116185140164e-05, "loss": 0.3884, "step": 1192 }, { "epoch": 0.5700973657487606, "grad_norm": 0.5645211426190209, "learning_rate": 1.8949767880134652e-05, "loss": 0.4008, "step": 1193 }, { "epoch": 0.5705752344543337, "grad_norm": 0.62032674267378, "learning_rate": 1.89474170926214e-05, "loss": 0.387, "step": 1194 }, { "epoch": 0.5710531031599068, "grad_norm": 0.5331504995268993, "learning_rate": 1.894506382325248e-05, "loss": 0.3944, "step": 1195 }, { "epoch": 0.57153097186548, "grad_norm": 0.5571879177321089, "learning_rate": 1.8942708072680637e-05, "loss": 0.3873, "step": 1196 }, { "epoch": 0.5720088405710531, "grad_norm": 0.5434082219860525, "learning_rate": 1.8940349841559325e-05, "loss": 0.3819, "step": 1197 }, { "epoch": 0.5724867092766263, "grad_norm": 0.5556021619843906, "learning_rate": 1.8937989130542672e-05, "loss": 0.3819, "step": 1198 }, { "epoch": 0.5729645779821994, "grad_norm": 0.6319621645757422, "learning_rate": 1.8935625940285502e-05, "loss": 0.3921, "step": 1199 }, { "epoch": 0.5734424466877726, "grad_norm": 0.6282911906906646, "learning_rate": 1.8933260271443313e-05, "loss": 0.4087, "step": 1200 }, { "epoch": 0.5739203153933456, "grad_norm": 0.5581191438877338, "learning_rate": 1.8930892124672303e-05, "loss": 0.3932, "step": 1201 }, { "epoch": 0.5743981840989189, "grad_norm": 0.5463800521790595, "learning_rate": 1.892852150062936e-05, "loss": 0.394, "step": 1202 }, { "epoch": 0.5748760528044919, "grad_norm": 0.5931390296947407, "learning_rate": 1.8926148399972047e-05, "loss": 0.3773, "step": 1203 }, { "epoch": 0.5753539215100651, "grad_norm": 0.5452845931438769, "learning_rate": 1.8923772823358624e-05, "loss": 0.3901, "step": 1204 }, { "epoch": 0.5758317902156382, "grad_norm": 0.5840615558705706, "learning_rate": 1.8921394771448032e-05, "loss": 0.4034, "step": 1205 }, { "epoch": 0.5763096589212114, "grad_norm": 0.6291436076221341, "learning_rate": 1.89190142448999e-05, "loss": 0.3905, "step": 1206 }, { "epoch": 0.5767875276267845, "grad_norm": 0.5327483631263784, "learning_rate": 1.8916631244374548e-05, "loss": 0.3999, "step": 1207 }, { "epoch": 0.5772653963323577, "grad_norm": 0.6494525583362151, "learning_rate": 1.891424577053297e-05, "loss": 0.3893, "step": 1208 }, { "epoch": 0.5777432650379308, "grad_norm": 0.5510447847761317, "learning_rate": 1.8911857824036863e-05, "loss": 0.3835, "step": 1209 }, { "epoch": 0.5782211337435039, "grad_norm": 0.6227648730898245, "learning_rate": 1.89094674055486e-05, "loss": 0.3999, "step": 1210 }, { "epoch": 0.5786990024490771, "grad_norm": 0.5541558527671858, "learning_rate": 1.8907074515731236e-05, "loss": 0.3994, "step": 1211 }, { "epoch": 0.5791768711546502, "grad_norm": 0.5676146792664295, "learning_rate": 1.8904679155248523e-05, "loss": 0.3874, "step": 1212 }, { "epoch": 0.5796547398602234, "grad_norm": 0.7499627866819074, "learning_rate": 1.890228132476488e-05, "loss": 0.3683, "step": 1213 }, { "epoch": 0.5801326085657965, "grad_norm": 0.5526639952242319, "learning_rate": 1.889988102494544e-05, "loss": 0.3977, "step": 1214 }, { "epoch": 0.5806104772713697, "grad_norm": 0.6492832611355143, "learning_rate": 1.889747825645599e-05, "loss": 0.3861, "step": 1215 }, { "epoch": 0.5810883459769428, "grad_norm": 0.5411532191238587, "learning_rate": 1.8895073019963022e-05, "loss": 0.3934, "step": 1216 }, { "epoch": 0.581566214682516, "grad_norm": 0.6698750283375203, "learning_rate": 1.8892665316133706e-05, "loss": 0.4089, "step": 1217 }, { "epoch": 0.5820440833880891, "grad_norm": 0.604672289766661, "learning_rate": 1.8890255145635895e-05, "loss": 0.3985, "step": 1218 }, { "epoch": 0.5825219520936623, "grad_norm": 0.5809198566541879, "learning_rate": 1.888784250913813e-05, "loss": 0.4029, "step": 1219 }, { "epoch": 0.5829998207992354, "grad_norm": 0.5575695641227716, "learning_rate": 1.8885427407309627e-05, "loss": 0.3918, "step": 1220 }, { "epoch": 0.5834776895048086, "grad_norm": 0.6012108926363768, "learning_rate": 1.88830098408203e-05, "loss": 0.3864, "step": 1221 }, { "epoch": 0.5839555582103817, "grad_norm": 0.5621575791142275, "learning_rate": 1.8880589810340734e-05, "loss": 0.38, "step": 1222 }, { "epoch": 0.5844334269159548, "grad_norm": 0.5831322537081107, "learning_rate": 1.8878167316542207e-05, "loss": 0.3894, "step": 1223 }, { "epoch": 0.584911295621528, "grad_norm": 0.5871042303485483, "learning_rate": 1.8875742360096675e-05, "loss": 0.3896, "step": 1224 }, { "epoch": 0.5853891643271011, "grad_norm": 0.5564242439821103, "learning_rate": 1.887331494167678e-05, "loss": 0.3933, "step": 1225 }, { "epoch": 0.5858670330326743, "grad_norm": 0.5819982187143664, "learning_rate": 1.887088506195584e-05, "loss": 0.4114, "step": 1226 }, { "epoch": 0.5863449017382474, "grad_norm": 0.5090058539216136, "learning_rate": 1.8868452721607865e-05, "loss": 0.3962, "step": 1227 }, { "epoch": 0.5868227704438206, "grad_norm": 0.5875412543113175, "learning_rate": 1.8866017921307544e-05, "loss": 0.4056, "step": 1228 }, { "epoch": 0.5873006391493937, "grad_norm": 0.6166139308626585, "learning_rate": 1.886358066173024e-05, "loss": 0.3882, "step": 1229 }, { "epoch": 0.5877785078549669, "grad_norm": 0.5888127004124792, "learning_rate": 1.8861140943552014e-05, "loss": 0.3912, "step": 1230 }, { "epoch": 0.58825637656054, "grad_norm": 0.6162981466724767, "learning_rate": 1.8858698767449598e-05, "loss": 0.3937, "step": 1231 }, { "epoch": 0.5887342452661132, "grad_norm": 0.6715967270841622, "learning_rate": 1.8856254134100408e-05, "loss": 0.401, "step": 1232 }, { "epoch": 0.5892121139716863, "grad_norm": 0.5366020380685378, "learning_rate": 1.8853807044182544e-05, "loss": 0.3875, "step": 1233 }, { "epoch": 0.5896899826772595, "grad_norm": 0.591930559738908, "learning_rate": 1.8851357498374785e-05, "loss": 0.4082, "step": 1234 }, { "epoch": 0.5901678513828326, "grad_norm": 0.5911447561891597, "learning_rate": 1.884890549735659e-05, "loss": 0.3904, "step": 1235 }, { "epoch": 0.5906457200884057, "grad_norm": 0.5226862065805489, "learning_rate": 1.8846451041808102e-05, "loss": 0.3937, "step": 1236 }, { "epoch": 0.5911235887939789, "grad_norm": 0.6596621571985957, "learning_rate": 1.8843994132410143e-05, "loss": 0.403, "step": 1237 }, { "epoch": 0.591601457499552, "grad_norm": 0.5701923237556955, "learning_rate": 1.884153476984422e-05, "loss": 0.4008, "step": 1238 }, { "epoch": 0.5920793262051252, "grad_norm": 0.5930242333237962, "learning_rate": 1.883907295479251e-05, "loss": 0.3795, "step": 1239 }, { "epoch": 0.5925571949106982, "grad_norm": 0.7314294978915366, "learning_rate": 1.8836608687937883e-05, "loss": 0.3928, "step": 1240 }, { "epoch": 0.5930350636162715, "grad_norm": 0.5401549968229234, "learning_rate": 1.883414196996388e-05, "loss": 0.3828, "step": 1241 }, { "epoch": 0.5935129323218445, "grad_norm": 0.5649220265715015, "learning_rate": 1.8831672801554726e-05, "loss": 0.4082, "step": 1242 }, { "epoch": 0.5939908010274177, "grad_norm": 0.5284937507489004, "learning_rate": 1.882920118339533e-05, "loss": 0.3894, "step": 1243 }, { "epoch": 0.5944686697329908, "grad_norm": 0.5645488013407426, "learning_rate": 1.8826727116171264e-05, "loss": 0.3931, "step": 1244 }, { "epoch": 0.594946538438564, "grad_norm": 0.5376862100448556, "learning_rate": 1.8824250600568798e-05, "loss": 0.3859, "step": 1245 }, { "epoch": 0.5954244071441371, "grad_norm": 0.601826503242195, "learning_rate": 1.8821771637274874e-05, "loss": 0.3896, "step": 1246 }, { "epoch": 0.5959022758497103, "grad_norm": 0.547959448752086, "learning_rate": 1.881929022697711e-05, "loss": 0.3716, "step": 1247 }, { "epoch": 0.5963801445552834, "grad_norm": 0.5519251178713137, "learning_rate": 1.881680637036381e-05, "loss": 0.4019, "step": 1248 }, { "epoch": 0.5968580132608566, "grad_norm": 0.5714630327545864, "learning_rate": 1.8814320068123945e-05, "loss": 0.3937, "step": 1249 }, { "epoch": 0.5973358819664297, "grad_norm": 0.5470336630560245, "learning_rate": 1.8811831320947177e-05, "loss": 0.4043, "step": 1250 }, { "epoch": 0.5978137506720028, "grad_norm": 0.5569332027496797, "learning_rate": 1.8809340129523835e-05, "loss": 0.3718, "step": 1251 }, { "epoch": 0.598291619377576, "grad_norm": 0.5598645339474388, "learning_rate": 1.880684649454494e-05, "loss": 0.4028, "step": 1252 }, { "epoch": 0.5987694880831491, "grad_norm": 0.5364480060048303, "learning_rate": 1.8804350416702174e-05, "loss": 0.3782, "step": 1253 }, { "epoch": 0.5992473567887223, "grad_norm": 0.5873593424397728, "learning_rate": 1.880185189668791e-05, "loss": 0.392, "step": 1254 }, { "epoch": 0.5997252254942954, "grad_norm": 0.5235640424479179, "learning_rate": 1.879935093519519e-05, "loss": 0.3805, "step": 1255 }, { "epoch": 0.6002030941998686, "grad_norm": 0.620909525268386, "learning_rate": 1.8796847532917743e-05, "loss": 0.4068, "step": 1256 }, { "epoch": 0.6006809629054417, "grad_norm": 0.5489235045804418, "learning_rate": 1.879434169054996e-05, "loss": 0.3848, "step": 1257 }, { "epoch": 0.6011588316110149, "grad_norm": 0.5347491918019722, "learning_rate": 1.8791833408786922e-05, "loss": 0.3819, "step": 1258 }, { "epoch": 0.601636700316588, "grad_norm": 0.5915201608837969, "learning_rate": 1.878932268832438e-05, "loss": 0.3951, "step": 1259 }, { "epoch": 0.6021145690221612, "grad_norm": 0.5356326976446867, "learning_rate": 1.878680952985877e-05, "loss": 0.3905, "step": 1260 }, { "epoch": 0.6025924377277343, "grad_norm": 0.6181767591406878, "learning_rate": 1.878429393408719e-05, "loss": 0.4024, "step": 1261 }, { "epoch": 0.6030703064333075, "grad_norm": 0.5632469969713791, "learning_rate": 1.8781775901707425e-05, "loss": 0.3937, "step": 1262 }, { "epoch": 0.6035481751388806, "grad_norm": 0.5877175455362745, "learning_rate": 1.8779255433417935e-05, "loss": 0.3943, "step": 1263 }, { "epoch": 0.6040260438444537, "grad_norm": 0.5503903372236624, "learning_rate": 1.8776732529917846e-05, "loss": 0.3984, "step": 1264 }, { "epoch": 0.6045039125500269, "grad_norm": 0.5649581602924505, "learning_rate": 1.8774207191906976e-05, "loss": 0.4017, "step": 1265 }, { "epoch": 0.6049817812556, "grad_norm": 0.563853155134363, "learning_rate": 1.8771679420085805e-05, "loss": 0.3891, "step": 1266 }, { "epoch": 0.6054596499611732, "grad_norm": 0.5651839005098936, "learning_rate": 1.8769149215155497e-05, "loss": 0.3913, "step": 1267 }, { "epoch": 0.6059375186667463, "grad_norm": 0.5963210739445259, "learning_rate": 1.8766616577817875e-05, "loss": 0.4133, "step": 1268 }, { "epoch": 0.6064153873723195, "grad_norm": 0.55539068455758, "learning_rate": 1.876408150877546e-05, "loss": 0.3797, "step": 1269 }, { "epoch": 0.6068932560778926, "grad_norm": 0.5856042654690761, "learning_rate": 1.8761544008731426e-05, "loss": 0.4058, "step": 1270 }, { "epoch": 0.6073711247834658, "grad_norm": 0.542197554269779, "learning_rate": 1.8759004078389635e-05, "loss": 0.39, "step": 1271 }, { "epoch": 0.6078489934890389, "grad_norm": 0.5353927253355668, "learning_rate": 1.8756461718454622e-05, "loss": 0.4147, "step": 1272 }, { "epoch": 0.6083268621946121, "grad_norm": 0.7529582857438246, "learning_rate": 1.8753916929631586e-05, "loss": 0.4046, "step": 1273 }, { "epoch": 0.6088047309001852, "grad_norm": 0.5958352414182106, "learning_rate": 1.8751369712626413e-05, "loss": 0.385, "step": 1274 }, { "epoch": 0.6092825996057584, "grad_norm": 0.5841015694045317, "learning_rate": 1.874882006814565e-05, "loss": 0.3854, "step": 1275 }, { "epoch": 0.6097604683113315, "grad_norm": 0.6682019912149343, "learning_rate": 1.874626799689653e-05, "loss": 0.3775, "step": 1276 }, { "epoch": 0.6102383370169046, "grad_norm": 0.5543315924200671, "learning_rate": 1.874371349958695e-05, "loss": 0.3767, "step": 1277 }, { "epoch": 0.6107162057224778, "grad_norm": 0.5842970394229539, "learning_rate": 1.874115657692548e-05, "loss": 0.3992, "step": 1278 }, { "epoch": 0.6111940744280508, "grad_norm": 0.5711349862291916, "learning_rate": 1.8738597229621368e-05, "loss": 0.3913, "step": 1279 }, { "epoch": 0.611671943133624, "grad_norm": 0.6023071222272888, "learning_rate": 1.8736035458384528e-05, "loss": 0.3896, "step": 1280 }, { "epoch": 0.6121498118391971, "grad_norm": 0.5777611740504066, "learning_rate": 1.8733471263925553e-05, "loss": 0.4059, "step": 1281 }, { "epoch": 0.6126276805447703, "grad_norm": 0.5729042128264256, "learning_rate": 1.8730904646955706e-05, "loss": 0.4117, "step": 1282 }, { "epoch": 0.6131055492503434, "grad_norm": 0.5307580555984255, "learning_rate": 1.8728335608186923e-05, "loss": 0.3967, "step": 1283 }, { "epoch": 0.6135834179559166, "grad_norm": 0.5972005721716636, "learning_rate": 1.8725764148331804e-05, "loss": 0.3851, "step": 1284 }, { "epoch": 0.6140612866614897, "grad_norm": 0.5486242431937549, "learning_rate": 1.8723190268103634e-05, "loss": 0.3921, "step": 1285 }, { "epoch": 0.6145391553670629, "grad_norm": 0.5129417046829837, "learning_rate": 1.8720613968216356e-05, "loss": 0.3799, "step": 1286 }, { "epoch": 0.615017024072636, "grad_norm": 0.5389718288002542, "learning_rate": 1.871803524938459e-05, "loss": 0.3784, "step": 1287 }, { "epoch": 0.6154948927782092, "grad_norm": 0.5413932174652784, "learning_rate": 1.871545411232363e-05, "loss": 0.3883, "step": 1288 }, { "epoch": 0.6159727614837823, "grad_norm": 0.5576540640582818, "learning_rate": 1.871287055774944e-05, "loss": 0.3725, "step": 1289 }, { "epoch": 0.6164506301893554, "grad_norm": 0.5438843907191524, "learning_rate": 1.8710284586378645e-05, "loss": 0.3853, "step": 1290 }, { "epoch": 0.6169284988949286, "grad_norm": 0.5921910838665108, "learning_rate": 1.870769619892856e-05, "loss": 0.3881, "step": 1291 }, { "epoch": 0.6174063676005017, "grad_norm": 0.5281942703357103, "learning_rate": 1.8705105396117145e-05, "loss": 0.4026, "step": 1292 }, { "epoch": 0.6178842363060749, "grad_norm": 0.6001108596932736, "learning_rate": 1.870251217866305e-05, "loss": 0.3875, "step": 1293 }, { "epoch": 0.618362105011648, "grad_norm": 0.610945572306042, "learning_rate": 1.8699916547285583e-05, "loss": 0.3812, "step": 1294 }, { "epoch": 0.6188399737172212, "grad_norm": 0.5695158823092838, "learning_rate": 1.8697318502704734e-05, "loss": 0.371, "step": 1295 }, { "epoch": 0.6193178424227943, "grad_norm": 0.569180451522841, "learning_rate": 1.869471804564115e-05, "loss": 0.4033, "step": 1296 }, { "epoch": 0.6197957111283675, "grad_norm": 0.5328007824386596, "learning_rate": 1.869211517681615e-05, "loss": 0.4016, "step": 1297 }, { "epoch": 0.6202735798339406, "grad_norm": 0.5787431750554224, "learning_rate": 1.868950989695173e-05, "loss": 0.3875, "step": 1298 }, { "epoch": 0.6207514485395138, "grad_norm": 0.7099261830662204, "learning_rate": 1.8686902206770542e-05, "loss": 0.3925, "step": 1299 }, { "epoch": 0.6212293172450869, "grad_norm": 0.5775144630618253, "learning_rate": 1.8684292106995916e-05, "loss": 0.3876, "step": 1300 }, { "epoch": 0.6217071859506601, "grad_norm": 0.5960304043808599, "learning_rate": 1.868167959835185e-05, "loss": 0.3966, "step": 1301 }, { "epoch": 0.6221850546562332, "grad_norm": 0.6193706361120256, "learning_rate": 1.8679064681563005e-05, "loss": 0.4009, "step": 1302 }, { "epoch": 0.6226629233618064, "grad_norm": 0.6051679338180642, "learning_rate": 1.867644735735471e-05, "loss": 0.385, "step": 1303 }, { "epoch": 0.6231407920673795, "grad_norm": 0.6745787304908575, "learning_rate": 1.8673827626452972e-05, "loss": 0.4009, "step": 1304 }, { "epoch": 0.6236186607729526, "grad_norm": 0.591651344835618, "learning_rate": 1.8671205489584453e-05, "loss": 0.3935, "step": 1305 }, { "epoch": 0.6240965294785258, "grad_norm": 0.5499144339680516, "learning_rate": 1.8668580947476487e-05, "loss": 0.3803, "step": 1306 }, { "epoch": 0.6245743981840989, "grad_norm": 0.5742789490169716, "learning_rate": 1.8665954000857077e-05, "loss": 0.3976, "step": 1307 }, { "epoch": 0.6250522668896721, "grad_norm": 0.5342157539830956, "learning_rate": 1.8663324650454896e-05, "loss": 0.3966, "step": 1308 }, { "epoch": 0.6255301355952452, "grad_norm": 0.557937880620906, "learning_rate": 1.8660692896999272e-05, "loss": 0.3891, "step": 1309 }, { "epoch": 0.6260080043008184, "grad_norm": 0.6049761505152792, "learning_rate": 1.865805874122021e-05, "loss": 0.3903, "step": 1310 }, { "epoch": 0.6264858730063915, "grad_norm": 0.5638630151330256, "learning_rate": 1.865542218384838e-05, "loss": 0.3987, "step": 1311 }, { "epoch": 0.6269637417119647, "grad_norm": 0.5558339859479475, "learning_rate": 1.865278322561512e-05, "loss": 0.3838, "step": 1312 }, { "epoch": 0.6274416104175378, "grad_norm": 0.7599896980096237, "learning_rate": 1.8650141867252418e-05, "loss": 0.3783, "step": 1313 }, { "epoch": 0.627919479123111, "grad_norm": 0.5377592981486465, "learning_rate": 1.8647498109492952e-05, "loss": 0.3748, "step": 1314 }, { "epoch": 0.6283973478286841, "grad_norm": 0.5747378065579319, "learning_rate": 1.8644851953070045e-05, "loss": 0.3797, "step": 1315 }, { "epoch": 0.6288752165342573, "grad_norm": 0.5611211100705439, "learning_rate": 1.8642203398717704e-05, "loss": 0.374, "step": 1316 }, { "epoch": 0.6293530852398304, "grad_norm": 0.5398727147760198, "learning_rate": 1.8639552447170586e-05, "loss": 0.4111, "step": 1317 }, { "epoch": 0.6298309539454034, "grad_norm": 0.6478100942138757, "learning_rate": 1.8636899099164016e-05, "loss": 0.373, "step": 1318 }, { "epoch": 0.6303088226509767, "grad_norm": 0.5239375476824424, "learning_rate": 1.863424335543399e-05, "loss": 0.3934, "step": 1319 }, { "epoch": 0.6307866913565497, "grad_norm": 0.5594162158750057, "learning_rate": 1.863158521671716e-05, "loss": 0.3914, "step": 1320 }, { "epoch": 0.631264560062123, "grad_norm": 0.5605106637954282, "learning_rate": 1.862892468375085e-05, "loss": 0.3907, "step": 1321 }, { "epoch": 0.631742428767696, "grad_norm": 0.5566958695945204, "learning_rate": 1.8626261757273047e-05, "loss": 0.3853, "step": 1322 }, { "epoch": 0.6322202974732692, "grad_norm": 0.5930367871951245, "learning_rate": 1.8623596438022395e-05, "loss": 0.3697, "step": 1323 }, { "epoch": 0.6326981661788423, "grad_norm": 0.5744435257090056, "learning_rate": 1.862092872673821e-05, "loss": 0.3937, "step": 1324 }, { "epoch": 0.6331760348844155, "grad_norm": 0.6885689441398278, "learning_rate": 1.8618258624160465e-05, "loss": 0.395, "step": 1325 }, { "epoch": 0.6336539035899886, "grad_norm": 0.5363058119426725, "learning_rate": 1.86155861310298e-05, "loss": 0.3835, "step": 1326 }, { "epoch": 0.6341317722955618, "grad_norm": 0.5648056274555979, "learning_rate": 1.8612911248087523e-05, "loss": 0.401, "step": 1327 }, { "epoch": 0.6346096410011349, "grad_norm": 0.5517054491297217, "learning_rate": 1.8610233976075595e-05, "loss": 0.4094, "step": 1328 }, { "epoch": 0.6350875097067081, "grad_norm": 0.610689280585854, "learning_rate": 1.860755431573664e-05, "loss": 0.4007, "step": 1329 }, { "epoch": 0.6355653784122812, "grad_norm": 0.5856790883141125, "learning_rate": 1.8604872267813954e-05, "loss": 0.3868, "step": 1330 }, { "epoch": 0.6360432471178543, "grad_norm": 0.5170823321710041, "learning_rate": 1.8602187833051487e-05, "loss": 0.3772, "step": 1331 }, { "epoch": 0.6365211158234275, "grad_norm": 0.6525237313804392, "learning_rate": 1.859950101219386e-05, "loss": 0.3696, "step": 1332 }, { "epoch": 0.6369989845290006, "grad_norm": 0.5357547476404161, "learning_rate": 1.859681180598634e-05, "loss": 0.3866, "step": 1333 }, { "epoch": 0.6374768532345738, "grad_norm": 0.5669350371510699, "learning_rate": 1.859412021517487e-05, "loss": 0.3887, "step": 1334 }, { "epoch": 0.6379547219401469, "grad_norm": 0.5936322359971947, "learning_rate": 1.859142624050605e-05, "loss": 0.4002, "step": 1335 }, { "epoch": 0.6384325906457201, "grad_norm": 0.572891703767859, "learning_rate": 1.8588729882727142e-05, "loss": 0.3722, "step": 1336 }, { "epoch": 0.6389104593512932, "grad_norm": 0.6958298176911252, "learning_rate": 1.8586031142586073e-05, "loss": 0.3872, "step": 1337 }, { "epoch": 0.6393883280568664, "grad_norm": 0.6299358564635291, "learning_rate": 1.858333002083141e-05, "loss": 0.4038, "step": 1338 }, { "epoch": 0.6398661967624395, "grad_norm": 0.5567224363270409, "learning_rate": 1.8580626518212413e-05, "loss": 0.3945, "step": 1339 }, { "epoch": 0.6403440654680127, "grad_norm": 0.5356857164607767, "learning_rate": 1.8577920635478976e-05, "loss": 0.4022, "step": 1340 }, { "epoch": 0.6408219341735858, "grad_norm": 0.5602058275360913, "learning_rate": 1.8575212373381672e-05, "loss": 0.3972, "step": 1341 }, { "epoch": 0.641299802879159, "grad_norm": 0.5391006769979308, "learning_rate": 1.8572501732671714e-05, "loss": 0.4078, "step": 1342 }, { "epoch": 0.6417776715847321, "grad_norm": 0.5274111377005911, "learning_rate": 1.8569788714100993e-05, "loss": 0.3887, "step": 1343 }, { "epoch": 0.6422555402903053, "grad_norm": 0.5803845464955352, "learning_rate": 1.8567073318422053e-05, "loss": 0.402, "step": 1344 }, { "epoch": 0.6427334089958784, "grad_norm": 0.5602953934235403, "learning_rate": 1.8564355546388094e-05, "loss": 0.4069, "step": 1345 }, { "epoch": 0.6432112777014515, "grad_norm": 0.5353517895711432, "learning_rate": 1.856163539875298e-05, "loss": 0.3795, "step": 1346 }, { "epoch": 0.6436891464070247, "grad_norm": 0.5562727303504088, "learning_rate": 1.855891287627123e-05, "loss": 0.388, "step": 1347 }, { "epoch": 0.6441670151125978, "grad_norm": 0.6540276248275614, "learning_rate": 1.8556187979698024e-05, "loss": 0.3935, "step": 1348 }, { "epoch": 0.644644883818171, "grad_norm": 0.5318329859411215, "learning_rate": 1.85534607097892e-05, "loss": 0.391, "step": 1349 }, { "epoch": 0.6451227525237441, "grad_norm": 0.5984520705371762, "learning_rate": 1.855073106730126e-05, "loss": 0.3743, "step": 1350 }, { "epoch": 0.6456006212293173, "grad_norm": 0.8656630009596894, "learning_rate": 1.8547999052991353e-05, "loss": 0.3831, "step": 1351 }, { "epoch": 0.6460784899348904, "grad_norm": 0.5900719253503339, "learning_rate": 1.854526466761729e-05, "loss": 0.391, "step": 1352 }, { "epoch": 0.6465563586404636, "grad_norm": 0.5874685110312172, "learning_rate": 1.8542527911937546e-05, "loss": 0.3891, "step": 1353 }, { "epoch": 0.6470342273460367, "grad_norm": 0.5208727410164187, "learning_rate": 1.8539788786711247e-05, "loss": 0.388, "step": 1354 }, { "epoch": 0.6475120960516099, "grad_norm": 0.5379067771394836, "learning_rate": 1.8537047292698175e-05, "loss": 0.3847, "step": 1355 }, { "epoch": 0.647989964757183, "grad_norm": 0.5686849032925404, "learning_rate": 1.853430343065878e-05, "loss": 0.3902, "step": 1356 }, { "epoch": 0.6484678334627562, "grad_norm": 0.5425524249952016, "learning_rate": 1.853155720135415e-05, "loss": 0.3964, "step": 1357 }, { "epoch": 0.6489457021683293, "grad_norm": 0.5446815678020227, "learning_rate": 1.8528808605546053e-05, "loss": 0.4028, "step": 1358 }, { "epoch": 0.6494235708739023, "grad_norm": 0.5502869577791677, "learning_rate": 1.852605764399689e-05, "loss": 0.4149, "step": 1359 }, { "epoch": 0.6499014395794755, "grad_norm": 0.5356342796004244, "learning_rate": 1.852330431746973e-05, "loss": 0.3927, "step": 1360 }, { "epoch": 0.6503793082850486, "grad_norm": 0.5620414346740084, "learning_rate": 1.852054862672831e-05, "loss": 0.3847, "step": 1361 }, { "epoch": 0.6508571769906218, "grad_norm": 0.5494824917990645, "learning_rate": 1.8517790572536996e-05, "loss": 0.3893, "step": 1362 }, { "epoch": 0.6513350456961949, "grad_norm": 0.6052620118835617, "learning_rate": 1.851503015566083e-05, "loss": 0.3853, "step": 1363 }, { "epoch": 0.6518129144017681, "grad_norm": 0.5723298054745771, "learning_rate": 1.85122673768655e-05, "loss": 0.3837, "step": 1364 }, { "epoch": 0.6522907831073412, "grad_norm": 1.0042668593415573, "learning_rate": 1.8509502236917353e-05, "loss": 0.3901, "step": 1365 }, { "epoch": 0.6527686518129144, "grad_norm": 0.7999380988229664, "learning_rate": 1.8506734736583388e-05, "loss": 0.3922, "step": 1366 }, { "epoch": 0.6532465205184875, "grad_norm": 0.583257327373365, "learning_rate": 1.850396487663127e-05, "loss": 0.3916, "step": 1367 }, { "epoch": 0.6537243892240607, "grad_norm": 0.563446303201131, "learning_rate": 1.85011926578293e-05, "loss": 0.3973, "step": 1368 }, { "epoch": 0.6542022579296338, "grad_norm": 0.5276619379868954, "learning_rate": 1.8498418080946444e-05, "loss": 0.3765, "step": 1369 }, { "epoch": 0.654680126635207, "grad_norm": 0.5838719171402456, "learning_rate": 1.8495641146752322e-05, "loss": 0.4094, "step": 1370 }, { "epoch": 0.6551579953407801, "grad_norm": 0.5690336348547724, "learning_rate": 1.8492861856017206e-05, "loss": 0.3861, "step": 1371 }, { "epoch": 0.6556358640463532, "grad_norm": 0.6359126253694918, "learning_rate": 1.8490080209512024e-05, "loss": 0.4018, "step": 1372 }, { "epoch": 0.6561137327519264, "grad_norm": 0.6504835153018964, "learning_rate": 1.848729620800835e-05, "loss": 0.4012, "step": 1373 }, { "epoch": 0.6565916014574995, "grad_norm": 0.5305539981752121, "learning_rate": 1.8484509852278426e-05, "loss": 0.387, "step": 1374 }, { "epoch": 0.6570694701630727, "grad_norm": 0.5413728281865292, "learning_rate": 1.848172114309513e-05, "loss": 0.3996, "step": 1375 }, { "epoch": 0.6575473388686458, "grad_norm": 0.5770591202114015, "learning_rate": 1.847893008123201e-05, "loss": 0.3918, "step": 1376 }, { "epoch": 0.658025207574219, "grad_norm": 0.5418743549514499, "learning_rate": 1.8476136667463246e-05, "loss": 0.3675, "step": 1377 }, { "epoch": 0.6585030762797921, "grad_norm": 0.5384300809696159, "learning_rate": 1.8473340902563686e-05, "loss": 0.3804, "step": 1378 }, { "epoch": 0.6589809449853653, "grad_norm": 0.582974175378902, "learning_rate": 1.847054278730883e-05, "loss": 0.3924, "step": 1379 }, { "epoch": 0.6594588136909384, "grad_norm": 0.521915823350355, "learning_rate": 1.8467742322474822e-05, "loss": 0.3678, "step": 1380 }, { "epoch": 0.6599366823965116, "grad_norm": 0.552505801390087, "learning_rate": 1.846493950883846e-05, "loss": 0.3673, "step": 1381 }, { "epoch": 0.6604145511020847, "grad_norm": 0.5738503571000322, "learning_rate": 1.84621343471772e-05, "loss": 0.3784, "step": 1382 }, { "epoch": 0.6608924198076579, "grad_norm": 0.543933586148958, "learning_rate": 1.8459326838269137e-05, "loss": 0.3883, "step": 1383 }, { "epoch": 0.661370288513231, "grad_norm": 0.5624988585079359, "learning_rate": 1.8456516982893036e-05, "loss": 0.3841, "step": 1384 }, { "epoch": 0.6618481572188041, "grad_norm": 0.8760123821285536, "learning_rate": 1.845370478182829e-05, "loss": 0.3789, "step": 1385 }, { "epoch": 0.6623260259243773, "grad_norm": 0.5255149377123545, "learning_rate": 1.8450890235854958e-05, "loss": 0.3995, "step": 1386 }, { "epoch": 0.6628038946299504, "grad_norm": 0.8238825404916186, "learning_rate": 1.8448073345753746e-05, "loss": 0.381, "step": 1387 }, { "epoch": 0.6632817633355236, "grad_norm": 0.5368697433670593, "learning_rate": 1.8445254112306013e-05, "loss": 0.3901, "step": 1388 }, { "epoch": 0.6637596320410967, "grad_norm": 0.5844500955136835, "learning_rate": 1.8442432536293756e-05, "loss": 0.3841, "step": 1389 }, { "epoch": 0.6642375007466699, "grad_norm": 0.5317108098879734, "learning_rate": 1.8439608618499637e-05, "loss": 0.3846, "step": 1390 }, { "epoch": 0.664715369452243, "grad_norm": 0.5486204761949643, "learning_rate": 1.843678235970696e-05, "loss": 0.4078, "step": 1391 }, { "epoch": 0.6651932381578162, "grad_norm": 0.5621459280922341, "learning_rate": 1.8433953760699678e-05, "loss": 0.3711, "step": 1392 }, { "epoch": 0.6656711068633893, "grad_norm": 0.626320660146386, "learning_rate": 1.8431122822262398e-05, "loss": 0.3721, "step": 1393 }, { "epoch": 0.6661489755689625, "grad_norm": 0.5634692726095906, "learning_rate": 1.8428289545180367e-05, "loss": 0.3997, "step": 1394 }, { "epoch": 0.6666268442745356, "grad_norm": 0.5806101182458545, "learning_rate": 1.842545393023949e-05, "loss": 0.3834, "step": 1395 }, { "epoch": 0.6671047129801088, "grad_norm": 0.5508052241942929, "learning_rate": 1.8422615978226313e-05, "loss": 0.398, "step": 1396 }, { "epoch": 0.6675825816856819, "grad_norm": 0.5741650592949158, "learning_rate": 1.8419775689928035e-05, "loss": 0.3806, "step": 1397 }, { "epoch": 0.6680604503912551, "grad_norm": 0.5163994653334596, "learning_rate": 1.8416933066132507e-05, "loss": 0.3815, "step": 1398 }, { "epoch": 0.6685383190968281, "grad_norm": 0.6173088911528165, "learning_rate": 1.8414088107628215e-05, "loss": 0.3758, "step": 1399 }, { "epoch": 0.6690161878024012, "grad_norm": 0.6040108460649616, "learning_rate": 1.841124081520431e-05, "loss": 0.3933, "step": 1400 }, { "epoch": 0.6694940565079744, "grad_norm": 0.6180489249655141, "learning_rate": 1.840839118965057e-05, "loss": 0.3931, "step": 1401 }, { "epoch": 0.6699719252135475, "grad_norm": 0.5294472563078757, "learning_rate": 1.8405539231757435e-05, "loss": 0.3943, "step": 1402 }, { "epoch": 0.6704497939191207, "grad_norm": 0.6292773643061201, "learning_rate": 1.840268494231599e-05, "loss": 0.3938, "step": 1403 }, { "epoch": 0.6709276626246938, "grad_norm": 0.713996443805077, "learning_rate": 1.839982832211796e-05, "loss": 0.3781, "step": 1404 }, { "epoch": 0.671405531330267, "grad_norm": 0.5799035626176681, "learning_rate": 1.8396969371955724e-05, "loss": 0.4023, "step": 1405 }, { "epoch": 0.6718834000358401, "grad_norm": 0.5700183080878829, "learning_rate": 1.8394108092622307e-05, "loss": 0.3844, "step": 1406 }, { "epoch": 0.6723612687414133, "grad_norm": 0.6189918262496643, "learning_rate": 1.839124448491137e-05, "loss": 0.3948, "step": 1407 }, { "epoch": 0.6728391374469864, "grad_norm": 0.5222965203774697, "learning_rate": 1.8388378549617238e-05, "loss": 0.4008, "step": 1408 }, { "epoch": 0.6733170061525596, "grad_norm": 0.7906792457229658, "learning_rate": 1.838551028753486e-05, "loss": 0.3897, "step": 1409 }, { "epoch": 0.6737948748581327, "grad_norm": 0.6020606169233097, "learning_rate": 1.838263969945985e-05, "loss": 0.3839, "step": 1410 }, { "epoch": 0.6742727435637059, "grad_norm": 0.5357579566712205, "learning_rate": 1.8379766786188457e-05, "loss": 0.3836, "step": 1411 }, { "epoch": 0.674750612269279, "grad_norm": 0.5633942493212672, "learning_rate": 1.8376891548517566e-05, "loss": 0.3847, "step": 1412 }, { "epoch": 0.6752284809748521, "grad_norm": 0.5798470271651702, "learning_rate": 1.837401398724473e-05, "loss": 0.3813, "step": 1413 }, { "epoch": 0.6757063496804253, "grad_norm": 0.5636278137204155, "learning_rate": 1.837113410316813e-05, "loss": 0.3861, "step": 1414 }, { "epoch": 0.6761842183859984, "grad_norm": 0.6008374318361374, "learning_rate": 1.836825189708659e-05, "loss": 0.3771, "step": 1415 }, { "epoch": 0.6766620870915716, "grad_norm": 0.5521729571309322, "learning_rate": 1.836536736979959e-05, "loss": 0.3854, "step": 1416 }, { "epoch": 0.6771399557971447, "grad_norm": 0.557361883029169, "learning_rate": 1.8362480522107244e-05, "loss": 0.3679, "step": 1417 }, { "epoch": 0.6776178245027179, "grad_norm": 0.5435824654885472, "learning_rate": 1.8359591354810313e-05, "loss": 0.391, "step": 1418 }, { "epoch": 0.678095693208291, "grad_norm": 0.5417613234703497, "learning_rate": 1.8356699868710198e-05, "loss": 0.3881, "step": 1419 }, { "epoch": 0.6785735619138642, "grad_norm": 0.5308236388209336, "learning_rate": 1.8353806064608953e-05, "loss": 0.4036, "step": 1420 }, { "epoch": 0.6790514306194373, "grad_norm": 0.5884130068215864, "learning_rate": 1.8350909943309262e-05, "loss": 0.3732, "step": 1421 }, { "epoch": 0.6795292993250105, "grad_norm": 0.5409490290787906, "learning_rate": 1.8348011505614462e-05, "loss": 0.3896, "step": 1422 }, { "epoch": 0.6800071680305836, "grad_norm": 0.5970144011295364, "learning_rate": 1.8345110752328527e-05, "loss": 0.3834, "step": 1423 }, { "epoch": 0.6804850367361568, "grad_norm": 0.5261236913058273, "learning_rate": 1.834220768425607e-05, "loss": 0.389, "step": 1424 }, { "epoch": 0.6809629054417299, "grad_norm": 0.5800849754387479, "learning_rate": 1.833930230220236e-05, "loss": 0.3769, "step": 1425 }, { "epoch": 0.681440774147303, "grad_norm": 0.5513763570891704, "learning_rate": 1.8336394606973293e-05, "loss": 0.3755, "step": 1426 }, { "epoch": 0.6819186428528762, "grad_norm": 0.5923393622326396, "learning_rate": 1.8333484599375414e-05, "loss": 0.3781, "step": 1427 }, { "epoch": 0.6823965115584493, "grad_norm": 0.6542084688710019, "learning_rate": 1.8330572280215904e-05, "loss": 0.3802, "step": 1428 }, { "epoch": 0.6828743802640225, "grad_norm": 0.5657355120408663, "learning_rate": 1.8327657650302596e-05, "loss": 0.3916, "step": 1429 }, { "epoch": 0.6833522489695956, "grad_norm": 0.6676995536636485, "learning_rate": 1.8324740710443955e-05, "loss": 0.3841, "step": 1430 }, { "epoch": 0.6838301176751688, "grad_norm": 0.5639564465708701, "learning_rate": 1.8321821461449084e-05, "loss": 0.3787, "step": 1431 }, { "epoch": 0.6843079863807419, "grad_norm": 0.5518394477924401, "learning_rate": 1.831889990412773e-05, "loss": 0.389, "step": 1432 }, { "epoch": 0.6847858550863151, "grad_norm": 0.5756892172775409, "learning_rate": 1.831597603929029e-05, "loss": 0.391, "step": 1433 }, { "epoch": 0.6852637237918882, "grad_norm": 0.538534263526804, "learning_rate": 1.8313049867747788e-05, "loss": 0.3897, "step": 1434 }, { "epoch": 0.6857415924974614, "grad_norm": 0.5680170895960552, "learning_rate": 1.831012139031189e-05, "loss": 0.3967, "step": 1435 }, { "epoch": 0.6862194612030345, "grad_norm": 0.586463487068828, "learning_rate": 1.830719060779491e-05, "loss": 0.3813, "step": 1436 }, { "epoch": 0.6866973299086077, "grad_norm": 0.6179525461545747, "learning_rate": 1.830425752100979e-05, "loss": 0.3918, "step": 1437 }, { "epoch": 0.6871751986141807, "grad_norm": 0.6674306038089053, "learning_rate": 1.8301322130770117e-05, "loss": 0.3828, "step": 1438 }, { "epoch": 0.687653067319754, "grad_norm": 2.332261711848657, "learning_rate": 1.829838443789012e-05, "loss": 0.3838, "step": 1439 }, { "epoch": 0.688130936025327, "grad_norm": 0.6981750566847236, "learning_rate": 1.829544444318466e-05, "loss": 0.3946, "step": 1440 }, { "epoch": 0.6886088047309001, "grad_norm": 0.5303694950239685, "learning_rate": 1.829250214746924e-05, "loss": 0.3809, "step": 1441 }, { "epoch": 0.6890866734364733, "grad_norm": 0.7179900691839765, "learning_rate": 1.8289557551560002e-05, "loss": 0.3862, "step": 1442 }, { "epoch": 0.6895645421420464, "grad_norm": 0.5345675046456023, "learning_rate": 1.8286610656273724e-05, "loss": 0.3658, "step": 1443 }, { "epoch": 0.6900424108476196, "grad_norm": 0.6704734216159564, "learning_rate": 1.828366146242782e-05, "loss": 0.4057, "step": 1444 }, { "epoch": 0.6905202795531927, "grad_norm": 0.6051417776741872, "learning_rate": 1.8280709970840352e-05, "loss": 0.4059, "step": 1445 }, { "epoch": 0.6909981482587659, "grad_norm": 0.5450521205173916, "learning_rate": 1.8277756182330008e-05, "loss": 0.3914, "step": 1446 }, { "epoch": 0.691476016964339, "grad_norm": 0.6382596888643569, "learning_rate": 1.8274800097716113e-05, "loss": 0.3682, "step": 1447 }, { "epoch": 0.6919538856699122, "grad_norm": 0.5469859901894466, "learning_rate": 1.827184171781864e-05, "loss": 0.396, "step": 1448 }, { "epoch": 0.6924317543754853, "grad_norm": 0.6169779108480756, "learning_rate": 1.8268881043458183e-05, "loss": 0.3852, "step": 1449 }, { "epoch": 0.6929096230810585, "grad_norm": 0.5515949799511892, "learning_rate": 1.8265918075455985e-05, "loss": 0.3879, "step": 1450 }, { "epoch": 0.6933874917866316, "grad_norm": 0.5852784051902299, "learning_rate": 1.8262952814633927e-05, "loss": 0.3777, "step": 1451 }, { "epoch": 0.6938653604922048, "grad_norm": 0.5747324569861376, "learning_rate": 1.8259985261814506e-05, "loss": 0.3708, "step": 1452 }, { "epoch": 0.6943432291977779, "grad_norm": 0.5584613448774921, "learning_rate": 1.825701541782088e-05, "loss": 0.3939, "step": 1453 }, { "epoch": 0.694821097903351, "grad_norm": 0.6166551656589301, "learning_rate": 1.825404328347683e-05, "loss": 0.3956, "step": 1454 }, { "epoch": 0.6952989666089242, "grad_norm": 0.5834938296403458, "learning_rate": 1.8251068859606777e-05, "loss": 0.3797, "step": 1455 }, { "epoch": 0.6957768353144973, "grad_norm": 0.6153031826664653, "learning_rate": 1.8248092147035762e-05, "loss": 0.3873, "step": 1456 }, { "epoch": 0.6962547040200705, "grad_norm": 0.5549036883726521, "learning_rate": 1.8245113146589478e-05, "loss": 0.3811, "step": 1457 }, { "epoch": 0.6967325727256436, "grad_norm": 0.6100500222117385, "learning_rate": 1.8242131859094253e-05, "loss": 0.3739, "step": 1458 }, { "epoch": 0.6972104414312168, "grad_norm": 0.5644164022778724, "learning_rate": 1.823914828537704e-05, "loss": 0.3805, "step": 1459 }, { "epoch": 0.6976883101367899, "grad_norm": 0.5720320474251424, "learning_rate": 1.823616242626542e-05, "loss": 0.3716, "step": 1460 }, { "epoch": 0.6981661788423631, "grad_norm": 0.5353119220152026, "learning_rate": 1.8233174282587636e-05, "loss": 0.3736, "step": 1461 }, { "epoch": 0.6986440475479362, "grad_norm": 0.6632595208658023, "learning_rate": 1.823018385517253e-05, "loss": 0.3728, "step": 1462 }, { "epoch": 0.6991219162535094, "grad_norm": 0.5332034499910223, "learning_rate": 1.8227191144849606e-05, "loss": 0.3905, "step": 1463 }, { "epoch": 0.6995997849590825, "grad_norm": 0.6114377061257181, "learning_rate": 1.822419615244898e-05, "loss": 0.3729, "step": 1464 }, { "epoch": 0.7000776536646557, "grad_norm": 0.5341668385517253, "learning_rate": 1.8221198878801415e-05, "loss": 0.3839, "step": 1465 }, { "epoch": 0.7005555223702288, "grad_norm": 0.6080766324288399, "learning_rate": 1.82181993247383e-05, "loss": 0.3795, "step": 1466 }, { "epoch": 0.7010333910758019, "grad_norm": 0.5588790456997228, "learning_rate": 1.8215197491091657e-05, "loss": 0.4014, "step": 1467 }, { "epoch": 0.7015112597813751, "grad_norm": 0.5512763293405722, "learning_rate": 1.8212193378694145e-05, "loss": 0.3927, "step": 1468 }, { "epoch": 0.7019891284869482, "grad_norm": 0.5615590012756244, "learning_rate": 1.8209186988379053e-05, "loss": 0.3922, "step": 1469 }, { "epoch": 0.7024669971925214, "grad_norm": 0.5651868825673871, "learning_rate": 1.8206178320980295e-05, "loss": 0.3872, "step": 1470 }, { "epoch": 0.7029448658980945, "grad_norm": 0.5672597491790865, "learning_rate": 1.8203167377332428e-05, "loss": 0.3755, "step": 1471 }, { "epoch": 0.7034227346036677, "grad_norm": 0.6384822322202582, "learning_rate": 1.820015415827063e-05, "loss": 0.388, "step": 1472 }, { "epoch": 0.7039006033092408, "grad_norm": 0.6101936905626903, "learning_rate": 1.8197138664630714e-05, "loss": 0.4005, "step": 1473 }, { "epoch": 0.704378472014814, "grad_norm": 0.5780890528276408, "learning_rate": 1.819412089724913e-05, "loss": 0.3919, "step": 1474 }, { "epoch": 0.704856340720387, "grad_norm": 0.5709336635510021, "learning_rate": 1.819110085696295e-05, "loss": 0.3851, "step": 1475 }, { "epoch": 0.7053342094259603, "grad_norm": 0.5653007211064114, "learning_rate": 1.8188078544609885e-05, "loss": 0.3736, "step": 1476 }, { "epoch": 0.7058120781315333, "grad_norm": 0.5489304827020616, "learning_rate": 1.8185053961028262e-05, "loss": 0.3939, "step": 1477 }, { "epoch": 0.7062899468371066, "grad_norm": 0.57579168376753, "learning_rate": 1.8182027107057054e-05, "loss": 0.3797, "step": 1478 }, { "epoch": 0.7067678155426796, "grad_norm": 0.5694198738113794, "learning_rate": 1.8178997983535852e-05, "loss": 0.3695, "step": 1479 }, { "epoch": 0.7072456842482527, "grad_norm": 0.6271433046445659, "learning_rate": 1.817596659130489e-05, "loss": 0.3941, "step": 1480 }, { "epoch": 0.7077235529538259, "grad_norm": 0.5356698848547248, "learning_rate": 1.8172932931205018e-05, "loss": 0.3814, "step": 1481 }, { "epoch": 0.708201421659399, "grad_norm": 0.5350117309625678, "learning_rate": 1.8169897004077714e-05, "loss": 0.3684, "step": 1482 }, { "epoch": 0.7086792903649722, "grad_norm": 0.563828736209021, "learning_rate": 1.8166858810765093e-05, "loss": 0.3759, "step": 1483 }, { "epoch": 0.7091571590705453, "grad_norm": 0.5015045069229582, "learning_rate": 1.8163818352109905e-05, "loss": 0.3837, "step": 1484 }, { "epoch": 0.7096350277761185, "grad_norm": 0.5494681307134424, "learning_rate": 1.816077562895551e-05, "loss": 0.3835, "step": 1485 }, { "epoch": 0.7101128964816916, "grad_norm": 0.8231790789588842, "learning_rate": 1.8157730642145912e-05, "loss": 0.3928, "step": 1486 }, { "epoch": 0.7105907651872648, "grad_norm": 0.5469496135283491, "learning_rate": 1.815468339252573e-05, "loss": 0.3996, "step": 1487 }, { "epoch": 0.7110686338928379, "grad_norm": 0.5496003239400815, "learning_rate": 1.8151633880940226e-05, "loss": 0.3915, "step": 1488 }, { "epoch": 0.7115465025984111, "grad_norm": 0.5228411620381861, "learning_rate": 1.8148582108235274e-05, "loss": 0.3725, "step": 1489 }, { "epoch": 0.7120243713039842, "grad_norm": 0.5704507706283268, "learning_rate": 1.814552807525738e-05, "loss": 0.3919, "step": 1490 }, { "epoch": 0.7125022400095574, "grad_norm": 0.648766640982509, "learning_rate": 1.8142471782853686e-05, "loss": 0.3908, "step": 1491 }, { "epoch": 0.7129801087151305, "grad_norm": 0.8667271303998992, "learning_rate": 1.813941323187195e-05, "loss": 0.386, "step": 1492 }, { "epoch": 0.7134579774207037, "grad_norm": 0.5901056863467703, "learning_rate": 1.813635242316056e-05, "loss": 0.3683, "step": 1493 }, { "epoch": 0.7139358461262768, "grad_norm": 0.5222770355952134, "learning_rate": 1.813328935756853e-05, "loss": 0.396, "step": 1494 }, { "epoch": 0.7144137148318499, "grad_norm": 0.6020991844265385, "learning_rate": 1.81302240359455e-05, "loss": 0.3642, "step": 1495 }, { "epoch": 0.7148915835374231, "grad_norm": 0.573707434054128, "learning_rate": 1.812715645914174e-05, "loss": 0.3872, "step": 1496 }, { "epoch": 0.7153694522429962, "grad_norm": 0.5132272553395804, "learning_rate": 1.8124086628008137e-05, "loss": 0.3992, "step": 1497 }, { "epoch": 0.7158473209485694, "grad_norm": 0.5814015590506686, "learning_rate": 1.812101454339621e-05, "loss": 0.3813, "step": 1498 }, { "epoch": 0.7163251896541425, "grad_norm": 0.5029998615126221, "learning_rate": 1.81179402061581e-05, "loss": 0.3932, "step": 1499 }, { "epoch": 0.7168030583597157, "grad_norm": 0.5827864769681101, "learning_rate": 1.8114863617146576e-05, "loss": 0.3809, "step": 1500 }, { "epoch": 0.7172809270652888, "grad_norm": 0.6147481989073579, "learning_rate": 1.8111784777215028e-05, "loss": 0.3914, "step": 1501 }, { "epoch": 0.717758795770862, "grad_norm": 0.6022165391681865, "learning_rate": 1.8108703687217474e-05, "loss": 0.3926, "step": 1502 }, { "epoch": 0.7182366644764351, "grad_norm": 0.5749879738062318, "learning_rate": 1.8105620348008548e-05, "loss": 0.3698, "step": 1503 }, { "epoch": 0.7187145331820083, "grad_norm": 0.5144494515929651, "learning_rate": 1.810253476044352e-05, "loss": 0.3903, "step": 1504 }, { "epoch": 0.7191924018875814, "grad_norm": 0.6680656863681617, "learning_rate": 1.8099446925378278e-05, "loss": 0.4031, "step": 1505 }, { "epoch": 0.7196702705931546, "grad_norm": 0.5363554834856091, "learning_rate": 1.809635684366933e-05, "loss": 0.3897, "step": 1506 }, { "epoch": 0.7201481392987277, "grad_norm": 0.57060574097619, "learning_rate": 1.809326451617381e-05, "loss": 0.3847, "step": 1507 }, { "epoch": 0.7206260080043008, "grad_norm": 0.5082209282023774, "learning_rate": 1.8090169943749477e-05, "loss": 0.386, "step": 1508 }, { "epoch": 0.721103876709874, "grad_norm": 0.531721764246233, "learning_rate": 1.8087073127254708e-05, "loss": 0.379, "step": 1509 }, { "epoch": 0.7215817454154471, "grad_norm": 0.5672223837229015, "learning_rate": 1.8083974067548506e-05, "loss": 0.3743, "step": 1510 }, { "epoch": 0.7220596141210203, "grad_norm": 0.557363288254791, "learning_rate": 1.80808727654905e-05, "loss": 0.3922, "step": 1511 }, { "epoch": 0.7225374828265934, "grad_norm": 0.5524986533091046, "learning_rate": 1.807776922194093e-05, "loss": 0.3828, "step": 1512 }, { "epoch": 0.7230153515321666, "grad_norm": 0.5624030376925013, "learning_rate": 1.807466343776067e-05, "loss": 0.3879, "step": 1513 }, { "epoch": 0.7234932202377397, "grad_norm": 0.5587652959424021, "learning_rate": 1.8071555413811202e-05, "loss": 0.3977, "step": 1514 }, { "epoch": 0.7239710889433129, "grad_norm": 0.5243505053339486, "learning_rate": 1.806844515095465e-05, "loss": 0.3756, "step": 1515 }, { "epoch": 0.724448957648886, "grad_norm": 0.5136588134139919, "learning_rate": 1.8065332650053733e-05, "loss": 0.386, "step": 1516 }, { "epoch": 0.7249268263544592, "grad_norm": 0.765740907023463, "learning_rate": 1.806221791197181e-05, "loss": 0.3731, "step": 1517 }, { "epoch": 0.7254046950600322, "grad_norm": 0.5765742196934072, "learning_rate": 1.8059100937572853e-05, "loss": 0.3859, "step": 1518 }, { "epoch": 0.7258825637656054, "grad_norm": 0.508218622133711, "learning_rate": 1.8055981727721454e-05, "loss": 0.3887, "step": 1519 }, { "epoch": 0.7263604324711785, "grad_norm": 0.5169436575801677, "learning_rate": 1.8052860283282832e-05, "loss": 0.3969, "step": 1520 }, { "epoch": 0.7268383011767516, "grad_norm": 0.5627369158971857, "learning_rate": 1.8049736605122814e-05, "loss": 0.3957, "step": 1521 }, { "epoch": 0.7273161698823248, "grad_norm": 0.5642116374092477, "learning_rate": 1.804661069410786e-05, "loss": 0.3877, "step": 1522 }, { "epoch": 0.7277940385878979, "grad_norm": 0.5632828254864813, "learning_rate": 1.8043482551105038e-05, "loss": 0.3846, "step": 1523 }, { "epoch": 0.7282719072934711, "grad_norm": 0.6720264991530787, "learning_rate": 1.8040352176982042e-05, "loss": 0.3742, "step": 1524 }, { "epoch": 0.7287497759990442, "grad_norm": 0.5295580482800217, "learning_rate": 1.8037219572607177e-05, "loss": 0.3812, "step": 1525 }, { "epoch": 0.7292276447046174, "grad_norm": 0.5343714521991437, "learning_rate": 1.803408473884938e-05, "loss": 0.3767, "step": 1526 }, { "epoch": 0.7297055134101905, "grad_norm": 0.5713754536699026, "learning_rate": 1.8030947676578198e-05, "loss": 0.3829, "step": 1527 }, { "epoch": 0.7301833821157637, "grad_norm": 0.5439449953744703, "learning_rate": 1.802780838666379e-05, "loss": 0.3883, "step": 1528 }, { "epoch": 0.7306612508213368, "grad_norm": 1.4893821900130726, "learning_rate": 1.8024666869976946e-05, "loss": 0.3874, "step": 1529 }, { "epoch": 0.73113911952691, "grad_norm": 0.5642566066926745, "learning_rate": 1.8021523127389066e-05, "loss": 0.3707, "step": 1530 }, { "epoch": 0.7316169882324831, "grad_norm": 0.5131365637356557, "learning_rate": 1.8018377159772163e-05, "loss": 0.3842, "step": 1531 }, { "epoch": 0.7320948569380563, "grad_norm": 0.6078967074642584, "learning_rate": 1.801522896799888e-05, "loss": 0.3802, "step": 1532 }, { "epoch": 0.7325727256436294, "grad_norm": 0.5556930415295182, "learning_rate": 1.801207855294247e-05, "loss": 0.3974, "step": 1533 }, { "epoch": 0.7330505943492026, "grad_norm": 0.5433320644721947, "learning_rate": 1.8008925915476795e-05, "loss": 0.3817, "step": 1534 }, { "epoch": 0.7335284630547757, "grad_norm": 0.5725907910354026, "learning_rate": 1.800577105647635e-05, "loss": 0.3862, "step": 1535 }, { "epoch": 0.7340063317603488, "grad_norm": 0.5275822555651795, "learning_rate": 1.800261397681623e-05, "loss": 0.3662, "step": 1536 }, { "epoch": 0.734484200465922, "grad_norm": 0.5958868088113626, "learning_rate": 1.799945467737216e-05, "loss": 0.3861, "step": 1537 }, { "epoch": 0.7349620691714951, "grad_norm": 0.528135241277979, "learning_rate": 1.7996293159020468e-05, "loss": 0.3633, "step": 1538 }, { "epoch": 0.7354399378770683, "grad_norm": 0.5629034073505758, "learning_rate": 1.799312942263811e-05, "loss": 0.3752, "step": 1539 }, { "epoch": 0.7359178065826414, "grad_norm": 0.5369379864183004, "learning_rate": 1.7989963469102643e-05, "loss": 0.3788, "step": 1540 }, { "epoch": 0.7363956752882146, "grad_norm": 0.5181107364251343, "learning_rate": 1.798679529929225e-05, "loss": 0.3908, "step": 1541 }, { "epoch": 0.7368735439937877, "grad_norm": 0.5889689370863749, "learning_rate": 1.7983624914085726e-05, "loss": 0.38, "step": 1542 }, { "epoch": 0.7373514126993609, "grad_norm": 0.5087601460758195, "learning_rate": 1.7980452314362482e-05, "loss": 0.3816, "step": 1543 }, { "epoch": 0.737829281404934, "grad_norm": 0.5940221012894452, "learning_rate": 1.7977277501002538e-05, "loss": 0.3637, "step": 1544 }, { "epoch": 0.7383071501105072, "grad_norm": 0.5633247453068111, "learning_rate": 1.797410047488653e-05, "loss": 0.3969, "step": 1545 }, { "epoch": 0.7387850188160803, "grad_norm": 0.5712343756297124, "learning_rate": 1.797092123689571e-05, "loss": 0.3811, "step": 1546 }, { "epoch": 0.7392628875216535, "grad_norm": 0.7181347071469072, "learning_rate": 1.7967739787911946e-05, "loss": 0.3998, "step": 1547 }, { "epoch": 0.7397407562272266, "grad_norm": 0.5637357459690837, "learning_rate": 1.7964556128817713e-05, "loss": 0.3927, "step": 1548 }, { "epoch": 0.7402186249327997, "grad_norm": 0.5365699147490028, "learning_rate": 1.79613702604961e-05, "loss": 0.3888, "step": 1549 }, { "epoch": 0.7406964936383729, "grad_norm": 0.5518595874098879, "learning_rate": 1.7958182183830816e-05, "loss": 0.3996, "step": 1550 }, { "epoch": 0.741174362343946, "grad_norm": 0.5208148323047387, "learning_rate": 1.795499189970617e-05, "loss": 0.382, "step": 1551 }, { "epoch": 0.7416522310495192, "grad_norm": 0.5007085516775489, "learning_rate": 1.79517994090071e-05, "loss": 0.4056, "step": 1552 }, { "epoch": 0.7421300997550923, "grad_norm": 0.509836881110819, "learning_rate": 1.794860471261914e-05, "loss": 0.3774, "step": 1553 }, { "epoch": 0.7426079684606655, "grad_norm": 0.5656747401049432, "learning_rate": 1.794540781142844e-05, "loss": 0.3795, "step": 1554 }, { "epoch": 0.7430858371662385, "grad_norm": 0.5513113429693962, "learning_rate": 1.794220870632177e-05, "loss": 0.373, "step": 1555 }, { "epoch": 0.7435637058718118, "grad_norm": 0.5180245500331098, "learning_rate": 1.7939007398186507e-05, "loss": 0.3891, "step": 1556 }, { "epoch": 0.7440415745773848, "grad_norm": 0.5856101098872712, "learning_rate": 1.793580388791063e-05, "loss": 0.3906, "step": 1557 }, { "epoch": 0.744519443282958, "grad_norm": 0.5597746606677304, "learning_rate": 1.7932598176382735e-05, "loss": 0.3768, "step": 1558 }, { "epoch": 0.7449973119885311, "grad_norm": 0.5634426580173892, "learning_rate": 1.792939026449204e-05, "loss": 0.3708, "step": 1559 }, { "epoch": 0.7454751806941043, "grad_norm": 0.5467128365442397, "learning_rate": 1.7926180153128358e-05, "loss": 0.3797, "step": 1560 }, { "epoch": 0.7459530493996774, "grad_norm": 0.5203223948570378, "learning_rate": 1.7922967843182113e-05, "loss": 0.3872, "step": 1561 }, { "epoch": 0.7464309181052505, "grad_norm": 0.5963679934080937, "learning_rate": 1.7919753335544352e-05, "loss": 0.3822, "step": 1562 }, { "epoch": 0.7469087868108237, "grad_norm": 0.5959650909771645, "learning_rate": 1.7916536631106714e-05, "loss": 0.3906, "step": 1563 }, { "epoch": 0.7473866555163968, "grad_norm": 0.5685204218306926, "learning_rate": 1.7913317730761463e-05, "loss": 0.3845, "step": 1564 }, { "epoch": 0.74786452422197, "grad_norm": 0.6029950556254274, "learning_rate": 1.791009663540146e-05, "loss": 0.3794, "step": 1565 }, { "epoch": 0.7483423929275431, "grad_norm": 0.5300367366403645, "learning_rate": 1.790687334592018e-05, "loss": 0.3836, "step": 1566 }, { "epoch": 0.7488202616331163, "grad_norm": 0.5994658946073593, "learning_rate": 1.790364786321171e-05, "loss": 0.3881, "step": 1567 }, { "epoch": 0.7492981303386894, "grad_norm": 0.5964406475861134, "learning_rate": 1.7900420188170745e-05, "loss": 0.3905, "step": 1568 }, { "epoch": 0.7497759990442626, "grad_norm": 1.7504865632734914, "learning_rate": 1.7897190321692578e-05, "loss": 0.3996, "step": 1569 }, { "epoch": 0.7502538677498357, "grad_norm": 0.5830761379235893, "learning_rate": 1.789395826467312e-05, "loss": 0.3859, "step": 1570 }, { "epoch": 0.7507317364554089, "grad_norm": 0.5242952519103954, "learning_rate": 1.7890724018008883e-05, "loss": 0.3939, "step": 1571 }, { "epoch": 0.751209605160982, "grad_norm": 0.543050764985698, "learning_rate": 1.7887487582596995e-05, "loss": 0.394, "step": 1572 }, { "epoch": 0.7516874738665552, "grad_norm": 0.5259949143161994, "learning_rate": 1.7884248959335186e-05, "loss": 0.3858, "step": 1573 }, { "epoch": 0.7521653425721283, "grad_norm": 0.5710757710219158, "learning_rate": 1.7881008149121793e-05, "loss": 0.3782, "step": 1574 }, { "epoch": 0.7526432112777014, "grad_norm": 0.5437004375278933, "learning_rate": 1.7877765152855757e-05, "loss": 0.3744, "step": 1575 }, { "epoch": 0.7531210799832746, "grad_norm": 0.5158132068664555, "learning_rate": 1.7874519971436627e-05, "loss": 0.378, "step": 1576 }, { "epoch": 0.7535989486888477, "grad_norm": 0.5706843234145509, "learning_rate": 1.787127260576456e-05, "loss": 0.3843, "step": 1577 }, { "epoch": 0.7540768173944209, "grad_norm": 0.5829854434809196, "learning_rate": 1.7868023056740323e-05, "loss": 0.383, "step": 1578 }, { "epoch": 0.754554686099994, "grad_norm": 0.5491598281489602, "learning_rate": 1.7864771325265276e-05, "loss": 0.3843, "step": 1579 }, { "epoch": 0.7550325548055672, "grad_norm": 0.5283360065954008, "learning_rate": 1.78615174122414e-05, "loss": 0.3867, "step": 1580 }, { "epoch": 0.7555104235111403, "grad_norm": 0.5361665431481477, "learning_rate": 1.7858261318571265e-05, "loss": 0.3836, "step": 1581 }, { "epoch": 0.7559882922167135, "grad_norm": 0.5773171737336307, "learning_rate": 1.785500304515806e-05, "loss": 0.4076, "step": 1582 }, { "epoch": 0.7564661609222866, "grad_norm": 0.5639322568045663, "learning_rate": 1.785174259290557e-05, "loss": 0.3856, "step": 1583 }, { "epoch": 0.7569440296278598, "grad_norm": 0.5659655900325912, "learning_rate": 1.7848479962718183e-05, "loss": 0.3686, "step": 1584 }, { "epoch": 0.7574218983334329, "grad_norm": 0.6962795442570291, "learning_rate": 1.78452151555009e-05, "loss": 0.3881, "step": 1585 }, { "epoch": 0.7578997670390061, "grad_norm": 0.5417577285230938, "learning_rate": 1.7841948172159322e-05, "loss": 0.3794, "step": 1586 }, { "epoch": 0.7583776357445792, "grad_norm": 0.5211348547780497, "learning_rate": 1.783867901359965e-05, "loss": 0.388, "step": 1587 }, { "epoch": 0.7588555044501524, "grad_norm": 0.5104365801361086, "learning_rate": 1.7835407680728695e-05, "loss": 0.3777, "step": 1588 }, { "epoch": 0.7593333731557255, "grad_norm": 0.540783001344937, "learning_rate": 1.783213417445386e-05, "loss": 0.3805, "step": 1589 }, { "epoch": 0.7598112418612986, "grad_norm": 0.5258232259033346, "learning_rate": 1.7828858495683162e-05, "loss": 0.3767, "step": 1590 }, { "epoch": 0.7602891105668718, "grad_norm": 0.5222845479618173, "learning_rate": 1.7825580645325217e-05, "loss": 0.3866, "step": 1591 }, { "epoch": 0.7607669792724449, "grad_norm": 0.5399407703301767, "learning_rate": 1.7822300624289242e-05, "loss": 0.3754, "step": 1592 }, { "epoch": 0.7612448479780181, "grad_norm": 0.4996656893078933, "learning_rate": 1.7819018433485055e-05, "loss": 0.3792, "step": 1593 }, { "epoch": 0.7617227166835912, "grad_norm": 0.5274635062067855, "learning_rate": 1.7815734073823078e-05, "loss": 0.3822, "step": 1594 }, { "epoch": 0.7622005853891644, "grad_norm": 0.6480497541249455, "learning_rate": 1.781244754621434e-05, "loss": 0.3848, "step": 1595 }, { "epoch": 0.7626784540947374, "grad_norm": 0.5526359451420565, "learning_rate": 1.7809158851570463e-05, "loss": 0.3725, "step": 1596 }, { "epoch": 0.7631563228003106, "grad_norm": 0.49088307821668725, "learning_rate": 1.780586799080367e-05, "loss": 0.3823, "step": 1597 }, { "epoch": 0.7636341915058837, "grad_norm": 0.5511921737669051, "learning_rate": 1.780257496482679e-05, "loss": 0.3808, "step": 1598 }, { "epoch": 0.764112060211457, "grad_norm": 0.6775694510669721, "learning_rate": 1.7799279774553246e-05, "loss": 0.3802, "step": 1599 }, { "epoch": 0.76458992891703, "grad_norm": 0.5310444266054842, "learning_rate": 1.779598242089707e-05, "loss": 0.3809, "step": 1600 }, { "epoch": 0.7650677976226032, "grad_norm": 0.4885991078950156, "learning_rate": 1.7792682904772894e-05, "loss": 0.3891, "step": 1601 }, { "epoch": 0.7655456663281763, "grad_norm": 0.7799415079754219, "learning_rate": 1.7789381227095938e-05, "loss": 0.4004, "step": 1602 }, { "epoch": 0.7660235350337494, "grad_norm": 0.48613695514881117, "learning_rate": 1.7786077388782034e-05, "loss": 0.3995, "step": 1603 }, { "epoch": 0.7665014037393226, "grad_norm": 0.5366975828343058, "learning_rate": 1.77827713907476e-05, "loss": 0.3757, "step": 1604 }, { "epoch": 0.7669792724448957, "grad_norm": 0.5182970925432647, "learning_rate": 1.7779463233909677e-05, "loss": 0.3675, "step": 1605 }, { "epoch": 0.7674571411504689, "grad_norm": 0.566425081373052, "learning_rate": 1.7776152919185873e-05, "loss": 0.3983, "step": 1606 }, { "epoch": 0.767935009856042, "grad_norm": 0.5637827498293969, "learning_rate": 1.777284044749442e-05, "loss": 0.3732, "step": 1607 }, { "epoch": 0.7684128785616152, "grad_norm": 0.6832393610992347, "learning_rate": 1.7769525819754138e-05, "loss": 0.3705, "step": 1608 }, { "epoch": 0.7688907472671883, "grad_norm": 0.6312177052802135, "learning_rate": 1.7766209036884447e-05, "loss": 0.3672, "step": 1609 }, { "epoch": 0.7693686159727615, "grad_norm": 0.5350057787652999, "learning_rate": 1.7762890099805362e-05, "loss": 0.3753, "step": 1610 }, { "epoch": 0.7698464846783346, "grad_norm": 0.5139252783072269, "learning_rate": 1.77595690094375e-05, "loss": 0.3825, "step": 1611 }, { "epoch": 0.7703243533839078, "grad_norm": 0.849473799017312, "learning_rate": 1.7756245766702068e-05, "loss": 0.3682, "step": 1612 }, { "epoch": 0.7708022220894809, "grad_norm": 0.5132435684021898, "learning_rate": 1.775292037252088e-05, "loss": 0.3896, "step": 1613 }, { "epoch": 0.7712800907950541, "grad_norm": 0.5945126305429924, "learning_rate": 1.774959282781634e-05, "loss": 0.3948, "step": 1614 }, { "epoch": 0.7717579595006272, "grad_norm": 0.5332717557684908, "learning_rate": 1.774626313351145e-05, "loss": 0.3671, "step": 1615 }, { "epoch": 0.7722358282062003, "grad_norm": 0.5596768760181904, "learning_rate": 1.7742931290529804e-05, "loss": 0.3519, "step": 1616 }, { "epoch": 0.7727136969117735, "grad_norm": 0.801668548538205, "learning_rate": 1.7739597299795606e-05, "loss": 0.3759, "step": 1617 }, { "epoch": 0.7731915656173466, "grad_norm": 0.5443956847817307, "learning_rate": 1.773626116223364e-05, "loss": 0.3701, "step": 1618 }, { "epoch": 0.7736694343229198, "grad_norm": 0.5223788847180523, "learning_rate": 1.7732922878769298e-05, "loss": 0.3815, "step": 1619 }, { "epoch": 0.7741473030284929, "grad_norm": 0.5153447344508111, "learning_rate": 1.7729582450328547e-05, "loss": 0.388, "step": 1620 }, { "epoch": 0.7746251717340661, "grad_norm": 0.5399342464141584, "learning_rate": 1.7726239877837977e-05, "loss": 0.3771, "step": 1621 }, { "epoch": 0.7751030404396392, "grad_norm": 0.5827130864299022, "learning_rate": 1.7722895162224752e-05, "loss": 0.3812, "step": 1622 }, { "epoch": 0.7755809091452124, "grad_norm": 0.5264498651194451, "learning_rate": 1.7719548304416638e-05, "loss": 0.37, "step": 1623 }, { "epoch": 0.7760587778507855, "grad_norm": 0.5525289832285949, "learning_rate": 1.7716199305341993e-05, "loss": 0.3641, "step": 1624 }, { "epoch": 0.7765366465563587, "grad_norm": 0.6073361920730956, "learning_rate": 1.771284816592978e-05, "loss": 0.3751, "step": 1625 }, { "epoch": 0.7770145152619318, "grad_norm": 0.5261064606183812, "learning_rate": 1.770949488710953e-05, "loss": 0.3831, "step": 1626 }, { "epoch": 0.777492383967505, "grad_norm": 0.5346800888345036, "learning_rate": 1.7706139469811395e-05, "loss": 0.4004, "step": 1627 }, { "epoch": 0.7779702526730781, "grad_norm": 0.585135599645527, "learning_rate": 1.77027819149661e-05, "loss": 0.3757, "step": 1628 }, { "epoch": 0.7784481213786513, "grad_norm": 0.5412022951804295, "learning_rate": 1.7699422223504983e-05, "loss": 0.3903, "step": 1629 }, { "epoch": 0.7789259900842244, "grad_norm": 0.5087555055370976, "learning_rate": 1.7696060396359956e-05, "loss": 0.3858, "step": 1630 }, { "epoch": 0.7794038587897975, "grad_norm": 0.5392498385275896, "learning_rate": 1.7692696434463527e-05, "loss": 0.3908, "step": 1631 }, { "epoch": 0.7798817274953707, "grad_norm": 0.5248886136108247, "learning_rate": 1.768933033874881e-05, "loss": 0.3818, "step": 1632 }, { "epoch": 0.7803595962009438, "grad_norm": 0.6344637610053451, "learning_rate": 1.768596211014949e-05, "loss": 0.3705, "step": 1633 }, { "epoch": 0.780837464906517, "grad_norm": 0.5079797518477398, "learning_rate": 1.7682591749599856e-05, "loss": 0.3876, "step": 1634 }, { "epoch": 0.78131533361209, "grad_norm": 0.7193783569627261, "learning_rate": 1.7679219258034798e-05, "loss": 0.4143, "step": 1635 }, { "epoch": 0.7817932023176632, "grad_norm": 0.5441633206776926, "learning_rate": 1.767584463638977e-05, "loss": 0.3819, "step": 1636 }, { "epoch": 0.7822710710232363, "grad_norm": 0.5501870910998597, "learning_rate": 1.767246788560084e-05, "loss": 0.3675, "step": 1637 }, { "epoch": 0.7827489397288095, "grad_norm": 1.1914603909692592, "learning_rate": 1.766908900660466e-05, "loss": 0.3664, "step": 1638 }, { "epoch": 0.7832268084343826, "grad_norm": 0.5737236436900817, "learning_rate": 1.7665708000338472e-05, "loss": 0.3912, "step": 1639 }, { "epoch": 0.7837046771399558, "grad_norm": 0.5111903213622228, "learning_rate": 1.7662324867740102e-05, "loss": 0.3729, "step": 1640 }, { "epoch": 0.7841825458455289, "grad_norm": 0.5448282748052444, "learning_rate": 1.7658939609747978e-05, "loss": 0.3828, "step": 1641 }, { "epoch": 0.7846604145511021, "grad_norm": 0.5536887303747314, "learning_rate": 1.7655552227301105e-05, "loss": 0.3734, "step": 1642 }, { "epoch": 0.7851382832566752, "grad_norm": 0.5055836319468296, "learning_rate": 1.7652162721339085e-05, "loss": 0.3666, "step": 1643 }, { "epoch": 0.7856161519622483, "grad_norm": 0.588666775287373, "learning_rate": 1.764877109280211e-05, "loss": 0.3917, "step": 1644 }, { "epoch": 0.7860940206678215, "grad_norm": 0.5912101719269106, "learning_rate": 1.7645377342630956e-05, "loss": 0.3778, "step": 1645 }, { "epoch": 0.7865718893733946, "grad_norm": 0.5355115123869113, "learning_rate": 1.764198147176699e-05, "loss": 0.3786, "step": 1646 }, { "epoch": 0.7870497580789678, "grad_norm": 0.5273687503114382, "learning_rate": 1.7638583481152164e-05, "loss": 0.375, "step": 1647 }, { "epoch": 0.7875276267845409, "grad_norm": 0.6354283594908688, "learning_rate": 1.7635183371729022e-05, "loss": 0.393, "step": 1648 }, { "epoch": 0.7880054954901141, "grad_norm": 0.5268426064765231, "learning_rate": 1.7631781144440697e-05, "loss": 0.3834, "step": 1649 }, { "epoch": 0.7884833641956872, "grad_norm": 0.5851405613750261, "learning_rate": 1.76283768002309e-05, "loss": 0.369, "step": 1650 }, { "epoch": 0.7889612329012604, "grad_norm": 0.5514500060660226, "learning_rate": 1.7624970340043948e-05, "loss": 0.4001, "step": 1651 }, { "epoch": 0.7894391016068335, "grad_norm": 0.5303902826647586, "learning_rate": 1.7621561764824724e-05, "loss": 0.383, "step": 1652 }, { "epoch": 0.7899169703124067, "grad_norm": 0.5956286874077665, "learning_rate": 1.7618151075518706e-05, "loss": 0.3741, "step": 1653 }, { "epoch": 0.7903948390179798, "grad_norm": 0.6017706875819347, "learning_rate": 1.7614738273071963e-05, "loss": 0.3778, "step": 1654 }, { "epoch": 0.790872707723553, "grad_norm": 0.5218421728887426, "learning_rate": 1.7611323358431145e-05, "loss": 0.3719, "step": 1655 }, { "epoch": 0.7913505764291261, "grad_norm": 0.5695441720555854, "learning_rate": 1.7607906332543486e-05, "loss": 0.394, "step": 1656 }, { "epoch": 0.7918284451346992, "grad_norm": 0.5713620867531857, "learning_rate": 1.7604487196356818e-05, "loss": 0.3932, "step": 1657 }, { "epoch": 0.7923063138402724, "grad_norm": 0.5142635919293859, "learning_rate": 1.7601065950819536e-05, "loss": 0.393, "step": 1658 }, { "epoch": 0.7927841825458455, "grad_norm": 0.5563424936264186, "learning_rate": 1.7597642596880642e-05, "loss": 0.365, "step": 1659 }, { "epoch": 0.7932620512514187, "grad_norm": 0.5538505182272977, "learning_rate": 1.759421713548971e-05, "loss": 0.3783, "step": 1660 }, { "epoch": 0.7937399199569918, "grad_norm": 0.5142957210677673, "learning_rate": 1.7590789567596908e-05, "loss": 0.3953, "step": 1661 }, { "epoch": 0.794217788662565, "grad_norm": 0.5688406081526746, "learning_rate": 1.758735989415298e-05, "loss": 0.3934, "step": 1662 }, { "epoch": 0.7946956573681381, "grad_norm": 0.5009105683579143, "learning_rate": 1.758392811610925e-05, "loss": 0.3844, "step": 1663 }, { "epoch": 0.7951735260737113, "grad_norm": 0.5603598984766094, "learning_rate": 1.7580494234417645e-05, "loss": 0.3952, "step": 1664 }, { "epoch": 0.7956513947792844, "grad_norm": 0.5385081611958358, "learning_rate": 1.757705825003065e-05, "loss": 0.3791, "step": 1665 }, { "epoch": 0.7961292634848576, "grad_norm": 0.550224901020255, "learning_rate": 1.7573620163901362e-05, "loss": 0.3723, "step": 1666 }, { "epoch": 0.7966071321904307, "grad_norm": 0.6130708493238943, "learning_rate": 1.7570179976983433e-05, "loss": 0.3807, "step": 1667 }, { "epoch": 0.7970850008960039, "grad_norm": 0.5930943169087218, "learning_rate": 1.7566737690231113e-05, "loss": 0.3436, "step": 1668 }, { "epoch": 0.797562869601577, "grad_norm": 0.5666432561035308, "learning_rate": 1.756329330459923e-05, "loss": 0.3721, "step": 1669 }, { "epoch": 0.7980407383071502, "grad_norm": 0.5599329725398824, "learning_rate": 1.7559846821043205e-05, "loss": 0.3896, "step": 1670 }, { "epoch": 0.7985186070127233, "grad_norm": 0.520091169063164, "learning_rate": 1.7556398240519025e-05, "loss": 0.3766, "step": 1671 }, { "epoch": 0.7989964757182964, "grad_norm": 0.5987277410784928, "learning_rate": 1.755294756398326e-05, "loss": 0.3887, "step": 1672 }, { "epoch": 0.7994743444238696, "grad_norm": 0.4945539006204766, "learning_rate": 1.7549494792393077e-05, "loss": 0.3861, "step": 1673 }, { "epoch": 0.7999522131294426, "grad_norm": 0.536193701750873, "learning_rate": 1.7546039926706206e-05, "loss": 0.3772, "step": 1674 }, { "epoch": 0.8004300818350158, "grad_norm": 0.5264371388469762, "learning_rate": 1.754258296788097e-05, "loss": 0.3936, "step": 1675 }, { "epoch": 0.8009079505405889, "grad_norm": 0.49365045426457704, "learning_rate": 1.753912391687627e-05, "loss": 0.3854, "step": 1676 }, { "epoch": 0.8013858192461621, "grad_norm": 0.572847244678029, "learning_rate": 1.753566277465158e-05, "loss": 0.3892, "step": 1677 }, { "epoch": 0.8018636879517352, "grad_norm": 0.5167500232282058, "learning_rate": 1.7532199542166967e-05, "loss": 0.3847, "step": 1678 }, { "epoch": 0.8023415566573084, "grad_norm": 0.5616325554937263, "learning_rate": 1.7528734220383065e-05, "loss": 0.3672, "step": 1679 }, { "epoch": 0.8028194253628815, "grad_norm": 0.5320597666482352, "learning_rate": 1.7525266810261096e-05, "loss": 0.387, "step": 1680 }, { "epoch": 0.8032972940684547, "grad_norm": 0.5555593972083235, "learning_rate": 1.7521797312762854e-05, "loss": 0.3973, "step": 1681 }, { "epoch": 0.8037751627740278, "grad_norm": 0.5880958847721388, "learning_rate": 1.7518325728850722e-05, "loss": 0.3683, "step": 1682 }, { "epoch": 0.804253031479601, "grad_norm": 0.5565593460383242, "learning_rate": 1.7514852059487652e-05, "loss": 0.3751, "step": 1683 }, { "epoch": 0.8047309001851741, "grad_norm": 0.5268326213678646, "learning_rate": 1.7511376305637183e-05, "loss": 0.3874, "step": 1684 }, { "epoch": 0.8052087688907472, "grad_norm": 0.5685440803479197, "learning_rate": 1.7507898468263422e-05, "loss": 0.367, "step": 1685 }, { "epoch": 0.8056866375963204, "grad_norm": 0.5404765320360659, "learning_rate": 1.7504418548331065e-05, "loss": 0.3781, "step": 1686 }, { "epoch": 0.8061645063018935, "grad_norm": 0.5458851473805669, "learning_rate": 1.750093654680538e-05, "loss": 0.3834, "step": 1687 }, { "epoch": 0.8066423750074667, "grad_norm": 0.53472908598489, "learning_rate": 1.7497452464652207e-05, "loss": 0.3931, "step": 1688 }, { "epoch": 0.8071202437130398, "grad_norm": 0.5339306448364256, "learning_rate": 1.7493966302837978e-05, "loss": 0.3765, "step": 1689 }, { "epoch": 0.807598112418613, "grad_norm": 0.5452629105131837, "learning_rate": 1.7490478062329686e-05, "loss": 0.3854, "step": 1690 }, { "epoch": 0.8080759811241861, "grad_norm": 0.5007685109159807, "learning_rate": 1.7486987744094905e-05, "loss": 0.3822, "step": 1691 }, { "epoch": 0.8085538498297593, "grad_norm": 0.5114501233435106, "learning_rate": 1.74834953491018e-05, "loss": 0.3751, "step": 1692 }, { "epoch": 0.8090317185353324, "grad_norm": 0.5141980507523353, "learning_rate": 1.7480000878319084e-05, "loss": 0.3684, "step": 1693 }, { "epoch": 0.8095095872409056, "grad_norm": 0.5034630274486521, "learning_rate": 1.7476504332716072e-05, "loss": 0.3758, "step": 1694 }, { "epoch": 0.8099874559464787, "grad_norm": 0.5387147348927491, "learning_rate": 1.7473005713262644e-05, "loss": 0.37, "step": 1695 }, { "epoch": 0.8104653246520519, "grad_norm": 0.5442093019130165, "learning_rate": 1.7469505020929252e-05, "loss": 0.3686, "step": 1696 }, { "epoch": 0.810943193357625, "grad_norm": 0.4947860144971616, "learning_rate": 1.7466002256686925e-05, "loss": 0.3813, "step": 1697 }, { "epoch": 0.8114210620631981, "grad_norm": 0.5537428519276251, "learning_rate": 1.746249742150727e-05, "loss": 0.3845, "step": 1698 }, { "epoch": 0.8118989307687713, "grad_norm": 0.5074492556160554, "learning_rate": 1.7458990516362468e-05, "loss": 0.3706, "step": 1699 }, { "epoch": 0.8123767994743444, "grad_norm": 0.5464251538547661, "learning_rate": 1.7455481542225272e-05, "loss": 0.3771, "step": 1700 }, { "epoch": 0.8128546681799176, "grad_norm": 0.5386465641293379, "learning_rate": 1.7451970500069007e-05, "loss": 0.3802, "step": 1701 }, { "epoch": 0.8133325368854907, "grad_norm": 0.6455115743390833, "learning_rate": 1.7448457390867575e-05, "loss": 0.3586, "step": 1702 }, { "epoch": 0.8138104055910639, "grad_norm": 0.5305807234406511, "learning_rate": 1.744494221559545e-05, "loss": 0.3918, "step": 1703 }, { "epoch": 0.814288274296637, "grad_norm": 0.5536892586879414, "learning_rate": 1.7441424975227685e-05, "loss": 0.3853, "step": 1704 }, { "epoch": 0.8147661430022102, "grad_norm": 0.5262189394069378, "learning_rate": 1.7437905670739893e-05, "loss": 0.3738, "step": 1705 }, { "epoch": 0.8152440117077833, "grad_norm": 0.5550027476074038, "learning_rate": 1.7434384303108273e-05, "loss": 0.3778, "step": 1706 }, { "epoch": 0.8157218804133565, "grad_norm": 0.5346890732228916, "learning_rate": 1.7430860873309586e-05, "loss": 0.3702, "step": 1707 }, { "epoch": 0.8161997491189296, "grad_norm": 0.5437600270856643, "learning_rate": 1.7427335382321173e-05, "loss": 0.3684, "step": 1708 }, { "epoch": 0.8166776178245028, "grad_norm": 0.6463467252286066, "learning_rate": 1.742380783112094e-05, "loss": 0.383, "step": 1709 }, { "epoch": 0.8171554865300759, "grad_norm": 0.5461358713423635, "learning_rate": 1.7420278220687366e-05, "loss": 0.3672, "step": 1710 }, { "epoch": 0.817633355235649, "grad_norm": 0.5179296288711173, "learning_rate": 1.7416746551999504e-05, "loss": 0.3631, "step": 1711 }, { "epoch": 0.8181112239412222, "grad_norm": 0.62856546922575, "learning_rate": 1.741321282603698e-05, "loss": 0.3587, "step": 1712 }, { "epoch": 0.8185890926467952, "grad_norm": 0.5022602921463213, "learning_rate": 1.7409677043779986e-05, "loss": 0.3827, "step": 1713 }, { "epoch": 0.8190669613523685, "grad_norm": 0.5287967693920427, "learning_rate": 1.7406139206209283e-05, "loss": 0.3813, "step": 1714 }, { "epoch": 0.8195448300579415, "grad_norm": 0.5110900484564262, "learning_rate": 1.7402599314306207e-05, "loss": 0.3874, "step": 1715 }, { "epoch": 0.8200226987635147, "grad_norm": 0.5781567849028013, "learning_rate": 1.739905736905266e-05, "loss": 0.3567, "step": 1716 }, { "epoch": 0.8205005674690878, "grad_norm": 0.49833085488616846, "learning_rate": 1.739551337143112e-05, "loss": 0.3574, "step": 1717 }, { "epoch": 0.820978436174661, "grad_norm": 0.5263629142124272, "learning_rate": 1.739196732242462e-05, "loss": 0.3899, "step": 1718 }, { "epoch": 0.8214563048802341, "grad_norm": 0.5022935161884177, "learning_rate": 1.7388419223016778e-05, "loss": 0.3576, "step": 1719 }, { "epoch": 0.8219341735858073, "grad_norm": 0.5254848193160996, "learning_rate": 1.7384869074191777e-05, "loss": 0.3779, "step": 1720 }, { "epoch": 0.8224120422913804, "grad_norm": 0.5213123941213047, "learning_rate": 1.738131687693436e-05, "loss": 0.3586, "step": 1721 }, { "epoch": 0.8228899109969536, "grad_norm": 0.5156498268841546, "learning_rate": 1.737776263222984e-05, "loss": 0.3724, "step": 1722 }, { "epoch": 0.8233677797025267, "grad_norm": 0.5704476991648906, "learning_rate": 1.737420634106411e-05, "loss": 0.3705, "step": 1723 }, { "epoch": 0.8238456484080999, "grad_norm": 0.5546790809955634, "learning_rate": 1.7370648004423623e-05, "loss": 0.3704, "step": 1724 }, { "epoch": 0.824323517113673, "grad_norm": 0.618179020385375, "learning_rate": 1.7367087623295394e-05, "loss": 0.3723, "step": 1725 }, { "epoch": 0.8248013858192461, "grad_norm": 0.5663350618454257, "learning_rate": 1.7363525198667013e-05, "loss": 0.3964, "step": 1726 }, { "epoch": 0.8252792545248193, "grad_norm": 0.5530576617160258, "learning_rate": 1.735996073152663e-05, "loss": 0.3638, "step": 1727 }, { "epoch": 0.8257571232303924, "grad_norm": 0.5518889071407768, "learning_rate": 1.7356394222862966e-05, "loss": 0.3777, "step": 1728 }, { "epoch": 0.8262349919359656, "grad_norm": 0.5219424160449163, "learning_rate": 1.7352825673665313e-05, "loss": 0.3773, "step": 1729 }, { "epoch": 0.8267128606415387, "grad_norm": 0.5416629798429636, "learning_rate": 1.7349255084923517e-05, "loss": 0.3728, "step": 1730 }, { "epoch": 0.8271907293471119, "grad_norm": 0.542222397838335, "learning_rate": 1.7345682457627998e-05, "loss": 0.3846, "step": 1731 }, { "epoch": 0.827668598052685, "grad_norm": 0.5564300713817316, "learning_rate": 1.7342107792769747e-05, "loss": 0.3854, "step": 1732 }, { "epoch": 0.8281464667582582, "grad_norm": 0.5047965719419824, "learning_rate": 1.7338531091340304e-05, "loss": 0.3684, "step": 1733 }, { "epoch": 0.8286243354638313, "grad_norm": 0.5525238950280134, "learning_rate": 1.7334952354331783e-05, "loss": 0.3604, "step": 1734 }, { "epoch": 0.8291022041694045, "grad_norm": 0.5386457884764131, "learning_rate": 1.7331371582736864e-05, "loss": 0.382, "step": 1735 }, { "epoch": 0.8295800728749776, "grad_norm": 0.5721485678679854, "learning_rate": 1.7327788777548796e-05, "loss": 0.3591, "step": 1736 }, { "epoch": 0.8300579415805508, "grad_norm": 0.9233246568993144, "learning_rate": 1.732420393976138e-05, "loss": 0.3704, "step": 1737 }, { "epoch": 0.8305358102861239, "grad_norm": 0.5345664368757062, "learning_rate": 1.7320617070368985e-05, "loss": 0.3895, "step": 1738 }, { "epoch": 0.831013678991697, "grad_norm": 0.5011860668006918, "learning_rate": 1.731702817036655e-05, "loss": 0.3743, "step": 1739 }, { "epoch": 0.8314915476972702, "grad_norm": 0.49769794527863426, "learning_rate": 1.731343724074957e-05, "loss": 0.3729, "step": 1740 }, { "epoch": 0.8319694164028433, "grad_norm": 0.4910631974353203, "learning_rate": 1.730984428251411e-05, "loss": 0.3775, "step": 1741 }, { "epoch": 0.8324472851084165, "grad_norm": 0.4981131547731618, "learning_rate": 1.7306249296656784e-05, "loss": 0.3819, "step": 1742 }, { "epoch": 0.8329251538139896, "grad_norm": 0.5176839501122841, "learning_rate": 1.7302652284174785e-05, "loss": 0.3719, "step": 1743 }, { "epoch": 0.8334030225195628, "grad_norm": 0.5152962449518306, "learning_rate": 1.729905324606586e-05, "loss": 0.3804, "step": 1744 }, { "epoch": 0.8338808912251359, "grad_norm": 0.5558715961134091, "learning_rate": 1.7295452183328317e-05, "loss": 0.3828, "step": 1745 }, { "epoch": 0.8343587599307091, "grad_norm": 0.5422705807257295, "learning_rate": 1.7291849096961027e-05, "loss": 0.3728, "step": 1746 }, { "epoch": 0.8348366286362822, "grad_norm": 0.5132286453110108, "learning_rate": 1.7288243987963423e-05, "loss": 0.3915, "step": 1747 }, { "epoch": 0.8353144973418554, "grad_norm": 0.5896569071524325, "learning_rate": 1.7284636857335503e-05, "loss": 0.3721, "step": 1748 }, { "epoch": 0.8357923660474285, "grad_norm": 0.5315131515573899, "learning_rate": 1.7281027706077815e-05, "loss": 0.3916, "step": 1749 }, { "epoch": 0.8362702347530017, "grad_norm": 0.5270182046627034, "learning_rate": 1.7277416535191478e-05, "loss": 0.3867, "step": 1750 }, { "epoch": 0.8367481034585748, "grad_norm": 0.5016720673015266, "learning_rate": 1.7273803345678163e-05, "loss": 0.3793, "step": 1751 }, { "epoch": 0.8372259721641478, "grad_norm": 0.538715167203939, "learning_rate": 1.7270188138540106e-05, "loss": 0.3835, "step": 1752 }, { "epoch": 0.837703840869721, "grad_norm": 0.5152936944221022, "learning_rate": 1.726657091478011e-05, "loss": 0.3848, "step": 1753 }, { "epoch": 0.8381817095752941, "grad_norm": 0.5144600147503787, "learning_rate": 1.7262951675401517e-05, "loss": 0.375, "step": 1754 }, { "epoch": 0.8386595782808673, "grad_norm": 0.5789439256338922, "learning_rate": 1.7259330421408247e-05, "loss": 0.3686, "step": 1755 }, { "epoch": 0.8391374469864404, "grad_norm": 0.5385957955102172, "learning_rate": 1.7255707153804772e-05, "loss": 0.3666, "step": 1756 }, { "epoch": 0.8396153156920136, "grad_norm": 0.6176394509383132, "learning_rate": 1.725208187359612e-05, "loss": 0.3837, "step": 1757 }, { "epoch": 0.8400931843975867, "grad_norm": 0.5271166709229913, "learning_rate": 1.724845458178788e-05, "loss": 0.3616, "step": 1758 }, { "epoch": 0.8405710531031599, "grad_norm": 0.5785356635347916, "learning_rate": 1.72448252793862e-05, "loss": 0.3735, "step": 1759 }, { "epoch": 0.841048921808733, "grad_norm": 0.5396169495403562, "learning_rate": 1.7241193967397784e-05, "loss": 0.3863, "step": 1760 }, { "epoch": 0.8415267905143062, "grad_norm": 0.5968305743269035, "learning_rate": 1.7237560646829893e-05, "loss": 0.3844, "step": 1761 }, { "epoch": 0.8420046592198793, "grad_norm": 0.5739568058873602, "learning_rate": 1.723392531869035e-05, "loss": 0.3623, "step": 1762 }, { "epoch": 0.8424825279254525, "grad_norm": 0.5245097908307391, "learning_rate": 1.7230287983987524e-05, "loss": 0.3704, "step": 1763 }, { "epoch": 0.8429603966310256, "grad_norm": 0.5908273151839591, "learning_rate": 1.722664864373035e-05, "loss": 0.3819, "step": 1764 }, { "epoch": 0.8434382653365988, "grad_norm": 0.5155637558679567, "learning_rate": 1.7223007298928322e-05, "loss": 0.3592, "step": 1765 }, { "epoch": 0.8439161340421719, "grad_norm": 0.5175194051496753, "learning_rate": 1.7219363950591482e-05, "loss": 0.3819, "step": 1766 }, { "epoch": 0.844394002747745, "grad_norm": 0.5743671321894646, "learning_rate": 1.7215718599730427e-05, "loss": 0.3766, "step": 1767 }, { "epoch": 0.8448718714533182, "grad_norm": 0.501602630302012, "learning_rate": 1.7212071247356316e-05, "loss": 0.3879, "step": 1768 }, { "epoch": 0.8453497401588913, "grad_norm": 0.49054416633074915, "learning_rate": 1.720842189448086e-05, "loss": 0.385, "step": 1769 }, { "epoch": 0.8458276088644645, "grad_norm": 0.5155620766796909, "learning_rate": 1.7204770542116326e-05, "loss": 0.3691, "step": 1770 }, { "epoch": 0.8463054775700376, "grad_norm": 0.7473764351264286, "learning_rate": 1.720111719127553e-05, "loss": 0.3691, "step": 1771 }, { "epoch": 0.8467833462756108, "grad_norm": 0.5182869320161151, "learning_rate": 1.7197461842971854e-05, "loss": 0.3742, "step": 1772 }, { "epoch": 0.8472612149811839, "grad_norm": 0.539699951292783, "learning_rate": 1.7193804498219222e-05, "loss": 0.3665, "step": 1773 }, { "epoch": 0.8477390836867571, "grad_norm": 0.49195363335336606, "learning_rate": 1.719014515803212e-05, "loss": 0.3781, "step": 1774 }, { "epoch": 0.8482169523923302, "grad_norm": 0.5107276989908707, "learning_rate": 1.7186483823425582e-05, "loss": 0.3844, "step": 1775 }, { "epoch": 0.8486948210979034, "grad_norm": 0.5454810223277834, "learning_rate": 1.7182820495415197e-05, "loss": 0.3615, "step": 1776 }, { "epoch": 0.8491726898034765, "grad_norm": 0.5294996389762363, "learning_rate": 1.7179155175017115e-05, "loss": 0.3734, "step": 1777 }, { "epoch": 0.8496505585090497, "grad_norm": 0.5499916031625075, "learning_rate": 1.717548786324802e-05, "loss": 0.3849, "step": 1778 }, { "epoch": 0.8501284272146228, "grad_norm": 0.4973577029191277, "learning_rate": 1.7171818561125168e-05, "loss": 0.3731, "step": 1779 }, { "epoch": 0.8506062959201959, "grad_norm": 0.5851851900260322, "learning_rate": 1.7168147269666357e-05, "loss": 0.3799, "step": 1780 }, { "epoch": 0.8510841646257691, "grad_norm": 0.4822034404377725, "learning_rate": 1.7164473989889937e-05, "loss": 0.3693, "step": 1781 }, { "epoch": 0.8515620333313422, "grad_norm": 1.2985086241604744, "learning_rate": 1.7160798722814808e-05, "loss": 0.3677, "step": 1782 }, { "epoch": 0.8520399020369154, "grad_norm": 0.5822593257906487, "learning_rate": 1.7157121469460428e-05, "loss": 0.3575, "step": 1783 }, { "epoch": 0.8525177707424885, "grad_norm": 0.6086802336624687, "learning_rate": 1.7153442230846808e-05, "loss": 0.3706, "step": 1784 }, { "epoch": 0.8529956394480617, "grad_norm": 0.5605480795542676, "learning_rate": 1.714976100799449e-05, "loss": 0.3898, "step": 1785 }, { "epoch": 0.8534735081536348, "grad_norm": 0.4972956581461137, "learning_rate": 1.7146077801924593e-05, "loss": 0.3694, "step": 1786 }, { "epoch": 0.853951376859208, "grad_norm": 0.5805310199986851, "learning_rate": 1.7142392613658764e-05, "loss": 0.3718, "step": 1787 }, { "epoch": 0.8544292455647811, "grad_norm": 0.5287792910272413, "learning_rate": 1.7138705444219215e-05, "loss": 0.3714, "step": 1788 }, { "epoch": 0.8549071142703543, "grad_norm": 0.5101894702630763, "learning_rate": 1.7135016294628703e-05, "loss": 0.3621, "step": 1789 }, { "epoch": 0.8553849829759274, "grad_norm": 0.6079137123999032, "learning_rate": 1.713132516591053e-05, "loss": 0.3794, "step": 1790 }, { "epoch": 0.8558628516815006, "grad_norm": 0.5669666874135264, "learning_rate": 1.7127632059088547e-05, "loss": 0.3777, "step": 1791 }, { "epoch": 0.8563407203870737, "grad_norm": 0.545562892761849, "learning_rate": 1.7123936975187164e-05, "loss": 0.3797, "step": 1792 }, { "epoch": 0.8568185890926467, "grad_norm": 0.6098893498728746, "learning_rate": 1.7120239915231326e-05, "loss": 0.3689, "step": 1793 }, { "epoch": 0.85729645779822, "grad_norm": 0.5161732997271268, "learning_rate": 1.7116540880246536e-05, "loss": 0.3894, "step": 1794 }, { "epoch": 0.857774326503793, "grad_norm": 0.5403314394420677, "learning_rate": 1.7112839871258838e-05, "loss": 0.3683, "step": 1795 }, { "epoch": 0.8582521952093662, "grad_norm": 0.5784670620134145, "learning_rate": 1.710913688929483e-05, "loss": 0.3815, "step": 1796 }, { "epoch": 0.8587300639149393, "grad_norm": 0.5025205431269636, "learning_rate": 1.710543193538165e-05, "loss": 0.3879, "step": 1797 }, { "epoch": 0.8592079326205125, "grad_norm": 0.5892397369469627, "learning_rate": 1.7101725010546988e-05, "loss": 0.3956, "step": 1798 }, { "epoch": 0.8596858013260856, "grad_norm": 0.47462809973773024, "learning_rate": 1.7098016115819082e-05, "loss": 0.3783, "step": 1799 }, { "epoch": 0.8601636700316588, "grad_norm": 0.5288631373582081, "learning_rate": 1.7094305252226713e-05, "loss": 0.3958, "step": 1800 }, { "epoch": 0.8606415387372319, "grad_norm": 0.530585661371997, "learning_rate": 1.7090592420799206e-05, "loss": 0.3794, "step": 1801 }, { "epoch": 0.8611194074428051, "grad_norm": 0.5288225495006263, "learning_rate": 1.708687762256644e-05, "loss": 0.3562, "step": 1802 }, { "epoch": 0.8615972761483782, "grad_norm": 0.5052081248332658, "learning_rate": 1.708316085855883e-05, "loss": 0.3788, "step": 1803 }, { "epoch": 0.8620751448539514, "grad_norm": 0.5037294594905739, "learning_rate": 1.7079442129807345e-05, "loss": 0.3726, "step": 1804 }, { "epoch": 0.8625530135595245, "grad_norm": 0.49391286749648405, "learning_rate": 1.7075721437343488e-05, "loss": 0.3716, "step": 1805 }, { "epoch": 0.8630308822650976, "grad_norm": 0.4971338729755265, "learning_rate": 1.707199878219932e-05, "loss": 0.3703, "step": 1806 }, { "epoch": 0.8635087509706708, "grad_norm": 0.5233353158987447, "learning_rate": 1.7068274165407438e-05, "loss": 0.3863, "step": 1807 }, { "epoch": 0.8639866196762439, "grad_norm": 0.5533231164405442, "learning_rate": 1.706454758800099e-05, "loss": 0.3771, "step": 1808 }, { "epoch": 0.8644644883818171, "grad_norm": 0.5368308793227745, "learning_rate": 1.706081905101365e-05, "loss": 0.3606, "step": 1809 }, { "epoch": 0.8649423570873902, "grad_norm": 0.5642371259255226, "learning_rate": 1.705708855547966e-05, "loss": 0.3536, "step": 1810 }, { "epoch": 0.8654202257929634, "grad_norm": 0.5545194035776633, "learning_rate": 1.7053356102433786e-05, "loss": 0.3738, "step": 1811 }, { "epoch": 0.8658980944985365, "grad_norm": 0.66942068698917, "learning_rate": 1.704962169291135e-05, "loss": 0.3773, "step": 1812 }, { "epoch": 0.8663759632041097, "grad_norm": 0.5154302230933852, "learning_rate": 1.704588532794821e-05, "loss": 0.3703, "step": 1813 }, { "epoch": 0.8668538319096828, "grad_norm": 0.5317590606844941, "learning_rate": 1.7042147008580768e-05, "loss": 0.3655, "step": 1814 }, { "epoch": 0.867331700615256, "grad_norm": 0.5285596576650338, "learning_rate": 1.7038406735845967e-05, "loss": 0.3862, "step": 1815 }, { "epoch": 0.8678095693208291, "grad_norm": 0.556947670099024, "learning_rate": 1.7034664510781294e-05, "loss": 0.3494, "step": 1816 }, { "epoch": 0.8682874380264023, "grad_norm": 0.5007734114771821, "learning_rate": 1.7030920334424774e-05, "loss": 0.3784, "step": 1817 }, { "epoch": 0.8687653067319754, "grad_norm": 0.5306943675714777, "learning_rate": 1.7027174207814977e-05, "loss": 0.3735, "step": 1818 }, { "epoch": 0.8692431754375486, "grad_norm": 0.49311406820837095, "learning_rate": 1.7023426131991008e-05, "loss": 0.3745, "step": 1819 }, { "epoch": 0.8697210441431217, "grad_norm": 2.167028523901367, "learning_rate": 1.7019676107992523e-05, "loss": 0.384, "step": 1820 }, { "epoch": 0.8701989128486948, "grad_norm": 0.5774159028549396, "learning_rate": 1.701592413685971e-05, "loss": 0.3688, "step": 1821 }, { "epoch": 0.870676781554268, "grad_norm": 0.5107936854972478, "learning_rate": 1.7012170219633306e-05, "loss": 0.3823, "step": 1822 }, { "epoch": 0.8711546502598411, "grad_norm": 0.5153038414963254, "learning_rate": 1.700841435735457e-05, "loss": 0.3906, "step": 1823 }, { "epoch": 0.8716325189654143, "grad_norm": 0.6927821263237308, "learning_rate": 1.7004656551065317e-05, "loss": 0.3549, "step": 1824 }, { "epoch": 0.8721103876709874, "grad_norm": 0.5401362110735732, "learning_rate": 1.70008968018079e-05, "loss": 0.3759, "step": 1825 }, { "epoch": 0.8725882563765606, "grad_norm": 0.5538300760634917, "learning_rate": 1.6997135110625203e-05, "loss": 0.3658, "step": 1826 }, { "epoch": 0.8730661250821337, "grad_norm": 0.6412840566926354, "learning_rate": 1.6993371478560652e-05, "loss": 0.3801, "step": 1827 }, { "epoch": 0.8735439937877069, "grad_norm": 0.5385013396738852, "learning_rate": 1.6989605906658217e-05, "loss": 0.3696, "step": 1828 }, { "epoch": 0.87402186249328, "grad_norm": 0.5786447420911893, "learning_rate": 1.6985838395962397e-05, "loss": 0.3773, "step": 1829 }, { "epoch": 0.8744997311988532, "grad_norm": 0.528475998356483, "learning_rate": 1.6982068947518235e-05, "loss": 0.3615, "step": 1830 }, { "epoch": 0.8749775999044263, "grad_norm": 0.9317925627901069, "learning_rate": 1.6978297562371304e-05, "loss": 0.3869, "step": 1831 }, { "epoch": 0.8754554686099995, "grad_norm": 0.491169996492111, "learning_rate": 1.6974524241567726e-05, "loss": 0.3694, "step": 1832 }, { "epoch": 0.8759333373155725, "grad_norm": 0.5289533891513116, "learning_rate": 1.6970748986154153e-05, "loss": 0.3659, "step": 1833 }, { "epoch": 0.8764112060211456, "grad_norm": 0.5541497878429626, "learning_rate": 1.6966971797177777e-05, "loss": 0.3617, "step": 1834 }, { "epoch": 0.8768890747267188, "grad_norm": 0.5832036352429374, "learning_rate": 1.6963192675686312e-05, "loss": 0.3667, "step": 1835 }, { "epoch": 0.8773669434322919, "grad_norm": 0.5650619500446709, "learning_rate": 1.6959411622728034e-05, "loss": 0.3808, "step": 1836 }, { "epoch": 0.8778448121378651, "grad_norm": 0.5699075770705608, "learning_rate": 1.695562863935173e-05, "loss": 0.3656, "step": 1837 }, { "epoch": 0.8783226808434382, "grad_norm": 0.551562318574458, "learning_rate": 1.695184372660674e-05, "loss": 0.3712, "step": 1838 }, { "epoch": 0.8788005495490114, "grad_norm": 0.5240567810273391, "learning_rate": 1.6948056885542925e-05, "loss": 0.3543, "step": 1839 }, { "epoch": 0.8792784182545845, "grad_norm": 0.5020036514932894, "learning_rate": 1.694426811721069e-05, "loss": 0.3679, "step": 1840 }, { "epoch": 0.8797562869601577, "grad_norm": 0.556752437838373, "learning_rate": 1.6940477422660976e-05, "loss": 0.3704, "step": 1841 }, { "epoch": 0.8802341556657308, "grad_norm": 0.5191567869165117, "learning_rate": 1.6936684802945255e-05, "loss": 0.3773, "step": 1842 }, { "epoch": 0.880712024371304, "grad_norm": 0.5533663083233841, "learning_rate": 1.693289025911553e-05, "loss": 0.3779, "step": 1843 }, { "epoch": 0.8811898930768771, "grad_norm": 0.5503898647969782, "learning_rate": 1.692909379222434e-05, "loss": 0.3791, "step": 1844 }, { "epoch": 0.8816677617824503, "grad_norm": 0.5551067074917195, "learning_rate": 1.6925295403324758e-05, "loss": 0.3821, "step": 1845 }, { "epoch": 0.8821456304880234, "grad_norm": 0.5622953147656727, "learning_rate": 1.6921495093470394e-05, "loss": 0.3873, "step": 1846 }, { "epoch": 0.8826234991935965, "grad_norm": 0.5365186748712514, "learning_rate": 1.6917692863715384e-05, "loss": 0.3717, "step": 1847 }, { "epoch": 0.8831013678991697, "grad_norm": 0.5478602830128808, "learning_rate": 1.69138887151144e-05, "loss": 0.3723, "step": 1848 }, { "epoch": 0.8835792366047428, "grad_norm": 0.5427879963525947, "learning_rate": 1.6910082648722643e-05, "loss": 0.3716, "step": 1849 }, { "epoch": 0.884057105310316, "grad_norm": 0.5101893372822378, "learning_rate": 1.6906274665595854e-05, "loss": 0.369, "step": 1850 }, { "epoch": 0.8845349740158891, "grad_norm": 0.5434862693905707, "learning_rate": 1.6902464766790295e-05, "loss": 0.3731, "step": 1851 }, { "epoch": 0.8850128427214623, "grad_norm": 0.5250634322593507, "learning_rate": 1.6898652953362765e-05, "loss": 0.3736, "step": 1852 }, { "epoch": 0.8854907114270354, "grad_norm": 0.5732211434436395, "learning_rate": 1.68948392263706e-05, "loss": 0.3756, "step": 1853 }, { "epoch": 0.8859685801326086, "grad_norm": 0.5470798537140039, "learning_rate": 1.6891023586871654e-05, "loss": 0.3834, "step": 1854 }, { "epoch": 0.8864464488381817, "grad_norm": 0.5126373876854253, "learning_rate": 1.688720603592432e-05, "loss": 0.3726, "step": 1855 }, { "epoch": 0.8869243175437549, "grad_norm": 0.574477636771307, "learning_rate": 1.6883386574587524e-05, "loss": 0.3726, "step": 1856 }, { "epoch": 0.887402186249328, "grad_norm": 0.5760651136056916, "learning_rate": 1.687956520392071e-05, "loss": 0.3623, "step": 1857 }, { "epoch": 0.8878800549549012, "grad_norm": 0.8682852157187648, "learning_rate": 1.6875741924983865e-05, "loss": 0.3741, "step": 1858 }, { "epoch": 0.8883579236604743, "grad_norm": 0.623287286436862, "learning_rate": 1.687191673883749e-05, "loss": 0.359, "step": 1859 }, { "epoch": 0.8888357923660475, "grad_norm": 0.5331585373043249, "learning_rate": 1.6868089646542632e-05, "loss": 0.3788, "step": 1860 }, { "epoch": 0.8893136610716206, "grad_norm": 0.5578200837319638, "learning_rate": 1.686426064916086e-05, "loss": 0.3794, "step": 1861 }, { "epoch": 0.8897915297771937, "grad_norm": 0.5422159312602259, "learning_rate": 1.6860429747754267e-05, "loss": 0.3772, "step": 1862 }, { "epoch": 0.8902693984827669, "grad_norm": 0.5228366276138506, "learning_rate": 1.685659694338548e-05, "loss": 0.3806, "step": 1863 }, { "epoch": 0.89074726718834, "grad_norm": 0.5358911036348268, "learning_rate": 1.6852762237117643e-05, "loss": 0.3834, "step": 1864 }, { "epoch": 0.8912251358939132, "grad_norm": 0.528212445670647, "learning_rate": 1.6848925630014445e-05, "loss": 0.3882, "step": 1865 }, { "epoch": 0.8917030045994863, "grad_norm": 0.5542155155507105, "learning_rate": 1.684508712314009e-05, "loss": 0.3753, "step": 1866 }, { "epoch": 0.8921808733050595, "grad_norm": 0.504104107379108, "learning_rate": 1.6841246717559316e-05, "loss": 0.3875, "step": 1867 }, { "epoch": 0.8926587420106326, "grad_norm": 0.5434960586896977, "learning_rate": 1.6837404414337374e-05, "loss": 0.3558, "step": 1868 }, { "epoch": 0.8931366107162058, "grad_norm": 0.5567958899249101, "learning_rate": 1.683356021454006e-05, "loss": 0.3672, "step": 1869 }, { "epoch": 0.8936144794217789, "grad_norm": 0.5022263534465929, "learning_rate": 1.6829714119233688e-05, "loss": 0.3537, "step": 1870 }, { "epoch": 0.894092348127352, "grad_norm": 0.549284213737301, "learning_rate": 1.6825866129485088e-05, "loss": 0.3574, "step": 1871 }, { "epoch": 0.8945702168329251, "grad_norm": 0.4705829377264607, "learning_rate": 1.6822016246361633e-05, "loss": 0.3851, "step": 1872 }, { "epoch": 0.8950480855384984, "grad_norm": 0.5062114611657378, "learning_rate": 1.681816447093121e-05, "loss": 0.3609, "step": 1873 }, { "epoch": 0.8955259542440714, "grad_norm": 0.5826140193721359, "learning_rate": 1.6814310804262225e-05, "loss": 0.3726, "step": 1874 }, { "epoch": 0.8960038229496445, "grad_norm": 0.5520132821306097, "learning_rate": 1.6810455247423634e-05, "loss": 0.3782, "step": 1875 }, { "epoch": 0.8964816916552177, "grad_norm": 0.5170660835598903, "learning_rate": 1.680659780148489e-05, "loss": 0.357, "step": 1876 }, { "epoch": 0.8969595603607908, "grad_norm": 0.583946284953352, "learning_rate": 1.680273846751598e-05, "loss": 0.3776, "step": 1877 }, { "epoch": 0.897437429066364, "grad_norm": 0.5723420342490975, "learning_rate": 1.6798877246587418e-05, "loss": 0.3895, "step": 1878 }, { "epoch": 0.8979152977719371, "grad_norm": 0.5082366243810379, "learning_rate": 1.679501413977024e-05, "loss": 0.3808, "step": 1879 }, { "epoch": 0.8983931664775103, "grad_norm": 0.507443343885867, "learning_rate": 1.6791149148136003e-05, "loss": 0.3536, "step": 1880 }, { "epoch": 0.8988710351830834, "grad_norm": 0.5703634285838638, "learning_rate": 1.6787282272756784e-05, "loss": 0.3682, "step": 1881 }, { "epoch": 0.8993489038886566, "grad_norm": 0.5158030787718338, "learning_rate": 1.6783413514705186e-05, "loss": 0.3813, "step": 1882 }, { "epoch": 0.8998267725942297, "grad_norm": 0.5015158481243369, "learning_rate": 1.677954287505434e-05, "loss": 0.3775, "step": 1883 }, { "epoch": 0.9003046412998029, "grad_norm": 0.5098825854109337, "learning_rate": 1.6775670354877888e-05, "loss": 0.3526, "step": 1884 }, { "epoch": 0.900782510005376, "grad_norm": 0.5242066060684257, "learning_rate": 1.677179595525e-05, "loss": 0.4053, "step": 1885 }, { "epoch": 0.9012603787109492, "grad_norm": 0.5351980918146104, "learning_rate": 1.6767919677245367e-05, "loss": 0.3714, "step": 1886 }, { "epoch": 0.9017382474165223, "grad_norm": 0.5328925764874355, "learning_rate": 1.6764041521939194e-05, "loss": 0.3675, "step": 1887 }, { "epoch": 0.9022161161220954, "grad_norm": 0.5494270094887949, "learning_rate": 1.6760161490407227e-05, "loss": 0.388, "step": 1888 }, { "epoch": 0.9026939848276686, "grad_norm": 0.5031329675709398, "learning_rate": 1.675627958372571e-05, "loss": 0.3811, "step": 1889 }, { "epoch": 0.9031718535332417, "grad_norm": 0.5262746250301709, "learning_rate": 1.675239580297141e-05, "loss": 0.3935, "step": 1890 }, { "epoch": 0.9036497222388149, "grad_norm": 0.5069957733853212, "learning_rate": 1.6748510149221623e-05, "loss": 0.3777, "step": 1891 }, { "epoch": 0.904127590944388, "grad_norm": 0.5428615459596646, "learning_rate": 1.6744622623554166e-05, "loss": 0.3802, "step": 1892 }, { "epoch": 0.9046054596499612, "grad_norm": 0.5266303210825704, "learning_rate": 1.6740733227047365e-05, "loss": 0.3776, "step": 1893 }, { "epoch": 0.9050833283555343, "grad_norm": 0.5348979320077515, "learning_rate": 1.673684196078007e-05, "loss": 0.3848, "step": 1894 }, { "epoch": 0.9055611970611075, "grad_norm": 0.5314119413401441, "learning_rate": 1.6732948825831657e-05, "loss": 0.364, "step": 1895 }, { "epoch": 0.9060390657666806, "grad_norm": 0.5516947593383675, "learning_rate": 1.6729053823282e-05, "loss": 0.3553, "step": 1896 }, { "epoch": 0.9065169344722538, "grad_norm": 0.5090831476794752, "learning_rate": 1.6725156954211516e-05, "loss": 0.3713, "step": 1897 }, { "epoch": 0.9069948031778269, "grad_norm": 0.6262571396693599, "learning_rate": 1.672125821970112e-05, "loss": 0.3689, "step": 1898 }, { "epoch": 0.9074726718834001, "grad_norm": 0.5193279263946532, "learning_rate": 1.6717357620832256e-05, "loss": 0.3783, "step": 1899 }, { "epoch": 0.9079505405889732, "grad_norm": 0.503064317241693, "learning_rate": 1.671345515868688e-05, "loss": 0.3793, "step": 1900 }, { "epoch": 0.9084284092945463, "grad_norm": 0.5130059203403388, "learning_rate": 1.6709550834347463e-05, "loss": 0.3715, "step": 1901 }, { "epoch": 0.9089062780001195, "grad_norm": 0.5369725030475303, "learning_rate": 1.6705644648897004e-05, "loss": 0.3592, "step": 1902 }, { "epoch": 0.9093841467056926, "grad_norm": 0.5150663544165971, "learning_rate": 1.6701736603419002e-05, "loss": 0.3637, "step": 1903 }, { "epoch": 0.9098620154112658, "grad_norm": 0.5399311902064715, "learning_rate": 1.6697826698997483e-05, "loss": 0.3594, "step": 1904 }, { "epoch": 0.9103398841168389, "grad_norm": 0.5518934019533673, "learning_rate": 1.6693914936716983e-05, "loss": 0.3682, "step": 1905 }, { "epoch": 0.9108177528224121, "grad_norm": 0.5209140835926345, "learning_rate": 1.6690001317662563e-05, "loss": 0.382, "step": 1906 }, { "epoch": 0.9112956215279852, "grad_norm": 0.5660288276754546, "learning_rate": 1.6686085842919784e-05, "loss": 0.3801, "step": 1907 }, { "epoch": 0.9117734902335584, "grad_norm": 0.47006676537993314, "learning_rate": 1.668216851357473e-05, "loss": 0.3724, "step": 1908 }, { "epoch": 0.9122513589391315, "grad_norm": 0.5577644040191538, "learning_rate": 1.667824933071401e-05, "loss": 0.3622, "step": 1909 }, { "epoch": 0.9127292276447047, "grad_norm": 0.5340230250631273, "learning_rate": 1.6674328295424723e-05, "loss": 0.369, "step": 1910 }, { "epoch": 0.9132070963502777, "grad_norm": 0.5079231886115417, "learning_rate": 1.6670405408794498e-05, "loss": 0.3715, "step": 1911 }, { "epoch": 0.913684965055851, "grad_norm": 0.5325193692349207, "learning_rate": 1.666648067191148e-05, "loss": 0.3789, "step": 1912 }, { "epoch": 0.914162833761424, "grad_norm": 0.5323003539826137, "learning_rate": 1.666255408586432e-05, "loss": 0.3818, "step": 1913 }, { "epoch": 0.9146407024669972, "grad_norm": 0.4869815217201792, "learning_rate": 1.6658625651742178e-05, "loss": 0.3733, "step": 1914 }, { "epoch": 0.9151185711725703, "grad_norm": 0.5576240488695627, "learning_rate": 1.6654695370634738e-05, "loss": 0.3589, "step": 1915 }, { "epoch": 0.9155964398781434, "grad_norm": 0.49247551368135417, "learning_rate": 1.6650763243632187e-05, "loss": 0.3585, "step": 1916 }, { "epoch": 0.9160743085837166, "grad_norm": 0.5148567366006771, "learning_rate": 1.664682927182523e-05, "loss": 0.3838, "step": 1917 }, { "epoch": 0.9165521772892897, "grad_norm": 0.49717329992717463, "learning_rate": 1.6642893456305086e-05, "loss": 0.3915, "step": 1918 }, { "epoch": 0.9170300459948629, "grad_norm": 0.5628249235481696, "learning_rate": 1.663895579816347e-05, "loss": 0.3689, "step": 1919 }, { "epoch": 0.917507914700436, "grad_norm": 0.5136903278787316, "learning_rate": 1.6635016298492628e-05, "loss": 0.3838, "step": 1920 }, { "epoch": 0.9179857834060092, "grad_norm": 0.5334625808445861, "learning_rate": 1.6631074958385304e-05, "loss": 0.374, "step": 1921 }, { "epoch": 0.9184636521115823, "grad_norm": 0.5386195590160799, "learning_rate": 1.6627131778934755e-05, "loss": 0.3751, "step": 1922 }, { "epoch": 0.9189415208171555, "grad_norm": 0.5379052116006212, "learning_rate": 1.662318676123475e-05, "loss": 0.3809, "step": 1923 }, { "epoch": 0.9194193895227286, "grad_norm": 0.5100602264626746, "learning_rate": 1.6619239906379574e-05, "loss": 0.3701, "step": 1924 }, { "epoch": 0.9198972582283018, "grad_norm": 0.5239150729526496, "learning_rate": 1.6615291215464005e-05, "loss": 0.3877, "step": 1925 }, { "epoch": 0.9203751269338749, "grad_norm": 0.6375609494234671, "learning_rate": 1.6611340689583343e-05, "loss": 0.3844, "step": 1926 }, { "epoch": 0.9208529956394481, "grad_norm": 0.5123037134022694, "learning_rate": 1.66073883298334e-05, "loss": 0.3651, "step": 1927 }, { "epoch": 0.9213308643450212, "grad_norm": 0.5110576525109256, "learning_rate": 1.6603434137310482e-05, "loss": 0.3707, "step": 1928 }, { "epoch": 0.9218087330505943, "grad_norm": 0.5021261687202652, "learning_rate": 1.6599478113111424e-05, "loss": 0.3682, "step": 1929 }, { "epoch": 0.9222866017561675, "grad_norm": 0.5371433965125264, "learning_rate": 1.6595520258333545e-05, "loss": 0.3624, "step": 1930 }, { "epoch": 0.9227644704617406, "grad_norm": 0.5366454399722713, "learning_rate": 1.659156057407469e-05, "loss": 0.3809, "step": 1931 }, { "epoch": 0.9232423391673138, "grad_norm": 0.9136617720639402, "learning_rate": 1.6587599061433207e-05, "loss": 0.3738, "step": 1932 }, { "epoch": 0.9237202078728869, "grad_norm": 0.8488713857773934, "learning_rate": 1.6583635721507944e-05, "loss": 0.3637, "step": 1933 }, { "epoch": 0.9241980765784601, "grad_norm": 0.49495459546373427, "learning_rate": 1.6579670555398268e-05, "loss": 0.367, "step": 1934 }, { "epoch": 0.9246759452840332, "grad_norm": 0.5422350097681802, "learning_rate": 1.657570356420404e-05, "loss": 0.3747, "step": 1935 }, { "epoch": 0.9251538139896064, "grad_norm": 0.5485293026148379, "learning_rate": 1.657173474902564e-05, "loss": 0.3767, "step": 1936 }, { "epoch": 0.9256316826951795, "grad_norm": 0.7216669697310142, "learning_rate": 1.6567764110963948e-05, "loss": 0.3761, "step": 1937 }, { "epoch": 0.9261095514007527, "grad_norm": 0.4879951676041884, "learning_rate": 1.6563791651120336e-05, "loss": 0.3637, "step": 1938 }, { "epoch": 0.9265874201063258, "grad_norm": 0.49428473376773735, "learning_rate": 1.6559817370596708e-05, "loss": 0.3843, "step": 1939 }, { "epoch": 0.927065288811899, "grad_norm": 0.48309373277907747, "learning_rate": 1.6555841270495456e-05, "loss": 0.3765, "step": 1940 }, { "epoch": 0.9275431575174721, "grad_norm": 0.5013632030999634, "learning_rate": 1.6551863351919478e-05, "loss": 0.3694, "step": 1941 }, { "epoch": 0.9280210262230452, "grad_norm": 0.5110638525672886, "learning_rate": 1.6547883615972176e-05, "loss": 0.363, "step": 1942 }, { "epoch": 0.9284988949286184, "grad_norm": 0.5185504025173149, "learning_rate": 1.6543902063757462e-05, "loss": 0.3801, "step": 1943 }, { "epoch": 0.9289767636341915, "grad_norm": 0.5049387315294666, "learning_rate": 1.653991869637975e-05, "loss": 0.369, "step": 1944 }, { "epoch": 0.9294546323397647, "grad_norm": 0.4912665471357074, "learning_rate": 1.6535933514943955e-05, "loss": 0.3702, "step": 1945 }, { "epoch": 0.9299325010453378, "grad_norm": 0.5118031125606555, "learning_rate": 1.653194652055549e-05, "loss": 0.3732, "step": 1946 }, { "epoch": 0.930410369750911, "grad_norm": 0.6431555235235055, "learning_rate": 1.6527957714320283e-05, "loss": 0.3725, "step": 1947 }, { "epoch": 0.930888238456484, "grad_norm": 0.5097523807572212, "learning_rate": 1.6523967097344763e-05, "loss": 0.3712, "step": 1948 }, { "epoch": 0.9313661071620573, "grad_norm": 0.5046812127135673, "learning_rate": 1.6519974670735846e-05, "loss": 0.3737, "step": 1949 }, { "epoch": 0.9318439758676303, "grad_norm": 0.5194776247965222, "learning_rate": 1.6515980435600965e-05, "loss": 0.3742, "step": 1950 }, { "epoch": 0.9323218445732036, "grad_norm": 0.5404940696555603, "learning_rate": 1.6511984393048055e-05, "loss": 0.381, "step": 1951 }, { "epoch": 0.9327997132787766, "grad_norm": 0.5553063994108789, "learning_rate": 1.6507986544185543e-05, "loss": 0.3649, "step": 1952 }, { "epoch": 0.9332775819843498, "grad_norm": 0.5499668048968661, "learning_rate": 1.650398689012236e-05, "loss": 0.3793, "step": 1953 }, { "epoch": 0.9337554506899229, "grad_norm": 0.563687647978333, "learning_rate": 1.649998543196794e-05, "loss": 0.3808, "step": 1954 }, { "epoch": 0.9342333193954961, "grad_norm": 0.545135564437546, "learning_rate": 1.6495982170832224e-05, "loss": 0.3706, "step": 1955 }, { "epoch": 0.9347111881010692, "grad_norm": 0.5303943311116299, "learning_rate": 1.6491977107825642e-05, "loss": 0.3685, "step": 1956 }, { "epoch": 0.9351890568066423, "grad_norm": 0.5619908532760816, "learning_rate": 1.648797024405912e-05, "loss": 0.3883, "step": 1957 }, { "epoch": 0.9356669255122155, "grad_norm": 0.5038859263688038, "learning_rate": 1.64839615806441e-05, "loss": 0.3766, "step": 1958 }, { "epoch": 0.9361447942177886, "grad_norm": 0.521422602845323, "learning_rate": 1.6479951118692515e-05, "loss": 0.3895, "step": 1959 }, { "epoch": 0.9366226629233618, "grad_norm": 0.5282657221332856, "learning_rate": 1.6475938859316795e-05, "loss": 0.3605, "step": 1960 }, { "epoch": 0.9371005316289349, "grad_norm": 0.6351039349256459, "learning_rate": 1.6471924803629867e-05, "loss": 0.3651, "step": 1961 }, { "epoch": 0.9375784003345081, "grad_norm": 0.5764981472703029, "learning_rate": 1.6467908952745163e-05, "loss": 0.38, "step": 1962 }, { "epoch": 0.9380562690400812, "grad_norm": 8.536515562815161, "learning_rate": 1.6463891307776606e-05, "loss": 0.3956, "step": 1963 }, { "epoch": 0.9385341377456544, "grad_norm": 0.6278724935607395, "learning_rate": 1.645987186983862e-05, "loss": 0.3739, "step": 1964 }, { "epoch": 0.9390120064512275, "grad_norm": 0.48387133354028933, "learning_rate": 1.6455850640046134e-05, "loss": 0.3888, "step": 1965 }, { "epoch": 0.9394898751568007, "grad_norm": 0.531275176529521, "learning_rate": 1.6451827619514552e-05, "loss": 0.3649, "step": 1966 }, { "epoch": 0.9399677438623738, "grad_norm": 0.569449658310809, "learning_rate": 1.6447802809359802e-05, "loss": 0.3699, "step": 1967 }, { "epoch": 0.940445612567947, "grad_norm": 0.5120339336372176, "learning_rate": 1.6443776210698288e-05, "loss": 0.3846, "step": 1968 }, { "epoch": 0.9409234812735201, "grad_norm": 0.5460326477017806, "learning_rate": 1.643974782464692e-05, "loss": 0.3465, "step": 1969 }, { "epoch": 0.9414013499790932, "grad_norm": 0.518882665215977, "learning_rate": 1.6435717652323097e-05, "loss": 0.3839, "step": 1970 }, { "epoch": 0.9418792186846664, "grad_norm": 0.49852258072808264, "learning_rate": 1.6431685694844725e-05, "loss": 0.3816, "step": 1971 }, { "epoch": 0.9423570873902395, "grad_norm": 0.5815555518919842, "learning_rate": 1.6427651953330196e-05, "loss": 0.381, "step": 1972 }, { "epoch": 0.9428349560958127, "grad_norm": 0.48319880339598503, "learning_rate": 1.6423616428898392e-05, "loss": 0.3701, "step": 1973 }, { "epoch": 0.9433128248013858, "grad_norm": 0.5167840266697357, "learning_rate": 1.6419579122668704e-05, "loss": 0.3556, "step": 1974 }, { "epoch": 0.943790693506959, "grad_norm": 0.5618010439515246, "learning_rate": 1.6415540035761008e-05, "loss": 0.3744, "step": 1975 }, { "epoch": 0.9442685622125321, "grad_norm": 0.5051139249357881, "learning_rate": 1.641149916929567e-05, "loss": 0.3876, "step": 1976 }, { "epoch": 0.9447464309181053, "grad_norm": 0.5611312662809763, "learning_rate": 1.6407456524393562e-05, "loss": 0.3686, "step": 1977 }, { "epoch": 0.9452242996236784, "grad_norm": 0.5732842999649458, "learning_rate": 1.640341210217604e-05, "loss": 0.3865, "step": 1978 }, { "epoch": 0.9457021683292516, "grad_norm": 0.550740123702534, "learning_rate": 1.6399365903764956e-05, "loss": 0.3743, "step": 1979 }, { "epoch": 0.9461800370348247, "grad_norm": 0.5379350100432237, "learning_rate": 1.639531793028265e-05, "loss": 0.3808, "step": 1980 }, { "epoch": 0.9466579057403979, "grad_norm": 0.5315866561065092, "learning_rate": 1.6391268182851963e-05, "loss": 0.3734, "step": 1981 }, { "epoch": 0.947135774445971, "grad_norm": 0.5224017338900245, "learning_rate": 1.638721666259622e-05, "loss": 0.3859, "step": 1982 }, { "epoch": 0.9476136431515441, "grad_norm": 0.5967691599781494, "learning_rate": 1.638316337063925e-05, "loss": 0.3581, "step": 1983 }, { "epoch": 0.9480915118571173, "grad_norm": 0.589119592504554, "learning_rate": 1.6379108308105354e-05, "loss": 0.3835, "step": 1984 }, { "epoch": 0.9485693805626904, "grad_norm": 0.51383408409613, "learning_rate": 1.637505147611934e-05, "loss": 0.3586, "step": 1985 }, { "epoch": 0.9490472492682636, "grad_norm": 0.4994797959754618, "learning_rate": 1.63709928758065e-05, "loss": 0.377, "step": 1986 }, { "epoch": 0.9495251179738367, "grad_norm": 0.5048066876195055, "learning_rate": 1.6366932508292618e-05, "loss": 0.3731, "step": 1987 }, { "epoch": 0.9500029866794099, "grad_norm": 0.524811235817121, "learning_rate": 1.6362870374703967e-05, "loss": 0.3793, "step": 1988 }, { "epoch": 0.950480855384983, "grad_norm": 0.5735930473974293, "learning_rate": 1.6358806476167316e-05, "loss": 0.3656, "step": 1989 }, { "epoch": 0.9509587240905562, "grad_norm": 0.4832184523655007, "learning_rate": 1.6354740813809917e-05, "loss": 0.373, "step": 1990 }, { "epoch": 0.9514365927961292, "grad_norm": 0.5128768000065408, "learning_rate": 1.635067338875951e-05, "loss": 0.3821, "step": 1991 }, { "epoch": 0.9519144615017024, "grad_norm": 0.5022274034535443, "learning_rate": 1.6346604202144326e-05, "loss": 0.3901, "step": 1992 }, { "epoch": 0.9523923302072755, "grad_norm": 0.5489970781237143, "learning_rate": 1.634253325509309e-05, "loss": 0.3858, "step": 1993 }, { "epoch": 0.9528701989128487, "grad_norm": 0.4975450932340785, "learning_rate": 1.6338460548735015e-05, "loss": 0.3686, "step": 1994 }, { "epoch": 0.9533480676184218, "grad_norm": 0.48127408958137835, "learning_rate": 1.6334386084199787e-05, "loss": 0.3745, "step": 1995 }, { "epoch": 0.9538259363239949, "grad_norm": 0.5704221304645912, "learning_rate": 1.6330309862617598e-05, "loss": 0.3613, "step": 1996 }, { "epoch": 0.9543038050295681, "grad_norm": 0.5148160729331526, "learning_rate": 1.6326231885119117e-05, "loss": 0.3827, "step": 1997 }, { "epoch": 0.9547816737351412, "grad_norm": 0.6284728203646394, "learning_rate": 1.63221521528355e-05, "loss": 0.3824, "step": 1998 }, { "epoch": 0.9552595424407144, "grad_norm": 0.49271041299162915, "learning_rate": 1.63180706668984e-05, "loss": 0.3691, "step": 1999 }, { "epoch": 0.9557374111462875, "grad_norm": 0.5390149320147591, "learning_rate": 1.631398742843995e-05, "loss": 0.3652, "step": 2000 }, { "epoch": 0.9562152798518607, "grad_norm": 0.5025205155225639, "learning_rate": 1.6309902438592762e-05, "loss": 0.3694, "step": 2001 }, { "epoch": 0.9566931485574338, "grad_norm": 0.5222794351472518, "learning_rate": 1.6305815698489938e-05, "loss": 0.3681, "step": 2002 }, { "epoch": 0.957171017263007, "grad_norm": 0.5145118353547241, "learning_rate": 1.6301727209265077e-05, "loss": 0.3719, "step": 2003 }, { "epoch": 0.9576488859685801, "grad_norm": 0.4908469670116591, "learning_rate": 1.629763697205225e-05, "loss": 0.3705, "step": 2004 }, { "epoch": 0.9581267546741533, "grad_norm": 0.5012664663409069, "learning_rate": 1.629354498798601e-05, "loss": 0.3719, "step": 2005 }, { "epoch": 0.9586046233797264, "grad_norm": 0.518588891339538, "learning_rate": 1.628945125820141e-05, "loss": 0.3709, "step": 2006 }, { "epoch": 0.9590824920852996, "grad_norm": 0.5117699235235205, "learning_rate": 1.628535578383397e-05, "loss": 0.3737, "step": 2007 }, { "epoch": 0.9595603607908727, "grad_norm": 0.5356226048939458, "learning_rate": 1.6281258566019712e-05, "loss": 0.3764, "step": 2008 }, { "epoch": 0.9600382294964459, "grad_norm": 0.5083134780109303, "learning_rate": 1.6277159605895124e-05, "loss": 0.3723, "step": 2009 }, { "epoch": 0.960516098202019, "grad_norm": 0.5023579232084872, "learning_rate": 1.627305890459719e-05, "loss": 0.3704, "step": 2010 }, { "epoch": 0.9609939669075921, "grad_norm": 0.46631779465683365, "learning_rate": 1.6268956463263372e-05, "loss": 0.3526, "step": 2011 }, { "epoch": 0.9614718356131653, "grad_norm": 0.547788682791615, "learning_rate": 1.6264852283031614e-05, "loss": 0.3779, "step": 2012 }, { "epoch": 0.9619497043187384, "grad_norm": 0.5104177809427011, "learning_rate": 1.6260746365040342e-05, "loss": 0.3729, "step": 2013 }, { "epoch": 0.9624275730243116, "grad_norm": 0.5421693818087101, "learning_rate": 1.6256638710428468e-05, "loss": 0.3698, "step": 2014 }, { "epoch": 0.9629054417298847, "grad_norm": 0.5878685955211844, "learning_rate": 1.625252932033538e-05, "loss": 0.3702, "step": 2015 }, { "epoch": 0.9633833104354579, "grad_norm": 0.5392634273182241, "learning_rate": 1.6248418195900944e-05, "loss": 0.3714, "step": 2016 }, { "epoch": 0.963861179141031, "grad_norm": 0.5041453371523026, "learning_rate": 1.6244305338265528e-05, "loss": 0.3757, "step": 2017 }, { "epoch": 0.9643390478466042, "grad_norm": 0.5611684018900728, "learning_rate": 1.6240190748569958e-05, "loss": 0.3615, "step": 2018 }, { "epoch": 0.9648169165521773, "grad_norm": 0.4958027277870733, "learning_rate": 1.6236074427955547e-05, "loss": 0.3643, "step": 2019 }, { "epoch": 0.9652947852577505, "grad_norm": 0.5065207954211904, "learning_rate": 1.6231956377564095e-05, "loss": 0.372, "step": 2020 }, { "epoch": 0.9657726539633236, "grad_norm": 0.48383262645888736, "learning_rate": 1.6227836598537874e-05, "loss": 0.3924, "step": 2021 }, { "epoch": 0.9662505226688968, "grad_norm": 0.5200139461232003, "learning_rate": 1.6223715092019636e-05, "loss": 0.3671, "step": 2022 }, { "epoch": 0.9667283913744699, "grad_norm": 0.4953988372654815, "learning_rate": 1.6219591859152618e-05, "loss": 0.3812, "step": 2023 }, { "epoch": 0.967206260080043, "grad_norm": 0.5102837388791924, "learning_rate": 1.621546690108053e-05, "loss": 0.3652, "step": 2024 }, { "epoch": 0.9676841287856162, "grad_norm": 0.5629208320653784, "learning_rate": 1.621134021894756e-05, "loss": 0.3611, "step": 2025 }, { "epoch": 0.9681619974911893, "grad_norm": 0.4952840329409754, "learning_rate": 1.6207211813898377e-05, "loss": 0.3972, "step": 2026 }, { "epoch": 0.9686398661967625, "grad_norm": 0.5037099311393208, "learning_rate": 1.6203081687078136e-05, "loss": 0.373, "step": 2027 }, { "epoch": 0.9691177349023355, "grad_norm": 0.5838992353089708, "learning_rate": 1.6198949839632453e-05, "loss": 0.3793, "step": 2028 }, { "epoch": 0.9695956036079088, "grad_norm": 0.49580068754399503, "learning_rate": 1.619481627270743e-05, "loss": 0.3613, "step": 2029 }, { "epoch": 0.9700734723134818, "grad_norm": 0.532208761854008, "learning_rate": 1.619068098744965e-05, "loss": 0.3866, "step": 2030 }, { "epoch": 0.970551341019055, "grad_norm": 0.5224695569046617, "learning_rate": 1.6186543985006164e-05, "loss": 0.3699, "step": 2031 }, { "epoch": 0.9710292097246281, "grad_norm": 0.5227062263015988, "learning_rate": 1.6182405266524507e-05, "loss": 0.3912, "step": 2032 }, { "epoch": 0.9715070784302013, "grad_norm": 0.52945280171711, "learning_rate": 1.6178264833152688e-05, "loss": 0.3403, "step": 2033 }, { "epoch": 0.9719849471357744, "grad_norm": 0.5629792708553427, "learning_rate": 1.6174122686039182e-05, "loss": 0.3784, "step": 2034 }, { "epoch": 0.9724628158413476, "grad_norm": 0.5163999238740223, "learning_rate": 1.6169978826332955e-05, "loss": 0.3659, "step": 2035 }, { "epoch": 0.9729406845469207, "grad_norm": 0.5133995557530827, "learning_rate": 1.6165833255183438e-05, "loss": 0.3694, "step": 2036 }, { "epoch": 0.9734185532524938, "grad_norm": 0.5617728810922827, "learning_rate": 1.616168597374054e-05, "loss": 0.363, "step": 2037 }, { "epoch": 0.973896421958067, "grad_norm": 0.5237674641752671, "learning_rate": 1.615753698315464e-05, "loss": 0.3803, "step": 2038 }, { "epoch": 0.9743742906636401, "grad_norm": 0.5228216583365605, "learning_rate": 1.61533862845766e-05, "loss": 0.3681, "step": 2039 }, { "epoch": 0.9748521593692133, "grad_norm": 0.4805229482972204, "learning_rate": 1.6149233879157747e-05, "loss": 0.3658, "step": 2040 }, { "epoch": 0.9753300280747864, "grad_norm": 0.5534895159460543, "learning_rate": 1.614507976804989e-05, "loss": 0.3784, "step": 2041 }, { "epoch": 0.9758078967803596, "grad_norm": 0.5486724592299036, "learning_rate": 1.6140923952405302e-05, "loss": 0.3786, "step": 2042 }, { "epoch": 0.9762857654859327, "grad_norm": 0.8080884579911872, "learning_rate": 1.6136766433376728e-05, "loss": 0.3723, "step": 2043 }, { "epoch": 0.9767636341915059, "grad_norm": 0.5359191028098362, "learning_rate": 1.6132607212117404e-05, "loss": 0.3526, "step": 2044 }, { "epoch": 0.977241502897079, "grad_norm": 0.5053369417965282, "learning_rate": 1.6128446289781012e-05, "loss": 0.3772, "step": 2045 }, { "epoch": 0.9777193716026522, "grad_norm": 0.5211994750741029, "learning_rate": 1.6124283667521727e-05, "loss": 0.3808, "step": 2046 }, { "epoch": 0.9781972403082253, "grad_norm": 0.5676346296630453, "learning_rate": 1.612011934649418e-05, "loss": 0.3715, "step": 2047 }, { "epoch": 0.9786751090137985, "grad_norm": 0.5248006849417738, "learning_rate": 1.611595332785348e-05, "loss": 0.3553, "step": 2048 }, { "epoch": 0.9791529777193716, "grad_norm": 0.508623732364023, "learning_rate": 1.6111785612755214e-05, "loss": 0.3754, "step": 2049 }, { "epoch": 0.9796308464249448, "grad_norm": 0.5266899133269225, "learning_rate": 1.610761620235543e-05, "loss": 0.3859, "step": 2050 }, { "epoch": 0.9801087151305179, "grad_norm": 0.4751459644310755, "learning_rate": 1.610344509781065e-05, "loss": 0.387, "step": 2051 }, { "epoch": 0.980586583836091, "grad_norm": 0.5316863726383205, "learning_rate": 1.609927230027786e-05, "loss": 0.3959, "step": 2052 }, { "epoch": 0.9810644525416642, "grad_norm": 0.5453039434436993, "learning_rate": 1.609509781091452e-05, "loss": 0.3697, "step": 2053 }, { "epoch": 0.9815423212472373, "grad_norm": 0.4825716401466539, "learning_rate": 1.6090921630878568e-05, "loss": 0.3835, "step": 2054 }, { "epoch": 0.9820201899528105, "grad_norm": 0.5178510139332422, "learning_rate": 1.60867437613284e-05, "loss": 0.3717, "step": 2055 }, { "epoch": 0.9824980586583836, "grad_norm": 0.524887791800861, "learning_rate": 1.6082564203422876e-05, "loss": 0.3653, "step": 2056 }, { "epoch": 0.9829759273639568, "grad_norm": 0.5213297178508581, "learning_rate": 1.6078382958321336e-05, "loss": 0.3725, "step": 2057 }, { "epoch": 0.9834537960695299, "grad_norm": 0.5282681790101221, "learning_rate": 1.6074200027183584e-05, "loss": 0.3695, "step": 2058 }, { "epoch": 0.9839316647751031, "grad_norm": 0.489145805606624, "learning_rate": 1.6070015411169896e-05, "loss": 0.3623, "step": 2059 }, { "epoch": 0.9844095334806762, "grad_norm": 0.5331793559039849, "learning_rate": 1.6065829111441e-05, "loss": 0.3556, "step": 2060 }, { "epoch": 0.9848874021862494, "grad_norm": 0.5443653643393922, "learning_rate": 1.6061641129158112e-05, "loss": 0.3988, "step": 2061 }, { "epoch": 0.9853652708918225, "grad_norm": 0.8351330747470432, "learning_rate": 1.60574514654829e-05, "loss": 0.3804, "step": 2062 }, { "epoch": 0.9858431395973957, "grad_norm": 0.5030615466689651, "learning_rate": 1.6053260121577503e-05, "loss": 0.3618, "step": 2063 }, { "epoch": 0.9863210083029688, "grad_norm": 0.5471550674629175, "learning_rate": 1.6049067098604523e-05, "loss": 0.3557, "step": 2064 }, { "epoch": 0.9867988770085419, "grad_norm": 0.48069192998466753, "learning_rate": 1.6044872397727037e-05, "loss": 0.3612, "step": 2065 }, { "epoch": 0.9872767457141151, "grad_norm": 0.5253037063797116, "learning_rate": 1.6040676020108577e-05, "loss": 0.3655, "step": 2066 }, { "epoch": 0.9877546144196881, "grad_norm": 0.5275280865811858, "learning_rate": 1.6036477966913143e-05, "loss": 0.3852, "step": 2067 }, { "epoch": 0.9882324831252614, "grad_norm": 0.47485994365355333, "learning_rate": 1.6032278239305204e-05, "loss": 0.3638, "step": 2068 }, { "epoch": 0.9887103518308344, "grad_norm": 0.5362179099114655, "learning_rate": 1.6028076838449692e-05, "loss": 0.3725, "step": 2069 }, { "epoch": 0.9891882205364076, "grad_norm": 0.5119867606693408, "learning_rate": 1.6023873765511993e-05, "loss": 0.3757, "step": 2070 }, { "epoch": 0.9896660892419807, "grad_norm": 0.5570349801753108, "learning_rate": 1.6019669021657972e-05, "loss": 0.362, "step": 2071 }, { "epoch": 0.9901439579475539, "grad_norm": 0.5023908521667267, "learning_rate": 1.601546260805395e-05, "loss": 0.3664, "step": 2072 }, { "epoch": 0.990621826653127, "grad_norm": 0.4919070804327344, "learning_rate": 1.6011254525866715e-05, "loss": 0.3746, "step": 2073 }, { "epoch": 0.9910996953587002, "grad_norm": 0.5274265284295131, "learning_rate": 1.600704477626351e-05, "loss": 0.3554, "step": 2074 }, { "epoch": 0.9915775640642733, "grad_norm": 0.4973674393943186, "learning_rate": 1.6002833360412044e-05, "loss": 0.3734, "step": 2075 }, { "epoch": 0.9920554327698465, "grad_norm": 0.6994137247566886, "learning_rate": 1.599862027948049e-05, "loss": 0.3785, "step": 2076 }, { "epoch": 0.9925333014754196, "grad_norm": 0.5005262403669962, "learning_rate": 1.5994405534637487e-05, "loss": 0.3508, "step": 2077 }, { "epoch": 0.9930111701809927, "grad_norm": 0.48923707105340497, "learning_rate": 1.5990189127052128e-05, "loss": 0.3767, "step": 2078 }, { "epoch": 0.9934890388865659, "grad_norm": 0.5287580848621545, "learning_rate": 1.5985971057893973e-05, "loss": 0.3754, "step": 2079 }, { "epoch": 0.993966907592139, "grad_norm": 0.4820940737829678, "learning_rate": 1.5981751328333036e-05, "loss": 0.3631, "step": 2080 }, { "epoch": 0.9944447762977122, "grad_norm": 0.47169419878350755, "learning_rate": 1.5977529939539794e-05, "loss": 0.3737, "step": 2081 }, { "epoch": 0.9949226450032853, "grad_norm": 0.4819615935924389, "learning_rate": 1.597330689268519e-05, "loss": 0.3776, "step": 2082 }, { "epoch": 0.9954005137088585, "grad_norm": 0.4915767151527447, "learning_rate": 1.5969082188940623e-05, "loss": 0.3855, "step": 2083 }, { "epoch": 0.9958783824144316, "grad_norm": 0.5283871739026111, "learning_rate": 1.5964855829477946e-05, "loss": 0.3643, "step": 2084 }, { "epoch": 0.9963562511200048, "grad_norm": 0.5657962366230493, "learning_rate": 1.5960627815469486e-05, "loss": 0.3671, "step": 2085 }, { "epoch": 0.9968341198255779, "grad_norm": 0.47929648182468015, "learning_rate": 1.5956398148088007e-05, "loss": 0.3733, "step": 2086 }, { "epoch": 0.9973119885311511, "grad_norm": 0.5282895558309355, "learning_rate": 1.5952166828506754e-05, "loss": 0.3546, "step": 2087 }, { "epoch": 0.9977898572367242, "grad_norm": 0.49273301808521663, "learning_rate": 1.5947933857899418e-05, "loss": 0.3794, "step": 2088 }, { "epoch": 0.9982677259422974, "grad_norm": 0.5307309337337829, "learning_rate": 1.594369923744015e-05, "loss": 0.367, "step": 2089 }, { "epoch": 0.9987455946478705, "grad_norm": 0.5584015875131911, "learning_rate": 1.5939462968303554e-05, "loss": 0.3699, "step": 2090 }, { "epoch": 0.9992234633534436, "grad_norm": 0.5866978710305609, "learning_rate": 1.5935225051664708e-05, "loss": 0.3759, "step": 2091 }, { "epoch": 0.9997013320590168, "grad_norm": 0.48873807272784037, "learning_rate": 1.593098548869912e-05, "loss": 0.3608, "step": 2092 }, { "epoch": 1.0, "grad_norm": 0.5723334542119868, "learning_rate": 1.5926744280582786e-05, "loss": 0.3643, "step": 2093 }, { "epoch": 1.000477868705573, "grad_norm": 0.6300673986894956, "learning_rate": 1.5922501428492126e-05, "loss": 0.333, "step": 2094 }, { "epoch": 1.0009557374111462, "grad_norm": 0.5750968585044649, "learning_rate": 1.5918256933604047e-05, "loss": 0.3407, "step": 2095 }, { "epoch": 1.0014336061167195, "grad_norm": 0.5649190535406365, "learning_rate": 1.591401079709589e-05, "loss": 0.3313, "step": 2096 }, { "epoch": 1.0019114748222926, "grad_norm": 0.6470098562888061, "learning_rate": 1.590976302014546e-05, "loss": 0.3193, "step": 2097 }, { "epoch": 1.0023893435278657, "grad_norm": 0.6124457488598071, "learning_rate": 1.5905513603931013e-05, "loss": 0.3453, "step": 2098 }, { "epoch": 1.0028672122334388, "grad_norm": 0.5569653564403705, "learning_rate": 1.5901262549631266e-05, "loss": 0.3308, "step": 2099 }, { "epoch": 1.003345080939012, "grad_norm": 0.5630136553754124, "learning_rate": 1.589700985842538e-05, "loss": 0.3401, "step": 2100 }, { "epoch": 1.0038229496445852, "grad_norm": 0.5491390499825274, "learning_rate": 1.5892755531492986e-05, "loss": 0.3368, "step": 2101 }, { "epoch": 1.0043008183501583, "grad_norm": 0.5802336855773184, "learning_rate": 1.5888499570014152e-05, "loss": 0.3379, "step": 2102 }, { "epoch": 1.0047786870557314, "grad_norm": 0.5027412997145168, "learning_rate": 1.5884241975169406e-05, "loss": 0.3239, "step": 2103 }, { "epoch": 1.0052565557613047, "grad_norm": 0.5219023282328462, "learning_rate": 1.5879982748139738e-05, "loss": 0.3316, "step": 2104 }, { "epoch": 1.0057344244668778, "grad_norm": 0.6198051403881502, "learning_rate": 1.5875721890106574e-05, "loss": 0.3355, "step": 2105 }, { "epoch": 1.0062122931724509, "grad_norm": 0.5823835508633309, "learning_rate": 1.58714594022518e-05, "loss": 0.3421, "step": 2106 }, { "epoch": 1.006690161878024, "grad_norm": 0.6314654286023943, "learning_rate": 1.586719528575776e-05, "loss": 0.3492, "step": 2107 }, { "epoch": 1.007168030583597, "grad_norm": 0.5494826948922337, "learning_rate": 1.5862929541807247e-05, "loss": 0.3187, "step": 2108 }, { "epoch": 1.0076458992891704, "grad_norm": 0.49367247951130727, "learning_rate": 1.5858662171583495e-05, "loss": 0.343, "step": 2109 }, { "epoch": 1.0081237679947435, "grad_norm": 0.5867003697639408, "learning_rate": 1.5854393176270205e-05, "loss": 0.3222, "step": 2110 }, { "epoch": 1.0086016367003166, "grad_norm": 0.5202276726676689, "learning_rate": 1.585012255705152e-05, "loss": 0.3395, "step": 2111 }, { "epoch": 1.0090795054058896, "grad_norm": 0.5331521180511377, "learning_rate": 1.5845850315112025e-05, "loss": 0.304, "step": 2112 }, { "epoch": 1.009557374111463, "grad_norm": 0.5838843873042685, "learning_rate": 1.5841576451636777e-05, "loss": 0.3392, "step": 2113 }, { "epoch": 1.010035242817036, "grad_norm": 0.5162331359042389, "learning_rate": 1.5837300967811258e-05, "loss": 0.343, "step": 2114 }, { "epoch": 1.0105131115226091, "grad_norm": 0.6115190735592178, "learning_rate": 1.5833023864821427e-05, "loss": 0.321, "step": 2115 }, { "epoch": 1.0109909802281822, "grad_norm": 0.5925028855716654, "learning_rate": 1.5828745143853665e-05, "loss": 0.3393, "step": 2116 }, { "epoch": 1.0114688489337555, "grad_norm": 0.6132879239893447, "learning_rate": 1.5824464806094817e-05, "loss": 0.3292, "step": 2117 }, { "epoch": 1.0119467176393286, "grad_norm": 0.5969110143142136, "learning_rate": 1.5820182852732177e-05, "loss": 0.3234, "step": 2118 }, { "epoch": 1.0124245863449017, "grad_norm": 0.6131274616744073, "learning_rate": 1.5815899284953477e-05, "loss": 0.3437, "step": 2119 }, { "epoch": 1.0129024550504748, "grad_norm": 0.5161682873500958, "learning_rate": 1.5811614103946905e-05, "loss": 0.3366, "step": 2120 }, { "epoch": 1.013380323756048, "grad_norm": 0.5311306907505919, "learning_rate": 1.5807327310901096e-05, "loss": 0.338, "step": 2121 }, { "epoch": 1.0138581924616212, "grad_norm": 0.5252428625482294, "learning_rate": 1.580303890700513e-05, "loss": 0.3215, "step": 2122 }, { "epoch": 1.0143360611671943, "grad_norm": 0.5329298201469379, "learning_rate": 1.579874889344854e-05, "loss": 0.3231, "step": 2123 }, { "epoch": 1.0148139298727674, "grad_norm": 0.8494224841188093, "learning_rate": 1.579445727142129e-05, "loss": 0.3209, "step": 2124 }, { "epoch": 1.0152917985783405, "grad_norm": 0.575938313407013, "learning_rate": 1.5790164042113805e-05, "loss": 0.314, "step": 2125 }, { "epoch": 1.0157696672839138, "grad_norm": 0.658256081712634, "learning_rate": 1.5785869206716957e-05, "loss": 0.3192, "step": 2126 }, { "epoch": 1.016247535989487, "grad_norm": 0.7068653729035548, "learning_rate": 1.578157276642205e-05, "loss": 0.3306, "step": 2127 }, { "epoch": 1.01672540469506, "grad_norm": 0.5411767755265546, "learning_rate": 1.577727472242084e-05, "loss": 0.3233, "step": 2128 }, { "epoch": 1.017203273400633, "grad_norm": 0.6005936988154575, "learning_rate": 1.577297507590553e-05, "loss": 0.3284, "step": 2129 }, { "epoch": 1.0176811421062064, "grad_norm": 0.5499614192701662, "learning_rate": 1.576867382806877e-05, "loss": 0.32, "step": 2130 }, { "epoch": 1.0181590108117795, "grad_norm": 0.5390243829826628, "learning_rate": 1.5764370980103652e-05, "loss": 0.3355, "step": 2131 }, { "epoch": 1.0186368795173526, "grad_norm": 0.5867439844608485, "learning_rate": 1.57600665332037e-05, "loss": 0.3323, "step": 2132 }, { "epoch": 1.0191147482229257, "grad_norm": 0.567986991755531, "learning_rate": 1.5755760488562898e-05, "loss": 0.336, "step": 2133 }, { "epoch": 1.0195926169284988, "grad_norm": 0.5275519852089237, "learning_rate": 1.575145284737567e-05, "loss": 0.3272, "step": 2134 }, { "epoch": 1.020070485634072, "grad_norm": 0.5116807564883972, "learning_rate": 1.5747143610836873e-05, "loss": 0.3287, "step": 2135 }, { "epoch": 1.0205483543396452, "grad_norm": 0.5309021872208404, "learning_rate": 1.5742832780141816e-05, "loss": 0.3517, "step": 2136 }, { "epoch": 1.0210262230452183, "grad_norm": 0.5056039849696519, "learning_rate": 1.573852035648625e-05, "loss": 0.3317, "step": 2137 }, { "epoch": 1.0215040917507914, "grad_norm": 0.5168796985969982, "learning_rate": 1.5734206341066363e-05, "loss": 0.3287, "step": 2138 }, { "epoch": 1.0219819604563647, "grad_norm": 0.4971819258293254, "learning_rate": 1.5729890735078782e-05, "loss": 0.3465, "step": 2139 }, { "epoch": 1.0224598291619378, "grad_norm": 0.5362308867880675, "learning_rate": 1.5725573539720592e-05, "loss": 0.3321, "step": 2140 }, { "epoch": 1.0229376978675109, "grad_norm": 0.5409439946440039, "learning_rate": 1.5721254756189293e-05, "loss": 0.3385, "step": 2141 }, { "epoch": 1.023415566573084, "grad_norm": 0.5435540704296682, "learning_rate": 1.5716934385682847e-05, "loss": 0.3117, "step": 2142 }, { "epoch": 1.0238934352786573, "grad_norm": 0.5353171577649578, "learning_rate": 1.5712612429399648e-05, "loss": 0.3213, "step": 2143 }, { "epoch": 1.0243713039842304, "grad_norm": 0.5572982587837408, "learning_rate": 1.570828888853853e-05, "loss": 0.325, "step": 2144 }, { "epoch": 1.0248491726898035, "grad_norm": 0.5032264715461846, "learning_rate": 1.570396376429877e-05, "loss": 0.3253, "step": 2145 }, { "epoch": 1.0253270413953766, "grad_norm": 0.5157482930173531, "learning_rate": 1.569963705788007e-05, "loss": 0.3426, "step": 2146 }, { "epoch": 1.0258049101009497, "grad_norm": 0.5008263600349416, "learning_rate": 1.56953087704826e-05, "loss": 0.343, "step": 2147 }, { "epoch": 1.026282778806523, "grad_norm": 0.5392376218710371, "learning_rate": 1.5690978903306936e-05, "loss": 0.333, "step": 2148 }, { "epoch": 1.026760647512096, "grad_norm": 0.49909224648803696, "learning_rate": 1.5686647457554108e-05, "loss": 0.3223, "step": 2149 }, { "epoch": 1.0272385162176692, "grad_norm": 0.5514337816765721, "learning_rate": 1.5682314434425593e-05, "loss": 0.3257, "step": 2150 }, { "epoch": 1.0277163849232422, "grad_norm": 0.5560498032013678, "learning_rate": 1.5677979835123282e-05, "loss": 0.3172, "step": 2151 }, { "epoch": 1.0281942536288156, "grad_norm": 0.497633056500836, "learning_rate": 1.5673643660849525e-05, "loss": 0.3472, "step": 2152 }, { "epoch": 1.0286721223343886, "grad_norm": 0.5090101129547484, "learning_rate": 1.5669305912807095e-05, "loss": 0.3431, "step": 2153 }, { "epoch": 1.0291499910399617, "grad_norm": 0.5115208530872307, "learning_rate": 1.5664966592199213e-05, "loss": 0.3179, "step": 2154 }, { "epoch": 1.0296278597455348, "grad_norm": 0.4941887725586954, "learning_rate": 1.5660625700229526e-05, "loss": 0.3199, "step": 2155 }, { "epoch": 1.0301057284511081, "grad_norm": 0.5051050314513967, "learning_rate": 1.5656283238102125e-05, "loss": 0.3503, "step": 2156 }, { "epoch": 1.0305835971566812, "grad_norm": 0.47274872720440475, "learning_rate": 1.5651939207021522e-05, "loss": 0.3417, "step": 2157 }, { "epoch": 1.0310614658622543, "grad_norm": 0.5026973690751055, "learning_rate": 1.5647593608192685e-05, "loss": 0.3307, "step": 2158 }, { "epoch": 1.0315393345678274, "grad_norm": 0.506764397545058, "learning_rate": 1.5643246442821004e-05, "loss": 0.3446, "step": 2159 }, { "epoch": 1.0320172032734007, "grad_norm": 0.5727967952694278, "learning_rate": 1.5638897712112303e-05, "loss": 0.3459, "step": 2160 }, { "epoch": 1.0324950719789738, "grad_norm": 0.59780425526287, "learning_rate": 1.5634547417272847e-05, "loss": 0.3292, "step": 2161 }, { "epoch": 1.032972940684547, "grad_norm": 0.5022372242667101, "learning_rate": 1.5630195559509326e-05, "loss": 0.3218, "step": 2162 }, { "epoch": 1.03345080939012, "grad_norm": 0.5536484199445086, "learning_rate": 1.562584214002887e-05, "loss": 0.3037, "step": 2163 }, { "epoch": 1.033928678095693, "grad_norm": 0.5211875586311786, "learning_rate": 1.562148716003905e-05, "loss": 0.347, "step": 2164 }, { "epoch": 1.0344065468012664, "grad_norm": 0.5504469873275528, "learning_rate": 1.561713062074785e-05, "loss": 0.3189, "step": 2165 }, { "epoch": 1.0348844155068395, "grad_norm": 0.5578520136556673, "learning_rate": 1.56127725233637e-05, "loss": 0.3397, "step": 2166 }, { "epoch": 1.0353622842124126, "grad_norm": 0.547653966471447, "learning_rate": 1.560841286909546e-05, "loss": 0.3355, "step": 2167 }, { "epoch": 1.0358401529179857, "grad_norm": 0.5470544888308998, "learning_rate": 1.5604051659152418e-05, "loss": 0.3302, "step": 2168 }, { "epoch": 1.036318021623559, "grad_norm": 0.5805024891652473, "learning_rate": 1.5599688894744304e-05, "loss": 0.3335, "step": 2169 }, { "epoch": 1.036795890329132, "grad_norm": 0.5136197878985662, "learning_rate": 1.5595324577081265e-05, "loss": 0.3295, "step": 2170 }, { "epoch": 1.0372737590347052, "grad_norm": 0.56863395023198, "learning_rate": 1.5590958707373886e-05, "loss": 0.3175, "step": 2171 }, { "epoch": 1.0377516277402783, "grad_norm": 0.5502967372343314, "learning_rate": 1.558659128683319e-05, "loss": 0.337, "step": 2172 }, { "epoch": 1.0382294964458516, "grad_norm": 0.5135053364771395, "learning_rate": 1.558222231667061e-05, "loss": 0.3324, "step": 2173 }, { "epoch": 1.0387073651514247, "grad_norm": 0.537644706467191, "learning_rate": 1.5577851798098032e-05, "loss": 0.3261, "step": 2174 }, { "epoch": 1.0391852338569978, "grad_norm": 0.5382510505587826, "learning_rate": 1.5573479732327758e-05, "loss": 0.338, "step": 2175 }, { "epoch": 1.0396631025625709, "grad_norm": 0.49204890495941894, "learning_rate": 1.556910612057252e-05, "loss": 0.3477, "step": 2176 }, { "epoch": 1.040140971268144, "grad_norm": 0.523151317823713, "learning_rate": 1.5564730964045476e-05, "loss": 0.352, "step": 2177 }, { "epoch": 1.0406188399737173, "grad_norm": 0.4894794188961839, "learning_rate": 1.556035426396023e-05, "loss": 0.3135, "step": 2178 }, { "epoch": 1.0410967086792904, "grad_norm": 0.6266176917586536, "learning_rate": 1.555597602153079e-05, "loss": 0.3143, "step": 2179 }, { "epoch": 1.0415745773848635, "grad_norm": 0.5309556876141732, "learning_rate": 1.555159623797161e-05, "loss": 0.3403, "step": 2180 }, { "epoch": 1.0420524460904366, "grad_norm": 0.4931662085144718, "learning_rate": 1.554721491449756e-05, "loss": 0.3397, "step": 2181 }, { "epoch": 1.0425303147960099, "grad_norm": 0.5645039752023514, "learning_rate": 1.5542832052323943e-05, "loss": 0.3396, "step": 2182 }, { "epoch": 1.043008183501583, "grad_norm": 0.49990356604600616, "learning_rate": 1.553844765266649e-05, "loss": 0.339, "step": 2183 }, { "epoch": 1.043486052207156, "grad_norm": 0.5468317171758479, "learning_rate": 1.5534061716741358e-05, "loss": 0.3219, "step": 2184 }, { "epoch": 1.0439639209127292, "grad_norm": 1.455991273998016, "learning_rate": 1.552967424576512e-05, "loss": 0.3444, "step": 2185 }, { "epoch": 1.0444417896183025, "grad_norm": 0.5138914378146582, "learning_rate": 1.5525285240954793e-05, "loss": 0.3437, "step": 2186 }, { "epoch": 1.0449196583238756, "grad_norm": 0.4771708845133923, "learning_rate": 1.55208947035278e-05, "loss": 0.3401, "step": 2187 }, { "epoch": 1.0453975270294487, "grad_norm": 17.211331468923103, "learning_rate": 1.5516502634702003e-05, "loss": 0.3233, "step": 2188 }, { "epoch": 1.0458753957350218, "grad_norm": 0.6438223472017501, "learning_rate": 1.5512109035695688e-05, "loss": 0.3215, "step": 2189 }, { "epoch": 1.0463532644405948, "grad_norm": 0.5071979054947092, "learning_rate": 1.5507713907727557e-05, "loss": 0.3381, "step": 2190 }, { "epoch": 1.0468311331461682, "grad_norm": 0.5360652181387802, "learning_rate": 1.550331725201674e-05, "loss": 0.3309, "step": 2191 }, { "epoch": 1.0473090018517413, "grad_norm": 0.5811188677891611, "learning_rate": 1.54989190697828e-05, "loss": 0.3271, "step": 2192 }, { "epoch": 1.0477868705573143, "grad_norm": 0.49109998644777375, "learning_rate": 1.5494519362245702e-05, "loss": 0.3217, "step": 2193 }, { "epoch": 1.0482647392628874, "grad_norm": 0.5249085574500518, "learning_rate": 1.549011813062586e-05, "loss": 0.3363, "step": 2194 }, { "epoch": 1.0487426079684607, "grad_norm": 0.5450121620506873, "learning_rate": 1.5485715376144087e-05, "loss": 0.3368, "step": 2195 }, { "epoch": 1.0492204766740338, "grad_norm": 0.5333106429783062, "learning_rate": 1.5481311100021642e-05, "loss": 0.3482, "step": 2196 }, { "epoch": 1.049698345379607, "grad_norm": 0.5132325689081423, "learning_rate": 1.5476905303480183e-05, "loss": 0.3235, "step": 2197 }, { "epoch": 1.05017621408518, "grad_norm": 0.5603270076262434, "learning_rate": 1.5472497987741803e-05, "loss": 0.3245, "step": 2198 }, { "epoch": 1.0506540827907533, "grad_norm": 0.5083807526273509, "learning_rate": 1.5468089154029016e-05, "loss": 0.3382, "step": 2199 }, { "epoch": 1.0511319514963264, "grad_norm": 0.5338317283874571, "learning_rate": 1.5463678803564753e-05, "loss": 0.3225, "step": 2200 }, { "epoch": 1.0516098202018995, "grad_norm": 0.5268204769201611, "learning_rate": 1.5459266937572367e-05, "loss": 0.3419, "step": 2201 }, { "epoch": 1.0520876889074726, "grad_norm": 0.4795433149023876, "learning_rate": 1.5454853557275632e-05, "loss": 0.3464, "step": 2202 }, { "epoch": 1.0525655576130457, "grad_norm": 0.5808022832179441, "learning_rate": 1.5450438663898743e-05, "loss": 0.3177, "step": 2203 }, { "epoch": 1.053043426318619, "grad_norm": 0.5145172026904812, "learning_rate": 1.5446022258666313e-05, "loss": 0.3361, "step": 2204 }, { "epoch": 1.0535212950241921, "grad_norm": 0.47386368364154696, "learning_rate": 1.5441604342803374e-05, "loss": 0.3188, "step": 2205 }, { "epoch": 1.0539991637297652, "grad_norm": 0.6038585929683551, "learning_rate": 1.5437184917535377e-05, "loss": 0.3415, "step": 2206 }, { "epoch": 1.0544770324353383, "grad_norm": 0.51095983032144, "learning_rate": 1.5432763984088195e-05, "loss": 0.328, "step": 2207 }, { "epoch": 1.0549549011409116, "grad_norm": 0.49778348332654354, "learning_rate": 1.5428341543688116e-05, "loss": 0.3249, "step": 2208 }, { "epoch": 1.0554327698464847, "grad_norm": 0.5813291087757343, "learning_rate": 1.542391759756185e-05, "loss": 0.3324, "step": 2209 }, { "epoch": 1.0559106385520578, "grad_norm": 0.5194247870224709, "learning_rate": 1.5419492146936518e-05, "loss": 0.3412, "step": 2210 }, { "epoch": 1.056388507257631, "grad_norm": 0.49251753974846363, "learning_rate": 1.5415065193039658e-05, "loss": 0.3357, "step": 2211 }, { "epoch": 1.0568663759632042, "grad_norm": 0.5290839901414063, "learning_rate": 1.5410636737099238e-05, "loss": 0.327, "step": 2212 }, { "epoch": 1.0573442446687773, "grad_norm": 0.5447923991162895, "learning_rate": 1.5406206780343626e-05, "loss": 0.3332, "step": 2213 }, { "epoch": 1.0578221133743504, "grad_norm": 0.563508010318956, "learning_rate": 1.540177532400162e-05, "loss": 0.3398, "step": 2214 }, { "epoch": 1.0582999820799235, "grad_norm": 0.5608591424899783, "learning_rate": 1.5397342369302425e-05, "loss": 0.3121, "step": 2215 }, { "epoch": 1.0587778507854968, "grad_norm": 0.5390541825320886, "learning_rate": 1.5392907917475662e-05, "loss": 0.3263, "step": 2216 }, { "epoch": 1.05925571949107, "grad_norm": 0.6375454291641146, "learning_rate": 1.5388471969751373e-05, "loss": 0.3286, "step": 2217 }, { "epoch": 1.059733588196643, "grad_norm": 0.6197563874990852, "learning_rate": 1.538403452736001e-05, "loss": 0.3375, "step": 2218 }, { "epoch": 1.060211456902216, "grad_norm": 0.6485799859247667, "learning_rate": 1.5379595591532442e-05, "loss": 0.3498, "step": 2219 }, { "epoch": 1.0606893256077892, "grad_norm": 0.5016640951997636, "learning_rate": 1.5375155163499953e-05, "loss": 0.3103, "step": 2220 }, { "epoch": 1.0611671943133625, "grad_norm": 0.505528792203374, "learning_rate": 1.5370713244494235e-05, "loss": 0.3398, "step": 2221 }, { "epoch": 1.0616450630189356, "grad_norm": 0.6607061590143123, "learning_rate": 1.53662698357474e-05, "loss": 0.3382, "step": 2222 }, { "epoch": 1.0621229317245087, "grad_norm": 0.5195925433144004, "learning_rate": 1.536182493849198e-05, "loss": 0.3231, "step": 2223 }, { "epoch": 1.0626008004300818, "grad_norm": 0.5789827056376714, "learning_rate": 1.5357378553960892e-05, "loss": 0.3313, "step": 2224 }, { "epoch": 1.063078669135655, "grad_norm": 0.5365461420949971, "learning_rate": 1.5352930683387502e-05, "loss": 0.3299, "step": 2225 }, { "epoch": 1.0635565378412282, "grad_norm": 0.512379234100911, "learning_rate": 1.5348481328005566e-05, "loss": 0.321, "step": 2226 }, { "epoch": 1.0640344065468013, "grad_norm": 0.5335894256345767, "learning_rate": 1.534403048904925e-05, "loss": 0.3451, "step": 2227 }, { "epoch": 1.0645122752523744, "grad_norm": 0.5464542705568193, "learning_rate": 1.5339578167753145e-05, "loss": 0.3389, "step": 2228 }, { "epoch": 1.0649901439579477, "grad_norm": 0.5232741311541103, "learning_rate": 1.5335124365352246e-05, "loss": 0.3342, "step": 2229 }, { "epoch": 1.0654680126635208, "grad_norm": 0.5900396821697615, "learning_rate": 1.5330669083081956e-05, "loss": 0.3391, "step": 2230 }, { "epoch": 1.0659458813690939, "grad_norm": 0.5093924605652755, "learning_rate": 1.5326212322178097e-05, "loss": 0.3286, "step": 2231 }, { "epoch": 1.066423750074667, "grad_norm": 0.5716841100121078, "learning_rate": 1.5321754083876893e-05, "loss": 0.323, "step": 2232 }, { "epoch": 1.06690161878024, "grad_norm": 0.4842785242623549, "learning_rate": 1.5317294369414975e-05, "loss": 0.3408, "step": 2233 }, { "epoch": 1.0673794874858133, "grad_norm": 0.513994095767651, "learning_rate": 1.53128331800294e-05, "loss": 0.3466, "step": 2234 }, { "epoch": 1.0678573561913864, "grad_norm": 0.5758288986174495, "learning_rate": 1.5308370516957617e-05, "loss": 0.3217, "step": 2235 }, { "epoch": 1.0683352248969595, "grad_norm": 0.5200971785900195, "learning_rate": 1.5303906381437487e-05, "loss": 0.3251, "step": 2236 }, { "epoch": 1.0688130936025326, "grad_norm": 0.5033891597260092, "learning_rate": 1.529944077470729e-05, "loss": 0.3236, "step": 2237 }, { "epoch": 1.069290962308106, "grad_norm": 0.49510942051055734, "learning_rate": 1.52949736980057e-05, "loss": 0.3232, "step": 2238 }, { "epoch": 1.069768831013679, "grad_norm": 0.49918931484754825, "learning_rate": 1.529050515257181e-05, "loss": 0.3292, "step": 2239 }, { "epoch": 1.0702466997192521, "grad_norm": 0.5263649316343764, "learning_rate": 1.528603513964511e-05, "loss": 0.3186, "step": 2240 }, { "epoch": 1.0707245684248252, "grad_norm": 0.506050997373646, "learning_rate": 1.528156366046551e-05, "loss": 0.3386, "step": 2241 }, { "epoch": 1.0712024371303985, "grad_norm": 0.4759187913918262, "learning_rate": 1.5277090716273313e-05, "loss": 0.3244, "step": 2242 }, { "epoch": 1.0716803058359716, "grad_norm": 0.5170077950073158, "learning_rate": 1.527261630830924e-05, "loss": 0.34, "step": 2243 }, { "epoch": 1.0721581745415447, "grad_norm": 0.5371316049130486, "learning_rate": 1.526814043781441e-05, "loss": 0.3139, "step": 2244 }, { "epoch": 1.0726360432471178, "grad_norm": 0.4661135714534529, "learning_rate": 1.5263663106030347e-05, "loss": 0.3431, "step": 2245 }, { "epoch": 1.073113911952691, "grad_norm": 0.5050579846579514, "learning_rate": 1.5259184314198995e-05, "loss": 0.3182, "step": 2246 }, { "epoch": 1.0735917806582642, "grad_norm": 0.49161171778636, "learning_rate": 1.5254704063562678e-05, "loss": 0.3662, "step": 2247 }, { "epoch": 1.0740696493638373, "grad_norm": 0.6153694810824738, "learning_rate": 1.5250222355364149e-05, "loss": 0.3144, "step": 2248 }, { "epoch": 1.0745475180694104, "grad_norm": 0.4888202707715523, "learning_rate": 1.5245739190846549e-05, "loss": 0.3398, "step": 2249 }, { "epoch": 1.0750253867749835, "grad_norm": 0.5124355341785777, "learning_rate": 1.5241254571253433e-05, "loss": 0.3548, "step": 2250 }, { "epoch": 1.0755032554805568, "grad_norm": 0.5077616262970712, "learning_rate": 1.5236768497828753e-05, "loss": 0.3502, "step": 2251 }, { "epoch": 1.07598112418613, "grad_norm": 0.53688126358671, "learning_rate": 1.5232280971816864e-05, "loss": 0.3308, "step": 2252 }, { "epoch": 1.076458992891703, "grad_norm": 0.49623504126840645, "learning_rate": 1.5227791994462529e-05, "loss": 0.3179, "step": 2253 }, { "epoch": 1.076936861597276, "grad_norm": 0.5575295987434887, "learning_rate": 1.5223301567010916e-05, "loss": 0.321, "step": 2254 }, { "epoch": 1.0774147303028494, "grad_norm": 0.5298944158079708, "learning_rate": 1.5218809690707583e-05, "loss": 0.3341, "step": 2255 }, { "epoch": 1.0778925990084225, "grad_norm": 0.5077709934710473, "learning_rate": 1.5214316366798498e-05, "loss": 0.3521, "step": 2256 }, { "epoch": 1.0783704677139956, "grad_norm": 0.5439955998090592, "learning_rate": 1.5209821596530035e-05, "loss": 0.3184, "step": 2257 }, { "epoch": 1.0788483364195687, "grad_norm": 0.5084561135875706, "learning_rate": 1.5205325381148958e-05, "loss": 0.345, "step": 2258 }, { "epoch": 1.0793262051251418, "grad_norm": 0.5025281601561041, "learning_rate": 1.5200827721902443e-05, "loss": 0.3134, "step": 2259 }, { "epoch": 1.079804073830715, "grad_norm": 0.5229250055944893, "learning_rate": 1.5196328620038059e-05, "loss": 0.3437, "step": 2260 }, { "epoch": 1.0802819425362882, "grad_norm": 0.5010024267654524, "learning_rate": 1.5191828076803776e-05, "loss": 0.3161, "step": 2261 }, { "epoch": 1.0807598112418613, "grad_norm": 0.5752421274767535, "learning_rate": 1.5187326093447965e-05, "loss": 0.346, "step": 2262 }, { "epoch": 1.0812376799474344, "grad_norm": 0.5513552600942755, "learning_rate": 1.5182822671219404e-05, "loss": 0.3368, "step": 2263 }, { "epoch": 1.0817155486530077, "grad_norm": 0.49596162792066967, "learning_rate": 1.5178317811367254e-05, "loss": 0.325, "step": 2264 }, { "epoch": 1.0821934173585808, "grad_norm": 0.5479700565884935, "learning_rate": 1.5173811515141083e-05, "loss": 0.3346, "step": 2265 }, { "epoch": 1.0826712860641539, "grad_norm": 0.6256543187747687, "learning_rate": 1.516930378379087e-05, "loss": 0.3251, "step": 2266 }, { "epoch": 1.083149154769727, "grad_norm": 0.5689155161140599, "learning_rate": 1.516479461856697e-05, "loss": 0.3363, "step": 2267 }, { "epoch": 1.0836270234753003, "grad_norm": 0.7023942415790342, "learning_rate": 1.5160284020720144e-05, "loss": 0.3256, "step": 2268 }, { "epoch": 1.0841048921808734, "grad_norm": 0.4944938967568079, "learning_rate": 1.5155771991501562e-05, "loss": 0.3397, "step": 2269 }, { "epoch": 1.0845827608864465, "grad_norm": 0.7545407502316881, "learning_rate": 1.5151258532162771e-05, "loss": 0.3239, "step": 2270 }, { "epoch": 1.0850606295920195, "grad_norm": 0.5318090108055559, "learning_rate": 1.5146743643955732e-05, "loss": 0.3243, "step": 2271 }, { "epoch": 1.0855384982975926, "grad_norm": 0.5221147872004993, "learning_rate": 1.5142227328132797e-05, "loss": 0.3364, "step": 2272 }, { "epoch": 1.086016367003166, "grad_norm": 0.5704820936647907, "learning_rate": 1.5137709585946705e-05, "loss": 0.3423, "step": 2273 }, { "epoch": 1.086494235708739, "grad_norm": 0.5013932264970669, "learning_rate": 1.51331904186506e-05, "loss": 0.3258, "step": 2274 }, { "epoch": 1.0869721044143121, "grad_norm": 0.5257762249096876, "learning_rate": 1.5128669827498024e-05, "loss": 0.3246, "step": 2275 }, { "epoch": 1.0874499731198852, "grad_norm": 0.5139561282311831, "learning_rate": 1.5124147813742904e-05, "loss": 0.3293, "step": 2276 }, { "epoch": 1.0879278418254585, "grad_norm": 0.543726884049005, "learning_rate": 1.5119624378639568e-05, "loss": 0.3555, "step": 2277 }, { "epoch": 1.0884057105310316, "grad_norm": 0.512178392507077, "learning_rate": 1.511509952344274e-05, "loss": 0.3254, "step": 2278 }, { "epoch": 1.0888835792366047, "grad_norm": 0.5176231019242892, "learning_rate": 1.511057324940753e-05, "loss": 0.3253, "step": 2279 }, { "epoch": 1.0893614479421778, "grad_norm": 0.5278862653717663, "learning_rate": 1.5106045557789453e-05, "loss": 0.3346, "step": 2280 }, { "epoch": 1.0898393166477511, "grad_norm": 0.47608912937489073, "learning_rate": 1.5101516449844407e-05, "loss": 0.3325, "step": 2281 }, { "epoch": 1.0903171853533242, "grad_norm": 0.5309476770511793, "learning_rate": 1.5096985926828684e-05, "loss": 0.351, "step": 2282 }, { "epoch": 1.0907950540588973, "grad_norm": 0.5133648196697228, "learning_rate": 1.5092453989998976e-05, "loss": 0.3369, "step": 2283 }, { "epoch": 1.0912729227644704, "grad_norm": 0.5748784259519858, "learning_rate": 1.5087920640612361e-05, "loss": 0.342, "step": 2284 }, { "epoch": 1.0917507914700435, "grad_norm": 0.51205359618062, "learning_rate": 1.5083385879926309e-05, "loss": 0.3304, "step": 2285 }, { "epoch": 1.0922286601756168, "grad_norm": 0.5640393858567604, "learning_rate": 1.5078849709198687e-05, "loss": 0.3529, "step": 2286 }, { "epoch": 1.09270652888119, "grad_norm": 0.5141124022477992, "learning_rate": 1.5074312129687741e-05, "loss": 0.3425, "step": 2287 }, { "epoch": 1.093184397586763, "grad_norm": 0.5187378970830254, "learning_rate": 1.5069773142652119e-05, "loss": 0.3378, "step": 2288 }, { "epoch": 1.093662266292336, "grad_norm": 0.520566150228239, "learning_rate": 1.506523274935086e-05, "loss": 0.3461, "step": 2289 }, { "epoch": 1.0941401349979094, "grad_norm": 0.5296392819928428, "learning_rate": 1.5060690951043385e-05, "loss": 0.3417, "step": 2290 }, { "epoch": 1.0946180037034825, "grad_norm": 0.5195069878439671, "learning_rate": 1.5056147748989505e-05, "loss": 0.3234, "step": 2291 }, { "epoch": 1.0950958724090556, "grad_norm": 0.5634053365773759, "learning_rate": 1.5051603144449431e-05, "loss": 0.3476, "step": 2292 }, { "epoch": 1.0955737411146287, "grad_norm": 0.5457090574500167, "learning_rate": 1.5047057138683753e-05, "loss": 0.3214, "step": 2293 }, { "epoch": 1.096051609820202, "grad_norm": 0.49911077929816133, "learning_rate": 1.5042509732953454e-05, "loss": 0.3414, "step": 2294 }, { "epoch": 1.096529478525775, "grad_norm": 2.702449341602149, "learning_rate": 1.5037960928519902e-05, "loss": 0.3377, "step": 2295 }, { "epoch": 1.0970073472313482, "grad_norm": 0.5565135853631065, "learning_rate": 1.5033410726644859e-05, "loss": 0.3343, "step": 2296 }, { "epoch": 1.0974852159369213, "grad_norm": 0.5073307379974941, "learning_rate": 1.5028859128590468e-05, "loss": 0.327, "step": 2297 }, { "epoch": 1.0979630846424944, "grad_norm": 0.5160051255941388, "learning_rate": 1.502430613561926e-05, "loss": 0.344, "step": 2298 }, { "epoch": 1.0984409533480677, "grad_norm": 0.5348498573334795, "learning_rate": 1.5019751748994158e-05, "loss": 0.3494, "step": 2299 }, { "epoch": 1.0989188220536408, "grad_norm": 0.4865115046264663, "learning_rate": 1.501519596997847e-05, "loss": 0.3435, "step": 2300 }, { "epoch": 1.0993966907592139, "grad_norm": 0.5216309305575126, "learning_rate": 1.5010638799835884e-05, "loss": 0.33, "step": 2301 }, { "epoch": 1.099874559464787, "grad_norm": 0.539223989848975, "learning_rate": 1.5006080239830483e-05, "loss": 0.3361, "step": 2302 }, { "epoch": 1.1003524281703603, "grad_norm": 0.5001901728249805, "learning_rate": 1.5001520291226727e-05, "loss": 0.3236, "step": 2303 }, { "epoch": 1.1008302968759334, "grad_norm": 0.515833509744522, "learning_rate": 1.499695895528947e-05, "loss": 0.3476, "step": 2304 }, { "epoch": 1.1013081655815065, "grad_norm": 0.5136798035818348, "learning_rate": 1.499239623328394e-05, "loss": 0.346, "step": 2305 }, { "epoch": 1.1017860342870796, "grad_norm": 0.5181371149654749, "learning_rate": 1.4987832126475763e-05, "loss": 0.3427, "step": 2306 }, { "epoch": 1.1022639029926529, "grad_norm": 0.5071439491474284, "learning_rate": 1.4983266636130935e-05, "loss": 0.3404, "step": 2307 }, { "epoch": 1.102741771698226, "grad_norm": 0.5024865933279972, "learning_rate": 1.4978699763515848e-05, "loss": 0.3408, "step": 2308 }, { "epoch": 1.103219640403799, "grad_norm": 0.6901876193242537, "learning_rate": 1.4974131509897269e-05, "loss": 0.3246, "step": 2309 }, { "epoch": 1.1036975091093721, "grad_norm": 0.5498690262019805, "learning_rate": 1.4969561876542348e-05, "loss": 0.339, "step": 2310 }, { "epoch": 1.1041753778149452, "grad_norm": 0.5305378391568346, "learning_rate": 1.4964990864718627e-05, "loss": 0.324, "step": 2311 }, { "epoch": 1.1046532465205186, "grad_norm": 0.6920996981731619, "learning_rate": 1.496041847569402e-05, "loss": 0.327, "step": 2312 }, { "epoch": 1.1051311152260916, "grad_norm": 0.5051871088425068, "learning_rate": 1.4955844710736829e-05, "loss": 0.324, "step": 2313 }, { "epoch": 1.1056089839316647, "grad_norm": 0.5231668443052977, "learning_rate": 1.4951269571115735e-05, "loss": 0.3192, "step": 2314 }, { "epoch": 1.1060868526372378, "grad_norm": 0.5107337355608277, "learning_rate": 1.4946693058099802e-05, "loss": 0.3187, "step": 2315 }, { "epoch": 1.1065647213428111, "grad_norm": 0.504994234754607, "learning_rate": 1.494211517295847e-05, "loss": 0.3274, "step": 2316 }, { "epoch": 1.1070425900483842, "grad_norm": 0.6160822307451901, "learning_rate": 1.4937535916961568e-05, "loss": 0.3296, "step": 2317 }, { "epoch": 1.1075204587539573, "grad_norm": 0.4975876476425877, "learning_rate": 1.49329552913793e-05, "loss": 0.3389, "step": 2318 }, { "epoch": 1.1079983274595304, "grad_norm": 0.5403670488971715, "learning_rate": 1.4928373297482249e-05, "loss": 0.3455, "step": 2319 }, { "epoch": 1.1084761961651037, "grad_norm": 0.529726563264269, "learning_rate": 1.4923789936541378e-05, "loss": 0.3406, "step": 2320 }, { "epoch": 1.1089540648706768, "grad_norm": 2.35075832261799, "learning_rate": 1.4919205209828037e-05, "loss": 0.3349, "step": 2321 }, { "epoch": 1.10943193357625, "grad_norm": 0.5425366907719269, "learning_rate": 1.4914619118613942e-05, "loss": 0.3317, "step": 2322 }, { "epoch": 1.109909802281823, "grad_norm": 0.5093459744924752, "learning_rate": 1.4910031664171195e-05, "loss": 0.3486, "step": 2323 }, { "epoch": 1.110387670987396, "grad_norm": 0.5160824382283055, "learning_rate": 1.4905442847772278e-05, "loss": 0.3339, "step": 2324 }, { "epoch": 1.1108655396929694, "grad_norm": 0.5457445322763627, "learning_rate": 1.4900852670690044e-05, "loss": 0.3391, "step": 2325 }, { "epoch": 1.1113434083985425, "grad_norm": 0.5181476585227104, "learning_rate": 1.4896261134197729e-05, "loss": 0.3454, "step": 2326 }, { "epoch": 1.1118212771041156, "grad_norm": 0.5151616532814238, "learning_rate": 1.4891668239568943e-05, "loss": 0.3323, "step": 2327 }, { "epoch": 1.1122991458096887, "grad_norm": 0.48685718879651513, "learning_rate": 1.4887073988077678e-05, "loss": 0.3325, "step": 2328 }, { "epoch": 1.112777014515262, "grad_norm": 0.6039961072105209, "learning_rate": 1.4882478380998291e-05, "loss": 0.3354, "step": 2329 }, { "epoch": 1.113254883220835, "grad_norm": 0.49114939066967817, "learning_rate": 1.487788141960553e-05, "loss": 0.3262, "step": 2330 }, { "epoch": 1.1137327519264082, "grad_norm": 0.46862190557531513, "learning_rate": 1.4873283105174504e-05, "loss": 0.3366, "step": 2331 }, { "epoch": 1.1142106206319813, "grad_norm": 0.47383978484154615, "learning_rate": 1.4868683438980714e-05, "loss": 0.3241, "step": 2332 }, { "epoch": 1.1146884893375546, "grad_norm": 0.48438365346141576, "learning_rate": 1.4864082422300015e-05, "loss": 0.3239, "step": 2333 }, { "epoch": 1.1151663580431277, "grad_norm": 0.49902567827730593, "learning_rate": 1.4859480056408653e-05, "loss": 0.3343, "step": 2334 }, { "epoch": 1.1156442267487008, "grad_norm": 0.5240825256150454, "learning_rate": 1.4854876342583246e-05, "loss": 0.3525, "step": 2335 }, { "epoch": 1.1161220954542739, "grad_norm": 0.6398611930145781, "learning_rate": 1.4850271282100779e-05, "loss": 0.3283, "step": 2336 }, { "epoch": 1.116599964159847, "grad_norm": 0.5213749367846112, "learning_rate": 1.4845664876238615e-05, "loss": 0.3461, "step": 2337 }, { "epoch": 1.1170778328654203, "grad_norm": 0.5017747641723227, "learning_rate": 1.4841057126274491e-05, "loss": 0.3476, "step": 2338 }, { "epoch": 1.1175557015709934, "grad_norm": 0.5121891657183757, "learning_rate": 1.4836448033486513e-05, "loss": 0.3116, "step": 2339 }, { "epoch": 1.1180335702765665, "grad_norm": 0.490973436785604, "learning_rate": 1.4831837599153165e-05, "loss": 0.3479, "step": 2340 }, { "epoch": 1.1185114389821396, "grad_norm": 0.4995891989120494, "learning_rate": 1.4827225824553302e-05, "loss": 0.3216, "step": 2341 }, { "epoch": 1.1189893076877129, "grad_norm": 0.48709022056044177, "learning_rate": 1.4822612710966143e-05, "loss": 0.3325, "step": 2342 }, { "epoch": 1.119467176393286, "grad_norm": 0.5110741917418703, "learning_rate": 1.4817998259671286e-05, "loss": 0.3411, "step": 2343 }, { "epoch": 1.119945045098859, "grad_norm": 0.526551969506569, "learning_rate": 1.4813382471948705e-05, "loss": 0.3283, "step": 2344 }, { "epoch": 1.1204229138044322, "grad_norm": 0.487810166808925, "learning_rate": 1.4808765349078729e-05, "loss": 0.3299, "step": 2345 }, { "epoch": 1.1209007825100055, "grad_norm": 0.5296951175479421, "learning_rate": 1.4804146892342071e-05, "loss": 0.3379, "step": 2346 }, { "epoch": 1.1213786512155786, "grad_norm": 0.5101831769474524, "learning_rate": 1.4799527103019808e-05, "loss": 0.3271, "step": 2347 }, { "epoch": 1.1218565199211517, "grad_norm": 0.4890714139873493, "learning_rate": 1.4794905982393388e-05, "loss": 0.3395, "step": 2348 }, { "epoch": 1.1223343886267247, "grad_norm": 0.5450992295068559, "learning_rate": 1.4790283531744634e-05, "loss": 0.3446, "step": 2349 }, { "epoch": 1.1228122573322978, "grad_norm": 0.5031828458579483, "learning_rate": 1.4785659752355724e-05, "loss": 0.3466, "step": 2350 }, { "epoch": 1.1232901260378712, "grad_norm": 0.4811659005185453, "learning_rate": 1.4781034645509216e-05, "loss": 0.3485, "step": 2351 }, { "epoch": 1.1237679947434442, "grad_norm": 0.5448333215238423, "learning_rate": 1.4776408212488035e-05, "loss": 0.3263, "step": 2352 }, { "epoch": 1.1242458634490173, "grad_norm": 0.4953819912341556, "learning_rate": 1.4771780454575468e-05, "loss": 0.3272, "step": 2353 }, { "epoch": 1.1247237321545904, "grad_norm": 0.5175483590354045, "learning_rate": 1.4767151373055178e-05, "loss": 0.3269, "step": 2354 }, { "epoch": 1.1252016008601637, "grad_norm": 0.5840296576158932, "learning_rate": 1.4762520969211186e-05, "loss": 0.3311, "step": 2355 }, { "epoch": 1.1256794695657368, "grad_norm": 0.5464923552621009, "learning_rate": 1.4757889244327888e-05, "loss": 0.3181, "step": 2356 }, { "epoch": 1.12615733827131, "grad_norm": 0.5188633658259454, "learning_rate": 1.475325619969004e-05, "loss": 0.3477, "step": 2357 }, { "epoch": 1.126635206976883, "grad_norm": 0.4888981161848857, "learning_rate": 1.4748621836582772e-05, "loss": 0.3253, "step": 2358 }, { "epoch": 1.1271130756824563, "grad_norm": 0.5231748369329128, "learning_rate": 1.4743986156291568e-05, "loss": 0.3304, "step": 2359 }, { "epoch": 1.1275909443880294, "grad_norm": 0.4988116880762236, "learning_rate": 1.4739349160102285e-05, "loss": 0.3313, "step": 2360 }, { "epoch": 1.1280688130936025, "grad_norm": 0.5036551455737499, "learning_rate": 1.4734710849301151e-05, "loss": 0.3258, "step": 2361 }, { "epoch": 1.1285466817991756, "grad_norm": 0.5138628273300756, "learning_rate": 1.473007122517474e-05, "loss": 0.3305, "step": 2362 }, { "epoch": 1.1290245505047487, "grad_norm": 0.5138521817190567, "learning_rate": 1.472543028901001e-05, "loss": 0.3438, "step": 2363 }, { "epoch": 1.129502419210322, "grad_norm": 0.5299139873711323, "learning_rate": 1.4720788042094273e-05, "loss": 0.3243, "step": 2364 }, { "epoch": 1.129980287915895, "grad_norm": 0.5461978922038623, "learning_rate": 1.4716144485715209e-05, "loss": 0.3236, "step": 2365 }, { "epoch": 1.1304581566214682, "grad_norm": 0.5311350713847056, "learning_rate": 1.4711499621160855e-05, "loss": 0.3174, "step": 2366 }, { "epoch": 1.1309360253270415, "grad_norm": 0.5009730559856843, "learning_rate": 1.4706853449719614e-05, "loss": 0.3376, "step": 2367 }, { "epoch": 1.1314138940326146, "grad_norm": 0.5018126239017102, "learning_rate": 1.4702205972680254e-05, "loss": 0.3328, "step": 2368 }, { "epoch": 1.1318917627381877, "grad_norm": 0.5242488655503064, "learning_rate": 1.4697557191331902e-05, "loss": 0.3193, "step": 2369 }, { "epoch": 1.1323696314437608, "grad_norm": 0.5945549331790201, "learning_rate": 1.4692907106964051e-05, "loss": 0.3345, "step": 2370 }, { "epoch": 1.1328475001493339, "grad_norm": 0.48153137632362025, "learning_rate": 1.468825572086655e-05, "loss": 0.3378, "step": 2371 }, { "epoch": 1.1333253688549072, "grad_norm": 0.5151533016446908, "learning_rate": 1.4683603034329608e-05, "loss": 0.3168, "step": 2372 }, { "epoch": 1.1338032375604803, "grad_norm": 0.49149298419228526, "learning_rate": 1.4678949048643806e-05, "loss": 0.3354, "step": 2373 }, { "epoch": 1.1342811062660534, "grad_norm": 0.48897908673118196, "learning_rate": 1.4674293765100069e-05, "loss": 0.3339, "step": 2374 }, { "epoch": 1.1347589749716265, "grad_norm": 0.4949702713316802, "learning_rate": 1.4669637184989696e-05, "loss": 0.3257, "step": 2375 }, { "epoch": 1.1352368436771996, "grad_norm": 0.5058801752873772, "learning_rate": 1.466497930960434e-05, "loss": 0.3519, "step": 2376 }, { "epoch": 1.1357147123827729, "grad_norm": 0.5237225993986185, "learning_rate": 1.4660320140236014e-05, "loss": 0.3322, "step": 2377 }, { "epoch": 1.136192581088346, "grad_norm": 0.5177111438994864, "learning_rate": 1.4655659678177084e-05, "loss": 0.336, "step": 2378 }, { "epoch": 1.136670449793919, "grad_norm": 0.5012469860916707, "learning_rate": 1.4650997924720288e-05, "loss": 0.32, "step": 2379 }, { "epoch": 1.1371483184994924, "grad_norm": 0.5648557105154629, "learning_rate": 1.4646334881158704e-05, "loss": 0.3446, "step": 2380 }, { "epoch": 1.1376261872050655, "grad_norm": 0.5492302394701447, "learning_rate": 1.4641670548785788e-05, "loss": 0.3237, "step": 2381 }, { "epoch": 1.1381040559106386, "grad_norm": 0.5032033816755968, "learning_rate": 1.4637004928895337e-05, "loss": 0.336, "step": 2382 }, { "epoch": 1.1385819246162117, "grad_norm": 0.5123273009616124, "learning_rate": 1.4632338022781516e-05, "loss": 0.3177, "step": 2383 }, { "epoch": 1.1390597933217848, "grad_norm": 0.5450557175277998, "learning_rate": 1.462766983173884e-05, "loss": 0.3262, "step": 2384 }, { "epoch": 1.139537662027358, "grad_norm": 0.5720917143183127, "learning_rate": 1.4623000357062184e-05, "loss": 0.3367, "step": 2385 }, { "epoch": 1.1400155307329312, "grad_norm": 0.5807405509549664, "learning_rate": 1.4618329600046774e-05, "loss": 0.347, "step": 2386 }, { "epoch": 1.1404933994385043, "grad_norm": 0.5038498790994034, "learning_rate": 1.4613657561988203e-05, "loss": 0.3368, "step": 2387 }, { "epoch": 1.1409712681440773, "grad_norm": 0.4737365464414105, "learning_rate": 1.4608984244182407e-05, "loss": 0.3223, "step": 2388 }, { "epoch": 1.1414491368496504, "grad_norm": 0.5476800314149111, "learning_rate": 1.4604309647925683e-05, "loss": 0.335, "step": 2389 }, { "epoch": 1.1419270055552238, "grad_norm": 0.522067736414712, "learning_rate": 1.459963377451468e-05, "loss": 0.3407, "step": 2390 }, { "epoch": 1.1424048742607968, "grad_norm": 0.5362139688122035, "learning_rate": 1.4594956625246406e-05, "loss": 0.3258, "step": 2391 }, { "epoch": 1.14288274296637, "grad_norm": 0.5547169113461832, "learning_rate": 1.4590278201418216e-05, "loss": 0.3184, "step": 2392 }, { "epoch": 1.1433606116719432, "grad_norm": 0.5163474508287502, "learning_rate": 1.4585598504327824e-05, "loss": 0.3507, "step": 2393 }, { "epoch": 1.1438384803775163, "grad_norm": 0.5023941568737598, "learning_rate": 1.4580917535273295e-05, "loss": 0.3541, "step": 2394 }, { "epoch": 1.1443163490830894, "grad_norm": 0.5253487591862723, "learning_rate": 1.457623529555305e-05, "loss": 0.3262, "step": 2395 }, { "epoch": 1.1447942177886625, "grad_norm": 0.534817117306971, "learning_rate": 1.4571551786465857e-05, "loss": 0.3324, "step": 2396 }, { "epoch": 1.1452720864942356, "grad_norm": 0.48543838148344, "learning_rate": 1.4566867009310834e-05, "loss": 0.3483, "step": 2397 }, { "epoch": 1.145749955199809, "grad_norm": 0.5361413462770422, "learning_rate": 1.4562180965387466e-05, "loss": 0.3236, "step": 2398 }, { "epoch": 1.146227823905382, "grad_norm": 0.5551669716622398, "learning_rate": 1.4557493655995574e-05, "loss": 0.3553, "step": 2399 }, { "epoch": 1.1467056926109551, "grad_norm": 0.5044244521639527, "learning_rate": 1.4552805082435333e-05, "loss": 0.3353, "step": 2400 }, { "epoch": 1.1471835613165282, "grad_norm": 0.5363754449521199, "learning_rate": 1.4548115246007274e-05, "loss": 0.3359, "step": 2401 }, { "epoch": 1.1476614300221013, "grad_norm": 0.513128864486388, "learning_rate": 1.4543424148012271e-05, "loss": 0.3385, "step": 2402 }, { "epoch": 1.1481392987276746, "grad_norm": 0.46287451620253967, "learning_rate": 1.4538731789751553e-05, "loss": 0.3286, "step": 2403 }, { "epoch": 1.1486171674332477, "grad_norm": 0.5195791493585902, "learning_rate": 1.45340381725267e-05, "loss": 0.34, "step": 2404 }, { "epoch": 1.1490950361388208, "grad_norm": 0.48538977574421593, "learning_rate": 1.4529343297639638e-05, "loss": 0.3307, "step": 2405 }, { "epoch": 1.1495729048443941, "grad_norm": 0.5095807086855236, "learning_rate": 1.452464716639264e-05, "loss": 0.3502, "step": 2406 }, { "epoch": 1.1500507735499672, "grad_norm": 0.5122639691694912, "learning_rate": 1.4519949780088334e-05, "loss": 0.3292, "step": 2407 }, { "epoch": 1.1505286422555403, "grad_norm": 0.5380110437402467, "learning_rate": 1.4515251140029687e-05, "loss": 0.3302, "step": 2408 }, { "epoch": 1.1510065109611134, "grad_norm": 0.5213594922561937, "learning_rate": 1.451055124752002e-05, "loss": 0.3154, "step": 2409 }, { "epoch": 1.1514843796666865, "grad_norm": 0.5161145005085588, "learning_rate": 1.4505850103863007e-05, "loss": 0.3373, "step": 2410 }, { "epoch": 1.1519622483722598, "grad_norm": 0.5206093606448915, "learning_rate": 1.4501147710362653e-05, "loss": 0.3371, "step": 2411 }, { "epoch": 1.152440117077833, "grad_norm": 0.5091973446696731, "learning_rate": 1.4496444068323322e-05, "loss": 0.3327, "step": 2412 }, { "epoch": 1.152917985783406, "grad_norm": 0.5085178563346524, "learning_rate": 1.4491739179049725e-05, "loss": 0.3239, "step": 2413 }, { "epoch": 1.153395854488979, "grad_norm": 0.5268216105789499, "learning_rate": 1.448703304384691e-05, "loss": 0.3334, "step": 2414 }, { "epoch": 1.1538737231945524, "grad_norm": 0.5068318386596699, "learning_rate": 1.448232566402028e-05, "loss": 0.3527, "step": 2415 }, { "epoch": 1.1543515919001255, "grad_norm": 0.5258368149602686, "learning_rate": 1.4477617040875577e-05, "loss": 0.3258, "step": 2416 }, { "epoch": 1.1548294606056986, "grad_norm": 0.47858102510009803, "learning_rate": 1.4472907175718893e-05, "loss": 0.3402, "step": 2417 }, { "epoch": 1.1553073293112717, "grad_norm": 0.5821829343093783, "learning_rate": 1.4468196069856658e-05, "loss": 0.3197, "step": 2418 }, { "epoch": 1.155785198016845, "grad_norm": 0.473803724054973, "learning_rate": 1.4463483724595651e-05, "loss": 0.3324, "step": 2419 }, { "epoch": 1.156263066722418, "grad_norm": 0.512993585884921, "learning_rate": 1.4458770141242992e-05, "loss": 0.3341, "step": 2420 }, { "epoch": 1.1567409354279912, "grad_norm": 0.49790696160613324, "learning_rate": 1.4454055321106148e-05, "loss": 0.3195, "step": 2421 }, { "epoch": 1.1572188041335643, "grad_norm": 0.4852347254650604, "learning_rate": 1.4449339265492927e-05, "loss": 0.3296, "step": 2422 }, { "epoch": 1.1576966728391374, "grad_norm": 0.5307482887330455, "learning_rate": 1.4444621975711477e-05, "loss": 0.3257, "step": 2423 }, { "epoch": 1.1581745415447107, "grad_norm": 0.5158352775082153, "learning_rate": 1.4439903453070294e-05, "loss": 0.3439, "step": 2424 }, { "epoch": 1.1586524102502838, "grad_norm": 0.5214779865598925, "learning_rate": 1.4435183698878212e-05, "loss": 0.3232, "step": 2425 }, { "epoch": 1.1591302789558569, "grad_norm": 0.5149532717689103, "learning_rate": 1.4430462714444406e-05, "loss": 0.3374, "step": 2426 }, { "epoch": 1.15960814766143, "grad_norm": 0.5313361793801107, "learning_rate": 1.4425740501078396e-05, "loss": 0.3196, "step": 2427 }, { "epoch": 1.1600860163670033, "grad_norm": 0.49763038284494304, "learning_rate": 1.4421017060090041e-05, "loss": 0.3389, "step": 2428 }, { "epoch": 1.1605638850725764, "grad_norm": 0.5899500716656553, "learning_rate": 1.4416292392789538e-05, "loss": 0.3331, "step": 2429 }, { "epoch": 1.1610417537781494, "grad_norm": 0.4872995509813868, "learning_rate": 1.4411566500487425e-05, "loss": 0.3466, "step": 2430 }, { "epoch": 1.1615196224837225, "grad_norm": 0.4954045549900169, "learning_rate": 1.4406839384494585e-05, "loss": 0.3374, "step": 2431 }, { "epoch": 1.1619974911892959, "grad_norm": 0.4851398103962068, "learning_rate": 1.4402111046122234e-05, "loss": 0.3233, "step": 2432 }, { "epoch": 1.162475359894869, "grad_norm": 0.49511298286491195, "learning_rate": 1.4397381486681931e-05, "loss": 0.3317, "step": 2433 }, { "epoch": 1.162953228600442, "grad_norm": 0.4953595652477772, "learning_rate": 1.4392650707485574e-05, "loss": 0.32, "step": 2434 }, { "epoch": 1.1634310973060151, "grad_norm": 0.5090056906997068, "learning_rate": 1.4387918709845395e-05, "loss": 0.3406, "step": 2435 }, { "epoch": 1.1639089660115882, "grad_norm": 0.47175952406629523, "learning_rate": 1.4383185495073968e-05, "loss": 0.351, "step": 2436 }, { "epoch": 1.1643868347171615, "grad_norm": 0.5195708641390865, "learning_rate": 1.43784510644842e-05, "loss": 0.327, "step": 2437 }, { "epoch": 1.1648647034227346, "grad_norm": 0.49425130257319605, "learning_rate": 1.4373715419389345e-05, "loss": 0.3203, "step": 2438 }, { "epoch": 1.1653425721283077, "grad_norm": 0.49907031699241045, "learning_rate": 1.4368978561102982e-05, "loss": 0.3254, "step": 2439 }, { "epoch": 1.1658204408338808, "grad_norm": 0.5690059412126197, "learning_rate": 1.4364240490939032e-05, "loss": 0.339, "step": 2440 }, { "epoch": 1.1662983095394541, "grad_norm": 0.48782220485318945, "learning_rate": 1.4359501210211754e-05, "loss": 0.3175, "step": 2441 }, { "epoch": 1.1667761782450272, "grad_norm": 0.48282271375944663, "learning_rate": 1.4354760720235743e-05, "loss": 0.3276, "step": 2442 }, { "epoch": 1.1672540469506003, "grad_norm": 0.4907377583967225, "learning_rate": 1.4350019022325925e-05, "loss": 0.332, "step": 2443 }, { "epoch": 1.1677319156561734, "grad_norm": 0.5201773113156611, "learning_rate": 1.434527611779756e-05, "loss": 0.3322, "step": 2444 }, { "epoch": 1.1682097843617467, "grad_norm": 0.5073601879220666, "learning_rate": 1.4340532007966252e-05, "loss": 0.344, "step": 2445 }, { "epoch": 1.1686876530673198, "grad_norm": 0.4966658318824344, "learning_rate": 1.4335786694147931e-05, "loss": 0.3286, "step": 2446 }, { "epoch": 1.169165521772893, "grad_norm": 0.47795836779748363, "learning_rate": 1.4331040177658859e-05, "loss": 0.3203, "step": 2447 }, { "epoch": 1.169643390478466, "grad_norm": 0.5167223624760666, "learning_rate": 1.4326292459815642e-05, "loss": 0.3289, "step": 2448 }, { "epoch": 1.170121259184039, "grad_norm": 0.4943415537304743, "learning_rate": 1.4321543541935213e-05, "loss": 0.3412, "step": 2449 }, { "epoch": 1.1705991278896124, "grad_norm": 0.5447813509352554, "learning_rate": 1.4316793425334836e-05, "loss": 0.3317, "step": 2450 }, { "epoch": 1.1710769965951855, "grad_norm": 0.6040250815392916, "learning_rate": 1.4312042111332108e-05, "loss": 0.319, "step": 2451 }, { "epoch": 1.1715548653007586, "grad_norm": 0.4897011714819888, "learning_rate": 1.4307289601244963e-05, "loss": 0.3253, "step": 2452 }, { "epoch": 1.1720327340063317, "grad_norm": 0.519171989422915, "learning_rate": 1.4302535896391661e-05, "loss": 0.3387, "step": 2453 }, { "epoch": 1.172510602711905, "grad_norm": 0.5310376803286037, "learning_rate": 1.4297780998090795e-05, "loss": 0.3306, "step": 2454 }, { "epoch": 1.172988471417478, "grad_norm": 0.5546851814204874, "learning_rate": 1.4293024907661295e-05, "loss": 0.3302, "step": 2455 }, { "epoch": 1.1734663401230512, "grad_norm": 0.5438187932879726, "learning_rate": 1.4288267626422411e-05, "loss": 0.3306, "step": 2456 }, { "epoch": 1.1739442088286243, "grad_norm": 0.5999235748445584, "learning_rate": 1.4283509155693734e-05, "loss": 0.3299, "step": 2457 }, { "epoch": 1.1744220775341976, "grad_norm": 0.5267230789014691, "learning_rate": 1.4278749496795174e-05, "loss": 0.3385, "step": 2458 }, { "epoch": 1.1748999462397707, "grad_norm": 0.49465625934833485, "learning_rate": 1.4273988651046982e-05, "loss": 0.3446, "step": 2459 }, { "epoch": 1.1753778149453438, "grad_norm": 0.520908355310508, "learning_rate": 1.4269226619769727e-05, "loss": 0.3317, "step": 2460 }, { "epoch": 1.1758556836509169, "grad_norm": 0.49990608969106165, "learning_rate": 1.4264463404284317e-05, "loss": 0.3267, "step": 2461 }, { "epoch": 1.17633355235649, "grad_norm": 0.49260831125565, "learning_rate": 1.4259699005911984e-05, "loss": 0.334, "step": 2462 }, { "epoch": 1.1768114210620633, "grad_norm": 0.5685639467668431, "learning_rate": 1.4254933425974284e-05, "loss": 0.3169, "step": 2463 }, { "epoch": 1.1772892897676364, "grad_norm": 0.5053854894890485, "learning_rate": 1.4250166665793106e-05, "loss": 0.3172, "step": 2464 }, { "epoch": 1.1777671584732095, "grad_norm": 0.5094103880275894, "learning_rate": 1.424539872669067e-05, "loss": 0.3371, "step": 2465 }, { "epoch": 1.1782450271787825, "grad_norm": 0.4946974997604784, "learning_rate": 1.4240629609989513e-05, "loss": 0.3334, "step": 2466 }, { "epoch": 1.1787228958843559, "grad_norm": 0.5018625213670579, "learning_rate": 1.4235859317012506e-05, "loss": 0.321, "step": 2467 }, { "epoch": 1.179200764589929, "grad_norm": 0.4709704159413237, "learning_rate": 1.4231087849082848e-05, "loss": 0.3285, "step": 2468 }, { "epoch": 1.179678633295502, "grad_norm": 0.49926378629859847, "learning_rate": 1.4226315207524049e-05, "loss": 0.3313, "step": 2469 }, { "epoch": 1.1801565020010751, "grad_norm": 0.512829518968713, "learning_rate": 1.4221541393659966e-05, "loss": 0.3374, "step": 2470 }, { "epoch": 1.1806343707066485, "grad_norm": 0.49421509200107594, "learning_rate": 1.4216766408814766e-05, "loss": 0.3326, "step": 2471 }, { "epoch": 1.1811122394122215, "grad_norm": 0.5693138964760737, "learning_rate": 1.4211990254312948e-05, "loss": 0.3413, "step": 2472 }, { "epoch": 1.1815901081177946, "grad_norm": 0.4758930284278045, "learning_rate": 1.4207212931479331e-05, "loss": 0.3227, "step": 2473 }, { "epoch": 1.1820679768233677, "grad_norm": 0.5414563707616173, "learning_rate": 1.4202434441639061e-05, "loss": 0.3565, "step": 2474 }, { "epoch": 1.1825458455289408, "grad_norm": 0.47934944618315223, "learning_rate": 1.4197654786117604e-05, "loss": 0.3228, "step": 2475 }, { "epoch": 1.1830237142345141, "grad_norm": 0.6167228683042635, "learning_rate": 1.419287396624076e-05, "loss": 0.3194, "step": 2476 }, { "epoch": 1.1835015829400872, "grad_norm": 0.6097773368996701, "learning_rate": 1.4188091983334636e-05, "loss": 0.3344, "step": 2477 }, { "epoch": 1.1839794516456603, "grad_norm": 0.5410843300407218, "learning_rate": 1.4183308838725669e-05, "loss": 0.3372, "step": 2478 }, { "epoch": 1.1844573203512334, "grad_norm": 0.48327603901696636, "learning_rate": 1.4178524533740628e-05, "loss": 0.3308, "step": 2479 }, { "epoch": 1.1849351890568067, "grad_norm": 0.5268010648750129, "learning_rate": 1.4173739069706586e-05, "loss": 0.3396, "step": 2480 }, { "epoch": 1.1854130577623798, "grad_norm": 0.5058799850737847, "learning_rate": 1.4168952447950948e-05, "loss": 0.3488, "step": 2481 }, { "epoch": 1.185890926467953, "grad_norm": 0.4793089155470465, "learning_rate": 1.4164164669801444e-05, "loss": 0.3202, "step": 2482 }, { "epoch": 1.186368795173526, "grad_norm": 0.537347786961874, "learning_rate": 1.4159375736586114e-05, "loss": 0.323, "step": 2483 }, { "epoch": 1.1868466638790993, "grad_norm": 0.5043366860217638, "learning_rate": 1.4154585649633324e-05, "loss": 0.3332, "step": 2484 }, { "epoch": 1.1873245325846724, "grad_norm": 0.6175384664642016, "learning_rate": 1.414979441027176e-05, "loss": 0.3298, "step": 2485 }, { "epoch": 1.1878024012902455, "grad_norm": 0.5172614792354202, "learning_rate": 1.414500201983043e-05, "loss": 0.3299, "step": 2486 }, { "epoch": 1.1882802699958186, "grad_norm": 0.5074099118761909, "learning_rate": 1.4140208479638653e-05, "loss": 0.3314, "step": 2487 }, { "epoch": 1.1887581387013917, "grad_norm": 0.49819077409033413, "learning_rate": 1.4135413791026081e-05, "loss": 0.3451, "step": 2488 }, { "epoch": 1.189236007406965, "grad_norm": 0.5176243373050009, "learning_rate": 1.4130617955322665e-05, "loss": 0.3221, "step": 2489 }, { "epoch": 1.189713876112538, "grad_norm": 0.473185476669963, "learning_rate": 1.4125820973858693e-05, "loss": 0.3231, "step": 2490 }, { "epoch": 1.1901917448181112, "grad_norm": 0.48907220673787455, "learning_rate": 1.4121022847964762e-05, "loss": 0.3408, "step": 2491 }, { "epoch": 1.1906696135236843, "grad_norm": 1.177648905452567, "learning_rate": 1.4116223578971787e-05, "loss": 0.332, "step": 2492 }, { "epoch": 1.1911474822292576, "grad_norm": 0.4828061304183026, "learning_rate": 1.4111423168210999e-05, "loss": 0.319, "step": 2493 }, { "epoch": 1.1916253509348307, "grad_norm": 0.5135930311328842, "learning_rate": 1.410662161701395e-05, "loss": 0.3381, "step": 2494 }, { "epoch": 1.1921032196404038, "grad_norm": 0.526643592475081, "learning_rate": 1.41018189267125e-05, "loss": 0.3262, "step": 2495 }, { "epoch": 1.1925810883459769, "grad_norm": 0.5090734041849178, "learning_rate": 1.4097015098638838e-05, "loss": 0.3276, "step": 2496 }, { "epoch": 1.1930589570515502, "grad_norm": 0.5174675269837884, "learning_rate": 1.4092210134125458e-05, "loss": 0.3263, "step": 2497 }, { "epoch": 1.1935368257571233, "grad_norm": 0.5214389237769186, "learning_rate": 1.4087404034505167e-05, "loss": 0.3375, "step": 2498 }, { "epoch": 1.1940146944626964, "grad_norm": 0.4846029928861073, "learning_rate": 1.4082596801111104e-05, "loss": 0.3388, "step": 2499 }, { "epoch": 1.1944925631682695, "grad_norm": 0.5209998258997901, "learning_rate": 1.4077788435276701e-05, "loss": 0.3373, "step": 2500 }, { "epoch": 1.1949704318738426, "grad_norm": 0.5407837704720175, "learning_rate": 1.4072978938335717e-05, "loss": 0.3242, "step": 2501 }, { "epoch": 1.1954483005794159, "grad_norm": 0.4792112154512138, "learning_rate": 1.4068168311622223e-05, "loss": 0.3262, "step": 2502 }, { "epoch": 1.195926169284989, "grad_norm": 0.5012097821642677, "learning_rate": 1.40633565564706e-05, "loss": 0.3299, "step": 2503 }, { "epoch": 1.196404037990562, "grad_norm": 0.5011100101721819, "learning_rate": 1.4058543674215543e-05, "loss": 0.3303, "step": 2504 }, { "epoch": 1.1968819066961351, "grad_norm": 0.5072036552663972, "learning_rate": 1.4053729666192067e-05, "loss": 0.3034, "step": 2505 }, { "epoch": 1.1973597754017085, "grad_norm": 0.49966051888354573, "learning_rate": 1.4048914533735482e-05, "loss": 0.3185, "step": 2506 }, { "epoch": 1.1978376441072816, "grad_norm": 0.6201559448981543, "learning_rate": 1.404409827818143e-05, "loss": 0.3081, "step": 2507 }, { "epoch": 1.1983155128128546, "grad_norm": 0.4892341169267986, "learning_rate": 1.4039280900865855e-05, "loss": 0.3347, "step": 2508 }, { "epoch": 1.1987933815184277, "grad_norm": 0.4894821104819882, "learning_rate": 1.4034462403125004e-05, "loss": 0.319, "step": 2509 }, { "epoch": 1.199271250224001, "grad_norm": 0.529046836458218, "learning_rate": 1.4029642786295452e-05, "loss": 0.3228, "step": 2510 }, { "epoch": 1.1997491189295741, "grad_norm": 0.5119754528953527, "learning_rate": 1.4024822051714075e-05, "loss": 0.3259, "step": 2511 }, { "epoch": 1.2002269876351472, "grad_norm": 0.5166507305196261, "learning_rate": 1.4020000200718053e-05, "loss": 0.347, "step": 2512 }, { "epoch": 1.2007048563407203, "grad_norm": 0.485424117273842, "learning_rate": 1.401517723464489e-05, "loss": 0.3433, "step": 2513 }, { "epoch": 1.2011827250462934, "grad_norm": 0.49766272512833354, "learning_rate": 1.4010353154832388e-05, "loss": 0.336, "step": 2514 }, { "epoch": 1.2016605937518667, "grad_norm": 0.6389939051707664, "learning_rate": 1.400552796261866e-05, "loss": 0.3151, "step": 2515 }, { "epoch": 1.2021384624574398, "grad_norm": 0.4842050986179893, "learning_rate": 1.4000701659342136e-05, "loss": 0.3315, "step": 2516 }, { "epoch": 1.202616331163013, "grad_norm": 0.5199065736138204, "learning_rate": 1.3995874246341542e-05, "loss": 0.329, "step": 2517 }, { "epoch": 1.203094199868586, "grad_norm": 0.46844026904175, "learning_rate": 1.3991045724955915e-05, "loss": 0.3358, "step": 2518 }, { "epoch": 1.2035720685741593, "grad_norm": 0.5097819606674651, "learning_rate": 1.3986216096524606e-05, "loss": 0.3282, "step": 2519 }, { "epoch": 1.2040499372797324, "grad_norm": 0.49611533463068996, "learning_rate": 1.3981385362387268e-05, "loss": 0.3237, "step": 2520 }, { "epoch": 1.2045278059853055, "grad_norm": 0.5427021170944505, "learning_rate": 1.397655352388386e-05, "loss": 0.3174, "step": 2521 }, { "epoch": 1.2050056746908786, "grad_norm": 0.5177994962109214, "learning_rate": 1.397172058235465e-05, "loss": 0.3266, "step": 2522 }, { "epoch": 1.205483543396452, "grad_norm": 0.5151693584485367, "learning_rate": 1.3966886539140212e-05, "loss": 0.343, "step": 2523 }, { "epoch": 1.205961412102025, "grad_norm": 0.4877067051638373, "learning_rate": 1.396205139558142e-05, "loss": 0.3289, "step": 2524 }, { "epoch": 1.206439280807598, "grad_norm": 0.4986624439516977, "learning_rate": 1.3957215153019463e-05, "loss": 0.3187, "step": 2525 }, { "epoch": 1.2069171495131712, "grad_norm": 0.5098275657839579, "learning_rate": 1.3952377812795826e-05, "loss": 0.3306, "step": 2526 }, { "epoch": 1.2073950182187443, "grad_norm": 0.4851080171025919, "learning_rate": 1.3947539376252301e-05, "loss": 0.3326, "step": 2527 }, { "epoch": 1.2078728869243176, "grad_norm": 0.5181654078258601, "learning_rate": 1.3942699844730986e-05, "loss": 0.3275, "step": 2528 }, { "epoch": 1.2083507556298907, "grad_norm": 0.517517079824198, "learning_rate": 1.3937859219574286e-05, "loss": 0.308, "step": 2529 }, { "epoch": 1.2088286243354638, "grad_norm": 0.5359336855688807, "learning_rate": 1.3933017502124897e-05, "loss": 0.3184, "step": 2530 }, { "epoch": 1.2093064930410369, "grad_norm": 0.5177636108490216, "learning_rate": 1.392817469372583e-05, "loss": 0.3318, "step": 2531 }, { "epoch": 1.2097843617466102, "grad_norm": 0.5049686036498979, "learning_rate": 1.3923330795720396e-05, "loss": 0.3379, "step": 2532 }, { "epoch": 1.2102622304521833, "grad_norm": 0.5034062129917729, "learning_rate": 1.3918485809452204e-05, "loss": 0.3297, "step": 2533 }, { "epoch": 1.2107400991577564, "grad_norm": 0.5046311056670907, "learning_rate": 1.3913639736265175e-05, "loss": 0.3385, "step": 2534 }, { "epoch": 1.2112179678633295, "grad_norm": 0.7815693194085443, "learning_rate": 1.3908792577503514e-05, "loss": 0.3385, "step": 2535 }, { "epoch": 1.2116958365689028, "grad_norm": 0.47743692428948703, "learning_rate": 1.3903944334511744e-05, "loss": 0.3177, "step": 2536 }, { "epoch": 1.2121737052744759, "grad_norm": 0.49090736922505357, "learning_rate": 1.3899095008634678e-05, "loss": 0.3218, "step": 2537 }, { "epoch": 1.212651573980049, "grad_norm": 0.49841642881825793, "learning_rate": 1.3894244601217435e-05, "loss": 0.3346, "step": 2538 }, { "epoch": 1.213129442685622, "grad_norm": 0.510969551833143, "learning_rate": 1.3889393113605433e-05, "loss": 0.3552, "step": 2539 }, { "epoch": 1.2136073113911952, "grad_norm": 0.5523292379752209, "learning_rate": 1.3884540547144393e-05, "loss": 0.3211, "step": 2540 }, { "epoch": 1.2140851800967685, "grad_norm": 0.5051371714386153, "learning_rate": 1.3879686903180326e-05, "loss": 0.3393, "step": 2541 }, { "epoch": 1.2145630488023416, "grad_norm": 0.5044231031807489, "learning_rate": 1.3874832183059545e-05, "loss": 0.3157, "step": 2542 }, { "epoch": 1.2150409175079147, "grad_norm": 0.4730004046440912, "learning_rate": 1.3869976388128672e-05, "loss": 0.3501, "step": 2543 }, { "epoch": 1.215518786213488, "grad_norm": 0.5105649664830538, "learning_rate": 1.3865119519734611e-05, "loss": 0.3328, "step": 2544 }, { "epoch": 1.215996654919061, "grad_norm": 0.5704906525223905, "learning_rate": 1.3860261579224574e-05, "loss": 0.3322, "step": 2545 }, { "epoch": 1.2164745236246342, "grad_norm": 0.525856706643172, "learning_rate": 1.3855402567946072e-05, "loss": 0.3202, "step": 2546 }, { "epoch": 1.2169523923302072, "grad_norm": 0.47546063015364237, "learning_rate": 1.3850542487246903e-05, "loss": 0.3377, "step": 2547 }, { "epoch": 1.2174302610357803, "grad_norm": 0.5451603336435644, "learning_rate": 1.384568133847517e-05, "loss": 0.3452, "step": 2548 }, { "epoch": 1.2179081297413537, "grad_norm": 0.5361634977822047, "learning_rate": 1.3840819122979272e-05, "loss": 0.3131, "step": 2549 }, { "epoch": 1.2183859984469267, "grad_norm": 0.4922706268092114, "learning_rate": 1.3835955842107897e-05, "loss": 0.3306, "step": 2550 }, { "epoch": 1.2188638671524998, "grad_norm": 0.4894912959757622, "learning_rate": 1.3831091497210043e-05, "loss": 0.322, "step": 2551 }, { "epoch": 1.219341735858073, "grad_norm": 0.5551732852060824, "learning_rate": 1.3826226089634982e-05, "loss": 0.3223, "step": 2552 }, { "epoch": 1.219819604563646, "grad_norm": 0.47984089277387154, "learning_rate": 1.3821359620732297e-05, "loss": 0.3399, "step": 2553 }, { "epoch": 1.2202974732692193, "grad_norm": 0.469484790257942, "learning_rate": 1.3816492091851865e-05, "loss": 0.3439, "step": 2554 }, { "epoch": 1.2207753419747924, "grad_norm": 0.49706698283821926, "learning_rate": 1.3811623504343845e-05, "loss": 0.3425, "step": 2555 }, { "epoch": 1.2212532106803655, "grad_norm": 0.4864721219524991, "learning_rate": 1.3806753859558702e-05, "loss": 0.3333, "step": 2556 }, { "epoch": 1.2217310793859388, "grad_norm": 0.531371918260641, "learning_rate": 1.380188315884719e-05, "loss": 0.3262, "step": 2557 }, { "epoch": 1.222208948091512, "grad_norm": 0.508703916265606, "learning_rate": 1.3797011403560349e-05, "loss": 0.3339, "step": 2558 }, { "epoch": 1.222686816797085, "grad_norm": 0.5519217880427334, "learning_rate": 1.3792138595049526e-05, "loss": 0.3313, "step": 2559 }, { "epoch": 1.2231646855026581, "grad_norm": 0.49067395021756355, "learning_rate": 1.378726473466635e-05, "loss": 0.3301, "step": 2560 }, { "epoch": 1.2236425542082312, "grad_norm": 0.527772112286325, "learning_rate": 1.378238982376274e-05, "loss": 0.3121, "step": 2561 }, { "epoch": 1.2241204229138045, "grad_norm": 0.5044285212896721, "learning_rate": 1.3777513863690914e-05, "loss": 0.3276, "step": 2562 }, { "epoch": 1.2245982916193776, "grad_norm": 0.5706596519616292, "learning_rate": 1.3772636855803378e-05, "loss": 0.3331, "step": 2563 }, { "epoch": 1.2250761603249507, "grad_norm": 0.4736938143718017, "learning_rate": 1.3767758801452926e-05, "loss": 0.3358, "step": 2564 }, { "epoch": 1.2255540290305238, "grad_norm": 0.5097229175872106, "learning_rate": 1.3762879701992642e-05, "loss": 0.3303, "step": 2565 }, { "epoch": 1.2260318977360969, "grad_norm": 0.5296217110279442, "learning_rate": 1.3757999558775907e-05, "loss": 0.3316, "step": 2566 }, { "epoch": 1.2265097664416702, "grad_norm": 0.4965806245245254, "learning_rate": 1.3753118373156382e-05, "loss": 0.311, "step": 2567 }, { "epoch": 1.2269876351472433, "grad_norm": 0.49384330392321485, "learning_rate": 1.3748236146488028e-05, "loss": 0.3394, "step": 2568 }, { "epoch": 1.2274655038528164, "grad_norm": 0.5159965075461034, "learning_rate": 1.3743352880125083e-05, "loss": 0.315, "step": 2569 }, { "epoch": 1.2279433725583897, "grad_norm": 0.5382950937633958, "learning_rate": 1.373846857542208e-05, "loss": 0.3495, "step": 2570 }, { "epoch": 1.2284212412639628, "grad_norm": 0.48821357626356915, "learning_rate": 1.3733583233733843e-05, "loss": 0.3366, "step": 2571 }, { "epoch": 1.2288991099695359, "grad_norm": 0.4912616599041392, "learning_rate": 1.372869685641547e-05, "loss": 0.3233, "step": 2572 }, { "epoch": 1.229376978675109, "grad_norm": 0.49906610048849775, "learning_rate": 1.3723809444822366e-05, "loss": 0.3223, "step": 2573 }, { "epoch": 1.229854847380682, "grad_norm": 0.4900856446449379, "learning_rate": 1.371892100031021e-05, "loss": 0.3358, "step": 2574 }, { "epoch": 1.2303327160862554, "grad_norm": 0.5218820319587112, "learning_rate": 1.3714031524234965e-05, "loss": 0.3134, "step": 2575 }, { "epoch": 1.2308105847918285, "grad_norm": 0.4782986698985612, "learning_rate": 1.3709141017952893e-05, "loss": 0.3125, "step": 2576 }, { "epoch": 1.2312884534974016, "grad_norm": 0.5293380049327159, "learning_rate": 1.370424948282053e-05, "loss": 0.3269, "step": 2577 }, { "epoch": 1.2317663222029747, "grad_norm": 0.48617546343080253, "learning_rate": 1.3699356920194702e-05, "loss": 0.3262, "step": 2578 }, { "epoch": 1.2322441909085478, "grad_norm": 0.489962199133281, "learning_rate": 1.3694463331432521e-05, "loss": 0.3145, "step": 2579 }, { "epoch": 1.232722059614121, "grad_norm": 0.5271714061634383, "learning_rate": 1.3689568717891381e-05, "loss": 0.305, "step": 2580 }, { "epoch": 1.2331999283196942, "grad_norm": 0.476057076836738, "learning_rate": 1.368467308092896e-05, "loss": 0.3246, "step": 2581 }, { "epoch": 1.2336777970252673, "grad_norm": 0.5037616861616993, "learning_rate": 1.3679776421903222e-05, "loss": 0.3157, "step": 2582 }, { "epoch": 1.2341556657308406, "grad_norm": 0.5029866004003386, "learning_rate": 1.3674878742172415e-05, "loss": 0.3424, "step": 2583 }, { "epoch": 1.2346335344364137, "grad_norm": 0.49776455157060684, "learning_rate": 1.366998004309507e-05, "loss": 0.3221, "step": 2584 }, { "epoch": 1.2351114031419868, "grad_norm": 0.5159003230211062, "learning_rate": 1.3665080326029997e-05, "loss": 0.3369, "step": 2585 }, { "epoch": 1.2355892718475598, "grad_norm": 0.4945390309105578, "learning_rate": 1.3660179592336296e-05, "loss": 0.324, "step": 2586 }, { "epoch": 1.236067140553133, "grad_norm": 0.5028457586094409, "learning_rate": 1.3655277843373338e-05, "loss": 0.3276, "step": 2587 }, { "epoch": 1.2365450092587063, "grad_norm": 0.4924865463215211, "learning_rate": 1.3650375080500784e-05, "loss": 0.3102, "step": 2588 }, { "epoch": 1.2370228779642793, "grad_norm": 0.4786677818333961, "learning_rate": 1.3645471305078575e-05, "loss": 0.3336, "step": 2589 }, { "epoch": 1.2375007466698524, "grad_norm": 0.5645830343314627, "learning_rate": 1.364056651846693e-05, "loss": 0.3441, "step": 2590 }, { "epoch": 1.2379786153754255, "grad_norm": 0.4772358981883002, "learning_rate": 1.3635660722026352e-05, "loss": 0.3442, "step": 2591 }, { "epoch": 1.2384564840809986, "grad_norm": 0.5656570549920833, "learning_rate": 1.3630753917117624e-05, "loss": 0.3081, "step": 2592 }, { "epoch": 1.238934352786572, "grad_norm": 0.4751884801558594, "learning_rate": 1.3625846105101801e-05, "loss": 0.3142, "step": 2593 }, { "epoch": 1.239412221492145, "grad_norm": 2.4050639525172075, "learning_rate": 1.3620937287340228e-05, "loss": 0.3273, "step": 2594 }, { "epoch": 1.2398900901977181, "grad_norm": 0.7444744255219318, "learning_rate": 1.3616027465194525e-05, "loss": 0.3273, "step": 2595 }, { "epoch": 1.2403679589032914, "grad_norm": 0.4944733158259898, "learning_rate": 1.3611116640026589e-05, "loss": 0.3399, "step": 2596 }, { "epoch": 1.2408458276088645, "grad_norm": 0.4818780698812894, "learning_rate": 1.3606204813198593e-05, "loss": 0.3213, "step": 2597 }, { "epoch": 1.2413236963144376, "grad_norm": 0.48772958226512186, "learning_rate": 1.3601291986072999e-05, "loss": 0.3328, "step": 2598 }, { "epoch": 1.2418015650200107, "grad_norm": 0.49554621709972824, "learning_rate": 1.3596378160012529e-05, "loss": 0.3458, "step": 2599 }, { "epoch": 1.2422794337255838, "grad_norm": 0.473500813505667, "learning_rate": 1.35914633363802e-05, "loss": 0.3203, "step": 2600 }, { "epoch": 1.2427573024311571, "grad_norm": 0.48856018275808843, "learning_rate": 1.358654751653929e-05, "loss": 0.3171, "step": 2601 }, { "epoch": 1.2432351711367302, "grad_norm": 0.5132783506070369, "learning_rate": 1.3581630701853368e-05, "loss": 0.3246, "step": 2602 }, { "epoch": 1.2437130398423033, "grad_norm": 0.4879512811328692, "learning_rate": 1.3576712893686268e-05, "loss": 0.2965, "step": 2603 }, { "epoch": 1.2441909085478764, "grad_norm": 0.5692497626729707, "learning_rate": 1.3571794093402103e-05, "loss": 0.3554, "step": 2604 }, { "epoch": 1.2446687772534497, "grad_norm": 0.5262501553177124, "learning_rate": 1.356687430236526e-05, "loss": 0.3426, "step": 2605 }, { "epoch": 1.2451466459590228, "grad_norm": 0.5101017072346472, "learning_rate": 1.3561953521940408e-05, "loss": 0.3384, "step": 2606 }, { "epoch": 1.245624514664596, "grad_norm": 0.4855194753569897, "learning_rate": 1.3557031753492477e-05, "loss": 0.3314, "step": 2607 }, { "epoch": 1.246102383370169, "grad_norm": 0.5165611849608196, "learning_rate": 1.3552108998386683e-05, "loss": 0.3408, "step": 2608 }, { "epoch": 1.2465802520757423, "grad_norm": 0.4974036775097272, "learning_rate": 1.3547185257988513e-05, "loss": 0.3221, "step": 2609 }, { "epoch": 1.2470581207813154, "grad_norm": 0.4947570822821488, "learning_rate": 1.3542260533663723e-05, "loss": 0.3427, "step": 2610 }, { "epoch": 1.2475359894868885, "grad_norm": 0.48247105943498364, "learning_rate": 1.3537334826778343e-05, "loss": 0.33, "step": 2611 }, { "epoch": 1.2480138581924616, "grad_norm": 0.5029602456053662, "learning_rate": 1.3532408138698685e-05, "loss": 0.337, "step": 2612 }, { "epoch": 1.2484917268980347, "grad_norm": 0.5072947162416663, "learning_rate": 1.3527480470791314e-05, "loss": 0.3383, "step": 2613 }, { "epoch": 1.248969595603608, "grad_norm": 0.49966515091084346, "learning_rate": 1.3522551824423088e-05, "loss": 0.3276, "step": 2614 }, { "epoch": 1.249447464309181, "grad_norm": 0.4845192026566661, "learning_rate": 1.351762220096112e-05, "loss": 0.3403, "step": 2615 }, { "epoch": 1.2499253330147542, "grad_norm": 0.48326585792046045, "learning_rate": 1.3512691601772803e-05, "loss": 0.3138, "step": 2616 }, { "epoch": 1.2504032017203273, "grad_norm": 0.500541614774048, "learning_rate": 1.3507760028225801e-05, "loss": 0.3183, "step": 2617 }, { "epoch": 1.2508810704259004, "grad_norm": 0.46031252587482197, "learning_rate": 1.3502827481688041e-05, "loss": 0.3235, "step": 2618 }, { "epoch": 1.2513589391314737, "grad_norm": 0.4621265173598907, "learning_rate": 1.3497893963527729e-05, "loss": 0.3192, "step": 2619 }, { "epoch": 1.2518368078370468, "grad_norm": 0.509128957862185, "learning_rate": 1.3492959475113332e-05, "loss": 0.343, "step": 2620 }, { "epoch": 1.2523146765426199, "grad_norm": 0.5017135977140528, "learning_rate": 1.348802401781359e-05, "loss": 0.3434, "step": 2621 }, { "epoch": 1.2527925452481932, "grad_norm": 0.48983060466434775, "learning_rate": 1.3483087592997513e-05, "loss": 0.3393, "step": 2622 }, { "epoch": 1.2532704139537663, "grad_norm": 0.4917900123369827, "learning_rate": 1.347815020203438e-05, "loss": 0.334, "step": 2623 }, { "epoch": 1.2537482826593394, "grad_norm": 0.48250587974454096, "learning_rate": 1.3473211846293735e-05, "loss": 0.331, "step": 2624 }, { "epoch": 1.2542261513649124, "grad_norm": 0.5248984409083738, "learning_rate": 1.3468272527145388e-05, "loss": 0.3289, "step": 2625 }, { "epoch": 1.2547040200704855, "grad_norm": 0.7447735031783237, "learning_rate": 1.3463332245959424e-05, "loss": 0.3237, "step": 2626 }, { "epoch": 1.2551818887760589, "grad_norm": 0.4813371324594707, "learning_rate": 1.3458391004106184e-05, "loss": 0.341, "step": 2627 }, { "epoch": 1.255659757481632, "grad_norm": 0.5512321025879464, "learning_rate": 1.3453448802956285e-05, "loss": 0.3284, "step": 2628 }, { "epoch": 1.256137626187205, "grad_norm": 0.5150834661725827, "learning_rate": 1.344850564388061e-05, "loss": 0.3277, "step": 2629 }, { "epoch": 1.2566154948927781, "grad_norm": 0.5120299966158544, "learning_rate": 1.3443561528250295e-05, "loss": 0.3403, "step": 2630 }, { "epoch": 1.2570933635983512, "grad_norm": 0.4814484918217983, "learning_rate": 1.3438616457436758e-05, "loss": 0.3525, "step": 2631 }, { "epoch": 1.2575712323039245, "grad_norm": 0.5120493174867615, "learning_rate": 1.343367043281167e-05, "loss": 0.3409, "step": 2632 }, { "epoch": 1.2580491010094976, "grad_norm": 0.47147647454359, "learning_rate": 1.3428723455746972e-05, "loss": 0.3457, "step": 2633 }, { "epoch": 1.2585269697150707, "grad_norm": 0.4898796593893291, "learning_rate": 1.3423775527614871e-05, "loss": 0.3284, "step": 2634 }, { "epoch": 1.259004838420644, "grad_norm": 0.48315639065037624, "learning_rate": 1.3418826649787834e-05, "loss": 0.319, "step": 2635 }, { "epoch": 1.2594827071262171, "grad_norm": 0.4518418312294045, "learning_rate": 1.341387682363859e-05, "loss": 0.3303, "step": 2636 }, { "epoch": 1.2599605758317902, "grad_norm": 0.4850719475140083, "learning_rate": 1.3408926050540134e-05, "loss": 0.3323, "step": 2637 }, { "epoch": 1.2604384445373633, "grad_norm": 0.4608901894307272, "learning_rate": 1.3403974331865728e-05, "loss": 0.3441, "step": 2638 }, { "epoch": 1.2609163132429364, "grad_norm": 0.5002804502797722, "learning_rate": 1.3399021668988882e-05, "loss": 0.3091, "step": 2639 }, { "epoch": 1.2613941819485097, "grad_norm": 0.49469049914409363, "learning_rate": 1.3394068063283387e-05, "loss": 0.3302, "step": 2640 }, { "epoch": 1.2618720506540828, "grad_norm": 0.6390540371820533, "learning_rate": 1.3389113516123283e-05, "loss": 0.3219, "step": 2641 }, { "epoch": 1.262349919359656, "grad_norm": 0.5039329426448652, "learning_rate": 1.3384158028882866e-05, "loss": 0.3264, "step": 2642 }, { "epoch": 1.262827788065229, "grad_norm": 0.4908786517804285, "learning_rate": 1.337920160293671e-05, "loss": 0.3383, "step": 2643 }, { "epoch": 1.263305656770802, "grad_norm": 0.475559094015669, "learning_rate": 1.3374244239659641e-05, "loss": 0.335, "step": 2644 }, { "epoch": 1.2637835254763754, "grad_norm": 0.5008300872967887, "learning_rate": 1.3369285940426737e-05, "loss": 0.3163, "step": 2645 }, { "epoch": 1.2642613941819485, "grad_norm": 0.5167558213932772, "learning_rate": 1.3364326706613346e-05, "loss": 0.3169, "step": 2646 }, { "epoch": 1.2647392628875216, "grad_norm": 0.5070324119135403, "learning_rate": 1.3359366539595075e-05, "loss": 0.3486, "step": 2647 }, { "epoch": 1.265217131593095, "grad_norm": 0.4926151078159255, "learning_rate": 1.3354405440747783e-05, "loss": 0.322, "step": 2648 }, { "epoch": 1.265695000298668, "grad_norm": 0.531687534308957, "learning_rate": 1.3349443411447591e-05, "loss": 0.3295, "step": 2649 }, { "epoch": 1.266172869004241, "grad_norm": 0.4865326711170691, "learning_rate": 1.334448045307088e-05, "loss": 0.321, "step": 2650 }, { "epoch": 1.2666507377098142, "grad_norm": 0.505824257323991, "learning_rate": 1.3339516566994285e-05, "loss": 0.3275, "step": 2651 }, { "epoch": 1.2671286064153873, "grad_norm": 0.4743225313426183, "learning_rate": 1.3334551754594709e-05, "loss": 0.3338, "step": 2652 }, { "epoch": 1.2676064751209606, "grad_norm": 0.51248355708598, "learning_rate": 1.3329586017249293e-05, "loss": 0.3227, "step": 2653 }, { "epoch": 1.2680843438265337, "grad_norm": 0.48662715453553024, "learning_rate": 1.3324619356335446e-05, "loss": 0.3229, "step": 2654 }, { "epoch": 1.2685622125321068, "grad_norm": 0.5130913956035601, "learning_rate": 1.331965177323084e-05, "loss": 0.3297, "step": 2655 }, { "epoch": 1.26904008123768, "grad_norm": 0.507745869575156, "learning_rate": 1.3314683269313387e-05, "loss": 0.3303, "step": 2656 }, { "epoch": 1.269517949943253, "grad_norm": 0.5295327741107102, "learning_rate": 1.3309713845961265e-05, "loss": 0.3353, "step": 2657 }, { "epoch": 1.2699958186488263, "grad_norm": 0.5362619437276863, "learning_rate": 1.3304743504552906e-05, "loss": 0.3198, "step": 2658 }, { "epoch": 1.2704736873543994, "grad_norm": 0.7456002921673187, "learning_rate": 1.3299772246466992e-05, "loss": 0.3247, "step": 2659 }, { "epoch": 1.2709515560599725, "grad_norm": 0.5057040297711566, "learning_rate": 1.3294800073082464e-05, "loss": 0.3366, "step": 2660 }, { "epoch": 1.2714294247655458, "grad_norm": 0.5114205500351166, "learning_rate": 1.3289826985778515e-05, "loss": 0.3081, "step": 2661 }, { "epoch": 1.2719072934711189, "grad_norm": 0.5107340207654342, "learning_rate": 1.3284852985934591e-05, "loss": 0.3003, "step": 2662 }, { "epoch": 1.272385162176692, "grad_norm": 0.5064044347392543, "learning_rate": 1.3279878074930394e-05, "loss": 0.3368, "step": 2663 }, { "epoch": 1.272863030882265, "grad_norm": 0.5498353953286683, "learning_rate": 1.3274902254145876e-05, "loss": 0.3265, "step": 2664 }, { "epoch": 1.2733408995878381, "grad_norm": 0.527749650382651, "learning_rate": 1.3269925524961237e-05, "loss": 0.3413, "step": 2665 }, { "epoch": 1.2738187682934115, "grad_norm": 0.4800823655478267, "learning_rate": 1.326494788875694e-05, "loss": 0.3383, "step": 2666 }, { "epoch": 1.2742966369989845, "grad_norm": 0.5476129798251085, "learning_rate": 1.3259969346913692e-05, "loss": 0.3114, "step": 2667 }, { "epoch": 1.2747745057045576, "grad_norm": 0.6895619251669788, "learning_rate": 1.3254989900812452e-05, "loss": 0.3184, "step": 2668 }, { "epoch": 1.275252374410131, "grad_norm": 0.5299012176977083, "learning_rate": 1.3250009551834431e-05, "loss": 0.3135, "step": 2669 }, { "epoch": 1.275730243115704, "grad_norm": 0.5236139281767733, "learning_rate": 1.3245028301361086e-05, "loss": 0.3399, "step": 2670 }, { "epoch": 1.2762081118212771, "grad_norm": 0.5049494339980716, "learning_rate": 1.3240046150774136e-05, "loss": 0.3293, "step": 2671 }, { "epoch": 1.2766859805268502, "grad_norm": 0.48846700131030796, "learning_rate": 1.3235063101455536e-05, "loss": 0.3272, "step": 2672 }, { "epoch": 1.2771638492324233, "grad_norm": 0.534758434518732, "learning_rate": 1.3230079154787497e-05, "loss": 0.2966, "step": 2673 }, { "epoch": 1.2776417179379966, "grad_norm": 0.5290754783911104, "learning_rate": 1.3225094312152478e-05, "loss": 0.336, "step": 2674 }, { "epoch": 1.2781195866435697, "grad_norm": 0.5164880573517012, "learning_rate": 1.3220108574933185e-05, "loss": 0.3283, "step": 2675 }, { "epoch": 1.2785974553491428, "grad_norm": 0.5334112157901388, "learning_rate": 1.3215121944512576e-05, "loss": 0.3334, "step": 2676 }, { "epoch": 1.279075324054716, "grad_norm": 1.0889045439453695, "learning_rate": 1.3210134422273855e-05, "loss": 0.3356, "step": 2677 }, { "epoch": 1.279553192760289, "grad_norm": 0.5519985462389302, "learning_rate": 1.3205146009600472e-05, "loss": 0.3169, "step": 2678 }, { "epoch": 1.2800310614658623, "grad_norm": 0.5034434905243013, "learning_rate": 1.320015670787612e-05, "loss": 0.3264, "step": 2679 }, { "epoch": 1.2805089301714354, "grad_norm": 0.4936078005967817, "learning_rate": 1.3195166518484748e-05, "loss": 0.3266, "step": 2680 }, { "epoch": 1.2809867988770085, "grad_norm": 0.48996148603955003, "learning_rate": 1.3190175442810547e-05, "loss": 0.3149, "step": 2681 }, { "epoch": 1.2814646675825818, "grad_norm": 0.5937307442317561, "learning_rate": 1.3185183482237948e-05, "loss": 0.3398, "step": 2682 }, { "epoch": 1.281942536288155, "grad_norm": 0.5041569401113013, "learning_rate": 1.318019063815164e-05, "loss": 0.3533, "step": 2683 }, { "epoch": 1.282420404993728, "grad_norm": 0.49226884478826943, "learning_rate": 1.3175196911936548e-05, "loss": 0.3384, "step": 2684 }, { "epoch": 1.282898273699301, "grad_norm": 0.5095296957203082, "learning_rate": 1.317020230497784e-05, "loss": 0.3261, "step": 2685 }, { "epoch": 1.2833761424048742, "grad_norm": 0.5756732067034811, "learning_rate": 1.3165206818660932e-05, "loss": 0.333, "step": 2686 }, { "epoch": 1.2838540111104475, "grad_norm": 0.5299900729104405, "learning_rate": 1.3160210454371489e-05, "loss": 0.3262, "step": 2687 }, { "epoch": 1.2843318798160206, "grad_norm": 0.5971638970062076, "learning_rate": 1.315521321349541e-05, "loss": 0.3198, "step": 2688 }, { "epoch": 1.2848097485215937, "grad_norm": 0.4998008800910582, "learning_rate": 1.3150215097418844e-05, "loss": 0.3317, "step": 2689 }, { "epoch": 1.2852876172271668, "grad_norm": 0.4930588171021608, "learning_rate": 1.3145216107528178e-05, "loss": 0.3055, "step": 2690 }, { "epoch": 1.2857654859327399, "grad_norm": 0.5053649686384691, "learning_rate": 1.3140216245210042e-05, "loss": 0.3419, "step": 2691 }, { "epoch": 1.2862433546383132, "grad_norm": 0.4874790099765457, "learning_rate": 1.3135215511851316e-05, "loss": 0.336, "step": 2692 }, { "epoch": 1.2867212233438863, "grad_norm": 0.5212769061425828, "learning_rate": 1.313021390883911e-05, "loss": 0.3201, "step": 2693 }, { "epoch": 1.2871990920494594, "grad_norm": 0.5057814940111431, "learning_rate": 1.312521143756078e-05, "loss": 0.3087, "step": 2694 }, { "epoch": 1.2876769607550327, "grad_norm": 0.5430285465859317, "learning_rate": 1.3120208099403926e-05, "loss": 0.3223, "step": 2695 }, { "epoch": 1.2881548294606058, "grad_norm": 0.4973193693517145, "learning_rate": 1.3115203895756387e-05, "loss": 0.3249, "step": 2696 }, { "epoch": 1.2886326981661789, "grad_norm": 0.48876830415257927, "learning_rate": 1.3110198828006236e-05, "loss": 0.3256, "step": 2697 }, { "epoch": 1.289110566871752, "grad_norm": 0.5073517633637624, "learning_rate": 1.3105192897541792e-05, "loss": 0.3369, "step": 2698 }, { "epoch": 1.289588435577325, "grad_norm": 0.5115023299359397, "learning_rate": 1.3100186105751615e-05, "loss": 0.3112, "step": 2699 }, { "epoch": 1.2900663042828984, "grad_norm": 0.5050875752293276, "learning_rate": 1.3095178454024496e-05, "loss": 0.3355, "step": 2700 }, { "epoch": 1.2905441729884715, "grad_norm": 0.48487507417501574, "learning_rate": 1.3090169943749475e-05, "loss": 0.3494, "step": 2701 }, { "epoch": 1.2910220416940446, "grad_norm": 0.6454288527132577, "learning_rate": 1.308516057631582e-05, "loss": 0.3378, "step": 2702 }, { "epoch": 1.2914999103996176, "grad_norm": 0.5860456115191699, "learning_rate": 1.3080150353113044e-05, "loss": 0.3425, "step": 2703 }, { "epoch": 1.2919777791051907, "grad_norm": 0.45560558853026417, "learning_rate": 1.3075139275530893e-05, "loss": 0.3333, "step": 2704 }, { "epoch": 1.292455647810764, "grad_norm": 0.5735696204556753, "learning_rate": 1.3070127344959348e-05, "loss": 0.3034, "step": 2705 }, { "epoch": 1.2929335165163371, "grad_norm": 0.5440263491737388, "learning_rate": 1.3065114562788634e-05, "loss": 0.3148, "step": 2706 }, { "epoch": 1.2934113852219102, "grad_norm": 0.4894882914596722, "learning_rate": 1.3060100930409211e-05, "loss": 0.3359, "step": 2707 }, { "epoch": 1.2938892539274836, "grad_norm": 0.5571970234505838, "learning_rate": 1.3055086449211768e-05, "loss": 0.3288, "step": 2708 }, { "epoch": 1.2943671226330566, "grad_norm": 0.4614963731916626, "learning_rate": 1.3050071120587235e-05, "loss": 0.3191, "step": 2709 }, { "epoch": 1.2948449913386297, "grad_norm": 0.5415712777984418, "learning_rate": 1.3045054945926775e-05, "loss": 0.3393, "step": 2710 }, { "epoch": 1.2953228600442028, "grad_norm": 0.5125831058498063, "learning_rate": 1.3040037926621788e-05, "loss": 0.3201, "step": 2711 }, { "epoch": 1.295800728749776, "grad_norm": 0.5121610378114612, "learning_rate": 1.3035020064063903e-05, "loss": 0.3415, "step": 2712 }, { "epoch": 1.2962785974553492, "grad_norm": 0.5247492831665274, "learning_rate": 1.3030001359644992e-05, "loss": 0.3252, "step": 2713 }, { "epoch": 1.2967564661609223, "grad_norm": 0.9578688622282728, "learning_rate": 1.302498181475715e-05, "loss": 0.3308, "step": 2714 }, { "epoch": 1.2972343348664954, "grad_norm": 0.5081119642289837, "learning_rate": 1.3019961430792711e-05, "loss": 0.3129, "step": 2715 }, { "epoch": 1.2977122035720685, "grad_norm": 0.486093493880196, "learning_rate": 1.3014940209144246e-05, "loss": 0.3437, "step": 2716 }, { "epoch": 1.2981900722776416, "grad_norm": 0.5533735834037948, "learning_rate": 1.3009918151204546e-05, "loss": 0.3192, "step": 2717 }, { "epoch": 1.298667940983215, "grad_norm": 0.5027180369228621, "learning_rate": 1.3004895258366648e-05, "loss": 0.3539, "step": 2718 }, { "epoch": 1.299145809688788, "grad_norm": 0.48700562710108203, "learning_rate": 1.299987153202381e-05, "loss": 0.3283, "step": 2719 }, { "epoch": 1.299623678394361, "grad_norm": 0.4947556284065124, "learning_rate": 1.2994846973569524e-05, "loss": 0.3273, "step": 2720 }, { "epoch": 1.3001015470999344, "grad_norm": 0.5278204218639244, "learning_rate": 1.298982158439752e-05, "loss": 0.3283, "step": 2721 }, { "epoch": 1.3005794158055075, "grad_norm": 0.5311707177146678, "learning_rate": 1.2984795365901743e-05, "loss": 0.3358, "step": 2722 }, { "epoch": 1.3010572845110806, "grad_norm": 0.5090844743344694, "learning_rate": 1.2979768319476384e-05, "loss": 0.3265, "step": 2723 }, { "epoch": 1.3015351532166537, "grad_norm": 0.5069541011918202, "learning_rate": 1.2974740446515858e-05, "loss": 0.3422, "step": 2724 }, { "epoch": 1.3020130219222268, "grad_norm": 0.48331918054796685, "learning_rate": 1.2969711748414804e-05, "loss": 0.3206, "step": 2725 }, { "epoch": 1.3024908906278, "grad_norm": 0.5096220010199282, "learning_rate": 1.2964682226568095e-05, "loss": 0.3428, "step": 2726 }, { "epoch": 1.3029687593333732, "grad_norm": 0.5373918271686221, "learning_rate": 1.2959651882370835e-05, "loss": 0.3182, "step": 2727 }, { "epoch": 1.3034466280389463, "grad_norm": 0.48152585122740144, "learning_rate": 1.2954620717218344e-05, "loss": 0.3366, "step": 2728 }, { "epoch": 1.3039244967445194, "grad_norm": 0.5135718903681161, "learning_rate": 1.2949588732506191e-05, "loss": 0.3185, "step": 2729 }, { "epoch": 1.3044023654500925, "grad_norm": 0.5112482878597828, "learning_rate": 1.2944555929630152e-05, "loss": 0.3175, "step": 2730 }, { "epoch": 1.3048802341556658, "grad_norm": 0.5064609836837177, "learning_rate": 1.293952230998624e-05, "loss": 0.3276, "step": 2731 }, { "epoch": 1.3053581028612389, "grad_norm": 0.5400667483073638, "learning_rate": 1.2934487874970686e-05, "loss": 0.3254, "step": 2732 }, { "epoch": 1.305835971566812, "grad_norm": 0.8384923478396311, "learning_rate": 1.2929452625979966e-05, "loss": 0.3323, "step": 2733 }, { "epoch": 1.3063138402723853, "grad_norm": 0.5025088436289159, "learning_rate": 1.2924416564410755e-05, "loss": 0.3337, "step": 2734 }, { "epoch": 1.3067917089779584, "grad_norm": 0.5521316915977825, "learning_rate": 1.2919379691659979e-05, "loss": 0.3338, "step": 2735 }, { "epoch": 1.3072695776835315, "grad_norm": 0.5297751581493867, "learning_rate": 1.2914342009124777e-05, "loss": 0.3064, "step": 2736 }, { "epoch": 1.3077474463891046, "grad_norm": 0.5163194639773859, "learning_rate": 1.2909303518202502e-05, "loss": 0.3227, "step": 2737 }, { "epoch": 1.3082253150946777, "grad_norm": 0.48736656477750007, "learning_rate": 1.2904264220290755e-05, "loss": 0.3385, "step": 2738 }, { "epoch": 1.308703183800251, "grad_norm": 0.5131870252317862, "learning_rate": 1.2899224116787345e-05, "loss": 0.3316, "step": 2739 }, { "epoch": 1.309181052505824, "grad_norm": 0.5101460923469608, "learning_rate": 1.2894183209090304e-05, "loss": 0.3302, "step": 2740 }, { "epoch": 1.3096589212113972, "grad_norm": 0.6483849369195238, "learning_rate": 1.2889141498597893e-05, "loss": 0.3144, "step": 2741 }, { "epoch": 1.3101367899169702, "grad_norm": 0.4884644846621732, "learning_rate": 1.2884098986708598e-05, "loss": 0.3326, "step": 2742 }, { "epoch": 1.3106146586225433, "grad_norm": 0.49766916739509637, "learning_rate": 1.2879055674821112e-05, "loss": 0.346, "step": 2743 }, { "epoch": 1.3110925273281167, "grad_norm": 0.505704966484551, "learning_rate": 1.2874011564334372e-05, "loss": 0.3242, "step": 2744 }, { "epoch": 1.3115703960336897, "grad_norm": 0.49985444305963106, "learning_rate": 1.2868966656647522e-05, "loss": 0.3169, "step": 2745 }, { "epoch": 1.3120482647392628, "grad_norm": 0.4722113638853128, "learning_rate": 1.2863920953159925e-05, "loss": 0.3359, "step": 2746 }, { "epoch": 1.3125261334448362, "grad_norm": 0.5155161827129077, "learning_rate": 1.2858874455271175e-05, "loss": 0.3321, "step": 2747 }, { "epoch": 1.3130040021504092, "grad_norm": 0.5850743401779861, "learning_rate": 1.2853827164381083e-05, "loss": 0.3404, "step": 2748 }, { "epoch": 1.3134818708559823, "grad_norm": 0.47900054565538824, "learning_rate": 1.2848779081889675e-05, "loss": 0.3299, "step": 2749 }, { "epoch": 1.3139597395615554, "grad_norm": 0.4846014694563064, "learning_rate": 1.2843730209197203e-05, "loss": 0.317, "step": 2750 }, { "epoch": 1.3144376082671285, "grad_norm": 0.46319015268026265, "learning_rate": 1.283868054770413e-05, "loss": 0.3649, "step": 2751 }, { "epoch": 1.3149154769727018, "grad_norm": 0.492209610537854, "learning_rate": 1.2833630098811148e-05, "loss": 0.3354, "step": 2752 }, { "epoch": 1.315393345678275, "grad_norm": 0.5469966096817243, "learning_rate": 1.2828578863919163e-05, "loss": 0.3318, "step": 2753 }, { "epoch": 1.315871214383848, "grad_norm": 0.7017701345315566, "learning_rate": 1.2823526844429295e-05, "loss": 0.343, "step": 2754 }, { "epoch": 1.3163490830894211, "grad_norm": 0.4870050694221161, "learning_rate": 1.2818474041742885e-05, "loss": 0.3287, "step": 2755 }, { "epoch": 1.3168269517949942, "grad_norm": 0.48028268862938356, "learning_rate": 1.2813420457261497e-05, "loss": 0.325, "step": 2756 }, { "epoch": 1.3173048205005675, "grad_norm": 0.47555681810939554, "learning_rate": 1.2808366092386896e-05, "loss": 0.3383, "step": 2757 }, { "epoch": 1.3177826892061406, "grad_norm": 0.47978915398997984, "learning_rate": 1.2803310948521083e-05, "loss": 0.3344, "step": 2758 }, { "epoch": 1.3182605579117137, "grad_norm": 0.5676291606710551, "learning_rate": 1.2798255027066265e-05, "loss": 0.3367, "step": 2759 }, { "epoch": 1.318738426617287, "grad_norm": 0.47898495928629903, "learning_rate": 1.2793198329424858e-05, "loss": 0.344, "step": 2760 }, { "epoch": 1.31921629532286, "grad_norm": 0.6291553763138783, "learning_rate": 1.278814085699951e-05, "loss": 0.3288, "step": 2761 }, { "epoch": 1.3196941640284332, "grad_norm": 0.5128423783465226, "learning_rate": 1.2783082611193068e-05, "loss": 0.3326, "step": 2762 }, { "epoch": 1.3201720327340063, "grad_norm": 0.4966607745937759, "learning_rate": 1.2778023593408601e-05, "loss": 0.335, "step": 2763 }, { "epoch": 1.3206499014395794, "grad_norm": 0.4621870832599053, "learning_rate": 1.2772963805049395e-05, "loss": 0.3169, "step": 2764 }, { "epoch": 1.3211277701451527, "grad_norm": 0.5336568579759654, "learning_rate": 1.2767903247518945e-05, "loss": 0.3197, "step": 2765 }, { "epoch": 1.3216056388507258, "grad_norm": 0.4998758991079805, "learning_rate": 1.2762841922220956e-05, "loss": 0.3101, "step": 2766 }, { "epoch": 1.3220835075562989, "grad_norm": 0.5061171116591409, "learning_rate": 1.2757779830559353e-05, "loss": 0.3191, "step": 2767 }, { "epoch": 1.322561376261872, "grad_norm": 0.5779265310597284, "learning_rate": 1.2752716973938272e-05, "loss": 0.3102, "step": 2768 }, { "epoch": 1.323039244967445, "grad_norm": 0.4806329769254405, "learning_rate": 1.274765335376206e-05, "loss": 0.3441, "step": 2769 }, { "epoch": 1.3235171136730184, "grad_norm": 0.5262517952171601, "learning_rate": 1.2742588971435276e-05, "loss": 0.3289, "step": 2770 }, { "epoch": 1.3239949823785915, "grad_norm": 0.5469131458946188, "learning_rate": 1.273752382836269e-05, "loss": 0.3281, "step": 2771 }, { "epoch": 1.3244728510841646, "grad_norm": 0.4981852013260321, "learning_rate": 1.2732457925949282e-05, "loss": 0.3339, "step": 2772 }, { "epoch": 1.3249507197897379, "grad_norm": 0.4995218099055462, "learning_rate": 1.2727391265600248e-05, "loss": 0.3217, "step": 2773 }, { "epoch": 1.325428588495311, "grad_norm": 0.48442166594011293, "learning_rate": 1.2722323848720985e-05, "loss": 0.3198, "step": 2774 }, { "epoch": 1.325906457200884, "grad_norm": 0.5288357240858039, "learning_rate": 1.2717255676717106e-05, "loss": 0.3265, "step": 2775 }, { "epoch": 1.3263843259064572, "grad_norm": 0.47363656909947727, "learning_rate": 1.2712186750994437e-05, "loss": 0.3144, "step": 2776 }, { "epoch": 1.3268621946120303, "grad_norm": 0.5621473769140592, "learning_rate": 1.2707117072959004e-05, "loss": 0.3201, "step": 2777 }, { "epoch": 1.3273400633176036, "grad_norm": 0.516657131750651, "learning_rate": 1.2702046644017045e-05, "loss": 0.3407, "step": 2778 }, { "epoch": 1.3278179320231767, "grad_norm": 0.529619985203502, "learning_rate": 1.2696975465575016e-05, "loss": 0.3256, "step": 2779 }, { "epoch": 1.3282958007287498, "grad_norm": 0.5434444774186435, "learning_rate": 1.2691903539039563e-05, "loss": 0.3228, "step": 2780 }, { "epoch": 1.3287736694343228, "grad_norm": 0.565815798825105, "learning_rate": 1.2686830865817552e-05, "loss": 0.3327, "step": 2781 }, { "epoch": 1.329251538139896, "grad_norm": 0.5113159211723257, "learning_rate": 1.2681757447316057e-05, "loss": 0.336, "step": 2782 }, { "epoch": 1.3297294068454693, "grad_norm": 0.48646729323156335, "learning_rate": 1.2676683284942348e-05, "loss": 0.3377, "step": 2783 }, { "epoch": 1.3302072755510423, "grad_norm": 0.4730669634348989, "learning_rate": 1.267160838010391e-05, "loss": 0.3511, "step": 2784 }, { "epoch": 1.3306851442566154, "grad_norm": 0.6446175620584933, "learning_rate": 1.2666532734208437e-05, "loss": 0.3382, "step": 2785 }, { "epoch": 1.3311630129621888, "grad_norm": 0.5040172790962374, "learning_rate": 1.2661456348663822e-05, "loss": 0.3291, "step": 2786 }, { "epoch": 1.3316408816677618, "grad_norm": 0.6517602921578377, "learning_rate": 1.265637922487816e-05, "loss": 0.3216, "step": 2787 }, { "epoch": 1.332118750373335, "grad_norm": 0.4648517246826999, "learning_rate": 1.2651301364259761e-05, "loss": 0.3355, "step": 2788 }, { "epoch": 1.332596619078908, "grad_norm": 0.48877805021759047, "learning_rate": 1.2646222768217129e-05, "loss": 0.3331, "step": 2789 }, { "epoch": 1.3330744877844811, "grad_norm": 0.4820080738115575, "learning_rate": 1.264114343815898e-05, "loss": 0.3148, "step": 2790 }, { "epoch": 1.3335523564900544, "grad_norm": 0.4842200928813744, "learning_rate": 1.2636063375494233e-05, "loss": 0.3271, "step": 2791 }, { "epoch": 1.3340302251956275, "grad_norm": 0.48858773566706376, "learning_rate": 1.2630982581632003e-05, "loss": 0.3306, "step": 2792 }, { "epoch": 1.3345080939012006, "grad_norm": 0.49277764185033907, "learning_rate": 1.2625901057981613e-05, "loss": 0.3288, "step": 2793 }, { "epoch": 1.3349859626067737, "grad_norm": 0.5015471955508418, "learning_rate": 1.2620818805952595e-05, "loss": 0.315, "step": 2794 }, { "epoch": 1.3354638313123468, "grad_norm": 0.4703720797913821, "learning_rate": 1.2615735826954664e-05, "loss": 0.3377, "step": 2795 }, { "epoch": 1.3359417000179201, "grad_norm": 0.6095651211109612, "learning_rate": 1.2610652122397762e-05, "loss": 0.3285, "step": 2796 }, { "epoch": 1.3364195687234932, "grad_norm": 0.4528886888495506, "learning_rate": 1.260556769369201e-05, "loss": 0.3222, "step": 2797 }, { "epoch": 1.3368974374290663, "grad_norm": 0.46151024325444917, "learning_rate": 1.2600482542247738e-05, "loss": 0.3311, "step": 2798 }, { "epoch": 1.3373753061346396, "grad_norm": 0.5144167233653287, "learning_rate": 1.2595396669475486e-05, "loss": 0.3292, "step": 2799 }, { "epoch": 1.3378531748402127, "grad_norm": 0.5408339927194712, "learning_rate": 1.2590310076785974e-05, "loss": 0.3179, "step": 2800 }, { "epoch": 1.3383310435457858, "grad_norm": 0.47650637067616053, "learning_rate": 1.258522276559014e-05, "loss": 0.3355, "step": 2801 }, { "epoch": 1.338808912251359, "grad_norm": 0.525246628210698, "learning_rate": 1.2580134737299117e-05, "loss": 0.3481, "step": 2802 }, { "epoch": 1.339286780956932, "grad_norm": 0.47691152981429813, "learning_rate": 1.2575045993324227e-05, "loss": 0.3122, "step": 2803 }, { "epoch": 1.3397646496625053, "grad_norm": 0.5092137433283072, "learning_rate": 1.2569956535077004e-05, "loss": 0.3324, "step": 2804 }, { "epoch": 1.3402425183680784, "grad_norm": 0.4952198893971536, "learning_rate": 1.256486636396917e-05, "loss": 0.338, "step": 2805 }, { "epoch": 1.3407203870736515, "grad_norm": 0.49148265729272733, "learning_rate": 1.255977548141265e-05, "loss": 0.3133, "step": 2806 }, { "epoch": 1.3411982557792246, "grad_norm": 0.5396008187377243, "learning_rate": 1.2554683888819565e-05, "loss": 0.3187, "step": 2807 }, { "epoch": 1.3416761244847977, "grad_norm": 0.4824817484621732, "learning_rate": 1.2549591587602237e-05, "loss": 0.3304, "step": 2808 }, { "epoch": 1.342153993190371, "grad_norm": 0.5081977637354367, "learning_rate": 1.2544498579173172e-05, "loss": 0.3142, "step": 2809 }, { "epoch": 1.342631861895944, "grad_norm": 0.5058206945057618, "learning_rate": 1.2539404864945087e-05, "loss": 0.317, "step": 2810 }, { "epoch": 1.3431097306015172, "grad_norm": 0.5153353255064529, "learning_rate": 1.2534310446330888e-05, "loss": 0.3176, "step": 2811 }, { "epoch": 1.3435875993070905, "grad_norm": 0.5318910708535983, "learning_rate": 1.2529215324743673e-05, "loss": 0.308, "step": 2812 }, { "epoch": 1.3440654680126636, "grad_norm": 0.496994923710053, "learning_rate": 1.2524119501596743e-05, "loss": 0.3226, "step": 2813 }, { "epoch": 1.3445433367182367, "grad_norm": 0.5165301727279924, "learning_rate": 1.251902297830359e-05, "loss": 0.336, "step": 2814 }, { "epoch": 1.3450212054238098, "grad_norm": 0.5340709701111669, "learning_rate": 1.2513925756277894e-05, "loss": 0.3234, "step": 2815 }, { "epoch": 1.3454990741293829, "grad_norm": 0.4828618104957075, "learning_rate": 1.250882783693354e-05, "loss": 0.3006, "step": 2816 }, { "epoch": 1.3459769428349562, "grad_norm": 0.4785810307178594, "learning_rate": 1.25037292216846e-05, "loss": 0.3301, "step": 2817 }, { "epoch": 1.3464548115405293, "grad_norm": 0.5032661037822173, "learning_rate": 1.2498629911945333e-05, "loss": 0.3262, "step": 2818 }, { "epoch": 1.3469326802461024, "grad_norm": 0.48799683567616237, "learning_rate": 1.249352990913021e-05, "loss": 0.3166, "step": 2819 }, { "epoch": 1.3474105489516754, "grad_norm": 0.5225514388182845, "learning_rate": 1.2488429214653871e-05, "loss": 0.3338, "step": 2820 }, { "epoch": 1.3478884176572485, "grad_norm": 0.4688356836854529, "learning_rate": 1.2483327829931167e-05, "loss": 0.3117, "step": 2821 }, { "epoch": 1.3483662863628219, "grad_norm": 0.5582134330183454, "learning_rate": 1.2478225756377127e-05, "loss": 0.3239, "step": 2822 }, { "epoch": 1.348844155068395, "grad_norm": 0.4909856374763454, "learning_rate": 1.2473122995406976e-05, "loss": 0.3374, "step": 2823 }, { "epoch": 1.349322023773968, "grad_norm": 0.5121950978077848, "learning_rate": 1.2468019548436132e-05, "loss": 0.3232, "step": 2824 }, { "epoch": 1.3497998924795414, "grad_norm": 0.5347242883406231, "learning_rate": 1.24629154168802e-05, "loss": 0.3201, "step": 2825 }, { "epoch": 1.3502777611851144, "grad_norm": 0.7886377488771786, "learning_rate": 1.2457810602154975e-05, "loss": 0.3208, "step": 2826 }, { "epoch": 1.3507556298906875, "grad_norm": 0.48634722610155756, "learning_rate": 1.2452705105676448e-05, "loss": 0.3164, "step": 2827 }, { "epoch": 1.3512334985962606, "grad_norm": 0.49071957483260065, "learning_rate": 1.244759892886079e-05, "loss": 0.3365, "step": 2828 }, { "epoch": 1.3517113673018337, "grad_norm": 0.4953873498833253, "learning_rate": 1.2442492073124359e-05, "loss": 0.321, "step": 2829 }, { "epoch": 1.352189236007407, "grad_norm": 0.5251226282353867, "learning_rate": 1.2437384539883715e-05, "loss": 0.3327, "step": 2830 }, { "epoch": 1.3526671047129801, "grad_norm": 0.5013267329927095, "learning_rate": 1.2432276330555592e-05, "loss": 0.3445, "step": 2831 }, { "epoch": 1.3531449734185532, "grad_norm": 0.8028493838704872, "learning_rate": 1.2427167446556922e-05, "loss": 0.3166, "step": 2832 }, { "epoch": 1.3536228421241263, "grad_norm": 0.6378917954070072, "learning_rate": 1.2422057889304814e-05, "loss": 0.3396, "step": 2833 }, { "epoch": 1.3541007108296994, "grad_norm": 0.4859271357599526, "learning_rate": 1.2416947660216576e-05, "loss": 0.3371, "step": 2834 }, { "epoch": 1.3545785795352727, "grad_norm": 0.5227282184131874, "learning_rate": 1.2411836760709686e-05, "loss": 0.327, "step": 2835 }, { "epoch": 1.3550564482408458, "grad_norm": 0.5208799708860445, "learning_rate": 1.2406725192201828e-05, "loss": 0.3254, "step": 2836 }, { "epoch": 1.355534316946419, "grad_norm": 0.4795223980334988, "learning_rate": 1.2401612956110853e-05, "loss": 0.3111, "step": 2837 }, { "epoch": 1.3560121856519922, "grad_norm": 0.4963240280920323, "learning_rate": 1.2396500053854808e-05, "loss": 0.3259, "step": 2838 }, { "epoch": 1.3564900543575653, "grad_norm": 0.5039449229996626, "learning_rate": 1.2391386486851922e-05, "loss": 0.3197, "step": 2839 }, { "epoch": 1.3569679230631384, "grad_norm": 0.5317251552116927, "learning_rate": 1.2386272256520606e-05, "loss": 0.3197, "step": 2840 }, { "epoch": 1.3574457917687115, "grad_norm": 0.48744894671494793, "learning_rate": 1.2381157364279462e-05, "loss": 0.3035, "step": 2841 }, { "epoch": 1.3579236604742846, "grad_norm": 0.5025271430359258, "learning_rate": 1.2376041811547268e-05, "loss": 0.3314, "step": 2842 }, { "epoch": 1.358401529179858, "grad_norm": 0.5167243594092309, "learning_rate": 1.2370925599742987e-05, "loss": 0.3226, "step": 2843 }, { "epoch": 1.358879397885431, "grad_norm": 0.525633231146885, "learning_rate": 1.2365808730285764e-05, "loss": 0.3359, "step": 2844 }, { "epoch": 1.359357266591004, "grad_norm": 0.4991777969329675, "learning_rate": 1.2360691204594937e-05, "loss": 0.3209, "step": 2845 }, { "epoch": 1.3598351352965774, "grad_norm": 0.5286964966712626, "learning_rate": 1.2355573024090009e-05, "loss": 0.3243, "step": 2846 }, { "epoch": 1.3603130040021503, "grad_norm": 0.48540137645110326, "learning_rate": 1.2350454190190675e-05, "loss": 0.3146, "step": 2847 }, { "epoch": 1.3607908727077236, "grad_norm": 0.526573346210916, "learning_rate": 1.2345334704316811e-05, "loss": 0.3272, "step": 2848 }, { "epoch": 1.3612687414132967, "grad_norm": 0.5248630946951779, "learning_rate": 1.2340214567888472e-05, "loss": 0.3318, "step": 2849 }, { "epoch": 1.3617466101188698, "grad_norm": 0.49262488475200295, "learning_rate": 1.2335093782325889e-05, "loss": 0.3335, "step": 2850 }, { "epoch": 1.362224478824443, "grad_norm": 0.47664312818269816, "learning_rate": 1.2329972349049481e-05, "loss": 0.3357, "step": 2851 }, { "epoch": 1.3627023475300162, "grad_norm": 0.49333473606642414, "learning_rate": 1.2324850269479847e-05, "loss": 0.3193, "step": 2852 }, { "epoch": 1.3631802162355893, "grad_norm": 0.5352064541201343, "learning_rate": 1.2319727545037753e-05, "loss": 0.3372, "step": 2853 }, { "epoch": 1.3636580849411624, "grad_norm": 0.5155080301298953, "learning_rate": 1.2314604177144164e-05, "loss": 0.3165, "step": 2854 }, { "epoch": 1.3641359536467355, "grad_norm": 0.5556617290050194, "learning_rate": 1.2309480167220203e-05, "loss": 0.3062, "step": 2855 }, { "epoch": 1.3646138223523088, "grad_norm": 0.5075291890940449, "learning_rate": 1.230435551668718e-05, "loss": 0.3345, "step": 2856 }, { "epoch": 1.3650916910578819, "grad_norm": 0.5139412287417211, "learning_rate": 1.2299230226966587e-05, "loss": 0.3351, "step": 2857 }, { "epoch": 1.365569559763455, "grad_norm": 0.5335170451249204, "learning_rate": 1.2294104299480085e-05, "loss": 0.3316, "step": 2858 }, { "epoch": 1.3660474284690283, "grad_norm": 0.5623627174173891, "learning_rate": 1.2288977735649518e-05, "loss": 0.3182, "step": 2859 }, { "epoch": 1.3665252971746014, "grad_norm": 0.48937860284820955, "learning_rate": 1.2283850536896907e-05, "loss": 0.3301, "step": 2860 }, { "epoch": 1.3670031658801745, "grad_norm": 0.4993453513384397, "learning_rate": 1.2278722704644439e-05, "loss": 0.3349, "step": 2861 }, { "epoch": 1.3674810345857475, "grad_norm": 0.5215921358568462, "learning_rate": 1.227359424031449e-05, "loss": 0.3166, "step": 2862 }, { "epoch": 1.3679589032913206, "grad_norm": 0.473824073344398, "learning_rate": 1.2268465145329607e-05, "loss": 0.329, "step": 2863 }, { "epoch": 1.368436771996894, "grad_norm": 0.5100010802249643, "learning_rate": 1.2263335421112505e-05, "loss": 0.3178, "step": 2864 }, { "epoch": 1.368914640702467, "grad_norm": 0.49849508667435044, "learning_rate": 1.2258205069086082e-05, "loss": 0.3331, "step": 2865 }, { "epoch": 1.3693925094080401, "grad_norm": 0.5169260084417553, "learning_rate": 1.2253074090673408e-05, "loss": 0.3419, "step": 2866 }, { "epoch": 1.3698703781136132, "grad_norm": 0.4756763753549606, "learning_rate": 1.2247942487297724e-05, "loss": 0.3263, "step": 2867 }, { "epoch": 1.3703482468191863, "grad_norm": 0.492096668882402, "learning_rate": 1.2242810260382446e-05, "loss": 0.3301, "step": 2868 }, { "epoch": 1.3708261155247596, "grad_norm": 0.49548035638201127, "learning_rate": 1.2237677411351165e-05, "loss": 0.3235, "step": 2869 }, { "epoch": 1.3713039842303327, "grad_norm": 0.5095279037269734, "learning_rate": 1.2232543941627641e-05, "loss": 0.3342, "step": 2870 }, { "epoch": 1.3717818529359058, "grad_norm": 0.45819233635275786, "learning_rate": 1.2227409852635811e-05, "loss": 0.3265, "step": 2871 }, { "epoch": 1.3722597216414791, "grad_norm": 0.4923000662072581, "learning_rate": 1.2222275145799778e-05, "loss": 0.3217, "step": 2872 }, { "epoch": 1.3727375903470522, "grad_norm": 0.5112507834075313, "learning_rate": 1.2217139822543819e-05, "loss": 0.3066, "step": 2873 }, { "epoch": 1.3732154590526253, "grad_norm": 0.4628910419474185, "learning_rate": 1.2212003884292388e-05, "loss": 0.316, "step": 2874 }, { "epoch": 1.3736933277581984, "grad_norm": 0.45732279246199825, "learning_rate": 1.2206867332470091e-05, "loss": 0.3165, "step": 2875 }, { "epoch": 1.3741711964637715, "grad_norm": 0.4936353320658925, "learning_rate": 1.2201730168501729e-05, "loss": 0.3251, "step": 2876 }, { "epoch": 1.3746490651693448, "grad_norm": 0.48905812990241215, "learning_rate": 1.2196592393812257e-05, "loss": 0.3374, "step": 2877 }, { "epoch": 1.375126933874918, "grad_norm": 0.45753365931472645, "learning_rate": 1.2191454009826798e-05, "loss": 0.3242, "step": 2878 }, { "epoch": 1.375604802580491, "grad_norm": 0.5299641397547704, "learning_rate": 1.2186315017970656e-05, "loss": 0.3065, "step": 2879 }, { "epoch": 1.376082671286064, "grad_norm": 0.4839322939919366, "learning_rate": 1.2181175419669293e-05, "loss": 0.317, "step": 2880 }, { "epoch": 1.3765605399916372, "grad_norm": 0.48836787664932607, "learning_rate": 1.2176035216348345e-05, "loss": 0.3293, "step": 2881 }, { "epoch": 1.3770384086972105, "grad_norm": 0.49187662653450037, "learning_rate": 1.2170894409433612e-05, "loss": 0.3438, "step": 2882 }, { "epoch": 1.3775162774027836, "grad_norm": 0.4818132985229131, "learning_rate": 1.2165753000351064e-05, "loss": 0.3338, "step": 2883 }, { "epoch": 1.3779941461083567, "grad_norm": 0.4742707254068882, "learning_rate": 1.2160610990526836e-05, "loss": 0.3123, "step": 2884 }, { "epoch": 1.37847201481393, "grad_norm": 0.475050796561632, "learning_rate": 1.215546838138723e-05, "loss": 0.3223, "step": 2885 }, { "epoch": 1.378949883519503, "grad_norm": 0.5011042293446867, "learning_rate": 1.215032517435872e-05, "loss": 0.3261, "step": 2886 }, { "epoch": 1.3794277522250762, "grad_norm": 0.483985527438474, "learning_rate": 1.2145181370867936e-05, "loss": 0.3186, "step": 2887 }, { "epoch": 1.3799056209306493, "grad_norm": 0.4541289680253298, "learning_rate": 1.2140036972341683e-05, "loss": 0.3343, "step": 2888 }, { "epoch": 1.3803834896362224, "grad_norm": 0.5073229100491303, "learning_rate": 1.213489198020692e-05, "loss": 0.331, "step": 2889 }, { "epoch": 1.3808613583417957, "grad_norm": 0.5173324784175866, "learning_rate": 1.212974639589078e-05, "loss": 0.3313, "step": 2890 }, { "epoch": 1.3813392270473688, "grad_norm": 0.4874640856604241, "learning_rate": 1.2124600220820562e-05, "loss": 0.3372, "step": 2891 }, { "epoch": 1.3818170957529419, "grad_norm": 0.4805367156153807, "learning_rate": 1.2119453456423718e-05, "loss": 0.3448, "step": 2892 }, { "epoch": 1.382294964458515, "grad_norm": 0.48556515258960525, "learning_rate": 1.211430610412787e-05, "loss": 0.3233, "step": 2893 }, { "epoch": 1.382772833164088, "grad_norm": 0.5315828725085151, "learning_rate": 1.2109158165360805e-05, "loss": 0.3142, "step": 2894 }, { "epoch": 1.3832507018696614, "grad_norm": 0.48141420283737696, "learning_rate": 1.2104009641550472e-05, "loss": 0.3218, "step": 2895 }, { "epoch": 1.3837285705752345, "grad_norm": 0.4990446188800693, "learning_rate": 1.2098860534124976e-05, "loss": 0.3286, "step": 2896 }, { "epoch": 1.3842064392808076, "grad_norm": 0.49905032499820146, "learning_rate": 1.2093710844512594e-05, "loss": 0.3273, "step": 2897 }, { "epoch": 1.3846843079863809, "grad_norm": 0.5206266774252051, "learning_rate": 1.2088560574141754e-05, "loss": 0.3233, "step": 2898 }, { "epoch": 1.385162176691954, "grad_norm": 0.47976992992949313, "learning_rate": 1.2083409724441054e-05, "loss": 0.3318, "step": 2899 }, { "epoch": 1.385640045397527, "grad_norm": 0.4858512164454944, "learning_rate": 1.2078258296839245e-05, "loss": 0.3497, "step": 2900 }, { "epoch": 1.3861179141031001, "grad_norm": 0.5034404806016024, "learning_rate": 1.2073106292765247e-05, "loss": 0.3279, "step": 2901 }, { "epoch": 1.3865957828086732, "grad_norm": 0.5079287528998049, "learning_rate": 1.2067953713648126e-05, "loss": 0.3344, "step": 2902 }, { "epoch": 1.3870736515142466, "grad_norm": 0.47886319969478675, "learning_rate": 1.206280056091713e-05, "loss": 0.3384, "step": 2903 }, { "epoch": 1.3875515202198196, "grad_norm": 0.5187096003859187, "learning_rate": 1.2057646836001641e-05, "loss": 0.3281, "step": 2904 }, { "epoch": 1.3880293889253927, "grad_norm": 0.4615577488871791, "learning_rate": 1.2052492540331218e-05, "loss": 0.3235, "step": 2905 }, { "epoch": 1.3885072576309658, "grad_norm": 0.48313478967101564, "learning_rate": 1.2047337675335571e-05, "loss": 0.3167, "step": 2906 }, { "epoch": 1.388985126336539, "grad_norm": 0.48859241370609713, "learning_rate": 1.2042182242444567e-05, "loss": 0.3285, "step": 2907 }, { "epoch": 1.3894629950421122, "grad_norm": 0.494594083600188, "learning_rate": 1.203702624308823e-05, "loss": 0.3207, "step": 2908 }, { "epoch": 1.3899408637476853, "grad_norm": 0.46974833583616593, "learning_rate": 1.2031869678696748e-05, "loss": 0.3247, "step": 2909 }, { "epoch": 1.3904187324532584, "grad_norm": 0.5040204897201126, "learning_rate": 1.2026712550700457e-05, "loss": 0.3303, "step": 2910 }, { "epoch": 1.3908966011588317, "grad_norm": 0.47910736507397716, "learning_rate": 1.2021554860529856e-05, "loss": 0.314, "step": 2911 }, { "epoch": 1.3913744698644048, "grad_norm": 0.4908900810546827, "learning_rate": 1.2016396609615597e-05, "loss": 0.3191, "step": 2912 }, { "epoch": 1.391852338569978, "grad_norm": 0.4902328178847632, "learning_rate": 1.2011237799388486e-05, "loss": 0.3063, "step": 2913 }, { "epoch": 1.392330207275551, "grad_norm": 0.48448162975602765, "learning_rate": 1.2006078431279486e-05, "loss": 0.334, "step": 2914 }, { "epoch": 1.392808075981124, "grad_norm": 0.4975553113261312, "learning_rate": 1.200091850671972e-05, "loss": 0.3086, "step": 2915 }, { "epoch": 1.3932859446866974, "grad_norm": 0.47682406789705983, "learning_rate": 1.1995758027140451e-05, "loss": 0.3267, "step": 2916 }, { "epoch": 1.3937638133922705, "grad_norm": 0.4874223745730435, "learning_rate": 1.1990596993973112e-05, "loss": 0.3119, "step": 2917 }, { "epoch": 1.3942416820978436, "grad_norm": 0.4853795113192791, "learning_rate": 1.1985435408649281e-05, "loss": 0.3346, "step": 2918 }, { "epoch": 1.3947195508034167, "grad_norm": 0.46269214125213426, "learning_rate": 1.1980273272600687e-05, "loss": 0.3341, "step": 2919 }, { "epoch": 1.3951974195089898, "grad_norm": 0.5053092201197434, "learning_rate": 1.1975110587259222e-05, "loss": 0.3229, "step": 2920 }, { "epoch": 1.395675288214563, "grad_norm": 0.4944091898145227, "learning_rate": 1.1969947354056918e-05, "loss": 0.3285, "step": 2921 }, { "epoch": 1.3961531569201362, "grad_norm": 0.49386779198974684, "learning_rate": 1.1964783574425969e-05, "loss": 0.3212, "step": 2922 }, { "epoch": 1.3966310256257093, "grad_norm": 0.5269873184728784, "learning_rate": 1.1959619249798717e-05, "loss": 0.3294, "step": 2923 }, { "epoch": 1.3971088943312826, "grad_norm": 0.46298505676389556, "learning_rate": 1.1954454381607648e-05, "loss": 0.3138, "step": 2924 }, { "epoch": 1.3975867630368557, "grad_norm": 0.5451763604619123, "learning_rate": 1.1949288971285411e-05, "loss": 0.3109, "step": 2925 }, { "epoch": 1.3980646317424288, "grad_norm": 0.47048163898437384, "learning_rate": 1.19441230202648e-05, "loss": 0.3332, "step": 2926 }, { "epoch": 1.3985425004480019, "grad_norm": 0.5134522375456477, "learning_rate": 1.1938956529978754e-05, "loss": 0.3209, "step": 2927 }, { "epoch": 1.399020369153575, "grad_norm": 0.5437669027850316, "learning_rate": 1.1933789501860371e-05, "loss": 0.3169, "step": 2928 }, { "epoch": 1.3994982378591483, "grad_norm": 0.4756199727019128, "learning_rate": 1.192862193734289e-05, "loss": 0.3175, "step": 2929 }, { "epoch": 1.3999761065647214, "grad_norm": 0.5356791744713957, "learning_rate": 1.1923453837859706e-05, "loss": 0.3342, "step": 2930 }, { "epoch": 1.4004539752702945, "grad_norm": 0.5132793736491843, "learning_rate": 1.1918285204844355e-05, "loss": 0.3226, "step": 2931 }, { "epoch": 1.4009318439758676, "grad_norm": 0.4654901065559778, "learning_rate": 1.1913116039730528e-05, "loss": 0.3326, "step": 2932 }, { "epoch": 1.4014097126814407, "grad_norm": 0.5200095543826778, "learning_rate": 1.1907946343952057e-05, "loss": 0.321, "step": 2933 }, { "epoch": 1.401887581387014, "grad_norm": 0.4772540680937219, "learning_rate": 1.1902776118942924e-05, "loss": 0.3287, "step": 2934 }, { "epoch": 1.402365450092587, "grad_norm": 0.7737307275643527, "learning_rate": 1.1897605366137264e-05, "loss": 0.3153, "step": 2935 }, { "epoch": 1.4028433187981602, "grad_norm": 0.5148598861162687, "learning_rate": 1.1892434086969343e-05, "loss": 0.3237, "step": 2936 }, { "epoch": 1.4033211875037335, "grad_norm": 0.5176901401278625, "learning_rate": 1.1887262282873593e-05, "loss": 0.3143, "step": 2937 }, { "epoch": 1.4037990562093066, "grad_norm": 0.5251202606650733, "learning_rate": 1.1882089955284575e-05, "loss": 0.3325, "step": 2938 }, { "epoch": 1.4042769249148797, "grad_norm": 0.5736546704799379, "learning_rate": 1.1876917105637e-05, "loss": 0.317, "step": 2939 }, { "epoch": 1.4047547936204527, "grad_norm": 0.5121472986243492, "learning_rate": 1.1871743735365735e-05, "loss": 0.33, "step": 2940 }, { "epoch": 1.4052326623260258, "grad_norm": 0.51293687234706, "learning_rate": 1.186656984590577e-05, "loss": 0.3366, "step": 2941 }, { "epoch": 1.4057105310315992, "grad_norm": 0.5289581413398837, "learning_rate": 1.1861395438692256e-05, "loss": 0.338, "step": 2942 }, { "epoch": 1.4061883997371722, "grad_norm": 0.7060472699769851, "learning_rate": 1.1856220515160483e-05, "loss": 0.328, "step": 2943 }, { "epoch": 1.4066662684427453, "grad_norm": 0.49082212031808403, "learning_rate": 1.185104507674588e-05, "loss": 0.3218, "step": 2944 }, { "epoch": 1.4071441371483184, "grad_norm": 0.5266564439767433, "learning_rate": 1.1845869124884027e-05, "loss": 0.3254, "step": 2945 }, { "epoch": 1.4076220058538915, "grad_norm": 0.4967432730788497, "learning_rate": 1.1840692661010639e-05, "loss": 0.3141, "step": 2946 }, { "epoch": 1.4080998745594648, "grad_norm": 0.49284978510750543, "learning_rate": 1.1835515686561574e-05, "loss": 0.3151, "step": 2947 }, { "epoch": 1.408577743265038, "grad_norm": 0.4584745987957341, "learning_rate": 1.1830338202972838e-05, "loss": 0.3367, "step": 2948 }, { "epoch": 1.409055611970611, "grad_norm": 0.5227828009244597, "learning_rate": 1.1825160211680571e-05, "loss": 0.3316, "step": 2949 }, { "epoch": 1.4095334806761843, "grad_norm": 1.0116286136942643, "learning_rate": 1.1819981714121054e-05, "loss": 0.3457, "step": 2950 }, { "epoch": 1.4100113493817574, "grad_norm": 0.508515991330586, "learning_rate": 1.1814802711730714e-05, "loss": 0.3234, "step": 2951 }, { "epoch": 1.4104892180873305, "grad_norm": 0.4806606845844688, "learning_rate": 1.1809623205946116e-05, "loss": 0.3171, "step": 2952 }, { "epoch": 1.4109670867929036, "grad_norm": 0.5469751195822524, "learning_rate": 1.180444319820396e-05, "loss": 0.3236, "step": 2953 }, { "epoch": 1.4114449554984767, "grad_norm": 0.5181364422493109, "learning_rate": 1.179926268994109e-05, "loss": 0.33, "step": 2954 }, { "epoch": 1.41192282420405, "grad_norm": 0.4780046219803632, "learning_rate": 1.1794081682594491e-05, "loss": 0.3224, "step": 2955 }, { "epoch": 1.4124006929096231, "grad_norm": 0.4912369040242893, "learning_rate": 1.178890017760128e-05, "loss": 0.3153, "step": 2956 }, { "epoch": 1.4128785616151962, "grad_norm": 0.4775890106833153, "learning_rate": 1.1783718176398716e-05, "loss": 0.3332, "step": 2957 }, { "epoch": 1.4133564303207693, "grad_norm": 0.525737532682014, "learning_rate": 1.1778535680424192e-05, "loss": 0.314, "step": 2958 }, { "epoch": 1.4138342990263424, "grad_norm": 0.47808599069608393, "learning_rate": 1.1773352691115246e-05, "loss": 0.3091, "step": 2959 }, { "epoch": 1.4143121677319157, "grad_norm": 0.5514083626189912, "learning_rate": 1.176816920990954e-05, "loss": 0.3335, "step": 2960 }, { "epoch": 1.4147900364374888, "grad_norm": 0.5288775984235308, "learning_rate": 1.176298523824489e-05, "loss": 0.3496, "step": 2961 }, { "epoch": 1.4152679051430619, "grad_norm": 0.5325892186828176, "learning_rate": 1.1757800777559232e-05, "loss": 0.303, "step": 2962 }, { "epoch": 1.4157457738486352, "grad_norm": 0.48239236208741576, "learning_rate": 1.1752615829290644e-05, "loss": 0.3328, "step": 2963 }, { "epoch": 1.4162236425542083, "grad_norm": 0.5050757806977875, "learning_rate": 1.1747430394877342e-05, "loss": 0.3391, "step": 2964 }, { "epoch": 1.4167015112597814, "grad_norm": 0.5076547010445559, "learning_rate": 1.174224447575767e-05, "loss": 0.319, "step": 2965 }, { "epoch": 1.4171793799653545, "grad_norm": 0.5138942551079375, "learning_rate": 1.1737058073370116e-05, "loss": 0.329, "step": 2966 }, { "epoch": 1.4176572486709276, "grad_norm": 0.7215604590019389, "learning_rate": 1.1731871189153295e-05, "loss": 0.3423, "step": 2967 }, { "epoch": 1.4181351173765009, "grad_norm": 0.4880111433749174, "learning_rate": 1.1726683824545953e-05, "loss": 0.3194, "step": 2968 }, { "epoch": 1.418612986082074, "grad_norm": 0.49438163754649106, "learning_rate": 1.1721495980986975e-05, "loss": 0.3308, "step": 2969 }, { "epoch": 1.419090854787647, "grad_norm": 0.5137865619807299, "learning_rate": 1.171630765991538e-05, "loss": 0.3181, "step": 2970 }, { "epoch": 1.4195687234932202, "grad_norm": 0.5058423220393458, "learning_rate": 1.1711118862770314e-05, "loss": 0.3299, "step": 2971 }, { "epoch": 1.4200465921987933, "grad_norm": 0.5134808993321874, "learning_rate": 1.1705929590991062e-05, "loss": 0.3328, "step": 2972 }, { "epoch": 1.4205244609043666, "grad_norm": 0.5311157334978709, "learning_rate": 1.1700739846017033e-05, "loss": 0.3254, "step": 2973 }, { "epoch": 1.4210023296099397, "grad_norm": 0.47075019738009893, "learning_rate": 1.169554962928777e-05, "loss": 0.3283, "step": 2974 }, { "epoch": 1.4214801983155128, "grad_norm": 0.5260956963173294, "learning_rate": 1.169035894224295e-05, "loss": 0.3304, "step": 2975 }, { "epoch": 1.421958067021086, "grad_norm": 0.5540899904553104, "learning_rate": 1.1685167786322375e-05, "loss": 0.3338, "step": 2976 }, { "epoch": 1.4224359357266592, "grad_norm": 0.5415508885164177, "learning_rate": 1.1679976162965984e-05, "loss": 0.3244, "step": 2977 }, { "epoch": 1.4229138044322323, "grad_norm": 0.47480665610205475, "learning_rate": 1.1674784073613841e-05, "loss": 0.3329, "step": 2978 }, { "epoch": 1.4233916731378053, "grad_norm": 0.5732469730028841, "learning_rate": 1.1669591519706134e-05, "loss": 0.3413, "step": 2979 }, { "epoch": 1.4238695418433784, "grad_norm": 0.49708436697419595, "learning_rate": 1.1664398502683194e-05, "loss": 0.3129, "step": 2980 }, { "epoch": 1.4243474105489518, "grad_norm": 0.46717071543365596, "learning_rate": 1.165920502398547e-05, "loss": 0.3372, "step": 2981 }, { "epoch": 1.4248252792545248, "grad_norm": 0.4770094789534314, "learning_rate": 1.1654011085053537e-05, "loss": 0.3341, "step": 2982 }, { "epoch": 1.425303147960098, "grad_norm": 0.4957722154844253, "learning_rate": 1.1648816687328104e-05, "loss": 0.3112, "step": 2983 }, { "epoch": 1.425781016665671, "grad_norm": 0.4954214821075703, "learning_rate": 1.1643621832250012e-05, "loss": 0.3202, "step": 2984 }, { "epoch": 1.4262588853712441, "grad_norm": 0.560090241257771, "learning_rate": 1.1638426521260211e-05, "loss": 0.3387, "step": 2985 }, { "epoch": 1.4267367540768174, "grad_norm": 0.5088929436420992, "learning_rate": 1.1633230755799799e-05, "loss": 0.3337, "step": 2986 }, { "epoch": 1.4272146227823905, "grad_norm": 0.5423381244933303, "learning_rate": 1.162803453730998e-05, "loss": 0.3151, "step": 2987 }, { "epoch": 1.4276924914879636, "grad_norm": 0.49395978930167594, "learning_rate": 1.1622837867232102e-05, "loss": 0.3265, "step": 2988 }, { "epoch": 1.428170360193537, "grad_norm": 0.4718130842189427, "learning_rate": 1.1617640747007626e-05, "loss": 0.3246, "step": 2989 }, { "epoch": 1.42864822889911, "grad_norm": 0.4980105057115814, "learning_rate": 1.1612443178078138e-05, "loss": 0.3472, "step": 2990 }, { "epoch": 1.4291260976046831, "grad_norm": 0.5938682319676443, "learning_rate": 1.1607245161885358e-05, "loss": 0.3105, "step": 2991 }, { "epoch": 1.4296039663102562, "grad_norm": 0.48289231319482784, "learning_rate": 1.1602046699871126e-05, "loss": 0.3224, "step": 2992 }, { "epoch": 1.4300818350158293, "grad_norm": 0.5006882224163078, "learning_rate": 1.1596847793477393e-05, "loss": 0.333, "step": 2993 }, { "epoch": 1.4305597037214026, "grad_norm": 0.5004173467454556, "learning_rate": 1.1591648444146251e-05, "loss": 0.3128, "step": 2994 }, { "epoch": 1.4310375724269757, "grad_norm": 0.49393923359090247, "learning_rate": 1.1586448653319908e-05, "loss": 0.3196, "step": 2995 }, { "epoch": 1.4315154411325488, "grad_norm": 0.4696580955679868, "learning_rate": 1.1581248422440692e-05, "loss": 0.3249, "step": 2996 }, { "epoch": 1.431993309838122, "grad_norm": 0.5122849607619827, "learning_rate": 1.1576047752951056e-05, "loss": 0.3295, "step": 2997 }, { "epoch": 1.432471178543695, "grad_norm": 0.5080265014360067, "learning_rate": 1.157084664629358e-05, "loss": 0.318, "step": 2998 }, { "epoch": 1.4329490472492683, "grad_norm": 0.5283144934239468, "learning_rate": 1.1565645103910945e-05, "loss": 0.3259, "step": 2999 }, { "epoch": 1.4334269159548414, "grad_norm": 0.5254577539682926, "learning_rate": 1.156044312724598e-05, "loss": 0.3244, "step": 3000 }, { "epoch": 1.4339047846604145, "grad_norm": 0.4926424316230558, "learning_rate": 1.1555240717741618e-05, "loss": 0.3417, "step": 3001 }, { "epoch": 1.4343826533659878, "grad_norm": 0.4796896772668644, "learning_rate": 1.1550037876840913e-05, "loss": 0.3166, "step": 3002 }, { "epoch": 1.434860522071561, "grad_norm": 0.4773964347570398, "learning_rate": 1.1544834605987042e-05, "loss": 0.3325, "step": 3003 }, { "epoch": 1.435338390777134, "grad_norm": 0.5159550135561818, "learning_rate": 1.1539630906623305e-05, "loss": 0.3169, "step": 3004 }, { "epoch": 1.435816259482707, "grad_norm": 0.4972490804066091, "learning_rate": 1.153442678019311e-05, "loss": 0.3306, "step": 3005 }, { "epoch": 1.4362941281882802, "grad_norm": 0.48572512446743665, "learning_rate": 1.1529222228139993e-05, "loss": 0.3196, "step": 3006 }, { "epoch": 1.4367719968938535, "grad_norm": 0.4956598068058492, "learning_rate": 1.1524017251907609e-05, "loss": 0.3233, "step": 3007 }, { "epoch": 1.4372498655994266, "grad_norm": 0.47225254602078437, "learning_rate": 1.151881185293972e-05, "loss": 0.322, "step": 3008 }, { "epoch": 1.4377277343049997, "grad_norm": 0.4994314156150159, "learning_rate": 1.1513606032680214e-05, "loss": 0.33, "step": 3009 }, { "epoch": 1.4382056030105728, "grad_norm": 0.4932853873998178, "learning_rate": 1.1508399792573095e-05, "loss": 0.3316, "step": 3010 }, { "epoch": 1.4386834717161459, "grad_norm": 0.4692505918945855, "learning_rate": 1.150319313406248e-05, "loss": 0.3068, "step": 3011 }, { "epoch": 1.4391613404217192, "grad_norm": 0.5056071919354226, "learning_rate": 1.1497986058592607e-05, "loss": 0.3485, "step": 3012 }, { "epoch": 1.4396392091272923, "grad_norm": 0.5537829323649248, "learning_rate": 1.1492778567607826e-05, "loss": 0.2966, "step": 3013 }, { "epoch": 1.4401170778328654, "grad_norm": 0.49754562492925625, "learning_rate": 1.1487570662552601e-05, "loss": 0.3409, "step": 3014 }, { "epoch": 1.4405949465384387, "grad_norm": 0.5261937593332422, "learning_rate": 1.1482362344871514e-05, "loss": 0.3353, "step": 3015 }, { "epoch": 1.4410728152440118, "grad_norm": 0.44720642661167864, "learning_rate": 1.1477153616009262e-05, "loss": 0.3287, "step": 3016 }, { "epoch": 1.4415506839495849, "grad_norm": 0.46950785439169124, "learning_rate": 1.1471944477410652e-05, "loss": 0.3198, "step": 3017 }, { "epoch": 1.442028552655158, "grad_norm": 0.5086685855274399, "learning_rate": 1.1466734930520609e-05, "loss": 0.309, "step": 3018 }, { "epoch": 1.442506421360731, "grad_norm": 0.47131182541150224, "learning_rate": 1.1461524976784172e-05, "loss": 0.3426, "step": 3019 }, { "epoch": 1.4429842900663044, "grad_norm": 0.4858076737245921, "learning_rate": 1.1456314617646482e-05, "loss": 0.3335, "step": 3020 }, { "epoch": 1.4434621587718774, "grad_norm": 0.47177866647330885, "learning_rate": 1.145110385455281e-05, "loss": 0.3117, "step": 3021 }, { "epoch": 1.4439400274774505, "grad_norm": 0.49146463401072915, "learning_rate": 1.1445892688948525e-05, "loss": 0.3288, "step": 3022 }, { "epoch": 1.4444178961830239, "grad_norm": 0.5021205966554858, "learning_rate": 1.1440681122279113e-05, "loss": 0.3244, "step": 3023 }, { "epoch": 1.4448957648885967, "grad_norm": 0.4819803103747588, "learning_rate": 1.1435469155990171e-05, "loss": 0.3103, "step": 3024 }, { "epoch": 1.44537363359417, "grad_norm": 1.383385618058039, "learning_rate": 1.1430256791527406e-05, "loss": 0.3312, "step": 3025 }, { "epoch": 1.4458515022997431, "grad_norm": 0.5170297557146272, "learning_rate": 1.1425044030336636e-05, "loss": 0.3127, "step": 3026 }, { "epoch": 1.4463293710053162, "grad_norm": 0.4832092172218538, "learning_rate": 1.1419830873863792e-05, "loss": 0.3267, "step": 3027 }, { "epoch": 1.4468072397108895, "grad_norm": 0.5132105382295371, "learning_rate": 1.1414617323554906e-05, "loss": 0.3224, "step": 3028 }, { "epoch": 1.4472851084164626, "grad_norm": 0.5023592255793703, "learning_rate": 1.1409403380856128e-05, "loss": 0.3197, "step": 3029 }, { "epoch": 1.4477629771220357, "grad_norm": 0.49076288043984206, "learning_rate": 1.1404189047213716e-05, "loss": 0.3323, "step": 3030 }, { "epoch": 1.4482408458276088, "grad_norm": 0.49562922226214057, "learning_rate": 1.139897432407403e-05, "loss": 0.333, "step": 3031 }, { "epoch": 1.448718714533182, "grad_norm": 0.4849984150072255, "learning_rate": 1.1393759212883544e-05, "loss": 0.3169, "step": 3032 }, { "epoch": 1.4491965832387552, "grad_norm": 0.4706486064594749, "learning_rate": 1.1388543715088838e-05, "loss": 0.2996, "step": 3033 }, { "epoch": 1.4496744519443283, "grad_norm": 0.5374385570329433, "learning_rate": 1.13833278321366e-05, "loss": 0.3099, "step": 3034 }, { "epoch": 1.4501523206499014, "grad_norm": 0.5207013029763824, "learning_rate": 1.137811156547362e-05, "loss": 0.3135, "step": 3035 }, { "epoch": 1.4506301893554747, "grad_norm": 0.5204531377813438, "learning_rate": 1.1372894916546804e-05, "loss": 0.3254, "step": 3036 }, { "epoch": 1.4511080580610476, "grad_norm": 0.46838194787481924, "learning_rate": 1.1367677886803152e-05, "loss": 0.3311, "step": 3037 }, { "epoch": 1.451585926766621, "grad_norm": 0.5507044912148897, "learning_rate": 1.1362460477689784e-05, "loss": 0.3367, "step": 3038 }, { "epoch": 1.452063795472194, "grad_norm": 0.5069990443386853, "learning_rate": 1.1357242690653911e-05, "loss": 0.342, "step": 3039 }, { "epoch": 1.452541664177767, "grad_norm": 0.48662576527328144, "learning_rate": 1.1352024527142855e-05, "loss": 0.3279, "step": 3040 }, { "epoch": 1.4530195328833404, "grad_norm": 0.5009454804753579, "learning_rate": 1.1346805988604048e-05, "loss": 0.3205, "step": 3041 }, { "epoch": 1.4534974015889135, "grad_norm": 0.5033573059665943, "learning_rate": 1.1341587076485015e-05, "loss": 0.3258, "step": 3042 }, { "epoch": 1.4539752702944866, "grad_norm": 0.48696663644276184, "learning_rate": 1.1336367792233394e-05, "loss": 0.306, "step": 3043 }, { "epoch": 1.4544531390000597, "grad_norm": 0.4703928054737579, "learning_rate": 1.133114813729692e-05, "loss": 0.3342, "step": 3044 }, { "epoch": 1.4549310077056328, "grad_norm": 0.5033596475668163, "learning_rate": 1.1325928113123431e-05, "loss": 0.3275, "step": 3045 }, { "epoch": 1.455408876411206, "grad_norm": 0.4994474514417599, "learning_rate": 1.1320707721160876e-05, "loss": 0.3334, "step": 3046 }, { "epoch": 1.4558867451167792, "grad_norm": 0.5430329386590091, "learning_rate": 1.1315486962857293e-05, "loss": 0.3116, "step": 3047 }, { "epoch": 1.4563646138223523, "grad_norm": 0.4838126377080352, "learning_rate": 1.1310265839660835e-05, "loss": 0.3131, "step": 3048 }, { "epoch": 1.4568424825279256, "grad_norm": 0.6439581398779899, "learning_rate": 1.130504435301974e-05, "loss": 0.3308, "step": 3049 }, { "epoch": 1.4573203512334987, "grad_norm": 0.4840420029195805, "learning_rate": 1.129982250438237e-05, "loss": 0.3257, "step": 3050 }, { "epoch": 1.4577982199390718, "grad_norm": 0.47837719127146977, "learning_rate": 1.129460029519716e-05, "loss": 0.3358, "step": 3051 }, { "epoch": 1.4582760886446449, "grad_norm": 0.4893765918667975, "learning_rate": 1.1289377726912665e-05, "loss": 0.3352, "step": 3052 }, { "epoch": 1.458753957350218, "grad_norm": 0.4923586108450949, "learning_rate": 1.1284154800977533e-05, "loss": 0.3281, "step": 3053 }, { "epoch": 1.4592318260557913, "grad_norm": 0.4621191750554461, "learning_rate": 1.127893151884051e-05, "loss": 0.3401, "step": 3054 }, { "epoch": 1.4597096947613644, "grad_norm": 0.492839774425873, "learning_rate": 1.1273707881950445e-05, "loss": 0.3327, "step": 3055 }, { "epoch": 1.4601875634669375, "grad_norm": 0.4643479712090378, "learning_rate": 1.1268483891756283e-05, "loss": 0.3292, "step": 3056 }, { "epoch": 1.4606654321725105, "grad_norm": 0.4671866712639955, "learning_rate": 1.1263259549707063e-05, "loss": 0.3255, "step": 3057 }, { "epoch": 1.4611433008780836, "grad_norm": 0.493972595273388, "learning_rate": 1.125803485725193e-05, "loss": 0.3326, "step": 3058 }, { "epoch": 1.461621169583657, "grad_norm": 0.4660787343724186, "learning_rate": 1.1252809815840118e-05, "loss": 0.3128, "step": 3059 }, { "epoch": 1.46209903828923, "grad_norm": 0.4975144467520985, "learning_rate": 1.1247584426920962e-05, "loss": 0.3134, "step": 3060 }, { "epoch": 1.4625769069948031, "grad_norm": 0.45843098521936504, "learning_rate": 1.124235869194389e-05, "loss": 0.3154, "step": 3061 }, { "epoch": 1.4630547757003765, "grad_norm": 0.4818123666317099, "learning_rate": 1.1237132612358436e-05, "loss": 0.3275, "step": 3062 }, { "epoch": 1.4635326444059495, "grad_norm": 0.5292710710183121, "learning_rate": 1.1231906189614217e-05, "loss": 0.3257, "step": 3063 }, { "epoch": 1.4640105131115226, "grad_norm": 0.48700887730356873, "learning_rate": 1.1226679425160949e-05, "loss": 0.3329, "step": 3064 }, { "epoch": 1.4644883818170957, "grad_norm": 0.48240701019079835, "learning_rate": 1.1221452320448449e-05, "loss": 0.3425, "step": 3065 }, { "epoch": 1.4649662505226688, "grad_norm": 0.48147101653808244, "learning_rate": 1.1216224876926622e-05, "loss": 0.318, "step": 3066 }, { "epoch": 1.4654441192282421, "grad_norm": 0.467596203651121, "learning_rate": 1.1210997096045466e-05, "loss": 0.3293, "step": 3067 }, { "epoch": 1.4659219879338152, "grad_norm": 0.4550529019067923, "learning_rate": 1.1205768979255078e-05, "loss": 0.3158, "step": 3068 }, { "epoch": 1.4663998566393883, "grad_norm": 0.48042440869684455, "learning_rate": 1.1200540528005645e-05, "loss": 0.3298, "step": 3069 }, { "epoch": 1.4668777253449614, "grad_norm": 0.5277816140348442, "learning_rate": 1.1195311743747445e-05, "loss": 0.3363, "step": 3070 }, { "epoch": 1.4673555940505345, "grad_norm": 0.4888123296530078, "learning_rate": 1.1190082627930854e-05, "loss": 0.3226, "step": 3071 }, { "epoch": 1.4678334627561078, "grad_norm": 0.5015585568421738, "learning_rate": 1.1184853182006332e-05, "loss": 0.3172, "step": 3072 }, { "epoch": 1.468311331461681, "grad_norm": 0.47745495211196265, "learning_rate": 1.1179623407424442e-05, "loss": 0.3181, "step": 3073 }, { "epoch": 1.468789200167254, "grad_norm": 0.5013475163036912, "learning_rate": 1.1174393305635825e-05, "loss": 0.3083, "step": 3074 }, { "epoch": 1.4692670688728273, "grad_norm": 0.521653074954796, "learning_rate": 1.116916287809122e-05, "loss": 0.3178, "step": 3075 }, { "epoch": 1.4697449375784004, "grad_norm": 0.4953449613595978, "learning_rate": 1.116393212624146e-05, "loss": 0.3375, "step": 3076 }, { "epoch": 1.4702228062839735, "grad_norm": 0.4704912345390417, "learning_rate": 1.1158701051537455e-05, "loss": 0.3199, "step": 3077 }, { "epoch": 1.4707006749895466, "grad_norm": 0.5066215917086822, "learning_rate": 1.1153469655430218e-05, "loss": 0.3028, "step": 3078 }, { "epoch": 1.4711785436951197, "grad_norm": 0.49299038538236867, "learning_rate": 1.1148237939370847e-05, "loss": 0.3013, "step": 3079 }, { "epoch": 1.471656412400693, "grad_norm": 0.503131186431184, "learning_rate": 1.1143005904810527e-05, "loss": 0.3228, "step": 3080 }, { "epoch": 1.472134281106266, "grad_norm": 0.48541708299393943, "learning_rate": 1.1137773553200528e-05, "loss": 0.3247, "step": 3081 }, { "epoch": 1.4726121498118392, "grad_norm": 0.4667618110206343, "learning_rate": 1.1132540885992221e-05, "loss": 0.3302, "step": 3082 }, { "epoch": 1.4730900185174123, "grad_norm": 0.4966223048730619, "learning_rate": 1.1127307904637044e-05, "loss": 0.3098, "step": 3083 }, { "epoch": 1.4735678872229854, "grad_norm": 0.49913335938844805, "learning_rate": 1.1122074610586541e-05, "loss": 0.3338, "step": 3084 }, { "epoch": 1.4740457559285587, "grad_norm": 0.5787027610611666, "learning_rate": 1.1116841005292339e-05, "loss": 0.3233, "step": 3085 }, { "epoch": 1.4745236246341318, "grad_norm": 0.4988518285658534, "learning_rate": 1.1111607090206135e-05, "loss": 0.3451, "step": 3086 }, { "epoch": 1.4750014933397049, "grad_norm": 0.48890728691122937, "learning_rate": 1.1106372866779738e-05, "loss": 0.3226, "step": 3087 }, { "epoch": 1.4754793620452782, "grad_norm": 0.4743490756030402, "learning_rate": 1.110113833646502e-05, "loss": 0.3174, "step": 3088 }, { "epoch": 1.4759572307508513, "grad_norm": 0.496094343763048, "learning_rate": 1.1095903500713953e-05, "loss": 0.3114, "step": 3089 }, { "epoch": 1.4764350994564244, "grad_norm": 0.5359507037011488, "learning_rate": 1.1090668360978589e-05, "loss": 0.3144, "step": 3090 }, { "epoch": 1.4769129681619975, "grad_norm": 0.5615293353766108, "learning_rate": 1.1085432918711059e-05, "loss": 0.3209, "step": 3091 }, { "epoch": 1.4773908368675706, "grad_norm": 0.512584855813972, "learning_rate": 1.1080197175363584e-05, "loss": 0.3242, "step": 3092 }, { "epoch": 1.4778687055731439, "grad_norm": 0.5265949611457437, "learning_rate": 1.1074961132388466e-05, "loss": 0.3275, "step": 3093 }, { "epoch": 1.478346574278717, "grad_norm": 0.49105786313448796, "learning_rate": 1.1069724791238092e-05, "loss": 0.3118, "step": 3094 }, { "epoch": 1.47882444298429, "grad_norm": 0.47704097499158915, "learning_rate": 1.106448815336493e-05, "loss": 0.3289, "step": 3095 }, { "epoch": 1.4793023116898631, "grad_norm": 0.5232471767406344, "learning_rate": 1.1059251220221534e-05, "loss": 0.3302, "step": 3096 }, { "epoch": 1.4797801803954362, "grad_norm": 0.5228817184494445, "learning_rate": 1.1054013993260533e-05, "loss": 0.3201, "step": 3097 }, { "epoch": 1.4802580491010096, "grad_norm": 0.4936116345958341, "learning_rate": 1.1048776473934642e-05, "loss": 0.32, "step": 3098 }, { "epoch": 1.4807359178065826, "grad_norm": 0.4930123354861475, "learning_rate": 1.1043538663696658e-05, "loss": 0.319, "step": 3099 }, { "epoch": 1.4812137865121557, "grad_norm": 0.5490307592266955, "learning_rate": 1.1038300563999455e-05, "loss": 0.3392, "step": 3100 }, { "epoch": 1.481691655217729, "grad_norm": 0.5030549664101823, "learning_rate": 1.1033062176295992e-05, "loss": 0.3256, "step": 3101 }, { "epoch": 1.4821695239233021, "grad_norm": 0.4693206453740588, "learning_rate": 1.1027823502039307e-05, "loss": 0.3357, "step": 3102 }, { "epoch": 1.4826473926288752, "grad_norm": 0.49980642230471584, "learning_rate": 1.1022584542682508e-05, "loss": 0.3224, "step": 3103 }, { "epoch": 1.4831252613344483, "grad_norm": 0.4872985953951889, "learning_rate": 1.1017345299678797e-05, "loss": 0.32, "step": 3104 }, { "epoch": 1.4836031300400214, "grad_norm": 0.5155764862652535, "learning_rate": 1.1012105774481446e-05, "loss": 0.3255, "step": 3105 }, { "epoch": 1.4840809987455947, "grad_norm": 0.49092199999070874, "learning_rate": 1.1006865968543805e-05, "loss": 0.3022, "step": 3106 }, { "epoch": 1.4845588674511678, "grad_norm": 0.4638968403292164, "learning_rate": 1.1001625883319307e-05, "loss": 0.3284, "step": 3107 }, { "epoch": 1.485036736156741, "grad_norm": 0.47749399048412333, "learning_rate": 1.0996385520261457e-05, "loss": 0.3171, "step": 3108 }, { "epoch": 1.485514604862314, "grad_norm": 0.571045799710566, "learning_rate": 1.0991144880823838e-05, "loss": 0.3236, "step": 3109 }, { "epoch": 1.485992473567887, "grad_norm": 0.49745529315284953, "learning_rate": 1.0985903966460115e-05, "loss": 0.3439, "step": 3110 }, { "epoch": 1.4864703422734604, "grad_norm": 0.5308655028324103, "learning_rate": 1.0980662778624023e-05, "loss": 0.329, "step": 3111 }, { "epoch": 1.4869482109790335, "grad_norm": 0.5029668660735324, "learning_rate": 1.0975421318769373e-05, "loss": 0.332, "step": 3112 }, { "epoch": 1.4874260796846066, "grad_norm": 0.5166152056859417, "learning_rate": 1.0970179588350054e-05, "loss": 0.3311, "step": 3113 }, { "epoch": 1.48790394839018, "grad_norm": 0.5380965879327918, "learning_rate": 1.0964937588820036e-05, "loss": 0.3368, "step": 3114 }, { "epoch": 1.488381817095753, "grad_norm": 0.4866414171234857, "learning_rate": 1.0959695321633346e-05, "loss": 0.3182, "step": 3115 }, { "epoch": 1.488859685801326, "grad_norm": 0.5061118269564608, "learning_rate": 1.0954452788244106e-05, "loss": 0.3256, "step": 3116 }, { "epoch": 1.4893375545068992, "grad_norm": 0.4940241531258727, "learning_rate": 1.0949209990106497e-05, "loss": 0.3083, "step": 3117 }, { "epoch": 1.4898154232124723, "grad_norm": 0.4543349967578536, "learning_rate": 1.0943966928674783e-05, "loss": 0.3339, "step": 3118 }, { "epoch": 1.4902932919180456, "grad_norm": 0.5120304669937483, "learning_rate": 1.093872360540329e-05, "loss": 0.3219, "step": 3119 }, { "epoch": 1.4907711606236187, "grad_norm": 0.4662796909068133, "learning_rate": 1.0933480021746432e-05, "loss": 0.327, "step": 3120 }, { "epoch": 1.4912490293291918, "grad_norm": 0.4846749119742991, "learning_rate": 1.0928236179158678e-05, "loss": 0.3087, "step": 3121 }, { "epoch": 1.4917268980347649, "grad_norm": 0.48452861211325327, "learning_rate": 1.0922992079094588e-05, "loss": 0.3141, "step": 3122 }, { "epoch": 1.492204766740338, "grad_norm": 0.499123257716151, "learning_rate": 1.0917747723008771e-05, "loss": 0.3193, "step": 3123 }, { "epoch": 1.4926826354459113, "grad_norm": 0.46393153800632286, "learning_rate": 1.0912503112355926e-05, "loss": 0.3166, "step": 3124 }, { "epoch": 1.4931605041514844, "grad_norm": 0.5298869777156393, "learning_rate": 1.0907258248590816e-05, "loss": 0.3258, "step": 3125 }, { "epoch": 1.4936383728570575, "grad_norm": 0.47615348856877815, "learning_rate": 1.0902013133168267e-05, "loss": 0.296, "step": 3126 }, { "epoch": 1.4941162415626308, "grad_norm": 0.4861605658720827, "learning_rate": 1.089676776754319e-05, "loss": 0.3092, "step": 3127 }, { "epoch": 1.4945941102682039, "grad_norm": 0.5061227916241133, "learning_rate": 1.0891522153170553e-05, "loss": 0.3258, "step": 3128 }, { "epoch": 1.495071978973777, "grad_norm": 0.44470015995888657, "learning_rate": 1.0886276291505395e-05, "loss": 0.3264, "step": 3129 }, { "epoch": 1.49554984767935, "grad_norm": 0.4956355952980061, "learning_rate": 1.0881030184002827e-05, "loss": 0.3124, "step": 3130 }, { "epoch": 1.4960277163849232, "grad_norm": 0.47672673270267124, "learning_rate": 1.0875783832118032e-05, "loss": 0.3075, "step": 3131 }, { "epoch": 1.4965055850904965, "grad_norm": 0.48537882960310214, "learning_rate": 1.0870537237306245e-05, "loss": 0.3264, "step": 3132 }, { "epoch": 1.4969834537960696, "grad_norm": 0.5110345138782598, "learning_rate": 1.0865290401022785e-05, "loss": 0.325, "step": 3133 }, { "epoch": 1.4974613225016427, "grad_norm": 0.4626387908464134, "learning_rate": 1.0860043324723035e-05, "loss": 0.315, "step": 3134 }, { "epoch": 1.4979391912072157, "grad_norm": 0.4425903321792646, "learning_rate": 1.0854796009862434e-05, "loss": 0.3349, "step": 3135 }, { "epoch": 1.4984170599127888, "grad_norm": 0.5005573549699335, "learning_rate": 1.0849548457896499e-05, "loss": 0.3229, "step": 3136 }, { "epoch": 1.4988949286183622, "grad_norm": 0.4629721762379501, "learning_rate": 1.0844300670280809e-05, "loss": 0.3098, "step": 3137 }, { "epoch": 1.4993727973239352, "grad_norm": 0.45157919259053075, "learning_rate": 1.0839052648471002e-05, "loss": 0.3338, "step": 3138 }, { "epoch": 1.4998506660295083, "grad_norm": 0.49218452155784925, "learning_rate": 1.0833804393922796e-05, "loss": 0.3279, "step": 3139 }, { "epoch": 1.5003285347350817, "grad_norm": 0.46697736877733986, "learning_rate": 1.0828555908091958e-05, "loss": 0.3209, "step": 3140 }, { "epoch": 1.5008064034406545, "grad_norm": 0.502846029719294, "learning_rate": 1.0823307192434325e-05, "loss": 0.3211, "step": 3141 }, { "epoch": 1.5012842721462278, "grad_norm": 0.5308861043661305, "learning_rate": 1.08180582484058e-05, "loss": 0.317, "step": 3142 }, { "epoch": 1.501762140851801, "grad_norm": 0.9950929077061546, "learning_rate": 1.0812809077462348e-05, "loss": 0.3308, "step": 3143 }, { "epoch": 1.502240009557374, "grad_norm": 0.4748834877132264, "learning_rate": 1.0807559681059993e-05, "loss": 0.3225, "step": 3144 }, { "epoch": 1.5027178782629473, "grad_norm": 0.48714653402335817, "learning_rate": 1.0802310060654832e-05, "loss": 0.3294, "step": 3145 }, { "epoch": 1.5031957469685204, "grad_norm": 0.48089986557496106, "learning_rate": 1.079706021770301e-05, "loss": 0.3175, "step": 3146 }, { "epoch": 1.5036736156740935, "grad_norm": 0.5517783451396178, "learning_rate": 1.0791810153660745e-05, "loss": 0.3294, "step": 3147 }, { "epoch": 1.5041514843796668, "grad_norm": 0.5650570568015437, "learning_rate": 1.078655986998431e-05, "loss": 0.3312, "step": 3148 }, { "epoch": 1.5046293530852397, "grad_norm": 0.45637472471019414, "learning_rate": 1.0781309368130042e-05, "loss": 0.3262, "step": 3149 }, { "epoch": 1.505107221790813, "grad_norm": 0.4653008253186181, "learning_rate": 1.0776058649554336e-05, "loss": 0.3049, "step": 3150 }, { "epoch": 1.5055850904963861, "grad_norm": 0.545056353096684, "learning_rate": 1.0770807715713651e-05, "loss": 0.3362, "step": 3151 }, { "epoch": 1.5060629592019592, "grad_norm": 0.4902414971923998, "learning_rate": 1.0765556568064503e-05, "loss": 0.3204, "step": 3152 }, { "epoch": 1.5065408279075325, "grad_norm": 0.4608071782816651, "learning_rate": 1.0760305208063467e-05, "loss": 0.3208, "step": 3153 }, { "epoch": 1.5070186966131054, "grad_norm": 0.4808143098839568, "learning_rate": 1.0755053637167178e-05, "loss": 0.3328, "step": 3154 }, { "epoch": 1.5074965653186787, "grad_norm": 0.48834994164881984, "learning_rate": 1.0749801856832325e-05, "loss": 0.3175, "step": 3155 }, { "epoch": 1.5079744340242518, "grad_norm": 0.49638350606323217, "learning_rate": 1.0744549868515667e-05, "loss": 0.3324, "step": 3156 }, { "epoch": 1.508452302729825, "grad_norm": 0.49222615340506876, "learning_rate": 1.073929767367401e-05, "loss": 0.322, "step": 3157 }, { "epoch": 1.5089301714353982, "grad_norm": 0.4564826854769966, "learning_rate": 1.0734045273764217e-05, "loss": 0.3301, "step": 3158 }, { "epoch": 1.5094080401409713, "grad_norm": 0.4941738943168986, "learning_rate": 1.0728792670243215e-05, "loss": 0.3282, "step": 3159 }, { "epoch": 1.5098859088465444, "grad_norm": 0.4565157299448364, "learning_rate": 1.0723539864567983e-05, "loss": 0.3139, "step": 3160 }, { "epoch": 1.5103637775521177, "grad_norm": 0.46731577882122116, "learning_rate": 1.0718286858195553e-05, "loss": 0.3094, "step": 3161 }, { "epoch": 1.5108416462576906, "grad_norm": 0.5380454191031079, "learning_rate": 1.071303365258302e-05, "loss": 0.3274, "step": 3162 }, { "epoch": 1.5113195149632639, "grad_norm": 0.4504131152594778, "learning_rate": 1.070778024918753e-05, "loss": 0.3404, "step": 3163 }, { "epoch": 1.511797383668837, "grad_norm": 0.47414368841241944, "learning_rate": 1.0702526649466282e-05, "loss": 0.3326, "step": 3164 }, { "epoch": 1.51227525237441, "grad_norm": 0.48484243615521744, "learning_rate": 1.0697272854876537e-05, "loss": 0.3149, "step": 3165 }, { "epoch": 1.5127531210799834, "grad_norm": 0.4926823715355633, "learning_rate": 1.0692018866875598e-05, "loss": 0.3274, "step": 3166 }, { "epoch": 1.5132309897855563, "grad_norm": 0.47882967616119354, "learning_rate": 1.0686764686920834e-05, "loss": 0.3292, "step": 3167 }, { "epoch": 1.5137088584911296, "grad_norm": 0.4626903520160237, "learning_rate": 1.0681510316469661e-05, "loss": 0.3185, "step": 3168 }, { "epoch": 1.5141867271967027, "grad_norm": 0.560531953606567, "learning_rate": 1.0676255756979548e-05, "loss": 0.3301, "step": 3169 }, { "epoch": 1.5146645959022758, "grad_norm": 0.48792270052341974, "learning_rate": 1.0671001009908015e-05, "loss": 0.3126, "step": 3170 }, { "epoch": 1.515142464607849, "grad_norm": 0.5129007440119009, "learning_rate": 1.066574607671264e-05, "loss": 0.3181, "step": 3171 }, { "epoch": 1.5156203333134222, "grad_norm": 0.4711038320320487, "learning_rate": 1.0660490958851044e-05, "loss": 0.3178, "step": 3172 }, { "epoch": 1.5160982020189953, "grad_norm": 0.5092552024671185, "learning_rate": 1.0655235657780906e-05, "loss": 0.3289, "step": 3173 }, { "epoch": 1.5165760707245686, "grad_norm": 0.5953972215800041, "learning_rate": 1.0649980174959961e-05, "loss": 0.3102, "step": 3174 }, { "epoch": 1.5170539394301414, "grad_norm": 0.4703123203138388, "learning_rate": 1.0644724511845976e-05, "loss": 0.3143, "step": 3175 }, { "epoch": 1.5175318081357148, "grad_norm": 0.5165416055915609, "learning_rate": 1.0639468669896787e-05, "loss": 0.3115, "step": 3176 }, { "epoch": 1.5180096768412878, "grad_norm": 0.582450596746319, "learning_rate": 1.0634212650570269e-05, "loss": 0.3316, "step": 3177 }, { "epoch": 1.518487545546861, "grad_norm": 0.4535700489101072, "learning_rate": 1.0628956455324347e-05, "loss": 0.3307, "step": 3178 }, { "epoch": 1.5189654142524343, "grad_norm": 0.48156284886715084, "learning_rate": 1.0623700085616999e-05, "loss": 0.327, "step": 3179 }, { "epoch": 1.5194432829580071, "grad_norm": 0.4838090723343779, "learning_rate": 1.0618443542906251e-05, "loss": 0.3302, "step": 3180 }, { "epoch": 1.5199211516635804, "grad_norm": 0.5079130746721264, "learning_rate": 1.0613186828650171e-05, "loss": 0.3202, "step": 3181 }, { "epoch": 1.5203990203691535, "grad_norm": 0.4986317378339555, "learning_rate": 1.0607929944306883e-05, "loss": 0.308, "step": 3182 }, { "epoch": 1.5208768890747266, "grad_norm": 0.5275576649032065, "learning_rate": 1.0602672891334552e-05, "loss": 0.3332, "step": 3183 }, { "epoch": 1.5213547577803, "grad_norm": 0.4731647767150495, "learning_rate": 1.0597415671191391e-05, "loss": 0.3509, "step": 3184 }, { "epoch": 1.521832626485873, "grad_norm": 0.4763797455280031, "learning_rate": 1.059215828533566e-05, "loss": 0.3286, "step": 3185 }, { "epoch": 1.5223104951914461, "grad_norm": 0.47355536256007635, "learning_rate": 1.0586900735225669e-05, "loss": 0.3239, "step": 3186 }, { "epoch": 1.5227883638970194, "grad_norm": 0.4674306668056484, "learning_rate": 1.0581643022319765e-05, "loss": 0.3065, "step": 3187 }, { "epoch": 1.5232662326025923, "grad_norm": 0.5012159446084873, "learning_rate": 1.0576385148076346e-05, "loss": 0.3288, "step": 3188 }, { "epoch": 1.5237441013081656, "grad_norm": 0.4926801554454291, "learning_rate": 1.0571127113953855e-05, "loss": 0.321, "step": 3189 }, { "epoch": 1.5242219700137387, "grad_norm": 0.5352440355066403, "learning_rate": 1.0565868921410776e-05, "loss": 0.3326, "step": 3190 }, { "epoch": 1.5246998387193118, "grad_norm": 0.5707201595977832, "learning_rate": 1.0560610571905642e-05, "loss": 0.3247, "step": 3191 }, { "epoch": 1.5251777074248851, "grad_norm": 0.46143438621243094, "learning_rate": 1.0555352066897025e-05, "loss": 0.335, "step": 3192 }, { "epoch": 1.525655576130458, "grad_norm": 0.46914561506550345, "learning_rate": 1.0550093407843538e-05, "loss": 0.3035, "step": 3193 }, { "epoch": 1.5261334448360313, "grad_norm": 0.4745615418585816, "learning_rate": 1.0544834596203846e-05, "loss": 0.3192, "step": 3194 }, { "epoch": 1.5266113135416044, "grad_norm": 0.48717953885939197, "learning_rate": 1.0539575633436645e-05, "loss": 0.3333, "step": 3195 }, { "epoch": 1.5270891822471775, "grad_norm": 0.4735843171248546, "learning_rate": 1.0534316521000683e-05, "loss": 0.3256, "step": 3196 }, { "epoch": 1.5275670509527508, "grad_norm": 0.4796146078480026, "learning_rate": 1.0529057260354744e-05, "loss": 0.311, "step": 3197 }, { "epoch": 1.528044919658324, "grad_norm": 0.4964693345022835, "learning_rate": 1.052379785295765e-05, "loss": 0.3328, "step": 3198 }, { "epoch": 1.528522788363897, "grad_norm": 0.4551957925753305, "learning_rate": 1.0518538300268275e-05, "loss": 0.3267, "step": 3199 }, { "epoch": 1.5290006570694703, "grad_norm": 0.4833164973926133, "learning_rate": 1.0513278603745523e-05, "loss": 0.3346, "step": 3200 }, { "epoch": 1.5294785257750432, "grad_norm": 0.4976783327391536, "learning_rate": 1.0508018764848336e-05, "loss": 0.3214, "step": 3201 }, { "epoch": 1.5299563944806165, "grad_norm": 0.5163458481272856, "learning_rate": 1.0502758785035708e-05, "loss": 0.3276, "step": 3202 }, { "epoch": 1.5304342631861896, "grad_norm": 0.5188646785189226, "learning_rate": 1.0497498665766662e-05, "loss": 0.3228, "step": 3203 }, { "epoch": 1.5309121318917627, "grad_norm": 0.4865309249115541, "learning_rate": 1.049223840850026e-05, "loss": 0.3232, "step": 3204 }, { "epoch": 1.531390000597336, "grad_norm": 0.508377716195505, "learning_rate": 1.0486978014695606e-05, "loss": 0.3224, "step": 3205 }, { "epoch": 1.531867869302909, "grad_norm": 0.4890171119416868, "learning_rate": 1.048171748581184e-05, "loss": 0.321, "step": 3206 }, { "epoch": 1.5323457380084822, "grad_norm": 0.49225304466391234, "learning_rate": 1.0476456823308144e-05, "loss": 0.295, "step": 3207 }, { "epoch": 1.5328236067140553, "grad_norm": 0.533313040699636, "learning_rate": 1.0471196028643728e-05, "loss": 0.3171, "step": 3208 }, { "epoch": 1.5333014754196284, "grad_norm": 0.5656534777964286, "learning_rate": 1.0465935103277845e-05, "loss": 0.324, "step": 3209 }, { "epoch": 1.5337793441252017, "grad_norm": 0.49695878757528583, "learning_rate": 1.0460674048669783e-05, "loss": 0.3218, "step": 3210 }, { "epoch": 1.5342572128307748, "grad_norm": 0.5362838974016912, "learning_rate": 1.0455412866278868e-05, "loss": 0.3242, "step": 3211 }, { "epoch": 1.5347350815363479, "grad_norm": 0.5144518785758332, "learning_rate": 1.0450151557564457e-05, "loss": 0.3334, "step": 3212 }, { "epoch": 1.5352129502419212, "grad_norm": 0.5068918594348978, "learning_rate": 1.0444890123985942e-05, "loss": 0.3299, "step": 3213 }, { "epoch": 1.535690818947494, "grad_norm": 0.4780919059955396, "learning_rate": 1.043962856700276e-05, "loss": 0.3199, "step": 3214 }, { "epoch": 1.5361686876530674, "grad_norm": 0.5127686326304569, "learning_rate": 1.0434366888074363e-05, "loss": 0.3193, "step": 3215 }, { "epoch": 1.5366465563586404, "grad_norm": 0.49589298134321574, "learning_rate": 1.0429105088660253e-05, "loss": 0.3226, "step": 3216 }, { "epoch": 1.5371244250642135, "grad_norm": 0.4860233798195414, "learning_rate": 1.0423843170219966e-05, "loss": 0.3365, "step": 3217 }, { "epoch": 1.5376022937697869, "grad_norm": 0.4714518886662182, "learning_rate": 1.0418581134213055e-05, "loss": 0.3244, "step": 3218 }, { "epoch": 1.53808016247536, "grad_norm": 0.4955762103566397, "learning_rate": 1.0413318982099124e-05, "loss": 0.3226, "step": 3219 }, { "epoch": 1.538558031180933, "grad_norm": 0.4973793360007407, "learning_rate": 1.0408056715337797e-05, "loss": 0.3154, "step": 3220 }, { "epoch": 1.5390358998865061, "grad_norm": 0.487987592402328, "learning_rate": 1.0402794335388733e-05, "loss": 0.3219, "step": 3221 }, { "epoch": 1.5395137685920792, "grad_norm": 0.5013231822647967, "learning_rate": 1.0397531843711626e-05, "loss": 0.3144, "step": 3222 }, { "epoch": 1.5399916372976525, "grad_norm": 0.5122546611335848, "learning_rate": 1.0392269241766199e-05, "loss": 0.3161, "step": 3223 }, { "epoch": 1.5404695060032256, "grad_norm": 0.5023537156876484, "learning_rate": 1.0387006531012204e-05, "loss": 0.3307, "step": 3224 }, { "epoch": 1.5409473747087987, "grad_norm": 0.46724605937313596, "learning_rate": 1.0381743712909424e-05, "loss": 0.3229, "step": 3225 }, { "epoch": 1.541425243414372, "grad_norm": 0.47093375402216126, "learning_rate": 1.0376480788917676e-05, "loss": 0.3279, "step": 3226 }, { "epoch": 1.541903112119945, "grad_norm": 0.46994586921571396, "learning_rate": 1.0371217760496792e-05, "loss": 0.3236, "step": 3227 }, { "epoch": 1.5423809808255182, "grad_norm": 0.4843800423622691, "learning_rate": 1.0365954629106652e-05, "loss": 0.3085, "step": 3228 }, { "epoch": 1.5428588495310913, "grad_norm": 0.4772582302201957, "learning_rate": 1.0360691396207155e-05, "loss": 0.3076, "step": 3229 }, { "epoch": 1.5433367182366644, "grad_norm": 0.481322355383063, "learning_rate": 1.0355428063258224e-05, "loss": 0.3068, "step": 3230 }, { "epoch": 1.5438145869422377, "grad_norm": 0.511101779977841, "learning_rate": 1.0350164631719816e-05, "loss": 0.3217, "step": 3231 }, { "epoch": 1.5442924556478108, "grad_norm": 0.45963843948522115, "learning_rate": 1.0344901103051923e-05, "loss": 0.343, "step": 3232 }, { "epoch": 1.544770324353384, "grad_norm": 0.48757882063659436, "learning_rate": 1.033963747871454e-05, "loss": 0.3232, "step": 3233 }, { "epoch": 1.545248193058957, "grad_norm": 0.5055497495839371, "learning_rate": 1.0334373760167718e-05, "loss": 0.3234, "step": 3234 }, { "epoch": 1.54572606176453, "grad_norm": 0.49927220968524727, "learning_rate": 1.0329109948871512e-05, "loss": 0.3197, "step": 3235 }, { "epoch": 1.5462039304701034, "grad_norm": 0.48763244928090066, "learning_rate": 1.032384604628601e-05, "loss": 0.3222, "step": 3236 }, { "epoch": 1.5466817991756765, "grad_norm": 0.464310330481023, "learning_rate": 1.0318582053871326e-05, "loss": 0.319, "step": 3237 }, { "epoch": 1.5471596678812496, "grad_norm": 0.4970739536764959, "learning_rate": 1.0313317973087603e-05, "loss": 0.3121, "step": 3238 }, { "epoch": 1.547637536586823, "grad_norm": 0.5697514900420939, "learning_rate": 1.0308053805394998e-05, "loss": 0.3153, "step": 3239 }, { "epoch": 1.5481154052923958, "grad_norm": 0.48345598443017196, "learning_rate": 1.0302789552253702e-05, "loss": 0.3194, "step": 3240 }, { "epoch": 1.548593273997969, "grad_norm": 0.4761641613916091, "learning_rate": 1.0297525215123927e-05, "loss": 0.3175, "step": 3241 }, { "epoch": 1.5490711427035422, "grad_norm": 0.4899651453697106, "learning_rate": 1.0292260795465905e-05, "loss": 0.3289, "step": 3242 }, { "epoch": 1.5495490114091153, "grad_norm": 0.5535534354531657, "learning_rate": 1.0286996294739895e-05, "loss": 0.3321, "step": 3243 }, { "epoch": 1.5500268801146886, "grad_norm": 0.49382296783652835, "learning_rate": 1.0281731714406172e-05, "loss": 0.3274, "step": 3244 }, { "epoch": 1.5505047488202617, "grad_norm": 0.48868429365584, "learning_rate": 1.0276467055925044e-05, "loss": 0.315, "step": 3245 }, { "epoch": 1.5509826175258348, "grad_norm": 0.4900357280511654, "learning_rate": 1.027120232075683e-05, "loss": 0.3273, "step": 3246 }, { "epoch": 1.551460486231408, "grad_norm": 0.44278332935205783, "learning_rate": 1.0265937510361876e-05, "loss": 0.3363, "step": 3247 }, { "epoch": 1.551938354936981, "grad_norm": 0.5025588525393508, "learning_rate": 1.0260672626200548e-05, "loss": 0.3467, "step": 3248 }, { "epoch": 1.5524162236425543, "grad_norm": 0.48485895773001003, "learning_rate": 1.0255407669733235e-05, "loss": 0.3179, "step": 3249 }, { "epoch": 1.5528940923481274, "grad_norm": 0.48681474256092666, "learning_rate": 1.0250142642420335e-05, "loss": 0.3131, "step": 3250 }, { "epoch": 1.5533719610537005, "grad_norm": 0.5072938805616622, "learning_rate": 1.024487754572228e-05, "loss": 0.3048, "step": 3251 }, { "epoch": 1.5538498297592738, "grad_norm": 0.464960127721567, "learning_rate": 1.0239612381099515e-05, "loss": 0.3081, "step": 3252 }, { "epoch": 1.5543276984648466, "grad_norm": 0.5029051775399344, "learning_rate": 1.02343471500125e-05, "loss": 0.326, "step": 3253 }, { "epoch": 1.55480556717042, "grad_norm": 0.4760931121940165, "learning_rate": 1.0229081853921719e-05, "loss": 0.3373, "step": 3254 }, { "epoch": 1.555283435875993, "grad_norm": 0.5211628472885691, "learning_rate": 1.0223816494287675e-05, "loss": 0.3194, "step": 3255 }, { "epoch": 1.5557613045815661, "grad_norm": 4.00869479916537, "learning_rate": 1.021855107257088e-05, "loss": 0.3147, "step": 3256 }, { "epoch": 1.5562391732871395, "grad_norm": 0.5278043616111543, "learning_rate": 1.0213285590231877e-05, "loss": 0.2986, "step": 3257 }, { "epoch": 1.5567170419927125, "grad_norm": 0.48164426331213295, "learning_rate": 1.020802004873121e-05, "loss": 0.3253, "step": 3258 }, { "epoch": 1.5571949106982856, "grad_norm": 0.5500585812101135, "learning_rate": 1.0202754449529453e-05, "loss": 0.3209, "step": 3259 }, { "epoch": 1.557672779403859, "grad_norm": 0.49446254990048405, "learning_rate": 1.0197488794087188e-05, "loss": 0.3198, "step": 3260 }, { "epoch": 1.5581506481094318, "grad_norm": 0.4657507878684054, "learning_rate": 1.0192223083865013e-05, "loss": 0.3256, "step": 3261 }, { "epoch": 1.5586285168150051, "grad_norm": 0.4477764129200038, "learning_rate": 1.0186957320323547e-05, "loss": 0.3271, "step": 3262 }, { "epoch": 1.5591063855205782, "grad_norm": 0.48278945630965814, "learning_rate": 1.0181691504923421e-05, "loss": 0.307, "step": 3263 }, { "epoch": 1.5595842542261513, "grad_norm": 1.1355856675651534, "learning_rate": 1.0176425639125273e-05, "loss": 0.3271, "step": 3264 }, { "epoch": 1.5600621229317246, "grad_norm": 0.4819769035656966, "learning_rate": 1.0171159724389766e-05, "loss": 0.3178, "step": 3265 }, { "epoch": 1.5605399916372975, "grad_norm": 0.4786394343649937, "learning_rate": 1.016589376217757e-05, "loss": 0.3382, "step": 3266 }, { "epoch": 1.5610178603428708, "grad_norm": 0.48736910260272115, "learning_rate": 1.016062775394937e-05, "loss": 0.3196, "step": 3267 }, { "epoch": 1.561495729048444, "grad_norm": 0.46448638746562965, "learning_rate": 1.0155361701165867e-05, "loss": 0.3148, "step": 3268 }, { "epoch": 1.561973597754017, "grad_norm": 0.49431236575933374, "learning_rate": 1.0150095605287768e-05, "loss": 0.3093, "step": 3269 }, { "epoch": 1.5624514664595903, "grad_norm": 0.4686420840220977, "learning_rate": 1.0144829467775794e-05, "loss": 0.3168, "step": 3270 }, { "epoch": 1.5629293351651634, "grad_norm": 0.44817236471431465, "learning_rate": 1.0139563290090679e-05, "loss": 0.3154, "step": 3271 }, { "epoch": 1.5634072038707365, "grad_norm": 0.46921012393643297, "learning_rate": 1.0134297073693173e-05, "loss": 0.3257, "step": 3272 }, { "epoch": 1.5638850725763098, "grad_norm": 0.46434620917562563, "learning_rate": 1.0129030820044024e-05, "loss": 0.3303, "step": 3273 }, { "epoch": 1.5643629412818827, "grad_norm": 0.6236012194460561, "learning_rate": 1.0123764530604003e-05, "loss": 0.3127, "step": 3274 }, { "epoch": 1.564840809987456, "grad_norm": 0.4944268701162515, "learning_rate": 1.0118498206833886e-05, "loss": 0.3269, "step": 3275 }, { "epoch": 1.565318678693029, "grad_norm": 0.5192897092562184, "learning_rate": 1.0113231850194455e-05, "loss": 0.3332, "step": 3276 }, { "epoch": 1.5657965473986022, "grad_norm": 0.4841696394389553, "learning_rate": 1.0107965462146507e-05, "loss": 0.3369, "step": 3277 }, { "epoch": 1.5662744161041755, "grad_norm": 0.5059546085514659, "learning_rate": 1.0102699044150845e-05, "loss": 0.3233, "step": 3278 }, { "epoch": 1.5667522848097484, "grad_norm": 0.4818467825525027, "learning_rate": 1.0097432597668279e-05, "loss": 0.3099, "step": 3279 }, { "epoch": 1.5672301535153217, "grad_norm": 0.4889420234246756, "learning_rate": 1.0092166124159628e-05, "loss": 0.3182, "step": 3280 }, { "epoch": 1.5677080222208948, "grad_norm": 0.4826052928580796, "learning_rate": 1.0086899625085725e-05, "loss": 0.3247, "step": 3281 }, { "epoch": 1.5681858909264679, "grad_norm": 0.49036739531543294, "learning_rate": 1.0081633101907393e-05, "loss": 0.329, "step": 3282 }, { "epoch": 1.5686637596320412, "grad_norm": 0.4574906703009941, "learning_rate": 1.007636655608548e-05, "loss": 0.3157, "step": 3283 }, { "epoch": 1.5691416283376143, "grad_norm": 0.4738908087609072, "learning_rate": 1.0071099989080833e-05, "loss": 0.3433, "step": 3284 }, { "epoch": 1.5696194970431874, "grad_norm": 0.46547191964568313, "learning_rate": 1.0065833402354302e-05, "loss": 0.3208, "step": 3285 }, { "epoch": 1.5700973657487607, "grad_norm": 0.5839094375309312, "learning_rate": 1.0060566797366744e-05, "loss": 0.3255, "step": 3286 }, { "epoch": 1.5705752344543336, "grad_norm": 0.4947521545103725, "learning_rate": 1.005530017557903e-05, "loss": 0.3208, "step": 3287 }, { "epoch": 1.5710531031599069, "grad_norm": 0.5025064223155531, "learning_rate": 1.005003353845202e-05, "loss": 0.3076, "step": 3288 }, { "epoch": 1.57153097186548, "grad_norm": 0.4669404424154083, "learning_rate": 1.0044766887446586e-05, "loss": 0.3276, "step": 3289 }, { "epoch": 1.572008840571053, "grad_norm": 0.5009595930368698, "learning_rate": 1.003950022402361e-05, "loss": 0.3106, "step": 3290 }, { "epoch": 1.5724867092766264, "grad_norm": 0.4948274057807226, "learning_rate": 1.0034233549643969e-05, "loss": 0.3424, "step": 3291 }, { "epoch": 1.5729645779821992, "grad_norm": 0.45361579014587694, "learning_rate": 1.0028966865768546e-05, "loss": 0.3258, "step": 3292 }, { "epoch": 1.5734424466877726, "grad_norm": 0.7899603895576185, "learning_rate": 1.0023700173858224e-05, "loss": 0.3189, "step": 3293 }, { "epoch": 1.5739203153933456, "grad_norm": 0.45700787119484876, "learning_rate": 1.0018433475373891e-05, "loss": 0.3257, "step": 3294 }, { "epoch": 1.5743981840989187, "grad_norm": 0.4721568980282006, "learning_rate": 1.0013166771776441e-05, "loss": 0.3356, "step": 3295 }, { "epoch": 1.574876052804492, "grad_norm": 0.4854731194781878, "learning_rate": 1.0007900064526756e-05, "loss": 0.3142, "step": 3296 }, { "epoch": 1.5753539215100651, "grad_norm": 0.4880060979457883, "learning_rate": 1.0002633355085734e-05, "loss": 0.3223, "step": 3297 }, { "epoch": 1.5758317902156382, "grad_norm": 0.4561283926612506, "learning_rate": 9.997366644914266e-06, "loss": 0.3206, "step": 3298 }, { "epoch": 1.5763096589212116, "grad_norm": 0.47632004228079955, "learning_rate": 9.992099935473244e-06, "loss": 0.3128, "step": 3299 }, { "epoch": 1.5767875276267844, "grad_norm": 0.4847083170328396, "learning_rate": 9.986833228223562e-06, "loss": 0.2984, "step": 3300 }, { "epoch": 1.5772653963323577, "grad_norm": 0.473252193657243, "learning_rate": 9.98156652462611e-06, "loss": 0.3223, "step": 3301 }, { "epoch": 1.5777432650379308, "grad_norm": 0.5012398605644248, "learning_rate": 9.976299826141776e-06, "loss": 0.3186, "step": 3302 }, { "epoch": 1.578221133743504, "grad_norm": 0.5394851324327755, "learning_rate": 9.971033134231458e-06, "loss": 0.3228, "step": 3303 }, { "epoch": 1.5786990024490772, "grad_norm": 0.46951070560264985, "learning_rate": 9.965766450356031e-06, "loss": 0.3249, "step": 3304 }, { "epoch": 1.57917687115465, "grad_norm": 0.47934334873980533, "learning_rate": 9.96049977597639e-06, "loss": 0.3197, "step": 3305 }, { "epoch": 1.5796547398602234, "grad_norm": 0.514993715660872, "learning_rate": 9.955233112553416e-06, "loss": 0.3037, "step": 3306 }, { "epoch": 1.5801326085657965, "grad_norm": 0.4812618916635242, "learning_rate": 9.949966461547984e-06, "loss": 0.3224, "step": 3307 }, { "epoch": 1.5806104772713696, "grad_norm": 0.49269064197863277, "learning_rate": 9.944699824420973e-06, "loss": 0.3335, "step": 3308 }, { "epoch": 1.581088345976943, "grad_norm": 0.4888765058606419, "learning_rate": 9.939433202633258e-06, "loss": 0.3188, "step": 3309 }, { "epoch": 1.581566214682516, "grad_norm": 0.47130242909704306, "learning_rate": 9.934166597645703e-06, "loss": 0.327, "step": 3310 }, { "epoch": 1.582044083388089, "grad_norm": 0.4980212169856602, "learning_rate": 9.92890001091917e-06, "loss": 0.3144, "step": 3311 }, { "epoch": 1.5825219520936624, "grad_norm": 0.5047705095751753, "learning_rate": 9.923633443914522e-06, "loss": 0.3098, "step": 3312 }, { "epoch": 1.5829998207992353, "grad_norm": 0.5055307123773655, "learning_rate": 9.91836689809261e-06, "loss": 0.3249, "step": 3313 }, { "epoch": 1.5834776895048086, "grad_norm": 0.45564093127252564, "learning_rate": 9.913100374914279e-06, "loss": 0.3027, "step": 3314 }, { "epoch": 1.5839555582103817, "grad_norm": 0.48312085794596293, "learning_rate": 9.907833875840374e-06, "loss": 0.3143, "step": 3315 }, { "epoch": 1.5844334269159548, "grad_norm": 0.4737622360232326, "learning_rate": 9.902567402331723e-06, "loss": 0.3171, "step": 3316 }, { "epoch": 1.584911295621528, "grad_norm": 0.46646546545393947, "learning_rate": 9.897300955849157e-06, "loss": 0.306, "step": 3317 }, { "epoch": 1.585389164327101, "grad_norm": 0.5382467045040225, "learning_rate": 9.892034537853495e-06, "loss": 0.3118, "step": 3318 }, { "epoch": 1.5858670330326743, "grad_norm": 0.46491097933901154, "learning_rate": 9.886768149805546e-06, "loss": 0.3236, "step": 3319 }, { "epoch": 1.5863449017382474, "grad_norm": 0.48990510634701584, "learning_rate": 9.881501793166117e-06, "loss": 0.32, "step": 3320 }, { "epoch": 1.5868227704438205, "grad_norm": 0.4872711839238837, "learning_rate": 9.876235469395999e-06, "loss": 0.3242, "step": 3321 }, { "epoch": 1.5873006391493938, "grad_norm": 0.46588753435922464, "learning_rate": 9.870969179955978e-06, "loss": 0.3045, "step": 3322 }, { "epoch": 1.5877785078549669, "grad_norm": 0.4365913398780867, "learning_rate": 9.86570292630683e-06, "loss": 0.31, "step": 3323 }, { "epoch": 1.58825637656054, "grad_norm": 0.5319535297106422, "learning_rate": 9.860436709909324e-06, "loss": 0.3159, "step": 3324 }, { "epoch": 1.5887342452661133, "grad_norm": 0.49154790591120767, "learning_rate": 9.85517053222421e-06, "loss": 0.3555, "step": 3325 }, { "epoch": 1.5892121139716862, "grad_norm": 0.4843832531975595, "learning_rate": 9.849904394712237e-06, "loss": 0.312, "step": 3326 }, { "epoch": 1.5896899826772595, "grad_norm": 0.466629508182958, "learning_rate": 9.84463829883414e-06, "loss": 0.3207, "step": 3327 }, { "epoch": 1.5901678513828326, "grad_norm": 0.5132633803194183, "learning_rate": 9.839372246050633e-06, "loss": 0.3106, "step": 3328 }, { "epoch": 1.5906457200884057, "grad_norm": 0.47341023027177437, "learning_rate": 9.834106237822434e-06, "loss": 0.3185, "step": 3329 }, { "epoch": 1.591123588793979, "grad_norm": 0.49348692682013234, "learning_rate": 9.82884027561024e-06, "loss": 0.3286, "step": 3330 }, { "epoch": 1.5916014574995518, "grad_norm": 0.4663364393993305, "learning_rate": 9.823574360874732e-06, "loss": 0.3146, "step": 3331 }, { "epoch": 1.5920793262051252, "grad_norm": 0.5075761152310269, "learning_rate": 9.818308495076582e-06, "loss": 0.3231, "step": 3332 }, { "epoch": 1.5925571949106982, "grad_norm": 0.4901694360126208, "learning_rate": 9.813042679676453e-06, "loss": 0.3078, "step": 3333 }, { "epoch": 1.5930350636162713, "grad_norm": 0.47515746573089945, "learning_rate": 9.807776916134985e-06, "loss": 0.3177, "step": 3334 }, { "epoch": 1.5935129323218447, "grad_norm": 0.45885158737432963, "learning_rate": 9.802511205912815e-06, "loss": 0.333, "step": 3335 }, { "epoch": 1.5939908010274177, "grad_norm": 0.4932981943331556, "learning_rate": 9.797245550470549e-06, "loss": 0.2977, "step": 3336 }, { "epoch": 1.5944686697329908, "grad_norm": 0.4566237024471887, "learning_rate": 9.791979951268791e-06, "loss": 0.3147, "step": 3337 }, { "epoch": 1.5949465384385642, "grad_norm": 0.4767957332887537, "learning_rate": 9.786714409768127e-06, "loss": 0.3114, "step": 3338 }, { "epoch": 1.595424407144137, "grad_norm": 0.583636525688097, "learning_rate": 9.78144892742912e-06, "loss": 0.3029, "step": 3339 }, { "epoch": 1.5959022758497103, "grad_norm": 0.5062035780498737, "learning_rate": 9.776183505712327e-06, "loss": 0.3161, "step": 3340 }, { "epoch": 1.5963801445552834, "grad_norm": 0.4838854040827197, "learning_rate": 9.770918146078283e-06, "loss": 0.3555, "step": 3341 }, { "epoch": 1.5968580132608565, "grad_norm": 0.49830687795336376, "learning_rate": 9.765652849987504e-06, "loss": 0.3151, "step": 3342 }, { "epoch": 1.5973358819664298, "grad_norm": 0.49241625719381726, "learning_rate": 9.760387618900488e-06, "loss": 0.3171, "step": 3343 }, { "epoch": 1.5978137506720027, "grad_norm": 0.5523584158121128, "learning_rate": 9.755122454277723e-06, "loss": 0.3274, "step": 3344 }, { "epoch": 1.598291619377576, "grad_norm": 0.4845078185424916, "learning_rate": 9.749857357579667e-06, "loss": 0.3149, "step": 3345 }, { "epoch": 1.5987694880831491, "grad_norm": 0.5024697204031072, "learning_rate": 9.744592330266769e-06, "loss": 0.314, "step": 3346 }, { "epoch": 1.5992473567887222, "grad_norm": 0.45934959722680274, "learning_rate": 9.739327373799454e-06, "loss": 0.3142, "step": 3347 }, { "epoch": 1.5997252254942955, "grad_norm": 0.464086055829073, "learning_rate": 9.734062489638127e-06, "loss": 0.323, "step": 3348 }, { "epoch": 1.6002030941998686, "grad_norm": 0.4831840810555116, "learning_rate": 9.728797679243172e-06, "loss": 0.3306, "step": 3349 }, { "epoch": 1.6006809629054417, "grad_norm": 0.4764342274703041, "learning_rate": 9.723532944074961e-06, "loss": 0.3201, "step": 3350 }, { "epoch": 1.601158831611015, "grad_norm": 0.48070401161567705, "learning_rate": 9.71826828559383e-06, "loss": 0.3281, "step": 3351 }, { "epoch": 1.601636700316588, "grad_norm": 0.4708155298765893, "learning_rate": 9.71300370526011e-06, "loss": 0.322, "step": 3352 }, { "epoch": 1.6021145690221612, "grad_norm": 0.4767194553320532, "learning_rate": 9.7077392045341e-06, "loss": 0.3258, "step": 3353 }, { "epoch": 1.6025924377277343, "grad_norm": 0.479523768892733, "learning_rate": 9.702474784876075e-06, "loss": 0.3199, "step": 3354 }, { "epoch": 1.6030703064333074, "grad_norm": 0.5236359911668674, "learning_rate": 9.6972104477463e-06, "loss": 0.3237, "step": 3355 }, { "epoch": 1.6035481751388807, "grad_norm": 0.46485496717447494, "learning_rate": 9.691946194605007e-06, "loss": 0.3084, "step": 3356 }, { "epoch": 1.6040260438444536, "grad_norm": 0.48134675159435425, "learning_rate": 9.686682026912402e-06, "loss": 0.3084, "step": 3357 }, { "epoch": 1.604503912550027, "grad_norm": 0.49397380566853843, "learning_rate": 9.681417946128677e-06, "loss": 0.2973, "step": 3358 }, { "epoch": 1.6049817812556, "grad_norm": 0.48607625348138517, "learning_rate": 9.676153953713996e-06, "loss": 0.3212, "step": 3359 }, { "epoch": 1.605459649961173, "grad_norm": 0.487640811201148, "learning_rate": 9.670890051128493e-06, "loss": 0.3181, "step": 3360 }, { "epoch": 1.6059375186667464, "grad_norm": 0.47749175931112464, "learning_rate": 9.665626239832286e-06, "loss": 0.3211, "step": 3361 }, { "epoch": 1.6064153873723195, "grad_norm": 0.7933182151405382, "learning_rate": 9.660362521285463e-06, "loss": 0.3277, "step": 3362 }, { "epoch": 1.6068932560778926, "grad_norm": 0.47250312135112554, "learning_rate": 9.655098896948083e-06, "loss": 0.3167, "step": 3363 }, { "epoch": 1.6073711247834659, "grad_norm": 0.4693883987261192, "learning_rate": 9.649835368280186e-06, "loss": 0.3241, "step": 3364 }, { "epoch": 1.6078489934890388, "grad_norm": 0.47939105139177257, "learning_rate": 9.644571936741778e-06, "loss": 0.3253, "step": 3365 }, { "epoch": 1.608326862194612, "grad_norm": 0.471799876196094, "learning_rate": 9.639308603792847e-06, "loss": 0.2978, "step": 3366 }, { "epoch": 1.6088047309001852, "grad_norm": 0.5037783606345849, "learning_rate": 9.634045370893348e-06, "loss": 0.3178, "step": 3367 }, { "epoch": 1.6092825996057583, "grad_norm": 0.5003960430326154, "learning_rate": 9.628782239503208e-06, "loss": 0.3416, "step": 3368 }, { "epoch": 1.6097604683113316, "grad_norm": 0.48869672104414125, "learning_rate": 9.623519211082325e-06, "loss": 0.3125, "step": 3369 }, { "epoch": 1.6102383370169044, "grad_norm": 0.49864918116539536, "learning_rate": 9.618256287090576e-06, "loss": 0.3244, "step": 3370 }, { "epoch": 1.6107162057224778, "grad_norm": 0.5019807545385679, "learning_rate": 9.612993468987796e-06, "loss": 0.3138, "step": 3371 }, { "epoch": 1.6111940744280508, "grad_norm": 0.4683993877429079, "learning_rate": 9.6077307582338e-06, "loss": 0.332, "step": 3372 }, { "epoch": 1.611671943133624, "grad_norm": 0.4605238593535419, "learning_rate": 9.602468156288374e-06, "loss": 0.3209, "step": 3373 }, { "epoch": 1.6121498118391973, "grad_norm": 0.47258089848935597, "learning_rate": 9.597205664611269e-06, "loss": 0.3218, "step": 3374 }, { "epoch": 1.6126276805447703, "grad_norm": 0.48987083989083485, "learning_rate": 9.591943284662206e-06, "loss": 0.3216, "step": 3375 }, { "epoch": 1.6131055492503434, "grad_norm": 0.45509362905511785, "learning_rate": 9.586681017900881e-06, "loss": 0.3151, "step": 3376 }, { "epoch": 1.6135834179559168, "grad_norm": 0.5226274148001115, "learning_rate": 9.581418865786948e-06, "loss": 0.296, "step": 3377 }, { "epoch": 1.6140612866614896, "grad_norm": 0.4972753917887101, "learning_rate": 9.576156829780038e-06, "loss": 0.3083, "step": 3378 }, { "epoch": 1.614539155367063, "grad_norm": 0.5066525482936579, "learning_rate": 9.570894911339748e-06, "loss": 0.3135, "step": 3379 }, { "epoch": 1.615017024072636, "grad_norm": 0.4594810626516822, "learning_rate": 9.56563311192564e-06, "loss": 0.3315, "step": 3380 }, { "epoch": 1.6154948927782091, "grad_norm": 0.5042615947206254, "learning_rate": 9.560371432997244e-06, "loss": 0.3119, "step": 3381 }, { "epoch": 1.6159727614837824, "grad_norm": 0.4914519937789319, "learning_rate": 9.55510987601406e-06, "loss": 0.3276, "step": 3382 }, { "epoch": 1.6164506301893553, "grad_norm": 0.45163236705607995, "learning_rate": 9.549848442435547e-06, "loss": 0.3236, "step": 3383 }, { "epoch": 1.6169284988949286, "grad_norm": 0.4853699956930337, "learning_rate": 9.544587133721133e-06, "loss": 0.3262, "step": 3384 }, { "epoch": 1.6174063676005017, "grad_norm": 0.5027837659953032, "learning_rate": 9.53932595133022e-06, "loss": 0.3159, "step": 3385 }, { "epoch": 1.6178842363060748, "grad_norm": 0.5154740574278834, "learning_rate": 9.534064896722157e-06, "loss": 0.3022, "step": 3386 }, { "epoch": 1.6183621050116481, "grad_norm": 0.4571584542414456, "learning_rate": 9.528803971356275e-06, "loss": 0.3106, "step": 3387 }, { "epoch": 1.6188399737172212, "grad_norm": 0.5686619283974957, "learning_rate": 9.523543176691861e-06, "loss": 0.3067, "step": 3388 }, { "epoch": 1.6193178424227943, "grad_norm": 0.5191198661164662, "learning_rate": 9.518282514188163e-06, "loss": 0.3071, "step": 3389 }, { "epoch": 1.6197957111283676, "grad_norm": 0.46818165462038835, "learning_rate": 9.513021985304399e-06, "loss": 0.3116, "step": 3390 }, { "epoch": 1.6202735798339405, "grad_norm": 0.4948050354348871, "learning_rate": 9.507761591499747e-06, "loss": 0.3252, "step": 3391 }, { "epoch": 1.6207514485395138, "grad_norm": 0.5123067692293698, "learning_rate": 9.502501334233343e-06, "loss": 0.312, "step": 3392 }, { "epoch": 1.621229317245087, "grad_norm": 0.4591469898065541, "learning_rate": 9.497241214964297e-06, "loss": 0.3208, "step": 3393 }, { "epoch": 1.62170718595066, "grad_norm": 0.4756275676531446, "learning_rate": 9.491981235151669e-06, "loss": 0.3121, "step": 3394 }, { "epoch": 1.6221850546562333, "grad_norm": 0.4827825826648765, "learning_rate": 9.486721396254484e-06, "loss": 0.3049, "step": 3395 }, { "epoch": 1.6226629233618064, "grad_norm": 0.4915126644495333, "learning_rate": 9.48146169973173e-06, "loss": 0.3192, "step": 3396 }, { "epoch": 1.6231407920673795, "grad_norm": 0.4610798399908291, "learning_rate": 9.476202147042354e-06, "loss": 0.3269, "step": 3397 }, { "epoch": 1.6236186607729526, "grad_norm": 0.5599323558551276, "learning_rate": 9.47094273964526e-06, "loss": 0.335, "step": 3398 }, { "epoch": 1.6240965294785257, "grad_norm": 0.5255208607648829, "learning_rate": 9.465683478999319e-06, "loss": 0.3291, "step": 3399 }, { "epoch": 1.624574398184099, "grad_norm": 0.4924211708443086, "learning_rate": 9.460424366563355e-06, "loss": 0.3217, "step": 3400 }, { "epoch": 1.625052266889672, "grad_norm": 0.476959287524869, "learning_rate": 9.455165403796157e-06, "loss": 0.3091, "step": 3401 }, { "epoch": 1.6255301355952452, "grad_norm": 0.4672843679938778, "learning_rate": 9.449906592156463e-06, "loss": 0.3176, "step": 3402 }, { "epoch": 1.6260080043008185, "grad_norm": 0.5009267665364574, "learning_rate": 9.444647933102977e-06, "loss": 0.3306, "step": 3403 }, { "epoch": 1.6264858730063914, "grad_norm": 0.49020359549356995, "learning_rate": 9.43938942809436e-06, "loss": 0.3016, "step": 3404 }, { "epoch": 1.6269637417119647, "grad_norm": 0.4871954472988403, "learning_rate": 9.434131078589224e-06, "loss": 0.3118, "step": 3405 }, { "epoch": 1.6274416104175378, "grad_norm": 0.47764178365700216, "learning_rate": 9.428872886046145e-06, "loss": 0.3277, "step": 3406 }, { "epoch": 1.6279194791231109, "grad_norm": 0.49682895676211, "learning_rate": 9.423614851923657e-06, "loss": 0.3052, "step": 3407 }, { "epoch": 1.6283973478286842, "grad_norm": 0.4664914179892642, "learning_rate": 9.418356977680238e-06, "loss": 0.3182, "step": 3408 }, { "epoch": 1.6288752165342573, "grad_norm": 0.46865718754503594, "learning_rate": 9.413099264774334e-06, "loss": 0.303, "step": 3409 }, { "epoch": 1.6293530852398304, "grad_norm": 0.487385856422879, "learning_rate": 9.407841714664343e-06, "loss": 0.3303, "step": 3410 }, { "epoch": 1.6298309539454034, "grad_norm": 0.5465692345398058, "learning_rate": 9.402584328808614e-06, "loss": 0.3158, "step": 3411 }, { "epoch": 1.6303088226509765, "grad_norm": 0.47198199422777404, "learning_rate": 9.39732710866545e-06, "loss": 0.3182, "step": 3412 }, { "epoch": 1.6307866913565499, "grad_norm": 0.46116344865689524, "learning_rate": 9.392070055693122e-06, "loss": 0.3393, "step": 3413 }, { "epoch": 1.631264560062123, "grad_norm": 0.4763679490362691, "learning_rate": 9.38681317134983e-06, "loss": 0.3161, "step": 3414 }, { "epoch": 1.631742428767696, "grad_norm": 0.46624869036989713, "learning_rate": 9.381556457093752e-06, "loss": 0.3285, "step": 3415 }, { "epoch": 1.6322202974732694, "grad_norm": 0.46060193625574064, "learning_rate": 9.376299914383004e-06, "loss": 0.3169, "step": 3416 }, { "epoch": 1.6326981661788422, "grad_norm": 0.48033327662418424, "learning_rate": 9.371043544675656e-06, "loss": 0.3169, "step": 3417 }, { "epoch": 1.6331760348844155, "grad_norm": 0.4802037904197471, "learning_rate": 9.365787349429734e-06, "loss": 0.325, "step": 3418 }, { "epoch": 1.6336539035899886, "grad_norm": 0.5496693935684847, "learning_rate": 9.360531330103218e-06, "loss": 0.3025, "step": 3419 }, { "epoch": 1.6341317722955617, "grad_norm": 0.4824651576505537, "learning_rate": 9.355275488154025e-06, "loss": 0.3251, "step": 3420 }, { "epoch": 1.634609641001135, "grad_norm": 0.49309460242445563, "learning_rate": 9.350019825040042e-06, "loss": 0.3023, "step": 3421 }, { "epoch": 1.6350875097067081, "grad_norm": 0.471529984373171, "learning_rate": 9.344764342219096e-06, "loss": 0.3212, "step": 3422 }, { "epoch": 1.6355653784122812, "grad_norm": 0.47065857771956116, "learning_rate": 9.33950904114896e-06, "loss": 0.3107, "step": 3423 }, { "epoch": 1.6360432471178543, "grad_norm": 0.47686302451072865, "learning_rate": 9.334253923287364e-06, "loss": 0.3156, "step": 3424 }, { "epoch": 1.6365211158234274, "grad_norm": 0.5127539654610185, "learning_rate": 9.32899899009199e-06, "loss": 0.3162, "step": 3425 }, { "epoch": 1.6369989845290007, "grad_norm": 0.4747555573267503, "learning_rate": 9.323744243020458e-06, "loss": 0.3094, "step": 3426 }, { "epoch": 1.6374768532345738, "grad_norm": 2.954360238976603, "learning_rate": 9.318489683530342e-06, "loss": 0.2972, "step": 3427 }, { "epoch": 1.637954721940147, "grad_norm": 0.5656096550756227, "learning_rate": 9.313235313079171e-06, "loss": 0.321, "step": 3428 }, { "epoch": 1.6384325906457202, "grad_norm": 0.5225575379322978, "learning_rate": 9.307981133124407e-06, "loss": 0.3252, "step": 3429 }, { "epoch": 1.638910459351293, "grad_norm": 0.5091222365768304, "learning_rate": 9.30272714512347e-06, "loss": 0.3249, "step": 3430 }, { "epoch": 1.6393883280568664, "grad_norm": 0.4992499483033707, "learning_rate": 9.297473350533723e-06, "loss": 0.3172, "step": 3431 }, { "epoch": 1.6398661967624395, "grad_norm": 0.4831370776686128, "learning_rate": 9.292219750812475e-06, "loss": 0.3244, "step": 3432 }, { "epoch": 1.6403440654680126, "grad_norm": 0.4476338694579795, "learning_rate": 9.286966347416982e-06, "loss": 0.3148, "step": 3433 }, { "epoch": 1.640821934173586, "grad_norm": 0.43830875865078245, "learning_rate": 9.281713141804449e-06, "loss": 0.3182, "step": 3434 }, { "epoch": 1.641299802879159, "grad_norm": 0.4809168285139566, "learning_rate": 9.276460135432019e-06, "loss": 0.3241, "step": 3435 }, { "epoch": 1.641777671584732, "grad_norm": 0.4754134644489622, "learning_rate": 9.271207329756787e-06, "loss": 0.2952, "step": 3436 }, { "epoch": 1.6422555402903054, "grad_norm": 0.48655404908499644, "learning_rate": 9.265954726235783e-06, "loss": 0.312, "step": 3437 }, { "epoch": 1.6427334089958783, "grad_norm": 0.4683670052071872, "learning_rate": 9.26070232632599e-06, "loss": 0.324, "step": 3438 }, { "epoch": 1.6432112777014516, "grad_norm": 0.4890631847154369, "learning_rate": 9.255450131484334e-06, "loss": 0.3092, "step": 3439 }, { "epoch": 1.6436891464070247, "grad_norm": 0.5288231867382709, "learning_rate": 9.250198143167675e-06, "loss": 0.3033, "step": 3440 }, { "epoch": 1.6441670151125978, "grad_norm": 0.48338257041337923, "learning_rate": 9.244946362832825e-06, "loss": 0.3129, "step": 3441 }, { "epoch": 1.644644883818171, "grad_norm": 0.4614188770486306, "learning_rate": 9.239694791936536e-06, "loss": 0.3121, "step": 3442 }, { "epoch": 1.645122752523744, "grad_norm": 0.48639433398453336, "learning_rate": 9.234443431935498e-06, "loss": 0.3359, "step": 3443 }, { "epoch": 1.6456006212293173, "grad_norm": 0.454810660207841, "learning_rate": 9.22919228428635e-06, "loss": 0.3307, "step": 3444 }, { "epoch": 1.6460784899348904, "grad_norm": 0.4813257235091246, "learning_rate": 9.223941350445666e-06, "loss": 0.3322, "step": 3445 }, { "epoch": 1.6465563586404635, "grad_norm": 0.466364711126299, "learning_rate": 9.218690631869961e-06, "loss": 0.3192, "step": 3446 }, { "epoch": 1.6470342273460368, "grad_norm": 0.5066591698413345, "learning_rate": 9.213440130015692e-06, "loss": 0.3273, "step": 3447 }, { "epoch": 1.6475120960516099, "grad_norm": 0.49776839266177914, "learning_rate": 9.208189846339259e-06, "loss": 0.3338, "step": 3448 }, { "epoch": 1.647989964757183, "grad_norm": 0.480237463551773, "learning_rate": 9.202939782296992e-06, "loss": 0.3249, "step": 3449 }, { "epoch": 1.6484678334627563, "grad_norm": 0.46449483591155677, "learning_rate": 9.19768993934517e-06, "loss": 0.2952, "step": 3450 }, { "epoch": 1.6489457021683291, "grad_norm": 0.46253453287395435, "learning_rate": 9.192440318940009e-06, "loss": 0.3152, "step": 3451 }, { "epoch": 1.6494235708739025, "grad_norm": 0.8458069073772643, "learning_rate": 9.187190922537654e-06, "loss": 0.3168, "step": 3452 }, { "epoch": 1.6499014395794755, "grad_norm": 0.6969051435404784, "learning_rate": 9.181941751594203e-06, "loss": 0.3276, "step": 3453 }, { "epoch": 1.6503793082850486, "grad_norm": 0.461496229769572, "learning_rate": 9.176692807565679e-06, "loss": 0.326, "step": 3454 }, { "epoch": 1.650857176990622, "grad_norm": 0.483293334225472, "learning_rate": 9.171444091908046e-06, "loss": 0.3164, "step": 3455 }, { "epoch": 1.6513350456961948, "grad_norm": 0.4776240975371244, "learning_rate": 9.166195606077205e-06, "loss": 0.3153, "step": 3456 }, { "epoch": 1.6518129144017681, "grad_norm": 0.47229526671977295, "learning_rate": 9.160947351529001e-06, "loss": 0.3199, "step": 3457 }, { "epoch": 1.6522907831073412, "grad_norm": 0.5146895918341302, "learning_rate": 9.155699329719196e-06, "loss": 0.3286, "step": 3458 }, { "epoch": 1.6527686518129143, "grad_norm": 0.4384463475806529, "learning_rate": 9.150451542103505e-06, "loss": 0.3301, "step": 3459 }, { "epoch": 1.6532465205184876, "grad_norm": 0.4703968423103728, "learning_rate": 9.145203990137571e-06, "loss": 0.3096, "step": 3460 }, { "epoch": 1.6537243892240607, "grad_norm": 0.4793369058069929, "learning_rate": 9.13995667527697e-06, "loss": 0.3284, "step": 3461 }, { "epoch": 1.6542022579296338, "grad_norm": 0.45236867666956726, "learning_rate": 9.134709598977218e-06, "loss": 0.3234, "step": 3462 }, { "epoch": 1.6546801266352071, "grad_norm": 0.47207424023467076, "learning_rate": 9.129462762693759e-06, "loss": 0.3096, "step": 3463 }, { "epoch": 1.65515799534078, "grad_norm": 0.4750207016386637, "learning_rate": 9.124216167881974e-06, "loss": 0.3025, "step": 3464 }, { "epoch": 1.6556358640463533, "grad_norm": 0.45011316729860085, "learning_rate": 9.118969815997174e-06, "loss": 0.3117, "step": 3465 }, { "epoch": 1.6561137327519264, "grad_norm": 0.45982543099231327, "learning_rate": 9.11372370849461e-06, "loss": 0.338, "step": 3466 }, { "epoch": 1.6565916014574995, "grad_norm": 0.47099246855087673, "learning_rate": 9.108477846829447e-06, "loss": 0.313, "step": 3467 }, { "epoch": 1.6570694701630728, "grad_norm": 0.4872573696860828, "learning_rate": 9.103232232456812e-06, "loss": 0.3257, "step": 3468 }, { "epoch": 1.6575473388686457, "grad_norm": 0.44856712782419816, "learning_rate": 9.097986866831733e-06, "loss": 0.321, "step": 3469 }, { "epoch": 1.658025207574219, "grad_norm": 0.4420962792302268, "learning_rate": 9.092741751409186e-06, "loss": 0.3273, "step": 3470 }, { "epoch": 1.658503076279792, "grad_norm": 0.4687035556845388, "learning_rate": 9.087496887644075e-06, "loss": 0.3249, "step": 3471 }, { "epoch": 1.6589809449853652, "grad_norm": 0.49495908912270503, "learning_rate": 9.08225227699123e-06, "loss": 0.3054, "step": 3472 }, { "epoch": 1.6594588136909385, "grad_norm": 0.48042565301421647, "learning_rate": 9.077007920905413e-06, "loss": 0.3207, "step": 3473 }, { "epoch": 1.6599366823965116, "grad_norm": 0.46644981695972604, "learning_rate": 9.071763820841322e-06, "loss": 0.3226, "step": 3474 }, { "epoch": 1.6604145511020847, "grad_norm": 0.4810549358732364, "learning_rate": 9.06651997825357e-06, "loss": 0.3275, "step": 3475 }, { "epoch": 1.660892419807658, "grad_norm": 0.4889810661635855, "learning_rate": 9.061276394596712e-06, "loss": 0.3238, "step": 3476 }, { "epoch": 1.6613702885132309, "grad_norm": 0.4429594397309242, "learning_rate": 9.05603307132522e-06, "loss": 0.3114, "step": 3477 }, { "epoch": 1.6618481572188042, "grad_norm": 0.46379425469030755, "learning_rate": 9.050790009893505e-06, "loss": 0.313, "step": 3478 }, { "epoch": 1.6623260259243773, "grad_norm": 0.5433136463867893, "learning_rate": 9.0455472117559e-06, "loss": 0.3061, "step": 3479 }, { "epoch": 1.6628038946299504, "grad_norm": 0.4514939829237272, "learning_rate": 9.040304678366658e-06, "loss": 0.3147, "step": 3480 }, { "epoch": 1.6632817633355237, "grad_norm": 0.471795348976342, "learning_rate": 9.03506241117997e-06, "loss": 0.3277, "step": 3481 }, { "epoch": 1.6637596320410966, "grad_norm": 0.480378485496424, "learning_rate": 9.02982041164995e-06, "loss": 0.3062, "step": 3482 }, { "epoch": 1.6642375007466699, "grad_norm": 0.44413418724919074, "learning_rate": 9.024578681230632e-06, "loss": 0.3231, "step": 3483 }, { "epoch": 1.664715369452243, "grad_norm": 0.495040097592052, "learning_rate": 9.01933722137598e-06, "loss": 0.3113, "step": 3484 }, { "epoch": 1.665193238157816, "grad_norm": 0.5093870178221196, "learning_rate": 9.014096033539889e-06, "loss": 0.3206, "step": 3485 }, { "epoch": 1.6656711068633894, "grad_norm": 0.5133513477654141, "learning_rate": 9.008855119176165e-06, "loss": 0.3057, "step": 3486 }, { "epoch": 1.6661489755689625, "grad_norm": 0.4699567002694254, "learning_rate": 9.003614479738544e-06, "loss": 0.3287, "step": 3487 }, { "epoch": 1.6666268442745356, "grad_norm": 0.4892249229390645, "learning_rate": 8.998374116680697e-06, "loss": 0.3274, "step": 3488 }, { "epoch": 1.6671047129801089, "grad_norm": 0.4946263872577261, "learning_rate": 8.993134031456198e-06, "loss": 0.3179, "step": 3489 }, { "epoch": 1.6675825816856817, "grad_norm": 0.48264814521862553, "learning_rate": 8.987894225518556e-06, "loss": 0.3097, "step": 3490 }, { "epoch": 1.668060450391255, "grad_norm": 0.4605926310411199, "learning_rate": 8.982654700321207e-06, "loss": 0.3218, "step": 3491 }, { "epoch": 1.6685383190968281, "grad_norm": 0.4807584069153299, "learning_rate": 8.977415457317495e-06, "loss": 0.3295, "step": 3492 }, { "epoch": 1.6690161878024012, "grad_norm": 0.5126134343679848, "learning_rate": 8.972176497960698e-06, "loss": 0.3023, "step": 3493 }, { "epoch": 1.6694940565079746, "grad_norm": 0.4866870303998035, "learning_rate": 8.966937823704013e-06, "loss": 0.3246, "step": 3494 }, { "epoch": 1.6699719252135474, "grad_norm": 0.4701808785486881, "learning_rate": 8.961699436000548e-06, "loss": 0.3138, "step": 3495 }, { "epoch": 1.6704497939191207, "grad_norm": 0.48437050470284443, "learning_rate": 8.956461336303345e-06, "loss": 0.3309, "step": 3496 }, { "epoch": 1.6709276626246938, "grad_norm": 0.4915042033611595, "learning_rate": 8.951223526065363e-06, "loss": 0.3056, "step": 3497 }, { "epoch": 1.671405531330267, "grad_norm": 0.48853602825271614, "learning_rate": 8.945986006739472e-06, "loss": 0.3398, "step": 3498 }, { "epoch": 1.6718834000358402, "grad_norm": 0.47166819435850216, "learning_rate": 8.94074877977847e-06, "loss": 0.32, "step": 3499 }, { "epoch": 1.6723612687414133, "grad_norm": 0.4848661547844431, "learning_rate": 8.93551184663507e-06, "loss": 0.3163, "step": 3500 }, { "epoch": 1.6728391374469864, "grad_norm": 0.5013003475484796, "learning_rate": 8.930275208761908e-06, "loss": 0.3187, "step": 3501 }, { "epoch": 1.6733170061525597, "grad_norm": 0.5469587538744264, "learning_rate": 8.925038867611536e-06, "loss": 0.3181, "step": 3502 }, { "epoch": 1.6737948748581326, "grad_norm": 0.4669973541608491, "learning_rate": 8.919802824636418e-06, "loss": 0.3089, "step": 3503 }, { "epoch": 1.674272743563706, "grad_norm": 0.6631093786446882, "learning_rate": 8.914567081288943e-06, "loss": 0.3196, "step": 3504 }, { "epoch": 1.674750612269279, "grad_norm": 0.4870905723140869, "learning_rate": 8.909331639021414e-06, "loss": 0.3002, "step": 3505 }, { "epoch": 1.675228480974852, "grad_norm": 0.5011009424411035, "learning_rate": 8.904096499286047e-06, "loss": 0.3197, "step": 3506 }, { "epoch": 1.6757063496804254, "grad_norm": 0.49155633759059203, "learning_rate": 8.89886166353498e-06, "loss": 0.3081, "step": 3507 }, { "epoch": 1.6761842183859983, "grad_norm": 0.49765551877135294, "learning_rate": 8.893627133220266e-06, "loss": 0.3117, "step": 3508 }, { "epoch": 1.6766620870915716, "grad_norm": 0.4911618547326789, "learning_rate": 8.888392909793866e-06, "loss": 0.3098, "step": 3509 }, { "epoch": 1.6771399557971447, "grad_norm": 0.5063806765110067, "learning_rate": 8.883158994707666e-06, "loss": 0.3335, "step": 3510 }, { "epoch": 1.6776178245027178, "grad_norm": 0.5734707096493096, "learning_rate": 8.87792538941346e-06, "loss": 0.3082, "step": 3511 }, { "epoch": 1.678095693208291, "grad_norm": 0.5067693296298389, "learning_rate": 8.872692095362957e-06, "loss": 0.3061, "step": 3512 }, { "epoch": 1.6785735619138642, "grad_norm": 0.5479709311181276, "learning_rate": 8.867459114007784e-06, "loss": 0.3164, "step": 3513 }, { "epoch": 1.6790514306194373, "grad_norm": 0.5287134929809657, "learning_rate": 8.862226446799474e-06, "loss": 0.3307, "step": 3514 }, { "epoch": 1.6795292993250106, "grad_norm": 0.4649713213931607, "learning_rate": 8.856994095189477e-06, "loss": 0.3155, "step": 3515 }, { "epoch": 1.6800071680305835, "grad_norm": 0.49222649083051145, "learning_rate": 8.851762060629155e-06, "loss": 0.3309, "step": 3516 }, { "epoch": 1.6804850367361568, "grad_norm": 0.5143659759951493, "learning_rate": 8.846530344569785e-06, "loss": 0.3245, "step": 3517 }, { "epoch": 1.6809629054417299, "grad_norm": 0.49180603565206343, "learning_rate": 8.841298948462548e-06, "loss": 0.319, "step": 3518 }, { "epoch": 1.681440774147303, "grad_norm": 0.4904305397252043, "learning_rate": 8.836067873758544e-06, "loss": 0.3076, "step": 3519 }, { "epoch": 1.6819186428528763, "grad_norm": 0.5069444940408103, "learning_rate": 8.830837121908783e-06, "loss": 0.3142, "step": 3520 }, { "epoch": 1.6823965115584492, "grad_norm": 0.47329267540124315, "learning_rate": 8.825606694364178e-06, "loss": 0.3116, "step": 3521 }, { "epoch": 1.6828743802640225, "grad_norm": 0.5502456613469378, "learning_rate": 8.820376592575562e-06, "loss": 0.3266, "step": 3522 }, { "epoch": 1.6833522489695956, "grad_norm": 0.5242294700558009, "learning_rate": 8.81514681799367e-06, "loss": 0.3171, "step": 3523 }, { "epoch": 1.6838301176751687, "grad_norm": 0.4587826429514902, "learning_rate": 8.80991737206915e-06, "loss": 0.3128, "step": 3524 }, { "epoch": 1.684307986380742, "grad_norm": 0.5210196583488915, "learning_rate": 8.804688256252557e-06, "loss": 0.3201, "step": 3525 }, { "epoch": 1.684785855086315, "grad_norm": 0.511732360451873, "learning_rate": 8.79945947199436e-06, "loss": 0.3213, "step": 3526 }, { "epoch": 1.6852637237918882, "grad_norm": 0.6266517581717058, "learning_rate": 8.794231020744926e-06, "loss": 0.308, "step": 3527 }, { "epoch": 1.6857415924974615, "grad_norm": 0.4306673990518024, "learning_rate": 8.789002903954538e-06, "loss": 0.3062, "step": 3528 }, { "epoch": 1.6862194612030343, "grad_norm": 0.5201991430967379, "learning_rate": 8.783775123073383e-06, "loss": 0.2972, "step": 3529 }, { "epoch": 1.6866973299086077, "grad_norm": 0.4789566466601809, "learning_rate": 8.778547679551555e-06, "loss": 0.3287, "step": 3530 }, { "epoch": 1.6871751986141807, "grad_norm": 0.4503867654809924, "learning_rate": 8.773320574839055e-06, "loss": 0.3158, "step": 3531 }, { "epoch": 1.6876530673197538, "grad_norm": 0.5592132809330164, "learning_rate": 8.76809381038579e-06, "loss": 0.2928, "step": 3532 }, { "epoch": 1.6881309360253272, "grad_norm": 0.5248394969953822, "learning_rate": 8.762867387641569e-06, "loss": 0.3192, "step": 3533 }, { "epoch": 1.6886088047309, "grad_norm": 0.473584910793579, "learning_rate": 8.757641308056111e-06, "loss": 0.315, "step": 3534 }, { "epoch": 1.6890866734364733, "grad_norm": 0.5181876158952626, "learning_rate": 8.75241557307904e-06, "loss": 0.3034, "step": 3535 }, { "epoch": 1.6895645421420464, "grad_norm": 0.4871622666683337, "learning_rate": 8.747190184159883e-06, "loss": 0.3042, "step": 3536 }, { "epoch": 1.6900424108476195, "grad_norm": 0.4857165458002748, "learning_rate": 8.741965142748072e-06, "loss": 0.3125, "step": 3537 }, { "epoch": 1.6905202795531928, "grad_norm": 0.47357991306741637, "learning_rate": 8.736740450292937e-06, "loss": 0.3216, "step": 3538 }, { "epoch": 1.690998148258766, "grad_norm": 0.45787328331002924, "learning_rate": 8.731516108243717e-06, "loss": 0.3252, "step": 3539 }, { "epoch": 1.691476016964339, "grad_norm": 0.479337722950027, "learning_rate": 8.726292118049555e-06, "loss": 0.3217, "step": 3540 }, { "epoch": 1.6919538856699123, "grad_norm": 0.4770400698452951, "learning_rate": 8.721068481159491e-06, "loss": 0.3332, "step": 3541 }, { "epoch": 1.6924317543754852, "grad_norm": 0.5242710945344221, "learning_rate": 8.715845199022468e-06, "loss": 0.3065, "step": 3542 }, { "epoch": 1.6929096230810585, "grad_norm": 0.47188838291316537, "learning_rate": 8.710622273087338e-06, "loss": 0.3181, "step": 3543 }, { "epoch": 1.6933874917866316, "grad_norm": 0.4916479471672269, "learning_rate": 8.705399704802844e-06, "loss": 0.3124, "step": 3544 }, { "epoch": 1.6938653604922047, "grad_norm": 0.4864876674585418, "learning_rate": 8.700177495617635e-06, "loss": 0.303, "step": 3545 }, { "epoch": 1.694343229197778, "grad_norm": 0.48143706434962735, "learning_rate": 8.694955646980261e-06, "loss": 0.3244, "step": 3546 }, { "epoch": 1.694821097903351, "grad_norm": 0.5024128714258727, "learning_rate": 8.68973416033917e-06, "loss": 0.3214, "step": 3547 }, { "epoch": 1.6952989666089242, "grad_norm": 0.4701290161733491, "learning_rate": 8.684513037142708e-06, "loss": 0.319, "step": 3548 }, { "epoch": 1.6957768353144973, "grad_norm": 0.5196310295903178, "learning_rate": 8.67929227883913e-06, "loss": 0.3304, "step": 3549 }, { "epoch": 1.6962547040200704, "grad_norm": 0.4470857047416331, "learning_rate": 8.674071886876572e-06, "loss": 0.3116, "step": 3550 }, { "epoch": 1.6967325727256437, "grad_norm": 0.49139633093819357, "learning_rate": 8.668851862703084e-06, "loss": 0.3147, "step": 3551 }, { "epoch": 1.6972104414312168, "grad_norm": 0.4533593202968756, "learning_rate": 8.66363220776661e-06, "loss": 0.3117, "step": 3552 }, { "epoch": 1.69768831013679, "grad_norm": 0.5016957872077247, "learning_rate": 8.658412923514987e-06, "loss": 0.3058, "step": 3553 }, { "epoch": 1.6981661788423632, "grad_norm": 0.49422695943685363, "learning_rate": 8.653194011395955e-06, "loss": 0.3166, "step": 3554 }, { "epoch": 1.698644047547936, "grad_norm": 0.5041371009037742, "learning_rate": 8.647975472857148e-06, "loss": 0.3058, "step": 3555 }, { "epoch": 1.6991219162535094, "grad_norm": 0.46401191058629204, "learning_rate": 8.642757309346092e-06, "loss": 0.2981, "step": 3556 }, { "epoch": 1.6995997849590825, "grad_norm": 0.47462647906742783, "learning_rate": 8.637539522310219e-06, "loss": 0.3158, "step": 3557 }, { "epoch": 1.7000776536646556, "grad_norm": 0.48560418398414035, "learning_rate": 8.63232211319685e-06, "loss": 0.3101, "step": 3558 }, { "epoch": 1.700555522370229, "grad_norm": 0.4924892083480203, "learning_rate": 8.6271050834532e-06, "loss": 0.3175, "step": 3559 }, { "epoch": 1.7010333910758018, "grad_norm": 0.4739306925770209, "learning_rate": 8.621888434526382e-06, "loss": 0.3055, "step": 3560 }, { "epoch": 1.701511259781375, "grad_norm": 0.5668938530252869, "learning_rate": 8.616672167863406e-06, "loss": 0.3001, "step": 3561 }, { "epoch": 1.7019891284869482, "grad_norm": 0.5258055525735766, "learning_rate": 8.611456284911167e-06, "loss": 0.3317, "step": 3562 }, { "epoch": 1.7024669971925213, "grad_norm": 0.4731593091810082, "learning_rate": 8.606240787116459e-06, "loss": 0.305, "step": 3563 }, { "epoch": 1.7029448658980946, "grad_norm": 0.47298302795830727, "learning_rate": 8.601025675925976e-06, "loss": 0.3518, "step": 3564 }, { "epoch": 1.7034227346036677, "grad_norm": 0.5044823125447081, "learning_rate": 8.595810952786289e-06, "loss": 0.316, "step": 3565 }, { "epoch": 1.7039006033092408, "grad_norm": 0.48834528388610515, "learning_rate": 8.590596619143874e-06, "loss": 0.3185, "step": 3566 }, { "epoch": 1.704378472014814, "grad_norm": 0.47503311635279094, "learning_rate": 8.585382676445099e-06, "loss": 0.3122, "step": 3567 }, { "epoch": 1.704856340720387, "grad_norm": 0.4779679121742256, "learning_rate": 8.580169126136211e-06, "loss": 0.3024, "step": 3568 }, { "epoch": 1.7053342094259603, "grad_norm": 0.4881996436437521, "learning_rate": 8.574955969663364e-06, "loss": 0.308, "step": 3569 }, { "epoch": 1.7058120781315333, "grad_norm": 0.49243529307480594, "learning_rate": 8.569743208472594e-06, "loss": 0.3211, "step": 3570 }, { "epoch": 1.7062899468371064, "grad_norm": 0.47769625174462077, "learning_rate": 8.56453084400983e-06, "loss": 0.3022, "step": 3571 }, { "epoch": 1.7067678155426798, "grad_norm": 0.4687274182843692, "learning_rate": 8.559318877720889e-06, "loss": 0.3026, "step": 3572 }, { "epoch": 1.7072456842482526, "grad_norm": 0.48608349498390174, "learning_rate": 8.554107311051477e-06, "loss": 0.3178, "step": 3573 }, { "epoch": 1.707723552953826, "grad_norm": 0.48723365794182133, "learning_rate": 8.548896145447191e-06, "loss": 0.3041, "step": 3574 }, { "epoch": 1.708201421659399, "grad_norm": 0.5215881012424272, "learning_rate": 8.543685382353518e-06, "loss": 0.3147, "step": 3575 }, { "epoch": 1.7086792903649721, "grad_norm": 0.46461361060388295, "learning_rate": 8.538475023215831e-06, "loss": 0.3296, "step": 3576 }, { "epoch": 1.7091571590705454, "grad_norm": 0.48227283989606085, "learning_rate": 8.533265069479393e-06, "loss": 0.3143, "step": 3577 }, { "epoch": 1.7096350277761185, "grad_norm": 0.4427755116826597, "learning_rate": 8.52805552258935e-06, "loss": 0.2946, "step": 3578 }, { "epoch": 1.7101128964816916, "grad_norm": 0.4557912224968937, "learning_rate": 8.52284638399074e-06, "loss": 0.3117, "step": 3579 }, { "epoch": 1.710590765187265, "grad_norm": 0.48947216925097453, "learning_rate": 8.51763765512849e-06, "loss": 0.3135, "step": 3580 }, { "epoch": 1.7110686338928378, "grad_norm": 0.48750224853053237, "learning_rate": 8.512429337447404e-06, "loss": 0.3152, "step": 3581 }, { "epoch": 1.7115465025984111, "grad_norm": 0.5424080934153999, "learning_rate": 8.507221432392177e-06, "loss": 0.3185, "step": 3582 }, { "epoch": 1.7120243713039842, "grad_norm": 0.5243992703363829, "learning_rate": 8.502013941407396e-06, "loss": 0.2851, "step": 3583 }, { "epoch": 1.7125022400095573, "grad_norm": 0.47093855520770667, "learning_rate": 8.496806865937523e-06, "loss": 0.3141, "step": 3584 }, { "epoch": 1.7129801087151306, "grad_norm": 0.5163527516479693, "learning_rate": 8.491600207426907e-06, "loss": 0.3048, "step": 3585 }, { "epoch": 1.7134579774207037, "grad_norm": 0.46552143826305653, "learning_rate": 8.48639396731979e-06, "loss": 0.3002, "step": 3586 }, { "epoch": 1.7139358461262768, "grad_norm": 0.47841277412531025, "learning_rate": 8.481188147060283e-06, "loss": 0.3095, "step": 3587 }, { "epoch": 1.71441371483185, "grad_norm": 0.5911896029327872, "learning_rate": 8.475982748092395e-06, "loss": 0.3257, "step": 3588 }, { "epoch": 1.714891583537423, "grad_norm": 0.4716246559506084, "learning_rate": 8.470777771860009e-06, "loss": 0.3108, "step": 3589 }, { "epoch": 1.7153694522429963, "grad_norm": 0.4544478697048045, "learning_rate": 8.465573219806893e-06, "loss": 0.3144, "step": 3590 }, { "epoch": 1.7158473209485694, "grad_norm": 0.48108714707741906, "learning_rate": 8.460369093376698e-06, "loss": 0.2957, "step": 3591 }, { "epoch": 1.7163251896541425, "grad_norm": 0.8635235335475422, "learning_rate": 8.455165394012962e-06, "loss": 0.3218, "step": 3592 }, { "epoch": 1.7168030583597158, "grad_norm": 0.5054277355892234, "learning_rate": 8.449962123159092e-06, "loss": 0.3072, "step": 3593 }, { "epoch": 1.7172809270652887, "grad_norm": 0.4829413421685776, "learning_rate": 8.444759282258387e-06, "loss": 0.3052, "step": 3594 }, { "epoch": 1.717758795770862, "grad_norm": 0.4494055048503989, "learning_rate": 8.439556872754025e-06, "loss": 0.3111, "step": 3595 }, { "epoch": 1.718236664476435, "grad_norm": 0.4736205047237309, "learning_rate": 8.434354896089058e-06, "loss": 0.3211, "step": 3596 }, { "epoch": 1.7187145331820082, "grad_norm": 0.47081905982842837, "learning_rate": 8.429153353706427e-06, "loss": 0.3341, "step": 3597 }, { "epoch": 1.7191924018875815, "grad_norm": 0.4748503377753521, "learning_rate": 8.423952247048948e-06, "loss": 0.3215, "step": 3598 }, { "epoch": 1.7196702705931546, "grad_norm": 0.4944887369523441, "learning_rate": 8.418751577559313e-06, "loss": 0.3099, "step": 3599 }, { "epoch": 1.7201481392987277, "grad_norm": 0.47254363092097545, "learning_rate": 8.413551346680095e-06, "loss": 0.3067, "step": 3600 }, { "epoch": 1.7206260080043008, "grad_norm": 0.47647885914472093, "learning_rate": 8.40835155585375e-06, "loss": 0.3129, "step": 3601 }, { "epoch": 1.7211038767098739, "grad_norm": 0.48835317263293404, "learning_rate": 8.403152206522607e-06, "loss": 0.3107, "step": 3602 }, { "epoch": 1.7215817454154472, "grad_norm": 0.4596088980636602, "learning_rate": 8.397953300128879e-06, "loss": 0.3157, "step": 3603 }, { "epoch": 1.7220596141210203, "grad_norm": 0.49182267500267884, "learning_rate": 8.39275483811464e-06, "loss": 0.3254, "step": 3604 }, { "epoch": 1.7225374828265934, "grad_norm": 0.48328627094269644, "learning_rate": 8.38755682192186e-06, "loss": 0.3112, "step": 3605 }, { "epoch": 1.7230153515321667, "grad_norm": 0.47304365223535483, "learning_rate": 8.382359252992377e-06, "loss": 0.3069, "step": 3606 }, { "epoch": 1.7234932202377395, "grad_norm": 0.4813058108065857, "learning_rate": 8.3771621327679e-06, "loss": 0.3198, "step": 3607 }, { "epoch": 1.7239710889433129, "grad_norm": 0.47699476225663495, "learning_rate": 8.371965462690021e-06, "loss": 0.32, "step": 3608 }, { "epoch": 1.724448957648886, "grad_norm": 0.48478741000916475, "learning_rate": 8.366769244200206e-06, "loss": 0.3123, "step": 3609 }, { "epoch": 1.724926826354459, "grad_norm": 0.4912926079455751, "learning_rate": 8.36157347873979e-06, "loss": 0.3116, "step": 3610 }, { "epoch": 1.7254046950600324, "grad_norm": 0.5174639765845415, "learning_rate": 8.356378167749993e-06, "loss": 0.298, "step": 3611 }, { "epoch": 1.7258825637656054, "grad_norm": 0.4793465538100503, "learning_rate": 8.351183312671898e-06, "loss": 0.31, "step": 3612 }, { "epoch": 1.7263604324711785, "grad_norm": 0.4814025970072797, "learning_rate": 8.345988914946467e-06, "loss": 0.3103, "step": 3613 }, { "epoch": 1.7268383011767516, "grad_norm": 0.46966946318183, "learning_rate": 8.340794976014535e-06, "loss": 0.3055, "step": 3614 }, { "epoch": 1.7273161698823247, "grad_norm": 0.48421924623191237, "learning_rate": 8.335601497316809e-06, "loss": 0.3147, "step": 3615 }, { "epoch": 1.727794038587898, "grad_norm": 0.47415453106644484, "learning_rate": 8.330408480293867e-06, "loss": 0.3189, "step": 3616 }, { "epoch": 1.7282719072934711, "grad_norm": 0.5311247241943203, "learning_rate": 8.325215926386164e-06, "loss": 0.3081, "step": 3617 }, { "epoch": 1.7287497759990442, "grad_norm": 0.45880552685400533, "learning_rate": 8.32002383703402e-06, "loss": 0.3079, "step": 3618 }, { "epoch": 1.7292276447046175, "grad_norm": 0.7527896953526857, "learning_rate": 8.314832213677627e-06, "loss": 0.3276, "step": 3619 }, { "epoch": 1.7297055134101904, "grad_norm": 0.47577023953198666, "learning_rate": 8.309641057757052e-06, "loss": 0.3193, "step": 3620 }, { "epoch": 1.7301833821157637, "grad_norm": 0.4600219319465241, "learning_rate": 8.304450370712234e-06, "loss": 0.3247, "step": 3621 }, { "epoch": 1.7306612508213368, "grad_norm": 0.49059880424437236, "learning_rate": 8.299260153982969e-06, "loss": 0.3199, "step": 3622 }, { "epoch": 1.73113911952691, "grad_norm": 0.4882276209446351, "learning_rate": 8.29407040900894e-06, "loss": 0.3041, "step": 3623 }, { "epoch": 1.7316169882324832, "grad_norm": 0.44643071797150663, "learning_rate": 8.288881137229687e-06, "loss": 0.3095, "step": 3624 }, { "epoch": 1.7320948569380563, "grad_norm": 0.46790133259886285, "learning_rate": 8.283692340084623e-06, "loss": 0.3146, "step": 3625 }, { "epoch": 1.7325727256436294, "grad_norm": 0.48231263905937954, "learning_rate": 8.278504019013026e-06, "loss": 0.3126, "step": 3626 }, { "epoch": 1.7330505943492027, "grad_norm": 0.4774119989895905, "learning_rate": 8.273316175454052e-06, "loss": 0.3261, "step": 3627 }, { "epoch": 1.7335284630547756, "grad_norm": 0.45700207528874304, "learning_rate": 8.26812881084671e-06, "loss": 0.3016, "step": 3628 }, { "epoch": 1.734006331760349, "grad_norm": 0.8002514747178684, "learning_rate": 8.262941926629888e-06, "loss": 0.311, "step": 3629 }, { "epoch": 1.734484200465922, "grad_norm": 0.4762774044807064, "learning_rate": 8.257755524242333e-06, "loss": 0.3037, "step": 3630 }, { "epoch": 1.734962069171495, "grad_norm": 0.4694744914517388, "learning_rate": 8.252569605122662e-06, "loss": 0.3129, "step": 3631 }, { "epoch": 1.7354399378770684, "grad_norm": 0.4991347812581403, "learning_rate": 8.24738417070936e-06, "loss": 0.3395, "step": 3632 }, { "epoch": 1.7359178065826413, "grad_norm": 0.504295162908251, "learning_rate": 8.242199222440773e-06, "loss": 0.3125, "step": 3633 }, { "epoch": 1.7363956752882146, "grad_norm": 0.6877943879571605, "learning_rate": 8.237014761755116e-06, "loss": 0.3096, "step": 3634 }, { "epoch": 1.7368735439937877, "grad_norm": 0.4672144021586577, "learning_rate": 8.231830790090461e-06, "loss": 0.3082, "step": 3635 }, { "epoch": 1.7373514126993608, "grad_norm": 0.44504887155749057, "learning_rate": 8.226647308884756e-06, "loss": 0.3228, "step": 3636 }, { "epoch": 1.737829281404934, "grad_norm": 0.4640809118349916, "learning_rate": 8.221464319575808e-06, "loss": 0.3251, "step": 3637 }, { "epoch": 1.7383071501105072, "grad_norm": 0.46673343701161357, "learning_rate": 8.216281823601286e-06, "loss": 0.3146, "step": 3638 }, { "epoch": 1.7387850188160803, "grad_norm": 0.4820552462095371, "learning_rate": 8.211099822398721e-06, "loss": 0.3118, "step": 3639 }, { "epoch": 1.7392628875216536, "grad_norm": 0.49498233493294336, "learning_rate": 8.205918317405508e-06, "loss": 0.3249, "step": 3640 }, { "epoch": 1.7397407562272265, "grad_norm": 0.48843945985854864, "learning_rate": 8.20073731005891e-06, "loss": 0.3164, "step": 3641 }, { "epoch": 1.7402186249327998, "grad_norm": 0.4814877105665985, "learning_rate": 8.195556801796041e-06, "loss": 0.3267, "step": 3642 }, { "epoch": 1.7406964936383729, "grad_norm": 0.4683984421194578, "learning_rate": 8.190376794053886e-06, "loss": 0.3048, "step": 3643 }, { "epoch": 1.741174362343946, "grad_norm": 0.4713542128746563, "learning_rate": 8.185197288269289e-06, "loss": 0.3114, "step": 3644 }, { "epoch": 1.7416522310495193, "grad_norm": 0.5033578491109322, "learning_rate": 8.18001828587895e-06, "loss": 0.3356, "step": 3645 }, { "epoch": 1.7421300997550921, "grad_norm": 0.48757140141481903, "learning_rate": 8.174839788319432e-06, "loss": 0.2978, "step": 3646 }, { "epoch": 1.7426079684606655, "grad_norm": 0.5198704975496817, "learning_rate": 8.169661797027167e-06, "loss": 0.3215, "step": 3647 }, { "epoch": 1.7430858371662385, "grad_norm": 0.5077562055422693, "learning_rate": 8.164484313438427e-06, "loss": 0.3096, "step": 3648 }, { "epoch": 1.7435637058718116, "grad_norm": 0.476581737348631, "learning_rate": 8.159307338989364e-06, "loss": 0.3266, "step": 3649 }, { "epoch": 1.744041574577385, "grad_norm": 0.474540506795891, "learning_rate": 8.154130875115978e-06, "loss": 0.3041, "step": 3650 }, { "epoch": 1.744519443282958, "grad_norm": 0.4801751996496636, "learning_rate": 8.148954923254122e-06, "loss": 0.3127, "step": 3651 }, { "epoch": 1.7449973119885311, "grad_norm": 0.4852373646526146, "learning_rate": 8.14377948483952e-06, "loss": 0.3295, "step": 3652 }, { "epoch": 1.7454751806941045, "grad_norm": 0.47761777592380195, "learning_rate": 8.138604561307748e-06, "loss": 0.3239, "step": 3653 }, { "epoch": 1.7459530493996773, "grad_norm": 0.5234441445204633, "learning_rate": 8.133430154094232e-06, "loss": 0.3189, "step": 3654 }, { "epoch": 1.7464309181052506, "grad_norm": 0.49427462068157, "learning_rate": 8.12825626463427e-06, "loss": 0.3126, "step": 3655 }, { "epoch": 1.7469087868108237, "grad_norm": 0.4470937189715777, "learning_rate": 8.123082894363001e-06, "loss": 0.3129, "step": 3656 }, { "epoch": 1.7473866555163968, "grad_norm": 0.5522576075572359, "learning_rate": 8.11791004471543e-06, "loss": 0.3012, "step": 3657 }, { "epoch": 1.7478645242219701, "grad_norm": 0.49371091975472053, "learning_rate": 8.11273771712641e-06, "loss": 0.3192, "step": 3658 }, { "epoch": 1.748342392927543, "grad_norm": 0.45307522023914715, "learning_rate": 8.10756591303066e-06, "loss": 0.3204, "step": 3659 }, { "epoch": 1.7488202616331163, "grad_norm": 0.46977495936479996, "learning_rate": 8.102394633862743e-06, "loss": 0.3327, "step": 3660 }, { "epoch": 1.7492981303386894, "grad_norm": 0.49827574100114275, "learning_rate": 8.097223881057079e-06, "loss": 0.3308, "step": 3661 }, { "epoch": 1.7497759990442625, "grad_norm": 0.46928949793764024, "learning_rate": 8.09205365604795e-06, "loss": 0.3085, "step": 3662 }, { "epoch": 1.7502538677498358, "grad_norm": 0.46261108160255604, "learning_rate": 8.086883960269477e-06, "loss": 0.3281, "step": 3663 }, { "epoch": 1.750731736455409, "grad_norm": 0.45709628622401643, "learning_rate": 8.081714795155648e-06, "loss": 0.3195, "step": 3664 }, { "epoch": 1.751209605160982, "grad_norm": 0.5065744246500246, "learning_rate": 8.0765461621403e-06, "loss": 0.3302, "step": 3665 }, { "epoch": 1.7516874738665553, "grad_norm": 0.46058472348695123, "learning_rate": 8.071378062657114e-06, "loss": 0.3293, "step": 3666 }, { "epoch": 1.7521653425721282, "grad_norm": 0.5009271488564606, "learning_rate": 8.066210498139632e-06, "loss": 0.3049, "step": 3667 }, { "epoch": 1.7526432112777015, "grad_norm": 0.5227592677426378, "learning_rate": 8.061043470021251e-06, "loss": 0.3135, "step": 3668 }, { "epoch": 1.7531210799832746, "grad_norm": 0.5018329300564572, "learning_rate": 8.055876979735203e-06, "loss": 0.3071, "step": 3669 }, { "epoch": 1.7535989486888477, "grad_norm": 0.4680917060285861, "learning_rate": 8.050711028714589e-06, "loss": 0.2993, "step": 3670 }, { "epoch": 1.754076817394421, "grad_norm": 0.5167520768333849, "learning_rate": 8.045545618392352e-06, "loss": 0.3084, "step": 3671 }, { "epoch": 1.7545546860999939, "grad_norm": 0.4654107230776164, "learning_rate": 8.040380750201286e-06, "loss": 0.3149, "step": 3672 }, { "epoch": 1.7550325548055672, "grad_norm": 0.5273472331111888, "learning_rate": 8.035216425574031e-06, "loss": 0.3195, "step": 3673 }, { "epoch": 1.7555104235111403, "grad_norm": 0.4671145108043513, "learning_rate": 8.03005264594308e-06, "loss": 0.3249, "step": 3674 }, { "epoch": 1.7559882922167134, "grad_norm": 0.6794461125069341, "learning_rate": 8.02488941274078e-06, "loss": 0.3101, "step": 3675 }, { "epoch": 1.7564661609222867, "grad_norm": 0.46888345908111245, "learning_rate": 8.019726727399313e-06, "loss": 0.3028, "step": 3676 }, { "epoch": 1.7569440296278598, "grad_norm": 0.4913169627797939, "learning_rate": 8.01456459135072e-06, "loss": 0.3357, "step": 3677 }, { "epoch": 1.7574218983334329, "grad_norm": 0.475145978227504, "learning_rate": 8.00940300602689e-06, "loss": 0.3149, "step": 3678 }, { "epoch": 1.7578997670390062, "grad_norm": 0.44803279971539967, "learning_rate": 8.004241972859552e-06, "loss": 0.3347, "step": 3679 }, { "epoch": 1.758377635744579, "grad_norm": 0.4787438847451073, "learning_rate": 7.999081493280283e-06, "loss": 0.3168, "step": 3680 }, { "epoch": 1.7588555044501524, "grad_norm": 0.47005813353883635, "learning_rate": 7.993921568720515e-06, "loss": 0.3322, "step": 3681 }, { "epoch": 1.7593333731557255, "grad_norm": 0.4661035524219349, "learning_rate": 7.988762200611517e-06, "loss": 0.322, "step": 3682 }, { "epoch": 1.7598112418612986, "grad_norm": 0.45866488888688306, "learning_rate": 7.983603390384405e-06, "loss": 0.3227, "step": 3683 }, { "epoch": 1.7602891105668719, "grad_norm": 0.5137297024673525, "learning_rate": 7.978445139470147e-06, "loss": 0.323, "step": 3684 }, { "epoch": 1.7607669792724447, "grad_norm": 0.9472143703913082, "learning_rate": 7.973287449299545e-06, "loss": 0.3266, "step": 3685 }, { "epoch": 1.761244847978018, "grad_norm": 0.46102852447781484, "learning_rate": 7.968130321303254e-06, "loss": 0.3103, "step": 3686 }, { "epoch": 1.7617227166835912, "grad_norm": 0.5664763306205914, "learning_rate": 7.962973756911773e-06, "loss": 0.3173, "step": 3687 }, { "epoch": 1.7622005853891642, "grad_norm": 0.48417061898824687, "learning_rate": 7.957817757555438e-06, "loss": 0.3044, "step": 3688 }, { "epoch": 1.7626784540947376, "grad_norm": 0.4891312388302578, "learning_rate": 7.95266232466443e-06, "loss": 0.3075, "step": 3689 }, { "epoch": 1.7631563228003106, "grad_norm": 0.4839119830817586, "learning_rate": 7.947507459668784e-06, "loss": 0.3227, "step": 3690 }, { "epoch": 1.7636341915058837, "grad_norm": 0.5122896682596106, "learning_rate": 7.94235316399836e-06, "loss": 0.3229, "step": 3691 }, { "epoch": 1.764112060211457, "grad_norm": 0.4895385220427247, "learning_rate": 7.937199439082874e-06, "loss": 0.2932, "step": 3692 }, { "epoch": 1.76458992891703, "grad_norm": 0.47779205504554345, "learning_rate": 7.932046286351877e-06, "loss": 0.3098, "step": 3693 }, { "epoch": 1.7650677976226032, "grad_norm": 0.46886659878011405, "learning_rate": 7.92689370723476e-06, "loss": 0.3195, "step": 3694 }, { "epoch": 1.7655456663281763, "grad_norm": 0.47588844827515836, "learning_rate": 7.921741703160758e-06, "loss": 0.3178, "step": 3695 }, { "epoch": 1.7660235350337494, "grad_norm": 0.48249496323108887, "learning_rate": 7.916590275558953e-06, "loss": 0.3135, "step": 3696 }, { "epoch": 1.7665014037393227, "grad_norm": 0.5528175204761951, "learning_rate": 7.91143942585825e-06, "loss": 0.3119, "step": 3697 }, { "epoch": 1.7669792724448956, "grad_norm": 0.49703375415055845, "learning_rate": 7.90628915548741e-06, "loss": 0.319, "step": 3698 }, { "epoch": 1.767457141150469, "grad_norm": 0.4797540216428928, "learning_rate": 7.901139465875029e-06, "loss": 0.3216, "step": 3699 }, { "epoch": 1.767935009856042, "grad_norm": 0.4685398215123699, "learning_rate": 7.895990358449533e-06, "loss": 0.2973, "step": 3700 }, { "epoch": 1.768412878561615, "grad_norm": 0.47072717607559234, "learning_rate": 7.890841834639198e-06, "loss": 0.3204, "step": 3701 }, { "epoch": 1.7688907472671884, "grad_norm": 0.47672448748601165, "learning_rate": 7.88569389587213e-06, "loss": 0.3201, "step": 3702 }, { "epoch": 1.7693686159727615, "grad_norm": 0.4417548032639278, "learning_rate": 7.880546543576283e-06, "loss": 0.3278, "step": 3703 }, { "epoch": 1.7698464846783346, "grad_norm": 0.5159968343841332, "learning_rate": 7.875399779179442e-06, "loss": 0.3204, "step": 3704 }, { "epoch": 1.770324353383908, "grad_norm": 0.5117769103897961, "learning_rate": 7.87025360410922e-06, "loss": 0.3129, "step": 3705 }, { "epoch": 1.7708022220894808, "grad_norm": 0.514433283638377, "learning_rate": 7.865108019793082e-06, "loss": 0.3174, "step": 3706 }, { "epoch": 1.771280090795054, "grad_norm": 0.58396258202816, "learning_rate": 7.859963027658322e-06, "loss": 0.3238, "step": 3707 }, { "epoch": 1.7717579595006272, "grad_norm": 0.4542920457799816, "learning_rate": 7.854818629132065e-06, "loss": 0.3169, "step": 3708 }, { "epoch": 1.7722358282062003, "grad_norm": 0.49838845584475117, "learning_rate": 7.849674825641282e-06, "loss": 0.323, "step": 3709 }, { "epoch": 1.7727136969117736, "grad_norm": 0.4707270211274528, "learning_rate": 7.844531618612772e-06, "loss": 0.3196, "step": 3710 }, { "epoch": 1.7731915656173465, "grad_norm": 0.45205689040938446, "learning_rate": 7.839389009473167e-06, "loss": 0.3276, "step": 3711 }, { "epoch": 1.7736694343229198, "grad_norm": 0.4717638415399161, "learning_rate": 7.83424699964894e-06, "loss": 0.3403, "step": 3712 }, { "epoch": 1.7741473030284929, "grad_norm": 0.5086223507316545, "learning_rate": 7.829105590566393e-06, "loss": 0.3124, "step": 3713 }, { "epoch": 1.774625171734066, "grad_norm": 0.49314322583865455, "learning_rate": 7.823964783651659e-06, "loss": 0.3089, "step": 3714 }, { "epoch": 1.7751030404396393, "grad_norm": 0.4623851784080253, "learning_rate": 7.81882458033071e-06, "loss": 0.3048, "step": 3715 }, { "epoch": 1.7755809091452124, "grad_norm": 0.5172646679609034, "learning_rate": 7.813684982029347e-06, "loss": 0.3164, "step": 3716 }, { "epoch": 1.7760587778507855, "grad_norm": 0.4947812594178152, "learning_rate": 7.808545990173204e-06, "loss": 0.3134, "step": 3717 }, { "epoch": 1.7765366465563588, "grad_norm": 0.4779432224978824, "learning_rate": 7.803407606187748e-06, "loss": 0.3098, "step": 3718 }, { "epoch": 1.7770145152619317, "grad_norm": 0.5012122088224422, "learning_rate": 7.798269831498275e-06, "loss": 0.3096, "step": 3719 }, { "epoch": 1.777492383967505, "grad_norm": 0.47084003336822555, "learning_rate": 7.79313266752991e-06, "loss": 0.312, "step": 3720 }, { "epoch": 1.777970252673078, "grad_norm": 0.47286383043551417, "learning_rate": 7.787996115707617e-06, "loss": 0.3197, "step": 3721 }, { "epoch": 1.7784481213786512, "grad_norm": 0.4780990261307311, "learning_rate": 7.782860177456183e-06, "loss": 0.3041, "step": 3722 }, { "epoch": 1.7789259900842245, "grad_norm": 0.5239288813334391, "learning_rate": 7.777724854200224e-06, "loss": 0.3107, "step": 3723 }, { "epoch": 1.7794038587897973, "grad_norm": 0.47039194395335954, "learning_rate": 7.77259014736419e-06, "loss": 0.3097, "step": 3724 }, { "epoch": 1.7798817274953707, "grad_norm": 0.45970590084025953, "learning_rate": 7.767456058372362e-06, "loss": 0.3211, "step": 3725 }, { "epoch": 1.7803595962009438, "grad_norm": 0.4734707205616713, "learning_rate": 7.762322588648839e-06, "loss": 0.3074, "step": 3726 }, { "epoch": 1.7808374649065168, "grad_norm": 0.5154833143648376, "learning_rate": 7.757189739617556e-06, "loss": 0.3048, "step": 3727 }, { "epoch": 1.7813153336120902, "grad_norm": 0.47648033493201797, "learning_rate": 7.75205751270228e-06, "loss": 0.325, "step": 3728 }, { "epoch": 1.7817932023176632, "grad_norm": 0.46731827189098973, "learning_rate": 7.746925909326597e-06, "loss": 0.3175, "step": 3729 }, { "epoch": 1.7822710710232363, "grad_norm": 0.4600389521064629, "learning_rate": 7.741794930913922e-06, "loss": 0.3058, "step": 3730 }, { "epoch": 1.7827489397288097, "grad_norm": 0.4471951076152281, "learning_rate": 7.7366645788875e-06, "loss": 0.3226, "step": 3731 }, { "epoch": 1.7832268084343825, "grad_norm": 0.45824019393467236, "learning_rate": 7.731534854670398e-06, "loss": 0.3184, "step": 3732 }, { "epoch": 1.7837046771399558, "grad_norm": 0.5039947063633161, "learning_rate": 7.726405759685512e-06, "loss": 0.3294, "step": 3733 }, { "epoch": 1.784182545845529, "grad_norm": 0.48151111030694355, "learning_rate": 7.721277295355566e-06, "loss": 0.3135, "step": 3734 }, { "epoch": 1.784660414551102, "grad_norm": 0.47276411179310734, "learning_rate": 7.7161494631031e-06, "loss": 0.3087, "step": 3735 }, { "epoch": 1.7851382832566753, "grad_norm": 0.5222088309394991, "learning_rate": 7.711022264350483e-06, "loss": 0.3032, "step": 3736 }, { "epoch": 1.7856161519622482, "grad_norm": 0.45961959884253906, "learning_rate": 7.705895700519915e-06, "loss": 0.324, "step": 3737 }, { "epoch": 1.7860940206678215, "grad_norm": 0.45367078593289256, "learning_rate": 7.700769773033414e-06, "loss": 0.312, "step": 3738 }, { "epoch": 1.7865718893733946, "grad_norm": 0.48592430357796496, "learning_rate": 7.69564448331282e-06, "loss": 0.3064, "step": 3739 }, { "epoch": 1.7870497580789677, "grad_norm": 0.49401805489694345, "learning_rate": 7.690519832779799e-06, "loss": 0.3158, "step": 3740 }, { "epoch": 1.787527626784541, "grad_norm": 0.4920841853592163, "learning_rate": 7.685395822855837e-06, "loss": 0.3193, "step": 3741 }, { "epoch": 1.7880054954901141, "grad_norm": 0.48732165094819957, "learning_rate": 7.680272454962245e-06, "loss": 0.3163, "step": 3742 }, { "epoch": 1.7884833641956872, "grad_norm": 0.49308527781981676, "learning_rate": 7.675149730520155e-06, "loss": 0.3173, "step": 3743 }, { "epoch": 1.7889612329012605, "grad_norm": 0.48955038846140736, "learning_rate": 7.670027650950519e-06, "loss": 0.3266, "step": 3744 }, { "epoch": 1.7894391016068334, "grad_norm": 0.4710450744507011, "learning_rate": 7.664906217674115e-06, "loss": 0.309, "step": 3745 }, { "epoch": 1.7899169703124067, "grad_norm": 0.4738846393248144, "learning_rate": 7.659785432111533e-06, "loss": 0.3105, "step": 3746 }, { "epoch": 1.7903948390179798, "grad_norm": 0.48221267868714024, "learning_rate": 7.654665295683192e-06, "loss": 0.3188, "step": 3747 }, { "epoch": 1.790872707723553, "grad_norm": 0.46375474845886444, "learning_rate": 7.649545809809329e-06, "loss": 0.2975, "step": 3748 }, { "epoch": 1.7913505764291262, "grad_norm": 0.47721809669352233, "learning_rate": 7.644426975909995e-06, "loss": 0.3113, "step": 3749 }, { "epoch": 1.791828445134699, "grad_norm": 4.805146189592445, "learning_rate": 7.639308795405066e-06, "loss": 0.3143, "step": 3750 }, { "epoch": 1.7923063138402724, "grad_norm": 0.5455951254365826, "learning_rate": 7.634191269714238e-06, "loss": 0.2981, "step": 3751 }, { "epoch": 1.7927841825458455, "grad_norm": 0.4744407650385994, "learning_rate": 7.6290744002570176e-06, "loss": 0.3249, "step": 3752 }, { "epoch": 1.7932620512514186, "grad_norm": 0.45267368874034103, "learning_rate": 7.6239581884527354e-06, "loss": 0.3169, "step": 3753 }, { "epoch": 1.793739919956992, "grad_norm": 0.46081369771470404, "learning_rate": 7.618842635720542e-06, "loss": 0.3122, "step": 3754 }, { "epoch": 1.794217788662565, "grad_norm": 0.46409053301581027, "learning_rate": 7.613727743479395e-06, "loss": 0.317, "step": 3755 }, { "epoch": 1.794695657368138, "grad_norm": 0.45315984581264734, "learning_rate": 7.608613513148081e-06, "loss": 0.3253, "step": 3756 }, { "epoch": 1.7951735260737114, "grad_norm": 0.44114117490408994, "learning_rate": 7.6034999461451956e-06, "loss": 0.3272, "step": 3757 }, { "epoch": 1.7956513947792843, "grad_norm": 1.5565443010930862, "learning_rate": 7.5983870438891505e-06, "loss": 0.3197, "step": 3758 }, { "epoch": 1.7961292634848576, "grad_norm": 0.4911118507351176, "learning_rate": 7.593274807798175e-06, "loss": 0.3245, "step": 3759 }, { "epoch": 1.7966071321904307, "grad_norm": 0.44995132281699324, "learning_rate": 7.588163239290316e-06, "loss": 0.3186, "step": 3760 }, { "epoch": 1.7970850008960038, "grad_norm": 0.46456788831578155, "learning_rate": 7.583052339783428e-06, "loss": 0.3172, "step": 3761 }, { "epoch": 1.797562869601577, "grad_norm": 0.4734482643803076, "learning_rate": 7.5779421106951874e-06, "loss": 0.3086, "step": 3762 }, { "epoch": 1.7980407383071502, "grad_norm": 0.46205250286552624, "learning_rate": 7.572832553443083e-06, "loss": 0.309, "step": 3763 }, { "epoch": 1.7985186070127233, "grad_norm": 0.5474167526568322, "learning_rate": 7.567723669444411e-06, "loss": 0.3263, "step": 3764 }, { "epoch": 1.7989964757182964, "grad_norm": 0.4807279141914673, "learning_rate": 7.562615460116289e-06, "loss": 0.3114, "step": 3765 }, { "epoch": 1.7994743444238694, "grad_norm": 0.47642414056800825, "learning_rate": 7.557507926875646e-06, "loss": 0.3141, "step": 3766 }, { "epoch": 1.7999522131294428, "grad_norm": 0.44512012977708126, "learning_rate": 7.552401071139217e-06, "loss": 0.3202, "step": 3767 }, { "epoch": 1.8004300818350158, "grad_norm": 0.48382096788679013, "learning_rate": 7.547294894323556e-06, "loss": 0.3183, "step": 3768 }, { "epoch": 1.800907950540589, "grad_norm": 0.4631932373502014, "learning_rate": 7.542189397845028e-06, "loss": 0.3163, "step": 3769 }, { "epoch": 1.8013858192461623, "grad_norm": 0.4661580950852657, "learning_rate": 7.537084583119802e-06, "loss": 0.3009, "step": 3770 }, { "epoch": 1.8018636879517351, "grad_norm": 0.44890347173646833, "learning_rate": 7.531980451563869e-06, "loss": 0.324, "step": 3771 }, { "epoch": 1.8023415566573084, "grad_norm": 0.46211335360678135, "learning_rate": 7.5268770045930255e-06, "loss": 0.3063, "step": 3772 }, { "epoch": 1.8028194253628815, "grad_norm": 0.47324725173384913, "learning_rate": 7.521774243622875e-06, "loss": 0.3232, "step": 3773 }, { "epoch": 1.8032972940684546, "grad_norm": 0.4769086546795592, "learning_rate": 7.516672170068835e-06, "loss": 0.3081, "step": 3774 }, { "epoch": 1.803775162774028, "grad_norm": 0.4987810815938432, "learning_rate": 7.511570785346129e-06, "loss": 0.3331, "step": 3775 }, { "epoch": 1.804253031479601, "grad_norm": 0.515528221050649, "learning_rate": 7.506470090869793e-06, "loss": 0.3081, "step": 3776 }, { "epoch": 1.8047309001851741, "grad_norm": 0.48258513179828666, "learning_rate": 7.501370088054667e-06, "loss": 0.3253, "step": 3777 }, { "epoch": 1.8052087688907472, "grad_norm": 0.4687979828089521, "learning_rate": 7.496270778315404e-06, "loss": 0.3038, "step": 3778 }, { "epoch": 1.8056866375963203, "grad_norm": 0.4675700871564693, "learning_rate": 7.4911721630664644e-06, "loss": 0.3131, "step": 3779 }, { "epoch": 1.8061645063018936, "grad_norm": 0.4488321146024742, "learning_rate": 7.486074243722109e-06, "loss": 0.3097, "step": 3780 }, { "epoch": 1.8066423750074667, "grad_norm": 0.4731548586712247, "learning_rate": 7.480977021696414e-06, "loss": 0.3036, "step": 3781 }, { "epoch": 1.8071202437130398, "grad_norm": 0.5469397778562964, "learning_rate": 7.475880498403261e-06, "loss": 0.32, "step": 3782 }, { "epoch": 1.8075981124186131, "grad_norm": 0.45764608523635797, "learning_rate": 7.470784675256329e-06, "loss": 0.3237, "step": 3783 }, { "epoch": 1.808075981124186, "grad_norm": 0.4850459958523287, "learning_rate": 7.4656895536691154e-06, "loss": 0.3132, "step": 3784 }, { "epoch": 1.8085538498297593, "grad_norm": 0.45406496016389747, "learning_rate": 7.460595135054916e-06, "loss": 0.3124, "step": 3785 }, { "epoch": 1.8090317185353324, "grad_norm": 0.45950621992569934, "learning_rate": 7.455501420826831e-06, "loss": 0.3218, "step": 3786 }, { "epoch": 1.8095095872409055, "grad_norm": 0.4591525516581785, "learning_rate": 7.450408412397767e-06, "loss": 0.3206, "step": 3787 }, { "epoch": 1.8099874559464788, "grad_norm": 0.508129994813578, "learning_rate": 7.445316111180436e-06, "loss": 0.3265, "step": 3788 }, { "epoch": 1.810465324652052, "grad_norm": 0.4487238152879312, "learning_rate": 7.440224518587353e-06, "loss": 0.317, "step": 3789 }, { "epoch": 1.810943193357625, "grad_norm": 0.44328561901531993, "learning_rate": 7.435133636030831e-06, "loss": 0.3091, "step": 3790 }, { "epoch": 1.811421062063198, "grad_norm": 0.4386069882959427, "learning_rate": 7.430043464923e-06, "loss": 0.33, "step": 3791 }, { "epoch": 1.8118989307687712, "grad_norm": 0.4737344270055403, "learning_rate": 7.424954006675775e-06, "loss": 0.3075, "step": 3792 }, { "epoch": 1.8123767994743445, "grad_norm": 0.46437704276405334, "learning_rate": 7.419865262700887e-06, "loss": 0.3163, "step": 3793 }, { "epoch": 1.8128546681799176, "grad_norm": 0.4582800308955872, "learning_rate": 7.414777234409863e-06, "loss": 0.336, "step": 3794 }, { "epoch": 1.8133325368854907, "grad_norm": 0.4639778390365935, "learning_rate": 7.4096899232140295e-06, "loss": 0.3253, "step": 3795 }, { "epoch": 1.813810405591064, "grad_norm": 0.47191555510498095, "learning_rate": 7.40460333052452e-06, "loss": 0.3164, "step": 3796 }, { "epoch": 1.8142882742966369, "grad_norm": 0.8754820249606956, "learning_rate": 7.399517457752266e-06, "loss": 0.3068, "step": 3797 }, { "epoch": 1.8147661430022102, "grad_norm": 0.4647598211303201, "learning_rate": 7.394432306307997e-06, "loss": 0.3236, "step": 3798 }, { "epoch": 1.8152440117077833, "grad_norm": 0.8061057528954818, "learning_rate": 7.389347877602242e-06, "loss": 0.3204, "step": 3799 }, { "epoch": 1.8157218804133564, "grad_norm": 0.4832339467960314, "learning_rate": 7.384264173045339e-06, "loss": 0.3068, "step": 3800 }, { "epoch": 1.8161997491189297, "grad_norm": 0.48537677126279627, "learning_rate": 7.379181194047412e-06, "loss": 0.3044, "step": 3801 }, { "epoch": 1.8166776178245028, "grad_norm": 0.4370504478039206, "learning_rate": 7.374098942018388e-06, "loss": 0.3103, "step": 3802 }, { "epoch": 1.8171554865300759, "grad_norm": 0.4540403617689952, "learning_rate": 7.3690174183680015e-06, "loss": 0.293, "step": 3803 }, { "epoch": 1.817633355235649, "grad_norm": 0.44834138741434454, "learning_rate": 7.363936624505767e-06, "loss": 0.3038, "step": 3804 }, { "epoch": 1.818111223941222, "grad_norm": 0.5071526227296054, "learning_rate": 7.358856561841021e-06, "loss": 0.33, "step": 3805 }, { "epoch": 1.8185890926467954, "grad_norm": 0.45524356000949795, "learning_rate": 7.353777231782873e-06, "loss": 0.3037, "step": 3806 }, { "epoch": 1.8190669613523685, "grad_norm": 0.4323954513417616, "learning_rate": 7.3486986357402414e-06, "loss": 0.2984, "step": 3807 }, { "epoch": 1.8195448300579415, "grad_norm": 0.4748222174160758, "learning_rate": 7.343620775121842e-06, "loss": 0.3197, "step": 3808 }, { "epoch": 1.8200226987635149, "grad_norm": 0.5434867730758338, "learning_rate": 7.338543651336181e-06, "loss": 0.3048, "step": 3809 }, { "epoch": 1.8205005674690877, "grad_norm": 0.5261663632567056, "learning_rate": 7.333467265791563e-06, "loss": 0.3269, "step": 3810 }, { "epoch": 1.820978436174661, "grad_norm": 0.4668224192467844, "learning_rate": 7.328391619896092e-06, "loss": 0.3256, "step": 3811 }, { "epoch": 1.8214563048802341, "grad_norm": 0.47668206195798435, "learning_rate": 7.3233167150576554e-06, "loss": 0.3268, "step": 3812 }, { "epoch": 1.8219341735858072, "grad_norm": 0.5070836849852632, "learning_rate": 7.318242552683948e-06, "loss": 0.321, "step": 3813 }, { "epoch": 1.8224120422913805, "grad_norm": 0.494039037408748, "learning_rate": 7.3131691341824515e-06, "loss": 0.3083, "step": 3814 }, { "epoch": 1.8228899109969536, "grad_norm": 0.4701617746326885, "learning_rate": 7.308096460960441e-06, "loss": 0.3036, "step": 3815 }, { "epoch": 1.8233677797025267, "grad_norm": 0.48182454253938445, "learning_rate": 7.303024534424987e-06, "loss": 0.3191, "step": 3816 }, { "epoch": 1.8238456484081, "grad_norm": 0.4968254147423399, "learning_rate": 7.297953355982956e-06, "loss": 0.3228, "step": 3817 }, { "epoch": 1.824323517113673, "grad_norm": 0.48879138443250414, "learning_rate": 7.292882927040999e-06, "loss": 0.3083, "step": 3818 }, { "epoch": 1.8248013858192462, "grad_norm": 0.47334912056885264, "learning_rate": 7.287813249005565e-06, "loss": 0.3111, "step": 3819 }, { "epoch": 1.8252792545248193, "grad_norm": 0.4884790884097843, "learning_rate": 7.282744323282895e-06, "loss": 0.3256, "step": 3820 }, { "epoch": 1.8257571232303924, "grad_norm": 0.5022837404885262, "learning_rate": 7.277676151279019e-06, "loss": 0.3076, "step": 3821 }, { "epoch": 1.8262349919359657, "grad_norm": 0.47479766132527035, "learning_rate": 7.272608734399754e-06, "loss": 0.3033, "step": 3822 }, { "epoch": 1.8267128606415386, "grad_norm": 1.3020109975876173, "learning_rate": 7.26754207405072e-06, "loss": 0.3079, "step": 3823 }, { "epoch": 1.827190729347112, "grad_norm": 0.4919024454500918, "learning_rate": 7.262476171637311e-06, "loss": 0.3095, "step": 3824 }, { "epoch": 1.827668598052685, "grad_norm": 0.44665992779272773, "learning_rate": 7.2574110285647244e-06, "loss": 0.3103, "step": 3825 }, { "epoch": 1.828146466758258, "grad_norm": 0.44838920186117176, "learning_rate": 7.252346646237942e-06, "loss": 0.3092, "step": 3826 }, { "epoch": 1.8286243354638314, "grad_norm": 0.7199341984258678, "learning_rate": 7.24728302606173e-06, "loss": 0.3233, "step": 3827 }, { "epoch": 1.8291022041694045, "grad_norm": 0.4594979265995351, "learning_rate": 7.242220169440649e-06, "loss": 0.3203, "step": 3828 }, { "epoch": 1.8295800728749776, "grad_norm": 0.46434411184296537, "learning_rate": 7.2371580777790494e-06, "loss": 0.3297, "step": 3829 }, { "epoch": 1.830057941580551, "grad_norm": 0.4661421339585899, "learning_rate": 7.232096752481061e-06, "loss": 0.31, "step": 3830 }, { "epoch": 1.8305358102861238, "grad_norm": 0.4796291545667761, "learning_rate": 7.2270361949506075e-06, "loss": 0.3196, "step": 3831 }, { "epoch": 1.831013678991697, "grad_norm": 0.46983415973458353, "learning_rate": 7.2219764065914024e-06, "loss": 0.317, "step": 3832 }, { "epoch": 1.8314915476972702, "grad_norm": 0.5194541122421759, "learning_rate": 7.216917388806936e-06, "loss": 0.323, "step": 3833 }, { "epoch": 1.8319694164028433, "grad_norm": 0.483395653309714, "learning_rate": 7.211859143000495e-06, "loss": 0.3084, "step": 3834 }, { "epoch": 1.8324472851084166, "grad_norm": 0.4802816137671634, "learning_rate": 7.206801670575145e-06, "loss": 0.3213, "step": 3835 }, { "epoch": 1.8329251538139895, "grad_norm": 0.46676119411435246, "learning_rate": 7.2017449729337396e-06, "loss": 0.2915, "step": 3836 }, { "epoch": 1.8334030225195628, "grad_norm": 0.4751615080956812, "learning_rate": 7.196689051478917e-06, "loss": 0.3172, "step": 3837 }, { "epoch": 1.8338808912251359, "grad_norm": 0.4518355207103295, "learning_rate": 7.191633907613103e-06, "loss": 0.3037, "step": 3838 }, { "epoch": 1.834358759930709, "grad_norm": 0.4964989199148571, "learning_rate": 7.186579542738507e-06, "loss": 0.3241, "step": 3839 }, { "epoch": 1.8348366286362823, "grad_norm": 0.45654528244066295, "learning_rate": 7.181525958257116e-06, "loss": 0.3127, "step": 3840 }, { "epoch": 1.8353144973418554, "grad_norm": 0.46918542594424667, "learning_rate": 7.176473155570707e-06, "loss": 0.3119, "step": 3841 }, { "epoch": 1.8357923660474285, "grad_norm": 0.4507255821310943, "learning_rate": 7.171421136080841e-06, "loss": 0.3096, "step": 3842 }, { "epoch": 1.8362702347530018, "grad_norm": 0.4345435682976803, "learning_rate": 7.1663699011888524e-06, "loss": 0.3327, "step": 3843 }, { "epoch": 1.8367481034585746, "grad_norm": 0.46364207609499564, "learning_rate": 7.1613194522958705e-06, "loss": 0.3321, "step": 3844 }, { "epoch": 1.837225972164148, "grad_norm": 1.3350277825716896, "learning_rate": 7.156269790802801e-06, "loss": 0.3215, "step": 3845 }, { "epoch": 1.837703840869721, "grad_norm": 0.45482610525005635, "learning_rate": 7.151220918110326e-06, "loss": 0.3122, "step": 3846 }, { "epoch": 1.8381817095752941, "grad_norm": 0.4799378572791614, "learning_rate": 7.146172835618919e-06, "loss": 0.3202, "step": 3847 }, { "epoch": 1.8386595782808675, "grad_norm": 0.4560594548210913, "learning_rate": 7.1411255447288266e-06, "loss": 0.3178, "step": 3848 }, { "epoch": 1.8391374469864403, "grad_norm": 0.42536499972455255, "learning_rate": 7.136079046840078e-06, "loss": 0.3098, "step": 3849 }, { "epoch": 1.8396153156920136, "grad_norm": 0.4728281050415246, "learning_rate": 7.131033343352483e-06, "loss": 0.3162, "step": 3850 }, { "epoch": 1.8400931843975867, "grad_norm": 0.45137859761216986, "learning_rate": 7.125988435665632e-06, "loss": 0.3258, "step": 3851 }, { "epoch": 1.8405710531031598, "grad_norm": 0.4305757718398639, "learning_rate": 7.120944325178889e-06, "loss": 0.3034, "step": 3852 }, { "epoch": 1.8410489218087331, "grad_norm": 0.46513614861758734, "learning_rate": 7.1159010132914065e-06, "loss": 0.3233, "step": 3853 }, { "epoch": 1.8415267905143062, "grad_norm": 0.4516843311861331, "learning_rate": 7.1108585014021095e-06, "loss": 0.3123, "step": 3854 }, { "epoch": 1.8420046592198793, "grad_norm": 0.44599301993670454, "learning_rate": 7.105816790909699e-06, "loss": 0.303, "step": 3855 }, { "epoch": 1.8424825279254526, "grad_norm": 0.45407721798005035, "learning_rate": 7.100775883212658e-06, "loss": 0.2951, "step": 3856 }, { "epoch": 1.8429603966310255, "grad_norm": 0.6330334384592656, "learning_rate": 7.095735779709248e-06, "loss": 0.3187, "step": 3857 }, { "epoch": 1.8434382653365988, "grad_norm": 0.45481681430926496, "learning_rate": 7.0906964817974984e-06, "loss": 0.3068, "step": 3858 }, { "epoch": 1.843916134042172, "grad_norm": 0.4552031236149239, "learning_rate": 7.085657990875227e-06, "loss": 0.3038, "step": 3859 }, { "epoch": 1.844394002747745, "grad_norm": 0.4834807925984325, "learning_rate": 7.080620308340024e-06, "loss": 0.3047, "step": 3860 }, { "epoch": 1.8448718714533183, "grad_norm": 0.46556812455338153, "learning_rate": 7.075583435589248e-06, "loss": 0.3085, "step": 3861 }, { "epoch": 1.8453497401588912, "grad_norm": 0.43412355998395485, "learning_rate": 7.07054737402004e-06, "loss": 0.297, "step": 3862 }, { "epoch": 1.8458276088644645, "grad_norm": 0.4346076047173381, "learning_rate": 7.065512125029318e-06, "loss": 0.3116, "step": 3863 }, { "epoch": 1.8463054775700376, "grad_norm": 0.4604136030349345, "learning_rate": 7.060477690013767e-06, "loss": 0.3257, "step": 3864 }, { "epoch": 1.8467833462756107, "grad_norm": 0.4900954658380756, "learning_rate": 7.055444070369852e-06, "loss": 0.3204, "step": 3865 }, { "epoch": 1.847261214981184, "grad_norm": 0.4813520085061644, "learning_rate": 7.050411267493815e-06, "loss": 0.3139, "step": 3866 }, { "epoch": 1.847739083686757, "grad_norm": 0.4567730976405721, "learning_rate": 7.045379282781659e-06, "loss": 0.3125, "step": 3867 }, { "epoch": 1.8482169523923302, "grad_norm": 0.474550906247094, "learning_rate": 7.040348117629172e-06, "loss": 0.3149, "step": 3868 }, { "epoch": 1.8486948210979035, "grad_norm": 0.4432230508973788, "learning_rate": 7.035317773431911e-06, "loss": 0.3055, "step": 3869 }, { "epoch": 1.8491726898034764, "grad_norm": 0.5522942357232046, "learning_rate": 7.0302882515852025e-06, "loss": 0.3366, "step": 3870 }, { "epoch": 1.8496505585090497, "grad_norm": 0.4752790460554777, "learning_rate": 7.025259553484145e-06, "loss": 0.3324, "step": 3871 }, { "epoch": 1.8501284272146228, "grad_norm": 0.4643972132496278, "learning_rate": 7.020231680523616e-06, "loss": 0.3093, "step": 3872 }, { "epoch": 1.8506062959201959, "grad_norm": 0.4554789111097097, "learning_rate": 7.015204634098256e-06, "loss": 0.3319, "step": 3873 }, { "epoch": 1.8510841646257692, "grad_norm": 0.45867714400719684, "learning_rate": 7.010178415602485e-06, "loss": 0.3133, "step": 3874 }, { "epoch": 1.851562033331342, "grad_norm": 0.45519094876743393, "learning_rate": 7.005153026430476e-06, "loss": 0.311, "step": 3875 }, { "epoch": 1.8520399020369154, "grad_norm": 0.4456318199658115, "learning_rate": 7.00012846797619e-06, "loss": 0.3298, "step": 3876 }, { "epoch": 1.8525177707424885, "grad_norm": 0.4877288837711452, "learning_rate": 6.995104741633354e-06, "loss": 0.302, "step": 3877 }, { "epoch": 1.8529956394480616, "grad_norm": 0.45490188442789403, "learning_rate": 6.990081848795453e-06, "loss": 0.3225, "step": 3878 }, { "epoch": 1.8534735081536349, "grad_norm": 0.46268349832494854, "learning_rate": 6.985059790855755e-06, "loss": 0.31, "step": 3879 }, { "epoch": 1.853951376859208, "grad_norm": 0.47114477836168356, "learning_rate": 6.980038569207291e-06, "loss": 0.3215, "step": 3880 }, { "epoch": 1.854429245564781, "grad_norm": 0.4627024705877978, "learning_rate": 6.975018185242852e-06, "loss": 0.322, "step": 3881 }, { "epoch": 1.8549071142703544, "grad_norm": 0.4403644932273325, "learning_rate": 6.969998640355011e-06, "loss": 0.3075, "step": 3882 }, { "epoch": 1.8553849829759272, "grad_norm": 0.4734619620341228, "learning_rate": 6.9649799359361e-06, "loss": 0.3153, "step": 3883 }, { "epoch": 1.8558628516815006, "grad_norm": 0.45177620940516144, "learning_rate": 6.959962073378216e-06, "loss": 0.3014, "step": 3884 }, { "epoch": 1.8563407203870737, "grad_norm": 0.4898115665474695, "learning_rate": 6.954945054073228e-06, "loss": 0.3159, "step": 3885 }, { "epoch": 1.8568185890926467, "grad_norm": 0.49590283970632837, "learning_rate": 6.949928879412768e-06, "loss": 0.3122, "step": 3886 }, { "epoch": 1.85729645779822, "grad_norm": 0.483933886138119, "learning_rate": 6.944913550788235e-06, "loss": 0.3134, "step": 3887 }, { "epoch": 1.857774326503793, "grad_norm": 0.503142035037706, "learning_rate": 6.939899069590791e-06, "loss": 0.327, "step": 3888 }, { "epoch": 1.8582521952093662, "grad_norm": 0.47557368760354807, "learning_rate": 6.934885437211367e-06, "loss": 0.3218, "step": 3889 }, { "epoch": 1.8587300639149393, "grad_norm": 0.4620961694190902, "learning_rate": 6.929872655040655e-06, "loss": 0.2842, "step": 3890 }, { "epoch": 1.8592079326205124, "grad_norm": 0.4673591975296846, "learning_rate": 6.924860724469111e-06, "loss": 0.315, "step": 3891 }, { "epoch": 1.8596858013260857, "grad_norm": 0.46945354634024067, "learning_rate": 6.9198496468869605e-06, "loss": 0.3005, "step": 3892 }, { "epoch": 1.8601636700316588, "grad_norm": 0.4851381015060144, "learning_rate": 6.914839423684183e-06, "loss": 0.3195, "step": 3893 }, { "epoch": 1.860641538737232, "grad_norm": 0.5201132587365642, "learning_rate": 6.909830056250527e-06, "loss": 0.3026, "step": 3894 }, { "epoch": 1.8611194074428052, "grad_norm": 0.5134723697086168, "learning_rate": 6.904821545975507e-06, "loss": 0.3245, "step": 3895 }, { "epoch": 1.861597276148378, "grad_norm": 0.4518202226167905, "learning_rate": 6.899813894248388e-06, "loss": 0.3088, "step": 3896 }, { "epoch": 1.8620751448539514, "grad_norm": 0.4683222226158601, "learning_rate": 6.894807102458211e-06, "loss": 0.3239, "step": 3897 }, { "epoch": 1.8625530135595245, "grad_norm": 0.48007891114431994, "learning_rate": 6.889801171993769e-06, "loss": 0.2836, "step": 3898 }, { "epoch": 1.8630308822650976, "grad_norm": 0.4772065271771579, "learning_rate": 6.8847961042436185e-06, "loss": 0.3176, "step": 3899 }, { "epoch": 1.863508750970671, "grad_norm": 0.45177407887015353, "learning_rate": 6.879791900596077e-06, "loss": 0.3081, "step": 3900 }, { "epoch": 1.8639866196762438, "grad_norm": 0.4729590065660319, "learning_rate": 6.874788562439225e-06, "loss": 0.3102, "step": 3901 }, { "epoch": 1.864464488381817, "grad_norm": 0.49303248211065703, "learning_rate": 6.869786091160895e-06, "loss": 0.3371, "step": 3902 }, { "epoch": 1.8649423570873902, "grad_norm": 0.4798724268373024, "learning_rate": 6.864784488148688e-06, "loss": 0.3172, "step": 3903 }, { "epoch": 1.8654202257929633, "grad_norm": 0.4566461806579496, "learning_rate": 6.859783754789962e-06, "loss": 0.3159, "step": 3904 }, { "epoch": 1.8658980944985366, "grad_norm": 0.45266075225313984, "learning_rate": 6.854783892471823e-06, "loss": 0.3336, "step": 3905 }, { "epoch": 1.8663759632041097, "grad_norm": 0.47703082895473686, "learning_rate": 6.849784902581158e-06, "loss": 0.2831, "step": 3906 }, { "epoch": 1.8668538319096828, "grad_norm": 0.5127241356341654, "learning_rate": 6.8447867865045905e-06, "loss": 0.3055, "step": 3907 }, { "epoch": 1.867331700615256, "grad_norm": 0.46983255513156474, "learning_rate": 6.83978954562851e-06, "loss": 0.3053, "step": 3908 }, { "epoch": 1.867809569320829, "grad_norm": 0.44944233352974683, "learning_rate": 6.834793181339068e-06, "loss": 0.312, "step": 3909 }, { "epoch": 1.8682874380264023, "grad_norm": 0.48300744589768124, "learning_rate": 6.829797695022163e-06, "loss": 0.2879, "step": 3910 }, { "epoch": 1.8687653067319754, "grad_norm": 0.49195148134310035, "learning_rate": 6.824803088063454e-06, "loss": 0.3086, "step": 3911 }, { "epoch": 1.8692431754375485, "grad_norm": 0.48226235431323466, "learning_rate": 6.819809361848362e-06, "loss": 0.3161, "step": 3912 }, { "epoch": 1.8697210441431218, "grad_norm": 0.4673032858019702, "learning_rate": 6.814816517762053e-06, "loss": 0.3009, "step": 3913 }, { "epoch": 1.8701989128486947, "grad_norm": 0.5120888668650547, "learning_rate": 6.809824557189456e-06, "loss": 0.3047, "step": 3914 }, { "epoch": 1.870676781554268, "grad_norm": 0.5188704505048501, "learning_rate": 6.804833481515256e-06, "loss": 0.2996, "step": 3915 }, { "epoch": 1.871154650259841, "grad_norm": 0.4586738347773163, "learning_rate": 6.799843292123883e-06, "loss": 0.3216, "step": 3916 }, { "epoch": 1.8716325189654142, "grad_norm": 0.4514115914431411, "learning_rate": 6.794853990399533e-06, "loss": 0.3157, "step": 3917 }, { "epoch": 1.8721103876709875, "grad_norm": 0.4567742118666483, "learning_rate": 6.7898655777261494e-06, "loss": 0.3228, "step": 3918 }, { "epoch": 1.8725882563765606, "grad_norm": 0.5130560266812821, "learning_rate": 6.784878055487425e-06, "loss": 0.3059, "step": 3919 }, { "epoch": 1.8730661250821337, "grad_norm": 0.471031405151132, "learning_rate": 6.7798914250668154e-06, "loss": 0.3208, "step": 3920 }, { "epoch": 1.873543993787707, "grad_norm": 0.4488894073471434, "learning_rate": 6.774905687847526e-06, "loss": 0.3082, "step": 3921 }, { "epoch": 1.8740218624932798, "grad_norm": 0.4837073839553914, "learning_rate": 6.769920845212506e-06, "loss": 0.3114, "step": 3922 }, { "epoch": 1.8744997311988532, "grad_norm": 0.45927942054259496, "learning_rate": 6.764936898544466e-06, "loss": 0.3162, "step": 3923 }, { "epoch": 1.8749775999044263, "grad_norm": 0.517741398437042, "learning_rate": 6.759953849225867e-06, "loss": 0.3065, "step": 3924 }, { "epoch": 1.8754554686099993, "grad_norm": 0.4830767260559844, "learning_rate": 6.7549716986389146e-06, "loss": 0.3142, "step": 3925 }, { "epoch": 1.8759333373155727, "grad_norm": 0.4686254460663425, "learning_rate": 6.749990448165572e-06, "loss": 0.3139, "step": 3926 }, { "epoch": 1.8764112060211455, "grad_norm": 0.4460888626954049, "learning_rate": 6.745010099187552e-06, "loss": 0.3315, "step": 3927 }, { "epoch": 1.8768890747267188, "grad_norm": 0.4643706696696984, "learning_rate": 6.740030653086311e-06, "loss": 0.2954, "step": 3928 }, { "epoch": 1.877366943432292, "grad_norm": 0.5033306648528414, "learning_rate": 6.735052111243061e-06, "loss": 0.298, "step": 3929 }, { "epoch": 1.877844812137865, "grad_norm": 0.44669150190220186, "learning_rate": 6.730074475038766e-06, "loss": 0.3019, "step": 3930 }, { "epoch": 1.8783226808434383, "grad_norm": 0.4653710125052024, "learning_rate": 6.72509774585413e-06, "loss": 0.307, "step": 3931 }, { "epoch": 1.8788005495490114, "grad_norm": 0.48207706455379085, "learning_rate": 6.720121925069609e-06, "loss": 0.3321, "step": 3932 }, { "epoch": 1.8792784182545845, "grad_norm": 0.446573686847004, "learning_rate": 6.715147014065413e-06, "loss": 0.3118, "step": 3933 }, { "epoch": 1.8797562869601578, "grad_norm": 0.4360305190566346, "learning_rate": 6.710173014221489e-06, "loss": 0.3305, "step": 3934 }, { "epoch": 1.8802341556657307, "grad_norm": 0.4928534869463769, "learning_rate": 6.7051999269175405e-06, "loss": 0.3335, "step": 3935 }, { "epoch": 1.880712024371304, "grad_norm": 0.5334772505119653, "learning_rate": 6.700227753533013e-06, "loss": 0.3203, "step": 3936 }, { "epoch": 1.8811898930768771, "grad_norm": 0.49357554615411325, "learning_rate": 6.695256495447099e-06, "loss": 0.3086, "step": 3937 }, { "epoch": 1.8816677617824502, "grad_norm": 0.46109123255173334, "learning_rate": 6.690286154038736e-06, "loss": 0.3013, "step": 3938 }, { "epoch": 1.8821456304880235, "grad_norm": 0.449128708951584, "learning_rate": 6.685316730686614e-06, "loss": 0.3243, "step": 3939 }, { "epoch": 1.8826234991935964, "grad_norm": 0.46653691376464573, "learning_rate": 6.680348226769162e-06, "loss": 0.3299, "step": 3940 }, { "epoch": 1.8831013678991697, "grad_norm": 0.4647819337168596, "learning_rate": 6.675380643664553e-06, "loss": 0.2945, "step": 3941 }, { "epoch": 1.8835792366047428, "grad_norm": 0.4644806300385637, "learning_rate": 6.670413982750709e-06, "loss": 0.2997, "step": 3942 }, { "epoch": 1.884057105310316, "grad_norm": 0.46165409315562544, "learning_rate": 6.6654482454052936e-06, "loss": 0.2979, "step": 3943 }, { "epoch": 1.8845349740158892, "grad_norm": 0.4604824497984356, "learning_rate": 6.660483433005714e-06, "loss": 0.3179, "step": 3944 }, { "epoch": 1.8850128427214623, "grad_norm": 0.46168391343184206, "learning_rate": 6.655519546929121e-06, "loss": 0.303, "step": 3945 }, { "epoch": 1.8854907114270354, "grad_norm": 0.4678840236583196, "learning_rate": 6.650556588552413e-06, "loss": 0.3107, "step": 3946 }, { "epoch": 1.8859685801326087, "grad_norm": 0.4594155584535815, "learning_rate": 6.64559455925222e-06, "loss": 0.3178, "step": 3947 }, { "epoch": 1.8864464488381816, "grad_norm": 0.45341402313704277, "learning_rate": 6.640633460404927e-06, "loss": 0.3231, "step": 3948 }, { "epoch": 1.886924317543755, "grad_norm": 0.5076384285562465, "learning_rate": 6.635673293386656e-06, "loss": 0.3197, "step": 3949 }, { "epoch": 1.887402186249328, "grad_norm": 0.4643275428624748, "learning_rate": 6.630714059573267e-06, "loss": 0.3217, "step": 3950 }, { "epoch": 1.887880054954901, "grad_norm": 0.49068283754529113, "learning_rate": 6.625755760340362e-06, "loss": 0.2889, "step": 3951 }, { "epoch": 1.8883579236604744, "grad_norm": 0.47460107283399794, "learning_rate": 6.620798397063291e-06, "loss": 0.3271, "step": 3952 }, { "epoch": 1.8888357923660475, "grad_norm": 0.48812107695821716, "learning_rate": 6.615841971117136e-06, "loss": 0.3138, "step": 3953 }, { "epoch": 1.8893136610716206, "grad_norm": 0.5734196274573817, "learning_rate": 6.610886483876721e-06, "loss": 0.3044, "step": 3954 }, { "epoch": 1.8897915297771937, "grad_norm": 0.4617227039696627, "learning_rate": 6.6059319367166165e-06, "loss": 0.3175, "step": 3955 }, { "epoch": 1.8902693984827668, "grad_norm": 0.6100117271945517, "learning_rate": 6.600978331011118e-06, "loss": 0.3084, "step": 3956 }, { "epoch": 1.89074726718834, "grad_norm": 0.46971941183014404, "learning_rate": 6.596025668134276e-06, "loss": 0.2983, "step": 3957 }, { "epoch": 1.8912251358939132, "grad_norm": 0.461607197188674, "learning_rate": 6.5910739494598675e-06, "loss": 0.314, "step": 3958 }, { "epoch": 1.8917030045994863, "grad_norm": 0.4654393182079575, "learning_rate": 6.586123176361412e-06, "loss": 0.3093, "step": 3959 }, { "epoch": 1.8921808733050596, "grad_norm": 0.49908066713941035, "learning_rate": 6.581173350212169e-06, "loss": 0.3159, "step": 3960 }, { "epoch": 1.8926587420106324, "grad_norm": 0.4512238380490708, "learning_rate": 6.576224472385132e-06, "loss": 0.3226, "step": 3961 }, { "epoch": 1.8931366107162058, "grad_norm": 0.4496277486152005, "learning_rate": 6.5712765442530305e-06, "loss": 0.3107, "step": 3962 }, { "epoch": 1.8936144794217789, "grad_norm": 0.4489167401241315, "learning_rate": 6.566329567188334e-06, "loss": 0.3026, "step": 3963 }, { "epoch": 1.894092348127352, "grad_norm": 0.45699132086998806, "learning_rate": 6.5613835425632475e-06, "loss": 0.3145, "step": 3964 }, { "epoch": 1.8945702168329253, "grad_norm": 0.45425664515009867, "learning_rate": 6.55643847174971e-06, "loss": 0.3174, "step": 3965 }, { "epoch": 1.8950480855384984, "grad_norm": 0.500523026242252, "learning_rate": 6.551494356119395e-06, "loss": 0.3115, "step": 3966 }, { "epoch": 1.8955259542440714, "grad_norm": 0.44931814929891095, "learning_rate": 6.546551197043719e-06, "loss": 0.3027, "step": 3967 }, { "epoch": 1.8960038229496445, "grad_norm": 0.4626590116957062, "learning_rate": 6.54160899589382e-06, "loss": 0.2975, "step": 3968 }, { "epoch": 1.8964816916552176, "grad_norm": 0.6053375872302748, "learning_rate": 6.536667754040581e-06, "loss": 0.3172, "step": 3969 }, { "epoch": 1.896959560360791, "grad_norm": 0.46082649951002486, "learning_rate": 6.531727472854617e-06, "loss": 0.3041, "step": 3970 }, { "epoch": 1.897437429066364, "grad_norm": 0.4647416997290529, "learning_rate": 6.52678815370627e-06, "loss": 0.312, "step": 3971 }, { "epoch": 1.8979152977719371, "grad_norm": 0.8690617380399219, "learning_rate": 6.521849797965623e-06, "loss": 0.3076, "step": 3972 }, { "epoch": 1.8983931664775104, "grad_norm": 0.44236729928797086, "learning_rate": 6.516912407002487e-06, "loss": 0.3154, "step": 3973 }, { "epoch": 1.8988710351830833, "grad_norm": 0.48835977208241177, "learning_rate": 6.511975982186412e-06, "loss": 0.3152, "step": 3974 }, { "epoch": 1.8993489038886566, "grad_norm": 0.44854340512081925, "learning_rate": 6.507040524886672e-06, "loss": 0.305, "step": 3975 }, { "epoch": 1.8998267725942297, "grad_norm": 0.46881260069389524, "learning_rate": 6.502106036472274e-06, "loss": 0.3059, "step": 3976 }, { "epoch": 1.9003046412998028, "grad_norm": 0.4534629224226534, "learning_rate": 6.4971725183119596e-06, "loss": 0.3133, "step": 3977 }, { "epoch": 1.9007825100053761, "grad_norm": 0.4727954464086862, "learning_rate": 6.492239971774201e-06, "loss": 0.3112, "step": 3978 }, { "epoch": 1.9012603787109492, "grad_norm": 0.4905488487960794, "learning_rate": 6.487308398227198e-06, "loss": 0.3084, "step": 3979 }, { "epoch": 1.9017382474165223, "grad_norm": 0.45969932453025913, "learning_rate": 6.482377799038882e-06, "loss": 0.3074, "step": 3980 }, { "epoch": 1.9022161161220954, "grad_norm": 0.4503140751242902, "learning_rate": 6.477448175576917e-06, "loss": 0.3112, "step": 3981 }, { "epoch": 1.9026939848276685, "grad_norm": 0.5162485390503506, "learning_rate": 6.472519529208688e-06, "loss": 0.3116, "step": 3982 }, { "epoch": 1.9031718535332418, "grad_norm": 0.4393300520762453, "learning_rate": 6.467591861301319e-06, "loss": 0.3186, "step": 3983 }, { "epoch": 1.903649722238815, "grad_norm": 0.4404302931504677, "learning_rate": 6.462665173221658e-06, "loss": 0.3027, "step": 3984 }, { "epoch": 1.904127590944388, "grad_norm": 0.4513739623364187, "learning_rate": 6.45773946633628e-06, "loss": 0.3191, "step": 3985 }, { "epoch": 1.9046054596499613, "grad_norm": 0.43862099527361414, "learning_rate": 6.45281474201149e-06, "loss": 0.3186, "step": 3986 }, { "epoch": 1.9050833283555342, "grad_norm": 0.48035117229015456, "learning_rate": 6.44789100161332e-06, "loss": 0.2902, "step": 3987 }, { "epoch": 1.9055611970611075, "grad_norm": 0.46306152743712947, "learning_rate": 6.442968246507526e-06, "loss": 0.3228, "step": 3988 }, { "epoch": 1.9060390657666806, "grad_norm": 0.4547005976928245, "learning_rate": 6.438046478059597e-06, "loss": 0.3081, "step": 3989 }, { "epoch": 1.9065169344722537, "grad_norm": 0.455512028698371, "learning_rate": 6.4331256976347434e-06, "loss": 0.3136, "step": 3990 }, { "epoch": 1.906994803177827, "grad_norm": 0.45184250650864355, "learning_rate": 6.4282059065979e-06, "loss": 0.3234, "step": 3991 }, { "epoch": 1.9074726718834, "grad_norm": 0.4589810256657876, "learning_rate": 6.423287106313734e-06, "loss": 0.3139, "step": 3992 }, { "epoch": 1.9079505405889732, "grad_norm": 0.4469761574276576, "learning_rate": 6.4183692981466354e-06, "loss": 0.3178, "step": 3993 }, { "epoch": 1.9084284092945463, "grad_norm": 0.4819073592214982, "learning_rate": 6.413452483460712e-06, "loss": 0.3078, "step": 3994 }, { "epoch": 1.9089062780001194, "grad_norm": 0.473783423003341, "learning_rate": 6.408536663619803e-06, "loss": 0.3232, "step": 3995 }, { "epoch": 1.9093841467056927, "grad_norm": 0.46190041854892633, "learning_rate": 6.403621839987475e-06, "loss": 0.2947, "step": 3996 }, { "epoch": 1.9098620154112658, "grad_norm": 0.4853262011211858, "learning_rate": 6.398708013927006e-06, "loss": 0.3191, "step": 3997 }, { "epoch": 1.9103398841168389, "grad_norm": 0.46613152697547133, "learning_rate": 6.393795186801408e-06, "loss": 0.3148, "step": 3998 }, { "epoch": 1.9108177528224122, "grad_norm": 0.478468525695969, "learning_rate": 6.3888833599734164e-06, "loss": 0.3088, "step": 3999 }, { "epoch": 1.911295621527985, "grad_norm": 0.4807025101803059, "learning_rate": 6.383972534805478e-06, "loss": 0.3137, "step": 4000 }, { "epoch": 1.9117734902335584, "grad_norm": 0.4699617352574663, "learning_rate": 6.379062712659775e-06, "loss": 0.3151, "step": 4001 }, { "epoch": 1.9122513589391315, "grad_norm": 0.4432981759624267, "learning_rate": 6.374153894898204e-06, "loss": 0.301, "step": 4002 }, { "epoch": 1.9127292276447045, "grad_norm": 0.4591365732275299, "learning_rate": 6.369246082882381e-06, "loss": 0.3183, "step": 4003 }, { "epoch": 1.9132070963502779, "grad_norm": 0.4866953393750346, "learning_rate": 6.36433927797365e-06, "loss": 0.3077, "step": 4004 }, { "epoch": 1.913684965055851, "grad_norm": 0.4474609520754289, "learning_rate": 6.359433481533074e-06, "loss": 0.314, "step": 4005 }, { "epoch": 1.914162833761424, "grad_norm": 0.48773485797623917, "learning_rate": 6.3545286949214245e-06, "loss": 0.3329, "step": 4006 }, { "epoch": 1.9146407024669974, "grad_norm": 0.47016604797082867, "learning_rate": 6.349624919499218e-06, "loss": 0.302, "step": 4007 }, { "epoch": 1.9151185711725702, "grad_norm": 0.4760285819352319, "learning_rate": 6.344722156626663e-06, "loss": 0.2967, "step": 4008 }, { "epoch": 1.9155964398781435, "grad_norm": 0.4759007373191583, "learning_rate": 6.3398204076637035e-06, "loss": 0.3126, "step": 4009 }, { "epoch": 1.9160743085837166, "grad_norm": 0.45794278531506144, "learning_rate": 6.3349196739700024e-06, "loss": 0.3035, "step": 4010 }, { "epoch": 1.9165521772892897, "grad_norm": 0.4692457739911771, "learning_rate": 6.3300199569049305e-06, "loss": 0.3215, "step": 4011 }, { "epoch": 1.917030045994863, "grad_norm": 0.4630100738986396, "learning_rate": 6.325121257827584e-06, "loss": 0.2977, "step": 4012 }, { "epoch": 1.917507914700436, "grad_norm": 0.44301606576563307, "learning_rate": 6.32022357809678e-06, "loss": 0.3033, "step": 4013 }, { "epoch": 1.9179857834060092, "grad_norm": 0.4878278270456394, "learning_rate": 6.3153269190710435e-06, "loss": 0.3061, "step": 4014 }, { "epoch": 1.9184636521115823, "grad_norm": 0.46093038914319545, "learning_rate": 6.310431282108622e-06, "loss": 0.3094, "step": 4015 }, { "epoch": 1.9189415208171554, "grad_norm": 0.4342745654994972, "learning_rate": 6.305536668567482e-06, "loss": 0.2909, "step": 4016 }, { "epoch": 1.9194193895227287, "grad_norm": 0.46848755594806835, "learning_rate": 6.3006430798053e-06, "loss": 0.317, "step": 4017 }, { "epoch": 1.9198972582283018, "grad_norm": 0.45778293002207393, "learning_rate": 6.295750517179471e-06, "loss": 0.3253, "step": 4018 }, { "epoch": 1.920375126933875, "grad_norm": 0.48174882670851016, "learning_rate": 6.29085898204711e-06, "loss": 0.2912, "step": 4019 }, { "epoch": 1.9208529956394482, "grad_norm": 0.44572761526104454, "learning_rate": 6.2859684757650365e-06, "loss": 0.3104, "step": 4020 }, { "epoch": 1.921330864345021, "grad_norm": 0.4477457574600347, "learning_rate": 6.281078999689794e-06, "loss": 0.3135, "step": 4021 }, { "epoch": 1.9218087330505944, "grad_norm": 0.4612358982666258, "learning_rate": 6.276190555177637e-06, "loss": 0.3121, "step": 4022 }, { "epoch": 1.9222866017561675, "grad_norm": 0.4575159149820703, "learning_rate": 6.271303143584532e-06, "loss": 0.3016, "step": 4023 }, { "epoch": 1.9227644704617406, "grad_norm": 0.4482684891108602, "learning_rate": 6.266416766266161e-06, "loss": 0.3073, "step": 4024 }, { "epoch": 1.923242339167314, "grad_norm": 0.4520583335071011, "learning_rate": 6.261531424577923e-06, "loss": 0.2965, "step": 4025 }, { "epoch": 1.9237202078728868, "grad_norm": 0.47269951843491825, "learning_rate": 6.256647119874919e-06, "loss": 0.3121, "step": 4026 }, { "epoch": 1.92419807657846, "grad_norm": 0.4758823780817171, "learning_rate": 6.251763853511974e-06, "loss": 0.315, "step": 4027 }, { "epoch": 1.9246759452840332, "grad_norm": 0.4471003165029847, "learning_rate": 6.24688162684362e-06, "loss": 0.308, "step": 4028 }, { "epoch": 1.9251538139896063, "grad_norm": 0.4842554500937279, "learning_rate": 6.242000441224096e-06, "loss": 0.3197, "step": 4029 }, { "epoch": 1.9256316826951796, "grad_norm": 0.4766633160283862, "learning_rate": 6.2371202980073596e-06, "loss": 0.3098, "step": 4030 }, { "epoch": 1.9261095514007527, "grad_norm": 0.4753968836250539, "learning_rate": 6.23224119854708e-06, "loss": 0.3108, "step": 4031 }, { "epoch": 1.9265874201063258, "grad_norm": 0.5527062509603616, "learning_rate": 6.227363144196625e-06, "loss": 0.2904, "step": 4032 }, { "epoch": 1.927065288811899, "grad_norm": 0.4689169847484964, "learning_rate": 6.222486136309087e-06, "loss": 0.2986, "step": 4033 }, { "epoch": 1.927543157517472, "grad_norm": 0.45259176341385676, "learning_rate": 6.217610176237263e-06, "loss": 0.3121, "step": 4034 }, { "epoch": 1.9280210262230453, "grad_norm": 0.45564864703780517, "learning_rate": 6.212735265333655e-06, "loss": 0.3016, "step": 4035 }, { "epoch": 1.9284988949286184, "grad_norm": 0.4559492608859359, "learning_rate": 6.207861404950477e-06, "loss": 0.3143, "step": 4036 }, { "epoch": 1.9289767636341915, "grad_norm": 0.4451863912439992, "learning_rate": 6.2029885964396544e-06, "loss": 0.3317, "step": 4037 }, { "epoch": 1.9294546323397648, "grad_norm": 0.4535151095832485, "learning_rate": 6.198116841152816e-06, "loss": 0.3132, "step": 4038 }, { "epoch": 1.9299325010453376, "grad_norm": 0.4459931426857373, "learning_rate": 6.1932461404412994e-06, "loss": 0.3109, "step": 4039 }, { "epoch": 1.930410369750911, "grad_norm": 0.5319828701956664, "learning_rate": 6.188376495656156e-06, "loss": 0.3035, "step": 4040 }, { "epoch": 1.930888238456484, "grad_norm": 0.46969694056011513, "learning_rate": 6.183507908148137e-06, "loss": 0.3163, "step": 4041 }, { "epoch": 1.9313661071620571, "grad_norm": 0.45244458398728504, "learning_rate": 6.178640379267702e-06, "loss": 0.3025, "step": 4042 }, { "epoch": 1.9318439758676305, "grad_norm": 0.45648090587965856, "learning_rate": 6.173773910365018e-06, "loss": 0.2901, "step": 4043 }, { "epoch": 1.9323218445732036, "grad_norm": 0.4610212804634707, "learning_rate": 6.168908502789961e-06, "loss": 0.3086, "step": 4044 }, { "epoch": 1.9327997132787766, "grad_norm": 0.46276231755271585, "learning_rate": 6.164044157892102e-06, "loss": 0.3272, "step": 4045 }, { "epoch": 1.93327758198435, "grad_norm": 0.4494641795334654, "learning_rate": 6.15918087702073e-06, "loss": 0.3214, "step": 4046 }, { "epoch": 1.9337554506899228, "grad_norm": 0.4495205402291461, "learning_rate": 6.154318661524832e-06, "loss": 0.3209, "step": 4047 }, { "epoch": 1.9342333193954961, "grad_norm": 0.4707004184620095, "learning_rate": 6.149457512753101e-06, "loss": 0.3053, "step": 4048 }, { "epoch": 1.9347111881010692, "grad_norm": 0.5357951441151997, "learning_rate": 6.144597432053932e-06, "loss": 0.3119, "step": 4049 }, { "epoch": 1.9351890568066423, "grad_norm": 0.4535876834445935, "learning_rate": 6.13973842077543e-06, "loss": 0.3023, "step": 4050 }, { "epoch": 1.9356669255122156, "grad_norm": 0.48376924175039243, "learning_rate": 6.134880480265393e-06, "loss": 0.3059, "step": 4051 }, { "epoch": 1.9361447942177885, "grad_norm": 0.49754708572204154, "learning_rate": 6.130023611871332e-06, "loss": 0.3139, "step": 4052 }, { "epoch": 1.9366226629233618, "grad_norm": 0.6147957148377768, "learning_rate": 6.1251678169404585e-06, "loss": 0.3017, "step": 4053 }, { "epoch": 1.937100531628935, "grad_norm": 0.4597512390463644, "learning_rate": 6.120313096819679e-06, "loss": 0.3184, "step": 4054 }, { "epoch": 1.937578400334508, "grad_norm": 0.4559692746363391, "learning_rate": 6.11545945285561e-06, "loss": 0.3085, "step": 4055 }, { "epoch": 1.9380562690400813, "grad_norm": 0.47922658966713283, "learning_rate": 6.110606886394568e-06, "loss": 0.3254, "step": 4056 }, { "epoch": 1.9385341377456544, "grad_norm": 0.4534671409403072, "learning_rate": 6.105755398782567e-06, "loss": 0.324, "step": 4057 }, { "epoch": 1.9390120064512275, "grad_norm": 0.4545090220212905, "learning_rate": 6.100904991365324e-06, "loss": 0.3079, "step": 4058 }, { "epoch": 1.9394898751568008, "grad_norm": 0.4762232060665426, "learning_rate": 6.096055665488261e-06, "loss": 0.3051, "step": 4059 }, { "epoch": 1.9399677438623737, "grad_norm": 0.5066682145376573, "learning_rate": 6.091207422496489e-06, "loss": 0.3159, "step": 4060 }, { "epoch": 1.940445612567947, "grad_norm": 0.44386055788270473, "learning_rate": 6.0863602637348284e-06, "loss": 0.3119, "step": 4061 }, { "epoch": 1.94092348127352, "grad_norm": 0.43903421860827463, "learning_rate": 6.081514190547797e-06, "loss": 0.3078, "step": 4062 }, { "epoch": 1.9414013499790932, "grad_norm": 0.47742457334875266, "learning_rate": 6.076669204279606e-06, "loss": 0.3176, "step": 4063 }, { "epoch": 1.9418792186846665, "grad_norm": 0.4665930593611481, "learning_rate": 6.071825306274173e-06, "loss": 0.3036, "step": 4064 }, { "epoch": 1.9423570873902394, "grad_norm": 0.5253515300391597, "learning_rate": 6.066982497875109e-06, "loss": 0.3198, "step": 4065 }, { "epoch": 1.9428349560958127, "grad_norm": 0.5049405090302596, "learning_rate": 6.0621407804257205e-06, "loss": 0.3048, "step": 4066 }, { "epoch": 1.9433128248013858, "grad_norm": 0.4700459384897942, "learning_rate": 6.057300155269017e-06, "loss": 0.3106, "step": 4067 }, { "epoch": 1.9437906935069589, "grad_norm": 0.4434734524599051, "learning_rate": 6.052460623747705e-06, "loss": 0.3237, "step": 4068 }, { "epoch": 1.9442685622125322, "grad_norm": 0.46014030921295707, "learning_rate": 6.0476221872041794e-06, "loss": 0.3223, "step": 4069 }, { "epoch": 1.9447464309181053, "grad_norm": 0.468750131948572, "learning_rate": 6.042784846980542e-06, "loss": 0.2991, "step": 4070 }, { "epoch": 1.9452242996236784, "grad_norm": 0.47243399384340334, "learning_rate": 6.037948604418584e-06, "loss": 0.3229, "step": 4071 }, { "epoch": 1.9457021683292517, "grad_norm": 0.48404534391010834, "learning_rate": 6.033113460859794e-06, "loss": 0.3042, "step": 4072 }, { "epoch": 1.9461800370348246, "grad_norm": 0.459036713881152, "learning_rate": 6.028279417645351e-06, "loss": 0.3125, "step": 4073 }, { "epoch": 1.9466579057403979, "grad_norm": 0.7112099246861842, "learning_rate": 6.023446476116141e-06, "loss": 0.3112, "step": 4074 }, { "epoch": 1.947135774445971, "grad_norm": 0.4643938545098871, "learning_rate": 6.018614637612733e-06, "loss": 0.2963, "step": 4075 }, { "epoch": 1.947613643151544, "grad_norm": 0.44678142004677796, "learning_rate": 6.013783903475396e-06, "loss": 0.3037, "step": 4076 }, { "epoch": 1.9480915118571174, "grad_norm": 0.5656950978702013, "learning_rate": 6.008954275044088e-06, "loss": 0.2962, "step": 4077 }, { "epoch": 1.9485693805626902, "grad_norm": 0.47423542716615724, "learning_rate": 6.004125753658461e-06, "loss": 0.3106, "step": 4078 }, { "epoch": 1.9490472492682636, "grad_norm": 0.45972148442045957, "learning_rate": 5.9992983406578666e-06, "loss": 0.3016, "step": 4079 }, { "epoch": 1.9495251179738367, "grad_norm": 0.48436905917421313, "learning_rate": 5.99447203738134e-06, "loss": 0.3036, "step": 4080 }, { "epoch": 1.9500029866794097, "grad_norm": 0.44553011226551475, "learning_rate": 5.989646845167614e-06, "loss": 0.2932, "step": 4081 }, { "epoch": 1.950480855384983, "grad_norm": 0.4607346503392114, "learning_rate": 5.984822765355113e-06, "loss": 0.3358, "step": 4082 }, { "epoch": 1.9509587240905562, "grad_norm": 0.4952981445916362, "learning_rate": 5.979999799281948e-06, "loss": 0.3151, "step": 4083 }, { "epoch": 1.9514365927961292, "grad_norm": 0.4675121455424015, "learning_rate": 5.975177948285929e-06, "loss": 0.3148, "step": 4084 }, { "epoch": 1.9519144615017026, "grad_norm": 0.45369885707604624, "learning_rate": 5.9703572137045495e-06, "loss": 0.3034, "step": 4085 }, { "epoch": 1.9523923302072754, "grad_norm": 0.4579342776702539, "learning_rate": 5.965537596874997e-06, "loss": 0.313, "step": 4086 }, { "epoch": 1.9528701989128487, "grad_norm": 0.6942835585119591, "learning_rate": 5.960719099134149e-06, "loss": 0.317, "step": 4087 }, { "epoch": 1.9533480676184218, "grad_norm": 0.46918542083573794, "learning_rate": 5.9559017218185724e-06, "loss": 0.309, "step": 4088 }, { "epoch": 1.953825936323995, "grad_norm": 0.45277876411651685, "learning_rate": 5.951085466264519e-06, "loss": 0.3032, "step": 4089 }, { "epoch": 1.9543038050295682, "grad_norm": 0.4690874120536212, "learning_rate": 5.946270333807937e-06, "loss": 0.3039, "step": 4090 }, { "epoch": 1.9547816737351411, "grad_norm": 0.4507823330535709, "learning_rate": 5.94145632578446e-06, "loss": 0.3077, "step": 4091 }, { "epoch": 1.9552595424407144, "grad_norm": 0.44925892593784034, "learning_rate": 5.9366434435294026e-06, "loss": 0.3155, "step": 4092 }, { "epoch": 1.9557374111462875, "grad_norm": 0.48240656700569656, "learning_rate": 5.9318316883777795e-06, "loss": 0.3208, "step": 4093 }, { "epoch": 1.9562152798518606, "grad_norm": 0.4604534040175604, "learning_rate": 5.927021061664287e-06, "loss": 0.2998, "step": 4094 }, { "epoch": 1.956693148557434, "grad_norm": 0.4526034134761243, "learning_rate": 5.922211564723302e-06, "loss": 0.2946, "step": 4095 }, { "epoch": 1.957171017263007, "grad_norm": 0.46416284670050006, "learning_rate": 5.9174031988888995e-06, "loss": 0.3118, "step": 4096 }, { "epoch": 1.95764888596858, "grad_norm": 0.45215047247388573, "learning_rate": 5.912595965494835e-06, "loss": 0.3182, "step": 4097 }, { "epoch": 1.9581267546741534, "grad_norm": 0.5546875852085521, "learning_rate": 5.907789865874547e-06, "loss": 0.3064, "step": 4098 }, { "epoch": 1.9586046233797263, "grad_norm": 0.4752323586160744, "learning_rate": 5.902984901361166e-06, "loss": 0.3278, "step": 4099 }, { "epoch": 1.9590824920852996, "grad_norm": 0.452399975076052, "learning_rate": 5.898181073287504e-06, "loss": 0.3028, "step": 4100 }, { "epoch": 1.9595603607908727, "grad_norm": 0.4578845518507173, "learning_rate": 5.893378382986057e-06, "loss": 0.2949, "step": 4101 }, { "epoch": 1.9600382294964458, "grad_norm": 0.4390145249730691, "learning_rate": 5.8885768317890054e-06, "loss": 0.3016, "step": 4102 }, { "epoch": 1.960516098202019, "grad_norm": 0.9622036180396034, "learning_rate": 5.883776421028219e-06, "loss": 0.3221, "step": 4103 }, { "epoch": 1.960993966907592, "grad_norm": 0.45023967591580155, "learning_rate": 5.878977152035243e-06, "loss": 0.3117, "step": 4104 }, { "epoch": 1.9614718356131653, "grad_norm": 0.44945352983330045, "learning_rate": 5.87417902614131e-06, "loss": 0.2991, "step": 4105 }, { "epoch": 1.9619497043187384, "grad_norm": 0.4507418181158918, "learning_rate": 5.869382044677341e-06, "loss": 0.3181, "step": 4106 }, { "epoch": 1.9624275730243115, "grad_norm": 0.44350954473838944, "learning_rate": 5.8645862089739215e-06, "loss": 0.3151, "step": 4107 }, { "epoch": 1.9629054417298848, "grad_norm": 0.5341361831039657, "learning_rate": 5.859791520361348e-06, "loss": 0.3072, "step": 4108 }, { "epoch": 1.9633833104354579, "grad_norm": 0.46709165414809023, "learning_rate": 5.854997980169572e-06, "loss": 0.3031, "step": 4109 }, { "epoch": 1.963861179141031, "grad_norm": 0.4280650147245011, "learning_rate": 5.850205589728239e-06, "loss": 0.3127, "step": 4110 }, { "epoch": 1.9643390478466043, "grad_norm": 0.44278861997127844, "learning_rate": 5.845414350366679e-06, "loss": 0.3177, "step": 4111 }, { "epoch": 1.9648169165521772, "grad_norm": 0.43260620835576497, "learning_rate": 5.8406242634138875e-06, "loss": 0.3133, "step": 4112 }, { "epoch": 1.9652947852577505, "grad_norm": 0.46599928724656164, "learning_rate": 5.835835330198558e-06, "loss": 0.317, "step": 4113 }, { "epoch": 1.9657726539633236, "grad_norm": 0.4868101146764809, "learning_rate": 5.83104755204905e-06, "loss": 0.3072, "step": 4114 }, { "epoch": 1.9662505226688967, "grad_norm": 0.43340141877422556, "learning_rate": 5.826260930293417e-06, "loss": 0.3041, "step": 4115 }, { "epoch": 1.96672839137447, "grad_norm": 0.4737663310328414, "learning_rate": 5.8214754662593765e-06, "loss": 0.3196, "step": 4116 }, { "epoch": 1.9672062600800428, "grad_norm": 0.4510243883089674, "learning_rate": 5.81669116127433e-06, "loss": 0.2846, "step": 4117 }, { "epoch": 1.9676841287856162, "grad_norm": 0.4721590552134255, "learning_rate": 5.811908016665369e-06, "loss": 0.2991, "step": 4118 }, { "epoch": 1.9681619974911893, "grad_norm": 0.4485701829219417, "learning_rate": 5.807126033759245e-06, "loss": 0.3201, "step": 4119 }, { "epoch": 1.9686398661967623, "grad_norm": 0.42971050420170925, "learning_rate": 5.802345213882396e-06, "loss": 0.2848, "step": 4120 }, { "epoch": 1.9691177349023357, "grad_norm": 0.5060676790854636, "learning_rate": 5.797565558360943e-06, "loss": 0.3189, "step": 4121 }, { "epoch": 1.9695956036079088, "grad_norm": 0.5368619185936723, "learning_rate": 5.792787068520674e-06, "loss": 0.3272, "step": 4122 }, { "epoch": 1.9700734723134818, "grad_norm": 0.4969544005475237, "learning_rate": 5.788009745687053e-06, "loss": 0.3044, "step": 4123 }, { "epoch": 1.9705513410190552, "grad_norm": 0.5192317905793113, "learning_rate": 5.7832335911852355e-06, "loss": 0.3073, "step": 4124 }, { "epoch": 1.971029209724628, "grad_norm": 0.4584957853344697, "learning_rate": 5.778458606340037e-06, "loss": 0.3182, "step": 4125 }, { "epoch": 1.9715070784302013, "grad_norm": 0.4826482249055619, "learning_rate": 5.7736847924759505e-06, "loss": 0.2936, "step": 4126 }, { "epoch": 1.9719849471357744, "grad_norm": 0.4565375353417649, "learning_rate": 5.7689121509171564e-06, "loss": 0.3002, "step": 4127 }, { "epoch": 1.9724628158413475, "grad_norm": 0.4524472919502672, "learning_rate": 5.764140682987496e-06, "loss": 0.3091, "step": 4128 }, { "epoch": 1.9729406845469208, "grad_norm": 0.47905618351860524, "learning_rate": 5.759370390010487e-06, "loss": 0.3123, "step": 4129 }, { "epoch": 1.9734185532524937, "grad_norm": 0.4588694980027561, "learning_rate": 5.754601273309333e-06, "loss": 0.3194, "step": 4130 }, { "epoch": 1.973896421958067, "grad_norm": 0.45707604771801996, "learning_rate": 5.749833334206897e-06, "loss": 0.3094, "step": 4131 }, { "epoch": 1.9743742906636401, "grad_norm": 0.4587355771749119, "learning_rate": 5.745066574025718e-06, "loss": 0.302, "step": 4132 }, { "epoch": 1.9748521593692132, "grad_norm": 0.4740061736075553, "learning_rate": 5.740300994088022e-06, "loss": 0.3304, "step": 4133 }, { "epoch": 1.9753300280747865, "grad_norm": 0.4566089421929211, "learning_rate": 5.735536595715687e-06, "loss": 0.3113, "step": 4134 }, { "epoch": 1.9758078967803596, "grad_norm": 0.4663581459744624, "learning_rate": 5.730773380230276e-06, "loss": 0.2976, "step": 4135 }, { "epoch": 1.9762857654859327, "grad_norm": 0.44354111986279265, "learning_rate": 5.726011348953023e-06, "loss": 0.303, "step": 4136 }, { "epoch": 1.976763634191506, "grad_norm": 0.43956310394463377, "learning_rate": 5.7212505032048315e-06, "loss": 0.3212, "step": 4137 }, { "epoch": 1.977241502897079, "grad_norm": 0.43772616497346545, "learning_rate": 5.716490844306271e-06, "loss": 0.3071, "step": 4138 }, { "epoch": 1.9777193716026522, "grad_norm": 0.5091670239004066, "learning_rate": 5.711732373577592e-06, "loss": 0.3224, "step": 4139 }, { "epoch": 1.9781972403082253, "grad_norm": 0.47000699212819913, "learning_rate": 5.70697509233871e-06, "loss": 0.29, "step": 4140 }, { "epoch": 1.9786751090137984, "grad_norm": 0.4704139528256369, "learning_rate": 5.702219001909206e-06, "loss": 0.2997, "step": 4141 }, { "epoch": 1.9791529777193717, "grad_norm": 0.4614015610752338, "learning_rate": 5.697464103608339e-06, "loss": 0.3103, "step": 4142 }, { "epoch": 1.9796308464249448, "grad_norm": 0.45164231657830634, "learning_rate": 5.692710398755039e-06, "loss": 0.3092, "step": 4143 }, { "epoch": 1.980108715130518, "grad_norm": 0.47238767638550566, "learning_rate": 5.687957888667894e-06, "loss": 0.3087, "step": 4144 }, { "epoch": 1.980586583836091, "grad_norm": 0.4513243311661097, "learning_rate": 5.683206574665165e-06, "loss": 0.3104, "step": 4145 }, { "epoch": 1.981064452541664, "grad_norm": 0.4721048544965712, "learning_rate": 5.678456458064788e-06, "loss": 0.306, "step": 4146 }, { "epoch": 1.9815423212472374, "grad_norm": 0.5125862122325965, "learning_rate": 5.673707540184359e-06, "loss": 0.2986, "step": 4147 }, { "epoch": 1.9820201899528105, "grad_norm": 0.4389466347186533, "learning_rate": 5.66895982234114e-06, "loss": 0.3081, "step": 4148 }, { "epoch": 1.9824980586583836, "grad_norm": 0.4407487284192881, "learning_rate": 5.664213305852073e-06, "loss": 0.3238, "step": 4149 }, { "epoch": 1.982975927363957, "grad_norm": 0.47364655963546964, "learning_rate": 5.6594679920337514e-06, "loss": 0.3118, "step": 4150 }, { "epoch": 1.9834537960695298, "grad_norm": 0.4304623563483301, "learning_rate": 5.6547238822024395e-06, "loss": 0.3229, "step": 4151 }, { "epoch": 1.983931664775103, "grad_norm": 0.45174215596394307, "learning_rate": 5.649980977674079e-06, "loss": 0.3041, "step": 4152 }, { "epoch": 1.9844095334806762, "grad_norm": 0.45178499648893194, "learning_rate": 5.6452392797642605e-06, "loss": 0.3035, "step": 4153 }, { "epoch": 1.9848874021862493, "grad_norm": 0.4521203750930736, "learning_rate": 5.640498789788246e-06, "loss": 0.3094, "step": 4154 }, { "epoch": 1.9853652708918226, "grad_norm": 0.47602827989218915, "learning_rate": 5.635759509060969e-06, "loss": 0.3116, "step": 4155 }, { "epoch": 1.9858431395973957, "grad_norm": 0.4801852786233523, "learning_rate": 5.631021438897023e-06, "loss": 0.3241, "step": 4156 }, { "epoch": 1.9863210083029688, "grad_norm": 0.44827145290236947, "learning_rate": 5.626284580610657e-06, "loss": 0.3137, "step": 4157 }, { "epoch": 1.9867988770085419, "grad_norm": 0.520461667707988, "learning_rate": 5.621548935515801e-06, "loss": 0.3203, "step": 4158 }, { "epoch": 1.987276745714115, "grad_norm": 0.4662181204209844, "learning_rate": 5.616814504926037e-06, "loss": 0.3011, "step": 4159 }, { "epoch": 1.9877546144196883, "grad_norm": 0.4531917716602397, "learning_rate": 5.612081290154607e-06, "loss": 0.3324, "step": 4160 }, { "epoch": 1.9882324831252614, "grad_norm": 0.47179765398325363, "learning_rate": 5.607349292514429e-06, "loss": 0.3167, "step": 4161 }, { "epoch": 1.9887103518308344, "grad_norm": 0.7180803991243624, "learning_rate": 5.602618513318072e-06, "loss": 0.3052, "step": 4162 }, { "epoch": 1.9891882205364078, "grad_norm": 0.4606351292564438, "learning_rate": 5.597888953877768e-06, "loss": 0.3148, "step": 4163 }, { "epoch": 1.9896660892419806, "grad_norm": 0.44908296173819723, "learning_rate": 5.5931606155054195e-06, "loss": 0.3084, "step": 4164 }, { "epoch": 1.990143957947554, "grad_norm": 0.5095275810168982, "learning_rate": 5.58843349951258e-06, "loss": 0.2959, "step": 4165 }, { "epoch": 1.990621826653127, "grad_norm": 0.46103156732406936, "learning_rate": 5.583707607210467e-06, "loss": 0.3, "step": 4166 }, { "epoch": 1.9910996953587001, "grad_norm": 0.45327848630075884, "learning_rate": 5.578982939909965e-06, "loss": 0.3107, "step": 4167 }, { "epoch": 1.9915775640642734, "grad_norm": 0.47076365416712834, "learning_rate": 5.574259498921608e-06, "loss": 0.292, "step": 4168 }, { "epoch": 1.9920554327698465, "grad_norm": 0.4667403906050238, "learning_rate": 5.569537285555596e-06, "loss": 0.3196, "step": 4169 }, { "epoch": 1.9925333014754196, "grad_norm": 0.4926455741619648, "learning_rate": 5.564816301121792e-06, "loss": 0.3176, "step": 4170 }, { "epoch": 1.9930111701809927, "grad_norm": 0.4940948150038382, "learning_rate": 5.5600965469297105e-06, "loss": 0.3132, "step": 4171 }, { "epoch": 1.9934890388865658, "grad_norm": 0.4516008067632424, "learning_rate": 5.555378024288525e-06, "loss": 0.3259, "step": 4172 }, { "epoch": 1.9939669075921391, "grad_norm": 0.45739246251961, "learning_rate": 5.550660734507077e-06, "loss": 0.3084, "step": 4173 }, { "epoch": 1.9944447762977122, "grad_norm": 0.4526553226283521, "learning_rate": 5.545944678893853e-06, "loss": 0.3158, "step": 4174 }, { "epoch": 1.9949226450032853, "grad_norm": 0.4659888379257696, "learning_rate": 5.541229858757011e-06, "loss": 0.2973, "step": 4175 }, { "epoch": 1.9954005137088586, "grad_norm": 0.45374769534588466, "learning_rate": 5.53651627540435e-06, "loss": 0.3078, "step": 4176 }, { "epoch": 1.9958783824144315, "grad_norm": 0.47368493531667, "learning_rate": 5.531803930143345e-06, "loss": 0.3152, "step": 4177 }, { "epoch": 1.9963562511200048, "grad_norm": 0.470660248933249, "learning_rate": 5.527092824281111e-06, "loss": 0.2873, "step": 4178 }, { "epoch": 1.996834119825578, "grad_norm": 0.4557800815216673, "learning_rate": 5.522382959124422e-06, "loss": 0.2972, "step": 4179 }, { "epoch": 1.997311988531151, "grad_norm": 0.4748327062670451, "learning_rate": 5.517674335979721e-06, "loss": 0.2962, "step": 4180 }, { "epoch": 1.9977898572367243, "grad_norm": 0.4718260147834405, "learning_rate": 5.512966956153093e-06, "loss": 0.3072, "step": 4181 }, { "epoch": 1.9982677259422974, "grad_norm": 0.4766665255326838, "learning_rate": 5.508260820950278e-06, "loss": 0.3006, "step": 4182 }, { "epoch": 1.9987455946478705, "grad_norm": 0.43482332106111293, "learning_rate": 5.503555931676681e-06, "loss": 0.3002, "step": 4183 }, { "epoch": 1.9992234633534436, "grad_norm": 0.5879689150189751, "learning_rate": 5.498852289637353e-06, "loss": 0.3064, "step": 4184 }, { "epoch": 1.9997013320590167, "grad_norm": 0.6039537333215967, "learning_rate": 5.494149896136998e-06, "loss": 0.2986, "step": 4185 }, { "epoch": 2.0, "grad_norm": 0.507643331231026, "learning_rate": 5.489448752479982e-06, "loss": 0.3016, "step": 4186 }, { "epoch": 2.0004778687055733, "grad_norm": 0.5557131392016892, "learning_rate": 5.484748859970319e-06, "loss": 0.2706, "step": 4187 }, { "epoch": 2.000955737411146, "grad_norm": 0.5129666479233886, "learning_rate": 5.48005021991167e-06, "loss": 0.2874, "step": 4188 }, { "epoch": 2.0014336061167195, "grad_norm": 0.48363044424144763, "learning_rate": 5.475352833607363e-06, "loss": 0.2666, "step": 4189 }, { "epoch": 2.0019114748222924, "grad_norm": 0.4571679864809817, "learning_rate": 5.470656702360367e-06, "loss": 0.2944, "step": 4190 }, { "epoch": 2.0023893435278657, "grad_norm": 0.49923351769769775, "learning_rate": 5.4659618274733e-06, "loss": 0.2708, "step": 4191 }, { "epoch": 2.002867212233439, "grad_norm": 0.5726085889940755, "learning_rate": 5.461268210248449e-06, "loss": 0.2778, "step": 4192 }, { "epoch": 2.003345080939012, "grad_norm": 0.5151146352983149, "learning_rate": 5.4565758519877354e-06, "loss": 0.2768, "step": 4193 }, { "epoch": 2.003822949644585, "grad_norm": 0.5408118208195342, "learning_rate": 5.45188475399273e-06, "loss": 0.2789, "step": 4194 }, { "epoch": 2.0043008183501585, "grad_norm": 0.5218303636526377, "learning_rate": 5.447194917564671e-06, "loss": 0.283, "step": 4195 }, { "epoch": 2.0047786870557314, "grad_norm": 0.48975172353959573, "learning_rate": 5.442506344004433e-06, "loss": 0.2583, "step": 4196 }, { "epoch": 2.0052565557613047, "grad_norm": 0.48789372594335206, "learning_rate": 5.437819034612536e-06, "loss": 0.2891, "step": 4197 }, { "epoch": 2.0057344244668776, "grad_norm": 0.5173557034209996, "learning_rate": 5.433132990689168e-06, "loss": 0.2718, "step": 4198 }, { "epoch": 2.006212293172451, "grad_norm": 0.4979488229723115, "learning_rate": 5.42844821353415e-06, "loss": 0.2763, "step": 4199 }, { "epoch": 2.006690161878024, "grad_norm": 0.46766644811447605, "learning_rate": 5.423764704446954e-06, "loss": 0.2718, "step": 4200 }, { "epoch": 2.007168030583597, "grad_norm": 0.4858229963672474, "learning_rate": 5.41908246472671e-06, "loss": 0.2717, "step": 4201 }, { "epoch": 2.0076458992891704, "grad_norm": 0.4871157460335423, "learning_rate": 5.414401495672183e-06, "loss": 0.2479, "step": 4202 }, { "epoch": 2.0081237679947432, "grad_norm": 0.4719329157303939, "learning_rate": 5.4097217985817885e-06, "loss": 0.2707, "step": 4203 }, { "epoch": 2.0086016367003166, "grad_norm": 0.4784620215856789, "learning_rate": 5.4050433747536e-06, "loss": 0.2793, "step": 4204 }, { "epoch": 2.00907950540589, "grad_norm": 0.4480022553241313, "learning_rate": 5.400366225485326e-06, "loss": 0.2551, "step": 4205 }, { "epoch": 2.0095573741114627, "grad_norm": 0.4862355352173036, "learning_rate": 5.395690352074321e-06, "loss": 0.277, "step": 4206 }, { "epoch": 2.010035242817036, "grad_norm": 0.4832048787344954, "learning_rate": 5.391015755817597e-06, "loss": 0.2741, "step": 4207 }, { "epoch": 2.0105131115226094, "grad_norm": 0.5260287493937732, "learning_rate": 5.386342438011798e-06, "loss": 0.2737, "step": 4208 }, { "epoch": 2.0109909802281822, "grad_norm": 0.46713974738731096, "learning_rate": 5.3816703999532225e-06, "loss": 0.2707, "step": 4209 }, { "epoch": 2.0114688489337555, "grad_norm": 0.48820679807463374, "learning_rate": 5.376999642937817e-06, "loss": 0.271, "step": 4210 }, { "epoch": 2.0119467176393284, "grad_norm": 0.4546420853003914, "learning_rate": 5.372330168261162e-06, "loss": 0.266, "step": 4211 }, { "epoch": 2.0124245863449017, "grad_norm": 0.5092550048537878, "learning_rate": 5.367661977218484e-06, "loss": 0.2718, "step": 4212 }, { "epoch": 2.012902455050475, "grad_norm": 0.6128690560615392, "learning_rate": 5.362995071104664e-06, "loss": 0.2576, "step": 4213 }, { "epoch": 2.013380323756048, "grad_norm": 0.45952890722819323, "learning_rate": 5.358329451214215e-06, "loss": 0.2868, "step": 4214 }, { "epoch": 2.0138581924616212, "grad_norm": 0.48006938509230707, "learning_rate": 5.353665118841296e-06, "loss": 0.2816, "step": 4215 }, { "epoch": 2.014336061167194, "grad_norm": 0.45754420825780606, "learning_rate": 5.3490020752797165e-06, "loss": 0.2846, "step": 4216 }, { "epoch": 2.0148139298727674, "grad_norm": 0.48929584524947306, "learning_rate": 5.344340321822919e-06, "loss": 0.2877, "step": 4217 }, { "epoch": 2.0152917985783407, "grad_norm": 0.4735902467266419, "learning_rate": 5.339679859763988e-06, "loss": 0.2622, "step": 4218 }, { "epoch": 2.0157696672839136, "grad_norm": 0.4723270046896756, "learning_rate": 5.33502069039566e-06, "loss": 0.283, "step": 4219 }, { "epoch": 2.016247535989487, "grad_norm": 0.47606202201064185, "learning_rate": 5.330362815010306e-06, "loss": 0.2811, "step": 4220 }, { "epoch": 2.0167254046950602, "grad_norm": 0.47024447912354794, "learning_rate": 5.325706234899931e-06, "loss": 0.2724, "step": 4221 }, { "epoch": 2.017203273400633, "grad_norm": 0.46214107281210676, "learning_rate": 5.321050951356197e-06, "loss": 0.2671, "step": 4222 }, { "epoch": 2.0176811421062064, "grad_norm": 0.4685381460638152, "learning_rate": 5.316396965670394e-06, "loss": 0.2763, "step": 4223 }, { "epoch": 2.0181590108117793, "grad_norm": 0.45819421363781176, "learning_rate": 5.3117442791334514e-06, "loss": 0.264, "step": 4224 }, { "epoch": 2.0186368795173526, "grad_norm": 0.45413644290919253, "learning_rate": 5.307092893035951e-06, "loss": 0.285, "step": 4225 }, { "epoch": 2.019114748222926, "grad_norm": 0.45027926657460016, "learning_rate": 5.3024428086681e-06, "loss": 0.2687, "step": 4226 }, { "epoch": 2.019592616928499, "grad_norm": 0.4656522036888355, "learning_rate": 5.297794027319747e-06, "loss": 0.2688, "step": 4227 }, { "epoch": 2.020070485634072, "grad_norm": 0.4763467893183739, "learning_rate": 5.293146550280388e-06, "loss": 0.2758, "step": 4228 }, { "epoch": 2.020548354339645, "grad_norm": 0.46374013514301876, "learning_rate": 5.28850037883915e-06, "loss": 0.2621, "step": 4229 }, { "epoch": 2.0210262230452183, "grad_norm": 0.48224561494719265, "learning_rate": 5.2838555142847925e-06, "loss": 0.2731, "step": 4230 }, { "epoch": 2.0215040917507916, "grad_norm": 0.6679479212555328, "learning_rate": 5.2792119579057275e-06, "loss": 0.2578, "step": 4231 }, { "epoch": 2.0219819604563645, "grad_norm": 0.49348773245256794, "learning_rate": 5.274569710989994e-06, "loss": 0.2573, "step": 4232 }, { "epoch": 2.022459829161938, "grad_norm": 0.47356875200053883, "learning_rate": 5.269928774825261e-06, "loss": 0.2815, "step": 4233 }, { "epoch": 2.022937697867511, "grad_norm": 0.531779784720976, "learning_rate": 5.265289150698855e-06, "loss": 0.286, "step": 4234 }, { "epoch": 2.023415566573084, "grad_norm": 0.46128402209008207, "learning_rate": 5.260650839897719e-06, "loss": 0.2742, "step": 4235 }, { "epoch": 2.0238934352786573, "grad_norm": 0.5084306437023877, "learning_rate": 5.256013843708435e-06, "loss": 0.3016, "step": 4236 }, { "epoch": 2.02437130398423, "grad_norm": 0.5186816576776002, "learning_rate": 5.251378163417232e-06, "loss": 0.2646, "step": 4237 }, { "epoch": 2.0248491726898035, "grad_norm": 0.7432820985970882, "learning_rate": 5.246743800309964e-06, "loss": 0.2893, "step": 4238 }, { "epoch": 2.025327041395377, "grad_norm": 0.5031043114302443, "learning_rate": 5.242110755672114e-06, "loss": 0.2574, "step": 4239 }, { "epoch": 2.0258049101009497, "grad_norm": 0.47460643773168915, "learning_rate": 5.237479030788817e-06, "loss": 0.2619, "step": 4240 }, { "epoch": 2.026282778806523, "grad_norm": 0.483227882308964, "learning_rate": 5.232848626944827e-06, "loss": 0.2796, "step": 4241 }, { "epoch": 2.026760647512096, "grad_norm": 0.5588184535169874, "learning_rate": 5.228219545424533e-06, "loss": 0.2732, "step": 4242 }, { "epoch": 2.027238516217669, "grad_norm": 0.4734462488933155, "learning_rate": 5.2235917875119656e-06, "loss": 0.2708, "step": 4243 }, { "epoch": 2.0277163849232425, "grad_norm": 0.4781528576455179, "learning_rate": 5.218965354490786e-06, "loss": 0.2753, "step": 4244 }, { "epoch": 2.0281942536288153, "grad_norm": 0.5011498208217005, "learning_rate": 5.214340247644278e-06, "loss": 0.2675, "step": 4245 }, { "epoch": 2.0286721223343886, "grad_norm": 0.48552354280981097, "learning_rate": 5.209716468255367e-06, "loss": 0.274, "step": 4246 }, { "epoch": 2.029149991039962, "grad_norm": 0.5513420020431447, "learning_rate": 5.205094017606611e-06, "loss": 0.2806, "step": 4247 }, { "epoch": 2.029627859745535, "grad_norm": 0.7169421131443054, "learning_rate": 5.2004728969801945e-06, "loss": 0.281, "step": 4248 }, { "epoch": 2.030105728451108, "grad_norm": 0.48421888141015607, "learning_rate": 5.19585310765793e-06, "loss": 0.2649, "step": 4249 }, { "epoch": 2.030583597156681, "grad_norm": 0.4750848932286133, "learning_rate": 5.191234650921273e-06, "loss": 0.2617, "step": 4250 }, { "epoch": 2.0310614658622543, "grad_norm": 0.45867486142995845, "learning_rate": 5.1866175280513e-06, "loss": 0.2652, "step": 4251 }, { "epoch": 2.0315393345678276, "grad_norm": 0.4993525355930482, "learning_rate": 5.182001740328713e-06, "loss": 0.2535, "step": 4252 }, { "epoch": 2.0320172032734005, "grad_norm": 0.478090275533228, "learning_rate": 5.17738728903386e-06, "loss": 0.2761, "step": 4253 }, { "epoch": 2.032495071978974, "grad_norm": 0.47697353288833244, "learning_rate": 5.172774175446703e-06, "loss": 0.2733, "step": 4254 }, { "epoch": 2.0329729406845467, "grad_norm": 0.46412526746517374, "learning_rate": 5.168162400846835e-06, "loss": 0.2818, "step": 4255 }, { "epoch": 2.03345080939012, "grad_norm": 0.4740087446649736, "learning_rate": 5.16355196651349e-06, "loss": 0.2696, "step": 4256 }, { "epoch": 2.0339286780956933, "grad_norm": 0.4895373321168128, "learning_rate": 5.158942873725514e-06, "loss": 0.2909, "step": 4257 }, { "epoch": 2.034406546801266, "grad_norm": 0.48823594256633335, "learning_rate": 5.154335123761387e-06, "loss": 0.2603, "step": 4258 }, { "epoch": 2.0348844155068395, "grad_norm": 0.4403350386861437, "learning_rate": 5.149728717899225e-06, "loss": 0.2739, "step": 4259 }, { "epoch": 2.035362284212413, "grad_norm": 0.4744290808806091, "learning_rate": 5.145123657416759e-06, "loss": 0.2665, "step": 4260 }, { "epoch": 2.0358401529179857, "grad_norm": 0.49559161901762544, "learning_rate": 5.140519943591348e-06, "loss": 0.2802, "step": 4261 }, { "epoch": 2.036318021623559, "grad_norm": 0.4945068985838198, "learning_rate": 5.135917577699988e-06, "loss": 0.2627, "step": 4262 }, { "epoch": 2.036795890329132, "grad_norm": 0.47314478267085935, "learning_rate": 5.131316561019293e-06, "loss": 0.2787, "step": 4263 }, { "epoch": 2.037273759034705, "grad_norm": 0.48612430158367564, "learning_rate": 5.126716894825496e-06, "loss": 0.2784, "step": 4264 }, { "epoch": 2.0377516277402785, "grad_norm": 0.4685580006629703, "learning_rate": 5.122118580394473e-06, "loss": 0.2604, "step": 4265 }, { "epoch": 2.0382294964458514, "grad_norm": 0.4876824775843487, "learning_rate": 5.117521619001713e-06, "loss": 0.2737, "step": 4266 }, { "epoch": 2.0387073651514247, "grad_norm": 0.4533392736697577, "learning_rate": 5.112926011922326e-06, "loss": 0.2571, "step": 4267 }, { "epoch": 2.0391852338569976, "grad_norm": 0.48519263547086117, "learning_rate": 5.10833176043106e-06, "loss": 0.2742, "step": 4268 }, { "epoch": 2.039663102562571, "grad_norm": 0.4562442726047049, "learning_rate": 5.103738865802277e-06, "loss": 0.2658, "step": 4269 }, { "epoch": 2.040140971268144, "grad_norm": 0.4672534849486868, "learning_rate": 5.099147329309959e-06, "loss": 0.256, "step": 4270 }, { "epoch": 2.040618839973717, "grad_norm": 0.4772694807476173, "learning_rate": 5.0945571522277255e-06, "loss": 0.2663, "step": 4271 }, { "epoch": 2.0410967086792904, "grad_norm": 0.5706905661498267, "learning_rate": 5.08996833582881e-06, "loss": 0.2859, "step": 4272 }, { "epoch": 2.0415745773848637, "grad_norm": 0.4947667148137507, "learning_rate": 5.0853808813860616e-06, "loss": 0.3067, "step": 4273 }, { "epoch": 2.0420524460904366, "grad_norm": 0.4851005503979229, "learning_rate": 5.080794790171968e-06, "loss": 0.2716, "step": 4274 }, { "epoch": 2.04253031479601, "grad_norm": 0.48079461477386043, "learning_rate": 5.076210063458622e-06, "loss": 0.2688, "step": 4275 }, { "epoch": 2.0430081835015828, "grad_norm": 0.4673487498716285, "learning_rate": 5.071626702517756e-06, "loss": 0.2632, "step": 4276 }, { "epoch": 2.043486052207156, "grad_norm": 0.4653754050455356, "learning_rate": 5.067044708620702e-06, "loss": 0.2827, "step": 4277 }, { "epoch": 2.0439639209127294, "grad_norm": 0.4697586401370185, "learning_rate": 5.062464083038434e-06, "loss": 0.2667, "step": 4278 }, { "epoch": 2.0444417896183023, "grad_norm": 0.5260281606453924, "learning_rate": 5.057884827041533e-06, "loss": 0.2515, "step": 4279 }, { "epoch": 2.0449196583238756, "grad_norm": 0.4994642856059735, "learning_rate": 5.0533069419002e-06, "loss": 0.2671, "step": 4280 }, { "epoch": 2.045397527029449, "grad_norm": 0.48493301210780937, "learning_rate": 5.048730428884268e-06, "loss": 0.2727, "step": 4281 }, { "epoch": 2.0458753957350218, "grad_norm": 0.46353129132177423, "learning_rate": 5.044155289263174e-06, "loss": 0.2765, "step": 4282 }, { "epoch": 2.046353264440595, "grad_norm": 0.5018313449986281, "learning_rate": 5.03958152430598e-06, "loss": 0.2727, "step": 4283 }, { "epoch": 2.046831133146168, "grad_norm": 0.4851187115521681, "learning_rate": 5.035009135281375e-06, "loss": 0.2652, "step": 4284 }, { "epoch": 2.0473090018517413, "grad_norm": 0.4723372968025927, "learning_rate": 5.030438123457655e-06, "loss": 0.2729, "step": 4285 }, { "epoch": 2.0477868705573146, "grad_norm": 0.47152065145636834, "learning_rate": 5.025868490102734e-06, "loss": 0.2724, "step": 4286 }, { "epoch": 2.0482647392628874, "grad_norm": 0.5260680519772822, "learning_rate": 5.021300236484156e-06, "loss": 0.2648, "step": 4287 }, { "epoch": 2.0487426079684607, "grad_norm": 0.47329119567837263, "learning_rate": 5.016733363869068e-06, "loss": 0.2713, "step": 4288 }, { "epoch": 2.0492204766740336, "grad_norm": 0.50642114847147, "learning_rate": 5.01216787352424e-06, "loss": 0.2638, "step": 4289 }, { "epoch": 2.049698345379607, "grad_norm": 0.4705941501131214, "learning_rate": 5.007603766716063e-06, "loss": 0.2643, "step": 4290 }, { "epoch": 2.0501762140851802, "grad_norm": 0.4803945099496459, "learning_rate": 5.003041044710536e-06, "loss": 0.2717, "step": 4291 }, { "epoch": 2.050654082790753, "grad_norm": 0.4612851976830831, "learning_rate": 4.998479708773275e-06, "loss": 0.2805, "step": 4292 }, { "epoch": 2.0511319514963264, "grad_norm": 0.4759211681133958, "learning_rate": 4.993919760169521e-06, "loss": 0.2659, "step": 4293 }, { "epoch": 2.0516098202018993, "grad_norm": 0.4979267001248353, "learning_rate": 4.98936120016412e-06, "loss": 0.2764, "step": 4294 }, { "epoch": 2.0520876889074726, "grad_norm": 0.5214662750542031, "learning_rate": 4.984804030021533e-06, "loss": 0.2707, "step": 4295 }, { "epoch": 2.052565557613046, "grad_norm": 0.4913707086962388, "learning_rate": 4.9802482510058445e-06, "loss": 0.2687, "step": 4296 }, { "epoch": 2.053043426318619, "grad_norm": 0.5027711346349333, "learning_rate": 4.975693864380744e-06, "loss": 0.2613, "step": 4297 }, { "epoch": 2.053521295024192, "grad_norm": 0.5024104113465125, "learning_rate": 4.971140871409536e-06, "loss": 0.2741, "step": 4298 }, { "epoch": 2.0539991637297654, "grad_norm": 0.4811287363006302, "learning_rate": 4.966589273355144e-06, "loss": 0.2689, "step": 4299 }, { "epoch": 2.0544770324353383, "grad_norm": 0.4644489936036945, "learning_rate": 4.962039071480102e-06, "loss": 0.2791, "step": 4300 }, { "epoch": 2.0549549011409116, "grad_norm": 0.5336660306893748, "learning_rate": 4.957490267046549e-06, "loss": 0.2803, "step": 4301 }, { "epoch": 2.0554327698464845, "grad_norm": 0.47071851786985547, "learning_rate": 4.95294286131625e-06, "loss": 0.2673, "step": 4302 }, { "epoch": 2.055910638552058, "grad_norm": 0.7828065837615993, "learning_rate": 4.948396855550575e-06, "loss": 0.2633, "step": 4303 }, { "epoch": 2.056388507257631, "grad_norm": 0.45464028072528234, "learning_rate": 4.943852251010498e-06, "loss": 0.2793, "step": 4304 }, { "epoch": 2.056866375963204, "grad_norm": 0.6373445627548838, "learning_rate": 4.939309048956622e-06, "loss": 0.2825, "step": 4305 }, { "epoch": 2.0573442446687773, "grad_norm": 0.4671574195509635, "learning_rate": 4.934767250649146e-06, "loss": 0.2825, "step": 4306 }, { "epoch": 2.0578221133743506, "grad_norm": 0.5249600277106131, "learning_rate": 4.9302268573478825e-06, "loss": 0.2798, "step": 4307 }, { "epoch": 2.0582999820799235, "grad_norm": 0.532751022686781, "learning_rate": 4.925687870312263e-06, "loss": 0.2704, "step": 4308 }, { "epoch": 2.058777850785497, "grad_norm": 0.4936173702678726, "learning_rate": 4.921150290801316e-06, "loss": 0.2791, "step": 4309 }, { "epoch": 2.0592557194910697, "grad_norm": 0.46696117374431717, "learning_rate": 4.9166141200736885e-06, "loss": 0.2717, "step": 4310 }, { "epoch": 2.059733588196643, "grad_norm": 0.4612537280249499, "learning_rate": 4.912079359387638e-06, "loss": 0.2712, "step": 4311 }, { "epoch": 2.0602114569022163, "grad_norm": 0.44406796450445546, "learning_rate": 4.907546010001026e-06, "loss": 0.2772, "step": 4312 }, { "epoch": 2.060689325607789, "grad_norm": 0.4685017422306754, "learning_rate": 4.903014073171315e-06, "loss": 0.2689, "step": 4313 }, { "epoch": 2.0611671943133625, "grad_norm": 0.49168518163616726, "learning_rate": 4.898483550155595e-06, "loss": 0.2685, "step": 4314 }, { "epoch": 2.0616450630189354, "grad_norm": 0.47506846437092554, "learning_rate": 4.89395444221055e-06, "loss": 0.2636, "step": 4315 }, { "epoch": 2.0621229317245087, "grad_norm": 0.44749028284198716, "learning_rate": 4.889426750592469e-06, "loss": 0.273, "step": 4316 }, { "epoch": 2.062600800430082, "grad_norm": 0.4643039012469354, "learning_rate": 4.884900476557263e-06, "loss": 0.2843, "step": 4317 }, { "epoch": 2.063078669135655, "grad_norm": 0.524850579695799, "learning_rate": 4.880375621360435e-06, "loss": 0.2732, "step": 4318 }, { "epoch": 2.063556537841228, "grad_norm": 0.4937697172955831, "learning_rate": 4.8758521862570975e-06, "loss": 0.2877, "step": 4319 }, { "epoch": 2.0640344065468015, "grad_norm": 0.4568437096178462, "learning_rate": 4.871330172501979e-06, "loss": 0.2591, "step": 4320 }, { "epoch": 2.0645122752523744, "grad_norm": 0.4818485682691764, "learning_rate": 4.866809581349403e-06, "loss": 0.2842, "step": 4321 }, { "epoch": 2.0649901439579477, "grad_norm": 0.500279053318082, "learning_rate": 4.862290414053296e-06, "loss": 0.2689, "step": 4322 }, { "epoch": 2.0654680126635205, "grad_norm": 0.4796867445554122, "learning_rate": 4.857772671867206e-06, "loss": 0.261, "step": 4323 }, { "epoch": 2.065945881369094, "grad_norm": 0.45209725252949634, "learning_rate": 4.853256356044269e-06, "loss": 0.263, "step": 4324 }, { "epoch": 2.066423750074667, "grad_norm": 0.5262595142130551, "learning_rate": 4.848741467837228e-06, "loss": 0.3065, "step": 4325 }, { "epoch": 2.06690161878024, "grad_norm": 0.5027222558313732, "learning_rate": 4.844228008498441e-06, "loss": 0.2771, "step": 4326 }, { "epoch": 2.0673794874858133, "grad_norm": 0.5140923745008394, "learning_rate": 4.839715979279857e-06, "loss": 0.2673, "step": 4327 }, { "epoch": 2.067857356191386, "grad_norm": 0.6688922617742407, "learning_rate": 4.835205381433033e-06, "loss": 0.2759, "step": 4328 }, { "epoch": 2.0683352248969595, "grad_norm": 0.5106868534593813, "learning_rate": 4.830696216209133e-06, "loss": 0.2707, "step": 4329 }, { "epoch": 2.068813093602533, "grad_norm": 0.5125392579142816, "learning_rate": 4.826188484858918e-06, "loss": 0.2796, "step": 4330 }, { "epoch": 2.0692909623081057, "grad_norm": 0.4731305023489138, "learning_rate": 4.821682188632749e-06, "loss": 0.2698, "step": 4331 }, { "epoch": 2.069768831013679, "grad_norm": 0.537078693052272, "learning_rate": 4.8171773287806e-06, "loss": 0.2763, "step": 4332 }, { "epoch": 2.0702466997192523, "grad_norm": 0.48986368836511185, "learning_rate": 4.812673906552038e-06, "loss": 0.2682, "step": 4333 }, { "epoch": 2.070724568424825, "grad_norm": 0.49970157825319683, "learning_rate": 4.808171923196227e-06, "loss": 0.2745, "step": 4334 }, { "epoch": 2.0712024371303985, "grad_norm": 0.46279135492922463, "learning_rate": 4.803671379961945e-06, "loss": 0.2732, "step": 4335 }, { "epoch": 2.0716803058359714, "grad_norm": 0.5228404024805013, "learning_rate": 4.7991722780975614e-06, "loss": 0.2569, "step": 4336 }, { "epoch": 2.0721581745415447, "grad_norm": 0.47489759603230547, "learning_rate": 4.794674618851044e-06, "loss": 0.2702, "step": 4337 }, { "epoch": 2.072636043247118, "grad_norm": 0.45711910658365396, "learning_rate": 4.7901784034699695e-06, "loss": 0.2581, "step": 4338 }, { "epoch": 2.073113911952691, "grad_norm": 0.5062614569880117, "learning_rate": 4.785683633201507e-06, "loss": 0.2494, "step": 4339 }, { "epoch": 2.073591780658264, "grad_norm": 0.4838987562625956, "learning_rate": 4.781190309292421e-06, "loss": 0.2666, "step": 4340 }, { "epoch": 2.074069649363837, "grad_norm": 0.48244525999987964, "learning_rate": 4.776698432989089e-06, "loss": 0.2641, "step": 4341 }, { "epoch": 2.0745475180694104, "grad_norm": 0.481389690872325, "learning_rate": 4.7722080055374745e-06, "loss": 0.2766, "step": 4342 }, { "epoch": 2.0750253867749837, "grad_norm": 0.547344894388519, "learning_rate": 4.767719028183139e-06, "loss": 0.2779, "step": 4343 }, { "epoch": 2.0755032554805566, "grad_norm": 0.4969679278468139, "learning_rate": 4.7632315021712494e-06, "loss": 0.2767, "step": 4344 }, { "epoch": 2.07598112418613, "grad_norm": 0.4969616199208339, "learning_rate": 4.758745428746569e-06, "loss": 0.2609, "step": 4345 }, { "epoch": 2.076458992891703, "grad_norm": 0.46420623564379426, "learning_rate": 4.754260809153453e-06, "loss": 0.2606, "step": 4346 }, { "epoch": 2.076936861597276, "grad_norm": 0.4624803013184737, "learning_rate": 4.749777644635851e-06, "loss": 0.2658, "step": 4347 }, { "epoch": 2.0774147303028494, "grad_norm": 0.4564411009275747, "learning_rate": 4.745295936437323e-06, "loss": 0.2893, "step": 4348 }, { "epoch": 2.0778925990084223, "grad_norm": 0.45522303982023976, "learning_rate": 4.74081568580101e-06, "loss": 0.2888, "step": 4349 }, { "epoch": 2.0783704677139956, "grad_norm": 0.45649179256830386, "learning_rate": 4.736336893969652e-06, "loss": 0.2627, "step": 4350 }, { "epoch": 2.078848336419569, "grad_norm": 0.8720177385632242, "learning_rate": 4.731859562185593e-06, "loss": 0.2504, "step": 4351 }, { "epoch": 2.0793262051251418, "grad_norm": 0.464019520768861, "learning_rate": 4.727383691690765e-06, "loss": 0.2803, "step": 4352 }, { "epoch": 2.079804073830715, "grad_norm": 0.4829753872971073, "learning_rate": 4.722909283726687e-06, "loss": 0.2747, "step": 4353 }, { "epoch": 2.080281942536288, "grad_norm": 0.49238958382928866, "learning_rate": 4.718436339534493e-06, "loss": 0.2696, "step": 4354 }, { "epoch": 2.0807598112418613, "grad_norm": 0.5331681056474048, "learning_rate": 4.7139648603548925e-06, "loss": 0.2803, "step": 4355 }, { "epoch": 2.0812376799474346, "grad_norm": 0.5073464839867855, "learning_rate": 4.709494847428193e-06, "loss": 0.2841, "step": 4356 }, { "epoch": 2.0817155486530075, "grad_norm": 0.45403717199200416, "learning_rate": 4.7050263019943035e-06, "loss": 0.2693, "step": 4357 }, { "epoch": 2.0821934173585808, "grad_norm": 0.4790899870289159, "learning_rate": 4.700559225292714e-06, "loss": 0.2723, "step": 4358 }, { "epoch": 2.082671286064154, "grad_norm": 0.5479563382822494, "learning_rate": 4.696093618562514e-06, "loss": 0.2827, "step": 4359 }, { "epoch": 2.083149154769727, "grad_norm": 0.4645988394382689, "learning_rate": 4.691629483042387e-06, "loss": 0.2834, "step": 4360 }, { "epoch": 2.0836270234753003, "grad_norm": 0.48819600396553203, "learning_rate": 4.687166819970605e-06, "loss": 0.2806, "step": 4361 }, { "epoch": 2.084104892180873, "grad_norm": 0.48443281966006396, "learning_rate": 4.682705630585024e-06, "loss": 0.2777, "step": 4362 }, { "epoch": 2.0845827608864465, "grad_norm": 0.4840904657426456, "learning_rate": 4.678245916123111e-06, "loss": 0.2632, "step": 4363 }, { "epoch": 2.0850606295920198, "grad_norm": 0.4817924591446688, "learning_rate": 4.673787677821906e-06, "loss": 0.2571, "step": 4364 }, { "epoch": 2.0855384982975926, "grad_norm": 0.4525499762733216, "learning_rate": 4.669330916918043e-06, "loss": 0.2837, "step": 4365 }, { "epoch": 2.086016367003166, "grad_norm": 0.4568315340280084, "learning_rate": 4.664875634647756e-06, "loss": 0.2785, "step": 4366 }, { "epoch": 2.086494235708739, "grad_norm": 0.5183117984656025, "learning_rate": 4.660421832246858e-06, "loss": 0.2936, "step": 4367 }, { "epoch": 2.086972104414312, "grad_norm": 0.4872870778331395, "learning_rate": 4.655969510950752e-06, "loss": 0.2733, "step": 4368 }, { "epoch": 2.0874499731198854, "grad_norm": 0.46677867691861946, "learning_rate": 4.65151867199444e-06, "loss": 0.2678, "step": 4369 }, { "epoch": 2.0879278418254583, "grad_norm": 0.46455148899464205, "learning_rate": 4.647069316612502e-06, "loss": 0.2687, "step": 4370 }, { "epoch": 2.0884057105310316, "grad_norm": 1.2838537701376698, "learning_rate": 4.6426214460391095e-06, "loss": 0.2844, "step": 4371 }, { "epoch": 2.088883579236605, "grad_norm": 0.47029125249903925, "learning_rate": 4.6381750615080275e-06, "loss": 0.2618, "step": 4372 }, { "epoch": 2.089361447942178, "grad_norm": 0.4888788479772587, "learning_rate": 4.633730164252603e-06, "loss": 0.2643, "step": 4373 }, { "epoch": 2.089839316647751, "grad_norm": 0.4591781777504566, "learning_rate": 4.629286755505768e-06, "loss": 0.2754, "step": 4374 }, { "epoch": 2.090317185353324, "grad_norm": 0.5110989494551736, "learning_rate": 4.624844836500052e-06, "loss": 0.2586, "step": 4375 }, { "epoch": 2.0907950540588973, "grad_norm": 0.4951473466562376, "learning_rate": 4.620404408467559e-06, "loss": 0.2701, "step": 4376 }, { "epoch": 2.0912729227644706, "grad_norm": 0.45662970354071386, "learning_rate": 4.615965472639992e-06, "loss": 0.2649, "step": 4377 }, { "epoch": 2.0917507914700435, "grad_norm": 0.46953730734656046, "learning_rate": 4.611528030248629e-06, "loss": 0.2712, "step": 4378 }, { "epoch": 2.092228660175617, "grad_norm": 0.47897826667944887, "learning_rate": 4.607092082524341e-06, "loss": 0.2544, "step": 4379 }, { "epoch": 2.0927065288811897, "grad_norm": 0.45754772942931315, "learning_rate": 4.60265763069758e-06, "loss": 0.2739, "step": 4380 }, { "epoch": 2.093184397586763, "grad_norm": 0.47343432218709175, "learning_rate": 4.598224675998381e-06, "loss": 0.2632, "step": 4381 }, { "epoch": 2.0936622662923363, "grad_norm": 0.5079217054437215, "learning_rate": 4.593793219656375e-06, "loss": 0.2884, "step": 4382 }, { "epoch": 2.094140134997909, "grad_norm": 0.4755846137336544, "learning_rate": 4.589363262900767e-06, "loss": 0.2769, "step": 4383 }, { "epoch": 2.0946180037034825, "grad_norm": 0.4624399910521351, "learning_rate": 4.5849348069603424e-06, "loss": 0.2833, "step": 4384 }, { "epoch": 2.095095872409056, "grad_norm": 0.47379072504138925, "learning_rate": 4.580507853063487e-06, "loss": 0.2723, "step": 4385 }, { "epoch": 2.0955737411146287, "grad_norm": 0.5404425798815802, "learning_rate": 4.5760824024381545e-06, "loss": 0.2836, "step": 4386 }, { "epoch": 2.096051609820202, "grad_norm": 0.4744131279778523, "learning_rate": 4.571658456311885e-06, "loss": 0.2646, "step": 4387 }, { "epoch": 2.096529478525775, "grad_norm": 0.5257445500444607, "learning_rate": 4.567236015911808e-06, "loss": 0.2756, "step": 4388 }, { "epoch": 2.097007347231348, "grad_norm": 0.46379662230318247, "learning_rate": 4.562815082464628e-06, "loss": 0.2577, "step": 4389 }, { "epoch": 2.0974852159369215, "grad_norm": 0.4711856915631454, "learning_rate": 4.5583956571966295e-06, "loss": 0.2695, "step": 4390 }, { "epoch": 2.0979630846424944, "grad_norm": 0.4668862612467058, "learning_rate": 4.5539777413336916e-06, "loss": 0.2811, "step": 4391 }, { "epoch": 2.0984409533480677, "grad_norm": 0.8250470635919427, "learning_rate": 4.549561336101263e-06, "loss": 0.2681, "step": 4392 }, { "epoch": 2.0989188220536406, "grad_norm": 0.5474390912842495, "learning_rate": 4.545146442724371e-06, "loss": 0.278, "step": 4393 }, { "epoch": 2.099396690759214, "grad_norm": 0.45864322699427934, "learning_rate": 4.540733062427637e-06, "loss": 0.2572, "step": 4394 }, { "epoch": 2.099874559464787, "grad_norm": 0.4642926001377059, "learning_rate": 4.5363211964352524e-06, "loss": 0.2754, "step": 4395 }, { "epoch": 2.10035242817036, "grad_norm": 1.2708133595604119, "learning_rate": 4.531910845970986e-06, "loss": 0.2515, "step": 4396 }, { "epoch": 2.1008302968759334, "grad_norm": 0.46107312847431264, "learning_rate": 4.527502012258201e-06, "loss": 0.2798, "step": 4397 }, { "epoch": 2.1013081655815067, "grad_norm": 0.4654804719208443, "learning_rate": 4.523094696519822e-06, "loss": 0.2599, "step": 4398 }, { "epoch": 2.1017860342870796, "grad_norm": 0.5035051987740295, "learning_rate": 4.5186888999783604e-06, "loss": 0.2796, "step": 4399 }, { "epoch": 2.102263902992653, "grad_norm": 0.4639728245767181, "learning_rate": 4.514284623855915e-06, "loss": 0.2711, "step": 4400 }, { "epoch": 2.1027417716982257, "grad_norm": 0.4628009973817066, "learning_rate": 4.509881869374146e-06, "loss": 0.2749, "step": 4401 }, { "epoch": 2.103219640403799, "grad_norm": 0.46310930729244, "learning_rate": 4.5054806377543e-06, "loss": 0.2766, "step": 4402 }, { "epoch": 2.1036975091093724, "grad_norm": 0.4815051304539407, "learning_rate": 4.501080930217206e-06, "loss": 0.2686, "step": 4403 }, { "epoch": 2.1041753778149452, "grad_norm": 0.452776883071896, "learning_rate": 4.4966827479832645e-06, "loss": 0.2815, "step": 4404 }, { "epoch": 2.1046532465205186, "grad_norm": 0.5572260774768857, "learning_rate": 4.4922860922724466e-06, "loss": 0.2795, "step": 4405 }, { "epoch": 2.1051311152260914, "grad_norm": 0.46536041364576214, "learning_rate": 4.487890964304317e-06, "loss": 0.2607, "step": 4406 }, { "epoch": 2.1056089839316647, "grad_norm": 0.4671149161280602, "learning_rate": 4.483497365298001e-06, "loss": 0.2656, "step": 4407 }, { "epoch": 2.106086852637238, "grad_norm": 0.47754698203735124, "learning_rate": 4.479105296472204e-06, "loss": 0.2636, "step": 4408 }, { "epoch": 2.106564721342811, "grad_norm": 0.4691739905835083, "learning_rate": 4.474714759045213e-06, "loss": 0.2797, "step": 4409 }, { "epoch": 2.1070425900483842, "grad_norm": 0.5429458246931929, "learning_rate": 4.470325754234881e-06, "loss": 0.2727, "step": 4410 }, { "epoch": 2.1075204587539575, "grad_norm": 0.5405366505121745, "learning_rate": 4.465938283258643e-06, "loss": 0.2734, "step": 4411 }, { "epoch": 2.1079983274595304, "grad_norm": 0.7314425834233769, "learning_rate": 4.461552347333509e-06, "loss": 0.2515, "step": 4412 }, { "epoch": 2.1084761961651037, "grad_norm": 0.6100608642443537, "learning_rate": 4.457167947676058e-06, "loss": 0.2697, "step": 4413 }, { "epoch": 2.1089540648706766, "grad_norm": 0.4837731698853667, "learning_rate": 4.45278508550244e-06, "loss": 0.2907, "step": 4414 }, { "epoch": 2.10943193357625, "grad_norm": 0.47440995597138375, "learning_rate": 4.448403762028391e-06, "loss": 0.2709, "step": 4415 }, { "epoch": 2.1099098022818232, "grad_norm": 0.5691216197594825, "learning_rate": 4.444023978469212e-06, "loss": 0.2778, "step": 4416 }, { "epoch": 2.110387670987396, "grad_norm": 0.4528201720944284, "learning_rate": 4.4396457360397704e-06, "loss": 0.2619, "step": 4417 }, { "epoch": 2.1108655396929694, "grad_norm": 0.7031077512368775, "learning_rate": 4.435269035954523e-06, "loss": 0.2658, "step": 4418 }, { "epoch": 2.1113434083985423, "grad_norm": 0.4719207523909015, "learning_rate": 4.430893879427486e-06, "loss": 0.2622, "step": 4419 }, { "epoch": 2.1118212771041156, "grad_norm": 0.4711222028375983, "learning_rate": 4.426520267672244e-06, "loss": 0.2908, "step": 4420 }, { "epoch": 2.112299145809689, "grad_norm": 0.5717573571560005, "learning_rate": 4.422148201901969e-06, "loss": 0.272, "step": 4421 }, { "epoch": 2.112777014515262, "grad_norm": 0.48469162216982337, "learning_rate": 4.4177776833293915e-06, "loss": 0.2789, "step": 4422 }, { "epoch": 2.113254883220835, "grad_norm": 0.4591193788359184, "learning_rate": 4.4134087131668135e-06, "loss": 0.2855, "step": 4423 }, { "epoch": 2.1137327519264084, "grad_norm": 0.451459503417534, "learning_rate": 4.409041292626115e-06, "loss": 0.2718, "step": 4424 }, { "epoch": 2.1142106206319813, "grad_norm": 0.5180358351813875, "learning_rate": 4.40467542291874e-06, "loss": 0.2669, "step": 4425 }, { "epoch": 2.1146884893375546, "grad_norm": 0.48205075176726636, "learning_rate": 4.400311105255698e-06, "loss": 0.2659, "step": 4426 }, { "epoch": 2.1151663580431275, "grad_norm": 0.4544529548772736, "learning_rate": 4.395948340847584e-06, "loss": 0.2739, "step": 4427 }, { "epoch": 2.115644226748701, "grad_norm": 0.44998143432924953, "learning_rate": 4.391587130904544e-06, "loss": 0.2568, "step": 4428 }, { "epoch": 2.116122095454274, "grad_norm": 0.4857935888324327, "learning_rate": 4.387227476636301e-06, "loss": 0.2564, "step": 4429 }, { "epoch": 2.116599964159847, "grad_norm": 0.482388126357244, "learning_rate": 4.382869379252152e-06, "loss": 0.269, "step": 4430 }, { "epoch": 2.1170778328654203, "grad_norm": 0.46675426655921626, "learning_rate": 4.378512839960953e-06, "loss": 0.2701, "step": 4431 }, { "epoch": 2.1175557015709936, "grad_norm": 0.4567298710073206, "learning_rate": 4.374157859971127e-06, "loss": 0.2582, "step": 4432 }, { "epoch": 2.1180335702765665, "grad_norm": 1.0653818376694562, "learning_rate": 4.369804440490676e-06, "loss": 0.2658, "step": 4433 }, { "epoch": 2.11851143898214, "grad_norm": 0.4691784307308584, "learning_rate": 4.3654525827271576e-06, "loss": 0.2797, "step": 4434 }, { "epoch": 2.1189893076877127, "grad_norm": 1.179538881720667, "learning_rate": 4.361102287887698e-06, "loss": 0.2675, "step": 4435 }, { "epoch": 2.119467176393286, "grad_norm": 0.470987292017048, "learning_rate": 4.356753557178999e-06, "loss": 0.2782, "step": 4436 }, { "epoch": 2.1199450450988593, "grad_norm": 0.46682100572447127, "learning_rate": 4.352406391807318e-06, "loss": 0.2718, "step": 4437 }, { "epoch": 2.120422913804432, "grad_norm": 0.5044197345551845, "learning_rate": 4.348060792978479e-06, "loss": 0.2655, "step": 4438 }, { "epoch": 2.1209007825100055, "grad_norm": 0.4681899555394538, "learning_rate": 4.34371676189788e-06, "loss": 0.2658, "step": 4439 }, { "epoch": 2.1213786512155783, "grad_norm": 0.4703827383388374, "learning_rate": 4.339374299770477e-06, "loss": 0.2728, "step": 4440 }, { "epoch": 2.1218565199211517, "grad_norm": 0.48211199249212094, "learning_rate": 4.335033407800787e-06, "loss": 0.2863, "step": 4441 }, { "epoch": 2.122334388626725, "grad_norm": 0.48843268326528816, "learning_rate": 4.330694087192906e-06, "loss": 0.2813, "step": 4442 }, { "epoch": 2.122812257332298, "grad_norm": 0.5164345291433518, "learning_rate": 4.32635633915048e-06, "loss": 0.2712, "step": 4443 }, { "epoch": 2.123290126037871, "grad_norm": 0.46371527898883846, "learning_rate": 4.322020164876722e-06, "loss": 0.2639, "step": 4444 }, { "epoch": 2.123767994743444, "grad_norm": 0.48627299210477126, "learning_rate": 4.31768556557441e-06, "loss": 0.2661, "step": 4445 }, { "epoch": 2.1242458634490173, "grad_norm": 0.5076108512131333, "learning_rate": 4.313352542445892e-06, "loss": 0.2763, "step": 4446 }, { "epoch": 2.1247237321545906, "grad_norm": 0.4982314792290385, "learning_rate": 4.309021096693069e-06, "loss": 0.2623, "step": 4447 }, { "epoch": 2.1252016008601635, "grad_norm": 0.5098348084219093, "learning_rate": 4.3046912295174015e-06, "loss": 0.2598, "step": 4448 }, { "epoch": 2.125679469565737, "grad_norm": 0.4748754501419402, "learning_rate": 4.300362942119929e-06, "loss": 0.2707, "step": 4449 }, { "epoch": 2.12615733827131, "grad_norm": 0.47655181277542424, "learning_rate": 4.296036235701235e-06, "loss": 0.2738, "step": 4450 }, { "epoch": 2.126635206976883, "grad_norm": 0.7574458511584494, "learning_rate": 4.29171111146147e-06, "loss": 0.2991, "step": 4451 }, { "epoch": 2.1271130756824563, "grad_norm": 0.47222019908644236, "learning_rate": 4.2873875706003535e-06, "loss": 0.2535, "step": 4452 }, { "epoch": 2.127590944388029, "grad_norm": 0.45799953680867805, "learning_rate": 4.283065614317156e-06, "loss": 0.2612, "step": 4453 }, { "epoch": 2.1280688130936025, "grad_norm": 0.522281047897022, "learning_rate": 4.278745243810709e-06, "loss": 0.2844, "step": 4454 }, { "epoch": 2.128546681799176, "grad_norm": 0.4660306281172418, "learning_rate": 4.274426460279412e-06, "loss": 0.2466, "step": 4455 }, { "epoch": 2.1290245505047487, "grad_norm": 0.4785082179915365, "learning_rate": 4.270109264921221e-06, "loss": 0.2797, "step": 4456 }, { "epoch": 2.129502419210322, "grad_norm": 0.48055449069442646, "learning_rate": 4.26579365893364e-06, "loss": 0.2984, "step": 4457 }, { "epoch": 2.1299802879158953, "grad_norm": 0.5098722373675344, "learning_rate": 4.261479643513753e-06, "loss": 0.26, "step": 4458 }, { "epoch": 2.130458156621468, "grad_norm": 0.49399059503460835, "learning_rate": 4.257167219858187e-06, "loss": 0.2683, "step": 4459 }, { "epoch": 2.1309360253270415, "grad_norm": 0.5080365898455398, "learning_rate": 4.252856389163128e-06, "loss": 0.2754, "step": 4460 }, { "epoch": 2.1314138940326144, "grad_norm": 0.5554438469805086, "learning_rate": 4.248547152624334e-06, "loss": 0.2601, "step": 4461 }, { "epoch": 2.1318917627381877, "grad_norm": 0.48601767105637644, "learning_rate": 4.244239511437105e-06, "loss": 0.2734, "step": 4462 }, { "epoch": 2.132369631443761, "grad_norm": 0.46636918648450326, "learning_rate": 4.239933466796301e-06, "loss": 0.2573, "step": 4463 }, { "epoch": 2.132847500149334, "grad_norm": 0.4882887183192489, "learning_rate": 4.235629019896352e-06, "loss": 0.2736, "step": 4464 }, { "epoch": 2.133325368854907, "grad_norm": 0.4776658176673478, "learning_rate": 4.231326171931231e-06, "loss": 0.2692, "step": 4465 }, { "epoch": 2.13380323756048, "grad_norm": 0.4866846981727982, "learning_rate": 4.227024924094469e-06, "loss": 0.2577, "step": 4466 }, { "epoch": 2.1342811062660534, "grad_norm": 0.4980447283271286, "learning_rate": 4.222725277579164e-06, "loss": 0.2876, "step": 4467 }, { "epoch": 2.1347589749716267, "grad_norm": 0.4623292009049184, "learning_rate": 4.218427233577956e-06, "loss": 0.2736, "step": 4468 }, { "epoch": 2.1352368436771996, "grad_norm": 0.48084141681866016, "learning_rate": 4.214130793283046e-06, "loss": 0.2801, "step": 4469 }, { "epoch": 2.135714712382773, "grad_norm": 0.4649177248019152, "learning_rate": 4.209835957886196e-06, "loss": 0.2729, "step": 4470 }, { "epoch": 2.1361925810883458, "grad_norm": 0.47322585157208374, "learning_rate": 4.205542728578714e-06, "loss": 0.2757, "step": 4471 }, { "epoch": 2.136670449793919, "grad_norm": 0.4583877785505501, "learning_rate": 4.2012511065514636e-06, "loss": 0.2689, "step": 4472 }, { "epoch": 2.1371483184994924, "grad_norm": 0.4870399136505855, "learning_rate": 4.196961092994871e-06, "loss": 0.2805, "step": 4473 }, { "epoch": 2.1376261872050653, "grad_norm": 0.4685135597479858, "learning_rate": 4.192672689098908e-06, "loss": 0.2648, "step": 4474 }, { "epoch": 2.1381040559106386, "grad_norm": 0.47535234788643843, "learning_rate": 4.188385896053098e-06, "loss": 0.2791, "step": 4475 }, { "epoch": 2.138581924616212, "grad_norm": 0.5302293391229079, "learning_rate": 4.184100715046529e-06, "loss": 0.2731, "step": 4476 }, { "epoch": 2.1390597933217848, "grad_norm": 0.5240248943881992, "learning_rate": 4.179817147267829e-06, "loss": 0.2638, "step": 4477 }, { "epoch": 2.139537662027358, "grad_norm": 0.4619458564887166, "learning_rate": 4.1755351939051845e-06, "loss": 0.2643, "step": 4478 }, { "epoch": 2.140015530732931, "grad_norm": 0.4702047131010919, "learning_rate": 4.171254856146335e-06, "loss": 0.2615, "step": 4479 }, { "epoch": 2.1404933994385043, "grad_norm": 0.4773661172330248, "learning_rate": 4.166976135178575e-06, "loss": 0.2865, "step": 4480 }, { "epoch": 2.1409712681440776, "grad_norm": 0.47727793467693286, "learning_rate": 4.1626990321887425e-06, "loss": 0.2621, "step": 4481 }, { "epoch": 2.1414491368496504, "grad_norm": 0.4871383170567948, "learning_rate": 4.1584235483632265e-06, "loss": 0.2651, "step": 4482 }, { "epoch": 2.1419270055552238, "grad_norm": 0.4714614885059131, "learning_rate": 4.154149684887977e-06, "loss": 0.2794, "step": 4483 }, { "epoch": 2.142404874260797, "grad_norm": 0.4969507858614288, "learning_rate": 4.149877442948486e-06, "loss": 0.2622, "step": 4484 }, { "epoch": 2.14288274296637, "grad_norm": 0.507286683219254, "learning_rate": 4.1456068237297964e-06, "loss": 0.2664, "step": 4485 }, { "epoch": 2.1433606116719432, "grad_norm": 0.4810756637158575, "learning_rate": 4.1413378284165065e-06, "loss": 0.2584, "step": 4486 }, { "epoch": 2.143838480377516, "grad_norm": 0.49276173044738886, "learning_rate": 4.1370704581927575e-06, "loss": 0.2645, "step": 4487 }, { "epoch": 2.1443163490830894, "grad_norm": 0.5062183592646868, "learning_rate": 4.13280471424224e-06, "loss": 0.2681, "step": 4488 }, { "epoch": 2.1447942177886627, "grad_norm": 0.49099788891531865, "learning_rate": 4.128540597748203e-06, "loss": 0.2761, "step": 4489 }, { "epoch": 2.1452720864942356, "grad_norm": 0.518176466847488, "learning_rate": 4.124278109893432e-06, "loss": 0.2663, "step": 4490 }, { "epoch": 2.145749955199809, "grad_norm": 0.49131335284040245, "learning_rate": 4.120017251860266e-06, "loss": 0.2871, "step": 4491 }, { "epoch": 2.146227823905382, "grad_norm": 0.5072033097189093, "learning_rate": 4.115758024830595e-06, "loss": 0.2814, "step": 4492 }, { "epoch": 2.146705692610955, "grad_norm": 0.4876564849105318, "learning_rate": 4.111500429985853e-06, "loss": 0.2759, "step": 4493 }, { "epoch": 2.1471835613165284, "grad_norm": 0.4421916536653529, "learning_rate": 4.1072444685070155e-06, "loss": 0.2618, "step": 4494 }, { "epoch": 2.1476614300221013, "grad_norm": 0.46331905163767734, "learning_rate": 4.10299014157462e-06, "loss": 0.2763, "step": 4495 }, { "epoch": 2.1481392987276746, "grad_norm": 0.5158897747210883, "learning_rate": 4.098737450368738e-06, "loss": 0.2773, "step": 4496 }, { "epoch": 2.1486171674332475, "grad_norm": 0.45891199284426043, "learning_rate": 4.094486396068987e-06, "loss": 0.2774, "step": 4497 }, { "epoch": 2.149095036138821, "grad_norm": 0.47523435146742227, "learning_rate": 4.0902369798545426e-06, "loss": 0.2551, "step": 4498 }, { "epoch": 2.149572904844394, "grad_norm": 0.44933432383147964, "learning_rate": 4.085989202904113e-06, "loss": 0.2606, "step": 4499 }, { "epoch": 2.150050773549967, "grad_norm": 0.47676276634177017, "learning_rate": 4.0817430663959536e-06, "loss": 0.2785, "step": 4500 }, { "epoch": 2.1505286422555403, "grad_norm": 0.45939776538275545, "learning_rate": 4.077498571507874e-06, "loss": 0.2714, "step": 4501 }, { "epoch": 2.1510065109611136, "grad_norm": 0.4702734224108815, "learning_rate": 4.073255719417221e-06, "loss": 0.2615, "step": 4502 }, { "epoch": 2.1514843796666865, "grad_norm": 0.4910412013612873, "learning_rate": 4.0690145113008815e-06, "loss": 0.2566, "step": 4503 }, { "epoch": 2.15196224837226, "grad_norm": 0.48456487516342267, "learning_rate": 4.064774948335299e-06, "loss": 0.2683, "step": 4504 }, { "epoch": 2.1524401170778327, "grad_norm": 0.4873630350755374, "learning_rate": 4.06053703169645e-06, "loss": 0.2628, "step": 4505 }, { "epoch": 2.152917985783406, "grad_norm": 0.512860605368886, "learning_rate": 4.056300762559855e-06, "loss": 0.267, "step": 4506 }, { "epoch": 2.1533958544889793, "grad_norm": 0.5806471029367954, "learning_rate": 4.052066142100587e-06, "loss": 0.2632, "step": 4507 }, { "epoch": 2.153873723194552, "grad_norm": 0.4748345500291511, "learning_rate": 4.047833171493251e-06, "loss": 0.2567, "step": 4508 }, { "epoch": 2.1543515919001255, "grad_norm": 0.4789611805358055, "learning_rate": 4.043601851911996e-06, "loss": 0.2696, "step": 4509 }, { "epoch": 2.154829460605699, "grad_norm": 0.49413914811960363, "learning_rate": 4.039372184530521e-06, "loss": 0.2575, "step": 4510 }, { "epoch": 2.1553073293112717, "grad_norm": 0.5070625267068553, "learning_rate": 4.035144170522055e-06, "loss": 0.2765, "step": 4511 }, { "epoch": 2.155785198016845, "grad_norm": 0.45705266978001025, "learning_rate": 4.030917811059378e-06, "loss": 0.2705, "step": 4512 }, { "epoch": 2.156263066722418, "grad_norm": 1.5569910667354723, "learning_rate": 4.02669310731481e-06, "loss": 0.2803, "step": 4513 }, { "epoch": 2.156740935427991, "grad_norm": 0.49066820686495577, "learning_rate": 4.0224700604602085e-06, "loss": 0.2786, "step": 4514 }, { "epoch": 2.1572188041335645, "grad_norm": 0.47471065389215006, "learning_rate": 4.0182486716669656e-06, "loss": 0.2566, "step": 4515 }, { "epoch": 2.1576966728391374, "grad_norm": 0.46201303993122544, "learning_rate": 4.014028942106028e-06, "loss": 0.2644, "step": 4516 }, { "epoch": 2.1581745415447107, "grad_norm": 0.4810938505072536, "learning_rate": 4.009810872947873e-06, "loss": 0.2723, "step": 4517 }, { "epoch": 2.1586524102502835, "grad_norm": 0.45055790887900427, "learning_rate": 4.005594465362512e-06, "loss": 0.2731, "step": 4518 }, { "epoch": 2.159130278955857, "grad_norm": 0.5023239987249768, "learning_rate": 4.00137972051951e-06, "loss": 0.2721, "step": 4519 }, { "epoch": 2.15960814766143, "grad_norm": 0.507580098147782, "learning_rate": 3.9971666395879605e-06, "loss": 0.2635, "step": 4520 }, { "epoch": 2.160086016367003, "grad_norm": 0.47302338046197384, "learning_rate": 3.992955223736493e-06, "loss": 0.2621, "step": 4521 }, { "epoch": 2.1605638850725764, "grad_norm": 0.4426581792894162, "learning_rate": 3.9887454741332874e-06, "loss": 0.2738, "step": 4522 }, { "epoch": 2.161041753778149, "grad_norm": 0.5038924092723644, "learning_rate": 3.984537391946051e-06, "loss": 0.2859, "step": 4523 }, { "epoch": 2.1615196224837225, "grad_norm": 0.5097699500822241, "learning_rate": 3.980330978342027e-06, "loss": 0.2836, "step": 4524 }, { "epoch": 2.161997491189296, "grad_norm": 0.4997990150539291, "learning_rate": 3.9761262344880096e-06, "loss": 0.2862, "step": 4525 }, { "epoch": 2.1624753598948687, "grad_norm": 0.5068009705170106, "learning_rate": 3.971923161550314e-06, "loss": 0.2617, "step": 4526 }, { "epoch": 2.162953228600442, "grad_norm": 0.5121923973953294, "learning_rate": 3.967721760694796e-06, "loss": 0.2677, "step": 4527 }, { "epoch": 2.1634310973060153, "grad_norm": 0.4586590918299566, "learning_rate": 3.963522033086858e-06, "loss": 0.2812, "step": 4528 }, { "epoch": 2.163908966011588, "grad_norm": 0.48386944735308773, "learning_rate": 3.959323979891427e-06, "loss": 0.2657, "step": 4529 }, { "epoch": 2.1643868347171615, "grad_norm": 0.48660859734621076, "learning_rate": 3.9551276022729644e-06, "loss": 0.292, "step": 4530 }, { "epoch": 2.1648647034227344, "grad_norm": 0.571916496126486, "learning_rate": 3.9509329013954775e-06, "loss": 0.2669, "step": 4531 }, { "epoch": 2.1653425721283077, "grad_norm": 0.4731522622117423, "learning_rate": 3.946739878422502e-06, "loss": 0.2674, "step": 4532 }, { "epoch": 2.165820440833881, "grad_norm": 0.4585592677843948, "learning_rate": 3.942548534517102e-06, "loss": 0.2754, "step": 4533 }, { "epoch": 2.166298309539454, "grad_norm": 0.45992994288587097, "learning_rate": 3.938358870841891e-06, "loss": 0.2842, "step": 4534 }, { "epoch": 2.166776178245027, "grad_norm": 0.48157396171772915, "learning_rate": 3.9341708885590034e-06, "loss": 0.2721, "step": 4535 }, { "epoch": 2.1672540469506005, "grad_norm": 0.46636131218315996, "learning_rate": 3.9299845888301084e-06, "loss": 0.2545, "step": 4536 }, { "epoch": 2.1677319156561734, "grad_norm": 0.48593748291916344, "learning_rate": 3.925799972816419e-06, "loss": 0.2478, "step": 4537 }, { "epoch": 2.1682097843617467, "grad_norm": 0.5044885208397369, "learning_rate": 3.921617041678669e-06, "loss": 0.2679, "step": 4538 }, { "epoch": 2.1686876530673196, "grad_norm": 0.4667942698450788, "learning_rate": 3.917435796577128e-06, "loss": 0.2522, "step": 4539 }, { "epoch": 2.169165521772893, "grad_norm": 0.4949207913026006, "learning_rate": 3.913256238671607e-06, "loss": 0.2694, "step": 4540 }, { "epoch": 2.169643390478466, "grad_norm": 0.4922718382704214, "learning_rate": 3.909078369121435e-06, "loss": 0.272, "step": 4541 }, { "epoch": 2.170121259184039, "grad_norm": 0.5568367688738352, "learning_rate": 3.904902189085479e-06, "loss": 0.2744, "step": 4542 }, { "epoch": 2.1705991278896124, "grad_norm": 0.4687165591772829, "learning_rate": 3.900727699722144e-06, "loss": 0.2758, "step": 4543 }, { "epoch": 2.1710769965951853, "grad_norm": 0.45872513093731365, "learning_rate": 3.896554902189355e-06, "loss": 0.2723, "step": 4544 }, { "epoch": 2.1715548653007586, "grad_norm": 0.4637298698793799, "learning_rate": 3.89238379764457e-06, "loss": 0.2448, "step": 4545 }, { "epoch": 2.172032734006332, "grad_norm": 0.48363038021481214, "learning_rate": 3.888214387244783e-06, "loss": 0.2657, "step": 4546 }, { "epoch": 2.1725106027119048, "grad_norm": 0.4550871624285722, "learning_rate": 3.884046672146518e-06, "loss": 0.2854, "step": 4547 }, { "epoch": 2.172988471417478, "grad_norm": 0.495376118420438, "learning_rate": 3.879880653505824e-06, "loss": 0.2633, "step": 4548 }, { "epoch": 2.1734663401230514, "grad_norm": 0.46098656670137583, "learning_rate": 3.875716332478275e-06, "loss": 0.2657, "step": 4549 }, { "epoch": 2.1739442088286243, "grad_norm": 0.4751772768586887, "learning_rate": 3.871553710218988e-06, "loss": 0.2837, "step": 4550 }, { "epoch": 2.1744220775341976, "grad_norm": 0.47152321412549136, "learning_rate": 3.867392787882599e-06, "loss": 0.2638, "step": 4551 }, { "epoch": 2.1748999462397705, "grad_norm": 0.44704792893751294, "learning_rate": 3.8632335666232686e-06, "loss": 0.2602, "step": 4552 }, { "epoch": 2.1753778149453438, "grad_norm": 0.6555651445287016, "learning_rate": 3.859076047594701e-06, "loss": 0.2518, "step": 4553 }, { "epoch": 2.175855683650917, "grad_norm": 0.4869920295121113, "learning_rate": 3.854920231950113e-06, "loss": 0.2821, "step": 4554 }, { "epoch": 2.17633355235649, "grad_norm": 0.45671533130726394, "learning_rate": 3.850766120842252e-06, "loss": 0.2801, "step": 4555 }, { "epoch": 2.1768114210620633, "grad_norm": 0.4608951885566162, "learning_rate": 3.846613715423402e-06, "loss": 0.2658, "step": 4556 }, { "epoch": 2.177289289767636, "grad_norm": 0.46418435619628584, "learning_rate": 3.842463016845362e-06, "loss": 0.2735, "step": 4557 }, { "epoch": 2.1777671584732095, "grad_norm": 0.47932679730203265, "learning_rate": 3.838314026259462e-06, "loss": 0.25, "step": 4558 }, { "epoch": 2.1782450271787828, "grad_norm": 0.47237247496031814, "learning_rate": 3.8341667448165645e-06, "loss": 0.262, "step": 4559 }, { "epoch": 2.1787228958843556, "grad_norm": 0.5186757703706331, "learning_rate": 3.830021173667048e-06, "loss": 0.2884, "step": 4560 }, { "epoch": 2.179200764589929, "grad_norm": 0.44583744351159743, "learning_rate": 3.8258773139608185e-06, "loss": 0.2725, "step": 4561 }, { "epoch": 2.1796786332955023, "grad_norm": 0.4772180156318942, "learning_rate": 3.821735166847316e-06, "loss": 0.2641, "step": 4562 }, { "epoch": 2.180156502001075, "grad_norm": 0.5094816309115656, "learning_rate": 3.817594733475494e-06, "loss": 0.2657, "step": 4563 }, { "epoch": 2.1806343707066485, "grad_norm": 0.4456377533344565, "learning_rate": 3.813456014993835e-06, "loss": 0.2637, "step": 4564 }, { "epoch": 2.1811122394122213, "grad_norm": 0.47505222662407803, "learning_rate": 3.809319012550352e-06, "loss": 0.2713, "step": 4565 }, { "epoch": 2.1815901081177946, "grad_norm": 0.5173681589868041, "learning_rate": 3.8051837272925728e-06, "loss": 0.2813, "step": 4566 }, { "epoch": 2.182067976823368, "grad_norm": 0.5015740347460304, "learning_rate": 3.80105016036755e-06, "loss": 0.2708, "step": 4567 }, { "epoch": 2.182545845528941, "grad_norm": 0.45211083979868544, "learning_rate": 3.796918312921868e-06, "loss": 0.2774, "step": 4568 }, { "epoch": 2.183023714234514, "grad_norm": 0.4580673412053882, "learning_rate": 3.792788186101626e-06, "loss": 0.2786, "step": 4569 }, { "epoch": 2.183501582940087, "grad_norm": 0.483142068598242, "learning_rate": 3.788659781052444e-06, "loss": 0.2662, "step": 4570 }, { "epoch": 2.1839794516456603, "grad_norm": 0.4585071424628377, "learning_rate": 3.7845330989194762e-06, "loss": 0.2537, "step": 4571 }, { "epoch": 2.1844573203512336, "grad_norm": 0.46364154074230174, "learning_rate": 3.780408140847387e-06, "loss": 0.2703, "step": 4572 }, { "epoch": 2.1849351890568065, "grad_norm": 0.47912865130428883, "learning_rate": 3.7762849079803654e-06, "loss": 0.2599, "step": 4573 }, { "epoch": 2.18541305776238, "grad_norm": 0.486225553510062, "learning_rate": 3.772163401462129e-06, "loss": 0.2707, "step": 4574 }, { "epoch": 2.185890926467953, "grad_norm": 0.5017350180630475, "learning_rate": 3.7680436224359084e-06, "loss": 0.278, "step": 4575 }, { "epoch": 2.186368795173526, "grad_norm": 0.44633892621886945, "learning_rate": 3.7639255720444532e-06, "loss": 0.2564, "step": 4576 }, { "epoch": 2.1868466638790993, "grad_norm": 0.47140634322880026, "learning_rate": 3.7598092514300456e-06, "loss": 0.2706, "step": 4577 }, { "epoch": 2.187324532584672, "grad_norm": 0.47775263124381995, "learning_rate": 3.7556946617344757e-06, "loss": 0.2768, "step": 4578 }, { "epoch": 2.1878024012902455, "grad_norm": 0.4662918039512788, "learning_rate": 3.751581804099056e-06, "loss": 0.2702, "step": 4579 }, { "epoch": 2.188280269995819, "grad_norm": 0.4526279489034841, "learning_rate": 3.747470679664624e-06, "loss": 0.27, "step": 4580 }, { "epoch": 2.1887581387013917, "grad_norm": 0.461833024668242, "learning_rate": 3.7433612895715356e-06, "loss": 0.2525, "step": 4581 }, { "epoch": 2.189236007406965, "grad_norm": 0.45967111066950117, "learning_rate": 3.739253634959661e-06, "loss": 0.2755, "step": 4582 }, { "epoch": 2.1897138761125383, "grad_norm": 0.4888680597465754, "learning_rate": 3.735147716968386e-06, "loss": 0.2578, "step": 4583 }, { "epoch": 2.190191744818111, "grad_norm": 0.4986060150836118, "learning_rate": 3.731043536736628e-06, "loss": 0.2694, "step": 4584 }, { "epoch": 2.1906696135236845, "grad_norm": 0.4543474936433639, "learning_rate": 3.7269410954028107e-06, "loss": 0.2592, "step": 4585 }, { "epoch": 2.1911474822292574, "grad_norm": 1.2235953943030329, "learning_rate": 3.7228403941048753e-06, "loss": 0.2595, "step": 4586 }, { "epoch": 2.1916253509348307, "grad_norm": 0.48594813503110623, "learning_rate": 3.7187414339802906e-06, "loss": 0.2774, "step": 4587 }, { "epoch": 2.192103219640404, "grad_norm": 0.4911464829074915, "learning_rate": 3.7146442161660336e-06, "loss": 0.2636, "step": 4588 }, { "epoch": 2.192581088345977, "grad_norm": 0.47380456620012534, "learning_rate": 3.710548741798594e-06, "loss": 0.265, "step": 4589 }, { "epoch": 2.19305895705155, "grad_norm": 0.4706478201898707, "learning_rate": 3.706455012013994e-06, "loss": 0.2758, "step": 4590 }, { "epoch": 2.193536825757123, "grad_norm": 0.4635842650188313, "learning_rate": 3.702363027947757e-06, "loss": 0.2703, "step": 4591 }, { "epoch": 2.1940146944626964, "grad_norm": 0.4655317804125815, "learning_rate": 3.6982727907349247e-06, "loss": 0.2718, "step": 4592 }, { "epoch": 2.1944925631682697, "grad_norm": 0.4495126115308564, "learning_rate": 3.694184301510063e-06, "loss": 0.2737, "step": 4593 }, { "epoch": 2.1949704318738426, "grad_norm": 0.4535524871952706, "learning_rate": 3.6900975614072433e-06, "loss": 0.2621, "step": 4594 }, { "epoch": 2.195448300579416, "grad_norm": 0.44992841504040854, "learning_rate": 3.6860125715600513e-06, "loss": 0.2805, "step": 4595 }, { "epoch": 2.1959261692849887, "grad_norm": 0.45869083287966966, "learning_rate": 3.6819293331015993e-06, "loss": 0.2865, "step": 4596 }, { "epoch": 2.196404037990562, "grad_norm": 0.46107098925354134, "learning_rate": 3.6778478471645008e-06, "loss": 0.2624, "step": 4597 }, { "epoch": 2.1968819066961354, "grad_norm": 0.44462298567559727, "learning_rate": 3.6737681148808855e-06, "loss": 0.2602, "step": 4598 }, { "epoch": 2.1973597754017082, "grad_norm": 0.4596846501814117, "learning_rate": 3.6696901373824056e-06, "loss": 0.2721, "step": 4599 }, { "epoch": 2.1978376441072816, "grad_norm": 0.6638891345781536, "learning_rate": 3.665613915800217e-06, "loss": 0.261, "step": 4600 }, { "epoch": 2.198315512812855, "grad_norm": 0.49610959186848946, "learning_rate": 3.6615394512649884e-06, "loss": 0.2774, "step": 4601 }, { "epoch": 2.1987933815184277, "grad_norm": 0.5056345462489438, "learning_rate": 3.65746674490691e-06, "loss": 0.2835, "step": 4602 }, { "epoch": 2.199271250224001, "grad_norm": 0.44612162373312525, "learning_rate": 3.6533957978556777e-06, "loss": 0.2798, "step": 4603 }, { "epoch": 2.199749118929574, "grad_norm": 0.4722815796388612, "learning_rate": 3.6493266112404947e-06, "loss": 0.2571, "step": 4604 }, { "epoch": 2.2002269876351472, "grad_norm": 0.4719966235892821, "learning_rate": 3.6452591861900886e-06, "loss": 0.2504, "step": 4605 }, { "epoch": 2.2007048563407205, "grad_norm": 0.4738688790143464, "learning_rate": 3.641193523832689e-06, "loss": 0.2642, "step": 4606 }, { "epoch": 2.2011827250462934, "grad_norm": 0.5033486532463676, "learning_rate": 3.637129625296035e-06, "loss": 0.2634, "step": 4607 }, { "epoch": 2.2016605937518667, "grad_norm": 0.4960105505883145, "learning_rate": 3.633067491707387e-06, "loss": 0.2978, "step": 4608 }, { "epoch": 2.20213846245744, "grad_norm": 0.47252895928191396, "learning_rate": 3.6290071241935067e-06, "loss": 0.2712, "step": 4609 }, { "epoch": 2.202616331163013, "grad_norm": 0.46564634687060924, "learning_rate": 3.6249485238806637e-06, "loss": 0.2749, "step": 4610 }, { "epoch": 2.2030941998685862, "grad_norm": 0.47341686338285643, "learning_rate": 3.62089169189465e-06, "loss": 0.2481, "step": 4611 }, { "epoch": 2.203572068574159, "grad_norm": 0.4813130606312593, "learning_rate": 3.6168366293607526e-06, "loss": 0.2686, "step": 4612 }, { "epoch": 2.2040499372797324, "grad_norm": 0.4572672198704176, "learning_rate": 3.612783337403776e-06, "loss": 0.2702, "step": 4613 }, { "epoch": 2.2045278059853057, "grad_norm": 0.4506090325494477, "learning_rate": 3.6087318171480368e-06, "loss": 0.2668, "step": 4614 }, { "epoch": 2.2050056746908786, "grad_norm": 0.47410093157277494, "learning_rate": 3.6046820697173514e-06, "loss": 0.2699, "step": 4615 }, { "epoch": 2.205483543396452, "grad_norm": 0.46061091915894664, "learning_rate": 3.600634096235046e-06, "loss": 0.2567, "step": 4616 }, { "epoch": 2.205961412102025, "grad_norm": 0.4868564505945191, "learning_rate": 3.596587897823962e-06, "loss": 0.2779, "step": 4617 }, { "epoch": 2.206439280807598, "grad_norm": 0.4832363834974607, "learning_rate": 3.59254347560644e-06, "loss": 0.2638, "step": 4618 }, { "epoch": 2.2069171495131714, "grad_norm": 0.5648718684599673, "learning_rate": 3.58850083070433e-06, "loss": 0.274, "step": 4619 }, { "epoch": 2.2073950182187443, "grad_norm": 0.4940852949638152, "learning_rate": 3.5844599642389965e-06, "loss": 0.2675, "step": 4620 }, { "epoch": 2.2078728869243176, "grad_norm": 0.4787783856445962, "learning_rate": 3.5804208773313e-06, "loss": 0.2678, "step": 4621 }, { "epoch": 2.2083507556298905, "grad_norm": 0.4985708934791662, "learning_rate": 3.576383571101609e-06, "loss": 0.2607, "step": 4622 }, { "epoch": 2.208828624335464, "grad_norm": 0.4467068890356679, "learning_rate": 3.572348046669809e-06, "loss": 0.2782, "step": 4623 }, { "epoch": 2.209306493041037, "grad_norm": 0.624896596323991, "learning_rate": 3.5683143051552784e-06, "loss": 0.2758, "step": 4624 }, { "epoch": 2.20978436174661, "grad_norm": 0.49994003409060306, "learning_rate": 3.564282347676903e-06, "loss": 0.2788, "step": 4625 }, { "epoch": 2.2102622304521833, "grad_norm": 0.47742176535286496, "learning_rate": 3.560252175353084e-06, "loss": 0.266, "step": 4626 }, { "epoch": 2.2107400991577566, "grad_norm": 0.47656971421522326, "learning_rate": 3.556223789301716e-06, "loss": 0.2681, "step": 4627 }, { "epoch": 2.2112179678633295, "grad_norm": 0.569244955801473, "learning_rate": 3.552197190640203e-06, "loss": 0.2668, "step": 4628 }, { "epoch": 2.211695836568903, "grad_norm": 0.4642389811727502, "learning_rate": 3.5481723804854485e-06, "loss": 0.2682, "step": 4629 }, { "epoch": 2.2121737052744757, "grad_norm": 0.5017601977034839, "learning_rate": 3.54414935995387e-06, "loss": 0.257, "step": 4630 }, { "epoch": 2.212651573980049, "grad_norm": 0.4986978616999572, "learning_rate": 3.540128130161381e-06, "loss": 0.2618, "step": 4631 }, { "epoch": 2.2131294426856223, "grad_norm": 0.6552711808635082, "learning_rate": 3.5361086922233944e-06, "loss": 0.2771, "step": 4632 }, { "epoch": 2.213607311391195, "grad_norm": 0.5079540763737515, "learning_rate": 3.53209104725484e-06, "loss": 0.2715, "step": 4633 }, { "epoch": 2.2140851800967685, "grad_norm": 0.5478379669178205, "learning_rate": 3.5280751963701356e-06, "loss": 0.253, "step": 4634 }, { "epoch": 2.214563048802342, "grad_norm": 0.481691741150192, "learning_rate": 3.524061140683206e-06, "loss": 0.2822, "step": 4635 }, { "epoch": 2.2150409175079147, "grad_norm": 0.46183826966564606, "learning_rate": 3.520048881307486e-06, "loss": 0.277, "step": 4636 }, { "epoch": 2.215518786213488, "grad_norm": 0.6334146180967442, "learning_rate": 3.5160384193559017e-06, "loss": 0.2745, "step": 4637 }, { "epoch": 2.215996654919061, "grad_norm": 0.4842285182462256, "learning_rate": 3.512029755940882e-06, "loss": 0.2682, "step": 4638 }, { "epoch": 2.216474523624634, "grad_norm": 0.47466868378671784, "learning_rate": 3.5080228921743653e-06, "loss": 0.2902, "step": 4639 }, { "epoch": 2.2169523923302075, "grad_norm": 0.51214418083174, "learning_rate": 3.5040178291677816e-06, "loss": 0.2522, "step": 4640 }, { "epoch": 2.2174302610357803, "grad_norm": 0.4665166989520757, "learning_rate": 3.5000145680320617e-06, "loss": 0.265, "step": 4641 }, { "epoch": 2.2179081297413537, "grad_norm": 0.48603526531215135, "learning_rate": 3.496013109877646e-06, "loss": 0.2772, "step": 4642 }, { "epoch": 2.2183859984469265, "grad_norm": 0.5575762979046887, "learning_rate": 3.4920134558144645e-06, "loss": 0.2589, "step": 4643 }, { "epoch": 2.2188638671525, "grad_norm": 0.4895518470618773, "learning_rate": 3.48801560695195e-06, "loss": 0.2759, "step": 4644 }, { "epoch": 2.219341735858073, "grad_norm": 0.6734863253510791, "learning_rate": 3.4840195643990383e-06, "loss": 0.2593, "step": 4645 }, { "epoch": 2.219819604563646, "grad_norm": 0.48164159307131443, "learning_rate": 3.4800253292641574e-06, "loss": 0.2496, "step": 4646 }, { "epoch": 2.2202974732692193, "grad_norm": 0.5194894835045092, "learning_rate": 3.476032902655239e-06, "loss": 0.2731, "step": 4647 }, { "epoch": 2.220775341974792, "grad_norm": 0.5048689424410697, "learning_rate": 3.4720422856797163e-06, "loss": 0.272, "step": 4648 }, { "epoch": 2.2212532106803655, "grad_norm": 0.4555324657809056, "learning_rate": 3.468053479444512e-06, "loss": 0.275, "step": 4649 }, { "epoch": 2.221731079385939, "grad_norm": 0.49021751307121847, "learning_rate": 3.464066485056048e-06, "loss": 0.2744, "step": 4650 }, { "epoch": 2.2222089480915117, "grad_norm": 0.4410281797101959, "learning_rate": 3.460081303620252e-06, "loss": 0.2757, "step": 4651 }, { "epoch": 2.222686816797085, "grad_norm": 0.5014942315206841, "learning_rate": 3.4560979362425406e-06, "loss": 0.2797, "step": 4652 }, { "epoch": 2.2231646855026583, "grad_norm": 0.45504543451237, "learning_rate": 3.452116384027826e-06, "loss": 0.2573, "step": 4653 }, { "epoch": 2.223642554208231, "grad_norm": 0.4803760937380114, "learning_rate": 3.4481366480805266e-06, "loss": 0.2483, "step": 4654 }, { "epoch": 2.2241204229138045, "grad_norm": 0.457505611780426, "learning_rate": 3.444158729504549e-06, "loss": 0.2645, "step": 4655 }, { "epoch": 2.2245982916193774, "grad_norm": 0.49598384831058834, "learning_rate": 3.4401826294032924e-06, "loss": 0.2738, "step": 4656 }, { "epoch": 2.2250761603249507, "grad_norm": 0.47321887795699297, "learning_rate": 3.436208348879665e-06, "loss": 0.2852, "step": 4657 }, { "epoch": 2.225554029030524, "grad_norm": 0.47386541058260834, "learning_rate": 3.4322358890360586e-06, "loss": 0.2809, "step": 4658 }, { "epoch": 2.226031897736097, "grad_norm": 0.4403468594295977, "learning_rate": 3.4282652509743596e-06, "loss": 0.2586, "step": 4659 }, { "epoch": 2.22650976644167, "grad_norm": 0.48721058760605696, "learning_rate": 3.4242964357959597e-06, "loss": 0.277, "step": 4660 }, { "epoch": 2.2269876351472435, "grad_norm": 0.5068739043457748, "learning_rate": 3.4203294446017354e-06, "loss": 0.2519, "step": 4661 }, { "epoch": 2.2274655038528164, "grad_norm": 0.4420653696705306, "learning_rate": 3.416364278492057e-06, "loss": 0.2831, "step": 4662 }, { "epoch": 2.2279433725583897, "grad_norm": 0.486948816480161, "learning_rate": 3.4124009385667967e-06, "loss": 0.2734, "step": 4663 }, { "epoch": 2.2284212412639626, "grad_norm": 0.49182635386072504, "learning_rate": 3.408439425925313e-06, "loss": 0.2615, "step": 4664 }, { "epoch": 2.228899109969536, "grad_norm": 0.4713713292607333, "learning_rate": 3.4044797416664564e-06, "loss": 0.2789, "step": 4665 }, { "epoch": 2.229376978675109, "grad_norm": 0.4553911870292443, "learning_rate": 3.4005218868885794e-06, "loss": 0.2865, "step": 4666 }, { "epoch": 2.229854847380682, "grad_norm": 0.48295582603522347, "learning_rate": 3.396565862689518e-06, "loss": 0.2564, "step": 4667 }, { "epoch": 2.2303327160862554, "grad_norm": 0.5092986006165036, "learning_rate": 3.3926116701666013e-06, "loss": 0.2805, "step": 4668 }, { "epoch": 2.2308105847918283, "grad_norm": 0.45858410229406726, "learning_rate": 3.3886593104166575e-06, "loss": 0.2815, "step": 4669 }, { "epoch": 2.2312884534974016, "grad_norm": 0.4947175505998691, "learning_rate": 3.3847087845359996e-06, "loss": 0.2752, "step": 4670 }, { "epoch": 2.231766322202975, "grad_norm": 0.4978122665367534, "learning_rate": 3.38076009362043e-06, "loss": 0.2818, "step": 4671 }, { "epoch": 2.2322441909085478, "grad_norm": 0.5749495630859802, "learning_rate": 3.376813238765252e-06, "loss": 0.2616, "step": 4672 }, { "epoch": 2.232722059614121, "grad_norm": 0.46541839278207525, "learning_rate": 3.3728682210652497e-06, "loss": 0.2738, "step": 4673 }, { "epoch": 2.233199928319694, "grad_norm": 0.4822660477000455, "learning_rate": 3.3689250416147e-06, "loss": 0.2846, "step": 4674 }, { "epoch": 2.2336777970252673, "grad_norm": 0.4569814026626145, "learning_rate": 3.364983701507376e-06, "loss": 0.2602, "step": 4675 }, { "epoch": 2.2341556657308406, "grad_norm": 0.4445416419282075, "learning_rate": 3.361044201836534e-06, "loss": 0.2695, "step": 4676 }, { "epoch": 2.2346335344364134, "grad_norm": 0.45506836119647587, "learning_rate": 3.357106543694918e-06, "loss": 0.2778, "step": 4677 }, { "epoch": 2.2351114031419868, "grad_norm": 0.4574261555122098, "learning_rate": 3.3531707281747717e-06, "loss": 0.2693, "step": 4678 }, { "epoch": 2.23558927184756, "grad_norm": 0.5141167452823477, "learning_rate": 3.3492367563678173e-06, "loss": 0.2765, "step": 4679 }, { "epoch": 2.236067140553133, "grad_norm": 0.6521885973282003, "learning_rate": 3.3453046293652657e-06, "loss": 0.2638, "step": 4680 }, { "epoch": 2.2365450092587063, "grad_norm": 0.492272086211248, "learning_rate": 3.3413743482578233e-06, "loss": 0.2608, "step": 4681 }, { "epoch": 2.237022877964279, "grad_norm": 0.4635845305006642, "learning_rate": 3.337445914135684e-06, "loss": 0.2617, "step": 4682 }, { "epoch": 2.2375007466698524, "grad_norm": 0.5023980255782854, "learning_rate": 3.3335193280885215e-06, "loss": 0.2761, "step": 4683 }, { "epoch": 2.2379786153754258, "grad_norm": 0.5343058309706397, "learning_rate": 3.3295945912055006e-06, "loss": 0.2845, "step": 4684 }, { "epoch": 2.2384564840809986, "grad_norm": 0.45734291677162836, "learning_rate": 3.3256717045752794e-06, "loss": 0.2629, "step": 4685 }, { "epoch": 2.238934352786572, "grad_norm": 0.4585226946873842, "learning_rate": 3.3217506692859937e-06, "loss": 0.2684, "step": 4686 }, { "epoch": 2.2394122214921452, "grad_norm": 0.48321193724337996, "learning_rate": 3.317831486425267e-06, "loss": 0.2785, "step": 4687 }, { "epoch": 2.239890090197718, "grad_norm": 0.49773006035607, "learning_rate": 3.313914157080218e-06, "loss": 0.2723, "step": 4688 }, { "epoch": 2.2403679589032914, "grad_norm": 0.4908921121030998, "learning_rate": 3.3099986823374407e-06, "loss": 0.264, "step": 4689 }, { "epoch": 2.2408458276088643, "grad_norm": 0.5288018646857777, "learning_rate": 3.3060850632830167e-06, "loss": 0.2614, "step": 4690 }, { "epoch": 2.2413236963144376, "grad_norm": 0.4422685276222856, "learning_rate": 3.3021733010025203e-06, "loss": 0.268, "step": 4691 }, { "epoch": 2.241801565020011, "grad_norm": 0.4989573893125483, "learning_rate": 3.298263396581003e-06, "loss": 0.266, "step": 4692 }, { "epoch": 2.242279433725584, "grad_norm": 0.48611829997172, "learning_rate": 3.294355351102999e-06, "loss": 0.2527, "step": 4693 }, { "epoch": 2.242757302431157, "grad_norm": 0.4975978815944788, "learning_rate": 3.2904491656525396e-06, "loss": 0.2634, "step": 4694 }, { "epoch": 2.24323517113673, "grad_norm": 0.47584888029570965, "learning_rate": 3.286544841313126e-06, "loss": 0.2589, "step": 4695 }, { "epoch": 2.2437130398423033, "grad_norm": 0.4684430376922013, "learning_rate": 3.2826423791677475e-06, "loss": 0.2665, "step": 4696 }, { "epoch": 2.2441909085478766, "grad_norm": 0.4697921966643509, "learning_rate": 3.278741780298883e-06, "loss": 0.2684, "step": 4697 }, { "epoch": 2.2446687772534495, "grad_norm": 0.4471854069336229, "learning_rate": 3.2748430457884883e-06, "loss": 0.2858, "step": 4698 }, { "epoch": 2.245146645959023, "grad_norm": 0.5125951863846561, "learning_rate": 3.2709461767180007e-06, "loss": 0.2806, "step": 4699 }, { "epoch": 2.2456245146645957, "grad_norm": 0.5015064985997865, "learning_rate": 3.2670511741683475e-06, "loss": 0.2919, "step": 4700 }, { "epoch": 2.246102383370169, "grad_norm": 0.441527020343577, "learning_rate": 3.2631580392199316e-06, "loss": 0.281, "step": 4701 }, { "epoch": 2.2465802520757423, "grad_norm": 0.48081974361271224, "learning_rate": 3.259266772952636e-06, "loss": 0.2814, "step": 4702 }, { "epoch": 2.247058120781315, "grad_norm": 0.5288631768506852, "learning_rate": 3.2553773764458374e-06, "loss": 0.2626, "step": 4703 }, { "epoch": 2.2475359894868885, "grad_norm": 0.46254313690849835, "learning_rate": 3.251489850778381e-06, "loss": 0.2711, "step": 4704 }, { "epoch": 2.248013858192462, "grad_norm": 0.4737518642755567, "learning_rate": 3.2476041970285945e-06, "loss": 0.2625, "step": 4705 }, { "epoch": 2.2484917268980347, "grad_norm": 0.44502937263508435, "learning_rate": 3.2437204162742975e-06, "loss": 0.2571, "step": 4706 }, { "epoch": 2.248969595603608, "grad_norm": 0.45119405560525677, "learning_rate": 3.2398385095927775e-06, "loss": 0.2877, "step": 4707 }, { "epoch": 2.249447464309181, "grad_norm": 0.45027149302862923, "learning_rate": 3.2359584780608055e-06, "loss": 0.2709, "step": 4708 }, { "epoch": 2.249925333014754, "grad_norm": 0.4419939110436344, "learning_rate": 3.232080322754638e-06, "loss": 0.2712, "step": 4709 }, { "epoch": 2.2504032017203275, "grad_norm": 0.45427067027017565, "learning_rate": 3.2282040447500063e-06, "loss": 0.282, "step": 4710 }, { "epoch": 2.2508810704259004, "grad_norm": 0.45850633742518837, "learning_rate": 3.2243296451221164e-06, "loss": 0.2502, "step": 4711 }, { "epoch": 2.2513589391314737, "grad_norm": 0.4575443548721124, "learning_rate": 3.220457124945665e-06, "loss": 0.2625, "step": 4712 }, { "epoch": 2.251836807837047, "grad_norm": 0.4657370801081544, "learning_rate": 3.2165864852948147e-06, "loss": 0.2373, "step": 4713 }, { "epoch": 2.25231467654262, "grad_norm": 0.45836793153138744, "learning_rate": 3.21271772724322e-06, "loss": 0.2762, "step": 4714 }, { "epoch": 2.252792545248193, "grad_norm": 0.4946186941630403, "learning_rate": 3.208850851863998e-06, "loss": 0.2888, "step": 4715 }, { "epoch": 2.253270413953766, "grad_norm": 0.45248472166716014, "learning_rate": 3.20498586022976e-06, "loss": 0.2702, "step": 4716 }, { "epoch": 2.2537482826593394, "grad_norm": 0.4594724499710993, "learning_rate": 3.201122753412582e-06, "loss": 0.2632, "step": 4717 }, { "epoch": 2.2542261513649127, "grad_norm": 0.48222462052144954, "learning_rate": 3.1972615324840197e-06, "loss": 0.2553, "step": 4718 }, { "epoch": 2.2547040200704855, "grad_norm": 0.4616647404690517, "learning_rate": 3.193402198515112e-06, "loss": 0.2704, "step": 4719 }, { "epoch": 2.255181888776059, "grad_norm": 0.46141169216599737, "learning_rate": 3.189544752576369e-06, "loss": 0.2547, "step": 4720 }, { "epoch": 2.2556597574816317, "grad_norm": 0.46118690700708775, "learning_rate": 3.1856891957377735e-06, "loss": 0.2678, "step": 4721 }, { "epoch": 2.256137626187205, "grad_norm": 0.45651170157809823, "learning_rate": 3.1818355290687962e-06, "loss": 0.2782, "step": 4722 }, { "epoch": 2.2566154948927784, "grad_norm": 0.49019149614604685, "learning_rate": 3.177983753638373e-06, "loss": 0.2498, "step": 4723 }, { "epoch": 2.257093363598351, "grad_norm": 0.47354007893571354, "learning_rate": 3.174133870514914e-06, "loss": 0.2607, "step": 4724 }, { "epoch": 2.2575712323039245, "grad_norm": 0.4752144096947728, "learning_rate": 3.1702858807663175e-06, "loss": 0.2769, "step": 4725 }, { "epoch": 2.2580491010094974, "grad_norm": 0.48451667423176453, "learning_rate": 3.166439785459943e-06, "loss": 0.2708, "step": 4726 }, { "epoch": 2.2585269697150707, "grad_norm": 0.4570766080552371, "learning_rate": 3.1625955856626267e-06, "loss": 0.2646, "step": 4727 }, { "epoch": 2.259004838420644, "grad_norm": 0.4445274144500832, "learning_rate": 3.1587532824406887e-06, "loss": 0.2777, "step": 4728 }, { "epoch": 2.259482707126217, "grad_norm": 0.4703985338457953, "learning_rate": 3.1549128768599123e-06, "loss": 0.2724, "step": 4729 }, { "epoch": 2.25996057583179, "grad_norm": 0.46555737939351904, "learning_rate": 3.151074369985556e-06, "loss": 0.2716, "step": 4730 }, { "epoch": 2.2604384445373635, "grad_norm": 0.45652113924751747, "learning_rate": 3.147237762882359e-06, "loss": 0.2695, "step": 4731 }, { "epoch": 2.2609163132429364, "grad_norm": 0.4598715452430841, "learning_rate": 3.143403056614527e-06, "loss": 0.2631, "step": 4732 }, { "epoch": 2.2613941819485097, "grad_norm": 0.5304735402801358, "learning_rate": 3.139570252245734e-06, "loss": 0.2731, "step": 4733 }, { "epoch": 2.261872050654083, "grad_norm": 0.47206865977241064, "learning_rate": 3.135739350839141e-06, "loss": 0.2752, "step": 4734 }, { "epoch": 2.262349919359656, "grad_norm": 0.4527194547413837, "learning_rate": 3.131910353457369e-06, "loss": 0.2663, "step": 4735 }, { "epoch": 2.262827788065229, "grad_norm": 0.4742204418152278, "learning_rate": 3.1280832611625112e-06, "loss": 0.2727, "step": 4736 }, { "epoch": 2.263305656770802, "grad_norm": 0.4698999007147011, "learning_rate": 3.12425807501614e-06, "loss": 0.2672, "step": 4737 }, { "epoch": 2.2637835254763754, "grad_norm": 0.4507660759859787, "learning_rate": 3.1204347960792935e-06, "loss": 0.3011, "step": 4738 }, { "epoch": 2.2642613941819487, "grad_norm": 0.47815020250201823, "learning_rate": 3.116613425412478e-06, "loss": 0.2707, "step": 4739 }, { "epoch": 2.2647392628875216, "grad_norm": 0.466778767224418, "learning_rate": 3.112793964075681e-06, "loss": 0.2742, "step": 4740 }, { "epoch": 2.265217131593095, "grad_norm": 0.46764545840736804, "learning_rate": 3.1089764131283497e-06, "loss": 0.2618, "step": 4741 }, { "epoch": 2.2656950002986678, "grad_norm": 0.4598354811835029, "learning_rate": 3.105160773629402e-06, "loss": 0.2727, "step": 4742 }, { "epoch": 2.266172869004241, "grad_norm": 0.45939402218354186, "learning_rate": 3.1013470466372373e-06, "loss": 0.2617, "step": 4743 }, { "epoch": 2.2666507377098144, "grad_norm": 0.689169049474016, "learning_rate": 3.0975352332097107e-06, "loss": 0.2719, "step": 4744 }, { "epoch": 2.2671286064153873, "grad_norm": 0.4457399095001578, "learning_rate": 3.0937253344041507e-06, "loss": 0.2641, "step": 4745 }, { "epoch": 2.2676064751209606, "grad_norm": 0.5931896862224231, "learning_rate": 3.0899173512773607e-06, "loss": 0.271, "step": 4746 }, { "epoch": 2.2680843438265335, "grad_norm": 0.44667772647454984, "learning_rate": 3.0861112848856024e-06, "loss": 0.2756, "step": 4747 }, { "epoch": 2.2685622125321068, "grad_norm": 0.4870112961699444, "learning_rate": 3.082307136284616e-06, "loss": 0.2586, "step": 4748 }, { "epoch": 2.26904008123768, "grad_norm": 1.0214438039442433, "learning_rate": 3.0785049065296057e-06, "loss": 0.2663, "step": 4749 }, { "epoch": 2.269517949943253, "grad_norm": 0.5664811753038376, "learning_rate": 3.074704596675242e-06, "loss": 0.2616, "step": 4750 }, { "epoch": 2.2699958186488263, "grad_norm": 0.4869676974104964, "learning_rate": 3.07090620777566e-06, "loss": 0.2865, "step": 4751 }, { "epoch": 2.270473687354399, "grad_norm": 0.5088607622709265, "learning_rate": 3.067109740884472e-06, "loss": 0.268, "step": 4752 }, { "epoch": 2.2709515560599725, "grad_norm": 0.4469599957069902, "learning_rate": 3.063315197054747e-06, "loss": 0.2721, "step": 4753 }, { "epoch": 2.2714294247655458, "grad_norm": 0.4524405576091524, "learning_rate": 3.0595225773390225e-06, "loss": 0.2709, "step": 4754 }, { "epoch": 2.2719072934711186, "grad_norm": 0.5863151040314684, "learning_rate": 3.055731882789311e-06, "loss": 0.2697, "step": 4755 }, { "epoch": 2.272385162176692, "grad_norm": 0.4626038538364683, "learning_rate": 3.05194311445708e-06, "loss": 0.2539, "step": 4756 }, { "epoch": 2.2728630308822653, "grad_norm": 0.45285520691551345, "learning_rate": 3.0481562733932647e-06, "loss": 0.2605, "step": 4757 }, { "epoch": 2.273340899587838, "grad_norm": 0.48766103070408395, "learning_rate": 3.0443713606482727e-06, "loss": 0.2584, "step": 4758 }, { "epoch": 2.2738187682934115, "grad_norm": 0.49630506462100404, "learning_rate": 3.0405883772719715e-06, "loss": 0.2783, "step": 4759 }, { "epoch": 2.2742966369989848, "grad_norm": 0.4608594610607203, "learning_rate": 3.0368073243136874e-06, "loss": 0.2561, "step": 4760 }, { "epoch": 2.2747745057045576, "grad_norm": 0.4530972395926978, "learning_rate": 3.033028202822228e-06, "loss": 0.2554, "step": 4761 }, { "epoch": 2.275252374410131, "grad_norm": 0.45511832040091293, "learning_rate": 3.029251013845849e-06, "loss": 0.2577, "step": 4762 }, { "epoch": 2.275730243115704, "grad_norm": 0.9922747694397978, "learning_rate": 3.0254757584322736e-06, "loss": 0.2623, "step": 4763 }, { "epoch": 2.276208111821277, "grad_norm": 0.48096765381573897, "learning_rate": 3.0217024376286984e-06, "loss": 0.2493, "step": 4764 }, { "epoch": 2.2766859805268505, "grad_norm": 0.47667943487788295, "learning_rate": 3.0179310524817707e-06, "loss": 0.2859, "step": 4765 }, { "epoch": 2.2771638492324233, "grad_norm": 0.43920123160920244, "learning_rate": 3.0141616040376052e-06, "loss": 0.2761, "step": 4766 }, { "epoch": 2.2776417179379966, "grad_norm": 0.485107810636819, "learning_rate": 3.010394093341785e-06, "loss": 0.2865, "step": 4767 }, { "epoch": 2.2781195866435695, "grad_norm": 0.4621289185973684, "learning_rate": 3.00662852143935e-06, "loss": 0.2788, "step": 4768 }, { "epoch": 2.278597455349143, "grad_norm": 0.4850819067216184, "learning_rate": 3.002864889374798e-06, "loss": 0.2655, "step": 4769 }, { "epoch": 2.279075324054716, "grad_norm": 0.47125925955991355, "learning_rate": 2.9991031981921026e-06, "loss": 0.2511, "step": 4770 }, { "epoch": 2.279553192760289, "grad_norm": 1.1105889455924962, "learning_rate": 2.9953434489346856e-06, "loss": 0.2551, "step": 4771 }, { "epoch": 2.2800310614658623, "grad_norm": 0.4773544991225756, "learning_rate": 2.9915856426454324e-06, "loss": 0.2583, "step": 4772 }, { "epoch": 2.280508930171435, "grad_norm": 0.49707858432136753, "learning_rate": 2.987829780366699e-06, "loss": 0.2675, "step": 4773 }, { "epoch": 2.2809867988770085, "grad_norm": 0.47711687874889647, "learning_rate": 2.984075863140292e-06, "loss": 0.2658, "step": 4774 }, { "epoch": 2.281464667582582, "grad_norm": 0.585688509831411, "learning_rate": 2.9803238920074784e-06, "loss": 0.2547, "step": 4775 }, { "epoch": 2.2819425362881547, "grad_norm": 0.46857304420027696, "learning_rate": 2.976573868008995e-06, "loss": 0.2658, "step": 4776 }, { "epoch": 2.282420404993728, "grad_norm": 0.4558766138524307, "learning_rate": 2.9728257921850302e-06, "loss": 0.2586, "step": 4777 }, { "epoch": 2.282898273699301, "grad_norm": 0.45756901243901504, "learning_rate": 2.9690796655752306e-06, "loss": 0.2689, "step": 4778 }, { "epoch": 2.283376142404874, "grad_norm": 0.47832703138164223, "learning_rate": 2.965335489218711e-06, "loss": 0.2786, "step": 4779 }, { "epoch": 2.2838540111104475, "grad_norm": 0.45406741476852475, "learning_rate": 2.961593264154038e-06, "loss": 0.2628, "step": 4780 }, { "epoch": 2.2843318798160204, "grad_norm": 0.47436505180344063, "learning_rate": 2.9578529914192342e-06, "loss": 0.2603, "step": 4781 }, { "epoch": 2.2848097485215937, "grad_norm": 0.45974865789697694, "learning_rate": 2.954114672051789e-06, "loss": 0.2602, "step": 4782 }, { "epoch": 2.285287617227167, "grad_norm": 0.5087537120467954, "learning_rate": 2.9503783070886504e-06, "loss": 0.2638, "step": 4783 }, { "epoch": 2.28576548593274, "grad_norm": 0.4593223370985407, "learning_rate": 2.946643897566216e-06, "loss": 0.26, "step": 4784 }, { "epoch": 2.286243354638313, "grad_norm": 0.4739581626972536, "learning_rate": 2.9429114445203423e-06, "loss": 0.2834, "step": 4785 }, { "epoch": 2.2867212233438865, "grad_norm": 0.4473066092125534, "learning_rate": 2.939180948986352e-06, "loss": 0.258, "step": 4786 }, { "epoch": 2.2871990920494594, "grad_norm": 0.7178714404426283, "learning_rate": 2.9354524119990156e-06, "loss": 0.264, "step": 4787 }, { "epoch": 2.2876769607550327, "grad_norm": 0.4518483998022226, "learning_rate": 2.9317258345925603e-06, "loss": 0.2552, "step": 4788 }, { "epoch": 2.2881548294606056, "grad_norm": 0.4592657516604581, "learning_rate": 2.92800121780068e-06, "loss": 0.2758, "step": 4789 }, { "epoch": 2.288632698166179, "grad_norm": 0.703821050907462, "learning_rate": 2.924278562656514e-06, "loss": 0.2712, "step": 4790 }, { "epoch": 2.289110566871752, "grad_norm": 0.5327889383144262, "learning_rate": 2.9205578701926575e-06, "loss": 0.2548, "step": 4791 }, { "epoch": 2.289588435577325, "grad_norm": 0.4542697805884633, "learning_rate": 2.916839141441172e-06, "loss": 0.2648, "step": 4792 }, { "epoch": 2.2900663042828984, "grad_norm": 0.4471368664844579, "learning_rate": 2.913122377433564e-06, "loss": 0.2701, "step": 4793 }, { "epoch": 2.2905441729884712, "grad_norm": 0.49515537031904927, "learning_rate": 2.9094075792007948e-06, "loss": 0.2612, "step": 4794 }, { "epoch": 2.2910220416940446, "grad_norm": 0.5023092878736718, "learning_rate": 2.90569474777329e-06, "loss": 0.2731, "step": 4795 }, { "epoch": 2.291499910399618, "grad_norm": 0.5203729973321249, "learning_rate": 2.901983884180921e-06, "loss": 0.2693, "step": 4796 }, { "epoch": 2.2919777791051907, "grad_norm": 0.458762113548676, "learning_rate": 2.8982749894530128e-06, "loss": 0.2528, "step": 4797 }, { "epoch": 2.292455647810764, "grad_norm": 0.4454688102007361, "learning_rate": 2.8945680646183527e-06, "loss": 0.2642, "step": 4798 }, { "epoch": 2.292933516516337, "grad_norm": 0.47272093877831456, "learning_rate": 2.8908631107051743e-06, "loss": 0.2845, "step": 4799 }, { "epoch": 2.2934113852219102, "grad_norm": 0.44051662539793884, "learning_rate": 2.8871601287411634e-06, "loss": 0.2814, "step": 4800 }, { "epoch": 2.2938892539274836, "grad_norm": 0.44919987478542955, "learning_rate": 2.8834591197534668e-06, "loss": 0.2616, "step": 4801 }, { "epoch": 2.2943671226330564, "grad_norm": 0.4655075873793121, "learning_rate": 2.879760084768677e-06, "loss": 0.2598, "step": 4802 }, { "epoch": 2.2948449913386297, "grad_norm": 0.44695659849319586, "learning_rate": 2.8760630248128374e-06, "loss": 0.2809, "step": 4803 }, { "epoch": 2.2953228600442026, "grad_norm": 0.46112658531258754, "learning_rate": 2.8723679409114536e-06, "loss": 0.2625, "step": 4804 }, { "epoch": 2.295800728749776, "grad_norm": 0.4656636436014579, "learning_rate": 2.8686748340894744e-06, "loss": 0.2731, "step": 4805 }, { "epoch": 2.2962785974553492, "grad_norm": 0.5844196537588953, "learning_rate": 2.864983705371298e-06, "loss": 0.2663, "step": 4806 }, { "epoch": 2.296756466160922, "grad_norm": 0.46998718606495676, "learning_rate": 2.861294555780786e-06, "loss": 0.2574, "step": 4807 }, { "epoch": 2.2972343348664954, "grad_norm": 0.4823897817406597, "learning_rate": 2.8576073863412402e-06, "loss": 0.2823, "step": 4808 }, { "epoch": 2.2977122035720687, "grad_norm": 0.4531753739973854, "learning_rate": 2.8539221980754115e-06, "loss": 0.2541, "step": 4809 }, { "epoch": 2.2981900722776416, "grad_norm": 0.4579423043984502, "learning_rate": 2.850238992005514e-06, "loss": 0.2737, "step": 4810 }, { "epoch": 2.298667940983215, "grad_norm": 0.44211975158230343, "learning_rate": 2.8465577691532e-06, "loss": 0.2716, "step": 4811 }, { "epoch": 2.2991458096887882, "grad_norm": 0.4480696060954269, "learning_rate": 2.8428785305395733e-06, "loss": 0.261, "step": 4812 }, { "epoch": 2.299623678394361, "grad_norm": 0.4605681190004609, "learning_rate": 2.8392012771851963e-06, "loss": 0.2693, "step": 4813 }, { "epoch": 2.3001015470999344, "grad_norm": 0.6621094950095469, "learning_rate": 2.83552601011007e-06, "loss": 0.2694, "step": 4814 }, { "epoch": 2.3005794158055073, "grad_norm": 0.4783897161449604, "learning_rate": 2.8318527303336465e-06, "loss": 0.2756, "step": 4815 }, { "epoch": 2.3010572845110806, "grad_norm": 0.4512383810963222, "learning_rate": 2.828181438874832e-06, "loss": 0.2712, "step": 4816 }, { "epoch": 2.301535153216654, "grad_norm": 0.4712970827373998, "learning_rate": 2.8245121367519812e-06, "loss": 0.2603, "step": 4817 }, { "epoch": 2.302013021922227, "grad_norm": 0.47038727743638786, "learning_rate": 2.820844824982889e-06, "loss": 0.2601, "step": 4818 }, { "epoch": 2.3024908906278, "grad_norm": 0.4593763213754035, "learning_rate": 2.817179504584802e-06, "loss": 0.2699, "step": 4819 }, { "epoch": 2.302968759333373, "grad_norm": 0.4753946259917545, "learning_rate": 2.81351617657442e-06, "loss": 0.2676, "step": 4820 }, { "epoch": 2.3034466280389463, "grad_norm": 0.46869051732758144, "learning_rate": 2.8098548419678838e-06, "loss": 0.2838, "step": 4821 }, { "epoch": 2.3039244967445196, "grad_norm": 0.45543892367550015, "learning_rate": 2.8061955017807797e-06, "loss": 0.2707, "step": 4822 }, { "epoch": 2.3044023654500925, "grad_norm": 0.48105929755291854, "learning_rate": 2.8025381570281495e-06, "loss": 0.2735, "step": 4823 }, { "epoch": 2.304880234155666, "grad_norm": 0.4562110398780488, "learning_rate": 2.7988828087244735e-06, "loss": 0.2584, "step": 4824 }, { "epoch": 2.3053581028612387, "grad_norm": 0.4708490596777413, "learning_rate": 2.795229457883678e-06, "loss": 0.2521, "step": 4825 }, { "epoch": 2.305835971566812, "grad_norm": 0.47823032227536855, "learning_rate": 2.7915781055191437e-06, "loss": 0.2446, "step": 4826 }, { "epoch": 2.3063138402723853, "grad_norm": 0.4583497787749354, "learning_rate": 2.7879287526436884e-06, "loss": 0.2682, "step": 4827 }, { "epoch": 2.306791708977958, "grad_norm": 0.5195107891663974, "learning_rate": 2.784281400269575e-06, "loss": 0.2639, "step": 4828 }, { "epoch": 2.3072695776835315, "grad_norm": 0.44700898622956803, "learning_rate": 2.7806360494085218e-06, "loss": 0.2704, "step": 4829 }, { "epoch": 2.307747446389105, "grad_norm": 0.45593935022847376, "learning_rate": 2.7769927010716814e-06, "loss": 0.2561, "step": 4830 }, { "epoch": 2.3082253150946777, "grad_norm": 0.4779945808794087, "learning_rate": 2.77335135626965e-06, "loss": 0.2612, "step": 4831 }, { "epoch": 2.308703183800251, "grad_norm": 0.4650534484982984, "learning_rate": 2.76971201601248e-06, "loss": 0.2704, "step": 4832 }, { "epoch": 2.309181052505824, "grad_norm": 0.45968865169832307, "learning_rate": 2.7660746813096575e-06, "loss": 0.2788, "step": 4833 }, { "epoch": 2.309658921211397, "grad_norm": 0.5586814813353537, "learning_rate": 2.76243935317011e-06, "loss": 0.2586, "step": 4834 }, { "epoch": 2.3101367899169705, "grad_norm": 0.46957361443859275, "learning_rate": 2.7588060326022205e-06, "loss": 0.2636, "step": 4835 }, { "epoch": 2.3106146586225433, "grad_norm": 0.5799118470767503, "learning_rate": 2.755174720613806e-06, "loss": 0.2903, "step": 4836 }, { "epoch": 2.3110925273281167, "grad_norm": 0.48771811958787314, "learning_rate": 2.7515454182121238e-06, "loss": 0.2648, "step": 4837 }, { "epoch": 2.31157039603369, "grad_norm": 0.4454160846341557, "learning_rate": 2.7479181264038847e-06, "loss": 0.2673, "step": 4838 }, { "epoch": 2.312048264739263, "grad_norm": 0.48485972776665004, "learning_rate": 2.7442928461952333e-06, "loss": 0.2792, "step": 4839 }, { "epoch": 2.312526133444836, "grad_norm": 0.44389446972461244, "learning_rate": 2.740669578591755e-06, "loss": 0.2575, "step": 4840 }, { "epoch": 2.313004002150409, "grad_norm": 0.5119507485701839, "learning_rate": 2.7370483245984857e-06, "loss": 0.2579, "step": 4841 }, { "epoch": 2.3134818708559823, "grad_norm": 0.479630718784202, "learning_rate": 2.733429085219895e-06, "loss": 0.2533, "step": 4842 }, { "epoch": 2.3139597395615557, "grad_norm": 0.45539893096166484, "learning_rate": 2.7298118614598934e-06, "loss": 0.2874, "step": 4843 }, { "epoch": 2.3144376082671285, "grad_norm": 0.4597008048466559, "learning_rate": 2.726196654321841e-06, "loss": 0.2775, "step": 4844 }, { "epoch": 2.314915476972702, "grad_norm": 0.4884323297495012, "learning_rate": 2.7225834648085282e-06, "loss": 0.2512, "step": 4845 }, { "epoch": 2.3153933456782747, "grad_norm": 0.4765028408676871, "learning_rate": 2.7189722939221875e-06, "loss": 0.2589, "step": 4846 }, { "epoch": 2.315871214383848, "grad_norm": 0.4660406532791925, "learning_rate": 2.715363142664501e-06, "loss": 0.2648, "step": 4847 }, { "epoch": 2.3163490830894213, "grad_norm": 0.4747962483625691, "learning_rate": 2.711756012036577e-06, "loss": 0.2778, "step": 4848 }, { "epoch": 2.316826951794994, "grad_norm": 0.4632488608800758, "learning_rate": 2.708150903038972e-06, "loss": 0.275, "step": 4849 }, { "epoch": 2.3173048205005675, "grad_norm": 0.5453668545195846, "learning_rate": 2.7045478166716843e-06, "loss": 0.2755, "step": 4850 }, { "epoch": 2.3177826892061404, "grad_norm": 0.5001628937077411, "learning_rate": 2.7009467539341426e-06, "loss": 0.2691, "step": 4851 }, { "epoch": 2.3182605579117137, "grad_norm": 0.4553954046591676, "learning_rate": 2.6973477158252146e-06, "loss": 0.2655, "step": 4852 }, { "epoch": 2.318738426617287, "grad_norm": 0.4767807917673003, "learning_rate": 2.6937507033432177e-06, "loss": 0.2806, "step": 4853 }, { "epoch": 2.31921629532286, "grad_norm": 0.45179451190892383, "learning_rate": 2.690155717485895e-06, "loss": 0.2696, "step": 4854 }, { "epoch": 2.319694164028433, "grad_norm": 0.45905304275528375, "learning_rate": 2.6865627592504295e-06, "loss": 0.2672, "step": 4855 }, { "epoch": 2.3201720327340065, "grad_norm": 0.4375085321749962, "learning_rate": 2.6829718296334516e-06, "loss": 0.2562, "step": 4856 }, { "epoch": 2.3206499014395794, "grad_norm": 0.45149139878521694, "learning_rate": 2.6793829296310183e-06, "loss": 0.2903, "step": 4857 }, { "epoch": 2.3211277701451527, "grad_norm": 0.4575764223416466, "learning_rate": 2.6757960602386223e-06, "loss": 0.2534, "step": 4858 }, { "epoch": 2.3216056388507256, "grad_norm": 0.4537222729437179, "learning_rate": 2.6722112224512063e-06, "loss": 0.2518, "step": 4859 }, { "epoch": 2.322083507556299, "grad_norm": 0.45191436383379496, "learning_rate": 2.668628417263137e-06, "loss": 0.2737, "step": 4860 }, { "epoch": 2.322561376261872, "grad_norm": 0.4670433808850828, "learning_rate": 2.6650476456682195e-06, "loss": 0.2645, "step": 4861 }, { "epoch": 2.323039244967445, "grad_norm": 0.46184013428909193, "learning_rate": 2.661468908659701e-06, "loss": 0.2703, "step": 4862 }, { "epoch": 2.3235171136730184, "grad_norm": 0.46573908878240283, "learning_rate": 2.6578922072302572e-06, "loss": 0.2681, "step": 4863 }, { "epoch": 2.3239949823785917, "grad_norm": 0.4495053674176749, "learning_rate": 2.6543175423720004e-06, "loss": 0.2613, "step": 4864 }, { "epoch": 2.3244728510841646, "grad_norm": 0.48067303236066894, "learning_rate": 2.6507449150764852e-06, "loss": 0.2743, "step": 4865 }, { "epoch": 2.324950719789738, "grad_norm": 0.450687819523822, "learning_rate": 2.6471743263346903e-06, "loss": 0.2762, "step": 4866 }, { "epoch": 2.3254285884953108, "grad_norm": 0.5034337207294447, "learning_rate": 2.643605777137034e-06, "loss": 0.2749, "step": 4867 }, { "epoch": 2.325906457200884, "grad_norm": 0.5112858649436341, "learning_rate": 2.6400392684733735e-06, "loss": 0.2726, "step": 4868 }, { "epoch": 2.3263843259064574, "grad_norm": 0.447531179096655, "learning_rate": 2.636474801332992e-06, "loss": 0.2798, "step": 4869 }, { "epoch": 2.3268621946120303, "grad_norm": 0.47053088394428416, "learning_rate": 2.632912376704607e-06, "loss": 0.2772, "step": 4870 }, { "epoch": 2.3273400633176036, "grad_norm": 0.4892476562482173, "learning_rate": 2.629351995576379e-06, "loss": 0.2728, "step": 4871 }, { "epoch": 2.3278179320231764, "grad_norm": 0.4496411171519612, "learning_rate": 2.6257936589358914e-06, "loss": 0.2715, "step": 4872 }, { "epoch": 2.3282958007287498, "grad_norm": 0.4706505012171985, "learning_rate": 2.6222373677701607e-06, "loss": 0.2611, "step": 4873 }, { "epoch": 2.328773669434323, "grad_norm": 0.46352164932756873, "learning_rate": 2.618683123065646e-06, "loss": 0.2802, "step": 4874 }, { "epoch": 2.329251538139896, "grad_norm": 0.4668367012503617, "learning_rate": 2.615130925808228e-06, "loss": 0.255, "step": 4875 }, { "epoch": 2.3297294068454693, "grad_norm": 0.4526742078440504, "learning_rate": 2.6115807769832226e-06, "loss": 0.2543, "step": 4876 }, { "epoch": 2.330207275551042, "grad_norm": 0.48303918462763046, "learning_rate": 2.6080326775753816e-06, "loss": 0.2816, "step": 4877 }, { "epoch": 2.3306851442566154, "grad_norm": 0.5333148870731564, "learning_rate": 2.604486628568885e-06, "loss": 0.274, "step": 4878 }, { "epoch": 2.3311630129621888, "grad_norm": 0.4581104638202767, "learning_rate": 2.6009426309473397e-06, "loss": 0.2565, "step": 4879 }, { "epoch": 2.3316408816677616, "grad_norm": 0.4665861539340606, "learning_rate": 2.597400685693795e-06, "loss": 0.2801, "step": 4880 }, { "epoch": 2.332118750373335, "grad_norm": 0.5767159313567876, "learning_rate": 2.59386079379072e-06, "loss": 0.2744, "step": 4881 }, { "epoch": 2.3325966190789083, "grad_norm": 0.6636916277687075, "learning_rate": 2.590322956220015e-06, "loss": 0.2714, "step": 4882 }, { "epoch": 2.333074487784481, "grad_norm": 0.4707596869750743, "learning_rate": 2.586787173963019e-06, "loss": 0.274, "step": 4883 }, { "epoch": 2.3335523564900544, "grad_norm": 0.5135378298995066, "learning_rate": 2.5832534480004955e-06, "loss": 0.2703, "step": 4884 }, { "epoch": 2.3340302251956278, "grad_norm": 0.4520964774142493, "learning_rate": 2.5797217793126373e-06, "loss": 0.2639, "step": 4885 }, { "epoch": 2.3345080939012006, "grad_norm": 0.5034516755934882, "learning_rate": 2.5761921688790635e-06, "loss": 0.2665, "step": 4886 }, { "epoch": 2.334985962606774, "grad_norm": 0.44517562515923137, "learning_rate": 2.5726646176788307e-06, "loss": 0.2736, "step": 4887 }, { "epoch": 2.335463831312347, "grad_norm": 0.5542169246110865, "learning_rate": 2.5691391266904165e-06, "loss": 0.2665, "step": 4888 }, { "epoch": 2.33594170001792, "grad_norm": 0.4735315168725527, "learning_rate": 2.5656156968917277e-06, "loss": 0.257, "step": 4889 }, { "epoch": 2.3364195687234934, "grad_norm": 0.48423296811822936, "learning_rate": 2.5620943292601074e-06, "loss": 0.2784, "step": 4890 }, { "epoch": 2.3368974374290663, "grad_norm": 0.4515981436184163, "learning_rate": 2.5585750247723183e-06, "loss": 0.2705, "step": 4891 }, { "epoch": 2.3373753061346396, "grad_norm": 0.4733626892725105, "learning_rate": 2.5550577844045498e-06, "loss": 0.2558, "step": 4892 }, { "epoch": 2.3378531748402125, "grad_norm": 0.5043562480358438, "learning_rate": 2.551542609132428e-06, "loss": 0.2436, "step": 4893 }, { "epoch": 2.338331043545786, "grad_norm": 0.5000834206474044, "learning_rate": 2.548029499930997e-06, "loss": 0.2566, "step": 4894 }, { "epoch": 2.338808912251359, "grad_norm": 0.4478341988025251, "learning_rate": 2.5445184577747305e-06, "loss": 0.2586, "step": 4895 }, { "epoch": 2.339286780956932, "grad_norm": 0.5257945834666825, "learning_rate": 2.5410094836375343e-06, "loss": 0.2644, "step": 4896 }, { "epoch": 2.3397646496625053, "grad_norm": 0.4531122219133908, "learning_rate": 2.537502578492733e-06, "loss": 0.2785, "step": 4897 }, { "epoch": 2.340242518368078, "grad_norm": 0.4863425466939351, "learning_rate": 2.533997743313077e-06, "loss": 0.2512, "step": 4898 }, { "epoch": 2.3407203870736515, "grad_norm": 0.47019660570137406, "learning_rate": 2.5304949790707512e-06, "loss": 0.257, "step": 4899 }, { "epoch": 2.341198255779225, "grad_norm": 0.46196723454827476, "learning_rate": 2.52699428673736e-06, "loss": 0.2603, "step": 4900 }, { "epoch": 2.3416761244847977, "grad_norm": 0.5103312090564901, "learning_rate": 2.5234956672839273e-06, "loss": 0.2668, "step": 4901 }, { "epoch": 2.342153993190371, "grad_norm": 0.442174404107808, "learning_rate": 2.519999121680917e-06, "loss": 0.2504, "step": 4902 }, { "epoch": 2.342631861895944, "grad_norm": 0.46898344435430483, "learning_rate": 2.516504650898206e-06, "loss": 0.269, "step": 4903 }, { "epoch": 2.343109730601517, "grad_norm": 0.5615098737605004, "learning_rate": 2.513012255905095e-06, "loss": 0.2708, "step": 4904 }, { "epoch": 2.3435875993070905, "grad_norm": 0.4488077245674206, "learning_rate": 2.5095219376703183e-06, "loss": 0.2668, "step": 4905 }, { "epoch": 2.3440654680126634, "grad_norm": 0.4667442735851483, "learning_rate": 2.5060336971620268e-06, "loss": 0.2774, "step": 4906 }, { "epoch": 2.3445433367182367, "grad_norm": 0.45038553396302394, "learning_rate": 2.5025475353477933e-06, "loss": 0.2595, "step": 4907 }, { "epoch": 2.34502120542381, "grad_norm": 0.45570886034637664, "learning_rate": 2.4990634531946247e-06, "loss": 0.2662, "step": 4908 }, { "epoch": 2.345499074129383, "grad_norm": 0.4532263527919922, "learning_rate": 2.495581451668938e-06, "loss": 0.2712, "step": 4909 }, { "epoch": 2.345976942834956, "grad_norm": 0.5644370739027057, "learning_rate": 2.4921015317365794e-06, "loss": 0.2745, "step": 4910 }, { "epoch": 2.3464548115405295, "grad_norm": 0.45617320912995984, "learning_rate": 2.488623694362822e-06, "loss": 0.254, "step": 4911 }, { "epoch": 2.3469326802461024, "grad_norm": 0.743987369148091, "learning_rate": 2.4851479405123524e-06, "loss": 0.2797, "step": 4912 }, { "epoch": 2.3474105489516757, "grad_norm": 0.4687580573243133, "learning_rate": 2.4816742711492813e-06, "loss": 0.2646, "step": 4913 }, { "epoch": 2.3478884176572485, "grad_norm": 0.4689346718000839, "learning_rate": 2.47820268723715e-06, "loss": 0.2733, "step": 4914 }, { "epoch": 2.348366286362822, "grad_norm": 0.4490192177022896, "learning_rate": 2.4747331897389103e-06, "loss": 0.2665, "step": 4915 }, { "epoch": 2.348844155068395, "grad_norm": 0.49311803151688366, "learning_rate": 2.471265779616938e-06, "loss": 0.2461, "step": 4916 }, { "epoch": 2.349322023773968, "grad_norm": 0.467178488370915, "learning_rate": 2.467800457833034e-06, "loss": 0.2606, "step": 4917 }, { "epoch": 2.3497998924795414, "grad_norm": 0.4596377441591877, "learning_rate": 2.46433722534842e-06, "loss": 0.2535, "step": 4918 }, { "epoch": 2.3502777611851142, "grad_norm": 0.4694260851272706, "learning_rate": 2.460876083123733e-06, "loss": 0.289, "step": 4919 }, { "epoch": 2.3507556298906875, "grad_norm": 0.45269147527252934, "learning_rate": 2.4574170321190305e-06, "loss": 0.2706, "step": 4920 }, { "epoch": 2.351233498596261, "grad_norm": 0.4770873996963581, "learning_rate": 2.4539600732937964e-06, "loss": 0.2492, "step": 4921 }, { "epoch": 2.3517113673018337, "grad_norm": 0.4601878264182159, "learning_rate": 2.450505207606928e-06, "loss": 0.2758, "step": 4922 }, { "epoch": 2.352189236007407, "grad_norm": 0.4716682705387417, "learning_rate": 2.4470524360167413e-06, "loss": 0.2731, "step": 4923 }, { "epoch": 2.35266710471298, "grad_norm": 0.4939845519900784, "learning_rate": 2.4436017594809804e-06, "loss": 0.2924, "step": 4924 }, { "epoch": 2.353144973418553, "grad_norm": 0.4919791373245082, "learning_rate": 2.440153178956798e-06, "loss": 0.2826, "step": 4925 }, { "epoch": 2.3536228421241265, "grad_norm": 0.48187056236329523, "learning_rate": 2.436706695400769e-06, "loss": 0.2796, "step": 4926 }, { "epoch": 2.3541007108296994, "grad_norm": 0.4641311091931838, "learning_rate": 2.43326230976889e-06, "loss": 0.2629, "step": 4927 }, { "epoch": 2.3545785795352727, "grad_norm": 0.4670532060941091, "learning_rate": 2.4298200230165713e-06, "loss": 0.2706, "step": 4928 }, { "epoch": 2.3550564482408456, "grad_norm": 0.4754697383640508, "learning_rate": 2.4263798360986403e-06, "loss": 0.258, "step": 4929 }, { "epoch": 2.355534316946419, "grad_norm": 0.4958072214830247, "learning_rate": 2.42294174996935e-06, "loss": 0.2731, "step": 4930 }, { "epoch": 2.356012185651992, "grad_norm": 0.4688304594840339, "learning_rate": 2.4195057655823596e-06, "loss": 0.2762, "step": 4931 }, { "epoch": 2.356490054357565, "grad_norm": 0.45489501298416457, "learning_rate": 2.4160718838907502e-06, "loss": 0.2682, "step": 4932 }, { "epoch": 2.3569679230631384, "grad_norm": 0.5108041983410803, "learning_rate": 2.412640105847025e-06, "loss": 0.2641, "step": 4933 }, { "epoch": 2.3574457917687117, "grad_norm": 0.48773893586445766, "learning_rate": 2.4092104324030952e-06, "loss": 0.2573, "step": 4934 }, { "epoch": 2.3579236604742846, "grad_norm": 0.48133666696803284, "learning_rate": 2.40578286451029e-06, "loss": 0.2949, "step": 4935 }, { "epoch": 2.358401529179858, "grad_norm": 0.4912715666573927, "learning_rate": 2.4023574031193607e-06, "loss": 0.2742, "step": 4936 }, { "epoch": 2.358879397885431, "grad_norm": 0.4842008813394927, "learning_rate": 2.398934049180468e-06, "loss": 0.2797, "step": 4937 }, { "epoch": 2.359357266591004, "grad_norm": 0.48439100068631125, "learning_rate": 2.395512803643186e-06, "loss": 0.265, "step": 4938 }, { "epoch": 2.3598351352965774, "grad_norm": 0.47578624590536817, "learning_rate": 2.3920936674565155e-06, "loss": 0.2881, "step": 4939 }, { "epoch": 2.3603130040021503, "grad_norm": 0.4437343615163969, "learning_rate": 2.38867664156886e-06, "loss": 0.2797, "step": 4940 }, { "epoch": 2.3607908727077236, "grad_norm": 0.46566862296582356, "learning_rate": 2.38526172692804e-06, "loss": 0.2719, "step": 4941 }, { "epoch": 2.361268741413297, "grad_norm": 0.5051001842897216, "learning_rate": 2.381848924481297e-06, "loss": 0.2664, "step": 4942 }, { "epoch": 2.3617466101188698, "grad_norm": 0.4806571977298347, "learning_rate": 2.378438235175281e-06, "loss": 0.2777, "step": 4943 }, { "epoch": 2.362224478824443, "grad_norm": 0.4537705945452327, "learning_rate": 2.375029659956054e-06, "loss": 0.2692, "step": 4944 }, { "epoch": 2.362702347530016, "grad_norm": 0.4538521904099215, "learning_rate": 2.3716231997691007e-06, "loss": 0.2653, "step": 4945 }, { "epoch": 2.3631802162355893, "grad_norm": 0.44790702234683916, "learning_rate": 2.368218855559309e-06, "loss": 0.2547, "step": 4946 }, { "epoch": 2.3636580849411626, "grad_norm": 0.4411839227865902, "learning_rate": 2.3648166282709806e-06, "loss": 0.2611, "step": 4947 }, { "epoch": 2.3641359536467355, "grad_norm": 0.4512659481124571, "learning_rate": 2.361416518847841e-06, "loss": 0.2614, "step": 4948 }, { "epoch": 2.3646138223523088, "grad_norm": 0.45377164683979093, "learning_rate": 2.3580185282330137e-06, "loss": 0.2686, "step": 4949 }, { "epoch": 2.3650916910578816, "grad_norm": 0.4512279362886925, "learning_rate": 2.3546226573690444e-06, "loss": 0.2661, "step": 4950 }, { "epoch": 2.365569559763455, "grad_norm": 0.4907151137339124, "learning_rate": 2.351228907197891e-06, "loss": 0.2725, "step": 4951 }, { "epoch": 2.3660474284690283, "grad_norm": 0.461594964603084, "learning_rate": 2.347837278660917e-06, "loss": 0.274, "step": 4952 }, { "epoch": 2.366525297174601, "grad_norm": 0.49276481493447555, "learning_rate": 2.3444477726988966e-06, "loss": 0.256, "step": 4953 }, { "epoch": 2.3670031658801745, "grad_norm": 0.5271634745570248, "learning_rate": 2.3410603902520245e-06, "loss": 0.2734, "step": 4954 }, { "epoch": 2.3674810345857473, "grad_norm": 0.5046047617256642, "learning_rate": 2.3376751322599e-06, "loss": 0.2665, "step": 4955 }, { "epoch": 2.3679589032913206, "grad_norm": 0.43935104834974015, "learning_rate": 2.33429199966153e-06, "loss": 0.2542, "step": 4956 }, { "epoch": 2.368436771996894, "grad_norm": 0.47827957926642184, "learning_rate": 2.330910993395341e-06, "loss": 0.2558, "step": 4957 }, { "epoch": 2.368914640702467, "grad_norm": 0.46077109730693455, "learning_rate": 2.3275321143991613e-06, "loss": 0.2665, "step": 4958 }, { "epoch": 2.36939250940804, "grad_norm": 0.4620740363224271, "learning_rate": 2.324155363610231e-06, "loss": 0.281, "step": 4959 }, { "epoch": 2.3698703781136135, "grad_norm": 0.4743650364517856, "learning_rate": 2.320780741965206e-06, "loss": 0.2514, "step": 4960 }, { "epoch": 2.3703482468191863, "grad_norm": 0.48881821904437056, "learning_rate": 2.317408250400144e-06, "loss": 0.2916, "step": 4961 }, { "epoch": 2.3708261155247596, "grad_norm": 0.44779357419192, "learning_rate": 2.3140378898505125e-06, "loss": 0.2567, "step": 4962 }, { "epoch": 2.371303984230333, "grad_norm": 0.4571272492508665, "learning_rate": 2.3106696612511937e-06, "loss": 0.2689, "step": 4963 }, { "epoch": 2.371781852935906, "grad_norm": 0.4680515782108428, "learning_rate": 2.307303565536474e-06, "loss": 0.2632, "step": 4964 }, { "epoch": 2.372259721641479, "grad_norm": 0.50999540376331, "learning_rate": 2.3039396036400463e-06, "loss": 0.2699, "step": 4965 }, { "epoch": 2.372737590347052, "grad_norm": 0.45711754864038767, "learning_rate": 2.3005777764950187e-06, "loss": 0.2672, "step": 4966 }, { "epoch": 2.3732154590526253, "grad_norm": 0.46541233157527173, "learning_rate": 2.2972180850339e-06, "loss": 0.2772, "step": 4967 }, { "epoch": 2.3736933277581986, "grad_norm": 0.45372335105255907, "learning_rate": 2.2938605301886075e-06, "loss": 0.245, "step": 4968 }, { "epoch": 2.3741711964637715, "grad_norm": 0.49762996831577216, "learning_rate": 2.2905051128904733e-06, "loss": 0.2846, "step": 4969 }, { "epoch": 2.374649065169345, "grad_norm": 0.45309610239893056, "learning_rate": 2.287151834070226e-06, "loss": 0.2453, "step": 4970 }, { "epoch": 2.3751269338749177, "grad_norm": 0.45646843129089465, "learning_rate": 2.283800694658006e-06, "loss": 0.2523, "step": 4971 }, { "epoch": 2.375604802580491, "grad_norm": 0.45497585076759617, "learning_rate": 2.2804516955833645e-06, "loss": 0.2636, "step": 4972 }, { "epoch": 2.3760826712860643, "grad_norm": 0.4698293678470528, "learning_rate": 2.2771048377752527e-06, "loss": 0.2792, "step": 4973 }, { "epoch": 2.376560539991637, "grad_norm": 0.4481308379841722, "learning_rate": 2.2737601221620252e-06, "loss": 0.2734, "step": 4974 }, { "epoch": 2.3770384086972105, "grad_norm": 0.47464422001892004, "learning_rate": 2.2704175496714552e-06, "loss": 0.2653, "step": 4975 }, { "epoch": 2.3775162774027834, "grad_norm": 0.48442075929966344, "learning_rate": 2.2670771212307087e-06, "loss": 0.2664, "step": 4976 }, { "epoch": 2.3779941461083567, "grad_norm": 0.46567581192533997, "learning_rate": 2.2637388377663605e-06, "loss": 0.2569, "step": 4977 }, { "epoch": 2.37847201481393, "grad_norm": 0.4640997153073419, "learning_rate": 2.260402700204395e-06, "loss": 0.2691, "step": 4978 }, { "epoch": 2.378949883519503, "grad_norm": 0.4635972785067223, "learning_rate": 2.257068709470197e-06, "loss": 0.2618, "step": 4979 }, { "epoch": 2.379427752225076, "grad_norm": 0.4773585828087478, "learning_rate": 2.2537368664885527e-06, "loss": 0.2754, "step": 4980 }, { "epoch": 2.379905620930649, "grad_norm": 0.439472956742948, "learning_rate": 2.250407172183664e-06, "loss": 0.2664, "step": 4981 }, { "epoch": 2.3803834896362224, "grad_norm": 0.4558716330221375, "learning_rate": 2.247079627479124e-06, "loss": 0.2683, "step": 4982 }, { "epoch": 2.3808613583417957, "grad_norm": 0.4608097977312484, "learning_rate": 2.2437542332979336e-06, "loss": 0.2835, "step": 4983 }, { "epoch": 2.3813392270473686, "grad_norm": 0.4380401331798129, "learning_rate": 2.240430990562501e-06, "loss": 0.2919, "step": 4984 }, { "epoch": 2.381817095752942, "grad_norm": 0.5895395029494848, "learning_rate": 2.2371099001946385e-06, "loss": 0.2624, "step": 4985 }, { "epoch": 2.382294964458515, "grad_norm": 0.46508326131778577, "learning_rate": 2.233790963115554e-06, "loss": 0.2504, "step": 4986 }, { "epoch": 2.382772833164088, "grad_norm": 0.4477530552158064, "learning_rate": 2.2304741802458606e-06, "loss": 0.2661, "step": 4987 }, { "epoch": 2.3832507018696614, "grad_norm": 0.45202080179265264, "learning_rate": 2.22715955250558e-06, "loss": 0.2801, "step": 4988 }, { "epoch": 2.3837285705752347, "grad_norm": 0.46022019801647834, "learning_rate": 2.223847080814129e-06, "loss": 0.2522, "step": 4989 }, { "epoch": 2.3842064392808076, "grad_norm": 0.46337347162678094, "learning_rate": 2.2205367660903267e-06, "loss": 0.2622, "step": 4990 }, { "epoch": 2.384684307986381, "grad_norm": 0.532373751607701, "learning_rate": 2.2172286092523998e-06, "loss": 0.253, "step": 4991 }, { "epoch": 2.3851621766919537, "grad_norm": 0.44622554326728614, "learning_rate": 2.2139226112179713e-06, "loss": 0.2566, "step": 4992 }, { "epoch": 2.385640045397527, "grad_norm": 0.4538870200700637, "learning_rate": 2.210618772904064e-06, "loss": 0.2646, "step": 4993 }, { "epoch": 2.3861179141031004, "grad_norm": 0.4585327419151689, "learning_rate": 2.2073170952271085e-06, "loss": 0.2623, "step": 4994 }, { "epoch": 2.3865957828086732, "grad_norm": 0.46815712443376617, "learning_rate": 2.2040175791029305e-06, "loss": 0.2818, "step": 4995 }, { "epoch": 2.3870736515142466, "grad_norm": 0.5231070665972924, "learning_rate": 2.200720225446755e-06, "loss": 0.2722, "step": 4996 }, { "epoch": 2.3875515202198194, "grad_norm": 0.654443625332581, "learning_rate": 2.197425035173215e-06, "loss": 0.2563, "step": 4997 }, { "epoch": 2.3880293889253927, "grad_norm": 0.45803873330264716, "learning_rate": 2.194132009196335e-06, "loss": 0.2952, "step": 4998 }, { "epoch": 2.388507257630966, "grad_norm": 0.4521097635127778, "learning_rate": 2.19084114842954e-06, "loss": 0.265, "step": 4999 }, { "epoch": 2.388985126336539, "grad_norm": 0.4544732994444673, "learning_rate": 2.187552453785662e-06, "loss": 0.2745, "step": 5000 }, { "epoch": 2.3894629950421122, "grad_norm": 0.4793264716313783, "learning_rate": 2.1842659261769226e-06, "loss": 0.262, "step": 5001 }, { "epoch": 2.389940863747685, "grad_norm": 0.448019751252878, "learning_rate": 2.180981566514947e-06, "loss": 0.2688, "step": 5002 }, { "epoch": 2.3904187324532584, "grad_norm": 0.48536454285303543, "learning_rate": 2.177699375710762e-06, "loss": 0.2623, "step": 5003 }, { "epoch": 2.3908966011588317, "grad_norm": 0.6915355830232118, "learning_rate": 2.174419354674787e-06, "loss": 0.2697, "step": 5004 }, { "epoch": 2.3913744698644046, "grad_norm": 0.47423927727053033, "learning_rate": 2.1711415043168395e-06, "loss": 0.2806, "step": 5005 }, { "epoch": 2.391852338569978, "grad_norm": 0.4699229189597673, "learning_rate": 2.1678658255461427e-06, "loss": 0.2764, "step": 5006 }, { "epoch": 2.3923302072755512, "grad_norm": 0.4366728988564249, "learning_rate": 2.164592319271309e-06, "loss": 0.2626, "step": 5007 }, { "epoch": 2.392808075981124, "grad_norm": 0.4680895354927544, "learning_rate": 2.16132098640035e-06, "loss": 0.2856, "step": 5008 }, { "epoch": 2.3932859446866974, "grad_norm": 0.4831506206251398, "learning_rate": 2.1580518278406793e-06, "loss": 0.2588, "step": 5009 }, { "epoch": 2.3937638133922703, "grad_norm": 0.4699558577692185, "learning_rate": 2.1547848444991025e-06, "loss": 0.2701, "step": 5010 }, { "epoch": 2.3942416820978436, "grad_norm": 0.4489222989978539, "learning_rate": 2.15152003728182e-06, "loss": 0.2834, "step": 5011 }, { "epoch": 2.394719550803417, "grad_norm": 0.47665521913868575, "learning_rate": 2.148257407094436e-06, "loss": 0.2583, "step": 5012 }, { "epoch": 2.39519741950899, "grad_norm": 0.45743260638815375, "learning_rate": 2.1449969548419456e-06, "loss": 0.2677, "step": 5013 }, { "epoch": 2.395675288214563, "grad_norm": 0.45342448226517174, "learning_rate": 2.141738681428738e-06, "loss": 0.2653, "step": 5014 }, { "epoch": 2.3961531569201364, "grad_norm": 0.44238225246261725, "learning_rate": 2.138482587758605e-06, "loss": 0.2823, "step": 5015 }, { "epoch": 2.3966310256257093, "grad_norm": 0.4457574279651664, "learning_rate": 2.1352286747347273e-06, "loss": 0.257, "step": 5016 }, { "epoch": 2.3971088943312826, "grad_norm": 0.4702199309191923, "learning_rate": 2.1319769432596804e-06, "loss": 0.276, "step": 5017 }, { "epoch": 2.3975867630368555, "grad_norm": 0.48532027343768513, "learning_rate": 2.1287273942354393e-06, "loss": 0.2615, "step": 5018 }, { "epoch": 2.398064631742429, "grad_norm": 0.4532406315784221, "learning_rate": 2.125480028563376e-06, "loss": 0.2611, "step": 5019 }, { "epoch": 2.398542500448002, "grad_norm": 0.43656492650267525, "learning_rate": 2.1222348471442477e-06, "loss": 0.2607, "step": 5020 }, { "epoch": 2.399020369153575, "grad_norm": 0.4437139617252678, "learning_rate": 2.118991850878209e-06, "loss": 0.2634, "step": 5021 }, { "epoch": 2.3994982378591483, "grad_norm": 0.4607343235133938, "learning_rate": 2.115751040664815e-06, "loss": 0.2617, "step": 5022 }, { "epoch": 2.399976106564721, "grad_norm": 0.4509401894862981, "learning_rate": 2.1125124174030066e-06, "loss": 0.2649, "step": 5023 }, { "epoch": 2.4004539752702945, "grad_norm": 0.4780514985721637, "learning_rate": 2.1092759819911178e-06, "loss": 0.2587, "step": 5024 }, { "epoch": 2.400931843975868, "grad_norm": 0.47293976576430685, "learning_rate": 2.1060417353268845e-06, "loss": 0.2605, "step": 5025 }, { "epoch": 2.4014097126814407, "grad_norm": 0.4632747242085837, "learning_rate": 2.102809678307427e-06, "loss": 0.2701, "step": 5026 }, { "epoch": 2.401887581387014, "grad_norm": 0.4519610118896511, "learning_rate": 2.0995798118292574e-06, "loss": 0.274, "step": 5027 }, { "epoch": 2.402365450092587, "grad_norm": 0.49007996568127915, "learning_rate": 2.09635213678829e-06, "loss": 0.275, "step": 5028 }, { "epoch": 2.40284331879816, "grad_norm": 0.4482075654858903, "learning_rate": 2.093126654079822e-06, "loss": 0.2724, "step": 5029 }, { "epoch": 2.4033211875037335, "grad_norm": 0.46175751639190643, "learning_rate": 2.0899033645985423e-06, "loss": 0.2734, "step": 5030 }, { "epoch": 2.4037990562093063, "grad_norm": 0.4526383312411215, "learning_rate": 2.0866822692385404e-06, "loss": 0.2845, "step": 5031 }, { "epoch": 2.4042769249148797, "grad_norm": 0.4713547161425226, "learning_rate": 2.083463368893289e-06, "loss": 0.2838, "step": 5032 }, { "epoch": 2.404754793620453, "grad_norm": 0.5388592684225536, "learning_rate": 2.0802466644556507e-06, "loss": 0.2581, "step": 5033 }, { "epoch": 2.405232662326026, "grad_norm": 0.44781970240862895, "learning_rate": 2.0770321568178873e-06, "loss": 0.2552, "step": 5034 }, { "epoch": 2.405710531031599, "grad_norm": 0.45727753815499, "learning_rate": 2.073819846871646e-06, "loss": 0.2784, "step": 5035 }, { "epoch": 2.406188399737172, "grad_norm": 0.4523294487636907, "learning_rate": 2.0706097355079614e-06, "loss": 0.2548, "step": 5036 }, { "epoch": 2.4066662684427453, "grad_norm": 0.4979869120677124, "learning_rate": 2.0674018236172654e-06, "loss": 0.262, "step": 5037 }, { "epoch": 2.4071441371483187, "grad_norm": 0.4670045351404005, "learning_rate": 2.064196112089376e-06, "loss": 0.2836, "step": 5038 }, { "epoch": 2.4076220058538915, "grad_norm": 0.44302766764113355, "learning_rate": 2.0609926018134972e-06, "loss": 0.2544, "step": 5039 }, { "epoch": 2.408099874559465, "grad_norm": 0.4466796302647838, "learning_rate": 2.0577912936782317e-06, "loss": 0.2531, "step": 5040 }, { "epoch": 2.408577743265038, "grad_norm": 0.4662441229505657, "learning_rate": 2.0545921885715624e-06, "loss": 0.2634, "step": 5041 }, { "epoch": 2.409055611970611, "grad_norm": 0.4621440744087701, "learning_rate": 2.051395287380864e-06, "loss": 0.2711, "step": 5042 }, { "epoch": 2.4095334806761843, "grad_norm": 0.4910784013605513, "learning_rate": 2.048200590992904e-06, "loss": 0.2564, "step": 5043 }, { "epoch": 2.410011349381757, "grad_norm": 0.4768608798682245, "learning_rate": 2.0450081002938316e-06, "loss": 0.2643, "step": 5044 }, { "epoch": 2.4104892180873305, "grad_norm": 0.45467660225646006, "learning_rate": 2.041817816169187e-06, "loss": 0.2567, "step": 5045 }, { "epoch": 2.410967086792904, "grad_norm": 0.4570097506050878, "learning_rate": 2.0386297395039023e-06, "loss": 0.2681, "step": 5046 }, { "epoch": 2.4114449554984767, "grad_norm": 0.4718064608424706, "learning_rate": 2.035443871182292e-06, "loss": 0.2523, "step": 5047 }, { "epoch": 2.41192282420405, "grad_norm": 0.510560238857416, "learning_rate": 2.0322602120880576e-06, "loss": 0.2596, "step": 5048 }, { "epoch": 2.412400692909623, "grad_norm": 0.45874298526457225, "learning_rate": 2.0290787631042942e-06, "loss": 0.2654, "step": 5049 }, { "epoch": 2.412878561615196, "grad_norm": 0.4702898124625645, "learning_rate": 2.025899525113474e-06, "loss": 0.2607, "step": 5050 }, { "epoch": 2.4133564303207695, "grad_norm": 0.4419385668349205, "learning_rate": 2.022722498997465e-06, "loss": 0.28, "step": 5051 }, { "epoch": 2.4138342990263424, "grad_norm": 0.49832278835296895, "learning_rate": 2.0195476856375206e-06, "loss": 0.2722, "step": 5052 }, { "epoch": 2.4143121677319157, "grad_norm": 0.4453378413752286, "learning_rate": 2.016375085914275e-06, "loss": 0.2631, "step": 5053 }, { "epoch": 2.4147900364374886, "grad_norm": 0.4464873807438464, "learning_rate": 2.0132047007077504e-06, "loss": 0.2617, "step": 5054 }, { "epoch": 2.415267905143062, "grad_norm": 0.43769077224069114, "learning_rate": 2.010036530897359e-06, "loss": 0.2523, "step": 5055 }, { "epoch": 2.415745773848635, "grad_norm": 0.44478509964099755, "learning_rate": 2.0068705773618937e-06, "loss": 0.2494, "step": 5056 }, { "epoch": 2.416223642554208, "grad_norm": 0.4558953495461828, "learning_rate": 2.003706840979531e-06, "loss": 0.2716, "step": 5057 }, { "epoch": 2.4167015112597814, "grad_norm": 0.7657464247162521, "learning_rate": 2.0005453226278403e-06, "loss": 0.2683, "step": 5058 }, { "epoch": 2.4171793799653547, "grad_norm": 0.4719546599335128, "learning_rate": 1.9973860231837705e-06, "loss": 0.256, "step": 5059 }, { "epoch": 2.4176572486709276, "grad_norm": 0.46006978419118905, "learning_rate": 1.9942289435236506e-06, "loss": 0.2772, "step": 5060 }, { "epoch": 2.418135117376501, "grad_norm": 0.5094940178312064, "learning_rate": 1.9910740845232058e-06, "loss": 0.2713, "step": 5061 }, { "epoch": 2.4186129860820738, "grad_norm": 0.46698646284000606, "learning_rate": 1.9879214470575347e-06, "loss": 0.274, "step": 5062 }, { "epoch": 2.419090854787647, "grad_norm": 0.4589782752776786, "learning_rate": 1.9847710320011206e-06, "loss": 0.2499, "step": 5063 }, { "epoch": 2.4195687234932204, "grad_norm": 0.469003199954616, "learning_rate": 1.9816228402278392e-06, "loss": 0.2681, "step": 5064 }, { "epoch": 2.4200465921987933, "grad_norm": 0.4481176604472585, "learning_rate": 1.978476872610939e-06, "loss": 0.2577, "step": 5065 }, { "epoch": 2.4205244609043666, "grad_norm": 0.4659618384134064, "learning_rate": 1.975333130023056e-06, "loss": 0.2647, "step": 5066 }, { "epoch": 2.42100232960994, "grad_norm": 0.4828500059187709, "learning_rate": 1.972191613336212e-06, "loss": 0.2622, "step": 5067 }, { "epoch": 2.4214801983155128, "grad_norm": 0.4551712653015207, "learning_rate": 1.969052323421806e-06, "loss": 0.2661, "step": 5068 }, { "epoch": 2.421958067021086, "grad_norm": 0.46637572776776237, "learning_rate": 1.9659152611506193e-06, "loss": 0.2722, "step": 5069 }, { "epoch": 2.422435935726659, "grad_norm": 0.4486269710544213, "learning_rate": 1.962780427392823e-06, "loss": 0.2742, "step": 5070 }, { "epoch": 2.4229138044322323, "grad_norm": 0.4481995215441604, "learning_rate": 1.959647823017963e-06, "loss": 0.2709, "step": 5071 }, { "epoch": 2.4233916731378056, "grad_norm": 0.4637020847420594, "learning_rate": 1.9565174488949636e-06, "loss": 0.268, "step": 5072 }, { "epoch": 2.4238695418433784, "grad_norm": 0.4522661575040067, "learning_rate": 1.953389305892143e-06, "loss": 0.2714, "step": 5073 }, { "epoch": 2.4243474105489518, "grad_norm": 0.5188133426193942, "learning_rate": 1.9502633948771888e-06, "loss": 0.2682, "step": 5074 }, { "epoch": 2.4248252792545246, "grad_norm": 0.4570248324569066, "learning_rate": 1.9471397167171714e-06, "loss": 0.2652, "step": 5075 }, { "epoch": 2.425303147960098, "grad_norm": 0.459560178829993, "learning_rate": 1.944018272278548e-06, "loss": 0.2511, "step": 5076 }, { "epoch": 2.4257810166656713, "grad_norm": 0.466238298627355, "learning_rate": 1.9408990624271516e-06, "loss": 0.2687, "step": 5077 }, { "epoch": 2.426258885371244, "grad_norm": 0.49223929370166464, "learning_rate": 1.9377820880281928e-06, "loss": 0.2739, "step": 5078 }, { "epoch": 2.4267367540768174, "grad_norm": 0.46613653698623686, "learning_rate": 1.934667349946271e-06, "loss": 0.2676, "step": 5079 }, { "epoch": 2.4272146227823903, "grad_norm": 0.44196513176630936, "learning_rate": 1.931554849045355e-06, "loss": 0.2662, "step": 5080 }, { "epoch": 2.4276924914879636, "grad_norm": 0.45591063669478454, "learning_rate": 1.9284445861887966e-06, "loss": 0.2799, "step": 5081 }, { "epoch": 2.428170360193537, "grad_norm": 0.47960546577577334, "learning_rate": 1.9253365622393337e-06, "loss": 0.2612, "step": 5082 }, { "epoch": 2.42864822889911, "grad_norm": 0.49874783389566263, "learning_rate": 1.9222307780590734e-06, "loss": 0.259, "step": 5083 }, { "epoch": 2.429126097604683, "grad_norm": 0.48649273799607196, "learning_rate": 1.9191272345095025e-06, "loss": 0.2626, "step": 5084 }, { "epoch": 2.4296039663102564, "grad_norm": 0.46265212095706404, "learning_rate": 1.916025932451493e-06, "loss": 0.2673, "step": 5085 }, { "epoch": 2.4300818350158293, "grad_norm": 0.44902330400044727, "learning_rate": 1.912926872745294e-06, "loss": 0.2641, "step": 5086 }, { "epoch": 2.4305597037214026, "grad_norm": 0.48237243869527174, "learning_rate": 1.9098300562505266e-06, "loss": 0.2677, "step": 5087 }, { "epoch": 2.431037572426976, "grad_norm": 0.47922262239457364, "learning_rate": 1.9067354838261908e-06, "loss": 0.239, "step": 5088 }, { "epoch": 2.431515441132549, "grad_norm": 0.4484227748244973, "learning_rate": 1.9036431563306723e-06, "loss": 0.2676, "step": 5089 }, { "epoch": 2.431993309838122, "grad_norm": 0.4481486490497434, "learning_rate": 1.9005530746217238e-06, "loss": 0.2612, "step": 5090 }, { "epoch": 2.432471178543695, "grad_norm": 0.4751868893945265, "learning_rate": 1.8974652395564785e-06, "loss": 0.2658, "step": 5091 }, { "epoch": 2.4329490472492683, "grad_norm": 0.4910217856403449, "learning_rate": 1.8943796519914525e-06, "loss": 0.2632, "step": 5092 }, { "epoch": 2.4334269159548416, "grad_norm": 0.4671224266607131, "learning_rate": 1.89129631278253e-06, "loss": 0.2804, "step": 5093 }, { "epoch": 2.4339047846604145, "grad_norm": 0.5193462465411075, "learning_rate": 1.8882152227849727e-06, "loss": 0.2845, "step": 5094 }, { "epoch": 2.434382653365988, "grad_norm": 0.5153634268527411, "learning_rate": 1.8851363828534253e-06, "loss": 0.263, "step": 5095 }, { "epoch": 2.4348605220715607, "grad_norm": 0.45007658573232884, "learning_rate": 1.8820597938419028e-06, "loss": 0.2653, "step": 5096 }, { "epoch": 2.435338390777134, "grad_norm": 0.4708101712892127, "learning_rate": 1.8789854566037912e-06, "loss": 0.2681, "step": 5097 }, { "epoch": 2.4358162594827073, "grad_norm": 0.466359009946716, "learning_rate": 1.8759133719918654e-06, "loss": 0.2683, "step": 5098 }, { "epoch": 2.43629412818828, "grad_norm": 0.5260130493552598, "learning_rate": 1.8728435408582634e-06, "loss": 0.2664, "step": 5099 }, { "epoch": 2.4367719968938535, "grad_norm": 0.4471972729113941, "learning_rate": 1.869775964054501e-06, "loss": 0.2667, "step": 5100 }, { "epoch": 2.4372498655994264, "grad_norm": 0.4605895897583309, "learning_rate": 1.866710642431473e-06, "loss": 0.2794, "step": 5101 }, { "epoch": 2.4377277343049997, "grad_norm": 0.49147254180946887, "learning_rate": 1.8636475768394446e-06, "loss": 0.2812, "step": 5102 }, { "epoch": 2.438205603010573, "grad_norm": 0.44497897456741325, "learning_rate": 1.860586768128052e-06, "loss": 0.2679, "step": 5103 }, { "epoch": 2.438683471716146, "grad_norm": 0.45664419543781765, "learning_rate": 1.857528217146317e-06, "loss": 0.2631, "step": 5104 }, { "epoch": 2.439161340421719, "grad_norm": 0.463599939436042, "learning_rate": 1.8544719247426224e-06, "loss": 0.2664, "step": 5105 }, { "epoch": 2.439639209127292, "grad_norm": 0.5020393530396908, "learning_rate": 1.8514178917647297e-06, "loss": 0.2751, "step": 5106 }, { "epoch": 2.4401170778328654, "grad_norm": 0.6334592929322761, "learning_rate": 1.8483661190597778e-06, "loss": 0.273, "step": 5107 }, { "epoch": 2.4405949465384387, "grad_norm": 0.44949715968087817, "learning_rate": 1.8453166074742723e-06, "loss": 0.2626, "step": 5108 }, { "epoch": 2.4410728152440115, "grad_norm": 0.4601875206689804, "learning_rate": 1.8422693578540907e-06, "loss": 0.2524, "step": 5109 }, { "epoch": 2.441550683949585, "grad_norm": 0.4432035611183387, "learning_rate": 1.8392243710444911e-06, "loss": 0.2568, "step": 5110 }, { "epoch": 2.442028552655158, "grad_norm": 0.4420828252613538, "learning_rate": 1.8361816478900986e-06, "loss": 0.2687, "step": 5111 }, { "epoch": 2.442506421360731, "grad_norm": 0.4698539634957639, "learning_rate": 1.833141189234907e-06, "loss": 0.2556, "step": 5112 }, { "epoch": 2.4429842900663044, "grad_norm": 0.4454077363137819, "learning_rate": 1.8301029959222916e-06, "loss": 0.2644, "step": 5113 }, { "epoch": 2.4434621587718777, "grad_norm": 0.4694638175516713, "learning_rate": 1.8270670687949898e-06, "loss": 0.2725, "step": 5114 }, { "epoch": 2.4439400274774505, "grad_norm": 0.4608350257915752, "learning_rate": 1.8240334086951117e-06, "loss": 0.2696, "step": 5115 }, { "epoch": 2.444417896183024, "grad_norm": 0.44781409506792363, "learning_rate": 1.8210020164641483e-06, "loss": 0.2716, "step": 5116 }, { "epoch": 2.4448957648885967, "grad_norm": 0.4969053175588786, "learning_rate": 1.8179728929429507e-06, "loss": 0.2643, "step": 5117 }, { "epoch": 2.44537363359417, "grad_norm": 0.4559222255934472, "learning_rate": 1.814946038971741e-06, "loss": 0.2713, "step": 5118 }, { "epoch": 2.4458515022997434, "grad_norm": 0.44970009381193676, "learning_rate": 1.8119214553901177e-06, "loss": 0.2622, "step": 5119 }, { "epoch": 2.4463293710053162, "grad_norm": 0.4559369781065068, "learning_rate": 1.8088991430370506e-06, "loss": 0.2666, "step": 5120 }, { "epoch": 2.4468072397108895, "grad_norm": 0.45234671729475956, "learning_rate": 1.8058791027508726e-06, "loss": 0.2686, "step": 5121 }, { "epoch": 2.4472851084164624, "grad_norm": 0.44137882355941305, "learning_rate": 1.802861335369287e-06, "loss": 0.2779, "step": 5122 }, { "epoch": 2.4477629771220357, "grad_norm": 0.48440328157222995, "learning_rate": 1.799845841729375e-06, "loss": 0.283, "step": 5123 }, { "epoch": 2.448240845827609, "grad_norm": 0.494486208129345, "learning_rate": 1.796832622667578e-06, "loss": 0.2656, "step": 5124 }, { "epoch": 2.448718714533182, "grad_norm": 0.47590323638368665, "learning_rate": 1.7938216790197071e-06, "loss": 0.2497, "step": 5125 }, { "epoch": 2.449196583238755, "grad_norm": 0.449129199222899, "learning_rate": 1.790813011620951e-06, "loss": 0.2654, "step": 5126 }, { "epoch": 2.449674451944328, "grad_norm": 0.4626004740804751, "learning_rate": 1.7878066213058575e-06, "loss": 0.2708, "step": 5127 }, { "epoch": 2.4501523206499014, "grad_norm": 0.4702626534514859, "learning_rate": 1.7848025089083442e-06, "loss": 0.2782, "step": 5128 }, { "epoch": 2.4506301893554747, "grad_norm": 0.44737255344314847, "learning_rate": 1.7818006752617034e-06, "loss": 0.2791, "step": 5129 }, { "epoch": 2.4511080580610476, "grad_norm": 0.454394369273916, "learning_rate": 1.77880112119859e-06, "loss": 0.2579, "step": 5130 }, { "epoch": 2.451585926766621, "grad_norm": 0.4418070881214252, "learning_rate": 1.7758038475510232e-06, "loss": 0.2703, "step": 5131 }, { "epoch": 2.4520637954721938, "grad_norm": 0.4336894058264998, "learning_rate": 1.7728088551503986e-06, "loss": 0.2688, "step": 5132 }, { "epoch": 2.452541664177767, "grad_norm": 0.46001029127787707, "learning_rate": 1.769816144827472e-06, "loss": 0.2532, "step": 5133 }, { "epoch": 2.4530195328833404, "grad_norm": 0.4507732662463414, "learning_rate": 1.7668257174123672e-06, "loss": 0.2725, "step": 5134 }, { "epoch": 2.4534974015889133, "grad_norm": 0.5054107742226862, "learning_rate": 1.7638375737345804e-06, "loss": 0.2612, "step": 5135 }, { "epoch": 2.4539752702944866, "grad_norm": 0.5719525646703848, "learning_rate": 1.7608517146229677e-06, "loss": 0.2622, "step": 5136 }, { "epoch": 2.45445313900006, "grad_norm": 0.45263029420038425, "learning_rate": 1.7578681409057497e-06, "loss": 0.2571, "step": 5137 }, { "epoch": 2.4549310077056328, "grad_norm": 0.4469493461520194, "learning_rate": 1.7548868534105234e-06, "loss": 0.2795, "step": 5138 }, { "epoch": 2.455408876411206, "grad_norm": 0.47460938094773264, "learning_rate": 1.751907852964243e-06, "loss": 0.2535, "step": 5139 }, { "epoch": 2.4558867451167794, "grad_norm": 0.43572000742600314, "learning_rate": 1.7489311403932274e-06, "loss": 0.2616, "step": 5140 }, { "epoch": 2.4563646138223523, "grad_norm": 0.49154307546187714, "learning_rate": 1.7459567165231695e-06, "loss": 0.2679, "step": 5141 }, { "epoch": 2.4568424825279256, "grad_norm": 0.43369075353473246, "learning_rate": 1.7429845821791202e-06, "loss": 0.2641, "step": 5142 }, { "epoch": 2.4573203512334985, "grad_norm": 0.44569959028444417, "learning_rate": 1.7400147381854936e-06, "loss": 0.2556, "step": 5143 }, { "epoch": 2.4577982199390718, "grad_norm": 0.44301724734443204, "learning_rate": 1.7370471853660775e-06, "loss": 0.2594, "step": 5144 }, { "epoch": 2.458276088644645, "grad_norm": 0.4331333804341633, "learning_rate": 1.7340819245440166e-06, "loss": 0.2698, "step": 5145 }, { "epoch": 2.458753957350218, "grad_norm": 0.44930485499724493, "learning_rate": 1.7311189565418186e-06, "loss": 0.2694, "step": 5146 }, { "epoch": 2.4592318260557913, "grad_norm": 0.4431936634915009, "learning_rate": 1.7281582821813647e-06, "loss": 0.2539, "step": 5147 }, { "epoch": 2.459709694761364, "grad_norm": 0.4801540861517632, "learning_rate": 1.7251999022838895e-06, "loss": 0.26, "step": 5148 }, { "epoch": 2.4601875634669375, "grad_norm": 0.44046656799275724, "learning_rate": 1.722243817669994e-06, "loss": 0.2602, "step": 5149 }, { "epoch": 2.4606654321725108, "grad_norm": 0.4569712842143844, "learning_rate": 1.7192900291596493e-06, "loss": 0.2677, "step": 5150 }, { "epoch": 2.4611433008780836, "grad_norm": 0.5414744354026383, "learning_rate": 1.7163385375721819e-06, "loss": 0.2657, "step": 5151 }, { "epoch": 2.461621169583657, "grad_norm": 0.5030301613387165, "learning_rate": 1.7133893437262771e-06, "loss": 0.2694, "step": 5152 }, { "epoch": 2.46209903828923, "grad_norm": 0.46022742529581706, "learning_rate": 1.7104424484400006e-06, "loss": 0.2657, "step": 5153 }, { "epoch": 2.462576906994803, "grad_norm": 0.4480168925969945, "learning_rate": 1.707497852530763e-06, "loss": 0.2678, "step": 5154 }, { "epoch": 2.4630547757003765, "grad_norm": 0.45217430934005776, "learning_rate": 1.7045555568153415e-06, "loss": 0.2573, "step": 5155 }, { "epoch": 2.4635326444059493, "grad_norm": 0.4771262260653937, "learning_rate": 1.7016155621098818e-06, "loss": 0.2663, "step": 5156 }, { "epoch": 2.4640105131115226, "grad_norm": 0.43652614153589214, "learning_rate": 1.6986778692298843e-06, "loss": 0.257, "step": 5157 }, { "epoch": 2.4644883818170955, "grad_norm": 0.45508857688246773, "learning_rate": 1.695742478990211e-06, "loss": 0.2625, "step": 5158 }, { "epoch": 2.464966250522669, "grad_norm": 0.44398221371433727, "learning_rate": 1.6928093922050913e-06, "loss": 0.259, "step": 5159 }, { "epoch": 2.465444119228242, "grad_norm": 0.46877325097045486, "learning_rate": 1.6898786096881104e-06, "loss": 0.2515, "step": 5160 }, { "epoch": 2.465921987933815, "grad_norm": 0.47517976840848003, "learning_rate": 1.686950132252213e-06, "loss": 0.2668, "step": 5161 }, { "epoch": 2.4663998566393883, "grad_norm": 0.47013178542557804, "learning_rate": 1.6840239607097109e-06, "loss": 0.273, "step": 5162 }, { "epoch": 2.4668777253449616, "grad_norm": 0.6731321265086605, "learning_rate": 1.6811000958722713e-06, "loss": 0.2653, "step": 5163 }, { "epoch": 2.4673555940505345, "grad_norm": 0.500612444710717, "learning_rate": 1.6781785385509197e-06, "loss": 0.2553, "step": 5164 }, { "epoch": 2.467833462756108, "grad_norm": 0.44968808398968174, "learning_rate": 1.6752592895560493e-06, "loss": 0.2651, "step": 5165 }, { "epoch": 2.468311331461681, "grad_norm": 0.4625732004203681, "learning_rate": 1.6723423496974057e-06, "loss": 0.2808, "step": 5166 }, { "epoch": 2.468789200167254, "grad_norm": 0.4489275940956645, "learning_rate": 1.6694277197840947e-06, "loss": 0.2628, "step": 5167 }, { "epoch": 2.4692670688728273, "grad_norm": 0.535652789404287, "learning_rate": 1.6665154006245888e-06, "loss": 0.272, "step": 5168 }, { "epoch": 2.4697449375784, "grad_norm": 0.47206952227811455, "learning_rate": 1.6636053930267093e-06, "loss": 0.2618, "step": 5169 }, { "epoch": 2.4702228062839735, "grad_norm": 0.4583292667257452, "learning_rate": 1.6606976977976408e-06, "loss": 0.2575, "step": 5170 }, { "epoch": 2.470700674989547, "grad_norm": 0.4784423605188315, "learning_rate": 1.6577923157439302e-06, "loss": 0.2642, "step": 5171 }, { "epoch": 2.4711785436951197, "grad_norm": 0.4666671362911929, "learning_rate": 1.6548892476714772e-06, "loss": 0.2663, "step": 5172 }, { "epoch": 2.471656412400693, "grad_norm": 0.4821043795677321, "learning_rate": 1.6519884943855391e-06, "loss": 0.2718, "step": 5173 }, { "epoch": 2.472134281106266, "grad_norm": 0.43488458866311225, "learning_rate": 1.6490900566907396e-06, "loss": 0.2441, "step": 5174 }, { "epoch": 2.472612149811839, "grad_norm": 0.4670451983441338, "learning_rate": 1.6461939353910494e-06, "loss": 0.2845, "step": 5175 }, { "epoch": 2.4730900185174125, "grad_norm": 0.45536226914155875, "learning_rate": 1.643300131289801e-06, "loss": 0.2494, "step": 5176 }, { "epoch": 2.4735678872229854, "grad_norm": 0.4517133543283046, "learning_rate": 1.6404086451896896e-06, "loss": 0.2609, "step": 5177 }, { "epoch": 2.4740457559285587, "grad_norm": 0.453282470130923, "learning_rate": 1.6375194778927594e-06, "loss": 0.274, "step": 5178 }, { "epoch": 2.4745236246341316, "grad_norm": 0.4758223874514832, "learning_rate": 1.6346326302004111e-06, "loss": 0.2682, "step": 5179 }, { "epoch": 2.475001493339705, "grad_norm": 0.4522472979675584, "learning_rate": 1.631748102913412e-06, "loss": 0.2838, "step": 5180 }, { "epoch": 2.475479362045278, "grad_norm": 0.4389102055107512, "learning_rate": 1.6288658968318748e-06, "loss": 0.2756, "step": 5181 }, { "epoch": 2.475957230750851, "grad_norm": 0.4561620443865254, "learning_rate": 1.6259860127552718e-06, "loss": 0.2704, "step": 5182 }, { "epoch": 2.4764350994564244, "grad_norm": 0.46070242911962683, "learning_rate": 1.6231084514824357e-06, "loss": 0.248, "step": 5183 }, { "epoch": 2.4769129681619972, "grad_norm": 0.4500952574278607, "learning_rate": 1.6202332138115495e-06, "loss": 0.2714, "step": 5184 }, { "epoch": 2.4773908368675706, "grad_norm": 0.44423358778364996, "learning_rate": 1.6173603005401505e-06, "loss": 0.2694, "step": 5185 }, { "epoch": 2.477868705573144, "grad_norm": 0.4463179849081536, "learning_rate": 1.614489712465137e-06, "loss": 0.27, "step": 5186 }, { "epoch": 2.4783465742787167, "grad_norm": 0.4983501684463973, "learning_rate": 1.6116214503827632e-06, "loss": 0.2621, "step": 5187 }, { "epoch": 2.47882444298429, "grad_norm": 0.443262010521369, "learning_rate": 1.6087555150886291e-06, "loss": 0.2722, "step": 5188 }, { "epoch": 2.4793023116898634, "grad_norm": 0.4558884591228606, "learning_rate": 1.6058919073776936e-06, "loss": 0.2629, "step": 5189 }, { "epoch": 2.4797801803954362, "grad_norm": 0.4495529234567257, "learning_rate": 1.6030306280442764e-06, "loss": 0.2516, "step": 5190 }, { "epoch": 2.4802580491010096, "grad_norm": 0.4572305669305225, "learning_rate": 1.6001716778820432e-06, "loss": 0.2662, "step": 5191 }, { "epoch": 2.480735917806583, "grad_norm": 0.4459974662375841, "learning_rate": 1.5973150576840134e-06, "loss": 0.2657, "step": 5192 }, { "epoch": 2.4812137865121557, "grad_norm": 0.5110956161774068, "learning_rate": 1.5944607682425684e-06, "loss": 0.2771, "step": 5193 }, { "epoch": 2.481691655217729, "grad_norm": 0.5018127615583312, "learning_rate": 1.5916088103494353e-06, "loss": 0.2706, "step": 5194 }, { "epoch": 2.482169523923302, "grad_norm": 0.46765637570821916, "learning_rate": 1.588759184795694e-06, "loss": 0.2681, "step": 5195 }, { "epoch": 2.4826473926288752, "grad_norm": 0.4455104728164743, "learning_rate": 1.5859118923717853e-06, "loss": 0.2543, "step": 5196 }, { "epoch": 2.4831252613344486, "grad_norm": 0.4943599831926817, "learning_rate": 1.5830669338674953e-06, "loss": 0.2632, "step": 5197 }, { "epoch": 2.4836031300400214, "grad_norm": 0.45460304300287496, "learning_rate": 1.580224310071964e-06, "loss": 0.24, "step": 5198 }, { "epoch": 2.4840809987455947, "grad_norm": 0.4470853999975232, "learning_rate": 1.577384021773689e-06, "loss": 0.2559, "step": 5199 }, { "epoch": 2.4845588674511676, "grad_norm": 0.4435833286013539, "learning_rate": 1.574546069760514e-06, "loss": 0.2511, "step": 5200 }, { "epoch": 2.485036736156741, "grad_norm": 0.4342162740665016, "learning_rate": 1.571710454819635e-06, "loss": 0.2632, "step": 5201 }, { "epoch": 2.4855146048623142, "grad_norm": 0.47609127386587247, "learning_rate": 1.5688771777376044e-06, "loss": 0.2887, "step": 5202 }, { "epoch": 2.485992473567887, "grad_norm": 0.4763885822985041, "learning_rate": 1.5660462393003228e-06, "loss": 0.283, "step": 5203 }, { "epoch": 2.4864703422734604, "grad_norm": 0.45641941404129543, "learning_rate": 1.56321764029304e-06, "loss": 0.2764, "step": 5204 }, { "epoch": 2.4869482109790333, "grad_norm": 0.45134971688590414, "learning_rate": 1.5603913815003634e-06, "loss": 0.2781, "step": 5205 }, { "epoch": 2.4874260796846066, "grad_norm": 0.48230919161187935, "learning_rate": 1.5575674637062465e-06, "loss": 0.2715, "step": 5206 }, { "epoch": 2.48790394839018, "grad_norm": 0.4498323903981208, "learning_rate": 1.5547458876939902e-06, "loss": 0.2675, "step": 5207 }, { "epoch": 2.488381817095753, "grad_norm": 0.5874324453501406, "learning_rate": 1.5519266542462552e-06, "loss": 0.2554, "step": 5208 }, { "epoch": 2.488859685801326, "grad_norm": 0.4541224538296049, "learning_rate": 1.5491097641450448e-06, "loss": 0.2506, "step": 5209 }, { "epoch": 2.4893375545068994, "grad_norm": 0.4646704460437421, "learning_rate": 1.5462952181717117e-06, "loss": 0.267, "step": 5210 }, { "epoch": 2.4898154232124723, "grad_norm": 0.462874473285813, "learning_rate": 1.543483017106967e-06, "loss": 0.2682, "step": 5211 }, { "epoch": 2.4902932919180456, "grad_norm": 0.4602131270975226, "learning_rate": 1.5406731617308635e-06, "loss": 0.2705, "step": 5212 }, { "epoch": 2.4907711606236185, "grad_norm": 0.48190334430500326, "learning_rate": 1.5378656528228032e-06, "loss": 0.2515, "step": 5213 }, { "epoch": 2.491249029329192, "grad_norm": 0.4591216279627935, "learning_rate": 1.535060491161542e-06, "loss": 0.2675, "step": 5214 }, { "epoch": 2.491726898034765, "grad_norm": 0.4619766580382836, "learning_rate": 1.532257677525183e-06, "loss": 0.265, "step": 5215 }, { "epoch": 2.492204766740338, "grad_norm": 0.46916882001283083, "learning_rate": 1.5294572126911723e-06, "loss": 0.2526, "step": 5216 }, { "epoch": 2.4926826354459113, "grad_norm": 0.4622413318042114, "learning_rate": 1.526659097436316e-06, "loss": 0.2688, "step": 5217 }, { "epoch": 2.4931605041514846, "grad_norm": 0.4593352758045286, "learning_rate": 1.5238633325367592e-06, "loss": 0.2721, "step": 5218 }, { "epoch": 2.4936383728570575, "grad_norm": 0.4453905535036544, "learning_rate": 1.5210699187679945e-06, "loss": 0.2707, "step": 5219 }, { "epoch": 2.494116241562631, "grad_norm": 0.4360154512554449, "learning_rate": 1.5182788569048689e-06, "loss": 0.2547, "step": 5220 }, { "epoch": 2.4945941102682037, "grad_norm": 0.4673303368639725, "learning_rate": 1.5154901477215756e-06, "loss": 0.2697, "step": 5221 }, { "epoch": 2.495071978973777, "grad_norm": 0.9738069702656994, "learning_rate": 1.512703791991651e-06, "loss": 0.2795, "step": 5222 }, { "epoch": 2.4955498476793503, "grad_norm": 0.47649155832990264, "learning_rate": 1.5099197904879792e-06, "loss": 0.2777, "step": 5223 }, { "epoch": 2.496027716384923, "grad_norm": 0.454990764062367, "learning_rate": 1.507138143982797e-06, "loss": 0.252, "step": 5224 }, { "epoch": 2.4965055850904965, "grad_norm": 0.4352195318005461, "learning_rate": 1.5043588532476827e-06, "loss": 0.2558, "step": 5225 }, { "epoch": 2.4969834537960693, "grad_norm": 0.4444748687649701, "learning_rate": 1.5015819190535586e-06, "loss": 0.2738, "step": 5226 }, { "epoch": 2.4974613225016427, "grad_norm": 0.45483690731958265, "learning_rate": 1.498807342170704e-06, "loss": 0.2872, "step": 5227 }, { "epoch": 2.497939191207216, "grad_norm": 0.4561626305870393, "learning_rate": 1.4960351233687342e-06, "loss": 0.2676, "step": 5228 }, { "epoch": 2.498417059912789, "grad_norm": 0.5047010790116747, "learning_rate": 1.493265263416611e-06, "loss": 0.2802, "step": 5229 }, { "epoch": 2.498894928618362, "grad_norm": 0.43638203904528666, "learning_rate": 1.49049776308265e-06, "loss": 0.264, "step": 5230 }, { "epoch": 2.499372797323935, "grad_norm": 0.4343364990309924, "learning_rate": 1.4877326231345046e-06, "loss": 0.2681, "step": 5231 }, { "epoch": 2.4998506660295083, "grad_norm": 0.8690286716299779, "learning_rate": 1.4849698443391724e-06, "loss": 0.2644, "step": 5232 }, { "epoch": 2.5003285347350817, "grad_norm": 0.47147128875554356, "learning_rate": 1.4822094274630062e-06, "loss": 0.2457, "step": 5233 }, { "epoch": 2.5008064034406545, "grad_norm": 0.4673984226297333, "learning_rate": 1.479451373271693e-06, "loss": 0.2765, "step": 5234 }, { "epoch": 2.501284272146228, "grad_norm": 0.4542815587395272, "learning_rate": 1.476695682530268e-06, "loss": 0.2895, "step": 5235 }, { "epoch": 2.5017621408518007, "grad_norm": 0.4454228012735007, "learning_rate": 1.473942356003113e-06, "loss": 0.2511, "step": 5236 }, { "epoch": 2.502240009557374, "grad_norm": 0.47387095366988674, "learning_rate": 1.4711913944539524e-06, "loss": 0.2591, "step": 5237 }, { "epoch": 2.5027178782629473, "grad_norm": 0.44223677795487265, "learning_rate": 1.4684427986458506e-06, "loss": 0.2741, "step": 5238 }, { "epoch": 2.5031957469685207, "grad_norm": 0.4744918967505802, "learning_rate": 1.465696569341224e-06, "loss": 0.2664, "step": 5239 }, { "epoch": 2.5036736156740935, "grad_norm": 0.4452348551981471, "learning_rate": 1.4629527073018267e-06, "loss": 0.2449, "step": 5240 }, { "epoch": 2.504151484379667, "grad_norm": 0.449269605401987, "learning_rate": 1.4602112132887558e-06, "loss": 0.2602, "step": 5241 }, { "epoch": 2.5046293530852397, "grad_norm": 0.4633053489713711, "learning_rate": 1.457472088062457e-06, "loss": 0.2636, "step": 5242 }, { "epoch": 2.505107221790813, "grad_norm": 0.4873950617138543, "learning_rate": 1.4547353323827141e-06, "loss": 0.2725, "step": 5243 }, { "epoch": 2.5055850904963863, "grad_norm": 0.47479818038514116, "learning_rate": 1.4520009470086505e-06, "loss": 0.2502, "step": 5244 }, { "epoch": 2.506062959201959, "grad_norm": 0.4685977267735002, "learning_rate": 1.449268932698743e-06, "loss": 0.2608, "step": 5245 }, { "epoch": 2.5065408279075325, "grad_norm": 0.4536044909575744, "learning_rate": 1.4465392902108011e-06, "loss": 0.2439, "step": 5246 }, { "epoch": 2.5070186966131054, "grad_norm": 0.4361436758103319, "learning_rate": 1.4438120203019779e-06, "loss": 0.26, "step": 5247 }, { "epoch": 2.5074965653186787, "grad_norm": 0.47056425580013644, "learning_rate": 1.4410871237287738e-06, "loss": 0.2575, "step": 5248 }, { "epoch": 2.507974434024252, "grad_norm": 0.48001156924264604, "learning_rate": 1.4383646012470254e-06, "loss": 0.2782, "step": 5249 }, { "epoch": 2.508452302729825, "grad_norm": 0.4657177084103505, "learning_rate": 1.4356444536119085e-06, "loss": 0.2592, "step": 5250 }, { "epoch": 2.508930171435398, "grad_norm": 0.4329970043724182, "learning_rate": 1.4329266815779507e-06, "loss": 0.2594, "step": 5251 }, { "epoch": 2.509408040140971, "grad_norm": 0.4624450792688428, "learning_rate": 1.4302112858990103e-06, "loss": 0.2633, "step": 5252 }, { "epoch": 2.5098859088465444, "grad_norm": 0.45995574276765083, "learning_rate": 1.4274982673282867e-06, "loss": 0.2872, "step": 5253 }, { "epoch": 2.5103637775521177, "grad_norm": 0.5483488802392914, "learning_rate": 1.4247876266183314e-06, "loss": 0.2601, "step": 5254 }, { "epoch": 2.5108416462576906, "grad_norm": 0.5288243133437358, "learning_rate": 1.422079364521024e-06, "loss": 0.2625, "step": 5255 }, { "epoch": 2.511319514963264, "grad_norm": 0.44122821307838805, "learning_rate": 1.419373481787587e-06, "loss": 0.2498, "step": 5256 }, { "epoch": 2.5117973836688368, "grad_norm": 0.4805211349693034, "learning_rate": 1.41666997916859e-06, "loss": 0.2691, "step": 5257 }, { "epoch": 2.51227525237441, "grad_norm": 0.4828655990618, "learning_rate": 1.413968857413932e-06, "loss": 0.2589, "step": 5258 }, { "epoch": 2.5127531210799834, "grad_norm": 0.45262455510116995, "learning_rate": 1.411270117272856e-06, "loss": 0.268, "step": 5259 }, { "epoch": 2.5132309897855563, "grad_norm": 0.4517397475555426, "learning_rate": 1.4085737594939497e-06, "loss": 0.2474, "step": 5260 }, { "epoch": 2.5137088584911296, "grad_norm": 0.4581435924810488, "learning_rate": 1.4058797848251315e-06, "loss": 0.2687, "step": 5261 }, { "epoch": 2.5141867271967024, "grad_norm": 0.48238520728771184, "learning_rate": 1.4031881940136615e-06, "loss": 0.275, "step": 5262 }, { "epoch": 2.5146645959022758, "grad_norm": 0.4780046524911139, "learning_rate": 1.4004989878061437e-06, "loss": 0.2711, "step": 5263 }, { "epoch": 2.515142464607849, "grad_norm": 0.45615713033317606, "learning_rate": 1.3978121669485135e-06, "loss": 0.279, "step": 5264 }, { "epoch": 2.5156203333134224, "grad_norm": 0.48685067829542655, "learning_rate": 1.3951277321860468e-06, "loss": 0.2517, "step": 5265 }, { "epoch": 2.5160982020189953, "grad_norm": 0.44894317063915157, "learning_rate": 1.3924456842633615e-06, "loss": 0.267, "step": 5266 }, { "epoch": 2.5165760707245686, "grad_norm": 0.8685504993927438, "learning_rate": 1.3897660239244093e-06, "loss": 0.266, "step": 5267 }, { "epoch": 2.5170539394301414, "grad_norm": 0.4609315916656919, "learning_rate": 1.3870887519124777e-06, "loss": 0.2784, "step": 5268 }, { "epoch": 2.5175318081357148, "grad_norm": 0.4691311012646939, "learning_rate": 1.384413868970199e-06, "loss": 0.2626, "step": 5269 }, { "epoch": 2.518009676841288, "grad_norm": 0.47785266314222014, "learning_rate": 1.381741375839537e-06, "loss": 0.2656, "step": 5270 }, { "epoch": 2.518487545546861, "grad_norm": 0.45350612752353825, "learning_rate": 1.3790712732617918e-06, "loss": 0.2717, "step": 5271 }, { "epoch": 2.5189654142524343, "grad_norm": 0.4465246048292593, "learning_rate": 1.3764035619776062e-06, "loss": 0.2549, "step": 5272 }, { "epoch": 2.519443282958007, "grad_norm": 0.4413523350209254, "learning_rate": 1.3737382427269551e-06, "loss": 0.2622, "step": 5273 }, { "epoch": 2.5199211516635804, "grad_norm": 0.4402416126165139, "learning_rate": 1.3710753162491498e-06, "loss": 0.2425, "step": 5274 }, { "epoch": 2.5203990203691538, "grad_norm": 0.4986151441542075, "learning_rate": 1.3684147832828409e-06, "loss": 0.2519, "step": 5275 }, { "epoch": 2.5208768890747266, "grad_norm": 0.5457742238724771, "learning_rate": 1.365756644566013e-06, "loss": 0.2611, "step": 5276 }, { "epoch": 2.5213547577803, "grad_norm": 0.4435897317652216, "learning_rate": 1.3631009008359874e-06, "loss": 0.2667, "step": 5277 }, { "epoch": 2.521832626485873, "grad_norm": 0.45681886545784145, "learning_rate": 1.360447552829417e-06, "loss": 0.2539, "step": 5278 }, { "epoch": 2.522310495191446, "grad_norm": 0.45114564080703556, "learning_rate": 1.3577966012822974e-06, "loss": 0.2526, "step": 5279 }, { "epoch": 2.5227883638970194, "grad_norm": 0.5748577158549483, "learning_rate": 1.355148046929956e-06, "loss": 0.2846, "step": 5280 }, { "epoch": 2.5232662326025923, "grad_norm": 0.5478505157579419, "learning_rate": 1.352501890507051e-06, "loss": 0.2546, "step": 5281 }, { "epoch": 2.5237441013081656, "grad_norm": 1.090704314845147, "learning_rate": 1.3498581327475847e-06, "loss": 0.2617, "step": 5282 }, { "epoch": 2.5242219700137385, "grad_norm": 0.44651134484658483, "learning_rate": 1.3472167743848863e-06, "loss": 0.2797, "step": 5283 }, { "epoch": 2.524699838719312, "grad_norm": 0.4332379133919312, "learning_rate": 1.344577816151621e-06, "loss": 0.2693, "step": 5284 }, { "epoch": 2.525177707424885, "grad_norm": 0.4454197457889718, "learning_rate": 1.3419412587797908e-06, "loss": 0.2913, "step": 5285 }, { "epoch": 2.525655576130458, "grad_norm": 0.5539819066751364, "learning_rate": 1.3393071030007298e-06, "loss": 0.2734, "step": 5286 }, { "epoch": 2.5261334448360313, "grad_norm": 0.4907491307225207, "learning_rate": 1.3366753495451046e-06, "loss": 0.2559, "step": 5287 }, { "epoch": 2.526611313541604, "grad_norm": 0.44486691210900325, "learning_rate": 1.3340459991429221e-06, "loss": 0.2717, "step": 5288 }, { "epoch": 2.5270891822471775, "grad_norm": 0.524341426871989, "learning_rate": 1.3314190525235148e-06, "loss": 0.2785, "step": 5289 }, { "epoch": 2.527567050952751, "grad_norm": 0.4507961520371896, "learning_rate": 1.3287945104155487e-06, "loss": 0.2692, "step": 5290 }, { "epoch": 2.528044919658324, "grad_norm": 0.4931269433170495, "learning_rate": 1.32617237354703e-06, "loss": 0.2577, "step": 5291 }, { "epoch": 2.528522788363897, "grad_norm": 0.44578474408366947, "learning_rate": 1.3235526426452916e-06, "loss": 0.2853, "step": 5292 }, { "epoch": 2.5290006570694703, "grad_norm": 0.4549319341008648, "learning_rate": 1.3209353184369978e-06, "loss": 0.2599, "step": 5293 }, { "epoch": 2.529478525775043, "grad_norm": 0.44026422411476585, "learning_rate": 1.318320401648152e-06, "loss": 0.274, "step": 5294 }, { "epoch": 2.5299563944806165, "grad_norm": 0.44736941554716725, "learning_rate": 1.3157078930040856e-06, "loss": 0.2668, "step": 5295 }, { "epoch": 2.53043426318619, "grad_norm": 0.4735831103712228, "learning_rate": 1.3130977932294597e-06, "loss": 0.2688, "step": 5296 }, { "epoch": 2.5309121318917627, "grad_norm": 0.4685212925748311, "learning_rate": 1.310490103048273e-06, "loss": 0.2482, "step": 5297 }, { "epoch": 2.531390000597336, "grad_norm": 0.4593947890008595, "learning_rate": 1.3078848231838514e-06, "loss": 0.2581, "step": 5298 }, { "epoch": 2.531867869302909, "grad_norm": 0.4311892224206543, "learning_rate": 1.3052819543588512e-06, "loss": 0.2465, "step": 5299 }, { "epoch": 2.532345738008482, "grad_norm": 0.46314465945990146, "learning_rate": 1.3026814972952674e-06, "loss": 0.2684, "step": 5300 }, { "epoch": 2.5328236067140555, "grad_norm": 0.46309469275162923, "learning_rate": 1.300083452714418e-06, "loss": 0.2615, "step": 5301 }, { "epoch": 2.5333014754196284, "grad_norm": 0.4437569382154475, "learning_rate": 1.2974878213369523e-06, "loss": 0.2639, "step": 5302 }, { "epoch": 2.5337793441252017, "grad_norm": 0.5664502089285215, "learning_rate": 1.294894603882858e-06, "loss": 0.2615, "step": 5303 }, { "epoch": 2.5342572128307745, "grad_norm": 0.4592275859545169, "learning_rate": 1.2923038010714451e-06, "loss": 0.2601, "step": 5304 }, { "epoch": 2.534735081536348, "grad_norm": 0.4309337745483858, "learning_rate": 1.2897154136213542e-06, "loss": 0.2823, "step": 5305 }, { "epoch": 2.535212950241921, "grad_norm": 0.48862862670470414, "learning_rate": 1.287129442250562e-06, "loss": 0.2745, "step": 5306 }, { "epoch": 2.535690818947494, "grad_norm": 0.83841385727078, "learning_rate": 1.2845458876763718e-06, "loss": 0.2782, "step": 5307 }, { "epoch": 2.5361686876530674, "grad_norm": 0.43581783536876467, "learning_rate": 1.281964750615412e-06, "loss": 0.2651, "step": 5308 }, { "epoch": 2.5366465563586402, "grad_norm": 0.4411137035318371, "learning_rate": 1.2793860317836482e-06, "loss": 0.2884, "step": 5309 }, { "epoch": 2.5371244250642135, "grad_norm": 0.46287211410540297, "learning_rate": 1.2768097318963701e-06, "loss": 0.2376, "step": 5310 }, { "epoch": 2.537602293769787, "grad_norm": 0.44652943411212714, "learning_rate": 1.2742358516681963e-06, "loss": 0.2763, "step": 5311 }, { "epoch": 2.53808016247536, "grad_norm": 0.45278372197354316, "learning_rate": 1.27166439181308e-06, "loss": 0.2682, "step": 5312 }, { "epoch": 2.538558031180933, "grad_norm": 0.44885245556664793, "learning_rate": 1.2690953530442963e-06, "loss": 0.2417, "step": 5313 }, { "epoch": 2.539035899886506, "grad_norm": 0.43904932734636665, "learning_rate": 1.2665287360744482e-06, "loss": 0.2647, "step": 5314 }, { "epoch": 2.5395137685920792, "grad_norm": 0.42657730106507596, "learning_rate": 1.2639645416154744e-06, "loss": 0.2546, "step": 5315 }, { "epoch": 2.5399916372976525, "grad_norm": 0.5240326634712088, "learning_rate": 1.2614027703786369e-06, "loss": 0.249, "step": 5316 }, { "epoch": 2.540469506003226, "grad_norm": 0.44736006043482424, "learning_rate": 1.2588434230745228e-06, "loss": 0.2483, "step": 5317 }, { "epoch": 2.5409473747087987, "grad_norm": 0.4431785975651459, "learning_rate": 1.2562865004130532e-06, "loss": 0.2623, "step": 5318 }, { "epoch": 2.541425243414372, "grad_norm": 0.4599390657644793, "learning_rate": 1.2537320031034717e-06, "loss": 0.2704, "step": 5319 }, { "epoch": 2.541903112119945, "grad_norm": 0.4588576873276534, "learning_rate": 1.2511799318543493e-06, "loss": 0.2862, "step": 5320 }, { "epoch": 2.542380980825518, "grad_norm": 0.4409900007452561, "learning_rate": 1.2486302873735878e-06, "loss": 0.2639, "step": 5321 }, { "epoch": 2.5428588495310915, "grad_norm": 0.4463947225335007, "learning_rate": 1.2460830703684147e-06, "loss": 0.2563, "step": 5322 }, { "epoch": 2.5433367182366644, "grad_norm": 0.44554797570869803, "learning_rate": 1.243538281545381e-06, "loss": 0.2618, "step": 5323 }, { "epoch": 2.5438145869422377, "grad_norm": 0.4366283293412151, "learning_rate": 1.2409959216103651e-06, "loss": 0.2727, "step": 5324 }, { "epoch": 2.5442924556478106, "grad_norm": 0.6447095546778173, "learning_rate": 1.2384559912685768e-06, "loss": 0.2596, "step": 5325 }, { "epoch": 2.544770324353384, "grad_norm": 0.46130620935874256, "learning_rate": 1.2359184912245448e-06, "loss": 0.2674, "step": 5326 }, { "epoch": 2.545248193058957, "grad_norm": 0.45602539579070933, "learning_rate": 1.2333834221821262e-06, "loss": 0.2573, "step": 5327 }, { "epoch": 2.54572606176453, "grad_norm": 0.4549646150650348, "learning_rate": 1.2308507848445072e-06, "loss": 0.2579, "step": 5328 }, { "epoch": 2.5462039304701034, "grad_norm": 0.4415752044414283, "learning_rate": 1.228320579914195e-06, "loss": 0.2794, "step": 5329 }, { "epoch": 2.5466817991756763, "grad_norm": 0.47703375879035975, "learning_rate": 1.2257928080930236e-06, "loss": 0.2729, "step": 5330 }, { "epoch": 2.5471596678812496, "grad_norm": 0.6782750426427807, "learning_rate": 1.2232674700821535e-06, "loss": 0.2681, "step": 5331 }, { "epoch": 2.547637536586823, "grad_norm": 0.44620672912194137, "learning_rate": 1.2207445665820695e-06, "loss": 0.2655, "step": 5332 }, { "epoch": 2.5481154052923958, "grad_norm": 0.49483270413584723, "learning_rate": 1.2182240982925764e-06, "loss": 0.255, "step": 5333 }, { "epoch": 2.548593273997969, "grad_norm": 0.4760394124784553, "learning_rate": 1.2157060659128128e-06, "loss": 0.261, "step": 5334 }, { "epoch": 2.549071142703542, "grad_norm": 0.43536619499616924, "learning_rate": 1.2131904701412345e-06, "loss": 0.2699, "step": 5335 }, { "epoch": 2.5495490114091153, "grad_norm": 0.4567672339365635, "learning_rate": 1.2106773116756198e-06, "loss": 0.2887, "step": 5336 }, { "epoch": 2.5500268801146886, "grad_norm": 0.44424439879811406, "learning_rate": 1.2081665912130813e-06, "loss": 0.2715, "step": 5337 }, { "epoch": 2.550504748820262, "grad_norm": 0.47455117509378236, "learning_rate": 1.2056583094500451e-06, "loss": 0.2692, "step": 5338 }, { "epoch": 2.5509826175258348, "grad_norm": 0.4577068157459303, "learning_rate": 1.2031524670822613e-06, "loss": 0.2527, "step": 5339 }, { "epoch": 2.551460486231408, "grad_norm": 0.4635419551668722, "learning_rate": 1.2006490648048118e-06, "loss": 0.2616, "step": 5340 }, { "epoch": 2.551938354936981, "grad_norm": 0.4872478703098568, "learning_rate": 1.1981481033120945e-06, "loss": 0.2779, "step": 5341 }, { "epoch": 2.5524162236425543, "grad_norm": 0.43900058178403417, "learning_rate": 1.1956495832978289e-06, "loss": 0.2635, "step": 5342 }, { "epoch": 2.5528940923481276, "grad_norm": 0.48199098433370163, "learning_rate": 1.1931535054550647e-06, "loss": 0.2598, "step": 5343 }, { "epoch": 2.5533719610537005, "grad_norm": 0.476582985861211, "learning_rate": 1.1906598704761685e-06, "loss": 0.2681, "step": 5344 }, { "epoch": 2.5538498297592738, "grad_norm": 0.43490218370942263, "learning_rate": 1.1881686790528279e-06, "loss": 0.2522, "step": 5345 }, { "epoch": 2.5543276984648466, "grad_norm": 0.435352449223695, "learning_rate": 1.1856799318760592e-06, "loss": 0.2582, "step": 5346 }, { "epoch": 2.55480556717042, "grad_norm": 0.48299189717460383, "learning_rate": 1.1831936296361957e-06, "loss": 0.2582, "step": 5347 }, { "epoch": 2.5552834358759933, "grad_norm": 0.45700996353152185, "learning_rate": 1.1807097730228912e-06, "loss": 0.2652, "step": 5348 }, { "epoch": 2.555761304581566, "grad_norm": 0.45128915246269297, "learning_rate": 1.178228362725129e-06, "loss": 0.2566, "step": 5349 }, { "epoch": 2.5562391732871395, "grad_norm": 0.49437093690101924, "learning_rate": 1.1757493994312052e-06, "loss": 0.2716, "step": 5350 }, { "epoch": 2.5567170419927123, "grad_norm": 0.5541061982972699, "learning_rate": 1.1732728838287388e-06, "loss": 0.2563, "step": 5351 }, { "epoch": 2.5571949106982856, "grad_norm": 0.46593343771916595, "learning_rate": 1.1707988166046757e-06, "loss": 0.2553, "step": 5352 }, { "epoch": 2.557672779403859, "grad_norm": 0.48119739643546555, "learning_rate": 1.168327198445276e-06, "loss": 0.2692, "step": 5353 }, { "epoch": 2.558150648109432, "grad_norm": 0.44694892042251333, "learning_rate": 1.1658580300361223e-06, "loss": 0.2623, "step": 5354 }, { "epoch": 2.558628516815005, "grad_norm": 0.7466740830810888, "learning_rate": 1.1633913120621188e-06, "loss": 0.275, "step": 5355 }, { "epoch": 2.559106385520578, "grad_norm": 0.4510071850128832, "learning_rate": 1.1609270452074917e-06, "loss": 0.2514, "step": 5356 }, { "epoch": 2.5595842542261513, "grad_norm": 0.4667471409690589, "learning_rate": 1.158465230155784e-06, "loss": 0.2806, "step": 5357 }, { "epoch": 2.5600621229317246, "grad_norm": 0.4836523364080265, "learning_rate": 1.1560058675898577e-06, "loss": 0.277, "step": 5358 }, { "epoch": 2.5605399916372975, "grad_norm": 0.4864707078033067, "learning_rate": 1.1535489581919012e-06, "loss": 0.2656, "step": 5359 }, { "epoch": 2.561017860342871, "grad_norm": 0.43787020641937124, "learning_rate": 1.151094502643414e-06, "loss": 0.2451, "step": 5360 }, { "epoch": 2.5614957290484437, "grad_norm": 0.456591257908703, "learning_rate": 1.148642501625218e-06, "loss": 0.2525, "step": 5361 }, { "epoch": 2.561973597754017, "grad_norm": 0.45414715242812, "learning_rate": 1.1461929558174589e-06, "loss": 0.2576, "step": 5362 }, { "epoch": 2.5624514664595903, "grad_norm": 0.4297472089254829, "learning_rate": 1.1437458658995947e-06, "loss": 0.2565, "step": 5363 }, { "epoch": 2.5629293351651636, "grad_norm": 0.6242800594293806, "learning_rate": 1.1413012325504048e-06, "loss": 0.252, "step": 5364 }, { "epoch": 2.5634072038707365, "grad_norm": 0.44685469395106875, "learning_rate": 1.1388590564479895e-06, "loss": 0.2716, "step": 5365 }, { "epoch": 2.56388507257631, "grad_norm": 0.43936844793345164, "learning_rate": 1.1364193382697642e-06, "loss": 0.2564, "step": 5366 }, { "epoch": 2.5643629412818827, "grad_norm": 0.4655861680277552, "learning_rate": 1.1339820786924616e-06, "loss": 0.2878, "step": 5367 }, { "epoch": 2.564840809987456, "grad_norm": 0.46292732997498603, "learning_rate": 1.1315472783921378e-06, "loss": 0.2688, "step": 5368 }, { "epoch": 2.5653186786930293, "grad_norm": 0.4598254943620399, "learning_rate": 1.1291149380441636e-06, "loss": 0.2673, "step": 5369 }, { "epoch": 2.565796547398602, "grad_norm": 0.454472063477016, "learning_rate": 1.1266850583232224e-06, "loss": 0.2478, "step": 5370 }, { "epoch": 2.5662744161041755, "grad_norm": 0.4325026111247085, "learning_rate": 1.1242576399033267e-06, "loss": 0.2562, "step": 5371 }, { "epoch": 2.5667522848097484, "grad_norm": 0.4453897324065896, "learning_rate": 1.1218326834577953e-06, "loss": 0.2518, "step": 5372 }, { "epoch": 2.5672301535153217, "grad_norm": 0.4650279923070856, "learning_rate": 1.119410189659268e-06, "loss": 0.2888, "step": 5373 }, { "epoch": 2.567708022220895, "grad_norm": 0.4475210944515468, "learning_rate": 1.116990159179705e-06, "loss": 0.2607, "step": 5374 }, { "epoch": 2.568185890926468, "grad_norm": 0.4867286499909107, "learning_rate": 1.1145725926903772e-06, "loss": 0.2759, "step": 5375 }, { "epoch": 2.568663759632041, "grad_norm": 0.5000116826228374, "learning_rate": 1.112157490861875e-06, "loss": 0.2527, "step": 5376 }, { "epoch": 2.569141628337614, "grad_norm": 0.4353063417332031, "learning_rate": 1.1097448543641077e-06, "loss": 0.257, "step": 5377 }, { "epoch": 2.5696194970431874, "grad_norm": 0.5646763109551065, "learning_rate": 1.107334683866297e-06, "loss": 0.2804, "step": 5378 }, { "epoch": 2.5700973657487607, "grad_norm": 0.4829620257030537, "learning_rate": 1.1049269800369787e-06, "loss": 0.2713, "step": 5379 }, { "epoch": 2.5705752344543336, "grad_norm": 0.44373746829302196, "learning_rate": 1.1025217435440116e-06, "loss": 0.2708, "step": 5380 }, { "epoch": 2.571053103159907, "grad_norm": 0.45314693225438013, "learning_rate": 1.1001189750545637e-06, "loss": 0.2661, "step": 5381 }, { "epoch": 2.5715309718654797, "grad_norm": 0.4396802093044451, "learning_rate": 1.0977186752351187e-06, "loss": 0.2518, "step": 5382 }, { "epoch": 2.572008840571053, "grad_norm": 0.43552638814311895, "learning_rate": 1.095320844751483e-06, "loss": 0.2547, "step": 5383 }, { "epoch": 2.5724867092766264, "grad_norm": 0.46000133848749897, "learning_rate": 1.0929254842687676e-06, "loss": 0.2746, "step": 5384 }, { "epoch": 2.5729645779821992, "grad_norm": 0.514481355649078, "learning_rate": 1.0905325944514034e-06, "loss": 0.2714, "step": 5385 }, { "epoch": 2.5734424466877726, "grad_norm": 0.44321694655322663, "learning_rate": 1.0881421759631394e-06, "loss": 0.2642, "step": 5386 }, { "epoch": 2.5739203153933454, "grad_norm": 0.4519160941249428, "learning_rate": 1.085754229467032e-06, "loss": 0.2406, "step": 5387 }, { "epoch": 2.5743981840989187, "grad_norm": 0.45129680242728143, "learning_rate": 1.0833687556254558e-06, "loss": 0.2617, "step": 5388 }, { "epoch": 2.574876052804492, "grad_norm": 0.47689305663408194, "learning_rate": 1.0809857551001013e-06, "loss": 0.2532, "step": 5389 }, { "epoch": 2.5753539215100654, "grad_norm": 0.46337528524382954, "learning_rate": 1.078605228551971e-06, "loss": 0.2718, "step": 5390 }, { "epoch": 2.5758317902156382, "grad_norm": 0.4470813207728647, "learning_rate": 1.0762271766413768e-06, "loss": 0.243, "step": 5391 }, { "epoch": 2.5763096589212116, "grad_norm": 0.44289569253985883, "learning_rate": 1.0738516000279542e-06, "loss": 0.2492, "step": 5392 }, { "epoch": 2.5767875276267844, "grad_norm": 0.4728394742707393, "learning_rate": 1.0714784993706418e-06, "loss": 0.2685, "step": 5393 }, { "epoch": 2.5772653963323577, "grad_norm": 0.4549229472146639, "learning_rate": 1.0691078753276962e-06, "loss": 0.2512, "step": 5394 }, { "epoch": 2.577743265037931, "grad_norm": 0.44764612742438686, "learning_rate": 1.0667397285566893e-06, "loss": 0.2549, "step": 5395 }, { "epoch": 2.578221133743504, "grad_norm": 0.4822527109733988, "learning_rate": 1.0643740597145025e-06, "loss": 0.2631, "step": 5396 }, { "epoch": 2.5786990024490772, "grad_norm": 0.4572397580448997, "learning_rate": 1.0620108694573272e-06, "loss": 0.2842, "step": 5397 }, { "epoch": 2.57917687115465, "grad_norm": 0.42473552272997284, "learning_rate": 1.0596501584406749e-06, "loss": 0.2633, "step": 5398 }, { "epoch": 2.5796547398602234, "grad_norm": 0.4314598541362525, "learning_rate": 1.0572919273193639e-06, "loss": 0.2598, "step": 5399 }, { "epoch": 2.5801326085657967, "grad_norm": 0.43609204699648757, "learning_rate": 1.0549361767475241e-06, "loss": 0.2537, "step": 5400 }, { "epoch": 2.5806104772713696, "grad_norm": 0.448934958642449, "learning_rate": 1.052582907378602e-06, "loss": 0.2655, "step": 5401 }, { "epoch": 2.581088345976943, "grad_norm": 0.44431074903974976, "learning_rate": 1.050232119865352e-06, "loss": 0.266, "step": 5402 }, { "epoch": 2.581566214682516, "grad_norm": 0.4752805202026781, "learning_rate": 1.047883814859838e-06, "loss": 0.2891, "step": 5403 }, { "epoch": 2.582044083388089, "grad_norm": 0.4475172447028092, "learning_rate": 1.0455379930134435e-06, "loss": 0.2694, "step": 5404 }, { "epoch": 2.5825219520936624, "grad_norm": 0.5428975699380926, "learning_rate": 1.0431946549768567e-06, "loss": 0.2692, "step": 5405 }, { "epoch": 2.5829998207992353, "grad_norm": 0.46361910025632774, "learning_rate": 1.0408538014000747e-06, "loss": 0.2635, "step": 5406 }, { "epoch": 2.5834776895048086, "grad_norm": 0.4420791110581533, "learning_rate": 1.0385154329324132e-06, "loss": 0.2624, "step": 5407 }, { "epoch": 2.5839555582103815, "grad_norm": 0.4595527525010734, "learning_rate": 1.0361795502224925e-06, "loss": 0.2608, "step": 5408 }, { "epoch": 2.584433426915955, "grad_norm": 0.45101202999040235, "learning_rate": 1.0338461539182443e-06, "loss": 0.2618, "step": 5409 }, { "epoch": 2.584911295621528, "grad_norm": 0.4599073300017213, "learning_rate": 1.0315152446669142e-06, "loss": 0.2703, "step": 5410 }, { "epoch": 2.585389164327101, "grad_norm": 0.5079598888681588, "learning_rate": 1.0291868231150537e-06, "loss": 0.2681, "step": 5411 }, { "epoch": 2.5858670330326743, "grad_norm": 0.48499751233901656, "learning_rate": 1.0268608899085241e-06, "loss": 0.2548, "step": 5412 }, { "epoch": 2.586344901738247, "grad_norm": 0.4422616538627771, "learning_rate": 1.0245374456925029e-06, "loss": 0.2696, "step": 5413 }, { "epoch": 2.5868227704438205, "grad_norm": 0.4443272586161862, "learning_rate": 1.0222164911114697e-06, "loss": 0.2582, "step": 5414 }, { "epoch": 2.587300639149394, "grad_norm": 0.44296846825131514, "learning_rate": 1.019898026809214e-06, "loss": 0.2803, "step": 5415 }, { "epoch": 2.587778507854967, "grad_norm": 0.4517679987221167, "learning_rate": 1.0175820534288416e-06, "loss": 0.2806, "step": 5416 }, { "epoch": 2.58825637656054, "grad_norm": 0.48611487954550253, "learning_rate": 1.0152685716127598e-06, "loss": 0.2602, "step": 5417 }, { "epoch": 2.5887342452661133, "grad_norm": 0.4628864103901881, "learning_rate": 1.0129575820026872e-06, "loss": 0.272, "step": 5418 }, { "epoch": 2.589212113971686, "grad_norm": 0.4649352347539501, "learning_rate": 1.0106490852396544e-06, "loss": 0.2539, "step": 5419 }, { "epoch": 2.5896899826772595, "grad_norm": 0.43939173718242563, "learning_rate": 1.0083430819639962e-06, "loss": 0.2475, "step": 5420 }, { "epoch": 2.590167851382833, "grad_norm": 0.44410831432228415, "learning_rate": 1.0060395728153539e-06, "loss": 0.2722, "step": 5421 }, { "epoch": 2.5906457200884057, "grad_norm": 0.5288794827685146, "learning_rate": 1.0037385584326843e-06, "loss": 0.2566, "step": 5422 }, { "epoch": 2.591123588793979, "grad_norm": 0.43733087257688036, "learning_rate": 1.0014400394542489e-06, "loss": 0.2572, "step": 5423 }, { "epoch": 2.591601457499552, "grad_norm": 0.4783741515835818, "learning_rate": 9.991440165176147e-07, "loss": 0.251, "step": 5424 }, { "epoch": 2.592079326205125, "grad_norm": 0.45766909636052455, "learning_rate": 9.968504902596566e-07, "loss": 0.2809, "step": 5425 }, { "epoch": 2.5925571949106985, "grad_norm": 0.47258848724591235, "learning_rate": 9.94559461316561e-07, "loss": 0.262, "step": 5426 }, { "epoch": 2.5930350636162713, "grad_norm": 0.4790702365770836, "learning_rate": 9.922709303238175e-07, "loss": 0.262, "step": 5427 }, { "epoch": 2.5935129323218447, "grad_norm": 0.676497020285866, "learning_rate": 9.899848979162218e-07, "loss": 0.2711, "step": 5428 }, { "epoch": 2.5939908010274175, "grad_norm": 0.44747328409752335, "learning_rate": 9.877013647278844e-07, "loss": 0.2647, "step": 5429 }, { "epoch": 2.594468669732991, "grad_norm": 0.4606135261704647, "learning_rate": 9.85420331392214e-07, "loss": 0.2532, "step": 5430 }, { "epoch": 2.594946538438564, "grad_norm": 0.43604837240539995, "learning_rate": 9.831417985419278e-07, "loss": 0.2645, "step": 5431 }, { "epoch": 2.595424407144137, "grad_norm": 0.4373132967875298, "learning_rate": 9.808657668090527e-07, "loss": 0.2579, "step": 5432 }, { "epoch": 2.5959022758497103, "grad_norm": 0.4573400005148769, "learning_rate": 9.785922368249201e-07, "loss": 0.2649, "step": 5433 }, { "epoch": 2.596380144555283, "grad_norm": 0.4286328321592479, "learning_rate": 9.763212092201634e-07, "loss": 0.2771, "step": 5434 }, { "epoch": 2.5968580132608565, "grad_norm": 0.46487186359736915, "learning_rate": 9.74052684624731e-07, "loss": 0.2605, "step": 5435 }, { "epoch": 2.59733588196643, "grad_norm": 0.45134027488832124, "learning_rate": 9.717866636678685e-07, "loss": 0.2696, "step": 5436 }, { "epoch": 2.5978137506720027, "grad_norm": 0.4315621650790117, "learning_rate": 9.695231469781285e-07, "loss": 0.2595, "step": 5437 }, { "epoch": 2.598291619377576, "grad_norm": 0.43086059451199493, "learning_rate": 9.672621351833754e-07, "loss": 0.2563, "step": 5438 }, { "epoch": 2.598769488083149, "grad_norm": 0.4510589878533883, "learning_rate": 9.6500362891077e-07, "loss": 0.2753, "step": 5439 }, { "epoch": 2.599247356788722, "grad_norm": 0.47484782844891105, "learning_rate": 9.62747628786782e-07, "loss": 0.2804, "step": 5440 }, { "epoch": 2.5997252254942955, "grad_norm": 0.44734361221136104, "learning_rate": 9.604941354371899e-07, "loss": 0.2565, "step": 5441 }, { "epoch": 2.600203094199869, "grad_norm": 0.4523591857593359, "learning_rate": 9.582431494870693e-07, "loss": 0.2537, "step": 5442 }, { "epoch": 2.6006809629054417, "grad_norm": 0.45376389340573675, "learning_rate": 9.559946715608037e-07, "loss": 0.2473, "step": 5443 }, { "epoch": 2.601158831611015, "grad_norm": 0.5018534323943927, "learning_rate": 9.537487022820846e-07, "loss": 0.2692, "step": 5444 }, { "epoch": 2.601636700316588, "grad_norm": 0.571660106743449, "learning_rate": 9.515052422739035e-07, "loss": 0.2762, "step": 5445 }, { "epoch": 2.602114569022161, "grad_norm": 0.4478057549739928, "learning_rate": 9.492642921585526e-07, "loss": 0.2676, "step": 5446 }, { "epoch": 2.6025924377277345, "grad_norm": 0.4695545143748728, "learning_rate": 9.47025852557636e-07, "loss": 0.2771, "step": 5447 }, { "epoch": 2.6030703064333074, "grad_norm": 0.4452493287419494, "learning_rate": 9.447899240920566e-07, "loss": 0.2578, "step": 5448 }, { "epoch": 2.6035481751388807, "grad_norm": 0.440730727424376, "learning_rate": 9.425565073820198e-07, "loss": 0.3075, "step": 5449 }, { "epoch": 2.6040260438444536, "grad_norm": 0.44510497848773967, "learning_rate": 9.403256030470386e-07, "loss": 0.2695, "step": 5450 }, { "epoch": 2.604503912550027, "grad_norm": 0.4519704788457889, "learning_rate": 9.380972117059262e-07, "loss": 0.2568, "step": 5451 }, { "epoch": 2.6049817812556, "grad_norm": 0.5598137706232549, "learning_rate": 9.358713339767955e-07, "loss": 0.252, "step": 5452 }, { "epoch": 2.605459649961173, "grad_norm": 0.46188600754569076, "learning_rate": 9.336479704770696e-07, "loss": 0.2665, "step": 5453 }, { "epoch": 2.6059375186667464, "grad_norm": 0.49788442318547643, "learning_rate": 9.314271218234693e-07, "loss": 0.2529, "step": 5454 }, { "epoch": 2.6064153873723193, "grad_norm": 0.4345623662569078, "learning_rate": 9.292087886320166e-07, "loss": 0.2615, "step": 5455 }, { "epoch": 2.6068932560778926, "grad_norm": 0.44139821572916704, "learning_rate": 9.269929715180404e-07, "loss": 0.2822, "step": 5456 }, { "epoch": 2.607371124783466, "grad_norm": 0.4252933161869911, "learning_rate": 9.247796710961699e-07, "loss": 0.2483, "step": 5457 }, { "epoch": 2.6078489934890388, "grad_norm": 0.4412914528127249, "learning_rate": 9.225688879803351e-07, "loss": 0.2647, "step": 5458 }, { "epoch": 2.608326862194612, "grad_norm": 0.4357571583911059, "learning_rate": 9.203606227837658e-07, "loss": 0.2616, "step": 5459 }, { "epoch": 2.608804730900185, "grad_norm": 0.4994871920703874, "learning_rate": 9.181548761189996e-07, "loss": 0.2688, "step": 5460 }, { "epoch": 2.6092825996057583, "grad_norm": 0.4351987789797297, "learning_rate": 9.159516485978692e-07, "loss": 0.2691, "step": 5461 }, { "epoch": 2.6097604683113316, "grad_norm": 0.4430845065598318, "learning_rate": 9.137509408315104e-07, "loss": 0.2593, "step": 5462 }, { "epoch": 2.6102383370169044, "grad_norm": 0.47947175182055163, "learning_rate": 9.115527534303637e-07, "loss": 0.2645, "step": 5463 }, { "epoch": 2.6107162057224778, "grad_norm": 0.4428209512787218, "learning_rate": 9.093570870041645e-07, "loss": 0.2655, "step": 5464 }, { "epoch": 2.6111940744280506, "grad_norm": 0.46538192551551194, "learning_rate": 9.071639421619527e-07, "loss": 0.2834, "step": 5465 }, { "epoch": 2.611671943133624, "grad_norm": 0.42759421702996847, "learning_rate": 9.049733195120703e-07, "loss": 0.2607, "step": 5466 }, { "epoch": 2.6121498118391973, "grad_norm": 0.4208547298808128, "learning_rate": 9.027852196621545e-07, "loss": 0.2422, "step": 5467 }, { "epoch": 2.6126276805447706, "grad_norm": 0.4382513214899705, "learning_rate": 9.005996432191455e-07, "loss": 0.2649, "step": 5468 }, { "epoch": 2.6131055492503434, "grad_norm": 0.4914340398062526, "learning_rate": 8.984165907892872e-07, "loss": 0.2635, "step": 5469 }, { "epoch": 2.6135834179559168, "grad_norm": 0.42931017707652125, "learning_rate": 8.962360629781164e-07, "loss": 0.2506, "step": 5470 }, { "epoch": 2.6140612866614896, "grad_norm": 0.46339088752838625, "learning_rate": 8.940580603904736e-07, "loss": 0.2733, "step": 5471 }, { "epoch": 2.614539155367063, "grad_norm": 0.48172702720835325, "learning_rate": 8.918825836304989e-07, "loss": 0.2797, "step": 5472 }, { "epoch": 2.6150170240726363, "grad_norm": 0.4507180023251733, "learning_rate": 8.89709633301632e-07, "loss": 0.2698, "step": 5473 }, { "epoch": 2.615494892778209, "grad_norm": 0.4542073635472215, "learning_rate": 8.875392100066082e-07, "loss": 0.2796, "step": 5474 }, { "epoch": 2.6159727614837824, "grad_norm": 0.4512619657743737, "learning_rate": 8.853713143474685e-07, "loss": 0.2457, "step": 5475 }, { "epoch": 2.6164506301893553, "grad_norm": 0.43470496799876945, "learning_rate": 8.832059469255461e-07, "loss": 0.2638, "step": 5476 }, { "epoch": 2.6169284988949286, "grad_norm": 0.4696707255099846, "learning_rate": 8.810431083414761e-07, "loss": 0.2794, "step": 5477 }, { "epoch": 2.617406367600502, "grad_norm": 0.4654756321536549, "learning_rate": 8.788827991951932e-07, "loss": 0.2772, "step": 5478 }, { "epoch": 2.617884236306075, "grad_norm": 1.1496364863049888, "learning_rate": 8.767250200859278e-07, "loss": 0.2443, "step": 5479 }, { "epoch": 2.618362105011648, "grad_norm": 0.5038462450910971, "learning_rate": 8.745697716122081e-07, "loss": 0.2584, "step": 5480 }, { "epoch": 2.618839973717221, "grad_norm": 0.4531843698900754, "learning_rate": 8.724170543718657e-07, "loss": 0.2617, "step": 5481 }, { "epoch": 2.6193178424227943, "grad_norm": 0.44068312743815813, "learning_rate": 8.702668689620252e-07, "loss": 0.2664, "step": 5482 }, { "epoch": 2.6197957111283676, "grad_norm": 0.4527311915552218, "learning_rate": 8.681192159791074e-07, "loss": 0.2439, "step": 5483 }, { "epoch": 2.6202735798339405, "grad_norm": 0.46007711834116877, "learning_rate": 8.659740960188379e-07, "loss": 0.2632, "step": 5484 }, { "epoch": 2.620751448539514, "grad_norm": 0.4654714237772124, "learning_rate": 8.638315096762318e-07, "loss": 0.264, "step": 5485 }, { "epoch": 2.6212293172450867, "grad_norm": 0.4468898821749278, "learning_rate": 8.616914575456048e-07, "loss": 0.259, "step": 5486 }, { "epoch": 2.62170718595066, "grad_norm": 0.45382995132457704, "learning_rate": 8.595539402205711e-07, "loss": 0.26, "step": 5487 }, { "epoch": 2.6221850546562333, "grad_norm": 0.4466920475534839, "learning_rate": 8.574189582940407e-07, "loss": 0.2509, "step": 5488 }, { "epoch": 2.6226629233618066, "grad_norm": 0.4340782648580306, "learning_rate": 8.552865123582143e-07, "loss": 0.2648, "step": 5489 }, { "epoch": 2.6231407920673795, "grad_norm": 0.44864783353774906, "learning_rate": 8.531566030046035e-07, "loss": 0.2591, "step": 5490 }, { "epoch": 2.6236186607729524, "grad_norm": 0.48371815986744804, "learning_rate": 8.510292308240043e-07, "loss": 0.2819, "step": 5491 }, { "epoch": 2.6240965294785257, "grad_norm": 0.4732488135933287, "learning_rate": 8.489043964065091e-07, "loss": 0.2735, "step": 5492 }, { "epoch": 2.624574398184099, "grad_norm": 0.5366303704759061, "learning_rate": 8.467821003415133e-07, "loss": 0.2659, "step": 5493 }, { "epoch": 2.6250522668896723, "grad_norm": 0.5036292178699638, "learning_rate": 8.446623432177025e-07, "loss": 0.2566, "step": 5494 }, { "epoch": 2.625530135595245, "grad_norm": 0.4953780048542685, "learning_rate": 8.425451256230588e-07, "loss": 0.2638, "step": 5495 }, { "epoch": 2.6260080043008185, "grad_norm": 0.4490484978515238, "learning_rate": 8.404304481448644e-07, "loss": 0.2801, "step": 5496 }, { "epoch": 2.6264858730063914, "grad_norm": 0.48605177710536757, "learning_rate": 8.383183113696914e-07, "loss": 0.2751, "step": 5497 }, { "epoch": 2.6269637417119647, "grad_norm": 0.42633172550873427, "learning_rate": 8.362087158834087e-07, "loss": 0.266, "step": 5498 }, { "epoch": 2.627441610417538, "grad_norm": 0.4472570484490212, "learning_rate": 8.341016622711829e-07, "loss": 0.264, "step": 5499 }, { "epoch": 2.627919479123111, "grad_norm": 0.45968606482345986, "learning_rate": 8.319971511174718e-07, "loss": 0.2736, "step": 5500 }, { "epoch": 2.628397347828684, "grad_norm": 0.5381860245104173, "learning_rate": 8.298951830060286e-07, "loss": 0.2638, "step": 5501 }, { "epoch": 2.628875216534257, "grad_norm": 0.4439356147429359, "learning_rate": 8.277957585199059e-07, "loss": 0.2721, "step": 5502 }, { "epoch": 2.6293530852398304, "grad_norm": 0.46056697797714147, "learning_rate": 8.256988782414454e-07, "loss": 0.2774, "step": 5503 }, { "epoch": 2.6298309539454037, "grad_norm": 0.4429290965835113, "learning_rate": 8.23604542752281e-07, "loss": 0.2608, "step": 5504 }, { "epoch": 2.6303088226509765, "grad_norm": 0.46251564537739054, "learning_rate": 8.215127526333499e-07, "loss": 0.2772, "step": 5505 }, { "epoch": 2.63078669135655, "grad_norm": 0.4896830023615981, "learning_rate": 8.19423508464876e-07, "loss": 0.2595, "step": 5506 }, { "epoch": 2.6312645600621227, "grad_norm": 0.44334268809073074, "learning_rate": 8.173368108263768e-07, "loss": 0.2604, "step": 5507 }, { "epoch": 2.631742428767696, "grad_norm": 0.47811781456110813, "learning_rate": 8.15252660296667e-07, "loss": 0.2692, "step": 5508 }, { "epoch": 2.6322202974732694, "grad_norm": 0.4363044164663179, "learning_rate": 8.131710574538543e-07, "loss": 0.2454, "step": 5509 }, { "epoch": 2.6326981661788422, "grad_norm": 0.449330353460876, "learning_rate": 8.110920028753355e-07, "loss": 0.277, "step": 5510 }, { "epoch": 2.6331760348844155, "grad_norm": 0.4961190817267826, "learning_rate": 8.090154971378073e-07, "loss": 0.27, "step": 5511 }, { "epoch": 2.6336539035899884, "grad_norm": 0.45171261984580413, "learning_rate": 8.069415408172543e-07, "loss": 0.2712, "step": 5512 }, { "epoch": 2.6341317722955617, "grad_norm": 0.4434767897838476, "learning_rate": 8.048701344889531e-07, "loss": 0.2508, "step": 5513 }, { "epoch": 2.634609641001135, "grad_norm": 0.4623705120513708, "learning_rate": 8.028012787274786e-07, "loss": 0.2844, "step": 5514 }, { "epoch": 2.6350875097067084, "grad_norm": 0.4574657819597167, "learning_rate": 8.007349741066939e-07, "loss": 0.2568, "step": 5515 }, { "epoch": 2.6355653784122812, "grad_norm": 0.4450702051786837, "learning_rate": 7.986712211997538e-07, "loss": 0.2759, "step": 5516 }, { "epoch": 2.636043247117854, "grad_norm": 0.6165437153757278, "learning_rate": 7.966100205791094e-07, "loss": 0.2445, "step": 5517 }, { "epoch": 2.6365211158234274, "grad_norm": 0.45144330242103814, "learning_rate": 7.945513728164999e-07, "loss": 0.2495, "step": 5518 }, { "epoch": 2.6369989845290007, "grad_norm": 0.4564427161692554, "learning_rate": 7.924952784829576e-07, "loss": 0.2617, "step": 5519 }, { "epoch": 2.637476853234574, "grad_norm": 0.48463913280050425, "learning_rate": 7.904417381488083e-07, "loss": 0.2655, "step": 5520 }, { "epoch": 2.637954721940147, "grad_norm": 0.4516813676731921, "learning_rate": 7.883907523836676e-07, "loss": 0.2695, "step": 5521 }, { "epoch": 2.63843259064572, "grad_norm": 0.4604887951307146, "learning_rate": 7.863423217564403e-07, "loss": 0.2529, "step": 5522 }, { "epoch": 2.638910459351293, "grad_norm": 0.46157386421059765, "learning_rate": 7.842964468353265e-07, "loss": 0.2618, "step": 5523 }, { "epoch": 2.6393883280568664, "grad_norm": 0.4539399407409506, "learning_rate": 7.822531281878188e-07, "loss": 0.2526, "step": 5524 }, { "epoch": 2.6398661967624397, "grad_norm": 0.44721471315508016, "learning_rate": 7.802123663806938e-07, "loss": 0.2355, "step": 5525 }, { "epoch": 2.6403440654680126, "grad_norm": 0.45922430751841153, "learning_rate": 7.781741619800231e-07, "loss": 0.261, "step": 5526 }, { "epoch": 2.640821934173586, "grad_norm": 0.46053983816343896, "learning_rate": 7.761385155511714e-07, "loss": 0.2486, "step": 5527 }, { "epoch": 2.6412998028791588, "grad_norm": 0.45385759627956784, "learning_rate": 7.741054276587889e-07, "loss": 0.2623, "step": 5528 }, { "epoch": 2.641777671584732, "grad_norm": 0.44483157426363334, "learning_rate": 7.72074898866817e-07, "loss": 0.2903, "step": 5529 }, { "epoch": 2.6422555402903054, "grad_norm": 0.46467024535160295, "learning_rate": 7.700469297384927e-07, "loss": 0.27, "step": 5530 }, { "epoch": 2.6427334089958783, "grad_norm": 0.46336583237822215, "learning_rate": 7.680215208363362e-07, "loss": 0.2571, "step": 5531 }, { "epoch": 2.6432112777014516, "grad_norm": 0.4589731194382133, "learning_rate": 7.659986727221591e-07, "loss": 0.2748, "step": 5532 }, { "epoch": 2.6436891464070245, "grad_norm": 0.4479455221268227, "learning_rate": 7.63978385957066e-07, "loss": 0.2579, "step": 5533 }, { "epoch": 2.6441670151125978, "grad_norm": 0.45680582982603707, "learning_rate": 7.619606611014485e-07, "loss": 0.2563, "step": 5534 }, { "epoch": 2.644644883818171, "grad_norm": 0.4341476061003986, "learning_rate": 7.599454987149868e-07, "loss": 0.2633, "step": 5535 }, { "epoch": 2.645122752523744, "grad_norm": 0.5906143297103831, "learning_rate": 7.579328993566526e-07, "loss": 0.287, "step": 5536 }, { "epoch": 2.6456006212293173, "grad_norm": 0.5449600193189947, "learning_rate": 7.559228635847049e-07, "loss": 0.267, "step": 5537 }, { "epoch": 2.64607848993489, "grad_norm": 0.46047021853305603, "learning_rate": 7.5391539195669e-07, "loss": 0.2745, "step": 5538 }, { "epoch": 2.6465563586404635, "grad_norm": 0.4884388346418383, "learning_rate": 7.5191048502945e-07, "loss": 0.2482, "step": 5539 }, { "epoch": 2.6470342273460368, "grad_norm": 0.4774890364230652, "learning_rate": 7.499081433591071e-07, "loss": 0.2819, "step": 5540 }, { "epoch": 2.64751209605161, "grad_norm": 0.43773957361680527, "learning_rate": 7.479083675010746e-07, "loss": 0.2711, "step": 5541 }, { "epoch": 2.647989964757183, "grad_norm": 0.4400542304427017, "learning_rate": 7.459111580100587e-07, "loss": 0.2584, "step": 5542 }, { "epoch": 2.6484678334627563, "grad_norm": 0.47571447666450534, "learning_rate": 7.439165154400485e-07, "loss": 0.2619, "step": 5543 }, { "epoch": 2.648945702168329, "grad_norm": 0.45317629524346903, "learning_rate": 7.419244403443215e-07, "loss": 0.2426, "step": 5544 }, { "epoch": 2.6494235708739025, "grad_norm": 0.43903772149485587, "learning_rate": 7.399349332754458e-07, "loss": 0.2731, "step": 5545 }, { "epoch": 2.6499014395794758, "grad_norm": 0.45753347345809436, "learning_rate": 7.379479947852752e-07, "loss": 0.2752, "step": 5546 }, { "epoch": 2.6503793082850486, "grad_norm": 0.4465731511139684, "learning_rate": 7.359636254249491e-07, "loss": 0.2749, "step": 5547 }, { "epoch": 2.650857176990622, "grad_norm": 0.45206339925897226, "learning_rate": 7.339818257448994e-07, "loss": 0.2641, "step": 5548 }, { "epoch": 2.651335045696195, "grad_norm": 0.44301665799534984, "learning_rate": 7.320025962948429e-07, "loss": 0.266, "step": 5549 }, { "epoch": 2.651812914401768, "grad_norm": 0.4510933254303754, "learning_rate": 7.300259376237795e-07, "loss": 0.2658, "step": 5550 }, { "epoch": 2.6522907831073415, "grad_norm": 0.45072706019066416, "learning_rate": 7.280518502800027e-07, "loss": 0.276, "step": 5551 }, { "epoch": 2.6527686518129143, "grad_norm": 0.5069678986700116, "learning_rate": 7.260803348110879e-07, "loss": 0.2749, "step": 5552 }, { "epoch": 2.6532465205184876, "grad_norm": 0.4204064472652244, "learning_rate": 7.241113917638987e-07, "loss": 0.2482, "step": 5553 }, { "epoch": 2.6537243892240605, "grad_norm": 0.4606764828883414, "learning_rate": 7.22145021684586e-07, "loss": 0.2686, "step": 5554 }, { "epoch": 2.654202257929634, "grad_norm": 0.43007497445227916, "learning_rate": 7.201812251185869e-07, "loss": 0.2331, "step": 5555 }, { "epoch": 2.654680126635207, "grad_norm": 0.43763153678237027, "learning_rate": 7.182200026106201e-07, "loss": 0.2569, "step": 5556 }, { "epoch": 2.65515799534078, "grad_norm": 0.4720679073397097, "learning_rate": 7.16261354704697e-07, "loss": 0.2645, "step": 5557 }, { "epoch": 2.6556358640463533, "grad_norm": 0.4342444064007902, "learning_rate": 7.143052819441143e-07, "loss": 0.2586, "step": 5558 }, { "epoch": 2.656113732751926, "grad_norm": 0.4761061871295211, "learning_rate": 7.123517848714479e-07, "loss": 0.2519, "step": 5559 }, { "epoch": 2.6565916014574995, "grad_norm": 0.4487477696170997, "learning_rate": 7.104008640285642e-07, "loss": 0.2691, "step": 5560 }, { "epoch": 2.657069470163073, "grad_norm": 0.43960627882338654, "learning_rate": 7.084525199566172e-07, "loss": 0.2735, "step": 5561 }, { "epoch": 2.6575473388686457, "grad_norm": 0.43396104580229183, "learning_rate": 7.065067531960412e-07, "loss": 0.2416, "step": 5562 }, { "epoch": 2.658025207574219, "grad_norm": 0.4262202664604116, "learning_rate": 7.045635642865555e-07, "loss": 0.246, "step": 5563 }, { "epoch": 2.658503076279792, "grad_norm": 0.4305745545871823, "learning_rate": 7.026229537671692e-07, "loss": 0.2754, "step": 5564 }, { "epoch": 2.658980944985365, "grad_norm": 0.595294358705173, "learning_rate": 7.006849221761736e-07, "loss": 0.2681, "step": 5565 }, { "epoch": 2.6594588136909385, "grad_norm": 0.4579698632907277, "learning_rate": 6.987494700511411e-07, "loss": 0.2691, "step": 5566 }, { "epoch": 2.659936682396512, "grad_norm": 0.42289834526883935, "learning_rate": 6.968165979289365e-07, "loss": 0.2346, "step": 5567 }, { "epoch": 2.6604145511020847, "grad_norm": 0.43997486144720366, "learning_rate": 6.948863063457023e-07, "loss": 0.2627, "step": 5568 }, { "epoch": 2.660892419807658, "grad_norm": 0.5245364644243271, "learning_rate": 6.929585958368656e-07, "loss": 0.2744, "step": 5569 }, { "epoch": 2.661370288513231, "grad_norm": 0.44871184731230485, "learning_rate": 6.910334669371433e-07, "loss": 0.2597, "step": 5570 }, { "epoch": 2.661848157218804, "grad_norm": 0.4530485747359608, "learning_rate": 6.891109201805291e-07, "loss": 0.289, "step": 5571 }, { "epoch": 2.6623260259243775, "grad_norm": 0.4459777138982337, "learning_rate": 6.871909561003032e-07, "loss": 0.2773, "step": 5572 }, { "epoch": 2.6628038946299504, "grad_norm": 0.44624182034655313, "learning_rate": 6.852735752290318e-07, "loss": 0.2535, "step": 5573 }, { "epoch": 2.6632817633355237, "grad_norm": 0.4408183385200215, "learning_rate": 6.833587780985618e-07, "loss": 0.2609, "step": 5574 }, { "epoch": 2.6637596320410966, "grad_norm": 0.48483246105806804, "learning_rate": 6.814465652400237e-07, "loss": 0.2797, "step": 5575 }, { "epoch": 2.66423750074667, "grad_norm": 0.4412804280001876, "learning_rate": 6.795369371838323e-07, "loss": 0.2561, "step": 5576 }, { "epoch": 2.664715369452243, "grad_norm": 0.4401284623732288, "learning_rate": 6.776298944596849e-07, "loss": 0.2667, "step": 5577 }, { "epoch": 2.665193238157816, "grad_norm": 0.49218861155591376, "learning_rate": 6.757254375965583e-07, "loss": 0.2639, "step": 5578 }, { "epoch": 2.6656711068633894, "grad_norm": 0.4452641669010618, "learning_rate": 6.738235671227212e-07, "loss": 0.2782, "step": 5579 }, { "epoch": 2.6661489755689622, "grad_norm": 0.4385273221330306, "learning_rate": 6.719242835657147e-07, "loss": 0.2631, "step": 5580 }, { "epoch": 2.6666268442745356, "grad_norm": 0.42111306891766304, "learning_rate": 6.700275874523665e-07, "loss": 0.262, "step": 5581 }, { "epoch": 2.667104712980109, "grad_norm": 0.4348075279027199, "learning_rate": 6.681334793087879e-07, "loss": 0.2421, "step": 5582 }, { "epoch": 2.6675825816856817, "grad_norm": 0.5049628037720003, "learning_rate": 6.66241959660372e-07, "loss": 0.2847, "step": 5583 }, { "epoch": 2.668060450391255, "grad_norm": 0.4683050522187598, "learning_rate": 6.643530290317901e-07, "loss": 0.2568, "step": 5584 }, { "epoch": 2.668538319096828, "grad_norm": 0.4496767613831931, "learning_rate": 6.62466687947001e-07, "loss": 0.255, "step": 5585 }, { "epoch": 2.6690161878024012, "grad_norm": 0.4383351210567021, "learning_rate": 6.605829369292427e-07, "loss": 0.2624, "step": 5586 }, { "epoch": 2.6694940565079746, "grad_norm": 0.4633981201266098, "learning_rate": 6.587017765010306e-07, "loss": 0.2696, "step": 5587 }, { "epoch": 2.6699719252135474, "grad_norm": 0.43636703634880375, "learning_rate": 6.568232071841695e-07, "loss": 0.2474, "step": 5588 }, { "epoch": 2.6704497939191207, "grad_norm": 0.4639743358839676, "learning_rate": 6.549472294997405e-07, "loss": 0.2789, "step": 5589 }, { "epoch": 2.6709276626246936, "grad_norm": 0.48119723931800046, "learning_rate": 6.530738439681017e-07, "loss": 0.2675, "step": 5590 }, { "epoch": 2.671405531330267, "grad_norm": 0.4541784315517988, "learning_rate": 6.512030511089063e-07, "loss": 0.2663, "step": 5591 }, { "epoch": 2.6718834000358402, "grad_norm": 0.4386851803015807, "learning_rate": 6.493348514410735e-07, "loss": 0.2664, "step": 5592 }, { "epoch": 2.6723612687414136, "grad_norm": 0.443049021233755, "learning_rate": 6.474692454828091e-07, "loss": 0.2634, "step": 5593 }, { "epoch": 2.6728391374469864, "grad_norm": 0.5368940349519489, "learning_rate": 6.456062337516023e-07, "loss": 0.2691, "step": 5594 }, { "epoch": 2.6733170061525597, "grad_norm": 0.4362459871804461, "learning_rate": 6.437458167642164e-07, "loss": 0.2768, "step": 5595 }, { "epoch": 2.6737948748581326, "grad_norm": 0.43405302870615214, "learning_rate": 6.418879950366986e-07, "loss": 0.2628, "step": 5596 }, { "epoch": 2.674272743563706, "grad_norm": 0.4537944478141257, "learning_rate": 6.400327690843777e-07, "loss": 0.2537, "step": 5597 }, { "epoch": 2.6747506122692792, "grad_norm": 0.44225140880189395, "learning_rate": 6.3818013942186e-07, "loss": 0.2623, "step": 5598 }, { "epoch": 2.675228480974852, "grad_norm": 0.48940459598044767, "learning_rate": 6.363301065630301e-07, "loss": 0.262, "step": 5599 }, { "epoch": 2.6757063496804254, "grad_norm": 0.4290257298591847, "learning_rate": 6.344826710210584e-07, "loss": 0.2574, "step": 5600 }, { "epoch": 2.6761842183859983, "grad_norm": 0.44260393461554365, "learning_rate": 6.326378333083883e-07, "loss": 0.2613, "step": 5601 }, { "epoch": 2.6766620870915716, "grad_norm": 0.47868079245627504, "learning_rate": 6.307955939367449e-07, "loss": 0.2636, "step": 5602 }, { "epoch": 2.677139955797145, "grad_norm": 0.43232776082930185, "learning_rate": 6.289559534171353e-07, "loss": 0.2682, "step": 5603 }, { "epoch": 2.677617824502718, "grad_norm": 0.44295068309351476, "learning_rate": 6.27118912259842e-07, "loss": 0.2532, "step": 5604 }, { "epoch": 2.678095693208291, "grad_norm": 0.43112065814472633, "learning_rate": 6.252844709744255e-07, "loss": 0.2847, "step": 5605 }, { "epoch": 2.678573561913864, "grad_norm": 0.44589686576732196, "learning_rate": 6.234526300697308e-07, "loss": 0.2586, "step": 5606 }, { "epoch": 2.6790514306194373, "grad_norm": 0.46936007169833993, "learning_rate": 6.216233900538782e-07, "loss": 0.262, "step": 5607 }, { "epoch": 2.6795292993250106, "grad_norm": 0.45133407836159795, "learning_rate": 6.197967514342628e-07, "loss": 0.2667, "step": 5608 }, { "epoch": 2.6800071680305835, "grad_norm": 0.4988501134193684, "learning_rate": 6.179727147175663e-07, "loss": 0.2666, "step": 5609 }, { "epoch": 2.680485036736157, "grad_norm": 0.43159011005236786, "learning_rate": 6.161512804097436e-07, "loss": 0.2696, "step": 5610 }, { "epoch": 2.6809629054417297, "grad_norm": 0.4392692032042625, "learning_rate": 6.143324490160252e-07, "loss": 0.2564, "step": 5611 }, { "epoch": 2.681440774147303, "grad_norm": 0.5262468308872646, "learning_rate": 6.125162210409263e-07, "loss": 0.2881, "step": 5612 }, { "epoch": 2.6819186428528763, "grad_norm": 0.4429786878821737, "learning_rate": 6.107025969882363e-07, "loss": 0.2745, "step": 5613 }, { "epoch": 2.682396511558449, "grad_norm": 0.4290197449049834, "learning_rate": 6.088915773610194e-07, "loss": 0.2735, "step": 5614 }, { "epoch": 2.6828743802640225, "grad_norm": 0.44057175272421367, "learning_rate": 6.070831626616236e-07, "loss": 0.2694, "step": 5615 }, { "epoch": 2.6833522489695953, "grad_norm": 0.46685871466805356, "learning_rate": 6.052773533916712e-07, "loss": 0.2663, "step": 5616 }, { "epoch": 2.6838301176751687, "grad_norm": 0.44112906840605176, "learning_rate": 6.034741500520591e-07, "loss": 0.2731, "step": 5617 }, { "epoch": 2.684307986380742, "grad_norm": 0.4480978993764805, "learning_rate": 6.016735531429674e-07, "loss": 0.2638, "step": 5618 }, { "epoch": 2.6847858550863153, "grad_norm": 0.4757023777345707, "learning_rate": 5.998755631638486e-07, "loss": 0.2625, "step": 5619 }, { "epoch": 2.685263723791888, "grad_norm": 0.43772130701917733, "learning_rate": 5.980801806134318e-07, "loss": 0.2628, "step": 5620 }, { "epoch": 2.6857415924974615, "grad_norm": 0.4408707212239334, "learning_rate": 5.962874059897273e-07, "loss": 0.259, "step": 5621 }, { "epoch": 2.6862194612030343, "grad_norm": 0.44875603266564296, "learning_rate": 5.944972397900173e-07, "loss": 0.252, "step": 5622 }, { "epoch": 2.6866973299086077, "grad_norm": 0.7243707606230397, "learning_rate": 5.927096825108614e-07, "loss": 0.2644, "step": 5623 }, { "epoch": 2.687175198614181, "grad_norm": 0.4437660770119595, "learning_rate": 5.909247346480973e-07, "loss": 0.2584, "step": 5624 }, { "epoch": 2.687653067319754, "grad_norm": 0.4428332177909088, "learning_rate": 5.891423966968413e-07, "loss": 0.2785, "step": 5625 }, { "epoch": 2.688130936025327, "grad_norm": 0.5234803476776241, "learning_rate": 5.873626691514789e-07, "loss": 0.249, "step": 5626 }, { "epoch": 2.6886088047309, "grad_norm": 0.47236649678503034, "learning_rate": 5.855855525056742e-07, "loss": 0.2761, "step": 5627 }, { "epoch": 2.6890866734364733, "grad_norm": 0.45396864953408195, "learning_rate": 5.838110472523728e-07, "loss": 0.2694, "step": 5628 }, { "epoch": 2.6895645421420467, "grad_norm": 0.44956507553613845, "learning_rate": 5.820391538837866e-07, "loss": 0.2608, "step": 5629 }, { "epoch": 2.6900424108476195, "grad_norm": 0.4401864408494058, "learning_rate": 5.80269872891408e-07, "loss": 0.2655, "step": 5630 }, { "epoch": 2.690520279553193, "grad_norm": 0.4886914370387613, "learning_rate": 5.785032047660077e-07, "loss": 0.2549, "step": 5631 }, { "epoch": 2.6909981482587657, "grad_norm": 0.44296980817168874, "learning_rate": 5.76739149997625e-07, "loss": 0.247, "step": 5632 }, { "epoch": 2.691476016964339, "grad_norm": 0.44550057207332994, "learning_rate": 5.749777090755781e-07, "loss": 0.2618, "step": 5633 }, { "epoch": 2.6919538856699123, "grad_norm": 0.43268666197260663, "learning_rate": 5.73218882488461e-07, "loss": 0.2688, "step": 5634 }, { "epoch": 2.692431754375485, "grad_norm": 0.4461412386675045, "learning_rate": 5.714626707241411e-07, "loss": 0.2718, "step": 5635 }, { "epoch": 2.6929096230810585, "grad_norm": 0.4558906789502105, "learning_rate": 5.697090742697576e-07, "loss": 0.2792, "step": 5636 }, { "epoch": 2.6933874917866314, "grad_norm": 0.4744314781330036, "learning_rate": 5.679580936117312e-07, "loss": 0.2591, "step": 5637 }, { "epoch": 2.6938653604922047, "grad_norm": 0.5192870350309746, "learning_rate": 5.662097292357505e-07, "loss": 0.2734, "step": 5638 }, { "epoch": 2.694343229197778, "grad_norm": 0.4435569578566844, "learning_rate": 5.644639816267817e-07, "loss": 0.2666, "step": 5639 }, { "epoch": 2.694821097903351, "grad_norm": 0.43856696864544303, "learning_rate": 5.627208512690641e-07, "loss": 0.2705, "step": 5640 }, { "epoch": 2.695298966608924, "grad_norm": 0.45553085802947646, "learning_rate": 5.609803386461133e-07, "loss": 0.2656, "step": 5641 }, { "epoch": 2.695776835314497, "grad_norm": 0.4666048945211158, "learning_rate": 5.59242444240713e-07, "loss": 0.2738, "step": 5642 }, { "epoch": 2.6962547040200704, "grad_norm": 0.43113345766045935, "learning_rate": 5.575071685349276e-07, "loss": 0.2572, "step": 5643 }, { "epoch": 2.6967325727256437, "grad_norm": 1.125132833480618, "learning_rate": 5.55774512010091e-07, "loss": 0.2558, "step": 5644 }, { "epoch": 2.697210441431217, "grad_norm": 0.608140367809618, "learning_rate": 5.5404447514681e-07, "loss": 0.2737, "step": 5645 }, { "epoch": 2.69768831013679, "grad_norm": 0.4533877869576038, "learning_rate": 5.523170584249704e-07, "loss": 0.2498, "step": 5646 }, { "epoch": 2.698166178842363, "grad_norm": 0.4415778575924575, "learning_rate": 5.505922623237237e-07, "loss": 0.2639, "step": 5647 }, { "epoch": 2.698644047547936, "grad_norm": 0.42774906759972836, "learning_rate": 5.488700873214969e-07, "loss": 0.2692, "step": 5648 }, { "epoch": 2.6991219162535094, "grad_norm": 0.5330417311187563, "learning_rate": 5.471505338959948e-07, "loss": 0.2707, "step": 5649 }, { "epoch": 2.6995997849590827, "grad_norm": 0.5461645932501131, "learning_rate": 5.45433602524188e-07, "loss": 0.2588, "step": 5650 }, { "epoch": 2.7000776536646556, "grad_norm": 0.4242076160737116, "learning_rate": 5.437192936823243e-07, "loss": 0.2651, "step": 5651 }, { "epoch": 2.700555522370229, "grad_norm": 0.4676634194490461, "learning_rate": 5.420076078459236e-07, "loss": 0.27, "step": 5652 }, { "epoch": 2.7010333910758018, "grad_norm": 0.43263645783838944, "learning_rate": 5.402985454897758e-07, "loss": 0.2589, "step": 5653 }, { "epoch": 2.701511259781375, "grad_norm": 0.45382249392362367, "learning_rate": 5.385921070879441e-07, "loss": 0.2487, "step": 5654 }, { "epoch": 2.7019891284869484, "grad_norm": 0.49441942637541253, "learning_rate": 5.368882931137675e-07, "loss": 0.2565, "step": 5655 }, { "epoch": 2.7024669971925213, "grad_norm": 0.4405834049292135, "learning_rate": 5.351871040398515e-07, "loss": 0.2744, "step": 5656 }, { "epoch": 2.7029448658980946, "grad_norm": 0.5559524037795076, "learning_rate": 5.33488540338074e-07, "loss": 0.2583, "step": 5657 }, { "epoch": 2.7034227346036674, "grad_norm": 0.43260600511156877, "learning_rate": 5.317926024795906e-07, "loss": 0.2572, "step": 5658 }, { "epoch": 2.7039006033092408, "grad_norm": 0.4394492413785682, "learning_rate": 5.300992909348234e-07, "loss": 0.2545, "step": 5659 }, { "epoch": 2.704378472014814, "grad_norm": 0.4362896925210051, "learning_rate": 5.284086061734672e-07, "loss": 0.2733, "step": 5660 }, { "epoch": 2.704856340720387, "grad_norm": 0.5093789250306926, "learning_rate": 5.267205486644866e-07, "loss": 0.2701, "step": 5661 }, { "epoch": 2.7053342094259603, "grad_norm": 0.48183431324608156, "learning_rate": 5.250351188761204e-07, "loss": 0.2541, "step": 5662 }, { "epoch": 2.705812078131533, "grad_norm": 0.4409302418627837, "learning_rate": 5.23352317275877e-07, "loss": 0.2602, "step": 5663 }, { "epoch": 2.7062899468371064, "grad_norm": 0.46481073306379156, "learning_rate": 5.21672144330535e-07, "loss": 0.2668, "step": 5664 }, { "epoch": 2.7067678155426798, "grad_norm": 0.573145605718102, "learning_rate": 5.199946005061462e-07, "loss": 0.248, "step": 5665 }, { "epoch": 2.7072456842482526, "grad_norm": 0.4458113575270228, "learning_rate": 5.183196862680307e-07, "loss": 0.2495, "step": 5666 }, { "epoch": 2.707723552953826, "grad_norm": 0.566881416643556, "learning_rate": 5.166474020807788e-07, "loss": 0.2699, "step": 5667 }, { "epoch": 2.708201421659399, "grad_norm": 0.49148904309220365, "learning_rate": 5.149777484082552e-07, "loss": 0.2686, "step": 5668 }, { "epoch": 2.708679290364972, "grad_norm": 0.4396165691920815, "learning_rate": 5.133107257135917e-07, "loss": 0.2695, "step": 5669 }, { "epoch": 2.7091571590705454, "grad_norm": 0.4733614560975121, "learning_rate": 5.116463344591893e-07, "loss": 0.2687, "step": 5670 }, { "epoch": 2.7096350277761188, "grad_norm": 0.4756203545300059, "learning_rate": 5.099845751067234e-07, "loss": 0.2652, "step": 5671 }, { "epoch": 2.7101128964816916, "grad_norm": 0.4877805042234815, "learning_rate": 5.083254481171352e-07, "loss": 0.2786, "step": 5672 }, { "epoch": 2.710590765187265, "grad_norm": 0.43232222654764546, "learning_rate": 5.066689539506353e-07, "loss": 0.2713, "step": 5673 }, { "epoch": 2.711068633892838, "grad_norm": 0.44816531622174133, "learning_rate": 5.050150930667108e-07, "loss": 0.2755, "step": 5674 }, { "epoch": 2.711546502598411, "grad_norm": 0.4627071189494935, "learning_rate": 5.033638659241102e-07, "loss": 0.2574, "step": 5675 }, { "epoch": 2.7120243713039844, "grad_norm": 0.433698330766837, "learning_rate": 5.017152729808539e-07, "loss": 0.2659, "step": 5676 }, { "epoch": 2.7125022400095573, "grad_norm": 0.4537219642809981, "learning_rate": 5.000693146942359e-07, "loss": 0.2628, "step": 5677 }, { "epoch": 2.7129801087151306, "grad_norm": 0.7128807971801155, "learning_rate": 4.984259915208134e-07, "loss": 0.2826, "step": 5678 }, { "epoch": 2.7134579774207035, "grad_norm": 0.5197321996631703, "learning_rate": 4.96785303916415e-07, "loss": 0.2838, "step": 5679 }, { "epoch": 2.713935846126277, "grad_norm": 0.43144572298262895, "learning_rate": 4.951472523361401e-07, "loss": 0.2493, "step": 5680 }, { "epoch": 2.71441371483185, "grad_norm": 0.47874783163343665, "learning_rate": 4.935118372343561e-07, "loss": 0.2728, "step": 5681 }, { "epoch": 2.714891583537423, "grad_norm": 0.4506171864407369, "learning_rate": 4.918790590646938e-07, "loss": 0.24, "step": 5682 }, { "epoch": 2.7153694522429963, "grad_norm": 0.4365104968626452, "learning_rate": 4.90248918280063e-07, "loss": 0.2807, "step": 5683 }, { "epoch": 2.715847320948569, "grad_norm": 0.43550955708683503, "learning_rate": 4.88621415332633e-07, "loss": 0.2549, "step": 5684 }, { "epoch": 2.7163251896541425, "grad_norm": 0.43640296493089487, "learning_rate": 4.869965506738416e-07, "loss": 0.2524, "step": 5685 }, { "epoch": 2.716803058359716, "grad_norm": 0.466624117863505, "learning_rate": 4.85374324754404e-07, "loss": 0.2825, "step": 5686 }, { "epoch": 2.7172809270652887, "grad_norm": 0.4280281243864085, "learning_rate": 4.837547380242924e-07, "loss": 0.2718, "step": 5687 }, { "epoch": 2.717758795770862, "grad_norm": 0.48966203018348153, "learning_rate": 4.821377909327518e-07, "loss": 0.2496, "step": 5688 }, { "epoch": 2.718236664476435, "grad_norm": 0.43749253431245166, "learning_rate": 4.805234839282968e-07, "loss": 0.2738, "step": 5689 }, { "epoch": 2.718714533182008, "grad_norm": 0.457351266972444, "learning_rate": 4.789118174587071e-07, "loss": 0.2867, "step": 5690 }, { "epoch": 2.7191924018875815, "grad_norm": 0.4324107322349869, "learning_rate": 4.773027919710272e-07, "loss": 0.2698, "step": 5691 }, { "epoch": 2.719670270593155, "grad_norm": 0.4372967375001939, "learning_rate": 4.756964079115778e-07, "loss": 0.2454, "step": 5692 }, { "epoch": 2.7201481392987277, "grad_norm": 0.44502655752254144, "learning_rate": 4.740926657259393e-07, "loss": 0.27, "step": 5693 }, { "epoch": 2.7206260080043005, "grad_norm": 0.4773080512151135, "learning_rate": 4.7249156585895904e-07, "loss": 0.273, "step": 5694 }, { "epoch": 2.721103876709874, "grad_norm": 0.43589564759727806, "learning_rate": 4.7089310875475856e-07, "loss": 0.2702, "step": 5695 }, { "epoch": 2.721581745415447, "grad_norm": 0.435891297678135, "learning_rate": 4.692972948567187e-07, "loss": 0.2721, "step": 5696 }, { "epoch": 2.7220596141210205, "grad_norm": 0.4556795601807038, "learning_rate": 4.677041246074887e-07, "loss": 0.268, "step": 5697 }, { "epoch": 2.7225374828265934, "grad_norm": 0.43816241757476965, "learning_rate": 4.661135984489895e-07, "loss": 0.2822, "step": 5698 }, { "epoch": 2.7230153515321667, "grad_norm": 0.45203308436452894, "learning_rate": 4.645257168224038e-07, "loss": 0.2646, "step": 5699 }, { "epoch": 2.7234932202377395, "grad_norm": 0.43051792277467377, "learning_rate": 4.6294048016817917e-07, "loss": 0.2576, "step": 5700 }, { "epoch": 2.723971088943313, "grad_norm": 0.4279847606367255, "learning_rate": 4.6135788892603615e-07, "loss": 0.2505, "step": 5701 }, { "epoch": 2.724448957648886, "grad_norm": 0.43586951533967105, "learning_rate": 4.5977794353495584e-07, "loss": 0.2667, "step": 5702 }, { "epoch": 2.724926826354459, "grad_norm": 0.4436479201943215, "learning_rate": 4.582006444331866e-07, "loss": 0.2584, "step": 5703 }, { "epoch": 2.7254046950600324, "grad_norm": 0.46315434030840746, "learning_rate": 4.56625992058245e-07, "loss": 0.276, "step": 5704 }, { "epoch": 2.7258825637656052, "grad_norm": 0.46920092210161374, "learning_rate": 4.550539868469106e-07, "loss": 0.2829, "step": 5705 }, { "epoch": 2.7263604324711785, "grad_norm": 0.49465749907516765, "learning_rate": 4.5348462923523017e-07, "loss": 0.2528, "step": 5706 }, { "epoch": 2.726838301176752, "grad_norm": 0.5070139510002986, "learning_rate": 4.519179196585166e-07, "loss": 0.2578, "step": 5707 }, { "epoch": 2.7273161698823247, "grad_norm": 0.4548552543610518, "learning_rate": 4.5035385855134674e-07, "loss": 0.2647, "step": 5708 }, { "epoch": 2.727794038587898, "grad_norm": 0.46825179549102325, "learning_rate": 4.4879244634756125e-07, "loss": 0.2892, "step": 5709 }, { "epoch": 2.728271907293471, "grad_norm": 0.42982413711336054, "learning_rate": 4.4723368348027375e-07, "loss": 0.2394, "step": 5710 }, { "epoch": 2.7287497759990442, "grad_norm": 0.5254320930661824, "learning_rate": 4.4567757038185387e-07, "loss": 0.2501, "step": 5711 }, { "epoch": 2.7292276447046175, "grad_norm": 0.43831906460766556, "learning_rate": 4.4412410748393973e-07, "loss": 0.2721, "step": 5712 }, { "epoch": 2.7297055134101904, "grad_norm": 0.44601508670964723, "learning_rate": 4.4257329521743554e-07, "loss": 0.2687, "step": 5713 }, { "epoch": 2.7301833821157637, "grad_norm": 0.4244997363062748, "learning_rate": 4.4102513401251047e-07, "loss": 0.2593, "step": 5714 }, { "epoch": 2.7306612508213366, "grad_norm": 0.43283927918633053, "learning_rate": 4.394796242985933e-07, "loss": 0.2397, "step": 5715 }, { "epoch": 2.73113911952691, "grad_norm": 0.4532562420987665, "learning_rate": 4.3793676650438545e-07, "loss": 0.2622, "step": 5716 }, { "epoch": 2.7316169882324832, "grad_norm": 0.4699173683543495, "learning_rate": 4.363965610578469e-07, "loss": 0.2618, "step": 5717 }, { "epoch": 2.7320948569380565, "grad_norm": 0.44230687360379534, "learning_rate": 4.348590083862025e-07, "loss": 0.2777, "step": 5718 }, { "epoch": 2.7325727256436294, "grad_norm": 0.42842808714224573, "learning_rate": 4.3332410891594346e-07, "loss": 0.2648, "step": 5719 }, { "epoch": 2.7330505943492027, "grad_norm": 0.4448453155274851, "learning_rate": 4.317918630728235e-07, "loss": 0.2507, "step": 5720 }, { "epoch": 2.7335284630547756, "grad_norm": 0.45643300317990476, "learning_rate": 4.302622712818594e-07, "loss": 0.2588, "step": 5721 }, { "epoch": 2.734006331760349, "grad_norm": 0.4357425686343285, "learning_rate": 4.28735333967335e-07, "loss": 0.2761, "step": 5722 }, { "epoch": 2.734484200465922, "grad_norm": 0.4304752980285673, "learning_rate": 4.2721105155279496e-07, "loss": 0.2722, "step": 5723 }, { "epoch": 2.734962069171495, "grad_norm": 0.43442802885881027, "learning_rate": 4.2568942446104657e-07, "loss": 0.2603, "step": 5724 }, { "epoch": 2.7354399378770684, "grad_norm": 0.47160630821995014, "learning_rate": 4.241704531141633e-07, "loss": 0.2688, "step": 5725 }, { "epoch": 2.7359178065826413, "grad_norm": 0.4501959887057499, "learning_rate": 4.2265413793348363e-07, "loss": 0.2668, "step": 5726 }, { "epoch": 2.7363956752882146, "grad_norm": 0.4175734384990899, "learning_rate": 4.2114047933960453e-07, "loss": 0.2606, "step": 5727 }, { "epoch": 2.736873543993788, "grad_norm": 0.45694768736227065, "learning_rate": 4.196294777523868e-07, "loss": 0.2674, "step": 5728 }, { "epoch": 2.7373514126993608, "grad_norm": 0.4673946349847667, "learning_rate": 4.181211335909585e-07, "loss": 0.279, "step": 5729 }, { "epoch": 2.737829281404934, "grad_norm": 0.44503813090400046, "learning_rate": 4.166154472737061e-07, "loss": 0.2631, "step": 5730 }, { "epoch": 2.738307150110507, "grad_norm": 0.4340503304353276, "learning_rate": 4.151124192182798e-07, "loss": 0.2603, "step": 5731 }, { "epoch": 2.7387850188160803, "grad_norm": 0.48847952518429966, "learning_rate": 4.136120498415952e-07, "loss": 0.2624, "step": 5732 }, { "epoch": 2.7392628875216536, "grad_norm": 0.5059660565626302, "learning_rate": 4.1211433955982707e-07, "loss": 0.2595, "step": 5733 }, { "epoch": 2.7397407562272265, "grad_norm": 0.46797353640218137, "learning_rate": 4.1061928878841193e-07, "loss": 0.237, "step": 5734 }, { "epoch": 2.7402186249327998, "grad_norm": 0.45219798246778975, "learning_rate": 4.091268979420537e-07, "loss": 0.2592, "step": 5735 }, { "epoch": 2.7406964936383726, "grad_norm": 0.4447422926180344, "learning_rate": 4.0763716743471346e-07, "loss": 0.2664, "step": 5736 }, { "epoch": 2.741174362343946, "grad_norm": 0.666905018898308, "learning_rate": 4.061500976796162e-07, "loss": 0.2797, "step": 5737 }, { "epoch": 2.7416522310495193, "grad_norm": 0.4393683214525845, "learning_rate": 4.0466568908925087e-07, "loss": 0.2648, "step": 5738 }, { "epoch": 2.742130099755092, "grad_norm": 0.4571492436490682, "learning_rate": 4.031839420753636e-07, "loss": 0.2714, "step": 5739 }, { "epoch": 2.7426079684606655, "grad_norm": 0.4227610872002852, "learning_rate": 4.0170485704896453e-07, "loss": 0.2658, "step": 5740 }, { "epoch": 2.7430858371662383, "grad_norm": 0.4436196880313655, "learning_rate": 4.002284344203289e-07, "loss": 0.272, "step": 5741 }, { "epoch": 2.7435637058718116, "grad_norm": 0.5126894803406438, "learning_rate": 3.987546745989879e-07, "loss": 0.2627, "step": 5742 }, { "epoch": 2.744041574577385, "grad_norm": 0.5064687140304711, "learning_rate": 3.9728357799373675e-07, "loss": 0.2745, "step": 5743 }, { "epoch": 2.7445194432829583, "grad_norm": 0.5135425690883777, "learning_rate": 3.958151450126324e-07, "loss": 0.2371, "step": 5744 }, { "epoch": 2.744997311988531, "grad_norm": 0.45220414835144906, "learning_rate": 3.943493760629924e-07, "loss": 0.2614, "step": 5745 }, { "epoch": 2.7454751806941045, "grad_norm": 0.43240096665660527, "learning_rate": 3.928862715513937e-07, "loss": 0.2611, "step": 5746 }, { "epoch": 2.7459530493996773, "grad_norm": 0.43526686209905135, "learning_rate": 3.914258318836772e-07, "loss": 0.2631, "step": 5747 }, { "epoch": 2.7464309181052506, "grad_norm": 0.47983339284389653, "learning_rate": 3.8996805746494336e-07, "loss": 0.2501, "step": 5748 }, { "epoch": 2.746908786810824, "grad_norm": 0.48766854570737883, "learning_rate": 3.885129486995498e-07, "loss": 0.2514, "step": 5749 }, { "epoch": 2.747386655516397, "grad_norm": 0.4342714380611848, "learning_rate": 3.8706050599112363e-07, "loss": 0.2641, "step": 5750 }, { "epoch": 2.74786452422197, "grad_norm": 0.4493794754550567, "learning_rate": 3.8561072974254267e-07, "loss": 0.2584, "step": 5751 }, { "epoch": 2.748342392927543, "grad_norm": 0.5181439015969199, "learning_rate": 3.8416362035594847e-07, "loss": 0.2698, "step": 5752 }, { "epoch": 2.7488202616331163, "grad_norm": 0.6154335078519316, "learning_rate": 3.827191782327477e-07, "loss": 0.2791, "step": 5753 }, { "epoch": 2.7492981303386896, "grad_norm": 0.43729212520698946, "learning_rate": 3.812774037736011e-07, "loss": 0.2656, "step": 5754 }, { "epoch": 2.7497759990442625, "grad_norm": 0.45813669988799843, "learning_rate": 3.798382973784298e-07, "loss": 0.2571, "step": 5755 }, { "epoch": 2.750253867749836, "grad_norm": 0.4464533961762353, "learning_rate": 3.7840185944641894e-07, "loss": 0.2612, "step": 5756 }, { "epoch": 2.7507317364554087, "grad_norm": 0.4320135449118599, "learning_rate": 3.769680903760109e-07, "loss": 0.2753, "step": 5757 }, { "epoch": 2.751209605160982, "grad_norm": 0.4585589926522614, "learning_rate": 3.7553699056490536e-07, "loss": 0.2537, "step": 5758 }, { "epoch": 2.7516874738665553, "grad_norm": 1.3126054515046628, "learning_rate": 3.7410856041006694e-07, "loss": 0.2696, "step": 5759 }, { "epoch": 2.752165342572128, "grad_norm": 0.48621630808612154, "learning_rate": 3.7268280030771655e-07, "loss": 0.2472, "step": 5760 }, { "epoch": 2.7526432112777015, "grad_norm": 0.4490574992798825, "learning_rate": 3.712597106533344e-07, "loss": 0.2572, "step": 5761 }, { "epoch": 2.7531210799832744, "grad_norm": 0.4203205745967875, "learning_rate": 3.698392918416593e-07, "loss": 0.2539, "step": 5762 }, { "epoch": 2.7535989486888477, "grad_norm": 0.4656550654291942, "learning_rate": 3.684215442666927e-07, "loss": 0.2693, "step": 5763 }, { "epoch": 2.754076817394421, "grad_norm": 0.45928373313115284, "learning_rate": 3.670064683216912e-07, "loss": 0.2512, "step": 5764 }, { "epoch": 2.754554686099994, "grad_norm": 0.45661432116739664, "learning_rate": 3.655940643991718e-07, "loss": 0.2477, "step": 5765 }, { "epoch": 2.755032554805567, "grad_norm": 0.4323656387442569, "learning_rate": 3.641843328909123e-07, "loss": 0.2485, "step": 5766 }, { "epoch": 2.75551042351114, "grad_norm": 0.4525029456321567, "learning_rate": 3.6277727418794537e-07, "loss": 0.2465, "step": 5767 }, { "epoch": 2.7559882922167134, "grad_norm": 0.45847229829882746, "learning_rate": 3.613728886805634e-07, "loss": 0.2678, "step": 5768 }, { "epoch": 2.7564661609222867, "grad_norm": 0.4354223231831446, "learning_rate": 3.599711767583214e-07, "loss": 0.2592, "step": 5769 }, { "epoch": 2.75694402962786, "grad_norm": 0.45784981558251925, "learning_rate": 3.585721388100283e-07, "loss": 0.2722, "step": 5770 }, { "epoch": 2.757421898333433, "grad_norm": 0.7302615178327121, "learning_rate": 3.5717577522375037e-07, "loss": 0.2564, "step": 5771 }, { "epoch": 2.757899767039006, "grad_norm": 0.43796173111069236, "learning_rate": 3.557820863868167e-07, "loss": 0.2622, "step": 5772 }, { "epoch": 2.758377635744579, "grad_norm": 0.462073755028673, "learning_rate": 3.543910726858113e-07, "loss": 0.2728, "step": 5773 }, { "epoch": 2.7588555044501524, "grad_norm": 0.5233980811044266, "learning_rate": 3.5300273450657564e-07, "loss": 0.2708, "step": 5774 }, { "epoch": 2.7593333731557257, "grad_norm": 0.5993407016424378, "learning_rate": 3.516170722342127e-07, "loss": 0.2667, "step": 5775 }, { "epoch": 2.7598112418612986, "grad_norm": 0.425632476030798, "learning_rate": 3.5023408625307844e-07, "loss": 0.2765, "step": 5776 }, { "epoch": 2.760289110566872, "grad_norm": 0.4264176424109537, "learning_rate": 3.488537769467892e-07, "loss": 0.2528, "step": 5777 }, { "epoch": 2.7607669792724447, "grad_norm": 0.4322253085066871, "learning_rate": 3.4747614469822e-07, "loss": 0.271, "step": 5778 }, { "epoch": 2.761244847978018, "grad_norm": 0.44062241649420975, "learning_rate": 3.461011898895017e-07, "loss": 0.2597, "step": 5779 }, { "epoch": 2.7617227166835914, "grad_norm": 0.4288740826814237, "learning_rate": 3.4472891290201927e-07, "loss": 0.2659, "step": 5780 }, { "epoch": 2.7622005853891642, "grad_norm": 0.441917086276448, "learning_rate": 3.4335931411642153e-07, "loss": 0.2716, "step": 5781 }, { "epoch": 2.7626784540947376, "grad_norm": 0.7227881237165457, "learning_rate": 3.419923939126102e-07, "loss": 0.2642, "step": 5782 }, { "epoch": 2.7631563228003104, "grad_norm": 0.4262654766844021, "learning_rate": 3.4062815266974304e-07, "loss": 0.2533, "step": 5783 }, { "epoch": 2.7636341915058837, "grad_norm": 0.44298432828730744, "learning_rate": 3.3926659076623846e-07, "loss": 0.278, "step": 5784 }, { "epoch": 2.764112060211457, "grad_norm": 0.43947410330039516, "learning_rate": 3.3790770857976995e-07, "loss": 0.2623, "step": 5785 }, { "epoch": 2.76458992891703, "grad_norm": 0.42339510096523475, "learning_rate": 3.3655150648726485e-07, "loss": 0.279, "step": 5786 }, { "epoch": 2.7650677976226032, "grad_norm": 0.45899672855736107, "learning_rate": 3.351979848649134e-07, "loss": 0.257, "step": 5787 }, { "epoch": 2.765545666328176, "grad_norm": 0.45322359078395513, "learning_rate": 3.3384714408815745e-07, "loss": 0.2552, "step": 5788 }, { "epoch": 2.7660235350337494, "grad_norm": 0.42446680964479644, "learning_rate": 3.324989845316928e-07, "loss": 0.2699, "step": 5789 }, { "epoch": 2.7665014037393227, "grad_norm": 0.42230616756573375, "learning_rate": 3.3115350656948043e-07, "loss": 0.2662, "step": 5790 }, { "epoch": 2.7669792724448956, "grad_norm": 0.5214966776605962, "learning_rate": 3.298107105747295e-07, "loss": 0.2722, "step": 5791 }, { "epoch": 2.767457141150469, "grad_norm": 0.4648415035471319, "learning_rate": 3.2847059691990644e-07, "loss": 0.2555, "step": 5792 }, { "epoch": 2.767935009856042, "grad_norm": 0.4484544390935882, "learning_rate": 3.271331659767385e-07, "loss": 0.2464, "step": 5793 }, { "epoch": 2.768412878561615, "grad_norm": 0.5447610391648413, "learning_rate": 3.257984181162044e-07, "loss": 0.2698, "step": 5794 }, { "epoch": 2.7688907472671884, "grad_norm": 0.4602729737342174, "learning_rate": 3.2446635370853686e-07, "loss": 0.2563, "step": 5795 }, { "epoch": 2.7693686159727617, "grad_norm": 0.4937625519242985, "learning_rate": 3.2313697312323143e-07, "loss": 0.2561, "step": 5796 }, { "epoch": 2.7698464846783346, "grad_norm": 0.47243649147806527, "learning_rate": 3.218102767290332e-07, "loss": 0.2559, "step": 5797 }, { "epoch": 2.770324353383908, "grad_norm": 0.5039359434080682, "learning_rate": 3.204862648939422e-07, "loss": 0.2622, "step": 5798 }, { "epoch": 2.770802222089481, "grad_norm": 0.438956565568596, "learning_rate": 3.19164937985218e-07, "loss": 0.2659, "step": 5799 }, { "epoch": 2.771280090795054, "grad_norm": 0.4635475451765875, "learning_rate": 3.1784629636937404e-07, "loss": 0.2584, "step": 5800 }, { "epoch": 2.7717579595006274, "grad_norm": 0.4439760668552996, "learning_rate": 3.1653034041217555e-07, "loss": 0.2537, "step": 5801 }, { "epoch": 2.7722358282062003, "grad_norm": 0.42748355624347, "learning_rate": 3.1521707047864836e-07, "loss": 0.266, "step": 5802 }, { "epoch": 2.7727136969117736, "grad_norm": 0.45627260236223416, "learning_rate": 3.139064869330699e-07, "loss": 0.2684, "step": 5803 }, { "epoch": 2.7731915656173465, "grad_norm": 0.45339200404546826, "learning_rate": 3.125985901389694e-07, "loss": 0.2669, "step": 5804 }, { "epoch": 2.77366943432292, "grad_norm": 0.45868520533855495, "learning_rate": 3.1129338045914004e-07, "loss": 0.2461, "step": 5805 }, { "epoch": 2.774147303028493, "grad_norm": 0.4882764629386277, "learning_rate": 3.099908582556199e-07, "loss": 0.2725, "step": 5806 }, { "epoch": 2.774625171734066, "grad_norm": 0.46500740654242584, "learning_rate": 3.0869102388970673e-07, "loss": 0.2506, "step": 5807 }, { "epoch": 2.7751030404396393, "grad_norm": 0.452079842828752, "learning_rate": 3.0739387772195205e-07, "loss": 0.2631, "step": 5808 }, { "epoch": 2.775580909145212, "grad_norm": 0.4381477199767286, "learning_rate": 3.0609942011216144e-07, "loss": 0.2754, "step": 5809 }, { "epoch": 2.7760587778507855, "grad_norm": 0.4408816665348534, "learning_rate": 3.0480765141939316e-07, "loss": 0.2743, "step": 5810 }, { "epoch": 2.776536646556359, "grad_norm": 0.46086546867548916, "learning_rate": 3.035185720019629e-07, "loss": 0.2704, "step": 5811 }, { "epoch": 2.7770145152619317, "grad_norm": 0.44332016111656897, "learning_rate": 3.022321822174379e-07, "loss": 0.2682, "step": 5812 }, { "epoch": 2.777492383967505, "grad_norm": 0.4492380713862776, "learning_rate": 3.0094848242263943e-07, "loss": 0.2673, "step": 5813 }, { "epoch": 2.777970252673078, "grad_norm": 0.43400035663218267, "learning_rate": 2.9966747297364375e-07, "loss": 0.261, "step": 5814 }, { "epoch": 2.778448121378651, "grad_norm": 0.44189833048994365, "learning_rate": 2.9838915422578e-07, "loss": 0.2563, "step": 5815 }, { "epoch": 2.7789259900842245, "grad_norm": 0.4644677539843851, "learning_rate": 2.9711352653363115e-07, "loss": 0.2566, "step": 5816 }, { "epoch": 2.7794038587897973, "grad_norm": 0.4475255303780797, "learning_rate": 2.9584059025103415e-07, "loss": 0.2636, "step": 5817 }, { "epoch": 2.7798817274953707, "grad_norm": 0.443274368634649, "learning_rate": 2.9457034573108e-07, "loss": 0.2544, "step": 5818 }, { "epoch": 2.7803595962009435, "grad_norm": 0.4450610430568845, "learning_rate": 2.933027933261101e-07, "loss": 0.2714, "step": 5819 }, { "epoch": 2.780837464906517, "grad_norm": 0.42837406809925016, "learning_rate": 2.920379333877221e-07, "loss": 0.2525, "step": 5820 }, { "epoch": 2.78131533361209, "grad_norm": 0.45240402207297176, "learning_rate": 2.907757662667665e-07, "loss": 0.2589, "step": 5821 }, { "epoch": 2.7817932023176635, "grad_norm": 0.46379312040070836, "learning_rate": 2.8951629231334434e-07, "loss": 0.2472, "step": 5822 }, { "epoch": 2.7822710710232363, "grad_norm": 0.4412306348002121, "learning_rate": 2.8825951187681387e-07, "loss": 0.276, "step": 5823 }, { "epoch": 2.7827489397288097, "grad_norm": 0.4603946932137324, "learning_rate": 2.87005425305783e-07, "loss": 0.2722, "step": 5824 }, { "epoch": 2.7832268084343825, "grad_norm": 0.45987250970095805, "learning_rate": 2.8575403294811123e-07, "loss": 0.2744, "step": 5825 }, { "epoch": 2.783704677139956, "grad_norm": 0.43880186402447113, "learning_rate": 2.845053351509142e-07, "loss": 0.2384, "step": 5826 }, { "epoch": 2.784182545845529, "grad_norm": 0.45035442317121277, "learning_rate": 2.8325933226056033e-07, "loss": 0.2759, "step": 5827 }, { "epoch": 2.784660414551102, "grad_norm": 0.4669047993561521, "learning_rate": 2.8201602462266775e-07, "loss": 0.2575, "step": 5828 }, { "epoch": 2.7851382832566753, "grad_norm": 0.5303829700742767, "learning_rate": 2.8077541258210607e-07, "loss": 0.2591, "step": 5829 }, { "epoch": 2.785616151962248, "grad_norm": 0.6146920950572496, "learning_rate": 2.795374964830022e-07, "loss": 0.2634, "step": 5830 }, { "epoch": 2.7860940206678215, "grad_norm": 0.4537712641615584, "learning_rate": 2.7830227666872933e-07, "loss": 0.2754, "step": 5831 }, { "epoch": 2.786571889373395, "grad_norm": 0.4379198918938658, "learning_rate": 2.770697534819178e-07, "loss": 0.2787, "step": 5832 }, { "epoch": 2.7870497580789677, "grad_norm": 0.5202726893687436, "learning_rate": 2.758399272644474e-07, "loss": 0.284, "step": 5833 }, { "epoch": 2.787527626784541, "grad_norm": 0.45661668436412567, "learning_rate": 2.746127983574498e-07, "loss": 0.2649, "step": 5834 }, { "epoch": 2.788005495490114, "grad_norm": 0.4649814028316174, "learning_rate": 2.733883671013082e-07, "loss": 0.2505, "step": 5835 }, { "epoch": 2.788483364195687, "grad_norm": 0.5661711250377324, "learning_rate": 2.721666338356599e-07, "loss": 0.2538, "step": 5836 }, { "epoch": 2.7889612329012605, "grad_norm": 0.4777341668360208, "learning_rate": 2.709475988993915e-07, "loss": 0.2666, "step": 5837 }, { "epoch": 2.7894391016068334, "grad_norm": 0.47234324125709043, "learning_rate": 2.6973126263064143e-07, "loss": 0.2677, "step": 5838 }, { "epoch": 2.7899169703124067, "grad_norm": 0.4313068768665777, "learning_rate": 2.685176253667998e-07, "loss": 0.2648, "step": 5839 }, { "epoch": 2.7903948390179796, "grad_norm": 0.44321493365822373, "learning_rate": 2.673066874445096e-07, "loss": 0.2479, "step": 5840 }, { "epoch": 2.790872707723553, "grad_norm": 0.47299846939177753, "learning_rate": 2.66098449199661e-07, "loss": 0.256, "step": 5841 }, { "epoch": 2.791350576429126, "grad_norm": 0.44183881798639174, "learning_rate": 2.648929109674003e-07, "loss": 0.2559, "step": 5842 }, { "epoch": 2.791828445134699, "grad_norm": 0.48856394471690884, "learning_rate": 2.6369007308212233e-07, "loss": 0.2528, "step": 5843 }, { "epoch": 2.7923063138402724, "grad_norm": 0.42875317220473846, "learning_rate": 2.6248993587747017e-07, "loss": 0.2632, "step": 5844 }, { "epoch": 2.7927841825458453, "grad_norm": 0.439006550935603, "learning_rate": 2.612924996863453e-07, "loss": 0.275, "step": 5845 }, { "epoch": 2.7932620512514186, "grad_norm": 0.502986151150374, "learning_rate": 2.600977648408931e-07, "loss": 0.2728, "step": 5846 }, { "epoch": 2.793739919956992, "grad_norm": 0.502221402752791, "learning_rate": 2.5890573167251076e-07, "loss": 0.2759, "step": 5847 }, { "epoch": 2.794217788662565, "grad_norm": 0.46120467386193476, "learning_rate": 2.5771640051184933e-07, "loss": 0.2901, "step": 5848 }, { "epoch": 2.794695657368138, "grad_norm": 0.46525631074461465, "learning_rate": 2.565297716888082e-07, "loss": 0.2537, "step": 5849 }, { "epoch": 2.7951735260737114, "grad_norm": 0.5367360951991326, "learning_rate": 2.5534584553253526e-07, "loss": 0.2737, "step": 5850 }, { "epoch": 2.7956513947792843, "grad_norm": 0.477928653419597, "learning_rate": 2.5416462237143224e-07, "loss": 0.2723, "step": 5851 }, { "epoch": 2.7961292634848576, "grad_norm": 0.43779540429422825, "learning_rate": 2.5298610253315037e-07, "loss": 0.2492, "step": 5852 }, { "epoch": 2.796607132190431, "grad_norm": 0.4473417651730253, "learning_rate": 2.5181028634458704e-07, "loss": 0.2591, "step": 5853 }, { "epoch": 2.7970850008960038, "grad_norm": 0.4237788400316185, "learning_rate": 2.5063717413189695e-07, "loss": 0.2713, "step": 5854 }, { "epoch": 2.797562869601577, "grad_norm": 0.46121222455074623, "learning_rate": 2.494667662204797e-07, "loss": 0.2587, "step": 5855 }, { "epoch": 2.79804073830715, "grad_norm": 0.44337477946493636, "learning_rate": 2.482990629349824e-07, "loss": 0.277, "step": 5856 }, { "epoch": 2.7985186070127233, "grad_norm": 0.4594080279967684, "learning_rate": 2.471340645993103e-07, "loss": 0.2668, "step": 5857 }, { "epoch": 2.7989964757182966, "grad_norm": 0.4400931256728512, "learning_rate": 2.4597177153661056e-07, "loss": 0.2598, "step": 5858 }, { "epoch": 2.7994743444238694, "grad_norm": 0.4426802092154914, "learning_rate": 2.4481218406928297e-07, "loss": 0.2763, "step": 5859 }, { "epoch": 2.7999522131294428, "grad_norm": 0.4353913914636723, "learning_rate": 2.436553025189758e-07, "loss": 0.2573, "step": 5860 }, { "epoch": 2.8004300818350156, "grad_norm": 0.5177432480573205, "learning_rate": 2.4250112720659024e-07, "loss": 0.2689, "step": 5861 }, { "epoch": 2.800907950540589, "grad_norm": 0.5709604532387811, "learning_rate": 2.413496584522723e-07, "loss": 0.2526, "step": 5862 }, { "epoch": 2.8013858192461623, "grad_norm": 0.45801417663423666, "learning_rate": 2.402008965754199e-07, "loss": 0.2537, "step": 5863 }, { "epoch": 2.801863687951735, "grad_norm": 0.4707325970556673, "learning_rate": 2.3905484189467807e-07, "loss": 0.2759, "step": 5864 }, { "epoch": 2.8023415566573084, "grad_norm": 0.4486900242136094, "learning_rate": 2.3791149472794373e-07, "loss": 0.257, "step": 5865 }, { "epoch": 2.8028194253628813, "grad_norm": 0.45965714254275614, "learning_rate": 2.3677085539235977e-07, "loss": 0.2709, "step": 5866 }, { "epoch": 2.8032972940684546, "grad_norm": 0.43566747125808025, "learning_rate": 2.3563292420432094e-07, "loss": 0.274, "step": 5867 }, { "epoch": 2.803775162774028, "grad_norm": 0.4541848106577836, "learning_rate": 2.3449770147946804e-07, "loss": 0.2683, "step": 5868 }, { "epoch": 2.8042530314796013, "grad_norm": 0.8078669068971982, "learning_rate": 2.3336518753269144e-07, "loss": 0.2577, "step": 5869 }, { "epoch": 2.804730900185174, "grad_norm": 0.42680658838886604, "learning_rate": 2.3223538267813317e-07, "loss": 0.2434, "step": 5870 }, { "epoch": 2.805208768890747, "grad_norm": 0.4358025311911329, "learning_rate": 2.3110828722917812e-07, "loss": 0.2625, "step": 5871 }, { "epoch": 2.8056866375963203, "grad_norm": 0.45786374708678146, "learning_rate": 2.2998390149846395e-07, "loss": 0.268, "step": 5872 }, { "epoch": 2.8061645063018936, "grad_norm": 0.44034045702230934, "learning_rate": 2.2886222579787565e-07, "loss": 0.2593, "step": 5873 }, { "epoch": 2.806642375007467, "grad_norm": 0.629034529435717, "learning_rate": 2.2774326043854656e-07, "loss": 0.2797, "step": 5874 }, { "epoch": 2.80712024371304, "grad_norm": 0.4663816214402025, "learning_rate": 2.2662700573085505e-07, "loss": 0.2589, "step": 5875 }, { "epoch": 2.807598112418613, "grad_norm": 0.4520335805323787, "learning_rate": 2.255134619844357e-07, "loss": 0.2693, "step": 5876 }, { "epoch": 2.808075981124186, "grad_norm": 0.45421012246090176, "learning_rate": 2.2440262950816138e-07, "loss": 0.2486, "step": 5877 }, { "epoch": 2.8085538498297593, "grad_norm": 0.44872612791769856, "learning_rate": 2.23294508610159e-07, "loss": 0.2772, "step": 5878 }, { "epoch": 2.8090317185353326, "grad_norm": 0.4471553331629049, "learning_rate": 2.2218909959780265e-07, "loss": 0.2622, "step": 5879 }, { "epoch": 2.8095095872409055, "grad_norm": 0.4326277574229714, "learning_rate": 2.2108640277771153e-07, "loss": 0.241, "step": 5880 }, { "epoch": 2.809987455946479, "grad_norm": 0.4447810782306284, "learning_rate": 2.1998641845575542e-07, "loss": 0.2596, "step": 5881 }, { "epoch": 2.8104653246520517, "grad_norm": 0.4438607779450878, "learning_rate": 2.1888914693705132e-07, "loss": 0.2781, "step": 5882 }, { "epoch": 2.810943193357625, "grad_norm": 0.4515855882029372, "learning_rate": 2.1779458852596136e-07, "loss": 0.26, "step": 5883 }, { "epoch": 2.8114210620631983, "grad_norm": 0.46722738200529673, "learning_rate": 2.167027435260971e-07, "loss": 0.263, "step": 5884 }, { "epoch": 2.811898930768771, "grad_norm": 0.8295117627891755, "learning_rate": 2.156136122403174e-07, "loss": 0.2524, "step": 5885 }, { "epoch": 2.8123767994743445, "grad_norm": 0.4993459060161254, "learning_rate": 2.1452719497072839e-07, "loss": 0.2702, "step": 5886 }, { "epoch": 2.8128546681799174, "grad_norm": 0.4414682776865743, "learning_rate": 2.1344349201868232e-07, "loss": 0.2771, "step": 5887 }, { "epoch": 2.8133325368854907, "grad_norm": 0.44309446914027134, "learning_rate": 2.1236250368477985e-07, "loss": 0.2827, "step": 5888 }, { "epoch": 2.813810405591064, "grad_norm": 0.4392176207768714, "learning_rate": 2.1128423026886892e-07, "loss": 0.2721, "step": 5889 }, { "epoch": 2.814288274296637, "grad_norm": 0.4418095009271383, "learning_rate": 2.1020867207004026e-07, "loss": 0.2673, "step": 5890 }, { "epoch": 2.81476614300221, "grad_norm": 0.4422044960477242, "learning_rate": 2.0913582938663855e-07, "loss": 0.2686, "step": 5891 }, { "epoch": 2.815244011707783, "grad_norm": 0.4697009809230947, "learning_rate": 2.0806570251625023e-07, "loss": 0.2533, "step": 5892 }, { "epoch": 2.8157218804133564, "grad_norm": 0.4507224779670188, "learning_rate": 2.0699829175570785e-07, "loss": 0.2519, "step": 5893 }, { "epoch": 2.8161997491189297, "grad_norm": 0.4528792818717239, "learning_rate": 2.0593359740109452e-07, "loss": 0.2701, "step": 5894 }, { "epoch": 2.816677617824503, "grad_norm": 0.4460463818994321, "learning_rate": 2.048716197477374e-07, "loss": 0.2599, "step": 5895 }, { "epoch": 2.817155486530076, "grad_norm": 0.49958157563151245, "learning_rate": 2.038123590902086e-07, "loss": 0.2625, "step": 5896 }, { "epoch": 2.8176333552356487, "grad_norm": 0.5406609588682755, "learning_rate": 2.0275581572233083e-07, "loss": 0.2459, "step": 5897 }, { "epoch": 2.818111223941222, "grad_norm": 0.4618527016308968, "learning_rate": 2.017019899371686e-07, "loss": 0.2725, "step": 5898 }, { "epoch": 2.8185890926467954, "grad_norm": 0.43028097781936997, "learning_rate": 2.0065088202703587e-07, "loss": 0.2742, "step": 5899 }, { "epoch": 2.8190669613523687, "grad_norm": 0.4615866120409888, "learning_rate": 1.996024922834905e-07, "loss": 0.2488, "step": 5900 }, { "epoch": 2.8195448300579415, "grad_norm": 0.4243814410711499, "learning_rate": 1.9855682099733876e-07, "loss": 0.251, "step": 5901 }, { "epoch": 2.820022698763515, "grad_norm": 0.42845665939883043, "learning_rate": 1.9751386845862864e-07, "loss": 0.2627, "step": 5902 }, { "epoch": 2.8205005674690877, "grad_norm": 0.44017495021305203, "learning_rate": 1.9647363495665983e-07, "loss": 0.2529, "step": 5903 }, { "epoch": 2.820978436174661, "grad_norm": 0.4587273313715358, "learning_rate": 1.9543612077997376e-07, "loss": 0.2563, "step": 5904 }, { "epoch": 2.8214563048802344, "grad_norm": 0.47195775641267157, "learning_rate": 1.9440132621635687e-07, "loss": 0.2654, "step": 5905 }, { "epoch": 2.8219341735858072, "grad_norm": 0.43800418779605343, "learning_rate": 1.9336925155284514e-07, "loss": 0.2698, "step": 5906 }, { "epoch": 2.8224120422913805, "grad_norm": 0.5249547147069987, "learning_rate": 1.9233989707571732e-07, "loss": 0.2738, "step": 5907 }, { "epoch": 2.8228899109969534, "grad_norm": 0.4479165992631786, "learning_rate": 1.9131326307049724e-07, "loss": 0.2583, "step": 5908 }, { "epoch": 2.8233677797025267, "grad_norm": 0.4367008126895965, "learning_rate": 1.9028934982195602e-07, "loss": 0.2483, "step": 5909 }, { "epoch": 2.8238456484081, "grad_norm": 0.4453324046755028, "learning_rate": 1.8926815761410867e-07, "loss": 0.2595, "step": 5910 }, { "epoch": 2.824323517113673, "grad_norm": 0.4515610654582882, "learning_rate": 1.8824968673021525e-07, "loss": 0.2633, "step": 5911 }, { "epoch": 2.8248013858192462, "grad_norm": 0.44115993805926473, "learning_rate": 1.872339374527843e-07, "loss": 0.2536, "step": 5912 }, { "epoch": 2.825279254524819, "grad_norm": 0.4228775353271241, "learning_rate": 1.8622091006356368e-07, "loss": 0.2481, "step": 5913 }, { "epoch": 2.8257571232303924, "grad_norm": 0.44436167133939714, "learning_rate": 1.852106048435498e-07, "loss": 0.2657, "step": 5914 }, { "epoch": 2.8262349919359657, "grad_norm": 0.4406708131002175, "learning_rate": 1.8420302207298623e-07, "loss": 0.2604, "step": 5915 }, { "epoch": 2.8267128606415386, "grad_norm": 0.4654540147793532, "learning_rate": 1.831981620313561e-07, "loss": 0.2753, "step": 5916 }, { "epoch": 2.827190729347112, "grad_norm": 0.4371867269522205, "learning_rate": 1.8219602499738863e-07, "loss": 0.2584, "step": 5917 }, { "epoch": 2.827668598052685, "grad_norm": 0.4341168805289483, "learning_rate": 1.8119661124906262e-07, "loss": 0.2697, "step": 5918 }, { "epoch": 2.828146466758258, "grad_norm": 0.5332475340009322, "learning_rate": 1.801999210635952e-07, "loss": 0.2544, "step": 5919 }, { "epoch": 2.8286243354638314, "grad_norm": 0.4415760206731807, "learning_rate": 1.792059547174507e-07, "loss": 0.2608, "step": 5920 }, { "epoch": 2.8291022041694047, "grad_norm": 0.4539316651950751, "learning_rate": 1.7821471248633982e-07, "loss": 0.2545, "step": 5921 }, { "epoch": 2.8295800728749776, "grad_norm": 0.4269206432587368, "learning_rate": 1.7722619464521363e-07, "loss": 0.254, "step": 5922 }, { "epoch": 2.830057941580551, "grad_norm": 0.43673477313105225, "learning_rate": 1.762404014682706e-07, "loss": 0.2667, "step": 5923 }, { "epoch": 2.8305358102861238, "grad_norm": 0.4414874551970224, "learning_rate": 1.75257333228952e-07, "loss": 0.2696, "step": 5924 }, { "epoch": 2.831013678991697, "grad_norm": 0.4497054490741176, "learning_rate": 1.7427699019994415e-07, "loss": 0.2676, "step": 5925 }, { "epoch": 2.8314915476972704, "grad_norm": 0.4908756734479855, "learning_rate": 1.7329937265317508e-07, "loss": 0.2771, "step": 5926 }, { "epoch": 2.8319694164028433, "grad_norm": 0.4454065328919432, "learning_rate": 1.7232448085982012e-07, "loss": 0.2597, "step": 5927 }, { "epoch": 2.8324472851084166, "grad_norm": 0.42499929417919846, "learning_rate": 1.713523150902985e-07, "loss": 0.2689, "step": 5928 }, { "epoch": 2.8329251538139895, "grad_norm": 0.48284867449405694, "learning_rate": 1.7038287561426892e-07, "loss": 0.2686, "step": 5929 }, { "epoch": 2.8334030225195628, "grad_norm": 0.44411974690243017, "learning_rate": 1.6941616270063854e-07, "loss": 0.2877, "step": 5930 }, { "epoch": 2.833880891225136, "grad_norm": 0.442865545746602, "learning_rate": 1.684521766175562e-07, "loss": 0.2575, "step": 5931 }, { "epoch": 2.834358759930709, "grad_norm": 0.5103520942204934, "learning_rate": 1.6749091763241464e-07, "loss": 0.2606, "step": 5932 }, { "epoch": 2.8348366286362823, "grad_norm": 0.447650390063257, "learning_rate": 1.665323860118495e-07, "loss": 0.2705, "step": 5933 }, { "epoch": 2.835314497341855, "grad_norm": 0.44565808230621423, "learning_rate": 1.6557658202174254e-07, "loss": 0.2703, "step": 5934 }, { "epoch": 2.8357923660474285, "grad_norm": 0.43861667920681485, "learning_rate": 1.6462350592721498e-07, "loss": 0.2447, "step": 5935 }, { "epoch": 2.8362702347530018, "grad_norm": 0.4633523191534991, "learning_rate": 1.6367315799263206e-07, "loss": 0.2713, "step": 5936 }, { "epoch": 2.8367481034585746, "grad_norm": 0.4531309552740638, "learning_rate": 1.6272553848160733e-07, "loss": 0.2649, "step": 5937 }, { "epoch": 2.837225972164148, "grad_norm": 0.4621397423597151, "learning_rate": 1.6178064765699052e-07, "loss": 0.2752, "step": 5938 }, { "epoch": 2.837703840869721, "grad_norm": 0.4415497248019033, "learning_rate": 1.6083848578087868e-07, "loss": 0.2527, "step": 5939 }, { "epoch": 2.838181709575294, "grad_norm": 0.44367203604390565, "learning_rate": 1.5989905311461274e-07, "loss": 0.2677, "step": 5940 }, { "epoch": 2.8386595782808675, "grad_norm": 0.5809887272446366, "learning_rate": 1.5896234991877202e-07, "loss": 0.2713, "step": 5941 }, { "epoch": 2.8391374469864403, "grad_norm": 0.4303037195655415, "learning_rate": 1.5802837645318203e-07, "loss": 0.2455, "step": 5942 }, { "epoch": 2.8396153156920136, "grad_norm": 0.47424596948568043, "learning_rate": 1.570971329769111e-07, "loss": 0.2524, "step": 5943 }, { "epoch": 2.8400931843975865, "grad_norm": 0.4178692383364968, "learning_rate": 1.5616861974827036e-07, "loss": 0.2526, "step": 5944 }, { "epoch": 2.84057105310316, "grad_norm": 0.46166989830773797, "learning_rate": 1.5524283702481158e-07, "loss": 0.271, "step": 5945 }, { "epoch": 2.841048921808733, "grad_norm": 0.4440783366258334, "learning_rate": 1.5431978506333155e-07, "loss": 0.2633, "step": 5946 }, { "epoch": 2.8415267905143065, "grad_norm": 0.441365885091842, "learning_rate": 1.5339946411986885e-07, "loss": 0.2528, "step": 5947 }, { "epoch": 2.8420046592198793, "grad_norm": 0.4737324334331007, "learning_rate": 1.5248187444970252e-07, "loss": 0.2599, "step": 5948 }, { "epoch": 2.8424825279254526, "grad_norm": 0.48811117143680444, "learning_rate": 1.5156701630735792e-07, "loss": 0.2453, "step": 5949 }, { "epoch": 2.8429603966310255, "grad_norm": 0.43188605131922725, "learning_rate": 1.5065488994659983e-07, "loss": 0.2669, "step": 5950 }, { "epoch": 2.843438265336599, "grad_norm": 0.434588565968877, "learning_rate": 1.497454956204347e-07, "loss": 0.2621, "step": 5951 }, { "epoch": 2.843916134042172, "grad_norm": 0.45504777884303677, "learning_rate": 1.4883883358111418e-07, "loss": 0.2599, "step": 5952 }, { "epoch": 2.844394002747745, "grad_norm": 0.43178757210896046, "learning_rate": 1.4793490408013033e-07, "loss": 0.2617, "step": 5953 }, { "epoch": 2.8448718714533183, "grad_norm": 0.45654767138484836, "learning_rate": 1.4703370736821487e-07, "loss": 0.268, "step": 5954 }, { "epoch": 2.845349740158891, "grad_norm": 0.48119573262488585, "learning_rate": 1.461352436953478e-07, "loss": 0.2622, "step": 5955 }, { "epoch": 2.8458276088644645, "grad_norm": 0.4351617846746113, "learning_rate": 1.4523951331074426e-07, "loss": 0.2601, "step": 5956 }, { "epoch": 2.846305477570038, "grad_norm": 0.45633355644087814, "learning_rate": 1.4434651646286325e-07, "loss": 0.2653, "step": 5957 }, { "epoch": 2.8467833462756107, "grad_norm": 0.4736965437562393, "learning_rate": 1.4345625339940994e-07, "loss": 0.2781, "step": 5958 }, { "epoch": 2.847261214981184, "grad_norm": 0.46355248896208456, "learning_rate": 1.4256872436732461e-07, "loss": 0.2591, "step": 5959 }, { "epoch": 2.847739083686757, "grad_norm": 0.4283329171153488, "learning_rate": 1.4168392961279254e-07, "loss": 0.256, "step": 5960 }, { "epoch": 2.84821695239233, "grad_norm": 0.46933751036840643, "learning_rate": 1.4080186938124074e-07, "loss": 0.264, "step": 5961 }, { "epoch": 2.8486948210979035, "grad_norm": 0.4421456212297339, "learning_rate": 1.3992254391733794e-07, "loss": 0.2435, "step": 5962 }, { "epoch": 2.8491726898034764, "grad_norm": 0.43361352767650047, "learning_rate": 1.390459534649924e-07, "loss": 0.2628, "step": 5963 }, { "epoch": 2.8496505585090497, "grad_norm": 0.43095231476269297, "learning_rate": 1.38172098267354e-07, "loss": 0.2607, "step": 5964 }, { "epoch": 2.8501284272146226, "grad_norm": 0.4703203261155736, "learning_rate": 1.3730097856681668e-07, "loss": 0.2504, "step": 5965 }, { "epoch": 2.850606295920196, "grad_norm": 0.4376878982675997, "learning_rate": 1.364325946050138e-07, "loss": 0.2799, "step": 5966 }, { "epoch": 2.851084164625769, "grad_norm": 0.4391097408323728, "learning_rate": 1.355669466228171e-07, "loss": 0.2491, "step": 5967 }, { "epoch": 2.851562033331342, "grad_norm": 0.5148238033140623, "learning_rate": 1.3470403486034566e-07, "loss": 0.268, "step": 5968 }, { "epoch": 2.8520399020369154, "grad_norm": 0.44170548346421123, "learning_rate": 1.3384385955695355e-07, "loss": 0.2603, "step": 5969 }, { "epoch": 2.8525177707424882, "grad_norm": 0.46229977499534447, "learning_rate": 1.329864209512377e-07, "loss": 0.2643, "step": 5970 }, { "epoch": 2.8529956394480616, "grad_norm": 0.4703215322101558, "learning_rate": 1.3213171928103785e-07, "loss": 0.2613, "step": 5971 }, { "epoch": 2.853473508153635, "grad_norm": 0.4977139206239717, "learning_rate": 1.3127975478343435e-07, "loss": 0.259, "step": 5972 }, { "epoch": 2.853951376859208, "grad_norm": 0.44329852164883693, "learning_rate": 1.3043052769474375e-07, "loss": 0.25, "step": 5973 }, { "epoch": 2.854429245564781, "grad_norm": 0.44218052941146163, "learning_rate": 1.2958403825052978e-07, "loss": 0.254, "step": 5974 }, { "epoch": 2.8549071142703544, "grad_norm": 0.4466772904005583, "learning_rate": 1.2874028668559247e-07, "loss": 0.2651, "step": 5975 }, { "epoch": 2.8553849829759272, "grad_norm": 0.43862131247970004, "learning_rate": 1.2789927323397232e-07, "loss": 0.2602, "step": 5976 }, { "epoch": 2.8558628516815006, "grad_norm": 0.45565339829882245, "learning_rate": 1.270609981289539e-07, "loss": 0.2699, "step": 5977 }, { "epoch": 2.856340720387074, "grad_norm": 0.45795122735762345, "learning_rate": 1.2622546160305894e-07, "loss": 0.2498, "step": 5978 }, { "epoch": 2.8568185890926467, "grad_norm": 0.43648336997823883, "learning_rate": 1.2539266388804981e-07, "loss": 0.2627, "step": 5979 }, { "epoch": 2.85729645779822, "grad_norm": 0.4637171759508475, "learning_rate": 1.245626052149318e-07, "loss": 0.2535, "step": 5980 }, { "epoch": 2.857774326503793, "grad_norm": 0.45562841177816166, "learning_rate": 1.2373528581394733e-07, "loss": 0.2462, "step": 5981 }, { "epoch": 2.8582521952093662, "grad_norm": 0.4450586940009108, "learning_rate": 1.2291070591457842e-07, "loss": 0.2788, "step": 5982 }, { "epoch": 2.8587300639149396, "grad_norm": 0.4386721155226896, "learning_rate": 1.2208886574555323e-07, "loss": 0.2725, "step": 5983 }, { "epoch": 2.8592079326205124, "grad_norm": 0.42956994998421677, "learning_rate": 1.212697655348316e-07, "loss": 0.266, "step": 5984 }, { "epoch": 2.8596858013260857, "grad_norm": 0.44622397158469107, "learning_rate": 1.2045340550961958e-07, "loss": 0.2606, "step": 5985 }, { "epoch": 2.8601636700316586, "grad_norm": 0.8670959918097046, "learning_rate": 1.196397858963616e-07, "loss": 0.275, "step": 5986 }, { "epoch": 2.860641538737232, "grad_norm": 0.42188794447231875, "learning_rate": 1.1882890692073933e-07, "loss": 0.2709, "step": 5987 }, { "epoch": 2.8611194074428052, "grad_norm": 0.45699426887028993, "learning_rate": 1.1802076880767732e-07, "loss": 0.261, "step": 5988 }, { "epoch": 2.861597276148378, "grad_norm": 0.6013209724994314, "learning_rate": 1.1721537178133958e-07, "loss": 0.2665, "step": 5989 }, { "epoch": 2.8620751448539514, "grad_norm": 0.5033962646495204, "learning_rate": 1.164127160651285e-07, "loss": 0.2886, "step": 5990 }, { "epoch": 2.8625530135595243, "grad_norm": 0.4387868638958641, "learning_rate": 1.15612801881686e-07, "loss": 0.2631, "step": 5991 }, { "epoch": 2.8630308822650976, "grad_norm": 0.4470342389927088, "learning_rate": 1.148156294528946e-07, "loss": 0.2748, "step": 5992 }, { "epoch": 2.863508750970671, "grad_norm": 0.44782132948777004, "learning_rate": 1.1402119899987629e-07, "loss": 0.2482, "step": 5993 }, { "epoch": 2.863986619676244, "grad_norm": 0.4309117184393657, "learning_rate": 1.1322951074299149e-07, "loss": 0.2616, "step": 5994 }, { "epoch": 2.864464488381817, "grad_norm": 0.45641574770069654, "learning_rate": 1.1244056490184008e-07, "loss": 0.2741, "step": 5995 }, { "epoch": 2.86494235708739, "grad_norm": 0.4795660239874649, "learning_rate": 1.1165436169526366e-07, "loss": 0.2679, "step": 5996 }, { "epoch": 2.8654202257929633, "grad_norm": 0.5323760308177075, "learning_rate": 1.1087090134134005e-07, "loss": 0.2589, "step": 5997 }, { "epoch": 2.8658980944985366, "grad_norm": 0.504237551040962, "learning_rate": 1.1009018405738536e-07, "loss": 0.2625, "step": 5998 }, { "epoch": 2.86637596320411, "grad_norm": 0.4402136999052143, "learning_rate": 1.0931221005996084e-07, "loss": 0.2755, "step": 5999 }, { "epoch": 2.866853831909683, "grad_norm": 0.4491859848447432, "learning_rate": 1.0853697956485942e-07, "loss": 0.245, "step": 6000 }, { "epoch": 2.867331700615256, "grad_norm": 0.45231341758240234, "learning_rate": 1.0776449278711686e-07, "loss": 0.2491, "step": 6001 }, { "epoch": 2.867809569320829, "grad_norm": 0.4369763191287368, "learning_rate": 1.0699474994100845e-07, "loss": 0.2558, "step": 6002 }, { "epoch": 2.8682874380264023, "grad_norm": 0.5358207159433925, "learning_rate": 1.0622775124004669e-07, "loss": 0.2668, "step": 6003 }, { "epoch": 2.8687653067319756, "grad_norm": 0.471953453207217, "learning_rate": 1.0546349689698365e-07, "loss": 0.2685, "step": 6004 }, { "epoch": 2.8692431754375485, "grad_norm": 0.44877487345300576, "learning_rate": 1.0470198712381086e-07, "loss": 0.2629, "step": 6005 }, { "epoch": 2.869721044143122, "grad_norm": 0.43639707706363223, "learning_rate": 1.03943222131756e-07, "loss": 0.2643, "step": 6006 }, { "epoch": 2.8701989128486947, "grad_norm": 0.43048550859297546, "learning_rate": 1.0318720213128741e-07, "loss": 0.2682, "step": 6007 }, { "epoch": 2.870676781554268, "grad_norm": 0.4366175477899045, "learning_rate": 1.0243392733211289e-07, "loss": 0.2576, "step": 6008 }, { "epoch": 2.8711546502598413, "grad_norm": 0.4420799299343235, "learning_rate": 1.0168339794317638e-07, "loss": 0.2664, "step": 6009 }, { "epoch": 2.871632518965414, "grad_norm": 0.43731535232122004, "learning_rate": 1.009356141726614e-07, "loss": 0.2485, "step": 6010 }, { "epoch": 2.8721103876709875, "grad_norm": 0.45731961228738927, "learning_rate": 1.00190576227992e-07, "loss": 0.2713, "step": 6011 }, { "epoch": 2.8725882563765603, "grad_norm": 0.4357113263887429, "learning_rate": 9.94482843158262e-08, "loss": 0.2542, "step": 6012 }, { "epoch": 2.8730661250821337, "grad_norm": 0.4222045104321912, "learning_rate": 9.870873864206376e-08, "loss": 0.2551, "step": 6013 }, { "epoch": 2.873543993787707, "grad_norm": 0.4337713135885946, "learning_rate": 9.797193941184169e-08, "loss": 0.2286, "step": 6014 }, { "epoch": 2.87402186249328, "grad_norm": 0.521268220454681, "learning_rate": 9.723788682953539e-08, "loss": 0.274, "step": 6015 }, { "epoch": 2.874499731198853, "grad_norm": 0.4312558999875928, "learning_rate": 9.650658109875533e-08, "loss": 0.2431, "step": 6016 }, { "epoch": 2.874977599904426, "grad_norm": 0.43570324284996176, "learning_rate": 9.5778022422357e-08, "loss": 0.2506, "step": 6017 }, { "epoch": 2.8754554686099993, "grad_norm": 0.43684524986259893, "learning_rate": 9.505221100242767e-08, "loss": 0.2554, "step": 6018 }, { "epoch": 2.8759333373155727, "grad_norm": 0.45826436214172095, "learning_rate": 9.432914704029406e-08, "loss": 0.2606, "step": 6019 }, { "epoch": 2.8764112060211455, "grad_norm": 0.4296388690654118, "learning_rate": 9.360883073652238e-08, "loss": 0.2677, "step": 6020 }, { "epoch": 2.876889074726719, "grad_norm": 1.0983884559688328, "learning_rate": 9.289126229091505e-08, "loss": 0.2423, "step": 6021 }, { "epoch": 2.8773669434322917, "grad_norm": 0.4443043294613616, "learning_rate": 9.217644190251285e-08, "loss": 0.2727, "step": 6022 }, { "epoch": 2.877844812137865, "grad_norm": 0.45206181728205963, "learning_rate": 9.146436976959605e-08, "loss": 0.2572, "step": 6023 }, { "epoch": 2.8783226808434383, "grad_norm": 0.45123890210077205, "learning_rate": 9.075504608967889e-08, "loss": 0.266, "step": 6024 }, { "epoch": 2.8788005495490117, "grad_norm": 0.4575185175136139, "learning_rate": 9.004847105951509e-08, "loss": 0.2639, "step": 6025 }, { "epoch": 2.8792784182545845, "grad_norm": 0.48693714644668074, "learning_rate": 8.934464487509786e-08, "loss": 0.2693, "step": 6026 }, { "epoch": 2.879756286960158, "grad_norm": 0.4425901100192589, "learning_rate": 8.86435677316544e-08, "loss": 0.2525, "step": 6027 }, { "epoch": 2.8802341556657307, "grad_norm": 0.44877326238514714, "learning_rate": 8.794523982365134e-08, "loss": 0.2641, "step": 6028 }, { "epoch": 2.880712024371304, "grad_norm": 0.4285119487781865, "learning_rate": 8.724966134479374e-08, "loss": 0.2534, "step": 6029 }, { "epoch": 2.8811898930768773, "grad_norm": 0.46436342305670525, "learning_rate": 8.655683248802282e-08, "loss": 0.2821, "step": 6030 }, { "epoch": 2.88166776178245, "grad_norm": 0.44312616547475814, "learning_rate": 8.586675344551599e-08, "loss": 0.2731, "step": 6031 }, { "epoch": 2.8821456304880235, "grad_norm": 0.43442743265027634, "learning_rate": 8.517942440868898e-08, "loss": 0.2506, "step": 6032 }, { "epoch": 2.8826234991935964, "grad_norm": 0.4308115226635295, "learning_rate": 8.449484556819598e-08, "loss": 0.2701, "step": 6033 }, { "epoch": 2.8831013678991697, "grad_norm": 0.4460495905460458, "learning_rate": 8.38130171139262e-08, "loss": 0.2551, "step": 6034 }, { "epoch": 2.883579236604743, "grad_norm": 0.5389940715079629, "learning_rate": 8.313393923500613e-08, "loss": 0.2657, "step": 6035 }, { "epoch": 2.884057105310316, "grad_norm": 0.44293129901679096, "learning_rate": 8.245761211980174e-08, "loss": 0.242, "step": 6036 }, { "epoch": 2.884534974015889, "grad_norm": 0.47983678443629796, "learning_rate": 8.1784035955913e-08, "loss": 0.2629, "step": 6037 }, { "epoch": 2.885012842721462, "grad_norm": 0.4318505645360447, "learning_rate": 8.11132109301782e-08, "loss": 0.2722, "step": 6038 }, { "epoch": 2.8854907114270354, "grad_norm": 0.43438969618864703, "learning_rate": 8.044513722867298e-08, "loss": 0.2696, "step": 6039 }, { "epoch": 2.8859685801326087, "grad_norm": 0.4569375021145621, "learning_rate": 7.977981503670795e-08, "loss": 0.2851, "step": 6040 }, { "epoch": 2.8864464488381816, "grad_norm": 0.4479908637746793, "learning_rate": 7.911724453883329e-08, "loss": 0.2706, "step": 6041 }, { "epoch": 2.886924317543755, "grad_norm": 0.4326153900898914, "learning_rate": 7.845742591883309e-08, "loss": 0.2551, "step": 6042 }, { "epoch": 2.8874021862493278, "grad_norm": 0.4357317158977019, "learning_rate": 7.780035935972985e-08, "loss": 0.2599, "step": 6043 }, { "epoch": 2.887880054954901, "grad_norm": 0.47807706881714124, "learning_rate": 7.714604504378332e-08, "loss": 0.2515, "step": 6044 }, { "epoch": 2.8883579236604744, "grad_norm": 0.48906636149481963, "learning_rate": 7.64944831524872e-08, "loss": 0.2628, "step": 6045 }, { "epoch": 2.8888357923660477, "grad_norm": 0.48643435510610833, "learning_rate": 7.584567386657248e-08, "loss": 0.2653, "step": 6046 }, { "epoch": 2.8893136610716206, "grad_norm": 0.4656245737115965, "learning_rate": 7.519961736601078e-08, "loss": 0.272, "step": 6047 }, { "epoch": 2.8897915297771934, "grad_norm": 0.44143454689073186, "learning_rate": 7.455631383000428e-08, "loss": 0.2603, "step": 6048 }, { "epoch": 2.8902693984827668, "grad_norm": 0.622134137502078, "learning_rate": 7.391576343699359e-08, "loss": 0.2628, "step": 6049 }, { "epoch": 2.89074726718834, "grad_norm": 0.4276530571655008, "learning_rate": 7.327796636465767e-08, "loss": 0.2689, "step": 6050 }, { "epoch": 2.8912251358939134, "grad_norm": 0.4382139339105153, "learning_rate": 7.264292278990947e-08, "loss": 0.251, "step": 6051 }, { "epoch": 2.8917030045994863, "grad_norm": 0.47876504229735695, "learning_rate": 7.201063288889809e-08, "loss": 0.2814, "step": 6052 }, { "epoch": 2.8921808733050596, "grad_norm": 0.4448938723916756, "learning_rate": 7.138109683701211e-08, "loss": 0.2654, "step": 6053 }, { "epoch": 2.8926587420106324, "grad_norm": 0.4453495110101877, "learning_rate": 7.075431480887074e-08, "loss": 0.2682, "step": 6054 }, { "epoch": 2.8931366107162058, "grad_norm": 0.4498675594265008, "learning_rate": 7.01302869783338e-08, "loss": 0.2618, "step": 6055 }, { "epoch": 2.893614479421779, "grad_norm": 0.44182308271744547, "learning_rate": 6.950901351849504e-08, "loss": 0.2738, "step": 6056 }, { "epoch": 2.894092348127352, "grad_norm": 0.4297806934525359, "learning_rate": 6.889049460168662e-08, "loss": 0.265, "step": 6057 }, { "epoch": 2.8945702168329253, "grad_norm": 0.4270875134609304, "learning_rate": 6.827473039947131e-08, "loss": 0.2455, "step": 6058 }, { "epoch": 2.895048085538498, "grad_norm": 0.4386255563204099, "learning_rate": 6.766172108265356e-08, "loss": 0.272, "step": 6059 }, { "epoch": 2.8955259542440714, "grad_norm": 0.44212270253634706, "learning_rate": 6.705146682127184e-08, "loss": 0.2708, "step": 6060 }, { "epoch": 2.8960038229496448, "grad_norm": 0.4827942892265252, "learning_rate": 6.64439677845985e-08, "loss": 0.2766, "step": 6061 }, { "epoch": 2.8964816916552176, "grad_norm": 0.4419630496584052, "learning_rate": 6.583922414114318e-08, "loss": 0.2609, "step": 6062 }, { "epoch": 2.896959560360791, "grad_norm": 0.4750351230744057, "learning_rate": 6.523723605865174e-08, "loss": 0.2763, "step": 6063 }, { "epoch": 2.897437429066364, "grad_norm": 0.47028028478371914, "learning_rate": 6.4638003704105e-08, "loss": 0.2661, "step": 6064 }, { "epoch": 2.897915297771937, "grad_norm": 0.4625973271853386, "learning_rate": 6.404152724371892e-08, "loss": 0.2702, "step": 6065 }, { "epoch": 2.8983931664775104, "grad_norm": 0.7072555980090101, "learning_rate": 6.344780684294671e-08, "loss": 0.2669, "step": 6066 }, { "epoch": 2.8988710351830833, "grad_norm": 0.4482616347126552, "learning_rate": 6.28568426664744e-08, "loss": 0.2533, "step": 6067 }, { "epoch": 2.8993489038886566, "grad_norm": 0.4303610137833005, "learning_rate": 6.226863487822532e-08, "loss": 0.2702, "step": 6068 }, { "epoch": 2.8998267725942295, "grad_norm": 0.4385548652125175, "learning_rate": 6.168318364135895e-08, "loss": 0.2586, "step": 6069 }, { "epoch": 2.900304641299803, "grad_norm": 0.46216493168944667, "learning_rate": 6.110048911826871e-08, "loss": 0.2605, "step": 6070 }, { "epoch": 2.900782510005376, "grad_norm": 0.42831561535218843, "learning_rate": 6.05205514705831e-08, "loss": 0.2689, "step": 6071 }, { "epoch": 2.9012603787109494, "grad_norm": 0.42968742796774145, "learning_rate": 5.99433708591679e-08, "loss": 0.2601, "step": 6072 }, { "epoch": 2.9017382474165223, "grad_norm": 1.0032255802524332, "learning_rate": 5.9368947444121695e-08, "loss": 0.2851, "step": 6073 }, { "epoch": 2.902216116122095, "grad_norm": 0.44746843283921595, "learning_rate": 5.879728138477925e-08, "loss": 0.2645, "step": 6074 }, { "epoch": 2.9026939848276685, "grad_norm": 0.45839779776483, "learning_rate": 5.82283728397115e-08, "loss": 0.2928, "step": 6075 }, { "epoch": 2.903171853533242, "grad_norm": 0.4426768127258644, "learning_rate": 5.766222196672333e-08, "loss": 0.2529, "step": 6076 }, { "epoch": 2.903649722238815, "grad_norm": 0.4306421815201952, "learning_rate": 5.709882892285468e-08, "loss": 0.2596, "step": 6077 }, { "epoch": 2.904127590944388, "grad_norm": 0.4339299174215774, "learning_rate": 5.653819386438164e-08, "loss": 0.2543, "step": 6078 }, { "epoch": 2.9046054596499613, "grad_norm": 0.46448596548019094, "learning_rate": 5.598031694681316e-08, "loss": 0.2698, "step": 6079 }, { "epoch": 2.905083328355534, "grad_norm": 0.4319137355146327, "learning_rate": 5.542519832489546e-08, "loss": 0.2696, "step": 6080 }, { "epoch": 2.9055611970611075, "grad_norm": 0.5684354169120892, "learning_rate": 5.487283815260869e-08, "loss": 0.2694, "step": 6081 }, { "epoch": 2.906039065766681, "grad_norm": 0.4360547156677553, "learning_rate": 5.432323658316807e-08, "loss": 0.2629, "step": 6082 }, { "epoch": 2.9065169344722537, "grad_norm": 0.4342812482531317, "learning_rate": 5.3776393769021664e-08, "loss": 0.271, "step": 6083 }, { "epoch": 2.906994803177827, "grad_norm": 0.46003291233940047, "learning_rate": 5.3232309861857015e-08, "loss": 0.2489, "step": 6084 }, { "epoch": 2.9074726718834, "grad_norm": 0.4325963268001337, "learning_rate": 5.269098501259007e-08, "loss": 0.2794, "step": 6085 }, { "epoch": 2.907950540588973, "grad_norm": 0.4637871151293466, "learning_rate": 5.2152419371376276e-08, "loss": 0.2726, "step": 6086 }, { "epoch": 2.9084284092945465, "grad_norm": 0.45775569434566626, "learning_rate": 5.161661308760613e-08, "loss": 0.2769, "step": 6087 }, { "epoch": 2.9089062780001194, "grad_norm": 0.5008124956659363, "learning_rate": 5.108356630989963e-08, "loss": 0.2598, "step": 6088 }, { "epoch": 2.9093841467056927, "grad_norm": 0.4407017585102997, "learning_rate": 5.05532791861163e-08, "loss": 0.2704, "step": 6089 }, { "epoch": 2.9098620154112655, "grad_norm": 0.461333224542017, "learning_rate": 5.002575186334735e-08, "loss": 0.2817, "step": 6090 }, { "epoch": 2.910339884116839, "grad_norm": 0.44289468727956904, "learning_rate": 4.950098448792129e-08, "loss": 0.2656, "step": 6091 }, { "epoch": 2.910817752822412, "grad_norm": 0.47285680491757126, "learning_rate": 4.897897720539835e-08, "loss": 0.2561, "step": 6092 }, { "epoch": 2.911295621527985, "grad_norm": 0.4770909008792775, "learning_rate": 4.8459730160573814e-08, "loss": 0.2543, "step": 6093 }, { "epoch": 2.9117734902335584, "grad_norm": 0.4711159546492226, "learning_rate": 4.794324349747803e-08, "loss": 0.263, "step": 6094 }, { "epoch": 2.9122513589391312, "grad_norm": 0.4302650665937928, "learning_rate": 4.742951735937418e-08, "loss": 0.263, "step": 6095 }, { "epoch": 2.9127292276447045, "grad_norm": 0.44234241950445463, "learning_rate": 4.691855188876271e-08, "loss": 0.2591, "step": 6096 }, { "epoch": 2.913207096350278, "grad_norm": 0.43773577729487084, "learning_rate": 4.641034722737581e-08, "loss": 0.2743, "step": 6097 }, { "epoch": 2.913684965055851, "grad_norm": 0.4482633004011929, "learning_rate": 4.590490351618071e-08, "loss": 0.2542, "step": 6098 }, { "epoch": 2.914162833761424, "grad_norm": 0.45388506640042264, "learning_rate": 4.5402220895377494e-08, "loss": 0.2608, "step": 6099 }, { "epoch": 2.9146407024669974, "grad_norm": 0.4677634390084611, "learning_rate": 4.490229950440239e-08, "loss": 0.267, "step": 6100 }, { "epoch": 2.9151185711725702, "grad_norm": 0.43322907347723366, "learning_rate": 4.4405139481924485e-08, "loss": 0.2852, "step": 6101 }, { "epoch": 2.9155964398781435, "grad_norm": 0.43253137272245884, "learning_rate": 4.39107409658468e-08, "loss": 0.2523, "step": 6102 }, { "epoch": 2.916074308583717, "grad_norm": 0.8312520936498295, "learning_rate": 4.341910409330741e-08, "loss": 0.2635, "step": 6103 }, { "epoch": 2.9165521772892897, "grad_norm": 0.6559893173522333, "learning_rate": 4.293022900067723e-08, "loss": 0.2503, "step": 6104 }, { "epoch": 2.917030045994863, "grad_norm": 0.4843023216915645, "learning_rate": 4.2444115823562226e-08, "loss": 0.2675, "step": 6105 }, { "epoch": 2.917507914700436, "grad_norm": 0.43442410136217113, "learning_rate": 4.196076469680122e-08, "loss": 0.2843, "step": 6106 }, { "epoch": 2.9179857834060092, "grad_norm": 0.43098967320599624, "learning_rate": 4.148017575446695e-08, "loss": 0.2564, "step": 6107 }, { "epoch": 2.9184636521115825, "grad_norm": 0.4679928302022156, "learning_rate": 4.100234912986611e-08, "loss": 0.2544, "step": 6108 }, { "epoch": 2.9189415208171554, "grad_norm": 0.4269404926785668, "learning_rate": 4.052728495554159e-08, "loss": 0.2624, "step": 6109 }, { "epoch": 2.9194193895227287, "grad_norm": 0.46370533071441733, "learning_rate": 4.005498336326463e-08, "loss": 0.2847, "step": 6110 }, { "epoch": 2.9198972582283016, "grad_norm": 0.4586911857247521, "learning_rate": 3.958544448404489e-08, "loss": 0.2683, "step": 6111 }, { "epoch": 2.920375126933875, "grad_norm": 0.4440943392862457, "learning_rate": 3.911866844812484e-08, "loss": 0.2576, "step": 6112 }, { "epoch": 2.9208529956394482, "grad_norm": 0.4520138400943036, "learning_rate": 3.865465538497981e-08, "loss": 0.2718, "step": 6113 }, { "epoch": 2.921330864345021, "grad_norm": 0.7797476182577636, "learning_rate": 3.819340542331684e-08, "loss": 0.2698, "step": 6114 }, { "epoch": 2.9218087330505944, "grad_norm": 0.46668125602963445, "learning_rate": 3.773491869108137e-08, "loss": 0.2821, "step": 6115 }, { "epoch": 2.9222866017561673, "grad_norm": 0.4473526269050483, "learning_rate": 3.727919531544721e-08, "loss": 0.2619, "step": 6116 }, { "epoch": 2.9227644704617406, "grad_norm": 0.4465730584095882, "learning_rate": 3.6826235422824375e-08, "loss": 0.2719, "step": 6117 }, { "epoch": 2.923242339167314, "grad_norm": 0.4405667747199723, "learning_rate": 3.63760391388579e-08, "loss": 0.2582, "step": 6118 }, { "epoch": 2.923720207872887, "grad_norm": 0.43242706008898524, "learning_rate": 3.5928606588422344e-08, "loss": 0.2616, "step": 6119 }, { "epoch": 2.92419807657846, "grad_norm": 0.43740281555793414, "learning_rate": 3.548393789562732e-08, "loss": 0.2542, "step": 6120 }, { "epoch": 2.924675945284033, "grad_norm": 0.436622075631485, "learning_rate": 3.504203318381749e-08, "loss": 0.26, "step": 6121 }, { "epoch": 2.9251538139896063, "grad_norm": 0.4323874580312323, "learning_rate": 3.460289257556926e-08, "loss": 0.2544, "step": 6122 }, { "epoch": 2.9256316826951796, "grad_norm": 0.4449893196540946, "learning_rate": 3.416651619269073e-08, "loss": 0.2632, "step": 6123 }, { "epoch": 2.926109551400753, "grad_norm": 0.4383521097185537, "learning_rate": 3.373290415622732e-08, "loss": 0.2594, "step": 6124 }, { "epoch": 2.9265874201063258, "grad_norm": 0.4616607673944823, "learning_rate": 3.3302056586453916e-08, "loss": 0.2419, "step": 6125 }, { "epoch": 2.927065288811899, "grad_norm": 0.43426940100872696, "learning_rate": 3.287397360288047e-08, "loss": 0.2517, "step": 6126 }, { "epoch": 2.927543157517472, "grad_norm": 0.43983360686347156, "learning_rate": 3.244865532424979e-08, "loss": 0.2776, "step": 6127 }, { "epoch": 2.9280210262230453, "grad_norm": 0.42574611069588764, "learning_rate": 3.2026101868538605e-08, "loss": 0.255, "step": 6128 }, { "epoch": 2.9284988949286186, "grad_norm": 0.4331855136966984, "learning_rate": 3.1606313352953166e-08, "loss": 0.2561, "step": 6129 }, { "epoch": 2.9289767636341915, "grad_norm": 0.4360937130460965, "learning_rate": 3.118928989393699e-08, "loss": 0.2666, "step": 6130 }, { "epoch": 2.9294546323397648, "grad_norm": 0.4647708422391565, "learning_rate": 3.077503160716533e-08, "loss": 0.2639, "step": 6131 }, { "epoch": 2.9299325010453376, "grad_norm": 0.43528666761458085, "learning_rate": 3.0363538607546264e-08, "loss": 0.2655, "step": 6132 }, { "epoch": 2.930410369750911, "grad_norm": 0.4433119943756909, "learning_rate": 2.99548110092196e-08, "loss": 0.2583, "step": 6133 }, { "epoch": 2.9308882384564843, "grad_norm": 0.440304631789597, "learning_rate": 2.9548848925560204e-08, "loss": 0.2647, "step": 6134 }, { "epoch": 2.931366107162057, "grad_norm": 0.4259765178512256, "learning_rate": 2.9145652469174666e-08, "loss": 0.2695, "step": 6135 }, { "epoch": 2.9318439758676305, "grad_norm": 0.4379793362905143, "learning_rate": 2.874522175190242e-08, "loss": 0.2571, "step": 6136 }, { "epoch": 2.9323218445732033, "grad_norm": 0.4505313706517066, "learning_rate": 2.8347556884814608e-08, "loss": 0.2732, "step": 6137 }, { "epoch": 2.9327997132787766, "grad_norm": 0.43130212488145003, "learning_rate": 2.7952657978218557e-08, "loss": 0.2715, "step": 6138 }, { "epoch": 2.93327758198435, "grad_norm": 0.455093876629633, "learning_rate": 2.7560525141651085e-08, "loss": 0.2708, "step": 6139 }, { "epoch": 2.933755450689923, "grad_norm": 0.542151458023748, "learning_rate": 2.7171158483882963e-08, "loss": 0.2667, "step": 6140 }, { "epoch": 2.934233319395496, "grad_norm": 0.43987512532735656, "learning_rate": 2.678455811291669e-08, "loss": 0.2648, "step": 6141 }, { "epoch": 2.934711188101069, "grad_norm": 0.4611481415195722, "learning_rate": 2.640072413599093e-08, "loss": 0.2523, "step": 6142 }, { "epoch": 2.9351890568066423, "grad_norm": 0.4680505547187445, "learning_rate": 2.6019656659572734e-08, "loss": 0.2761, "step": 6143 }, { "epoch": 2.9356669255122156, "grad_norm": 0.4347789880573274, "learning_rate": 2.564135578936422e-08, "loss": 0.2678, "step": 6144 }, { "epoch": 2.9361447942177885, "grad_norm": 0.43779621846285605, "learning_rate": 2.5265821630298116e-08, "loss": 0.2703, "step": 6145 }, { "epoch": 2.936622662923362, "grad_norm": 0.5080676465853426, "learning_rate": 2.4893054286542207e-08, "loss": 0.282, "step": 6146 }, { "epoch": 2.9371005316289347, "grad_norm": 0.4240127151465205, "learning_rate": 2.4523053861494894e-08, "loss": 0.2666, "step": 6147 }, { "epoch": 2.937578400334508, "grad_norm": 0.43808035306246806, "learning_rate": 2.4155820457788525e-08, "loss": 0.2559, "step": 6148 }, { "epoch": 2.9380562690400813, "grad_norm": 0.4600124982961411, "learning_rate": 2.3791354177286062e-08, "loss": 0.2551, "step": 6149 }, { "epoch": 2.9385341377456546, "grad_norm": 0.4808481604273592, "learning_rate": 2.3429655121085525e-08, "loss": 0.2572, "step": 6150 }, { "epoch": 2.9390120064512275, "grad_norm": 0.4420542869404689, "learning_rate": 2.3070723389514437e-08, "loss": 0.2684, "step": 6151 }, { "epoch": 2.939489875156801, "grad_norm": 0.42935263804364654, "learning_rate": 2.2714559082134267e-08, "loss": 0.2482, "step": 6152 }, { "epoch": 2.9399677438623737, "grad_norm": 0.4476128760597761, "learning_rate": 2.2361162297739327e-08, "loss": 0.2692, "step": 6153 }, { "epoch": 2.940445612567947, "grad_norm": 0.4687545989577253, "learning_rate": 2.201053313435564e-08, "loss": 0.2652, "step": 6154 }, { "epoch": 2.9409234812735203, "grad_norm": 0.4447212940871088, "learning_rate": 2.1662671689242076e-08, "loss": 0.2644, "step": 6155 }, { "epoch": 2.941401349979093, "grad_norm": 0.4315254295809012, "learning_rate": 2.131757805888701e-08, "loss": 0.2588, "step": 6156 }, { "epoch": 2.9418792186846665, "grad_norm": 0.43460273368097063, "learning_rate": 2.0975252339016095e-08, "loss": 0.2647, "step": 6157 }, { "epoch": 2.9423570873902394, "grad_norm": 0.4537080705014196, "learning_rate": 2.0635694624582258e-08, "loss": 0.2605, "step": 6158 }, { "epoch": 2.9428349560958127, "grad_norm": 0.5945505658301509, "learning_rate": 2.0298905009774606e-08, "loss": 0.2566, "step": 6159 }, { "epoch": 2.943312824801386, "grad_norm": 0.4221694390673157, "learning_rate": 1.996488358801174e-08, "loss": 0.2622, "step": 6160 }, { "epoch": 2.943790693506959, "grad_norm": 0.4369946666917783, "learning_rate": 1.9633630451945106e-08, "loss": 0.2588, "step": 6161 }, { "epoch": 2.944268562212532, "grad_norm": 0.4581796037391759, "learning_rate": 1.9305145693457873e-08, "loss": 0.262, "step": 6162 }, { "epoch": 2.944746430918105, "grad_norm": 0.45812073938100284, "learning_rate": 1.897942940366715e-08, "loss": 0.2633, "step": 6163 }, { "epoch": 2.9452242996236784, "grad_norm": 0.4342899841434601, "learning_rate": 1.8656481672921778e-08, "loss": 0.2524, "step": 6164 }, { "epoch": 2.9457021683292517, "grad_norm": 0.5515670473317069, "learning_rate": 1.8336302590798992e-08, "loss": 0.2504, "step": 6165 }, { "epoch": 2.9461800370348246, "grad_norm": 0.4650205257532432, "learning_rate": 1.8018892246113307e-08, "loss": 0.2555, "step": 6166 }, { "epoch": 2.946657905740398, "grad_norm": 0.5280605407104664, "learning_rate": 1.7704250726907623e-08, "loss": 0.2598, "step": 6167 }, { "epoch": 2.9471357744459707, "grad_norm": 0.4312322170959968, "learning_rate": 1.7392378120457686e-08, "loss": 0.2695, "step": 6168 }, { "epoch": 2.947613643151544, "grad_norm": 0.43878655465975397, "learning_rate": 1.7083274513272075e-08, "loss": 0.2416, "step": 6169 }, { "epoch": 2.9480915118571174, "grad_norm": 0.47724365855005124, "learning_rate": 1.677693999109109e-08, "loss": 0.2799, "step": 6170 }, { "epoch": 2.9485693805626902, "grad_norm": 0.42611304191004884, "learning_rate": 1.6473374638885653e-08, "loss": 0.2656, "step": 6171 }, { "epoch": 2.9490472492682636, "grad_norm": 0.4437090768992525, "learning_rate": 1.6172578540859518e-08, "loss": 0.2739, "step": 6172 }, { "epoch": 2.9495251179738364, "grad_norm": 0.4993388134546404, "learning_rate": 1.5874551780448168e-08, "loss": 0.2611, "step": 6173 }, { "epoch": 2.9500029866794097, "grad_norm": 0.5246384618817709, "learning_rate": 1.5579294440319914e-08, "loss": 0.26, "step": 6174 }, { "epoch": 2.950480855384983, "grad_norm": 0.4396611447170219, "learning_rate": 1.5286806602372583e-08, "loss": 0.2662, "step": 6175 }, { "epoch": 2.9509587240905564, "grad_norm": 0.43916399298231507, "learning_rate": 1.4997088347737942e-08, "loss": 0.2786, "step": 6176 }, { "epoch": 2.9514365927961292, "grad_norm": 0.4416382489397697, "learning_rate": 1.4710139756778374e-08, "loss": 0.2701, "step": 6177 }, { "epoch": 2.9519144615017026, "grad_norm": 0.44737409568336867, "learning_rate": 1.4425960909087989e-08, "loss": 0.27, "step": 6178 }, { "epoch": 2.9523923302072754, "grad_norm": 0.44284766864103403, "learning_rate": 1.414455188349484e-08, "loss": 0.264, "step": 6179 }, { "epoch": 2.9528701989128487, "grad_norm": 0.5614132268071157, "learning_rate": 1.3865912758054267e-08, "loss": 0.2693, "step": 6180 }, { "epoch": 2.953348067618422, "grad_norm": 0.44645974337163896, "learning_rate": 1.3590043610057779e-08, "loss": 0.2856, "step": 6181 }, { "epoch": 2.953825936323995, "grad_norm": 0.4384106162295247, "learning_rate": 1.3316944516026386e-08, "loss": 0.2532, "step": 6182 }, { "epoch": 2.9543038050295682, "grad_norm": 0.43462058917441077, "learning_rate": 1.3046615551711716e-08, "loss": 0.2646, "step": 6183 }, { "epoch": 2.954781673735141, "grad_norm": 0.5819832565699149, "learning_rate": 1.2779056792099343e-08, "loss": 0.2679, "step": 6184 }, { "epoch": 2.9552595424407144, "grad_norm": 0.4460079501289947, "learning_rate": 1.2514268311405452e-08, "loss": 0.246, "step": 6185 }, { "epoch": 2.9557374111462877, "grad_norm": 0.4356231894949297, "learning_rate": 1.2252250183076852e-08, "loss": 0.262, "step": 6186 }, { "epoch": 2.9562152798518606, "grad_norm": 0.43337849139435436, "learning_rate": 1.1993002479793181e-08, "loss": 0.2683, "step": 6187 }, { "epoch": 2.956693148557434, "grad_norm": 0.4622749840308141, "learning_rate": 1.1736525273465805e-08, "loss": 0.2842, "step": 6188 }, { "epoch": 2.957171017263007, "grad_norm": 0.4502631856582929, "learning_rate": 1.1482818635235592e-08, "loss": 0.2788, "step": 6189 }, { "epoch": 2.95764888596858, "grad_norm": 0.47079401887102384, "learning_rate": 1.1231882635477364e-08, "loss": 0.2614, "step": 6190 }, { "epoch": 2.9581267546741534, "grad_norm": 0.4695321059725002, "learning_rate": 1.0983717343796552e-08, "loss": 0.2673, "step": 6191 }, { "epoch": 2.9586046233797263, "grad_norm": 0.44118500556042844, "learning_rate": 1.0738322829028092e-08, "loss": 0.259, "step": 6192 }, { "epoch": 2.9590824920852996, "grad_norm": 0.4277238079715053, "learning_rate": 1.0495699159241979e-08, "loss": 0.2523, "step": 6193 }, { "epoch": 2.9595603607908725, "grad_norm": 0.4388918259690208, "learning_rate": 1.0255846401737713e-08, "loss": 0.2711, "step": 6194 }, { "epoch": 2.960038229496446, "grad_norm": 0.43524845914682875, "learning_rate": 1.0018764623045407e-08, "loss": 0.2712, "step": 6195 }, { "epoch": 2.960516098202019, "grad_norm": 0.43647257946844553, "learning_rate": 9.784453888926903e-09, "loss": 0.2615, "step": 6196 }, { "epoch": 2.960993966907592, "grad_norm": 0.4421145755712329, "learning_rate": 9.552914264376878e-09, "loss": 0.2705, "step": 6197 }, { "epoch": 2.9614718356131653, "grad_norm": 0.44265663704994035, "learning_rate": 9.324145813619512e-09, "loss": 0.2559, "step": 6198 }, { "epoch": 2.961949704318738, "grad_norm": 0.49174281184358326, "learning_rate": 9.098148600111822e-09, "loss": 0.2795, "step": 6199 }, { "epoch": 2.9624275730243115, "grad_norm": 0.4440331427084486, "learning_rate": 8.874922686541442e-09, "loss": 0.2811, "step": 6200 }, { "epoch": 2.962905441729885, "grad_norm": 0.44320171963272337, "learning_rate": 8.654468134826621e-09, "loss": 0.2501, "step": 6201 }, { "epoch": 2.963383310435458, "grad_norm": 0.44354434096176776, "learning_rate": 8.436785006118442e-09, "loss": 0.2701, "step": 6202 }, { "epoch": 2.963861179141031, "grad_norm": 0.4389788144482154, "learning_rate": 8.221873360798604e-09, "loss": 0.2738, "step": 6203 }, { "epoch": 2.9643390478466043, "grad_norm": 0.49592260296870216, "learning_rate": 8.009733258478314e-09, "loss": 0.268, "step": 6204 }, { "epoch": 2.964816916552177, "grad_norm": 0.4400993468472191, "learning_rate": 7.800364758002721e-09, "loss": 0.2597, "step": 6205 }, { "epoch": 2.9652947852577505, "grad_norm": 0.4309074500166779, "learning_rate": 7.593767917445372e-09, "loss": 0.264, "step": 6206 }, { "epoch": 2.965772653963324, "grad_norm": 0.45179483541632487, "learning_rate": 7.389942794114868e-09, "loss": 0.2736, "step": 6207 }, { "epoch": 2.9662505226688967, "grad_norm": 0.4220551564794903, "learning_rate": 7.188889444548208e-09, "loss": 0.2685, "step": 6208 }, { "epoch": 2.96672839137447, "grad_norm": 0.4806287977254884, "learning_rate": 6.990607924511894e-09, "loss": 0.2508, "step": 6209 }, { "epoch": 2.967206260080043, "grad_norm": 0.4320953184140779, "learning_rate": 6.795098289008595e-09, "loss": 0.2842, "step": 6210 }, { "epoch": 2.967684128785616, "grad_norm": 0.43834356326205404, "learning_rate": 6.602360592267154e-09, "loss": 0.279, "step": 6211 }, { "epoch": 2.9681619974911895, "grad_norm": 0.4481157300961235, "learning_rate": 6.412394887750362e-09, "loss": 0.2663, "step": 6212 }, { "epoch": 2.9686398661967623, "grad_norm": 0.4295979402150286, "learning_rate": 6.225201228151623e-09, "loss": 0.2652, "step": 6213 }, { "epoch": 2.9691177349023357, "grad_norm": 0.6031224976708806, "learning_rate": 6.040779665394958e-09, "loss": 0.2506, "step": 6214 }, { "epoch": 2.9695956036079085, "grad_norm": 0.4315091977947095, "learning_rate": 5.859130250636113e-09, "loss": 0.2695, "step": 6215 }, { "epoch": 2.970073472313482, "grad_norm": 0.6583738125813944, "learning_rate": 5.680253034260341e-09, "loss": 0.2548, "step": 6216 }, { "epoch": 2.970551341019055, "grad_norm": 0.4378215901474907, "learning_rate": 5.504148065885728e-09, "loss": 0.2646, "step": 6217 }, { "epoch": 2.971029209724628, "grad_norm": 0.4503142761167827, "learning_rate": 5.330815394359867e-09, "loss": 0.2473, "step": 6218 }, { "epoch": 2.9715070784302013, "grad_norm": 0.4469746516278154, "learning_rate": 5.160255067764297e-09, "loss": 0.268, "step": 6219 }, { "epoch": 2.971984947135774, "grad_norm": 0.4335561129582791, "learning_rate": 4.992467133406731e-09, "loss": 0.2516, "step": 6220 }, { "epoch": 2.9724628158413475, "grad_norm": 0.4418315312058818, "learning_rate": 4.827451637829938e-09, "loss": 0.2604, "step": 6221 }, { "epoch": 2.972940684546921, "grad_norm": 0.4494480907514312, "learning_rate": 4.665208626807305e-09, "loss": 0.254, "step": 6222 }, { "epoch": 2.9734185532524937, "grad_norm": 0.4324550053316957, "learning_rate": 4.50573814534061e-09, "loss": 0.2436, "step": 6223 }, { "epoch": 2.973896421958067, "grad_norm": 0.47336069337487435, "learning_rate": 4.34904023766447e-09, "loss": 0.2705, "step": 6224 }, { "epoch": 2.97437429066364, "grad_norm": 0.4628580904559059, "learning_rate": 4.195114947244117e-09, "loss": 0.2687, "step": 6225 }, { "epoch": 2.974852159369213, "grad_norm": 0.43605946474328056, "learning_rate": 4.043962316775396e-09, "loss": 0.2634, "step": 6226 }, { "epoch": 2.9753300280747865, "grad_norm": 0.42009356204523896, "learning_rate": 3.895582388186991e-09, "loss": 0.2566, "step": 6227 }, { "epoch": 2.97580789678036, "grad_norm": 0.44595200592177053, "learning_rate": 3.749975202635981e-09, "loss": 0.2609, "step": 6228 }, { "epoch": 2.9762857654859327, "grad_norm": 0.45865874193509315, "learning_rate": 3.607140800510056e-09, "loss": 0.2625, "step": 6229 }, { "epoch": 2.976763634191506, "grad_norm": 0.5227990370320047, "learning_rate": 3.4670792214297476e-09, "loss": 0.2808, "step": 6230 }, { "epoch": 2.977241502897079, "grad_norm": 0.444601214169423, "learning_rate": 3.3297905042462e-09, "loss": 0.2686, "step": 6231 }, { "epoch": 2.977719371602652, "grad_norm": 0.44325315886349526, "learning_rate": 3.1952746870411723e-09, "loss": 0.259, "step": 6232 }, { "epoch": 2.9781972403082255, "grad_norm": 0.4263331644343002, "learning_rate": 3.06353180712593e-09, "loss": 0.2601, "step": 6233 }, { "epoch": 2.9786751090137984, "grad_norm": 0.445388777913223, "learning_rate": 2.9345619010434644e-09, "loss": 0.2709, "step": 6234 }, { "epoch": 2.9791529777193717, "grad_norm": 0.4290478297016977, "learning_rate": 2.808365004569602e-09, "loss": 0.2349, "step": 6235 }, { "epoch": 2.9796308464249446, "grad_norm": 0.43300161485588196, "learning_rate": 2.684941152706344e-09, "loss": 0.254, "step": 6236 }, { "epoch": 2.980108715130518, "grad_norm": 0.44745868192997307, "learning_rate": 2.5642903796918583e-09, "loss": 0.2586, "step": 6237 }, { "epoch": 2.980586583836091, "grad_norm": 0.5017074416234102, "learning_rate": 2.4464127189915975e-09, "loss": 0.2563, "step": 6238 }, { "epoch": 2.981064452541664, "grad_norm": 0.43904803227553557, "learning_rate": 2.33130820330163e-09, "loss": 0.2695, "step": 6239 }, { "epoch": 2.9815423212472374, "grad_norm": 0.4428267317870827, "learning_rate": 2.2189768645519693e-09, "loss": 0.2783, "step": 6240 }, { "epoch": 2.9820201899528103, "grad_norm": 0.44591251523595293, "learning_rate": 2.109418733899915e-09, "loss": 0.2721, "step": 6241 }, { "epoch": 2.9824980586583836, "grad_norm": 0.4610652816160574, "learning_rate": 2.0026338417344913e-09, "loss": 0.2717, "step": 6242 }, { "epoch": 2.982975927363957, "grad_norm": 0.4494233009141444, "learning_rate": 1.898622217677559e-09, "loss": 0.2636, "step": 6243 }, { "epoch": 2.9834537960695298, "grad_norm": 0.4363101853354611, "learning_rate": 1.7973838905793739e-09, "loss": 0.2728, "step": 6244 }, { "epoch": 2.983931664775103, "grad_norm": 0.4882459582895431, "learning_rate": 1.6989188885219165e-09, "loss": 0.2542, "step": 6245 }, { "epoch": 2.984409533480676, "grad_norm": 0.44390851843666584, "learning_rate": 1.6032272388166736e-09, "loss": 0.2714, "step": 6246 }, { "epoch": 2.9848874021862493, "grad_norm": 0.4429583465368995, "learning_rate": 1.5103089680079674e-09, "loss": 0.271, "step": 6247 }, { "epoch": 2.9853652708918226, "grad_norm": 0.4302641158411793, "learning_rate": 1.4201641018685152e-09, "loss": 0.264, "step": 6248 }, { "epoch": 2.985843139597396, "grad_norm": 0.4371029630970897, "learning_rate": 1.3327926654049805e-09, "loss": 0.2669, "step": 6249 }, { "epoch": 2.9863210083029688, "grad_norm": 0.44144481030011334, "learning_rate": 1.2481946828502011e-09, "loss": 0.2598, "step": 6250 }, { "epoch": 2.9867988770085416, "grad_norm": 0.5001018088911403, "learning_rate": 1.1663701776709613e-09, "loss": 0.2434, "step": 6251 }, { "epoch": 2.987276745714115, "grad_norm": 0.430612900839865, "learning_rate": 1.0873191725646604e-09, "loss": 0.2798, "step": 6252 }, { "epoch": 2.9877546144196883, "grad_norm": 0.45350891778339647, "learning_rate": 1.0110416894593133e-09, "loss": 0.2716, "step": 6253 }, { "epoch": 2.9882324831252616, "grad_norm": 0.4337165441795163, "learning_rate": 9.375377495102201e-10, "loss": 0.254, "step": 6254 }, { "epoch": 2.9887103518308344, "grad_norm": 0.4462339228646419, "learning_rate": 8.668073731088467e-10, "loss": 0.2599, "step": 6255 }, { "epoch": 2.9891882205364078, "grad_norm": 0.45995272797479353, "learning_rate": 7.988505798728341e-10, "loss": 0.2673, "step": 6256 }, { "epoch": 2.9896660892419806, "grad_norm": 0.43074554696329737, "learning_rate": 7.33667388652659e-10, "loss": 0.2644, "step": 6257 }, { "epoch": 2.990143957947554, "grad_norm": 0.44281340320268414, "learning_rate": 6.712578175294138e-10, "loss": 0.2684, "step": 6258 }, { "epoch": 2.9906218266531273, "grad_norm": 0.45094383291551116, "learning_rate": 6.116218838148058e-10, "loss": 0.2601, "step": 6259 }, { "epoch": 2.9910996953587, "grad_norm": 0.4263274399474076, "learning_rate": 5.547596040489378e-10, "loss": 0.2628, "step": 6260 }, { "epoch": 2.9915775640642734, "grad_norm": 0.44924007993713105, "learning_rate": 5.006709940058585e-10, "loss": 0.2646, "step": 6261 }, { "epoch": 2.9920554327698463, "grad_norm": 0.4157503424555004, "learning_rate": 4.4935606868912186e-10, "loss": 0.2688, "step": 6262 }, { "epoch": 2.9925333014754196, "grad_norm": 0.5530870038829894, "learning_rate": 4.008148423306768e-10, "loss": 0.261, "step": 6263 }, { "epoch": 2.993011170180993, "grad_norm": 0.43360985730611284, "learning_rate": 3.5504732839752867e-10, "loss": 0.2605, "step": 6264 }, { "epoch": 2.993489038886566, "grad_norm": 0.4305991915878331, "learning_rate": 3.1205353958285724e-10, "loss": 0.2565, "step": 6265 }, { "epoch": 2.993966907592139, "grad_norm": 0.46592325654029976, "learning_rate": 2.718334878137885e-10, "loss": 0.2678, "step": 6266 }, { "epoch": 2.994444776297712, "grad_norm": 0.44676400478930745, "learning_rate": 2.3438718424473315e-10, "loss": 0.2524, "step": 6267 }, { "epoch": 2.9949226450032853, "grad_norm": 0.4574246654627526, "learning_rate": 1.9971463926515833e-10, "loss": 0.2602, "step": 6268 }, { "epoch": 2.9954005137088586, "grad_norm": 0.45138653999661854, "learning_rate": 1.6781586249070559e-10, "loss": 0.2489, "step": 6269 }, { "epoch": 2.9958783824144315, "grad_norm": 0.44396985604168704, "learning_rate": 1.3869086276985243e-10, "loss": 0.2679, "step": 6270 }, { "epoch": 2.996356251120005, "grad_norm": 0.4756036796960099, "learning_rate": 1.1233964818169185e-10, "loss": 0.2624, "step": 6271 }, { "epoch": 2.9968341198255777, "grad_norm": 0.46382667279220274, "learning_rate": 8.876222603593221e-11, "loss": 0.2646, "step": 6272 }, { "epoch": 2.997311988531151, "grad_norm": 0.4785623678848972, "learning_rate": 6.795860287178713e-11, "loss": 0.2526, "step": 6273 }, { "epoch": 2.9977898572367243, "grad_norm": 0.42404628861817545, "learning_rate": 4.992878446019589e-11, "loss": 0.2651, "step": 6274 }, { "epoch": 2.9982677259422976, "grad_norm": 0.45160121782102897, "learning_rate": 3.467277580271322e-11, "loss": 0.2663, "step": 6275 }, { "epoch": 2.9987455946478705, "grad_norm": 0.4354577405562673, "learning_rate": 2.219058113039907e-11, "loss": 0.2655, "step": 6276 }, { "epoch": 2.9992234633534434, "grad_norm": 0.45460340307340963, "learning_rate": 1.2482203904928824e-11, "loss": 0.2514, "step": 6277 }, { "epoch": 2.9997013320590167, "grad_norm": 0.47846568959514535, "learning_rate": 5.547646820813768e-12, "loss": 0.2667, "step": 6278 }, { "epoch": 3.0, "grad_norm": 0.5483148609921531, "learning_rate": 1.3869118009601778e-12, "loss": 0.2654, "step": 6279 } ], "logging_steps": 1, "max_steps": 6279, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5031825133207552e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }