deepseek_1b_sql / trainer_state.json
minhnhat136's picture
Upload 13 files
b89fed9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.061420345489443,
"eval_steps": 500,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019193857965451054,
"grad_norm": 1.0817455053329468,
"learning_rate": 4.9999436730259053e-05,
"loss": 0.2614,
"num_input_tokens_seen": 933376,
"step": 5
},
{
"epoch": 0.03838771593090211,
"grad_norm": 0.6683880686759949,
"learning_rate": 4.999774694641803e-05,
"loss": 0.1894,
"num_input_tokens_seen": 1884800,
"step": 10
},
{
"epoch": 0.05758157389635317,
"grad_norm": 0.6495689153671265,
"learning_rate": 4.999493072462126e-05,
"loss": 0.1836,
"num_input_tokens_seen": 2847104,
"step": 15
},
{
"epoch": 0.07677543186180422,
"grad_norm": 0.5534822940826416,
"learning_rate": 4.999098819177214e-05,
"loss": 0.1759,
"num_input_tokens_seen": 3780480,
"step": 20
},
{
"epoch": 0.09596928982725528,
"grad_norm": 0.5017605423927307,
"learning_rate": 4.9985919525527434e-05,
"loss": 0.1705,
"num_input_tokens_seen": 4742272,
"step": 25
},
{
"epoch": 0.11516314779270634,
"grad_norm": 0.4681239128112793,
"learning_rate": 4.9979724954289244e-05,
"loss": 0.1722,
"num_input_tokens_seen": 5687424,
"step": 30
},
{
"epoch": 0.1343570057581574,
"grad_norm": 0.48403823375701904,
"learning_rate": 4.9972404757194736e-05,
"loss": 0.1696,
"num_input_tokens_seen": 6600832,
"step": 35
},
{
"epoch": 0.15355086372360843,
"grad_norm": 0.48744267225265503,
"learning_rate": 4.9963959264103544e-05,
"loss": 0.1638,
"num_input_tokens_seen": 7545344,
"step": 40
},
{
"epoch": 0.1727447216890595,
"grad_norm": 0.4389607608318329,
"learning_rate": 4.995438885558294e-05,
"loss": 0.1606,
"num_input_tokens_seen": 8489728,
"step": 45
},
{
"epoch": 0.19193857965451055,
"grad_norm": 0.4652620851993561,
"learning_rate": 4.994369396289063e-05,
"loss": 0.1606,
"num_input_tokens_seen": 9426304,
"step": 50
},
{
"epoch": 0.21113243761996162,
"grad_norm": 0.48464274406433105,
"learning_rate": 4.993187506795538e-05,
"loss": 0.1661,
"num_input_tokens_seen": 10372608,
"step": 55
},
{
"epoch": 0.23032629558541268,
"grad_norm": 0.4516374468803406,
"learning_rate": 4.9918932703355256e-05,
"loss": 0.1629,
"num_input_tokens_seen": 11298944,
"step": 60
},
{
"epoch": 0.2495201535508637,
"grad_norm": 0.44246649742126465,
"learning_rate": 4.990486745229364e-05,
"loss": 0.1573,
"num_input_tokens_seen": 12230784,
"step": 65
},
{
"epoch": 0.2687140115163148,
"grad_norm": 0.4362417459487915,
"learning_rate": 4.9889679948572974e-05,
"loss": 0.1548,
"num_input_tokens_seen": 13177856,
"step": 70
},
{
"epoch": 0.28790786948176583,
"grad_norm": 0.46021828055381775,
"learning_rate": 4.987337087656614e-05,
"loss": 0.1536,
"num_input_tokens_seen": 14148864,
"step": 75
},
{
"epoch": 0.30710172744721687,
"grad_norm": 0.43709027767181396,
"learning_rate": 4.98559409711857e-05,
"loss": 0.1615,
"num_input_tokens_seen": 15136256,
"step": 80
},
{
"epoch": 0.32629558541266795,
"grad_norm": 0.4331000745296478,
"learning_rate": 4.983739101785071e-05,
"loss": 0.1593,
"num_input_tokens_seen": 16088064,
"step": 85
},
{
"epoch": 0.345489443378119,
"grad_norm": 0.40528881549835205,
"learning_rate": 4.981772185245135e-05,
"loss": 0.1509,
"num_input_tokens_seen": 17043840,
"step": 90
},
{
"epoch": 0.3646833013435701,
"grad_norm": 0.42082345485687256,
"learning_rate": 4.97969343613113e-05,
"loss": 0.1518,
"num_input_tokens_seen": 17986816,
"step": 95
},
{
"epoch": 0.3838771593090211,
"grad_norm": 0.4149463176727295,
"learning_rate": 4.977502948114772e-05,
"loss": 0.1558,
"num_input_tokens_seen": 18904192,
"step": 100
},
{
"epoch": 0.40307101727447214,
"grad_norm": 0.41713905334472656,
"learning_rate": 4.97520081990291e-05,
"loss": 0.152,
"num_input_tokens_seen": 19854720,
"step": 105
},
{
"epoch": 0.42226487523992323,
"grad_norm": 0.4443305432796478,
"learning_rate": 4.9727871552330794e-05,
"loss": 0.15,
"num_input_tokens_seen": 20807040,
"step": 110
},
{
"epoch": 0.44145873320537427,
"grad_norm": 0.4278104603290558,
"learning_rate": 4.97026206286882e-05,
"loss": 0.1563,
"num_input_tokens_seen": 21742080,
"step": 115
},
{
"epoch": 0.46065259117082535,
"grad_norm": 0.41511863470077515,
"learning_rate": 4.967625656594782e-05,
"loss": 0.1545,
"num_input_tokens_seen": 22686464,
"step": 120
},
{
"epoch": 0.4798464491362764,
"grad_norm": 0.42981916666030884,
"learning_rate": 4.964878055211597e-05,
"loss": 0.1483,
"num_input_tokens_seen": 23654272,
"step": 125
},
{
"epoch": 0.4990403071017274,
"grad_norm": 0.39455243945121765,
"learning_rate": 4.962019382530521e-05,
"loss": 0.15,
"num_input_tokens_seen": 24606720,
"step": 130
},
{
"epoch": 0.5182341650671785,
"grad_norm": 0.40956762433052063,
"learning_rate": 4.959049767367859e-05,
"loss": 0.1516,
"num_input_tokens_seen": 25554944,
"step": 135
},
{
"epoch": 0.5374280230326296,
"grad_norm": 0.39926445484161377,
"learning_rate": 4.955969343539162e-05,
"loss": 0.1521,
"num_input_tokens_seen": 26515968,
"step": 140
},
{
"epoch": 0.5566218809980806,
"grad_norm": 0.4021078050136566,
"learning_rate": 4.9527782498531915e-05,
"loss": 0.1535,
"num_input_tokens_seen": 27450112,
"step": 145
},
{
"epoch": 0.5758157389635317,
"grad_norm": 0.40622368454933167,
"learning_rate": 4.949476630105669e-05,
"loss": 0.1521,
"num_input_tokens_seen": 28398592,
"step": 150
},
{
"epoch": 0.5950095969289827,
"grad_norm": 0.4052083194255829,
"learning_rate": 4.946064633072795e-05,
"loss": 0.1498,
"num_input_tokens_seen": 29364864,
"step": 155
},
{
"epoch": 0.6142034548944337,
"grad_norm": 0.39543649554252625,
"learning_rate": 4.942542412504543e-05,
"loss": 0.1499,
"num_input_tokens_seen": 30316928,
"step": 160
},
{
"epoch": 0.6333973128598849,
"grad_norm": 0.37419360876083374,
"learning_rate": 4.9389101271177355e-05,
"loss": 0.1479,
"num_input_tokens_seen": 31287680,
"step": 165
},
{
"epoch": 0.6525911708253359,
"grad_norm": 0.40969106554985046,
"learning_rate": 4.935167940588887e-05,
"loss": 0.1506,
"num_input_tokens_seen": 32245888,
"step": 170
},
{
"epoch": 0.6717850287907869,
"grad_norm": 0.39717161655426025,
"learning_rate": 4.9313160215468334e-05,
"loss": 0.1451,
"num_input_tokens_seen": 33194368,
"step": 175
},
{
"epoch": 0.690978886756238,
"grad_norm": 0.3813941478729248,
"learning_rate": 4.92735454356513e-05,
"loss": 0.1378,
"num_input_tokens_seen": 34140160,
"step": 180
},
{
"epoch": 0.710172744721689,
"grad_norm": 0.3822474777698517,
"learning_rate": 4.923283685154231e-05,
"loss": 0.1417,
"num_input_tokens_seen": 35078528,
"step": 185
},
{
"epoch": 0.7293666026871402,
"grad_norm": 0.4150351583957672,
"learning_rate": 4.9191036297534454e-05,
"loss": 0.1476,
"num_input_tokens_seen": 36013696,
"step": 190
},
{
"epoch": 0.7485604606525912,
"grad_norm": 0.3787771463394165,
"learning_rate": 4.914814565722671e-05,
"loss": 0.1414,
"num_input_tokens_seen": 36973056,
"step": 195
},
{
"epoch": 0.7677543186180422,
"grad_norm": 0.4316699504852295,
"learning_rate": 4.910416686333906e-05,
"loss": 0.1447,
"num_input_tokens_seen": 37904256,
"step": 200
},
{
"epoch": 0.7869481765834933,
"grad_norm": 0.38349735736846924,
"learning_rate": 4.905910189762542e-05,
"loss": 0.1417,
"num_input_tokens_seen": 38858240,
"step": 205
},
{
"epoch": 0.8061420345489443,
"grad_norm": 0.41351762413978577,
"learning_rate": 4.901295279078431e-05,
"loss": 0.1435,
"num_input_tokens_seen": 39800192,
"step": 210
},
{
"epoch": 0.8253358925143954,
"grad_norm": 0.36677101254463196,
"learning_rate": 4.896572162236737e-05,
"loss": 0.1454,
"num_input_tokens_seen": 40736128,
"step": 215
},
{
"epoch": 0.8445297504798465,
"grad_norm": 0.4057168662548065,
"learning_rate": 4.8917410520685635e-05,
"loss": 0.1414,
"num_input_tokens_seen": 41699072,
"step": 220
},
{
"epoch": 0.8637236084452975,
"grad_norm": 0.3790329396724701,
"learning_rate": 4.886802166271364e-05,
"loss": 0.1452,
"num_input_tokens_seen": 42651648,
"step": 225
},
{
"epoch": 0.8829174664107485,
"grad_norm": 0.37877827882766724,
"learning_rate": 4.881755727399134e-05,
"loss": 0.1408,
"num_input_tokens_seen": 43584512,
"step": 230
},
{
"epoch": 0.9021113243761996,
"grad_norm": 0.37050819396972656,
"learning_rate": 4.8766019628523775e-05,
"loss": 0.1342,
"num_input_tokens_seen": 44552448,
"step": 235
},
{
"epoch": 0.9213051823416507,
"grad_norm": 0.367374449968338,
"learning_rate": 4.8713411048678635e-05,
"loss": 0.142,
"num_input_tokens_seen": 45498368,
"step": 240
},
{
"epoch": 0.9404990403071017,
"grad_norm": 0.3566618859767914,
"learning_rate": 4.8659733905081634e-05,
"loss": 0.1398,
"num_input_tokens_seen": 46445952,
"step": 245
},
{
"epoch": 0.9596928982725528,
"grad_norm": 0.35429617762565613,
"learning_rate": 4.8604990616509616e-05,
"loss": 0.1345,
"num_input_tokens_seen": 47391872,
"step": 250
},
{
"epoch": 0.9788867562380038,
"grad_norm": 0.3834087550640106,
"learning_rate": 4.8549183649781626e-05,
"loss": 0.14,
"num_input_tokens_seen": 48339584,
"step": 255
},
{
"epoch": 0.9980806142034548,
"grad_norm": 0.36162251234054565,
"learning_rate": 4.849231551964771e-05,
"loss": 0.1376,
"num_input_tokens_seen": 49294848,
"step": 260
},
{
"epoch": 1.017274472168906,
"grad_norm": 0.3323320746421814,
"learning_rate": 4.8434388788675635e-05,
"loss": 0.1149,
"num_input_tokens_seen": 50247296,
"step": 265
},
{
"epoch": 1.036468330134357,
"grad_norm": 0.36334532499313354,
"learning_rate": 4.837540606713538e-05,
"loss": 0.1119,
"num_input_tokens_seen": 51193472,
"step": 270
},
{
"epoch": 1.055662188099808,
"grad_norm": 0.37446385622024536,
"learning_rate": 4.8315370012881514e-05,
"loss": 0.1125,
"num_input_tokens_seen": 52144000,
"step": 275
},
{
"epoch": 1.0748560460652592,
"grad_norm": 0.4256220757961273,
"learning_rate": 4.8254283331233464e-05,
"loss": 0.1083,
"num_input_tokens_seen": 53097216,
"step": 280
},
{
"epoch": 1.0940499040307101,
"grad_norm": 0.39395830035209656,
"learning_rate": 4.819214877485358e-05,
"loss": 0.1083,
"num_input_tokens_seen": 54032896,
"step": 285
},
{
"epoch": 1.1132437619961613,
"grad_norm": 0.3702699542045593,
"learning_rate": 4.812896914362309e-05,
"loss": 0.1092,
"num_input_tokens_seen": 54988800,
"step": 290
},
{
"epoch": 1.1324376199616122,
"grad_norm": 0.389553040266037,
"learning_rate": 4.806474728451597e-05,
"loss": 0.1057,
"num_input_tokens_seen": 55947520,
"step": 295
},
{
"epoch": 1.1516314779270633,
"grad_norm": 0.37560170888900757,
"learning_rate": 4.799948609147061e-05,
"loss": 0.108,
"num_input_tokens_seen": 56864384,
"step": 300
},
{
"epoch": 1.1708253358925145,
"grad_norm": 0.3832947611808777,
"learning_rate": 4.793318850525943e-05,
"loss": 0.1092,
"num_input_tokens_seen": 57816960,
"step": 305
},
{
"epoch": 1.1900191938579654,
"grad_norm": 0.41275399923324585,
"learning_rate": 4.786585751335637e-05,
"loss": 0.1076,
"num_input_tokens_seen": 58773376,
"step": 310
},
{
"epoch": 1.2092130518234165,
"grad_norm": 0.4026075601577759,
"learning_rate": 4.7797496149802256e-05,
"loss": 0.1061,
"num_input_tokens_seen": 59710592,
"step": 315
},
{
"epoch": 1.2284069097888675,
"grad_norm": 0.4040988087654114,
"learning_rate": 4.77281074950681e-05,
"loss": 0.1073,
"num_input_tokens_seen": 60660352,
"step": 320
},
{
"epoch": 1.2476007677543186,
"grad_norm": 0.40489524602890015,
"learning_rate": 4.765769467591625e-05,
"loss": 0.1085,
"num_input_tokens_seen": 61600256,
"step": 325
},
{
"epoch": 1.2667946257197698,
"grad_norm": 0.40675088763237,
"learning_rate": 4.758626086525956e-05,
"loss": 0.1149,
"num_input_tokens_seen": 62558720,
"step": 330
},
{
"epoch": 1.2859884836852207,
"grad_norm": 0.4014970362186432,
"learning_rate": 4.751380928201834e-05,
"loss": 0.1101,
"num_input_tokens_seen": 63505152,
"step": 335
},
{
"epoch": 1.3051823416506718,
"grad_norm": 0.3987894654273987,
"learning_rate": 4.744034319097535e-05,
"loss": 0.1113,
"num_input_tokens_seen": 64475392,
"step": 340
},
{
"epoch": 1.3243761996161227,
"grad_norm": 0.3988923728466034,
"learning_rate": 4.7365865902628684e-05,
"loss": 0.1065,
"num_input_tokens_seen": 65412352,
"step": 345
},
{
"epoch": 1.3435700575815739,
"grad_norm": 0.40686464309692383,
"learning_rate": 4.7290380773042575e-05,
"loss": 0.1058,
"num_input_tokens_seen": 66358272,
"step": 350
},
{
"epoch": 1.362763915547025,
"grad_norm": 0.4416196048259735,
"learning_rate": 4.7213891203696164e-05,
"loss": 0.1068,
"num_input_tokens_seen": 67310208,
"step": 355
},
{
"epoch": 1.381957773512476,
"grad_norm": 0.38802987337112427,
"learning_rate": 4.713640064133025e-05,
"loss": 0.1119,
"num_input_tokens_seen": 68256256,
"step": 360
},
{
"epoch": 1.401151631477927,
"grad_norm": 0.37979230284690857,
"learning_rate": 4.705791257779195e-05,
"loss": 0.1087,
"num_input_tokens_seen": 69200896,
"step": 365
},
{
"epoch": 1.420345489443378,
"grad_norm": 0.40544721484184265,
"learning_rate": 4.697843054987737e-05,
"loss": 0.1101,
"num_input_tokens_seen": 70129792,
"step": 370
},
{
"epoch": 1.4395393474088292,
"grad_norm": 0.39788997173309326,
"learning_rate": 4.68979581391722e-05,
"loss": 0.1117,
"num_input_tokens_seen": 71074944,
"step": 375
},
{
"epoch": 1.4587332053742803,
"grad_norm": 0.40104883909225464,
"learning_rate": 4.681649897189036e-05,
"loss": 0.1067,
"num_input_tokens_seen": 72009088,
"step": 380
},
{
"epoch": 1.4779270633397312,
"grad_norm": 0.4051682949066162,
"learning_rate": 4.673405671871057e-05,
"loss": 0.1106,
"num_input_tokens_seen": 72954880,
"step": 385
},
{
"epoch": 1.4971209213051824,
"grad_norm": 0.40561723709106445,
"learning_rate": 4.665063509461097e-05,
"loss": 0.108,
"num_input_tokens_seen": 73890176,
"step": 390
},
{
"epoch": 1.5163147792706333,
"grad_norm": 0.4338354468345642,
"learning_rate": 4.656623785870167e-05,
"loss": 0.1093,
"num_input_tokens_seen": 74851840,
"step": 395
},
{
"epoch": 1.5355086372360844,
"grad_norm": 0.40794938802719116,
"learning_rate": 4.6480868814055424e-05,
"loss": 0.1071,
"num_input_tokens_seen": 75794816,
"step": 400
},
{
"epoch": 1.5547024952015356,
"grad_norm": 0.406972199678421,
"learning_rate": 4.639453180753619e-05,
"loss": 0.1092,
"num_input_tokens_seen": 76736512,
"step": 405
},
{
"epoch": 1.5738963531669867,
"grad_norm": 0.4144577383995056,
"learning_rate": 4.630723072962584e-05,
"loss": 0.1083,
"num_input_tokens_seen": 77692288,
"step": 410
},
{
"epoch": 1.5930902111324377,
"grad_norm": 0.44782555103302,
"learning_rate": 4.6218969514248814e-05,
"loss": 0.1068,
"num_input_tokens_seen": 78654720,
"step": 415
},
{
"epoch": 1.6122840690978886,
"grad_norm": 0.45688125491142273,
"learning_rate": 4.6129752138594874e-05,
"loss": 0.1083,
"num_input_tokens_seen": 79607552,
"step": 420
},
{
"epoch": 1.6314779270633397,
"grad_norm": 0.43751007318496704,
"learning_rate": 4.6039582622939854e-05,
"loss": 0.1087,
"num_input_tokens_seen": 80547328,
"step": 425
},
{
"epoch": 1.6506717850287909,
"grad_norm": 0.42292338609695435,
"learning_rate": 4.5948465030464536e-05,
"loss": 0.107,
"num_input_tokens_seen": 81478400,
"step": 430
},
{
"epoch": 1.669865642994242,
"grad_norm": 0.4191446006298065,
"learning_rate": 4.5856403467071536e-05,
"loss": 0.1084,
"num_input_tokens_seen": 82441984,
"step": 435
},
{
"epoch": 1.689059500959693,
"grad_norm": 0.397989958524704,
"learning_rate": 4.5763402081200294e-05,
"loss": 0.1069,
"num_input_tokens_seen": 83382912,
"step": 440
},
{
"epoch": 1.7082533589251438,
"grad_norm": 0.4078799784183502,
"learning_rate": 4.566946506364013e-05,
"loss": 0.1064,
"num_input_tokens_seen": 84332672,
"step": 445
},
{
"epoch": 1.727447216890595,
"grad_norm": 0.41216278076171875,
"learning_rate": 4.557459664734141e-05,
"loss": 0.1114,
"num_input_tokens_seen": 85280128,
"step": 450
},
{
"epoch": 1.7466410748560461,
"grad_norm": 0.43625858426094055,
"learning_rate": 4.54788011072248e-05,
"loss": 0.1069,
"num_input_tokens_seen": 86221568,
"step": 455
},
{
"epoch": 1.7658349328214973,
"grad_norm": 0.3984062075614929,
"learning_rate": 4.538208275998861e-05,
"loss": 0.107,
"num_input_tokens_seen": 87159936,
"step": 460
},
{
"epoch": 1.7850287907869482,
"grad_norm": 0.4033821225166321,
"learning_rate": 4.528444596391433e-05,
"loss": 0.1066,
"num_input_tokens_seen": 88115712,
"step": 465
},
{
"epoch": 1.8042226487523991,
"grad_norm": 0.40416958928108215,
"learning_rate": 4.518589511867017e-05,
"loss": 0.1054,
"num_input_tokens_seen": 89069824,
"step": 470
},
{
"epoch": 1.8234165067178503,
"grad_norm": 0.4133840501308441,
"learning_rate": 4.5086434665112864e-05,
"loss": 0.1055,
"num_input_tokens_seen": 90018176,
"step": 475
},
{
"epoch": 1.8426103646833014,
"grad_norm": 0.44508275389671326,
"learning_rate": 4.498606908508754e-05,
"loss": 0.1091,
"num_input_tokens_seen": 90960128,
"step": 480
},
{
"epoch": 1.8618042226487526,
"grad_norm": 0.4138317108154297,
"learning_rate": 4.4884802901225695e-05,
"loss": 0.106,
"num_input_tokens_seen": 91911936,
"step": 485
},
{
"epoch": 1.8809980806142035,
"grad_norm": 0.41188734769821167,
"learning_rate": 4.478264067674155e-05,
"loss": 0.106,
"num_input_tokens_seen": 92849280,
"step": 490
},
{
"epoch": 1.9001919385796544,
"grad_norm": 0.43868768215179443,
"learning_rate": 4.4679587015226253e-05,
"loss": 0.1065,
"num_input_tokens_seen": 93810560,
"step": 495
},
{
"epoch": 1.9193857965451055,
"grad_norm": 0.4238053560256958,
"learning_rate": 4.457564656044056e-05,
"loss": 0.1036,
"num_input_tokens_seen": 94771328,
"step": 500
},
{
"epoch": 1.9385796545105567,
"grad_norm": 0.4343940317630768,
"learning_rate": 4.447082399610549e-05,
"loss": 0.1039,
"num_input_tokens_seen": 95733632,
"step": 505
},
{
"epoch": 1.9577735124760078,
"grad_norm": 0.4396176040172577,
"learning_rate": 4.436512404569136e-05,
"loss": 0.1073,
"num_input_tokens_seen": 96689280,
"step": 510
},
{
"epoch": 1.9769673704414588,
"grad_norm": 0.4687812030315399,
"learning_rate": 4.4258551472204865e-05,
"loss": 0.1082,
"num_input_tokens_seen": 97626112,
"step": 515
},
{
"epoch": 1.9961612284069097,
"grad_norm": 0.4092023968696594,
"learning_rate": 4.415111107797445e-05,
"loss": 0.1049,
"num_input_tokens_seen": 98561280,
"step": 520
},
{
"epoch": 2.015355086372361,
"grad_norm": 0.3592652976512909,
"learning_rate": 4.404280770443398e-05,
"loss": 0.0829,
"num_input_tokens_seen": 99516160,
"step": 525
},
{
"epoch": 2.034548944337812,
"grad_norm": 0.4278601109981537,
"learning_rate": 4.3933646231904504e-05,
"loss": 0.0727,
"num_input_tokens_seen": 100466048,
"step": 530
},
{
"epoch": 2.053742802303263,
"grad_norm": 0.48623570799827576,
"learning_rate": 4.3823631579374354e-05,
"loss": 0.0706,
"num_input_tokens_seen": 101422208,
"step": 535
},
{
"epoch": 2.072936660268714,
"grad_norm": 0.4464566111564636,
"learning_rate": 4.371276870427753e-05,
"loss": 0.0683,
"num_input_tokens_seen": 102365568,
"step": 540
},
{
"epoch": 2.092130518234165,
"grad_norm": 0.48030439019203186,
"learning_rate": 4.360106260227027e-05,
"loss": 0.0708,
"num_input_tokens_seen": 103312128,
"step": 545
},
{
"epoch": 2.111324376199616,
"grad_norm": 0.4528995752334595,
"learning_rate": 4.348851830700593e-05,
"loss": 0.0693,
"num_input_tokens_seen": 104258560,
"step": 550
},
{
"epoch": 2.1305182341650672,
"grad_norm": 0.48248499631881714,
"learning_rate": 4.337514088990822e-05,
"loss": 0.067,
"num_input_tokens_seen": 105199360,
"step": 555
},
{
"epoch": 2.1497120921305184,
"grad_norm": 0.5198423266410828,
"learning_rate": 4.3260935459942584e-05,
"loss": 0.0701,
"num_input_tokens_seen": 106155776,
"step": 560
},
{
"epoch": 2.168905950095969,
"grad_norm": 0.47610777616500854,
"learning_rate": 4.3145907163386064e-05,
"loss": 0.0673,
"num_input_tokens_seen": 107121792,
"step": 565
},
{
"epoch": 2.1880998080614202,
"grad_norm": 0.4852938950061798,
"learning_rate": 4.303006118359537e-05,
"loss": 0.0667,
"num_input_tokens_seen": 108049280,
"step": 570
},
{
"epoch": 2.2072936660268714,
"grad_norm": 0.5004666447639465,
"learning_rate": 4.2913402740773294e-05,
"loss": 0.0708,
"num_input_tokens_seen": 108994816,
"step": 575
},
{
"epoch": 2.2264875239923225,
"grad_norm": 0.500302255153656,
"learning_rate": 4.2795937091733515e-05,
"loss": 0.073,
"num_input_tokens_seen": 109929728,
"step": 580
},
{
"epoch": 2.2456813819577737,
"grad_norm": 0.45096588134765625,
"learning_rate": 4.267766952966369e-05,
"loss": 0.0712,
"num_input_tokens_seen": 110867328,
"step": 585
},
{
"epoch": 2.2648752399232244,
"grad_norm": 0.48498988151550293,
"learning_rate": 4.255860538388694e-05,
"loss": 0.0703,
"num_input_tokens_seen": 111824640,
"step": 590
},
{
"epoch": 2.2840690978886755,
"grad_norm": 0.6077526807785034,
"learning_rate": 4.24387500196217e-05,
"loss": 0.0719,
"num_input_tokens_seen": 112779648,
"step": 595
},
{
"epoch": 2.3032629558541267,
"grad_norm": 0.5256830453872681,
"learning_rate": 4.231810883773999e-05,
"loss": 0.0693,
"num_input_tokens_seen": 113709312,
"step": 600
},
{
"epoch": 2.322456813819578,
"grad_norm": 0.5583354234695435,
"learning_rate": 4.219668727452397e-05,
"loss": 0.0754,
"num_input_tokens_seen": 114654080,
"step": 605
},
{
"epoch": 2.341650671785029,
"grad_norm": 0.47424212098121643,
"learning_rate": 4.207449080142104e-05,
"loss": 0.0692,
"num_input_tokens_seen": 115618176,
"step": 610
},
{
"epoch": 2.36084452975048,
"grad_norm": 0.4865480959415436,
"learning_rate": 4.195152492479727e-05,
"loss": 0.0734,
"num_input_tokens_seen": 116583168,
"step": 615
},
{
"epoch": 2.380038387715931,
"grad_norm": 0.5127935409545898,
"learning_rate": 4.182779518568926e-05,
"loss": 0.0756,
"num_input_tokens_seen": 117525632,
"step": 620
},
{
"epoch": 2.399232245681382,
"grad_norm": 0.48488515615463257,
"learning_rate": 4.170330715955444e-05,
"loss": 0.0721,
"num_input_tokens_seen": 118487040,
"step": 625
},
{
"epoch": 2.418426103646833,
"grad_norm": 0.5220322012901306,
"learning_rate": 4.157806645601988e-05,
"loss": 0.0708,
"num_input_tokens_seen": 119424384,
"step": 630
},
{
"epoch": 2.4376199616122842,
"grad_norm": 0.4799134433269501,
"learning_rate": 4.145207871862947e-05,
"loss": 0.071,
"num_input_tokens_seen": 120394496,
"step": 635
},
{
"epoch": 2.456813819577735,
"grad_norm": 0.5201560258865356,
"learning_rate": 4.132534962458962e-05,
"loss": 0.0733,
"num_input_tokens_seen": 121340160,
"step": 640
},
{
"epoch": 2.476007677543186,
"grad_norm": 0.46850621700286865,
"learning_rate": 4.1197884884513474e-05,
"loss": 0.0725,
"num_input_tokens_seen": 122270720,
"step": 645
},
{
"epoch": 2.495201535508637,
"grad_norm": 0.5046434998512268,
"learning_rate": 4.1069690242163484e-05,
"loss": 0.0723,
"num_input_tokens_seen": 123226880,
"step": 650
},
{
"epoch": 2.5143953934740884,
"grad_norm": 0.4388461410999298,
"learning_rate": 4.094077147419271e-05,
"loss": 0.0702,
"num_input_tokens_seen": 124177664,
"step": 655
},
{
"epoch": 2.5335892514395395,
"grad_norm": 0.5357294678688049,
"learning_rate": 4.0811134389884433e-05,
"loss": 0.0716,
"num_input_tokens_seen": 125131520,
"step": 660
},
{
"epoch": 2.5527831094049906,
"grad_norm": 0.56220543384552,
"learning_rate": 4.0680784830890405e-05,
"loss": 0.0719,
"num_input_tokens_seen": 126085248,
"step": 665
},
{
"epoch": 2.5719769673704413,
"grad_norm": 0.4952705204486847,
"learning_rate": 4.05497286709676e-05,
"loss": 0.0705,
"num_input_tokens_seen": 127024128,
"step": 670
},
{
"epoch": 2.5911708253358925,
"grad_norm": 0.4931686520576477,
"learning_rate": 4.0417971815713584e-05,
"loss": 0.0705,
"num_input_tokens_seen": 127971072,
"step": 675
},
{
"epoch": 2.6103646833013436,
"grad_norm": 0.5248854756355286,
"learning_rate": 4.028552020230031e-05,
"loss": 0.0741,
"num_input_tokens_seen": 128914176,
"step": 680
},
{
"epoch": 2.629558541266795,
"grad_norm": 0.48179954290390015,
"learning_rate": 4.015237979920666e-05,
"loss": 0.0722,
"num_input_tokens_seen": 129855360,
"step": 685
},
{
"epoch": 2.6487523992322455,
"grad_norm": 0.5252842903137207,
"learning_rate": 4.001855660594948e-05,
"loss": 0.0746,
"num_input_tokens_seen": 130808960,
"step": 690
},
{
"epoch": 2.6679462571976966,
"grad_norm": 0.48107513785362244,
"learning_rate": 3.9884056652813184e-05,
"loss": 0.0692,
"num_input_tokens_seen": 131750144,
"step": 695
},
{
"epoch": 2.6871401151631478,
"grad_norm": 0.6338366866111755,
"learning_rate": 3.974888600057808e-05,
"loss": 0.0713,
"num_input_tokens_seen": 132695936,
"step": 700
},
{
"epoch": 2.706333973128599,
"grad_norm": 0.4953431785106659,
"learning_rate": 3.9613050740247224e-05,
"loss": 0.0711,
"num_input_tokens_seen": 133666944,
"step": 705
},
{
"epoch": 2.72552783109405,
"grad_norm": 0.4697812497615814,
"learning_rate": 3.947655699277197e-05,
"loss": 0.0677,
"num_input_tokens_seen": 134611456,
"step": 710
},
{
"epoch": 2.744721689059501,
"grad_norm": 0.5141253471374512,
"learning_rate": 3.933941090877615e-05,
"loss": 0.0742,
"num_input_tokens_seen": 135558400,
"step": 715
},
{
"epoch": 2.763915547024952,
"grad_norm": 0.4803192913532257,
"learning_rate": 3.920161866827889e-05,
"loss": 0.07,
"num_input_tokens_seen": 136509696,
"step": 720
},
{
"epoch": 2.783109404990403,
"grad_norm": 0.47573211789131165,
"learning_rate": 3.906318648041617e-05,
"loss": 0.0725,
"num_input_tokens_seen": 137442816,
"step": 725
},
{
"epoch": 2.802303262955854,
"grad_norm": 0.5205782055854797,
"learning_rate": 3.8924120583160985e-05,
"loss": 0.0704,
"num_input_tokens_seen": 138383232,
"step": 730
},
{
"epoch": 2.8214971209213053,
"grad_norm": 0.5315016508102417,
"learning_rate": 3.8784427243042296e-05,
"loss": 0.0721,
"num_input_tokens_seen": 139326464,
"step": 735
},
{
"epoch": 2.840690978886756,
"grad_norm": 0.4982779324054718,
"learning_rate": 3.8644112754862614e-05,
"loss": 0.0702,
"num_input_tokens_seen": 140295296,
"step": 740
},
{
"epoch": 2.859884836852207,
"grad_norm": 0.550413191318512,
"learning_rate": 3.850318344141439e-05,
"loss": 0.0719,
"num_input_tokens_seen": 141247488,
"step": 745
},
{
"epoch": 2.8790786948176583,
"grad_norm": 0.5022615194320679,
"learning_rate": 3.8361645653195026e-05,
"loss": 0.0712,
"num_input_tokens_seen": 142186240,
"step": 750
},
{
"epoch": 2.8982725527831095,
"grad_norm": 0.5040895342826843,
"learning_rate": 3.821950576812081e-05,
"loss": 0.0717,
"num_input_tokens_seen": 143130624,
"step": 755
},
{
"epoch": 2.9174664107485606,
"grad_norm": 0.5027932524681091,
"learning_rate": 3.807677019123944e-05,
"loss": 0.0717,
"num_input_tokens_seen": 144061568,
"step": 760
},
{
"epoch": 2.9366602687140118,
"grad_norm": 0.5451627969741821,
"learning_rate": 3.793344535444142e-05,
"loss": 0.0694,
"num_input_tokens_seen": 145013632,
"step": 765
},
{
"epoch": 2.9558541266794625,
"grad_norm": 0.5232383012771606,
"learning_rate": 3.7789537716170256e-05,
"loss": 0.07,
"num_input_tokens_seen": 145962880,
"step": 770
},
{
"epoch": 2.9750479846449136,
"grad_norm": 0.46057307720184326,
"learning_rate": 3.764505376113138e-05,
"loss": 0.0699,
"num_input_tokens_seen": 146916480,
"step": 775
},
{
"epoch": 2.9942418426103647,
"grad_norm": 0.510372519493103,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0705,
"num_input_tokens_seen": 147865344,
"step": 780
},
{
"epoch": 3.013435700575816,
"grad_norm": 0.3846762180328369,
"learning_rate": 3.735438296912768e-05,
"loss": 0.0515,
"num_input_tokens_seen": 148817664,
"step": 785
},
{
"epoch": 3.0326295585412666,
"grad_norm": 0.5160824656486511,
"learning_rate": 3.720820923024778e-05,
"loss": 0.0398,
"num_input_tokens_seen": 149770880,
"step": 790
},
{
"epoch": 3.0518234165067177,
"grad_norm": 0.5621275305747986,
"learning_rate": 3.7061485370179835e-05,
"loss": 0.0388,
"num_input_tokens_seen": 150717312,
"step": 795
},
{
"epoch": 3.071017274472169,
"grad_norm": 0.5459131002426147,
"learning_rate": 3.69142180005327e-05,
"loss": 0.0392,
"num_input_tokens_seen": 151666432,
"step": 800
},
{
"epoch": 3.09021113243762,
"grad_norm": 0.5296151638031006,
"learning_rate": 3.676641375740662e-05,
"loss": 0.0373,
"num_input_tokens_seen": 152584064,
"step": 805
},
{
"epoch": 3.109404990403071,
"grad_norm": 0.4850353002548218,
"learning_rate": 3.6618079301094216e-05,
"loss": 0.039,
"num_input_tokens_seen": 153525248,
"step": 810
},
{
"epoch": 3.128598848368522,
"grad_norm": 0.5585963726043701,
"learning_rate": 3.646922131578036e-05,
"loss": 0.038,
"num_input_tokens_seen": 154456704,
"step": 815
},
{
"epoch": 3.147792706333973,
"grad_norm": 0.5561864972114563,
"learning_rate": 3.631984650924094e-05,
"loss": 0.0387,
"num_input_tokens_seen": 155405312,
"step": 820
},
{
"epoch": 3.166986564299424,
"grad_norm": 0.5100105404853821,
"learning_rate": 3.6169961612540645e-05,
"loss": 0.0387,
"num_input_tokens_seen": 156361984,
"step": 825
},
{
"epoch": 3.1861804222648753,
"grad_norm": 0.5485111474990845,
"learning_rate": 3.6019573379729643e-05,
"loss": 0.0378,
"num_input_tokens_seen": 157296768,
"step": 830
},
{
"epoch": 3.2053742802303264,
"grad_norm": 0.5596518516540527,
"learning_rate": 3.586868858753921e-05,
"loss": 0.0376,
"num_input_tokens_seen": 158262144,
"step": 835
},
{
"epoch": 3.224568138195777,
"grad_norm": 0.5062248110771179,
"learning_rate": 3.5717314035076355e-05,
"loss": 0.0394,
"num_input_tokens_seen": 159213952,
"step": 840
},
{
"epoch": 3.2437619961612283,
"grad_norm": 0.560895562171936,
"learning_rate": 3.556545654351749e-05,
"loss": 0.0412,
"num_input_tokens_seen": 160142208,
"step": 845
},
{
"epoch": 3.2629558541266794,
"grad_norm": 0.5781249403953552,
"learning_rate": 3.5413122955801005e-05,
"loss": 0.0379,
"num_input_tokens_seen": 161101824,
"step": 850
},
{
"epoch": 3.2821497120921306,
"grad_norm": 0.5329350233078003,
"learning_rate": 3.526032013631893e-05,
"loss": 0.0389,
"num_input_tokens_seen": 162062592,
"step": 855
},
{
"epoch": 3.3013435700575817,
"grad_norm": 0.5311657190322876,
"learning_rate": 3.510705497060762e-05,
"loss": 0.0407,
"num_input_tokens_seen": 163008896,
"step": 860
},
{
"epoch": 3.320537428023033,
"grad_norm": 0.5238428115844727,
"learning_rate": 3.4953334365037525e-05,
"loss": 0.0396,
"num_input_tokens_seen": 163957632,
"step": 865
},
{
"epoch": 3.3397312859884836,
"grad_norm": 0.5340429544448853,
"learning_rate": 3.479916524650188e-05,
"loss": 0.0398,
"num_input_tokens_seen": 164903680,
"step": 870
},
{
"epoch": 3.3589251439539347,
"grad_norm": 0.5380253195762634,
"learning_rate": 3.4644554562104634e-05,
"loss": 0.0378,
"num_input_tokens_seen": 165832960,
"step": 875
},
{
"epoch": 3.378119001919386,
"grad_norm": 0.5895189046859741,
"learning_rate": 3.4489509278847414e-05,
"loss": 0.0386,
"num_input_tokens_seen": 166790016,
"step": 880
},
{
"epoch": 3.397312859884837,
"grad_norm": 0.5654773712158203,
"learning_rate": 3.433403638331553e-05,
"loss": 0.0392,
"num_input_tokens_seen": 167751808,
"step": 885
},
{
"epoch": 3.4165067178502877,
"grad_norm": 0.5874072909355164,
"learning_rate": 3.417814288136319e-05,
"loss": 0.039,
"num_input_tokens_seen": 168697984,
"step": 890
},
{
"epoch": 3.435700575815739,
"grad_norm": 0.5615460276603699,
"learning_rate": 3.4021835797797804e-05,
"loss": 0.0397,
"num_input_tokens_seen": 169634432,
"step": 895
},
{
"epoch": 3.45489443378119,
"grad_norm": 0.5442382097244263,
"learning_rate": 3.386512217606339e-05,
"loss": 0.0408,
"num_input_tokens_seen": 170583808,
"step": 900
},
{
"epoch": 3.474088291746641,
"grad_norm": 0.5456710457801819,
"learning_rate": 3.370800907792325e-05,
"loss": 0.0393,
"num_input_tokens_seen": 171575808,
"step": 905
},
{
"epoch": 3.4932821497120923,
"grad_norm": 0.6690710186958313,
"learning_rate": 3.355050358314172e-05,
"loss": 0.0399,
"num_input_tokens_seen": 172516224,
"step": 910
},
{
"epoch": 3.5124760076775434,
"grad_norm": 0.5574905276298523,
"learning_rate": 3.339261278916512e-05,
"loss": 0.0382,
"num_input_tokens_seen": 173462784,
"step": 915
},
{
"epoch": 3.531669865642994,
"grad_norm": 0.6012794971466064,
"learning_rate": 3.323434381080199e-05,
"loss": 0.0393,
"num_input_tokens_seen": 174417408,
"step": 920
},
{
"epoch": 3.5508637236084453,
"grad_norm": 0.5913425087928772,
"learning_rate": 3.307570377990245e-05,
"loss": 0.0401,
"num_input_tokens_seen": 175372160,
"step": 925
},
{
"epoch": 3.5700575815738964,
"grad_norm": 0.5533618927001953,
"learning_rate": 3.2916699845036816e-05,
"loss": 0.0389,
"num_input_tokens_seen": 176325376,
"step": 930
},
{
"epoch": 3.5892514395393476,
"grad_norm": 0.5684598684310913,
"learning_rate": 3.2757339171173506e-05,
"loss": 0.0405,
"num_input_tokens_seen": 177279232,
"step": 935
},
{
"epoch": 3.6084452975047983,
"grad_norm": 0.5171712040901184,
"learning_rate": 3.2597628939356175e-05,
"loss": 0.0385,
"num_input_tokens_seen": 178230656,
"step": 940
},
{
"epoch": 3.6276391554702494,
"grad_norm": 0.5861181616783142,
"learning_rate": 3.243757634638008e-05,
"loss": 0.0406,
"num_input_tokens_seen": 179165056,
"step": 945
},
{
"epoch": 3.6468330134357005,
"grad_norm": 0.5875094532966614,
"learning_rate": 3.227718860446782e-05,
"loss": 0.0397,
"num_input_tokens_seen": 180110464,
"step": 950
},
{
"epoch": 3.6660268714011517,
"grad_norm": 0.518147885799408,
"learning_rate": 3.211647294094437e-05,
"loss": 0.0406,
"num_input_tokens_seen": 181050368,
"step": 955
},
{
"epoch": 3.685220729366603,
"grad_norm": 0.585909903049469,
"learning_rate": 3.195543659791132e-05,
"loss": 0.0399,
"num_input_tokens_seen": 181985408,
"step": 960
},
{
"epoch": 3.704414587332054,
"grad_norm": 0.5211689472198486,
"learning_rate": 3.179408683192061e-05,
"loss": 0.0403,
"num_input_tokens_seen": 182932096,
"step": 965
},
{
"epoch": 3.7236084452975047,
"grad_norm": 0.5516882538795471,
"learning_rate": 3.163243091364752e-05,
"loss": 0.0414,
"num_input_tokens_seen": 183894528,
"step": 970
},
{
"epoch": 3.742802303262956,
"grad_norm": 0.5610490441322327,
"learning_rate": 3.147047612756302e-05,
"loss": 0.04,
"num_input_tokens_seen": 184841472,
"step": 975
},
{
"epoch": 3.761996161228407,
"grad_norm": 0.5976274013519287,
"learning_rate": 3.130822977160554e-05,
"loss": 0.0411,
"num_input_tokens_seen": 185803520,
"step": 980
},
{
"epoch": 3.781190019193858,
"grad_norm": 0.5341821312904358,
"learning_rate": 3.114569915685213e-05,
"loss": 0.0416,
"num_input_tokens_seen": 186746880,
"step": 985
},
{
"epoch": 3.800383877159309,
"grad_norm": 0.5178216695785522,
"learning_rate": 3.098289160718895e-05,
"loss": 0.0418,
"num_input_tokens_seen": 187703040,
"step": 990
},
{
"epoch": 3.81957773512476,
"grad_norm": 0.6667547225952148,
"learning_rate": 3.081981445898131e-05,
"loss": 0.0399,
"num_input_tokens_seen": 188650112,
"step": 995
},
{
"epoch": 3.838771593090211,
"grad_norm": 0.6052145957946777,
"learning_rate": 3.065647506074306e-05,
"loss": 0.0369,
"num_input_tokens_seen": 189577088,
"step": 1000
},
{
"epoch": 3.8579654510556622,
"grad_norm": 0.5983301401138306,
"learning_rate": 3.0492880772805433e-05,
"loss": 0.0389,
"num_input_tokens_seen": 190533376,
"step": 1005
},
{
"epoch": 3.8771593090211134,
"grad_norm": 0.5493575930595398,
"learning_rate": 3.03290389669854e-05,
"loss": 0.039,
"num_input_tokens_seen": 191484672,
"step": 1010
},
{
"epoch": 3.8963531669865645,
"grad_norm": 0.5318569540977478,
"learning_rate": 3.016495702625351e-05,
"loss": 0.0401,
"num_input_tokens_seen": 192425728,
"step": 1015
},
{
"epoch": 3.9155470249520152,
"grad_norm": 0.5920220613479614,
"learning_rate": 3.0000642344401113e-05,
"loss": 0.0408,
"num_input_tokens_seen": 193386368,
"step": 1020
},
{
"epoch": 3.9347408829174664,
"grad_norm": 0.48210155963897705,
"learning_rate": 2.983610232570728e-05,
"loss": 0.0389,
"num_input_tokens_seen": 194338560,
"step": 1025
},
{
"epoch": 3.9539347408829175,
"grad_norm": 0.551249623298645,
"learning_rate": 2.9671344384605127e-05,
"loss": 0.0392,
"num_input_tokens_seen": 195283968,
"step": 1030
},
{
"epoch": 3.9731285988483687,
"grad_norm": 0.5769145488739014,
"learning_rate": 2.950637594534765e-05,
"loss": 0.0397,
"num_input_tokens_seen": 196222080,
"step": 1035
},
{
"epoch": 3.9923224568138194,
"grad_norm": 0.49872222542762756,
"learning_rate": 2.9341204441673266e-05,
"loss": 0.0396,
"num_input_tokens_seen": 197174016,
"step": 1040
},
{
"epoch": 4.0115163147792705,
"grad_norm": 0.32796505093574524,
"learning_rate": 2.917583731647077e-05,
"loss": 0.0271,
"num_input_tokens_seen": 198129792,
"step": 1045
},
{
"epoch": 4.030710172744722,
"grad_norm": 0.4801371395587921,
"learning_rate": 2.9010282021444008e-05,
"loss": 0.0188,
"num_input_tokens_seen": 199088384,
"step": 1050
},
{
"epoch": 4.049904030710173,
"grad_norm": 0.5048861503601074,
"learning_rate": 2.8844546016776013e-05,
"loss": 0.0195,
"num_input_tokens_seen": 200040832,
"step": 1055
},
{
"epoch": 4.069097888675624,
"grad_norm": 0.44816502928733826,
"learning_rate": 2.8678636770792906e-05,
"loss": 0.0174,
"num_input_tokens_seen": 200993024,
"step": 1060
},
{
"epoch": 4.088291746641075,
"grad_norm": 0.5177501440048218,
"learning_rate": 2.851256175962732e-05,
"loss": 0.0175,
"num_input_tokens_seen": 201925376,
"step": 1065
},
{
"epoch": 4.107485604606526,
"grad_norm": 0.44937270879745483,
"learning_rate": 2.8346328466881545e-05,
"loss": 0.0175,
"num_input_tokens_seen": 202875520,
"step": 1070
},
{
"epoch": 4.126679462571977,
"grad_norm": 0.46722468733787537,
"learning_rate": 2.8179944383290274e-05,
"loss": 0.0177,
"num_input_tokens_seen": 203793536,
"step": 1075
},
{
"epoch": 4.145873320537428,
"grad_norm": 0.47768163681030273,
"learning_rate": 2.8013417006383076e-05,
"loss": 0.0178,
"num_input_tokens_seen": 204730496,
"step": 1080
},
{
"epoch": 4.165067178502879,
"grad_norm": 0.5000248551368713,
"learning_rate": 2.784675384014656e-05,
"loss": 0.0183,
"num_input_tokens_seen": 205692416,
"step": 1085
},
{
"epoch": 4.18426103646833,
"grad_norm": 0.43874186277389526,
"learning_rate": 2.7679962394686198e-05,
"loss": 0.0163,
"num_input_tokens_seen": 206628608,
"step": 1090
},
{
"epoch": 4.203454894433781,
"grad_norm": 0.6000416874885559,
"learning_rate": 2.751305018588793e-05,
"loss": 0.0173,
"num_input_tokens_seen": 207572096,
"step": 1095
},
{
"epoch": 4.222648752399232,
"grad_norm": 0.5658661127090454,
"learning_rate": 2.7346024735079486e-05,
"loss": 0.0174,
"num_input_tokens_seen": 208517376,
"step": 1100
},
{
"epoch": 4.241842610364683,
"grad_norm": 0.47470784187316895,
"learning_rate": 2.717889356869146e-05,
"loss": 0.017,
"num_input_tokens_seen": 209485056,
"step": 1105
},
{
"epoch": 4.2610364683301345,
"grad_norm": 0.5162433385848999,
"learning_rate": 2.7011664217918154e-05,
"loss": 0.0176,
"num_input_tokens_seen": 210415232,
"step": 1110
},
{
"epoch": 4.280230326295586,
"grad_norm": 0.5180822014808655,
"learning_rate": 2.684434421837821e-05,
"loss": 0.018,
"num_input_tokens_seen": 211362560,
"step": 1115
},
{
"epoch": 4.299424184261037,
"grad_norm": 0.5464392304420471,
"learning_rate": 2.667694110977506e-05,
"loss": 0.0176,
"num_input_tokens_seen": 212339456,
"step": 1120
},
{
"epoch": 4.318618042226488,
"grad_norm": 0.5567265748977661,
"learning_rate": 2.6509462435557152e-05,
"loss": 0.0187,
"num_input_tokens_seen": 213288320,
"step": 1125
},
{
"epoch": 4.337811900191938,
"grad_norm": 0.5592502951622009,
"learning_rate": 2.6341915742578037e-05,
"loss": 0.0189,
"num_input_tokens_seen": 214232576,
"step": 1130
},
{
"epoch": 4.357005758157389,
"grad_norm": 0.53173828125,
"learning_rate": 2.617430858075632e-05,
"loss": 0.0174,
"num_input_tokens_seen": 215180800,
"step": 1135
},
{
"epoch": 4.3761996161228405,
"grad_norm": 0.4227728843688965,
"learning_rate": 2.600664850273538e-05,
"loss": 0.0175,
"num_input_tokens_seen": 216116480,
"step": 1140
},
{
"epoch": 4.395393474088292,
"grad_norm": 0.6441555023193359,
"learning_rate": 2.5838943063543136e-05,
"loss": 0.0174,
"num_input_tokens_seen": 217079552,
"step": 1145
},
{
"epoch": 4.414587332053743,
"grad_norm": 0.4902968108654022,
"learning_rate": 2.5671199820251534e-05,
"loss": 0.016,
"num_input_tokens_seen": 218026496,
"step": 1150
},
{
"epoch": 4.433781190019194,
"grad_norm": 0.46307000517845154,
"learning_rate": 2.550342633163601e-05,
"loss": 0.0171,
"num_input_tokens_seen": 218977152,
"step": 1155
},
{
"epoch": 4.452975047984645,
"grad_norm": 0.4390980899333954,
"learning_rate": 2.5335630157834937e-05,
"loss": 0.0178,
"num_input_tokens_seen": 219918080,
"step": 1160
},
{
"epoch": 4.472168905950096,
"grad_norm": 0.5380067825317383,
"learning_rate": 2.5167818860008908e-05,
"loss": 0.0182,
"num_input_tokens_seen": 220859776,
"step": 1165
},
{
"epoch": 4.491362763915547,
"grad_norm": 0.6842356324195862,
"learning_rate": 2.5e-05,
"loss": 0.0184,
"num_input_tokens_seen": 221800192,
"step": 1170
},
{
"epoch": 4.510556621880998,
"grad_norm": 0.5243207812309265,
"learning_rate": 2.48321811399911e-05,
"loss": 0.0182,
"num_input_tokens_seen": 222755840,
"step": 1175
},
{
"epoch": 4.529750479846449,
"grad_norm": 0.5043371915817261,
"learning_rate": 2.4664369842165068e-05,
"loss": 0.0182,
"num_input_tokens_seen": 223683328,
"step": 1180
},
{
"epoch": 4.5489443378119,
"grad_norm": 0.5096651315689087,
"learning_rate": 2.4496573668363996e-05,
"loss": 0.0192,
"num_input_tokens_seen": 224617088,
"step": 1185
},
{
"epoch": 4.568138195777351,
"grad_norm": 0.4918656349182129,
"learning_rate": 2.4328800179748475e-05,
"loss": 0.0189,
"num_input_tokens_seen": 225549824,
"step": 1190
},
{
"epoch": 4.587332053742802,
"grad_norm": 0.50703364610672,
"learning_rate": 2.4161056936456873e-05,
"loss": 0.0179,
"num_input_tokens_seen": 226499968,
"step": 1195
},
{
"epoch": 4.606525911708253,
"grad_norm": 0.535247266292572,
"learning_rate": 2.399335149726463e-05,
"loss": 0.0184,
"num_input_tokens_seen": 227438592,
"step": 1200
},
{
"epoch": 4.6257197696737045,
"grad_norm": 0.5735894441604614,
"learning_rate": 2.3825691419243694e-05,
"loss": 0.0173,
"num_input_tokens_seen": 228392064,
"step": 1205
},
{
"epoch": 4.644913627639156,
"grad_norm": 0.525362491607666,
"learning_rate": 2.365808425742196e-05,
"loss": 0.0172,
"num_input_tokens_seen": 229337856,
"step": 1210
},
{
"epoch": 4.664107485604607,
"grad_norm": 0.548953115940094,
"learning_rate": 2.3490537564442847e-05,
"loss": 0.0172,
"num_input_tokens_seen": 230301184,
"step": 1215
},
{
"epoch": 4.683301343570058,
"grad_norm": 0.543995201587677,
"learning_rate": 2.3323058890224938e-05,
"loss": 0.0183,
"num_input_tokens_seen": 231239808,
"step": 1220
},
{
"epoch": 4.702495201535509,
"grad_norm": 0.4721289873123169,
"learning_rate": 2.3155655781621793e-05,
"loss": 0.017,
"num_input_tokens_seen": 232200960,
"step": 1225
},
{
"epoch": 4.72168905950096,
"grad_norm": 0.5055978298187256,
"learning_rate": 2.2988335782081855e-05,
"loss": 0.018,
"num_input_tokens_seen": 233170176,
"step": 1230
},
{
"epoch": 4.74088291746641,
"grad_norm": 0.5819576382637024,
"learning_rate": 2.2821106431308544e-05,
"loss": 0.0192,
"num_input_tokens_seen": 234121984,
"step": 1235
},
{
"epoch": 4.760076775431862,
"grad_norm": 0.5343950986862183,
"learning_rate": 2.265397526492052e-05,
"loss": 0.019,
"num_input_tokens_seen": 235080448,
"step": 1240
},
{
"epoch": 4.779270633397313,
"grad_norm": 0.5696731805801392,
"learning_rate": 2.2486949814112077e-05,
"loss": 0.0167,
"num_input_tokens_seen": 236029440,
"step": 1245
},
{
"epoch": 4.798464491362764,
"grad_norm": 0.513455331325531,
"learning_rate": 2.2320037605313808e-05,
"loss": 0.0172,
"num_input_tokens_seen": 236989952,
"step": 1250
},
{
"epoch": 4.817658349328215,
"grad_norm": 0.5600264072418213,
"learning_rate": 2.2153246159853446e-05,
"loss": 0.0178,
"num_input_tokens_seen": 237952640,
"step": 1255
},
{
"epoch": 4.836852207293666,
"grad_norm": 0.5633450746536255,
"learning_rate": 2.1986582993616926e-05,
"loss": 0.0179,
"num_input_tokens_seen": 238887552,
"step": 1260
},
{
"epoch": 4.856046065259117,
"grad_norm": 0.4715237021446228,
"learning_rate": 2.1820055616709735e-05,
"loss": 0.0163,
"num_input_tokens_seen": 239837440,
"step": 1265
},
{
"epoch": 4.8752399232245685,
"grad_norm": 0.6615576148033142,
"learning_rate": 2.1653671533118468e-05,
"loss": 0.019,
"num_input_tokens_seen": 240771712,
"step": 1270
},
{
"epoch": 4.894433781190019,
"grad_norm": 0.4694409668445587,
"learning_rate": 2.148743824037269e-05,
"loss": 0.0169,
"num_input_tokens_seen": 241706496,
"step": 1275
},
{
"epoch": 4.91362763915547,
"grad_norm": 0.5807188153266907,
"learning_rate": 2.1321363229207096e-05,
"loss": 0.0165,
"num_input_tokens_seen": 242651520,
"step": 1280
},
{
"epoch": 4.932821497120921,
"grad_norm": 0.5913830399513245,
"learning_rate": 2.115545398322399e-05,
"loss": 0.018,
"num_input_tokens_seen": 243610880,
"step": 1285
},
{
"epoch": 4.952015355086372,
"grad_norm": 0.47264307737350464,
"learning_rate": 2.098971797855599e-05,
"loss": 0.0171,
"num_input_tokens_seen": 244553344,
"step": 1290
},
{
"epoch": 4.971209213051823,
"grad_norm": 0.5455983281135559,
"learning_rate": 2.0824162683529224e-05,
"loss": 0.0178,
"num_input_tokens_seen": 245489280,
"step": 1295
},
{
"epoch": 4.990403071017274,
"grad_norm": 0.5236849784851074,
"learning_rate": 2.0658795558326743e-05,
"loss": 0.0177,
"num_input_tokens_seen": 246430848,
"step": 1300
},
{
"epoch": 5.009596928982726,
"grad_norm": 0.3665928840637207,
"learning_rate": 2.0493624054652357e-05,
"loss": 0.0129,
"num_input_tokens_seen": 247392000,
"step": 1305
},
{
"epoch": 5.028790786948177,
"grad_norm": 0.2873888313770294,
"learning_rate": 2.0328655615394882e-05,
"loss": 0.0076,
"num_input_tokens_seen": 248329472,
"step": 1310
},
{
"epoch": 5.047984644913628,
"grad_norm": 0.3191753029823303,
"learning_rate": 2.016389767429272e-05,
"loss": 0.0069,
"num_input_tokens_seen": 249250688,
"step": 1315
},
{
"epoch": 5.067178502879079,
"grad_norm": 0.3973754048347473,
"learning_rate": 1.9999357655598893e-05,
"loss": 0.0068,
"num_input_tokens_seen": 250189312,
"step": 1320
},
{
"epoch": 5.08637236084453,
"grad_norm": 0.27491700649261475,
"learning_rate": 1.98350429737465e-05,
"loss": 0.0063,
"num_input_tokens_seen": 251125632,
"step": 1325
},
{
"epoch": 5.10556621880998,
"grad_norm": 0.448505163192749,
"learning_rate": 1.9670961033014605e-05,
"loss": 0.0063,
"num_input_tokens_seen": 252084224,
"step": 1330
},
{
"epoch": 5.1247600767754315,
"grad_norm": 0.39973142743110657,
"learning_rate": 1.950711922719458e-05,
"loss": 0.0067,
"num_input_tokens_seen": 253038080,
"step": 1335
},
{
"epoch": 5.143953934740883,
"grad_norm": 0.3733747899532318,
"learning_rate": 1.934352493925695e-05,
"loss": 0.0059,
"num_input_tokens_seen": 253959936,
"step": 1340
},
{
"epoch": 5.163147792706334,
"grad_norm": 0.33044806122779846,
"learning_rate": 1.9180185541018695e-05,
"loss": 0.006,
"num_input_tokens_seen": 254898176,
"step": 1345
},
{
"epoch": 5.182341650671785,
"grad_norm": 0.39706701040267944,
"learning_rate": 1.9017108392811065e-05,
"loss": 0.006,
"num_input_tokens_seen": 255850368,
"step": 1350
},
{
"epoch": 5.201535508637236,
"grad_norm": 0.391462117433548,
"learning_rate": 1.8854300843147875e-05,
"loss": 0.0062,
"num_input_tokens_seen": 256820096,
"step": 1355
},
{
"epoch": 5.220729366602687,
"grad_norm": 0.41619014739990234,
"learning_rate": 1.8691770228394456e-05,
"loss": 0.0063,
"num_input_tokens_seen": 257742080,
"step": 1360
},
{
"epoch": 5.239923224568138,
"grad_norm": 0.3508637547492981,
"learning_rate": 1.852952387243698e-05,
"loss": 0.0054,
"num_input_tokens_seen": 258705664,
"step": 1365
},
{
"epoch": 5.25911708253359,
"grad_norm": 0.3165414333343506,
"learning_rate": 1.8367569086352483e-05,
"loss": 0.006,
"num_input_tokens_seen": 259630464,
"step": 1370
},
{
"epoch": 5.278310940499041,
"grad_norm": 0.31605008244514465,
"learning_rate": 1.820591316807939e-05,
"loss": 0.006,
"num_input_tokens_seen": 260580352,
"step": 1375
},
{
"epoch": 5.297504798464491,
"grad_norm": 0.235918790102005,
"learning_rate": 1.8044563402088684e-05,
"loss": 0.005,
"num_input_tokens_seen": 261539072,
"step": 1380
},
{
"epoch": 5.316698656429942,
"grad_norm": 0.37443870306015015,
"learning_rate": 1.788352705905563e-05,
"loss": 0.0049,
"num_input_tokens_seen": 262481152,
"step": 1385
},
{
"epoch": 5.335892514395393,
"grad_norm": 0.42907220125198364,
"learning_rate": 1.7722811395532178e-05,
"loss": 0.0052,
"num_input_tokens_seen": 263416064,
"step": 1390
},
{
"epoch": 5.355086372360844,
"grad_norm": 0.3673328161239624,
"learning_rate": 1.756242365361993e-05,
"loss": 0.0055,
"num_input_tokens_seen": 264374016,
"step": 1395
},
{
"epoch": 5.3742802303262955,
"grad_norm": 0.40815985202789307,
"learning_rate": 1.740237106064383e-05,
"loss": 0.0057,
"num_input_tokens_seen": 265326976,
"step": 1400
},
{
"epoch": 5.393474088291747,
"grad_norm": 0.3974185287952423,
"learning_rate": 1.72426608288265e-05,
"loss": 0.0053,
"num_input_tokens_seen": 266300672,
"step": 1405
},
{
"epoch": 5.412667946257198,
"grad_norm": 0.415446013212204,
"learning_rate": 1.7083300154963193e-05,
"loss": 0.006,
"num_input_tokens_seen": 267247488,
"step": 1410
},
{
"epoch": 5.431861804222649,
"grad_norm": 0.24956613779067993,
"learning_rate": 1.6924296220097556e-05,
"loss": 0.0054,
"num_input_tokens_seen": 268195072,
"step": 1415
},
{
"epoch": 5.4510556621881,
"grad_norm": 0.3614320456981659,
"learning_rate": 1.6765656189198013e-05,
"loss": 0.0058,
"num_input_tokens_seen": 269156736,
"step": 1420
},
{
"epoch": 5.470249520153551,
"grad_norm": 0.3081935942173004,
"learning_rate": 1.6607387210834887e-05,
"loss": 0.0059,
"num_input_tokens_seen": 270111232,
"step": 1425
},
{
"epoch": 5.4894433781190015,
"grad_norm": 0.616385281085968,
"learning_rate": 1.6449496416858284e-05,
"loss": 0.006,
"num_input_tokens_seen": 271075328,
"step": 1430
},
{
"epoch": 5.508637236084453,
"grad_norm": 0.2966477572917938,
"learning_rate": 1.6291990922076745e-05,
"loss": 0.0056,
"num_input_tokens_seen": 272035200,
"step": 1435
},
{
"epoch": 5.527831094049904,
"grad_norm": 0.3426896035671234,
"learning_rate": 1.613487782393661e-05,
"loss": 0.0055,
"num_input_tokens_seen": 273006464,
"step": 1440
},
{
"epoch": 5.547024952015355,
"grad_norm": 0.333395779132843,
"learning_rate": 1.59781642022022e-05,
"loss": 0.0052,
"num_input_tokens_seen": 273947520,
"step": 1445
},
{
"epoch": 5.566218809980806,
"grad_norm": 0.37326982617378235,
"learning_rate": 1.582185711863681e-05,
"loss": 0.0058,
"num_input_tokens_seen": 274880640,
"step": 1450
},
{
"epoch": 5.585412667946257,
"grad_norm": 0.47231632471084595,
"learning_rate": 1.5665963616684476e-05,
"loss": 0.006,
"num_input_tokens_seen": 275832576,
"step": 1455
},
{
"epoch": 5.604606525911708,
"grad_norm": 0.4276968240737915,
"learning_rate": 1.5510490721152592e-05,
"loss": 0.0059,
"num_input_tokens_seen": 276766720,
"step": 1460
},
{
"epoch": 5.6238003838771595,
"grad_norm": 0.3536054491996765,
"learning_rate": 1.535544543789537e-05,
"loss": 0.0057,
"num_input_tokens_seen": 277722752,
"step": 1465
},
{
"epoch": 5.642994241842611,
"grad_norm": 0.3875754475593567,
"learning_rate": 1.5200834753498128e-05,
"loss": 0.0055,
"num_input_tokens_seen": 278668544,
"step": 1470
},
{
"epoch": 5.662188099808061,
"grad_norm": 0.39483705163002014,
"learning_rate": 1.5046665634962476e-05,
"loss": 0.006,
"num_input_tokens_seen": 279602944,
"step": 1475
},
{
"epoch": 5.681381957773512,
"grad_norm": 0.3274906873703003,
"learning_rate": 1.489294502939238e-05,
"loss": 0.0058,
"num_input_tokens_seen": 280545408,
"step": 1480
},
{
"epoch": 5.700575815738963,
"grad_norm": 0.41820597648620605,
"learning_rate": 1.4739679863681086e-05,
"loss": 0.0052,
"num_input_tokens_seen": 281486208,
"step": 1485
},
{
"epoch": 5.719769673704414,
"grad_norm": 0.3023267984390259,
"learning_rate": 1.4586877044199016e-05,
"loss": 0.0056,
"num_input_tokens_seen": 282428032,
"step": 1490
},
{
"epoch": 5.7389635316698655,
"grad_norm": 0.40845391154289246,
"learning_rate": 1.443454345648252e-05,
"loss": 0.0061,
"num_input_tokens_seen": 283387264,
"step": 1495
},
{
"epoch": 5.758157389635317,
"grad_norm": 0.2751927971839905,
"learning_rate": 1.4282685964923642e-05,
"loss": 0.0058,
"num_input_tokens_seen": 284347008,
"step": 1500
},
{
"epoch": 5.777351247600768,
"grad_norm": 0.39462777972221375,
"learning_rate": 1.4131311412460796e-05,
"loss": 0.0061,
"num_input_tokens_seen": 285271424,
"step": 1505
},
{
"epoch": 5.796545105566219,
"grad_norm": 0.3681143522262573,
"learning_rate": 1.398042662027035e-05,
"loss": 0.0055,
"num_input_tokens_seen": 286222208,
"step": 1510
},
{
"epoch": 5.81573896353167,
"grad_norm": 0.3678882122039795,
"learning_rate": 1.3830038387459354e-05,
"loss": 0.0056,
"num_input_tokens_seen": 287186304,
"step": 1515
},
{
"epoch": 5.834932821497121,
"grad_norm": 0.3934548795223236,
"learning_rate": 1.3680153490759073e-05,
"loss": 0.0055,
"num_input_tokens_seen": 288142848,
"step": 1520
},
{
"epoch": 5.854126679462572,
"grad_norm": 0.3608354926109314,
"learning_rate": 1.3530778684219648e-05,
"loss": 0.0055,
"num_input_tokens_seen": 289076608,
"step": 1525
},
{
"epoch": 5.8733205374280235,
"grad_norm": 0.3579324781894684,
"learning_rate": 1.3381920698905787e-05,
"loss": 0.006,
"num_input_tokens_seen": 290014848,
"step": 1530
},
{
"epoch": 5.892514395393474,
"grad_norm": 0.45630770921707153,
"learning_rate": 1.3233586242593387e-05,
"loss": 0.0056,
"num_input_tokens_seen": 290956928,
"step": 1535
},
{
"epoch": 5.911708253358925,
"grad_norm": 0.48819243907928467,
"learning_rate": 1.3085781999467303e-05,
"loss": 0.0059,
"num_input_tokens_seen": 291889408,
"step": 1540
},
{
"epoch": 5.930902111324376,
"grad_norm": 0.39040514826774597,
"learning_rate": 1.293851462982017e-05,
"loss": 0.0056,
"num_input_tokens_seen": 292832768,
"step": 1545
},
{
"epoch": 5.950095969289827,
"grad_norm": 0.33169373869895935,
"learning_rate": 1.2791790769752232e-05,
"loss": 0.0054,
"num_input_tokens_seen": 293767040,
"step": 1550
},
{
"epoch": 5.969289827255278,
"grad_norm": 0.3252679705619812,
"learning_rate": 1.2645617030872328e-05,
"loss": 0.0049,
"num_input_tokens_seen": 294750208,
"step": 1555
},
{
"epoch": 5.9884836852207295,
"grad_norm": 0.35827863216400146,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.0051,
"num_input_tokens_seen": 295721088,
"step": 1560
},
{
"epoch": 6.007677543186181,
"grad_norm": 0.2643219530582428,
"learning_rate": 1.2354946238868631e-05,
"loss": 0.0037,
"num_input_tokens_seen": 296669184,
"step": 1565
},
{
"epoch": 6.026871401151632,
"grad_norm": 0.16359519958496094,
"learning_rate": 1.2210462283829755e-05,
"loss": 0.0021,
"num_input_tokens_seen": 297624448,
"step": 1570
},
{
"epoch": 6.046065259117083,
"grad_norm": 0.17375914752483368,
"learning_rate": 1.2066554645558578e-05,
"loss": 0.0019,
"num_input_tokens_seen": 298558080,
"step": 1575
},
{
"epoch": 6.065259117082533,
"grad_norm": 0.16882829368114471,
"learning_rate": 1.1923229808760564e-05,
"loss": 0.002,
"num_input_tokens_seen": 299492352,
"step": 1580
},
{
"epoch": 6.084452975047984,
"grad_norm": 0.15032212436199188,
"learning_rate": 1.1780494231879183e-05,
"loss": 0.0017,
"num_input_tokens_seen": 300446976,
"step": 1585
},
{
"epoch": 6.1036468330134355,
"grad_norm": 0.1795198768377304,
"learning_rate": 1.1638354346804971e-05,
"loss": 0.0016,
"num_input_tokens_seen": 301379328,
"step": 1590
},
{
"epoch": 6.122840690978887,
"grad_norm": 0.19939038157463074,
"learning_rate": 1.1496816558585622e-05,
"loss": 0.0017,
"num_input_tokens_seen": 302306944,
"step": 1595
},
{
"epoch": 6.142034548944338,
"grad_norm": 0.13161912560462952,
"learning_rate": 1.1355887245137383e-05,
"loss": 0.0014,
"num_input_tokens_seen": 303246848,
"step": 1600
},
{
"epoch": 6.161228406909789,
"grad_norm": 0.12050630152225494,
"learning_rate": 1.121557275695771e-05,
"loss": 0.0018,
"num_input_tokens_seen": 304181248,
"step": 1605
},
{
"epoch": 6.18042226487524,
"grad_norm": 0.1479523628950119,
"learning_rate": 1.1075879416839023e-05,
"loss": 0.002,
"num_input_tokens_seen": 305139200,
"step": 1610
},
{
"epoch": 6.199616122840691,
"grad_norm": 0.2239997535943985,
"learning_rate": 1.093681351958383e-05,
"loss": 0.0017,
"num_input_tokens_seen": 306099328,
"step": 1615
},
{
"epoch": 6.218809980806142,
"grad_norm": 0.1326994150876999,
"learning_rate": 1.0798381331721109e-05,
"loss": 0.0016,
"num_input_tokens_seen": 307053568,
"step": 1620
},
{
"epoch": 6.2380038387715935,
"grad_norm": 0.16336026787757874,
"learning_rate": 1.0660589091223855e-05,
"loss": 0.0016,
"num_input_tokens_seen": 308003200,
"step": 1625
},
{
"epoch": 6.257197696737044,
"grad_norm": 0.19519683718681335,
"learning_rate": 1.052344300722803e-05,
"loss": 0.0016,
"num_input_tokens_seen": 308958720,
"step": 1630
},
{
"epoch": 6.276391554702495,
"grad_norm": 0.19589100778102875,
"learning_rate": 1.0386949259752785e-05,
"loss": 0.0017,
"num_input_tokens_seen": 309904384,
"step": 1635
},
{
"epoch": 6.295585412667946,
"grad_norm": 0.1591753512620926,
"learning_rate": 1.0251113999421935e-05,
"loss": 0.0017,
"num_input_tokens_seen": 310861568,
"step": 1640
},
{
"epoch": 6.314779270633397,
"grad_norm": 0.1489296555519104,
"learning_rate": 1.0115943347186826e-05,
"loss": 0.0015,
"num_input_tokens_seen": 311800064,
"step": 1645
},
{
"epoch": 6.333973128598848,
"grad_norm": 0.1340964287519455,
"learning_rate": 9.981443394050525e-06,
"loss": 0.0014,
"num_input_tokens_seen": 312742656,
"step": 1650
},
{
"epoch": 6.3531669865642995,
"grad_norm": 0.4749620258808136,
"learning_rate": 9.847620200793343e-06,
"loss": 0.0016,
"num_input_tokens_seen": 313683840,
"step": 1655
},
{
"epoch": 6.372360844529751,
"grad_norm": 0.27900460362434387,
"learning_rate": 9.714479797699694e-06,
"loss": 0.0015,
"num_input_tokens_seen": 314630400,
"step": 1660
},
{
"epoch": 6.391554702495202,
"grad_norm": 0.15642359852790833,
"learning_rate": 9.582028184286423e-06,
"loss": 0.0016,
"num_input_tokens_seen": 315612544,
"step": 1665
},
{
"epoch": 6.410748560460653,
"grad_norm": 0.2352278232574463,
"learning_rate": 9.450271329032404e-06,
"loss": 0.0016,
"num_input_tokens_seen": 316564224,
"step": 1670
},
{
"epoch": 6.429942418426104,
"grad_norm": 0.2565127909183502,
"learning_rate": 9.3192151691096e-06,
"loss": 0.0018,
"num_input_tokens_seen": 317537024,
"step": 1675
},
{
"epoch": 6.449136276391554,
"grad_norm": 0.23012320697307587,
"learning_rate": 9.18886561011557e-06,
"loss": 0.0016,
"num_input_tokens_seen": 318482944,
"step": 1680
},
{
"epoch": 6.468330134357005,
"grad_norm": 0.15872737765312195,
"learning_rate": 9.059228525807296e-06,
"loss": 0.0015,
"num_input_tokens_seen": 319438848,
"step": 1685
},
{
"epoch": 6.487523992322457,
"grad_norm": 0.1375139206647873,
"learning_rate": 8.930309757836517e-06,
"loss": 0.0016,
"num_input_tokens_seen": 320388736,
"step": 1690
},
{
"epoch": 6.506717850287908,
"grad_norm": 0.12399590760469437,
"learning_rate": 8.802115115486535e-06,
"loss": 0.0013,
"num_input_tokens_seen": 321354880,
"step": 1695
},
{
"epoch": 6.525911708253359,
"grad_norm": 0.23495157063007355,
"learning_rate": 8.67465037541038e-06,
"loss": 0.0016,
"num_input_tokens_seen": 322301952,
"step": 1700
},
{
"epoch": 6.54510556621881,
"grad_norm": 0.15574663877487183,
"learning_rate": 8.54792128137053e-06,
"loss": 0.0014,
"num_input_tokens_seen": 323246208,
"step": 1705
},
{
"epoch": 6.564299424184261,
"grad_norm": 0.1617659628391266,
"learning_rate": 8.421933543980126e-06,
"loss": 0.0015,
"num_input_tokens_seen": 324191616,
"step": 1710
},
{
"epoch": 6.583493282149712,
"grad_norm": 0.14752747118473053,
"learning_rate": 8.29669284044557e-06,
"loss": 0.0017,
"num_input_tokens_seen": 325125888,
"step": 1715
},
{
"epoch": 6.6026871401151634,
"grad_norm": 0.2986501455307007,
"learning_rate": 8.172204814310742e-06,
"loss": 0.0015,
"num_input_tokens_seen": 326070784,
"step": 1720
},
{
"epoch": 6.621880998080615,
"grad_norm": 0.23812003433704376,
"learning_rate": 8.048475075202727e-06,
"loss": 0.0017,
"num_input_tokens_seen": 326995712,
"step": 1725
},
{
"epoch": 6.641074856046066,
"grad_norm": 0.13329505920410156,
"learning_rate": 7.92550919857896e-06,
"loss": 0.0019,
"num_input_tokens_seen": 327954816,
"step": 1730
},
{
"epoch": 6.660268714011516,
"grad_norm": 0.10942558199167252,
"learning_rate": 7.803312725476031e-06,
"loss": 0.0016,
"num_input_tokens_seen": 328909184,
"step": 1735
},
{
"epoch": 6.679462571976967,
"grad_norm": 0.10470914840698242,
"learning_rate": 7.681891162260015e-06,
"loss": 0.0015,
"num_input_tokens_seen": 329862144,
"step": 1740
},
{
"epoch": 6.698656429942418,
"grad_norm": 0.2065214365720749,
"learning_rate": 7.561249980378301e-06,
"loss": 0.0018,
"num_input_tokens_seen": 330812544,
"step": 1745
},
{
"epoch": 6.717850287907869,
"grad_norm": 0.2046762853860855,
"learning_rate": 7.441394616113062e-06,
"loss": 0.0016,
"num_input_tokens_seen": 331769216,
"step": 1750
},
{
"epoch": 6.737044145873321,
"grad_norm": 0.17153848707675934,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.0015,
"num_input_tokens_seen": 332703744,
"step": 1755
},
{
"epoch": 6.756238003838772,
"grad_norm": 0.11358804255723953,
"learning_rate": 7.20406290826649e-06,
"loss": 0.0015,
"num_input_tokens_seen": 333635712,
"step": 1760
},
{
"epoch": 6.775431861804223,
"grad_norm": 0.21972091495990753,
"learning_rate": 7.086597259226707e-06,
"loss": 0.0016,
"num_input_tokens_seen": 334579968,
"step": 1765
},
{
"epoch": 6.794625719769674,
"grad_norm": 0.16843383014202118,
"learning_rate": 6.969938816404639e-06,
"loss": 0.0017,
"num_input_tokens_seen": 335506304,
"step": 1770
},
{
"epoch": 6.813819577735125,
"grad_norm": 0.155124694108963,
"learning_rate": 6.854092836613948e-06,
"loss": 0.0019,
"num_input_tokens_seen": 336457856,
"step": 1775
},
{
"epoch": 6.833013435700575,
"grad_norm": 0.11480195820331573,
"learning_rate": 6.739064540057424e-06,
"loss": 0.0015,
"num_input_tokens_seen": 337391616,
"step": 1780
},
{
"epoch": 6.8522072936660265,
"grad_norm": 0.13995982706546783,
"learning_rate": 6.624859110091791e-06,
"loss": 0.0019,
"num_input_tokens_seen": 338349568,
"step": 1785
},
{
"epoch": 6.871401151631478,
"grad_norm": 0.2626807689666748,
"learning_rate": 6.511481692994076e-06,
"loss": 0.0017,
"num_input_tokens_seen": 339293440,
"step": 1790
},
{
"epoch": 6.890595009596929,
"grad_norm": 0.21995946764945984,
"learning_rate": 6.3989373977297315e-06,
"loss": 0.0012,
"num_input_tokens_seen": 340271360,
"step": 1795
},
{
"epoch": 6.90978886756238,
"grad_norm": 0.11761938780546188,
"learning_rate": 6.28723129572247e-06,
"loss": 0.0014,
"num_input_tokens_seen": 341216384,
"step": 1800
},
{
"epoch": 6.928982725527831,
"grad_norm": 0.14595161378383636,
"learning_rate": 6.1763684206256525e-06,
"loss": 0.0014,
"num_input_tokens_seen": 342183808,
"step": 1805
},
{
"epoch": 6.948176583493282,
"grad_norm": 0.1292518526315689,
"learning_rate": 6.066353768095504e-06,
"loss": 0.0014,
"num_input_tokens_seen": 343123712,
"step": 1810
},
{
"epoch": 6.967370441458733,
"grad_norm": 0.19953125715255737,
"learning_rate": 5.957192295566022e-06,
"loss": 0.0014,
"num_input_tokens_seen": 344062976,
"step": 1815
},
{
"epoch": 6.9865642994241846,
"grad_norm": 0.17268332839012146,
"learning_rate": 5.848888922025553e-06,
"loss": 0.0015,
"num_input_tokens_seen": 345019904,
"step": 1820
},
{
"epoch": 7.005758157389636,
"grad_norm": 0.0566362664103508,
"learning_rate": 5.741448527795137e-06,
"loss": 0.0011,
"num_input_tokens_seen": 345976320,
"step": 1825
},
{
"epoch": 7.024952015355086,
"grad_norm": 0.07774636894464493,
"learning_rate": 5.634875954308638e-06,
"loss": 0.0007,
"num_input_tokens_seen": 346933120,
"step": 1830
},
{
"epoch": 7.044145873320537,
"grad_norm": 0.09574442356824875,
"learning_rate": 5.52917600389451e-06,
"loss": 0.0008,
"num_input_tokens_seen": 347883392,
"step": 1835
},
{
"epoch": 7.063339731285988,
"grad_norm": 0.04345453530550003,
"learning_rate": 5.424353439559446e-06,
"loss": 0.0007,
"num_input_tokens_seen": 348815616,
"step": 1840
},
{
"epoch": 7.082533589251439,
"grad_norm": 0.08021701127290726,
"learning_rate": 5.320412984773748e-06,
"loss": 0.0007,
"num_input_tokens_seen": 349761152,
"step": 1845
},
{
"epoch": 7.1017274472168905,
"grad_norm": 0.04693225026130676,
"learning_rate": 5.217359323258459e-06,
"loss": 0.0007,
"num_input_tokens_seen": 350714880,
"step": 1850
},
{
"epoch": 7.120921305182342,
"grad_norm": 0.041134320199489594,
"learning_rate": 5.115197098774302e-06,
"loss": 0.0007,
"num_input_tokens_seen": 351676544,
"step": 1855
},
{
"epoch": 7.140115163147793,
"grad_norm": 0.06960324198007584,
"learning_rate": 5.013930914912476e-06,
"loss": 0.0007,
"num_input_tokens_seen": 352604672,
"step": 1860
},
{
"epoch": 7.159309021113244,
"grad_norm": 0.057376183569431305,
"learning_rate": 4.913565334887135e-06,
"loss": 0.0007,
"num_input_tokens_seen": 353552640,
"step": 1865
},
{
"epoch": 7.178502879078695,
"grad_norm": 0.04041101410984993,
"learning_rate": 4.814104881329828e-06,
"loss": 0.0006,
"num_input_tokens_seen": 354511360,
"step": 1870
},
{
"epoch": 7.197696737044146,
"grad_norm": 0.052469249814748764,
"learning_rate": 4.715554036085673e-06,
"loss": 0.0007,
"num_input_tokens_seen": 355478144,
"step": 1875
},
{
"epoch": 7.2168905950095965,
"grad_norm": 0.11581304669380188,
"learning_rate": 4.617917240011394e-06,
"loss": 0.0006,
"num_input_tokens_seen": 356424448,
"step": 1880
},
{
"epoch": 7.236084452975048,
"grad_norm": 0.04358832538127899,
"learning_rate": 4.521198892775203e-06,
"loss": 0.0006,
"num_input_tokens_seen": 357374208,
"step": 1885
},
{
"epoch": 7.255278310940499,
"grad_norm": 0.04860702529549599,
"learning_rate": 4.425403352658591e-06,
"loss": 0.0006,
"num_input_tokens_seen": 358315392,
"step": 1890
},
{
"epoch": 7.27447216890595,
"grad_norm": 0.05813557654619217,
"learning_rate": 4.330534936359873e-06,
"loss": 0.0007,
"num_input_tokens_seen": 359280384,
"step": 1895
},
{
"epoch": 7.293666026871401,
"grad_norm": 0.08444052934646606,
"learning_rate": 4.236597918799709e-06,
"loss": 0.0007,
"num_input_tokens_seen": 360221440,
"step": 1900
},
{
"epoch": 7.312859884836852,
"grad_norm": 0.09866166114807129,
"learning_rate": 4.143596532928468e-06,
"loss": 0.0006,
"num_input_tokens_seen": 361175936,
"step": 1905
},
{
"epoch": 7.332053742802303,
"grad_norm": 0.039118677377700806,
"learning_rate": 4.051534969535472e-06,
"loss": 0.0006,
"num_input_tokens_seen": 362113280,
"step": 1910
},
{
"epoch": 7.3512476007677545,
"grad_norm": 0.08323455601930618,
"learning_rate": 3.960417377060152e-06,
"loss": 0.0006,
"num_input_tokens_seen": 363056512,
"step": 1915
},
{
"epoch": 7.370441458733206,
"grad_norm": 0.03620649501681328,
"learning_rate": 3.8702478614051355e-06,
"loss": 0.0006,
"num_input_tokens_seen": 364026752,
"step": 1920
},
{
"epoch": 7.389635316698657,
"grad_norm": 0.06456654518842697,
"learning_rate": 3.7810304857511914e-06,
"loss": 0.0006,
"num_input_tokens_seen": 364979456,
"step": 1925
},
{
"epoch": 7.408829174664108,
"grad_norm": 0.057450417429208755,
"learning_rate": 3.6927692703741634e-06,
"loss": 0.0006,
"num_input_tokens_seen": 365919488,
"step": 1930
},
{
"epoch": 7.428023032629558,
"grad_norm": 0.10976872593164444,
"learning_rate": 3.605468192463815e-06,
"loss": 0.0006,
"num_input_tokens_seen": 366871552,
"step": 1935
},
{
"epoch": 7.447216890595009,
"grad_norm": 0.04592859372496605,
"learning_rate": 3.5191311859445796e-06,
"loss": 0.0006,
"num_input_tokens_seen": 367824768,
"step": 1940
},
{
"epoch": 7.4664107485604605,
"grad_norm": 0.08413061499595642,
"learning_rate": 3.4337621412983274e-06,
"loss": 0.0007,
"num_input_tokens_seen": 368776704,
"step": 1945
},
{
"epoch": 7.485604606525912,
"grad_norm": 0.06421375274658203,
"learning_rate": 3.3493649053890326e-06,
"loss": 0.0006,
"num_input_tokens_seen": 369726848,
"step": 1950
},
{
"epoch": 7.504798464491363,
"grad_norm": 0.04606764018535614,
"learning_rate": 3.2659432812894296e-06,
"loss": 0.0007,
"num_input_tokens_seen": 370669184,
"step": 1955
},
{
"epoch": 7.523992322456814,
"grad_norm": 0.14935247600078583,
"learning_rate": 3.183501028109642e-06,
"loss": 0.001,
"num_input_tokens_seen": 371619072,
"step": 1960
},
{
"epoch": 7.543186180422265,
"grad_norm": 0.03758896514773369,
"learning_rate": 3.1020418608278035e-06,
"loss": 0.0006,
"num_input_tokens_seen": 372591104,
"step": 1965
},
{
"epoch": 7.562380038387716,
"grad_norm": 0.08971104770898819,
"learning_rate": 3.0215694501226384e-06,
"loss": 0.0006,
"num_input_tokens_seen": 373540352,
"step": 1970
},
{
"epoch": 7.581573896353167,
"grad_norm": 0.049786727875471115,
"learning_rate": 2.942087422208051e-06,
"loss": 0.0006,
"num_input_tokens_seen": 374494336,
"step": 1975
},
{
"epoch": 7.600767754318618,
"grad_norm": 0.14331580698490143,
"learning_rate": 2.8635993586697553e-06,
"loss": 0.0007,
"num_input_tokens_seen": 375443968,
"step": 1980
},
{
"epoch": 7.619961612284069,
"grad_norm": 0.1280643343925476,
"learning_rate": 2.7861087963038435e-06,
"loss": 0.0007,
"num_input_tokens_seen": 376398848,
"step": 1985
},
{
"epoch": 7.63915547024952,
"grad_norm": 0.07448034733533859,
"learning_rate": 2.70961922695743e-06,
"loss": 0.0006,
"num_input_tokens_seen": 377363072,
"step": 1990
},
{
"epoch": 7.658349328214971,
"grad_norm": 0.047854091972112656,
"learning_rate": 2.6341340973713187e-06,
"loss": 0.0005,
"num_input_tokens_seen": 378285440,
"step": 1995
},
{
"epoch": 7.677543186180422,
"grad_norm": 0.05751164257526398,
"learning_rate": 2.5596568090246548e-06,
"loss": 0.0006,
"num_input_tokens_seen": 379227776,
"step": 2000
},
{
"epoch": 7.696737044145873,
"grad_norm": 0.09036415070295334,
"learning_rate": 2.486190717981665e-06,
"loss": 0.0008,
"num_input_tokens_seen": 380168064,
"step": 2005
},
{
"epoch": 7.7159309021113245,
"grad_norm": 0.0800870880484581,
"learning_rate": 2.4137391347404476e-06,
"loss": 0.0007,
"num_input_tokens_seen": 381124736,
"step": 2010
},
{
"epoch": 7.735124760076776,
"grad_norm": 0.055397044867277145,
"learning_rate": 2.3423053240837515e-06,
"loss": 0.0007,
"num_input_tokens_seen": 382078592,
"step": 2015
},
{
"epoch": 7.754318618042227,
"grad_norm": 0.06471221148967743,
"learning_rate": 2.271892504931905e-06,
"loss": 0.0006,
"num_input_tokens_seen": 383012224,
"step": 2020
},
{
"epoch": 7.773512476007678,
"grad_norm": 0.05552973225712776,
"learning_rate": 2.2025038501977486e-06,
"loss": 0.0007,
"num_input_tokens_seen": 383955328,
"step": 2025
},
{
"epoch": 7.792706333973129,
"grad_norm": 0.05302416905760765,
"learning_rate": 2.1341424866436364e-06,
"loss": 0.0006,
"num_input_tokens_seen": 384918528,
"step": 2030
},
{
"epoch": 7.811900191938579,
"grad_norm": 0.04085630923509598,
"learning_rate": 2.0668114947405726e-06,
"loss": 0.0006,
"num_input_tokens_seen": 385865472,
"step": 2035
},
{
"epoch": 7.8310940499040305,
"grad_norm": 0.11027319729328156,
"learning_rate": 2.0005139085293945e-06,
"loss": 0.0007,
"num_input_tokens_seen": 386815488,
"step": 2040
},
{
"epoch": 7.850287907869482,
"grad_norm": 0.10259139537811279,
"learning_rate": 1.9352527154840345e-06,
"loss": 0.0006,
"num_input_tokens_seen": 387746176,
"step": 2045
},
{
"epoch": 7.869481765834933,
"grad_norm": 0.08809423446655273,
"learning_rate": 1.8710308563769124e-06,
"loss": 0.0006,
"num_input_tokens_seen": 388682752,
"step": 2050
},
{
"epoch": 7.888675623800384,
"grad_norm": 0.04966364800930023,
"learning_rate": 1.8078512251464286e-06,
"loss": 0.0006,
"num_input_tokens_seen": 389629056,
"step": 2055
},
{
"epoch": 7.907869481765835,
"grad_norm": 0.17464539408683777,
"learning_rate": 1.7457166687665449e-06,
"loss": 0.0008,
"num_input_tokens_seen": 390563584,
"step": 2060
},
{
"epoch": 7.927063339731286,
"grad_norm": 0.04215671867132187,
"learning_rate": 1.684629987118494e-06,
"loss": 0.0006,
"num_input_tokens_seen": 391511808,
"step": 2065
},
{
"epoch": 7.946257197696737,
"grad_norm": 0.07717634737491608,
"learning_rate": 1.624593932864632e-06,
"loss": 0.0007,
"num_input_tokens_seen": 392460032,
"step": 2070
},
{
"epoch": 7.9654510556621885,
"grad_norm": 0.04429319128394127,
"learning_rate": 1.5656112113243721e-06,
"loss": 0.0006,
"num_input_tokens_seen": 393403264,
"step": 2075
},
{
"epoch": 7.984644913627639,
"grad_norm": 0.06878451257944107,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.0006,
"num_input_tokens_seen": 394339584,
"step": 2080
},
{
"epoch": 8.00383877159309,
"grad_norm": 0.05545974150300026,
"learning_rate": 1.4508163502183786e-06,
"loss": 0.0006,
"num_input_tokens_seen": 395291264,
"step": 2085
},
{
"epoch": 8.023032629558541,
"grad_norm": 0.07342655211687088,
"learning_rate": 1.3950093834903866e-06,
"loss": 0.0005,
"num_input_tokens_seen": 396246272,
"step": 2090
},
{
"epoch": 8.042226487523992,
"grad_norm": 0.030074596405029297,
"learning_rate": 1.340266094918366e-06,
"loss": 0.0005,
"num_input_tokens_seen": 397189376,
"step": 2095
},
{
"epoch": 8.061420345489443,
"grad_norm": 0.0349307544529438,
"learning_rate": 1.286588951321363e-06,
"loss": 0.0004,
"num_input_tokens_seen": 398128000,
"step": 2100
}
],
"logging_steps": 5,
"max_steps": 2340,
"num_input_tokens_seen": 398128000,
"num_train_epochs": 9,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.201849892590846e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}