{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.061420345489443,
  "eval_steps": 500,
  "global_step": 2100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019193857965451054,
      "grad_norm": 1.0817455053329468,
      "learning_rate": 4.9999436730259053e-05,
      "loss": 0.2614,
      "num_input_tokens_seen": 933376,
      "step": 5
    },
    {
      "epoch": 0.03838771593090211,
      "grad_norm": 0.6683880686759949,
      "learning_rate": 4.999774694641803e-05,
      "loss": 0.1894,
      "num_input_tokens_seen": 1884800,
      "step": 10
    },
    {
      "epoch": 0.05758157389635317,
      "grad_norm": 0.6495689153671265,
      "learning_rate": 4.999493072462126e-05,
      "loss": 0.1836,
      "num_input_tokens_seen": 2847104,
      "step": 15
    },
    {
      "epoch": 0.07677543186180422,
      "grad_norm": 0.5534822940826416,
      "learning_rate": 4.999098819177214e-05,
      "loss": 0.1759,
      "num_input_tokens_seen": 3780480,
      "step": 20
    },
    {
      "epoch": 0.09596928982725528,
      "grad_norm": 0.5017605423927307,
      "learning_rate": 4.9985919525527434e-05,
      "loss": 0.1705,
      "num_input_tokens_seen": 4742272,
      "step": 25
    },
    {
      "epoch": 0.11516314779270634,
      "grad_norm": 0.4681239128112793,
      "learning_rate": 4.9979724954289244e-05,
      "loss": 0.1722,
      "num_input_tokens_seen": 5687424,
      "step": 30
    },
    {
      "epoch": 0.1343570057581574,
      "grad_norm": 0.48403823375701904,
      "learning_rate": 4.9972404757194736e-05,
      "loss": 0.1696,
      "num_input_tokens_seen": 6600832,
      "step": 35
    },
    {
      "epoch": 0.15355086372360843,
      "grad_norm": 0.48744267225265503,
      "learning_rate": 4.9963959264103544e-05,
      "loss": 0.1638,
      "num_input_tokens_seen": 7545344,
      "step": 40
    },
    {
      "epoch": 0.1727447216890595,
      "grad_norm": 0.4389607608318329,
      "learning_rate": 4.995438885558294e-05,
      "loss": 0.1606,
      "num_input_tokens_seen": 8489728,
      "step": 45
    },
    {
      "epoch": 0.19193857965451055,
      "grad_norm": 0.4652620851993561,
      "learning_rate": 4.994369396289063e-05,
      "loss": 0.1606,
      "num_input_tokens_seen": 9426304,
      "step": 50
    },
    {
      "epoch": 0.21113243761996162,
      "grad_norm": 0.48464274406433105,
      "learning_rate": 4.993187506795538e-05,
      "loss": 0.1661,
      "num_input_tokens_seen": 10372608,
      "step": 55
    },
    {
      "epoch": 0.23032629558541268,
      "grad_norm": 0.4516374468803406,
      "learning_rate": 4.9918932703355256e-05,
      "loss": 0.1629,
      "num_input_tokens_seen": 11298944,
      "step": 60
    },
    {
      "epoch": 0.2495201535508637,
      "grad_norm": 0.44246649742126465,
      "learning_rate": 4.990486745229364e-05,
      "loss": 0.1573,
      "num_input_tokens_seen": 12230784,
      "step": 65
    },
    {
      "epoch": 0.2687140115163148,
      "grad_norm": 0.4362417459487915,
      "learning_rate": 4.9889679948572974e-05,
      "loss": 0.1548,
      "num_input_tokens_seen": 13177856,
      "step": 70
    },
    {
      "epoch": 0.28790786948176583,
      "grad_norm": 0.46021828055381775,
      "learning_rate": 4.987337087656614e-05,
      "loss": 0.1536,
      "num_input_tokens_seen": 14148864,
      "step": 75
    },
    {
      "epoch": 0.30710172744721687,
      "grad_norm": 0.43709027767181396,
      "learning_rate": 4.98559409711857e-05,
      "loss": 0.1615,
      "num_input_tokens_seen": 15136256,
      "step": 80
    },
    {
      "epoch": 0.32629558541266795,
      "grad_norm": 0.4331000745296478,
      "learning_rate": 4.983739101785071e-05,
      "loss": 0.1593,
      "num_input_tokens_seen": 16088064,
      "step": 85
    },
    {
      "epoch": 0.345489443378119,
      "grad_norm": 0.40528881549835205,
      "learning_rate": 4.981772185245135e-05,
      "loss": 0.1509,
      "num_input_tokens_seen": 17043840,
      "step": 90
    },
    {
      "epoch": 0.3646833013435701,
      "grad_norm": 0.42082345485687256,
      "learning_rate": 4.97969343613113e-05,
      "loss": 0.1518,
      "num_input_tokens_seen": 17986816,
      "step": 95
    },
    {
      "epoch": 0.3838771593090211,
      "grad_norm": 0.4149463176727295,
      "learning_rate": 4.977502948114772e-05,
      "loss": 0.1558,
      "num_input_tokens_seen": 18904192,
      "step": 100
    },
    {
      "epoch": 0.40307101727447214,
      "grad_norm": 0.41713905334472656,
      "learning_rate": 4.97520081990291e-05,
      "loss": 0.152,
      "num_input_tokens_seen": 19854720,
      "step": 105
    },
    {
      "epoch": 0.42226487523992323,
      "grad_norm": 0.4443305432796478,
      "learning_rate": 4.9727871552330794e-05,
      "loss": 0.15,
      "num_input_tokens_seen": 20807040,
      "step": 110
    },
    {
      "epoch": 0.44145873320537427,
      "grad_norm": 0.4278104603290558,
      "learning_rate": 4.97026206286882e-05,
      "loss": 0.1563,
      "num_input_tokens_seen": 21742080,
      "step": 115
    },
    {
      "epoch": 0.46065259117082535,
      "grad_norm": 0.41511863470077515,
      "learning_rate": 4.967625656594782e-05,
      "loss": 0.1545,
      "num_input_tokens_seen": 22686464,
      "step": 120
    },
    {
      "epoch": 0.4798464491362764,
      "grad_norm": 0.42981916666030884,
      "learning_rate": 4.964878055211597e-05,
      "loss": 0.1483,
      "num_input_tokens_seen": 23654272,
      "step": 125
    },
    {
      "epoch": 0.4990403071017274,
      "grad_norm": 0.39455243945121765,
      "learning_rate": 4.962019382530521e-05,
      "loss": 0.15,
      "num_input_tokens_seen": 24606720,
      "step": 130
    },
    {
      "epoch": 0.5182341650671785,
      "grad_norm": 0.40956762433052063,
      "learning_rate": 4.959049767367859e-05,
      "loss": 0.1516,
      "num_input_tokens_seen": 25554944,
      "step": 135
    },
    {
      "epoch": 0.5374280230326296,
      "grad_norm": 0.39926445484161377,
      "learning_rate": 4.955969343539162e-05,
      "loss": 0.1521,
      "num_input_tokens_seen": 26515968,
      "step": 140
    },
    {
      "epoch": 0.5566218809980806,
      "grad_norm": 0.4021078050136566,
      "learning_rate": 4.9527782498531915e-05,
      "loss": 0.1535,
      "num_input_tokens_seen": 27450112,
      "step": 145
    },
    {
      "epoch": 0.5758157389635317,
      "grad_norm": 0.40622368454933167,
      "learning_rate": 4.949476630105669e-05,
      "loss": 0.1521,
      "num_input_tokens_seen": 28398592,
      "step": 150
    },
    {
      "epoch": 0.5950095969289827,
      "grad_norm": 0.4052083194255829,
      "learning_rate": 4.946064633072795e-05,
      "loss": 0.1498,
      "num_input_tokens_seen": 29364864,
      "step": 155
    },
    {
      "epoch": 0.6142034548944337,
      "grad_norm": 0.39543649554252625,
      "learning_rate": 4.942542412504543e-05,
      "loss": 0.1499,
      "num_input_tokens_seen": 30316928,
      "step": 160
    },
    {
      "epoch": 0.6333973128598849,
      "grad_norm": 0.37419360876083374,
      "learning_rate": 4.9389101271177355e-05,
      "loss": 0.1479,
      "num_input_tokens_seen": 31287680,
      "step": 165
    },
    {
      "epoch": 0.6525911708253359,
      "grad_norm": 0.40969106554985046,
      "learning_rate": 4.935167940588887e-05,
      "loss": 0.1506,
      "num_input_tokens_seen": 32245888,
      "step": 170
    },
    {
      "epoch": 0.6717850287907869,
      "grad_norm": 0.39717161655426025,
      "learning_rate": 4.9313160215468334e-05,
      "loss": 0.1451,
      "num_input_tokens_seen": 33194368,
      "step": 175
    },
    {
      "epoch": 0.690978886756238,
      "grad_norm": 0.3813941478729248,
      "learning_rate": 4.92735454356513e-05,
      "loss": 0.1378,
      "num_input_tokens_seen": 34140160,
      "step": 180
    },
    {
      "epoch": 0.710172744721689,
      "grad_norm": 0.3822474777698517,
      "learning_rate": 4.923283685154231e-05,
      "loss": 0.1417,
      "num_input_tokens_seen": 35078528,
      "step": 185
    },
    {
      "epoch": 0.7293666026871402,
      "grad_norm": 0.4150351583957672,
      "learning_rate": 4.9191036297534454e-05,
      "loss": 0.1476,
      "num_input_tokens_seen": 36013696,
      "step": 190
    },
    {
      "epoch": 0.7485604606525912,
      "grad_norm": 0.3787771463394165,
      "learning_rate": 4.914814565722671e-05,
      "loss": 0.1414,
      "num_input_tokens_seen": 36973056,
      "step": 195
    },
    {
      "epoch": 0.7677543186180422,
      "grad_norm": 0.4316699504852295,
      "learning_rate": 4.910416686333906e-05,
      "loss": 0.1447,
      "num_input_tokens_seen": 37904256,
      "step": 200
    },
    {
      "epoch": 0.7869481765834933,
      "grad_norm": 0.38349735736846924,
      "learning_rate": 4.905910189762542e-05,
      "loss": 0.1417,
      "num_input_tokens_seen": 38858240,
      "step": 205
    },
    {
      "epoch": 0.8061420345489443,
      "grad_norm": 0.41351762413978577,
      "learning_rate": 4.901295279078431e-05,
      "loss": 0.1435,
      "num_input_tokens_seen": 39800192,
      "step": 210
    },
    {
      "epoch": 0.8253358925143954,
      "grad_norm": 0.36677101254463196,
      "learning_rate": 4.896572162236737e-05,
      "loss": 0.1454,
      "num_input_tokens_seen": 40736128,
      "step": 215
    },
    {
      "epoch": 0.8445297504798465,
      "grad_norm": 0.4057168662548065,
      "learning_rate": 4.8917410520685635e-05,
      "loss": 0.1414,
      "num_input_tokens_seen": 41699072,
      "step": 220
    },
    {
      "epoch": 0.8637236084452975,
      "grad_norm": 0.3790329396724701,
      "learning_rate": 4.886802166271364e-05,
      "loss": 0.1452,
      "num_input_tokens_seen": 42651648,
      "step": 225
    },
    {
      "epoch": 0.8829174664107485,
      "grad_norm": 0.37877827882766724,
      "learning_rate": 4.881755727399134e-05,
      "loss": 0.1408,
      "num_input_tokens_seen": 43584512,
      "step": 230
    },
    {
      "epoch": 0.9021113243761996,
      "grad_norm": 0.37050819396972656,
      "learning_rate": 4.8766019628523775e-05,
      "loss": 0.1342,
      "num_input_tokens_seen": 44552448,
      "step": 235
    },
    {
      "epoch": 0.9213051823416507,
      "grad_norm": 0.367374449968338,
      "learning_rate": 4.8713411048678635e-05,
      "loss": 0.142,
      "num_input_tokens_seen": 45498368,
      "step": 240
    },
    {
      "epoch": 0.9404990403071017,
      "grad_norm": 0.3566618859767914,
      "learning_rate": 4.8659733905081634e-05,
      "loss": 0.1398,
      "num_input_tokens_seen": 46445952,
      "step": 245
    },
    {
      "epoch": 0.9596928982725528,
      "grad_norm": 0.35429617762565613,
      "learning_rate": 4.8604990616509616e-05,
      "loss": 0.1345,
      "num_input_tokens_seen": 47391872,
      "step": 250
    },
    {
      "epoch": 0.9788867562380038,
      "grad_norm": 0.3834087550640106,
      "learning_rate": 4.8549183649781626e-05,
      "loss": 0.14,
      "num_input_tokens_seen": 48339584,
      "step": 255
    },
    {
      "epoch": 0.9980806142034548,
      "grad_norm": 0.36162251234054565,
      "learning_rate": 4.849231551964771e-05,
      "loss": 0.1376,
      "num_input_tokens_seen": 49294848,
      "step": 260
    },
    {
      "epoch": 1.017274472168906,
      "grad_norm": 0.3323320746421814,
      "learning_rate": 4.8434388788675635e-05,
      "loss": 0.1149,
      "num_input_tokens_seen": 50247296,
      "step": 265
    },
    {
      "epoch": 1.036468330134357,
      "grad_norm": 0.36334532499313354,
      "learning_rate": 4.837540606713538e-05,
      "loss": 0.1119,
      "num_input_tokens_seen": 51193472,
      "step": 270
    },
    {
      "epoch": 1.055662188099808,
      "grad_norm": 0.37446385622024536,
      "learning_rate": 4.8315370012881514e-05,
      "loss": 0.1125,
      "num_input_tokens_seen": 52144000,
      "step": 275
    },
    {
      "epoch": 1.0748560460652592,
      "grad_norm": 0.4256220757961273,
      "learning_rate": 4.8254283331233464e-05,
      "loss": 0.1083,
      "num_input_tokens_seen": 53097216,
      "step": 280
    },
    {
      "epoch": 1.0940499040307101,
      "grad_norm": 0.39395830035209656,
      "learning_rate": 4.819214877485358e-05,
      "loss": 0.1083,
      "num_input_tokens_seen": 54032896,
      "step": 285
    },
    {
      "epoch": 1.1132437619961613,
      "grad_norm": 0.3702699542045593,
      "learning_rate": 4.812896914362309e-05,
      "loss": 0.1092,
      "num_input_tokens_seen": 54988800,
      "step": 290
    },
    {
      "epoch": 1.1324376199616122,
      "grad_norm": 0.389553040266037,
      "learning_rate": 4.806474728451597e-05,
      "loss": 0.1057,
      "num_input_tokens_seen": 55947520,
      "step": 295
    },
    {
      "epoch": 1.1516314779270633,
      "grad_norm": 0.37560170888900757,
      "learning_rate": 4.799948609147061e-05,
      "loss": 0.108,
      "num_input_tokens_seen": 56864384,
      "step": 300
    },
    {
      "epoch": 1.1708253358925145,
      "grad_norm": 0.3832947611808777,
      "learning_rate": 4.793318850525943e-05,
      "loss": 0.1092,
      "num_input_tokens_seen": 57816960,
      "step": 305
    },
    {
      "epoch": 1.1900191938579654,
      "grad_norm": 0.41275399923324585,
      "learning_rate": 4.786585751335637e-05,
      "loss": 0.1076,
      "num_input_tokens_seen": 58773376,
      "step": 310
    },
    {
      "epoch": 1.2092130518234165,
      "grad_norm": 0.4026075601577759,
      "learning_rate": 4.7797496149802256e-05,
      "loss": 0.1061,
      "num_input_tokens_seen": 59710592,
      "step": 315
    },
    {
      "epoch": 1.2284069097888675,
      "grad_norm": 0.4040988087654114,
      "learning_rate": 4.77281074950681e-05,
      "loss": 0.1073,
      "num_input_tokens_seen": 60660352,
      "step": 320
    },
    {
      "epoch": 1.2476007677543186,
      "grad_norm": 0.40489524602890015,
      "learning_rate": 4.765769467591625e-05,
      "loss": 0.1085,
      "num_input_tokens_seen": 61600256,
      "step": 325
    },
    {
      "epoch": 1.2667946257197698,
      "grad_norm": 0.40675088763237,
      "learning_rate": 4.758626086525956e-05,
      "loss": 0.1149,
      "num_input_tokens_seen": 62558720,
      "step": 330
    },
    {
      "epoch": 1.2859884836852207,
      "grad_norm": 0.4014970362186432,
      "learning_rate": 4.751380928201834e-05,
      "loss": 0.1101,
      "num_input_tokens_seen": 63505152,
      "step": 335
    },
    {
      "epoch": 1.3051823416506718,
      "grad_norm": 0.3987894654273987,
      "learning_rate": 4.744034319097535e-05,
      "loss": 0.1113,
      "num_input_tokens_seen": 64475392,
      "step": 340
    },
    {
      "epoch": 1.3243761996161227,
      "grad_norm": 0.3988923728466034,
      "learning_rate": 4.7365865902628684e-05,
      "loss": 0.1065,
      "num_input_tokens_seen": 65412352,
      "step": 345
    },
    {
      "epoch": 1.3435700575815739,
      "grad_norm": 0.40686464309692383,
      "learning_rate": 4.7290380773042575e-05,
      "loss": 0.1058,
      "num_input_tokens_seen": 66358272,
      "step": 350
    },
    {
      "epoch": 1.362763915547025,
      "grad_norm": 0.4416196048259735,
      "learning_rate": 4.7213891203696164e-05,
      "loss": 0.1068,
      "num_input_tokens_seen": 67310208,
      "step": 355
    },
    {
      "epoch": 1.381957773512476,
      "grad_norm": 0.38802987337112427,
      "learning_rate": 4.713640064133025e-05,
      "loss": 0.1119,
      "num_input_tokens_seen": 68256256,
      "step": 360
    },
    {
      "epoch": 1.401151631477927,
      "grad_norm": 0.37979230284690857,
      "learning_rate": 4.705791257779195e-05,
      "loss": 0.1087,
      "num_input_tokens_seen": 69200896,
      "step": 365
    },
    {
      "epoch": 1.420345489443378,
      "grad_norm": 0.40544721484184265,
      "learning_rate": 4.697843054987737e-05,
      "loss": 0.1101,
      "num_input_tokens_seen": 70129792,
      "step": 370
    },
    {
      "epoch": 1.4395393474088292,
      "grad_norm": 0.39788997173309326,
      "learning_rate": 4.68979581391722e-05,
      "loss": 0.1117,
      "num_input_tokens_seen": 71074944,
      "step": 375
    },
    {
      "epoch": 1.4587332053742803,
      "grad_norm": 0.40104883909225464,
      "learning_rate": 4.681649897189036e-05,
      "loss": 0.1067,
      "num_input_tokens_seen": 72009088,
      "step": 380
    },
    {
      "epoch": 1.4779270633397312,
      "grad_norm": 0.4051682949066162,
      "learning_rate": 4.673405671871057e-05,
      "loss": 0.1106,
      "num_input_tokens_seen": 72954880,
      "step": 385
    },
    {
      "epoch": 1.4971209213051824,
      "grad_norm": 0.40561723709106445,
      "learning_rate": 4.665063509461097e-05,
      "loss": 0.108,
      "num_input_tokens_seen": 73890176,
      "step": 390
    },
    {
      "epoch": 1.5163147792706333,
      "grad_norm": 0.4338354468345642,
      "learning_rate": 4.656623785870167e-05,
      "loss": 0.1093,
      "num_input_tokens_seen": 74851840,
      "step": 395
    },
    {
      "epoch": 1.5355086372360844,
      "grad_norm": 0.40794938802719116,
      "learning_rate": 4.6480868814055424e-05,
      "loss": 0.1071,
      "num_input_tokens_seen": 75794816,
      "step": 400
    },
    {
      "epoch": 1.5547024952015356,
      "grad_norm": 0.406972199678421,
      "learning_rate": 4.639453180753619e-05,
      "loss": 0.1092,
      "num_input_tokens_seen": 76736512,
      "step": 405
    },
    {
      "epoch": 1.5738963531669867,
      "grad_norm": 0.4144577383995056,
      "learning_rate": 4.630723072962584e-05,
      "loss": 0.1083,
      "num_input_tokens_seen": 77692288,
      "step": 410
    },
    {
      "epoch": 1.5930902111324377,
      "grad_norm": 0.44782555103302,
      "learning_rate": 4.6218969514248814e-05,
      "loss": 0.1068,
      "num_input_tokens_seen": 78654720,
      "step": 415
    },
    {
      "epoch": 1.6122840690978886,
      "grad_norm": 0.45688125491142273,
      "learning_rate": 4.6129752138594874e-05,
      "loss": 0.1083,
      "num_input_tokens_seen": 79607552,
      "step": 420
    },
    {
      "epoch": 1.6314779270633397,
      "grad_norm": 0.43751007318496704,
      "learning_rate": 4.6039582622939854e-05,
      "loss": 0.1087,
      "num_input_tokens_seen": 80547328,
      "step": 425
    },
    {
      "epoch": 1.6506717850287909,
      "grad_norm": 0.42292338609695435,
      "learning_rate": 4.5948465030464536e-05,
      "loss": 0.107,
      "num_input_tokens_seen": 81478400,
      "step": 430
    },
    {
      "epoch": 1.669865642994242,
      "grad_norm": 0.4191446006298065,
      "learning_rate": 4.5856403467071536e-05,
      "loss": 0.1084,
      "num_input_tokens_seen": 82441984,
      "step": 435
    },
    {
      "epoch": 1.689059500959693,
      "grad_norm": 0.397989958524704,
      "learning_rate": 4.5763402081200294e-05,
      "loss": 0.1069,
      "num_input_tokens_seen": 83382912,
      "step": 440
    },
    {
      "epoch": 1.7082533589251438,
      "grad_norm": 0.4078799784183502,
      "learning_rate": 4.566946506364013e-05,
      "loss": 0.1064,
      "num_input_tokens_seen": 84332672,
      "step": 445
    },
    {
      "epoch": 1.727447216890595,
      "grad_norm": 0.41216278076171875,
      "learning_rate": 4.557459664734141e-05,
      "loss": 0.1114,
      "num_input_tokens_seen": 85280128,
      "step": 450
    },
    {
      "epoch": 1.7466410748560461,
      "grad_norm": 0.43625858426094055,
      "learning_rate": 4.54788011072248e-05,
      "loss": 0.1069,
      "num_input_tokens_seen": 86221568,
      "step": 455
    },
    {
      "epoch": 1.7658349328214973,
      "grad_norm": 0.3984062075614929,
      "learning_rate": 4.538208275998861e-05,
      "loss": 0.107,
      "num_input_tokens_seen": 87159936,
      "step": 460
    },
    {
      "epoch": 1.7850287907869482,
      "grad_norm": 0.4033821225166321,
      "learning_rate": 4.528444596391433e-05,
      "loss": 0.1066,
      "num_input_tokens_seen": 88115712,
      "step": 465
    },
    {
      "epoch": 1.8042226487523991,
      "grad_norm": 0.40416958928108215,
      "learning_rate": 4.518589511867017e-05,
      "loss": 0.1054,
      "num_input_tokens_seen": 89069824,
      "step": 470
    },
    {
      "epoch": 1.8234165067178503,
      "grad_norm": 0.4133840501308441,
      "learning_rate": 4.5086434665112864e-05,
      "loss": 0.1055,
      "num_input_tokens_seen": 90018176,
      "step": 475
    },
    {
      "epoch": 1.8426103646833014,
      "grad_norm": 0.44508275389671326,
      "learning_rate": 4.498606908508754e-05,
      "loss": 0.1091,
      "num_input_tokens_seen": 90960128,
      "step": 480
    },
    {
      "epoch": 1.8618042226487526,
      "grad_norm": 0.4138317108154297,
      "learning_rate": 4.4884802901225695e-05,
      "loss": 0.106,
      "num_input_tokens_seen": 91911936,
      "step": 485
    },
    {
      "epoch": 1.8809980806142035,
      "grad_norm": 0.41188734769821167,
      "learning_rate": 4.478264067674155e-05,
      "loss": 0.106,
      "num_input_tokens_seen": 92849280,
      "step": 490
    },
    {
      "epoch": 1.9001919385796544,
      "grad_norm": 0.43868768215179443,
      "learning_rate": 4.4679587015226253e-05,
      "loss": 0.1065,
      "num_input_tokens_seen": 93810560,
      "step": 495
    },
    {
      "epoch": 1.9193857965451055,
      "grad_norm": 0.4238053560256958,
      "learning_rate": 4.457564656044056e-05,
      "loss": 0.1036,
      "num_input_tokens_seen": 94771328,
      "step": 500
    },
    {
      "epoch": 1.9385796545105567,
      "grad_norm": 0.4343940317630768,
      "learning_rate": 4.447082399610549e-05,
      "loss": 0.1039,
      "num_input_tokens_seen": 95733632,
      "step": 505
    },
    {
      "epoch": 1.9577735124760078,
      "grad_norm": 0.4396176040172577,
      "learning_rate": 4.436512404569136e-05,
      "loss": 0.1073,
      "num_input_tokens_seen": 96689280,
      "step": 510
    },
    {
      "epoch": 1.9769673704414588,
      "grad_norm": 0.4687812030315399,
      "learning_rate": 4.4258551472204865e-05,
      "loss": 0.1082,
      "num_input_tokens_seen": 97626112,
      "step": 515
    },
    {
      "epoch": 1.9961612284069097,
      "grad_norm": 0.4092023968696594,
      "learning_rate": 4.415111107797445e-05,
      "loss": 0.1049,
      "num_input_tokens_seen": 98561280,
      "step": 520
    },
    {
      "epoch": 2.015355086372361,
      "grad_norm": 0.3592652976512909,
      "learning_rate": 4.404280770443398e-05,
      "loss": 0.0829,
      "num_input_tokens_seen": 99516160,
      "step": 525
    },
    {
      "epoch": 2.034548944337812,
      "grad_norm": 0.4278601109981537,
      "learning_rate": 4.3933646231904504e-05,
      "loss": 0.0727,
      "num_input_tokens_seen": 100466048,
      "step": 530
    },
    {
      "epoch": 2.053742802303263,
      "grad_norm": 0.48623570799827576,
      "learning_rate": 4.3823631579374354e-05,
      "loss": 0.0706,
      "num_input_tokens_seen": 101422208,
      "step": 535
    },
    {
      "epoch": 2.072936660268714,
      "grad_norm": 0.4464566111564636,
      "learning_rate": 4.371276870427753e-05,
      "loss": 0.0683,
      "num_input_tokens_seen": 102365568,
      "step": 540
    },
    {
      "epoch": 2.092130518234165,
      "grad_norm": 0.48030439019203186,
      "learning_rate": 4.360106260227027e-05,
      "loss": 0.0708,
      "num_input_tokens_seen": 103312128,
      "step": 545
    },
    {
      "epoch": 2.111324376199616,
      "grad_norm": 0.4528995752334595,
      "learning_rate": 4.348851830700593e-05,
      "loss": 0.0693,
      "num_input_tokens_seen": 104258560,
      "step": 550
    },
    {
      "epoch": 2.1305182341650672,
      "grad_norm": 0.48248499631881714,
      "learning_rate": 4.337514088990822e-05,
      "loss": 0.067,
      "num_input_tokens_seen": 105199360,
      "step": 555
    },
    {
      "epoch": 2.1497120921305184,
      "grad_norm": 0.5198423266410828,
      "learning_rate": 4.3260935459942584e-05,
      "loss": 0.0701,
      "num_input_tokens_seen": 106155776,
      "step": 560
    },
    {
      "epoch": 2.168905950095969,
      "grad_norm": 0.47610777616500854,
      "learning_rate": 4.3145907163386064e-05,
      "loss": 0.0673,
      "num_input_tokens_seen": 107121792,
      "step": 565
    },
    {
      "epoch": 2.1880998080614202,
      "grad_norm": 0.4852938950061798,
      "learning_rate": 4.303006118359537e-05,
      "loss": 0.0667,
      "num_input_tokens_seen": 108049280,
      "step": 570
    },
    {
      "epoch": 2.2072936660268714,
      "grad_norm": 0.5004666447639465,
      "learning_rate": 4.2913402740773294e-05,
      "loss": 0.0708,
      "num_input_tokens_seen": 108994816,
      "step": 575
    },
    {
      "epoch": 2.2264875239923225,
      "grad_norm": 0.500302255153656,
      "learning_rate": 4.2795937091733515e-05,
      "loss": 0.073,
      "num_input_tokens_seen": 109929728,
      "step": 580
    },
    {
      "epoch": 2.2456813819577737,
      "grad_norm": 0.45096588134765625,
      "learning_rate": 4.267766952966369e-05,
      "loss": 0.0712,
      "num_input_tokens_seen": 110867328,
      "step": 585
    },
    {
      "epoch": 2.2648752399232244,
      "grad_norm": 0.48498988151550293,
      "learning_rate": 4.255860538388694e-05,
      "loss": 0.0703,
      "num_input_tokens_seen": 111824640,
      "step": 590
    },
    {
      "epoch": 2.2840690978886755,
      "grad_norm": 0.6077526807785034,
      "learning_rate": 4.24387500196217e-05,
      "loss": 0.0719,
      "num_input_tokens_seen": 112779648,
      "step": 595
    },
    {
      "epoch": 2.3032629558541267,
      "grad_norm": 0.5256830453872681,
      "learning_rate": 4.231810883773999e-05,
      "loss": 0.0693,
      "num_input_tokens_seen": 113709312,
      "step": 600
    },
    {
      "epoch": 2.322456813819578,
      "grad_norm": 0.5583354234695435,
      "learning_rate": 4.219668727452397e-05,
      "loss": 0.0754,
      "num_input_tokens_seen": 114654080,
      "step": 605
    },
    {
      "epoch": 2.341650671785029,
      "grad_norm": 0.47424212098121643,
      "learning_rate": 4.207449080142104e-05,
      "loss": 0.0692,
      "num_input_tokens_seen": 115618176,
      "step": 610
    },
    {
      "epoch": 2.36084452975048,
      "grad_norm": 0.4865480959415436,
      "learning_rate": 4.195152492479727e-05,
      "loss": 0.0734,
      "num_input_tokens_seen": 116583168,
      "step": 615
    },
    {
      "epoch": 2.380038387715931,
      "grad_norm": 0.5127935409545898,
      "learning_rate": 4.182779518568926e-05,
      "loss": 0.0756,
      "num_input_tokens_seen": 117525632,
      "step": 620
    },
    {
      "epoch": 2.399232245681382,
      "grad_norm": 0.48488515615463257,
      "learning_rate": 4.170330715955444e-05,
      "loss": 0.0721,
      "num_input_tokens_seen": 118487040,
      "step": 625
    },
    {
      "epoch": 2.418426103646833,
      "grad_norm": 0.5220322012901306,
      "learning_rate": 4.157806645601988e-05,
      "loss": 0.0708,
      "num_input_tokens_seen": 119424384,
      "step": 630
    },
    {
      "epoch": 2.4376199616122842,
      "grad_norm": 0.4799134433269501,
      "learning_rate": 4.145207871862947e-05,
      "loss": 0.071,
      "num_input_tokens_seen": 120394496,
      "step": 635
    },
    {
      "epoch": 2.456813819577735,
      "grad_norm": 0.5201560258865356,
      "learning_rate": 4.132534962458962e-05,
      "loss": 0.0733,
      "num_input_tokens_seen": 121340160,
      "step": 640
    },
    {
      "epoch": 2.476007677543186,
      "grad_norm": 0.46850621700286865,
      "learning_rate": 4.1197884884513474e-05,
      "loss": 0.0725,
      "num_input_tokens_seen": 122270720,
      "step": 645
    },
    {
      "epoch": 2.495201535508637,
      "grad_norm": 0.5046434998512268,
      "learning_rate": 4.1069690242163484e-05,
      "loss": 0.0723,
      "num_input_tokens_seen": 123226880,
      "step": 650
    },
    {
      "epoch": 2.5143953934740884,
      "grad_norm": 0.4388461410999298,
      "learning_rate": 4.094077147419271e-05,
      "loss": 0.0702,
      "num_input_tokens_seen": 124177664,
      "step": 655
    },
    {
      "epoch": 2.5335892514395395,
      "grad_norm": 0.5357294678688049,
      "learning_rate": 4.0811134389884433e-05,
      "loss": 0.0716,
      "num_input_tokens_seen": 125131520,
      "step": 660
    },
    {
      "epoch": 2.5527831094049906,
      "grad_norm": 0.56220543384552,
      "learning_rate": 4.0680784830890405e-05,
      "loss": 0.0719,
      "num_input_tokens_seen": 126085248,
      "step": 665
    },
    {
      "epoch": 2.5719769673704413,
      "grad_norm": 0.4952705204486847,
      "learning_rate": 4.05497286709676e-05,
      "loss": 0.0705,
      "num_input_tokens_seen": 127024128,
      "step": 670
    },
    {
      "epoch": 2.5911708253358925,
      "grad_norm": 0.4931686520576477,
      "learning_rate": 4.0417971815713584e-05,
      "loss": 0.0705,
      "num_input_tokens_seen": 127971072,
      "step": 675
    },
    {
      "epoch": 2.6103646833013436,
      "grad_norm": 0.5248854756355286,
      "learning_rate": 4.028552020230031e-05,
      "loss": 0.0741,
      "num_input_tokens_seen": 128914176,
      "step": 680
    },
    {
      "epoch": 2.629558541266795,
      "grad_norm": 0.48179954290390015,
      "learning_rate": 4.015237979920666e-05,
      "loss": 0.0722,
      "num_input_tokens_seen": 129855360,
      "step": 685
    },
    {
      "epoch": 2.6487523992322455,
      "grad_norm": 0.5252842903137207,
      "learning_rate": 4.001855660594948e-05,
      "loss": 0.0746,
      "num_input_tokens_seen": 130808960,
      "step": 690
    },
    {
      "epoch": 2.6679462571976966,
      "grad_norm": 0.48107513785362244,
      "learning_rate": 3.9884056652813184e-05,
      "loss": 0.0692,
      "num_input_tokens_seen": 131750144,
      "step": 695
    },
    {
      "epoch": 2.6871401151631478,
      "grad_norm": 0.6338366866111755,
      "learning_rate": 3.974888600057808e-05,
      "loss": 0.0713,
      "num_input_tokens_seen": 132695936,
      "step": 700
    },
    {
      "epoch": 2.706333973128599,
      "grad_norm": 0.4953431785106659,
      "learning_rate": 3.9613050740247224e-05,
      "loss": 0.0711,
      "num_input_tokens_seen": 133666944,
      "step": 705
    },
    {
      "epoch": 2.72552783109405,
      "grad_norm": 0.4697812497615814,
      "learning_rate": 3.947655699277197e-05,
      "loss": 0.0677,
      "num_input_tokens_seen": 134611456,
      "step": 710
    },
    {
      "epoch": 2.744721689059501,
      "grad_norm": 0.5141253471374512,
      "learning_rate": 3.933941090877615e-05,
      "loss": 0.0742,
      "num_input_tokens_seen": 135558400,
      "step": 715
    },
    {
      "epoch": 2.763915547024952,
      "grad_norm": 0.4803192913532257,
      "learning_rate": 3.920161866827889e-05,
      "loss": 0.07,
      "num_input_tokens_seen": 136509696,
      "step": 720
    },
    {
      "epoch": 2.783109404990403,
      "grad_norm": 0.47573211789131165,
      "learning_rate": 3.906318648041617e-05,
      "loss": 0.0725,
      "num_input_tokens_seen": 137442816,
      "step": 725
    },
    {
      "epoch": 2.802303262955854,
      "grad_norm": 0.5205782055854797,
      "learning_rate": 3.8924120583160985e-05,
      "loss": 0.0704,
      "num_input_tokens_seen": 138383232,
      "step": 730
    },
    {
      "epoch": 2.8214971209213053,
      "grad_norm": 0.5315016508102417,
      "learning_rate": 3.8784427243042296e-05,
      "loss": 0.0721,
      "num_input_tokens_seen": 139326464,
      "step": 735
    },
    {
      "epoch": 2.840690978886756,
      "grad_norm": 0.4982779324054718,
      "learning_rate": 3.8644112754862614e-05,
      "loss": 0.0702,
      "num_input_tokens_seen": 140295296,
      "step": 740
    },
    {
      "epoch": 2.859884836852207,
      "grad_norm": 0.550413191318512,
      "learning_rate": 3.850318344141439e-05,
      "loss": 0.0719,
      "num_input_tokens_seen": 141247488,
      "step": 745
    },
    {
      "epoch": 2.8790786948176583,
      "grad_norm": 0.5022615194320679,
      "learning_rate": 3.8361645653195026e-05,
      "loss": 0.0712,
      "num_input_tokens_seen": 142186240,
      "step": 750
    },
    {
      "epoch": 2.8982725527831095,
      "grad_norm": 0.5040895342826843,
      "learning_rate": 3.821950576812081e-05,
      "loss": 0.0717,
      "num_input_tokens_seen": 143130624,
      "step": 755
    },
    {
      "epoch": 2.9174664107485606,
      "grad_norm": 0.5027932524681091,
      "learning_rate": 3.807677019123944e-05,
      "loss": 0.0717,
      "num_input_tokens_seen": 144061568,
      "step": 760
    },
    {
      "epoch": 2.9366602687140118,
      "grad_norm": 0.5451627969741821,
      "learning_rate": 3.793344535444142e-05,
      "loss": 0.0694,
      "num_input_tokens_seen": 145013632,
      "step": 765
    },
    {
      "epoch": 2.9558541266794625,
      "grad_norm": 0.5232383012771606,
      "learning_rate": 3.7789537716170256e-05,
      "loss": 0.07,
      "num_input_tokens_seen": 145962880,
      "step": 770
    },
    {
      "epoch": 2.9750479846449136,
      "grad_norm": 0.46057307720184326,
      "learning_rate": 3.764505376113138e-05,
      "loss": 0.0699,
      "num_input_tokens_seen": 146916480,
      "step": 775
    },
    {
      "epoch": 2.9942418426103647,
      "grad_norm": 0.510372519493103,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.0705,
      "num_input_tokens_seen": 147865344,
      "step": 780
    },
    {
      "epoch": 3.013435700575816,
      "grad_norm": 0.3846762180328369,
      "learning_rate": 3.735438296912768e-05,
      "loss": 0.0515,
      "num_input_tokens_seen": 148817664,
      "step": 785
    },
    {
      "epoch": 3.0326295585412666,
      "grad_norm": 0.5160824656486511,
      "learning_rate": 3.720820923024778e-05,
      "loss": 0.0398,
      "num_input_tokens_seen": 149770880,
      "step": 790
    },
    {
      "epoch": 3.0518234165067177,
      "grad_norm": 0.5621275305747986,
      "learning_rate": 3.7061485370179835e-05,
      "loss": 0.0388,
      "num_input_tokens_seen": 150717312,
      "step": 795
    },
    {
      "epoch": 3.071017274472169,
      "grad_norm": 0.5459131002426147,
      "learning_rate": 3.69142180005327e-05,
      "loss": 0.0392,
      "num_input_tokens_seen": 151666432,
      "step": 800
    },
    {
      "epoch": 3.09021113243762,
      "grad_norm": 0.5296151638031006,
      "learning_rate": 3.676641375740662e-05,
      "loss": 0.0373,
      "num_input_tokens_seen": 152584064,
      "step": 805
    },
    {
      "epoch": 3.109404990403071,
      "grad_norm": 0.4850353002548218,
      "learning_rate": 3.6618079301094216e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 153525248,
      "step": 810
    },
    {
      "epoch": 3.128598848368522,
      "grad_norm": 0.5585963726043701,
      "learning_rate": 3.646922131578036e-05,
      "loss": 0.038,
      "num_input_tokens_seen": 154456704,
      "step": 815
    },
    {
      "epoch": 3.147792706333973,
      "grad_norm": 0.5561864972114563,
      "learning_rate": 3.631984650924094e-05,
      "loss": 0.0387,
      "num_input_tokens_seen": 155405312,
      "step": 820
    },
    {
      "epoch": 3.166986564299424,
      "grad_norm": 0.5100105404853821,
      "learning_rate": 3.6169961612540645e-05,
      "loss": 0.0387,
      "num_input_tokens_seen": 156361984,
      "step": 825
    },
    {
      "epoch": 3.1861804222648753,
      "grad_norm": 0.5485111474990845,
      "learning_rate": 3.6019573379729643e-05,
      "loss": 0.0378,
      "num_input_tokens_seen": 157296768,
      "step": 830
    },
    {
      "epoch": 3.2053742802303264,
      "grad_norm": 0.5596518516540527,
      "learning_rate": 3.586868858753921e-05,
      "loss": 0.0376,
      "num_input_tokens_seen": 158262144,
      "step": 835
    },
    {
      "epoch": 3.224568138195777,
      "grad_norm": 0.5062248110771179,
      "learning_rate": 3.5717314035076355e-05,
      "loss": 0.0394,
      "num_input_tokens_seen": 159213952,
      "step": 840
    },
    {
      "epoch": 3.2437619961612283,
      "grad_norm": 0.560895562171936,
      "learning_rate": 3.556545654351749e-05,
      "loss": 0.0412,
      "num_input_tokens_seen": 160142208,
      "step": 845
    },
    {
      "epoch": 3.2629558541266794,
      "grad_norm": 0.5781249403953552,
      "learning_rate": 3.5413122955801005e-05,
      "loss": 0.0379,
      "num_input_tokens_seen": 161101824,
      "step": 850
    },
    {
      "epoch": 3.2821497120921306,
      "grad_norm": 0.5329350233078003,
      "learning_rate": 3.526032013631893e-05,
      "loss": 0.0389,
      "num_input_tokens_seen": 162062592,
      "step": 855
    },
    {
      "epoch": 3.3013435700575817,
      "grad_norm": 0.5311657190322876,
      "learning_rate": 3.510705497060762e-05,
      "loss": 0.0407,
      "num_input_tokens_seen": 163008896,
      "step": 860
    },
    {
      "epoch": 3.320537428023033,
      "grad_norm": 0.5238428115844727,
      "learning_rate": 3.4953334365037525e-05,
      "loss": 0.0396,
      "num_input_tokens_seen": 163957632,
      "step": 865
    },
    {
      "epoch": 3.3397312859884836,
      "grad_norm": 0.5340429544448853,
      "learning_rate": 3.479916524650188e-05,
      "loss": 0.0398,
      "num_input_tokens_seen": 164903680,
      "step": 870
    },
    {
      "epoch": 3.3589251439539347,
      "grad_norm": 0.5380253195762634,
      "learning_rate": 3.4644554562104634e-05,
      "loss": 0.0378,
      "num_input_tokens_seen": 165832960,
      "step": 875
    },
    {
      "epoch": 3.378119001919386,
      "grad_norm": 0.5895189046859741,
      "learning_rate": 3.4489509278847414e-05,
      "loss": 0.0386,
      "num_input_tokens_seen": 166790016,
      "step": 880
    },
    {
      "epoch": 3.397312859884837,
      "grad_norm": 0.5654773712158203,
      "learning_rate": 3.433403638331553e-05,
      "loss": 0.0392,
      "num_input_tokens_seen": 167751808,
      "step": 885
    },
    {
      "epoch": 3.4165067178502877,
      "grad_norm": 0.5874072909355164,
      "learning_rate": 3.417814288136319e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 168697984,
      "step": 890
    },
    {
      "epoch": 3.435700575815739,
      "grad_norm": 0.5615460276603699,
      "learning_rate": 3.4021835797797804e-05,
      "loss": 0.0397,
      "num_input_tokens_seen": 169634432,
      "step": 895
    },
    {
      "epoch": 3.45489443378119,
      "grad_norm": 0.5442382097244263,
      "learning_rate": 3.386512217606339e-05,
      "loss": 0.0408,
      "num_input_tokens_seen": 170583808,
      "step": 900
    },
    {
      "epoch": 3.474088291746641,
      "grad_norm": 0.5456710457801819,
      "learning_rate": 3.370800907792325e-05,
      "loss": 0.0393,
      "num_input_tokens_seen": 171575808,
      "step": 905
    },
    {
      "epoch": 3.4932821497120923,
      "grad_norm": 0.6690710186958313,
      "learning_rate": 3.355050358314172e-05,
      "loss": 0.0399,
      "num_input_tokens_seen": 172516224,
      "step": 910
    },
    {
      "epoch": 3.5124760076775434,
      "grad_norm": 0.5574905276298523,
      "learning_rate": 3.339261278916512e-05,
      "loss": 0.0382,
      "num_input_tokens_seen": 173462784,
      "step": 915
    },
    {
      "epoch": 3.531669865642994,
      "grad_norm": 0.6012794971466064,
      "learning_rate": 3.323434381080199e-05,
      "loss": 0.0393,
      "num_input_tokens_seen": 174417408,
      "step": 920
    },
    {
      "epoch": 3.5508637236084453,
      "grad_norm": 0.5913425087928772,
      "learning_rate": 3.307570377990245e-05,
      "loss": 0.0401,
      "num_input_tokens_seen": 175372160,
      "step": 925
    },
    {
      "epoch": 3.5700575815738964,
      "grad_norm": 0.5533618927001953,
      "learning_rate": 3.2916699845036816e-05,
      "loss": 0.0389,
      "num_input_tokens_seen": 176325376,
      "step": 930
    },
    {
      "epoch": 3.5892514395393476,
      "grad_norm": 0.5684598684310913,
      "learning_rate": 3.2757339171173506e-05,
      "loss": 0.0405,
      "num_input_tokens_seen": 177279232,
      "step": 935
    },
    {
      "epoch": 3.6084452975047983,
      "grad_norm": 0.5171712040901184,
      "learning_rate": 3.2597628939356175e-05,
      "loss": 0.0385,
      "num_input_tokens_seen": 178230656,
      "step": 940
    },
    {
      "epoch": 3.6276391554702494,
      "grad_norm": 0.5861181616783142,
      "learning_rate": 3.243757634638008e-05,
      "loss": 0.0406,
      "num_input_tokens_seen": 179165056,
      "step": 945
    },
    {
      "epoch": 3.6468330134357005,
      "grad_norm": 0.5875094532966614,
      "learning_rate": 3.227718860446782e-05,
      "loss": 0.0397,
      "num_input_tokens_seen": 180110464,
      "step": 950
    },
    {
      "epoch": 3.6660268714011517,
      "grad_norm": 0.518147885799408,
      "learning_rate": 3.211647294094437e-05,
      "loss": 0.0406,
      "num_input_tokens_seen": 181050368,
      "step": 955
    },
    {
      "epoch": 3.685220729366603,
      "grad_norm": 0.585909903049469,
      "learning_rate": 3.195543659791132e-05,
      "loss": 0.0399,
      "num_input_tokens_seen": 181985408,
      "step": 960
    },
    {
      "epoch": 3.704414587332054,
      "grad_norm": 0.5211689472198486,
      "learning_rate": 3.179408683192061e-05,
      "loss": 0.0403,
      "num_input_tokens_seen": 182932096,
      "step": 965
    },
    {
      "epoch": 3.7236084452975047,
      "grad_norm": 0.5516882538795471,
      "learning_rate": 3.163243091364752e-05,
      "loss": 0.0414,
      "num_input_tokens_seen": 183894528,
      "step": 970
    },
    {
      "epoch": 3.742802303262956,
      "grad_norm": 0.5610490441322327,
      "learning_rate": 3.147047612756302e-05,
      "loss": 0.04,
      "num_input_tokens_seen": 184841472,
      "step": 975
    },
    {
      "epoch": 3.761996161228407,
      "grad_norm": 0.5976274013519287,
      "learning_rate": 3.130822977160554e-05,
      "loss": 0.0411,
      "num_input_tokens_seen": 185803520,
      "step": 980
    },
    {
      "epoch": 3.781190019193858,
      "grad_norm": 0.5341821312904358,
      "learning_rate": 3.114569915685213e-05,
      "loss": 0.0416,
      "num_input_tokens_seen": 186746880,
      "step": 985
    },
    {
      "epoch": 3.800383877159309,
      "grad_norm": 0.5178216695785522,
      "learning_rate": 3.098289160718895e-05,
      "loss": 0.0418,
      "num_input_tokens_seen": 187703040,
      "step": 990
    },
    {
      "epoch": 3.81957773512476,
      "grad_norm": 0.6667547225952148,
      "learning_rate": 3.081981445898131e-05,
      "loss": 0.0399,
      "num_input_tokens_seen": 188650112,
      "step": 995
    },
    {
      "epoch": 3.838771593090211,
      "grad_norm": 0.6052145957946777,
      "learning_rate": 3.065647506074306e-05,
      "loss": 0.0369,
      "num_input_tokens_seen": 189577088,
      "step": 1000
    },
    {
      "epoch": 3.8579654510556622,
      "grad_norm": 0.5983301401138306,
      "learning_rate": 3.0492880772805433e-05,
      "loss": 0.0389,
      "num_input_tokens_seen": 190533376,
      "step": 1005
    },
    {
      "epoch": 3.8771593090211134,
      "grad_norm": 0.5493575930595398,
      "learning_rate": 3.03290389669854e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 191484672,
      "step": 1010
    },
    {
      "epoch": 3.8963531669865645,
      "grad_norm": 0.5318569540977478,
      "learning_rate": 3.016495702625351e-05,
      "loss": 0.0401,
      "num_input_tokens_seen": 192425728,
      "step": 1015
    },
    {
      "epoch": 3.9155470249520152,
      "grad_norm": 0.5920220613479614,
      "learning_rate": 3.0000642344401113e-05,
      "loss": 0.0408,
      "num_input_tokens_seen": 193386368,
      "step": 1020
    },
    {
      "epoch": 3.9347408829174664,
      "grad_norm": 0.48210155963897705,
      "learning_rate": 2.983610232570728e-05,
      "loss": 0.0389,
      "num_input_tokens_seen": 194338560,
      "step": 1025
    },
    {
      "epoch": 3.9539347408829175,
      "grad_norm": 0.551249623298645,
      "learning_rate": 2.9671344384605127e-05,
      "loss": 0.0392,
      "num_input_tokens_seen": 195283968,
      "step": 1030
    },
    {
      "epoch": 3.9731285988483687,
      "grad_norm": 0.5769145488739014,
      "learning_rate": 2.950637594534765e-05,
      "loss": 0.0397,
      "num_input_tokens_seen": 196222080,
      "step": 1035
    },
    {
      "epoch": 3.9923224568138194,
      "grad_norm": 0.49872222542762756,
      "learning_rate": 2.9341204441673266e-05,
      "loss": 0.0396,
      "num_input_tokens_seen": 197174016,
      "step": 1040
    },
    {
      "epoch": 4.0115163147792705,
      "grad_norm": 0.32796505093574524,
      "learning_rate": 2.917583731647077e-05,
      "loss": 0.0271,
      "num_input_tokens_seen": 198129792,
      "step": 1045
    },
    {
      "epoch": 4.030710172744722,
      "grad_norm": 0.4801371395587921,
      "learning_rate": 2.9010282021444008e-05,
      "loss": 0.0188,
      "num_input_tokens_seen": 199088384,
      "step": 1050
    },
    {
      "epoch": 4.049904030710173,
      "grad_norm": 0.5048861503601074,
      "learning_rate": 2.8844546016776013e-05,
      "loss": 0.0195,
      "num_input_tokens_seen": 200040832,
      "step": 1055
    },
    {
      "epoch": 4.069097888675624,
      "grad_norm": 0.44816502928733826,
      "learning_rate": 2.8678636770792906e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 200993024,
      "step": 1060
    },
    {
      "epoch": 4.088291746641075,
      "grad_norm": 0.5177501440048218,
      "learning_rate": 2.851256175962732e-05,
      "loss": 0.0175,
      "num_input_tokens_seen": 201925376,
      "step": 1065
    },
    {
      "epoch": 4.107485604606526,
      "grad_norm": 0.44937270879745483,
      "learning_rate": 2.8346328466881545e-05,
      "loss": 0.0175,
      "num_input_tokens_seen": 202875520,
      "step": 1070
    },
    {
      "epoch": 4.126679462571977,
      "grad_norm": 0.46722468733787537,
      "learning_rate": 2.8179944383290274e-05,
      "loss": 0.0177,
      "num_input_tokens_seen": 203793536,
      "step": 1075
    },
    {
      "epoch": 4.145873320537428,
      "grad_norm": 0.47768163681030273,
      "learning_rate": 2.8013417006383076e-05,
      "loss": 0.0178,
      "num_input_tokens_seen": 204730496,
      "step": 1080
    },
    {
      "epoch": 4.165067178502879,
      "grad_norm": 0.5000248551368713,
      "learning_rate": 2.784675384014656e-05,
      "loss": 0.0183,
      "num_input_tokens_seen": 205692416,
      "step": 1085
    },
    {
      "epoch": 4.18426103646833,
      "grad_norm": 0.43874186277389526,
      "learning_rate": 2.7679962394686198e-05,
      "loss": 0.0163,
      "num_input_tokens_seen": 206628608,
      "step": 1090
    },
    {
      "epoch": 4.203454894433781,
      "grad_norm": 0.6000416874885559,
      "learning_rate": 2.751305018588793e-05,
      "loss": 0.0173,
      "num_input_tokens_seen": 207572096,
      "step": 1095
    },
    {
      "epoch": 4.222648752399232,
      "grad_norm": 0.5658661127090454,
      "learning_rate": 2.7346024735079486e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 208517376,
      "step": 1100
    },
    {
      "epoch": 4.241842610364683,
      "grad_norm": 0.47470784187316895,
      "learning_rate": 2.717889356869146e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 209485056,
      "step": 1105
    },
    {
      "epoch": 4.2610364683301345,
      "grad_norm": 0.5162433385848999,
      "learning_rate": 2.7011664217918154e-05,
      "loss": 0.0176,
      "num_input_tokens_seen": 210415232,
      "step": 1110
    },
    {
      "epoch": 4.280230326295586,
      "grad_norm": 0.5180822014808655,
      "learning_rate": 2.684434421837821e-05,
      "loss": 0.018,
      "num_input_tokens_seen": 211362560,
      "step": 1115
    },
    {
      "epoch": 4.299424184261037,
      "grad_norm": 0.5464392304420471,
      "learning_rate": 2.667694110977506e-05,
      "loss": 0.0176,
      "num_input_tokens_seen": 212339456,
      "step": 1120
    },
    {
      "epoch": 4.318618042226488,
      "grad_norm": 0.5567265748977661,
      "learning_rate": 2.6509462435557152e-05,
      "loss": 0.0187,
      "num_input_tokens_seen": 213288320,
      "step": 1125
    },
    {
      "epoch": 4.337811900191938,
      "grad_norm": 0.5592502951622009,
      "learning_rate": 2.6341915742578037e-05,
      "loss": 0.0189,
      "num_input_tokens_seen": 214232576,
      "step": 1130
    },
    {
      "epoch": 4.357005758157389,
      "grad_norm": 0.53173828125,
      "learning_rate": 2.617430858075632e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 215180800,
      "step": 1135
    },
    {
      "epoch": 4.3761996161228405,
      "grad_norm": 0.4227728843688965,
      "learning_rate": 2.600664850273538e-05,
      "loss": 0.0175,
      "num_input_tokens_seen": 216116480,
      "step": 1140
    },
    {
      "epoch": 4.395393474088292,
      "grad_norm": 0.6441555023193359,
      "learning_rate": 2.5838943063543136e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 217079552,
      "step": 1145
    },
    {
      "epoch": 4.414587332053743,
      "grad_norm": 0.4902968108654022,
      "learning_rate": 2.5671199820251534e-05,
      "loss": 0.016,
      "num_input_tokens_seen": 218026496,
      "step": 1150
    },
    {
      "epoch": 4.433781190019194,
      "grad_norm": 0.46307000517845154,
      "learning_rate": 2.550342633163601e-05,
      "loss": 0.0171,
      "num_input_tokens_seen": 218977152,
      "step": 1155
    },
    {
      "epoch": 4.452975047984645,
      "grad_norm": 0.4390980899333954,
      "learning_rate": 2.5335630157834937e-05,
      "loss": 0.0178,
      "num_input_tokens_seen": 219918080,
      "step": 1160
    },
    {
      "epoch": 4.472168905950096,
      "grad_norm": 0.5380067825317383,
      "learning_rate": 2.5167818860008908e-05,
      "loss": 0.0182,
      "num_input_tokens_seen": 220859776,
      "step": 1165
    },
    {
      "epoch": 4.491362763915547,
      "grad_norm": 0.6842356324195862,
      "learning_rate": 2.5e-05,
      "loss": 0.0184,
      "num_input_tokens_seen": 221800192,
      "step": 1170
    },
    {
      "epoch": 4.510556621880998,
      "grad_norm": 0.5243207812309265,
      "learning_rate": 2.48321811399911e-05,
      "loss": 0.0182,
      "num_input_tokens_seen": 222755840,
      "step": 1175
    },
    {
      "epoch": 4.529750479846449,
      "grad_norm": 0.5043371915817261,
      "learning_rate": 2.4664369842165068e-05,
      "loss": 0.0182,
      "num_input_tokens_seen": 223683328,
      "step": 1180
    },
    {
      "epoch": 4.5489443378119,
      "grad_norm": 0.5096651315689087,
      "learning_rate": 2.4496573668363996e-05,
      "loss": 0.0192,
      "num_input_tokens_seen": 224617088,
      "step": 1185
    },
    {
      "epoch": 4.568138195777351,
      "grad_norm": 0.4918656349182129,
      "learning_rate": 2.4328800179748475e-05,
      "loss": 0.0189,
      "num_input_tokens_seen": 225549824,
      "step": 1190
    },
    {
      "epoch": 4.587332053742802,
      "grad_norm": 0.50703364610672,
      "learning_rate": 2.4161056936456873e-05,
      "loss": 0.0179,
      "num_input_tokens_seen": 226499968,
      "step": 1195
    },
    {
      "epoch": 4.606525911708253,
      "grad_norm": 0.535247266292572,
      "learning_rate": 2.399335149726463e-05,
      "loss": 0.0184,
      "num_input_tokens_seen": 227438592,
      "step": 1200
    },
    {
      "epoch": 4.6257197696737045,
      "grad_norm": 0.5735894441604614,
      "learning_rate": 2.3825691419243694e-05,
      "loss": 0.0173,
      "num_input_tokens_seen": 228392064,
      "step": 1205
    },
    {
      "epoch": 4.644913627639156,
      "grad_norm": 0.525362491607666,
      "learning_rate": 2.365808425742196e-05,
      "loss": 0.0172,
      "num_input_tokens_seen": 229337856,
      "step": 1210
    },
    {
      "epoch": 4.664107485604607,
      "grad_norm": 0.548953115940094,
      "learning_rate": 2.3490537564442847e-05,
      "loss": 0.0172,
      "num_input_tokens_seen": 230301184,
      "step": 1215
    },
    {
      "epoch": 4.683301343570058,
      "grad_norm": 0.543995201587677,
      "learning_rate": 2.3323058890224938e-05,
      "loss": 0.0183,
      "num_input_tokens_seen": 231239808,
      "step": 1220
    },
    {
      "epoch": 4.702495201535509,
      "grad_norm": 0.4721289873123169,
      "learning_rate": 2.3155655781621793e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 232200960,
      "step": 1225
    },
    {
      "epoch": 4.72168905950096,
      "grad_norm": 0.5055978298187256,
      "learning_rate": 2.2988335782081855e-05,
      "loss": 0.018,
      "num_input_tokens_seen": 233170176,
      "step": 1230
    },
    {
      "epoch": 4.74088291746641,
      "grad_norm": 0.5819576382637024,
      "learning_rate": 2.2821106431308544e-05,
      "loss": 0.0192,
      "num_input_tokens_seen": 234121984,
      "step": 1235
    },
    {
      "epoch": 4.760076775431862,
      "grad_norm": 0.5343950986862183,
      "learning_rate": 2.265397526492052e-05,
      "loss": 0.019,
      "num_input_tokens_seen": 235080448,
      "step": 1240
    },
    {
      "epoch": 4.779270633397313,
      "grad_norm": 0.5696731805801392,
      "learning_rate": 2.2486949814112077e-05,
      "loss": 0.0167,
      "num_input_tokens_seen": 236029440,
      "step": 1245
    },
    {
      "epoch": 4.798464491362764,
      "grad_norm": 0.513455331325531,
      "learning_rate": 2.2320037605313808e-05,
      "loss": 0.0172,
      "num_input_tokens_seen": 236989952,
      "step": 1250
    },
    {
      "epoch": 4.817658349328215,
      "grad_norm": 0.5600264072418213,
      "learning_rate": 2.2153246159853446e-05,
      "loss": 0.0178,
      "num_input_tokens_seen": 237952640,
      "step": 1255
    },
    {
      "epoch": 4.836852207293666,
      "grad_norm": 0.5633450746536255,
      "learning_rate": 2.1986582993616926e-05,
      "loss": 0.0179,
      "num_input_tokens_seen": 238887552,
      "step": 1260
    },
    {
      "epoch": 4.856046065259117,
      "grad_norm": 0.4715237021446228,
      "learning_rate": 2.1820055616709735e-05,
      "loss": 0.0163,
      "num_input_tokens_seen": 239837440,
      "step": 1265
    },
    {
      "epoch": 4.8752399232245685,
      "grad_norm": 0.6615576148033142,
      "learning_rate": 2.1653671533118468e-05,
      "loss": 0.019,
      "num_input_tokens_seen": 240771712,
      "step": 1270
    },
    {
      "epoch": 4.894433781190019,
      "grad_norm": 0.4694409668445587,
      "learning_rate": 2.148743824037269e-05,
      "loss": 0.0169,
      "num_input_tokens_seen": 241706496,
      "step": 1275
    },
    {
      "epoch": 4.91362763915547,
      "grad_norm": 0.5807188153266907,
      "learning_rate": 2.1321363229207096e-05,
      "loss": 0.0165,
      "num_input_tokens_seen": 242651520,
      "step": 1280
    },
    {
      "epoch": 4.932821497120921,
      "grad_norm": 0.5913830399513245,
      "learning_rate": 2.115545398322399e-05,
      "loss": 0.018,
      "num_input_tokens_seen": 243610880,
      "step": 1285
    },
    {
      "epoch": 4.952015355086372,
      "grad_norm": 0.47264307737350464,
      "learning_rate": 2.098971797855599e-05,
      "loss": 0.0171,
      "num_input_tokens_seen": 244553344,
      "step": 1290
    },
    {
      "epoch": 4.971209213051823,
      "grad_norm": 0.5455983281135559,
      "learning_rate": 2.0824162683529224e-05,
      "loss": 0.0178,
      "num_input_tokens_seen": 245489280,
      "step": 1295
    },
    {
      "epoch": 4.990403071017274,
      "grad_norm": 0.5236849784851074,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.0177,
      "num_input_tokens_seen": 246430848,
      "step": 1300
    },
    {
      "epoch": 5.009596928982726,
      "grad_norm": 0.3665928840637207,
      "learning_rate": 2.0493624054652357e-05,
      "loss": 0.0129,
      "num_input_tokens_seen": 247392000,
      "step": 1305
    },
    {
      "epoch": 5.028790786948177,
      "grad_norm": 0.2873888313770294,
      "learning_rate": 2.0328655615394882e-05,
      "loss": 0.0076,
      "num_input_tokens_seen": 248329472,
      "step": 1310
    },
    {
      "epoch": 5.047984644913628,
      "grad_norm": 0.3191753029823303,
      "learning_rate": 2.016389767429272e-05,
      "loss": 0.0069,
      "num_input_tokens_seen": 249250688,
      "step": 1315
    },
    {
      "epoch": 5.067178502879079,
      "grad_norm": 0.3973754048347473,
      "learning_rate": 1.9999357655598893e-05,
      "loss": 0.0068,
      "num_input_tokens_seen": 250189312,
      "step": 1320
    },
    {
      "epoch": 5.08637236084453,
      "grad_norm": 0.27491700649261475,
      "learning_rate": 1.98350429737465e-05,
      "loss": 0.0063,
      "num_input_tokens_seen": 251125632,
      "step": 1325
    },
    {
      "epoch": 5.10556621880998,
      "grad_norm": 0.448505163192749,
      "learning_rate": 1.9670961033014605e-05,
      "loss": 0.0063,
      "num_input_tokens_seen": 252084224,
      "step": 1330
    },
    {
      "epoch": 5.1247600767754315,
      "grad_norm": 0.39973142743110657,
      "learning_rate": 1.950711922719458e-05,
      "loss": 0.0067,
      "num_input_tokens_seen": 253038080,
      "step": 1335
    },
    {
      "epoch": 5.143953934740883,
      "grad_norm": 0.3733747899532318,
      "learning_rate": 1.934352493925695e-05,
      "loss": 0.0059,
      "num_input_tokens_seen": 253959936,
      "step": 1340
    },
    {
      "epoch": 5.163147792706334,
      "grad_norm": 0.33044806122779846,
      "learning_rate": 1.9180185541018695e-05,
      "loss": 0.006,
      "num_input_tokens_seen": 254898176,
      "step": 1345
    },
    {
      "epoch": 5.182341650671785,
      "grad_norm": 0.39706701040267944,
      "learning_rate": 1.9017108392811065e-05,
      "loss": 0.006,
      "num_input_tokens_seen": 255850368,
      "step": 1350
    },
    {
      "epoch": 5.201535508637236,
      "grad_norm": 0.391462117433548,
      "learning_rate": 1.8854300843147875e-05,
      "loss": 0.0062,
      "num_input_tokens_seen": 256820096,
      "step": 1355
    },
    {
      "epoch": 5.220729366602687,
      "grad_norm": 0.41619014739990234,
      "learning_rate": 1.8691770228394456e-05,
      "loss": 0.0063,
      "num_input_tokens_seen": 257742080,
      "step": 1360
    },
    {
      "epoch": 5.239923224568138,
      "grad_norm": 0.3508637547492981,
      "learning_rate": 1.852952387243698e-05,
      "loss": 0.0054,
      "num_input_tokens_seen": 258705664,
      "step": 1365
    },
    {
      "epoch": 5.25911708253359,
      "grad_norm": 0.3165414333343506,
      "learning_rate": 1.8367569086352483e-05,
      "loss": 0.006,
      "num_input_tokens_seen": 259630464,
      "step": 1370
    },
    {
      "epoch": 5.278310940499041,
      "grad_norm": 0.31605008244514465,
      "learning_rate": 1.820591316807939e-05,
      "loss": 0.006,
      "num_input_tokens_seen": 260580352,
      "step": 1375
    },
    {
      "epoch": 5.297504798464491,
| "grad_norm": 0.235918790102005, | |
| "learning_rate": 1.8044563402088684e-05, | |
| "loss": 0.005, | |
| "num_input_tokens_seen": 261539072, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 5.316698656429942, | |
| "grad_norm": 0.37443870306015015, | |
| "learning_rate": 1.788352705905563e-05, | |
| "loss": 0.0049, | |
| "num_input_tokens_seen": 262481152, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 5.335892514395393, | |
| "grad_norm": 0.42907220125198364, | |
| "learning_rate": 1.7722811395532178e-05, | |
| "loss": 0.0052, | |
| "num_input_tokens_seen": 263416064, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 5.355086372360844, | |
| "grad_norm": 0.3673328161239624, | |
| "learning_rate": 1.756242365361993e-05, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 264374016, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 5.3742802303262955, | |
| "grad_norm": 0.40815985202789307, | |
| "learning_rate": 1.740237106064383e-05, | |
| "loss": 0.0057, | |
| "num_input_tokens_seen": 265326976, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 5.393474088291747, | |
| "grad_norm": 0.3974185287952423, | |
| "learning_rate": 1.72426608288265e-05, | |
| "loss": 0.0053, | |
| "num_input_tokens_seen": 266300672, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 5.412667946257198, | |
| "grad_norm": 0.415446013212204, | |
| "learning_rate": 1.7083300154963193e-05, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 267247488, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 5.431861804222649, | |
| "grad_norm": 0.24956613779067993, | |
| "learning_rate": 1.6924296220097556e-05, | |
| "loss": 0.0054, | |
| "num_input_tokens_seen": 268195072, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 5.4510556621881, | |
| "grad_norm": 0.3614320456981659, | |
| "learning_rate": 1.6765656189198013e-05, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 269156736, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 5.470249520153551, | |
| "grad_norm": 0.3081935942173004, | |
| "learning_rate": 1.6607387210834887e-05, | |
| "loss": 0.0059, | |
| "num_input_tokens_seen": 270111232, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 5.4894433781190015, | |
| "grad_norm": 0.616385281085968, | |
| "learning_rate": 1.6449496416858284e-05, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 271075328, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 5.508637236084453, | |
| "grad_norm": 0.2966477572917938, | |
| "learning_rate": 1.6291990922076745e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 272035200, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 5.527831094049904, | |
| "grad_norm": 0.3426896035671234, | |
| "learning_rate": 1.613487782393661e-05, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 273006464, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 5.547024952015355, | |
| "grad_norm": 0.333395779132843, | |
| "learning_rate": 1.59781642022022e-05, | |
| "loss": 0.0052, | |
| "num_input_tokens_seen": 273947520, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 5.566218809980806, | |
| "grad_norm": 0.37326982617378235, | |
| "learning_rate": 1.582185711863681e-05, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 274880640, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 5.585412667946257, | |
| "grad_norm": 0.47231632471084595, | |
| "learning_rate": 1.5665963616684476e-05, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 275832576, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 5.604606525911708, | |
| "grad_norm": 0.4276968240737915, | |
| "learning_rate": 1.5510490721152592e-05, | |
| "loss": 0.0059, | |
| "num_input_tokens_seen": 276766720, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 5.6238003838771595, | |
| "grad_norm": 0.3536054491996765, | |
| "learning_rate": 1.535544543789537e-05, | |
| "loss": 0.0057, | |
| "num_input_tokens_seen": 277722752, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 5.642994241842611, | |
| "grad_norm": 0.3875754475593567, | |
| "learning_rate": 1.5200834753498128e-05, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 278668544, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 5.662188099808061, | |
| "grad_norm": 0.39483705163002014, | |
| "learning_rate": 1.5046665634962476e-05, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 279602944, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 5.681381957773512, | |
| "grad_norm": 0.3274906873703003, | |
| "learning_rate": 1.489294502939238e-05, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 280545408, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 5.700575815738963, | |
| "grad_norm": 0.41820597648620605, | |
| "learning_rate": 1.4739679863681086e-05, | |
| "loss": 0.0052, | |
| "num_input_tokens_seen": 281486208, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 5.719769673704414, | |
| "grad_norm": 0.3023267984390259, | |
| "learning_rate": 1.4586877044199016e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 282428032, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 5.7389635316698655, | |
| "grad_norm": 0.40845391154289246, | |
| "learning_rate": 1.443454345648252e-05, | |
| "loss": 0.0061, | |
| "num_input_tokens_seen": 283387264, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 5.758157389635317, | |
| "grad_norm": 0.2751927971839905, | |
| "learning_rate": 1.4282685964923642e-05, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 284347008, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 5.777351247600768, | |
| "grad_norm": 0.39462777972221375, | |
| "learning_rate": 1.4131311412460796e-05, | |
| "loss": 0.0061, | |
| "num_input_tokens_seen": 285271424, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 5.796545105566219, | |
| "grad_norm": 0.3681143522262573, | |
| "learning_rate": 1.398042662027035e-05, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 286222208, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 5.81573896353167, | |
| "grad_norm": 0.3678882122039795, | |
| "learning_rate": 1.3830038387459354e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 287186304, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 5.834932821497121, | |
| "grad_norm": 0.3934548795223236, | |
| "learning_rate": 1.3680153490759073e-05, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 288142848, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 5.854126679462572, | |
| "grad_norm": 0.3608354926109314, | |
| "learning_rate": 1.3530778684219648e-05, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 289076608, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 5.8733205374280235, | |
| "grad_norm": 0.3579324781894684, | |
| "learning_rate": 1.3381920698905787e-05, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 290014848, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 5.892514395393474, | |
| "grad_norm": 0.45630770921707153, | |
| "learning_rate": 1.3233586242593387e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 290956928, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 5.911708253358925, | |
| "grad_norm": 0.48819243907928467, | |
| "learning_rate": 1.3085781999467303e-05, | |
| "loss": 0.0059, | |
| "num_input_tokens_seen": 291889408, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 5.930902111324376, | |
| "grad_norm": 0.39040514826774597, | |
| "learning_rate": 1.293851462982017e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 292832768, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 5.950095969289827, | |
| "grad_norm": 0.33169373869895935, | |
| "learning_rate": 1.2791790769752232e-05, | |
| "loss": 0.0054, | |
| "num_input_tokens_seen": 293767040, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 5.969289827255278, | |
| "grad_norm": 0.3252679705619812, | |
| "learning_rate": 1.2645617030872328e-05, | |
| "loss": 0.0049, | |
| "num_input_tokens_seen": 294750208, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 5.9884836852207295, | |
| "grad_norm": 0.35827863216400146, | |
| "learning_rate": 1.2500000000000006e-05, | |
| "loss": 0.0051, | |
| "num_input_tokens_seen": 295721088, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 6.007677543186181, | |
| "grad_norm": 0.2643219530582428, | |
| "learning_rate": 1.2354946238868631e-05, | |
| "loss": 0.0037, | |
| "num_input_tokens_seen": 296669184, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 6.026871401151632, | |
| "grad_norm": 0.16359519958496094, | |
| "learning_rate": 1.2210462283829755e-05, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 297624448, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 6.046065259117083, | |
| "grad_norm": 0.17375914752483368, | |
| "learning_rate": 1.2066554645558578e-05, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 298558080, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 6.065259117082533, | |
| "grad_norm": 0.16882829368114471, | |
| "learning_rate": 1.1923229808760564e-05, | |
| "loss": 0.002, | |
| "num_input_tokens_seen": 299492352, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 6.084452975047984, | |
| "grad_norm": 0.15032212436199188, | |
| "learning_rate": 1.1780494231879183e-05, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 300446976, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 6.1036468330134355, | |
| "grad_norm": 0.1795198768377304, | |
| "learning_rate": 1.1638354346804971e-05, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 301379328, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 6.122840690978887, | |
| "grad_norm": 0.19939038157463074, | |
| "learning_rate": 1.1496816558585622e-05, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 302306944, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 6.142034548944338, | |
| "grad_norm": 0.13161912560462952, | |
| "learning_rate": 1.1355887245137383e-05, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 303246848, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 6.161228406909789, | |
| "grad_norm": 0.12050630152225494, | |
| "learning_rate": 1.121557275695771e-05, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 304181248, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 6.18042226487524, | |
| "grad_norm": 0.1479523628950119, | |
| "learning_rate": 1.1075879416839023e-05, | |
| "loss": 0.002, | |
| "num_input_tokens_seen": 305139200, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 6.199616122840691, | |
| "grad_norm": 0.2239997535943985, | |
| "learning_rate": 1.093681351958383e-05, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 306099328, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 6.218809980806142, | |
| "grad_norm": 0.1326994150876999, | |
| "learning_rate": 1.0798381331721109e-05, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 307053568, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 6.2380038387715935, | |
| "grad_norm": 0.16336026787757874, | |
| "learning_rate": 1.0660589091223855e-05, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 308003200, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 6.257197696737044, | |
| "grad_norm": 0.19519683718681335, | |
| "learning_rate": 1.052344300722803e-05, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 308958720, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 6.276391554702495, | |
| "grad_norm": 0.19589100778102875, | |
| "learning_rate": 1.0386949259752785e-05, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 309904384, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 6.295585412667946, | |
| "grad_norm": 0.1591753512620926, | |
| "learning_rate": 1.0251113999421935e-05, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 310861568, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 6.314779270633397, | |
| "grad_norm": 0.1489296555519104, | |
| "learning_rate": 1.0115943347186826e-05, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 311800064, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 6.333973128598848, | |
| "grad_norm": 0.1340964287519455, | |
| "learning_rate": 9.981443394050525e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 312742656, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 6.3531669865642995, | |
| "grad_norm": 0.4749620258808136, | |
| "learning_rate": 9.847620200793343e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 313683840, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 6.372360844529751, | |
| "grad_norm": 0.27900460362434387, | |
| "learning_rate": 9.714479797699694e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 314630400, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 6.391554702495202, | |
| "grad_norm": 0.15642359852790833, | |
| "learning_rate": 9.582028184286423e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 315612544, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 6.410748560460653, | |
| "grad_norm": 0.2352278232574463, | |
| "learning_rate": 9.450271329032404e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 316564224, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 6.429942418426104, | |
| "grad_norm": 0.2565127909183502, | |
| "learning_rate": 9.3192151691096e-06, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 317537024, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 6.449136276391554, | |
| "grad_norm": 0.23012320697307587, | |
| "learning_rate": 9.18886561011557e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 318482944, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 6.468330134357005, | |
| "grad_norm": 0.15872737765312195, | |
| "learning_rate": 9.059228525807296e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 319438848, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 6.487523992322457, | |
| "grad_norm": 0.1375139206647873, | |
| "learning_rate": 8.930309757836517e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 320388736, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 6.506717850287908, | |
| "grad_norm": 0.12399590760469437, | |
| "learning_rate": 8.802115115486535e-06, | |
| "loss": 0.0013, | |
| "num_input_tokens_seen": 321354880, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 6.525911708253359, | |
| "grad_norm": 0.23495157063007355, | |
| "learning_rate": 8.67465037541038e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 322301952, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 6.54510556621881, | |
| "grad_norm": 0.15574663877487183, | |
| "learning_rate": 8.54792128137053e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 323246208, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 6.564299424184261, | |
| "grad_norm": 0.1617659628391266, | |
| "learning_rate": 8.421933543980126e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 324191616, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 6.583493282149712, | |
| "grad_norm": 0.14752747118473053, | |
| "learning_rate": 8.29669284044557e-06, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 325125888, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 6.6026871401151634, | |
| "grad_norm": 0.2986501455307007, | |
| "learning_rate": 8.172204814310742e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 326070784, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 6.621880998080615, | |
| "grad_norm": 0.23812003433704376, | |
| "learning_rate": 8.048475075202727e-06, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 326995712, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 6.641074856046066, | |
| "grad_norm": 0.13329505920410156, | |
| "learning_rate": 7.92550919857896e-06, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 327954816, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 6.660268714011516, | |
| "grad_norm": 0.10942558199167252, | |
| "learning_rate": 7.803312725476031e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 328909184, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 6.679462571976967, | |
| "grad_norm": 0.10470914840698242, | |
| "learning_rate": 7.681891162260015e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 329862144, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 6.698656429942418, | |
| "grad_norm": 0.2065214365720749, | |
| "learning_rate": 7.561249980378301e-06, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 330812544, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 6.717850287907869, | |
| "grad_norm": 0.2046762853860855, | |
| "learning_rate": 7.441394616113062e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 331769216, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 6.737044145873321, | |
| "grad_norm": 0.17153848707675934, | |
| "learning_rate": 7.3223304703363135e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 332703744, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 6.756238003838772, | |
| "grad_norm": 0.11358804255723953, | |
| "learning_rate": 7.20406290826649e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 333635712, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 6.775431861804223, | |
| "grad_norm": 0.21972091495990753, | |
| "learning_rate": 7.086597259226707e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 334579968, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 6.794625719769674, | |
| "grad_norm": 0.16843383014202118, | |
| "learning_rate": 6.969938816404639e-06, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 335506304, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 6.813819577735125, | |
| "grad_norm": 0.155124694108963, | |
| "learning_rate": 6.854092836613948e-06, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 336457856, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 6.833013435700575, | |
| "grad_norm": 0.11480195820331573, | |
| "learning_rate": 6.739064540057424e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 337391616, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 6.8522072936660265, | |
| "grad_norm": 0.13995982706546783, | |
| "learning_rate": 6.624859110091791e-06, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 338349568, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 6.871401151631478, | |
| "grad_norm": 0.2626807689666748, | |
| "learning_rate": 6.511481692994076e-06, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 339293440, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 6.890595009596929, | |
| "grad_norm": 0.21995946764945984, | |
| "learning_rate": 6.3989373977297315e-06, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 340271360, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 6.90978886756238, | |
| "grad_norm": 0.11761938780546188, | |
| "learning_rate": 6.28723129572247e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 341216384, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 6.928982725527831, | |
| "grad_norm": 0.14595161378383636, | |
| "learning_rate": 6.1763684206256525e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 342183808, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 6.948176583493282, | |
| "grad_norm": 0.1292518526315689, | |
| "learning_rate": 6.066353768095504e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 343123712, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 6.967370441458733, | |
| "grad_norm": 0.19953125715255737, | |
| "learning_rate": 5.957192295566022e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 344062976, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 6.9865642994241846, | |
| "grad_norm": 0.17268332839012146, | |
| "learning_rate": 5.848888922025553e-06, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 345019904, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 7.005758157389636, | |
| "grad_norm": 0.0566362664103508, | |
| "learning_rate": 5.741448527795137e-06, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 345976320, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 7.024952015355086, | |
| "grad_norm": 0.07774636894464493, | |
| "learning_rate": 5.634875954308638e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 346933120, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 7.044145873320537, | |
| "grad_norm": 0.09574442356824875, | |
| "learning_rate": 5.52917600389451e-06, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 347883392, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 7.063339731285988, | |
| "grad_norm": 0.04345453530550003, | |
| "learning_rate": 5.424353439559446e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 348815616, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 7.082533589251439, | |
| "grad_norm": 0.08021701127290726, | |
| "learning_rate": 5.320412984773748e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 349761152, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 7.1017274472168905, | |
| "grad_norm": 0.04693225026130676, | |
| "learning_rate": 5.217359323258459e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 350714880, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 7.120921305182342, | |
| "grad_norm": 0.041134320199489594, | |
| "learning_rate": 5.115197098774302e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 351676544, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 7.140115163147793, | |
| "grad_norm": 0.06960324198007584, | |
| "learning_rate": 5.013930914912476e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 352604672, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 7.159309021113244, | |
| "grad_norm": 0.057376183569431305, | |
| "learning_rate": 4.913565334887135e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 353552640, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 7.178502879078695, | |
| "grad_norm": 0.04041101410984993, | |
| "learning_rate": 4.814104881329828e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 354511360, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 7.197696737044146, | |
| "grad_norm": 0.052469249814748764, | |
| "learning_rate": 4.715554036085673e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 355478144, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 7.2168905950095965, | |
| "grad_norm": 0.11581304669380188, | |
| "learning_rate": 4.617917240011394e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 356424448, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 7.236084452975048, | |
| "grad_norm": 0.04358832538127899, | |
| "learning_rate": 4.521198892775203e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 357374208, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 7.255278310940499, | |
| "grad_norm": 0.04860702529549599, | |
| "learning_rate": 4.425403352658591e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 358315392, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 7.27447216890595, | |
| "grad_norm": 0.05813557654619217, | |
| "learning_rate": 4.330534936359873e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 359280384, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 7.293666026871401, | |
| "grad_norm": 0.08444052934646606, | |
| "learning_rate": 4.236597918799709e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 360221440, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 7.312859884836852, | |
| "grad_norm": 0.09866166114807129, | |
| "learning_rate": 4.143596532928468e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 361175936, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 7.332053742802303, | |
| "grad_norm": 0.039118677377700806, | |
| "learning_rate": 4.051534969535472e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 362113280, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 7.3512476007677545, | |
| "grad_norm": 0.08323455601930618, | |
| "learning_rate": 3.960417377060152e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 363056512, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 7.370441458733206, | |
| "grad_norm": 0.03620649501681328, | |
| "learning_rate": 3.8702478614051355e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 364026752, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 7.389635316698657, | |
| "grad_norm": 0.06456654518842697, | |
| "learning_rate": 3.7810304857511914e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 364979456, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 7.408829174664108, | |
| "grad_norm": 0.057450417429208755, | |
| "learning_rate": 3.6927692703741634e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 365919488, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 7.428023032629558, | |
| "grad_norm": 0.10976872593164444, | |
| "learning_rate": 3.605468192463815e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 366871552, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 7.447216890595009, | |
| "grad_norm": 0.04592859372496605, | |
| "learning_rate": 3.5191311859445796e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 367824768, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 7.4664107485604605, | |
| "grad_norm": 0.08413061499595642, | |
| "learning_rate": 3.4337621412983274e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 368776704, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 7.485604606525912, | |
| "grad_norm": 0.06421375274658203, | |
| "learning_rate": 3.3493649053890326e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 369726848, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 7.504798464491363, | |
| "grad_norm": 0.04606764018535614, | |
| "learning_rate": 3.2659432812894296e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 370669184, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 7.523992322456814, | |
| "grad_norm": 0.14935247600078583, | |
| "learning_rate": 3.183501028109642e-06, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 371619072, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 7.543186180422265, | |
| "grad_norm": 0.03758896514773369, | |
| "learning_rate": 3.1020418608278035e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 372591104, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 7.562380038387716, | |
| "grad_norm": 0.08971104770898819, | |
| "learning_rate": 3.0215694501226384e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 373540352, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 7.581573896353167, | |
| "grad_norm": 0.049786727875471115, | |
| "learning_rate": 2.942087422208051e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 374494336, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 7.600767754318618, | |
| "grad_norm": 0.14331580698490143, | |
| "learning_rate": 2.8635993586697553e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 375443968, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 7.619961612284069, | |
| "grad_norm": 0.1280643343925476, | |
| "learning_rate": 2.7861087963038435e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 376398848, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 7.63915547024952, | |
| "grad_norm": 0.07448034733533859, | |
| "learning_rate": 2.70961922695743e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 377363072, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 7.658349328214971, | |
| "grad_norm": 0.047854091972112656, | |
| "learning_rate": 2.6341340973713187e-06, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 378285440, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 7.677543186180422, | |
| "grad_norm": 0.05751164257526398, | |
| "learning_rate": 2.5596568090246548e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 379227776, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 7.696737044145873, | |
| "grad_norm": 0.09036415070295334, | |
| "learning_rate": 2.486190717981665e-06, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 380168064, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 7.7159309021113245, | |
| "grad_norm": 0.0800870880484581, | |
| "learning_rate": 2.4137391347404476e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 381124736, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 7.735124760076776, | |
| "grad_norm": 0.055397044867277145, | |
| "learning_rate": 2.3423053240837515e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 382078592, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 7.754318618042227, | |
| "grad_norm": 0.06471221148967743, | |
| "learning_rate": 2.271892504931905e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 383012224, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 7.773512476007678, | |
| "grad_norm": 0.05552973225712776, | |
| "learning_rate": 2.2025038501977486e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 383955328, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 7.792706333973129, | |
| "grad_norm": 0.05302416905760765, | |
| "learning_rate": 2.1341424866436364e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 384918528, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 7.811900191938579, | |
| "grad_norm": 0.04085630923509598, | |
| "learning_rate": 2.0668114947405726e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 385865472, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 7.8310940499040305, | |
| "grad_norm": 0.11027319729328156, | |
| "learning_rate": 2.0005139085293945e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 386815488, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 7.850287907869482, | |
| "grad_norm": 0.10259139537811279, | |
| "learning_rate": 1.9352527154840345e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 387746176, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 7.869481765834933, | |
| "grad_norm": 0.08809423446655273, | |
| "learning_rate": 1.8710308563769124e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 388682752, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 7.888675623800384, | |
| "grad_norm": 0.04966364800930023, | |
| "learning_rate": 1.8078512251464286e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 389629056, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 7.907869481765835, | |
| "grad_norm": 0.17464539408683777, | |
| "learning_rate": 1.7457166687665449e-06, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 390563584, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 7.927063339731286, | |
| "grad_norm": 0.04215671867132187, | |
| "learning_rate": 1.684629987118494e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 391511808, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 7.946257197696737, | |
| "grad_norm": 0.07717634737491608, | |
| "learning_rate": 1.624593932864632e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 392460032, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 7.9654510556621885, | |
| "grad_norm": 0.04429319128394127, | |
| "learning_rate": 1.5656112113243721e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 393403264, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 7.984644913627639, | |
| "grad_norm": 0.06878451257944107, | |
| "learning_rate": 1.5076844803522922e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 394339584, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 8.00383877159309, | |
| "grad_norm": 0.05545974150300026, | |
| "learning_rate": 1.4508163502183786e-06, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 395291264, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 8.023032629558541, | |
| "grad_norm": 0.07342655211687088, | |
| "learning_rate": 1.3950093834903866e-06, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 396246272, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 8.042226487523992, | |
| "grad_norm": 0.030074596405029297, | |
| "learning_rate": 1.340266094918366e-06, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 397189376, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 8.061420345489443, | |
| "grad_norm": 0.0349307544529438, | |
| "learning_rate": 1.286588951321363e-06, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 398128000, | |
| "step": 2100 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2340, | |
| "num_input_tokens_seen": 398128000, | |
| "num_train_epochs": 9, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.201849892590846e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
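
The table above is the raw `trainer_state.json` state written by the Hugging Face `Trainer` for this run: `log_history` holds one record every `logging_steps = 5` optimizer steps, and the trailing fields record the run configuration. From the file's own numbers, 2100 steps at epoch 8.061 works out to about 260.5 steps per epoch, so `max_steps = 2340` ends the run just short of the configured `num_train_epochs = 9`. Below is a minimal sketch for reading such a file and plotting the loss curve; the `trainer_state.json` path and the `matplotlib` dependency are assumptions for illustration, not part of the log itself.

```python
# Minimal sketch: load a trainer_state.json-style log and plot loss vs. step.
# "trainer_state.json" is an assumed path; matplotlib is an assumed dependency.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only records that carry a training loss (eval records would not).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"training loss over {state['epoch']:.2f} epochs")
plt.show()
```

Plotted this way, the log shows a clear step change in training loss at each epoch boundary (around steps 1305, 1570, and 1825 in this section), a common pattern when the same training data is revisited epoch after epoch.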