{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014295925661186561, "grad_norm": 16.642337799072266, "learning_rate": 1.9742857142857144e-05, "loss": 4.2448, "mean_token_accuracy": 0.44755197104532274, "num_tokens": 63714.0, "step": 10 }, { "epoch": 0.028591851322373123, "grad_norm": 12.869135856628418, "learning_rate": 1.945714285714286e-05, "loss": 1.9287, "mean_token_accuracy": 0.5766903940588236, "num_tokens": 128528.0, "step": 20 }, { "epoch": 0.04288777698355969, "grad_norm": 18.376684188842773, "learning_rate": 1.9171428571428573e-05, "loss": 1.6956, "mean_token_accuracy": 0.597195016592741, "num_tokens": 191200.0, "step": 30 }, { "epoch": 0.057183702644746245, "grad_norm": 17.71656036376953, "learning_rate": 1.888571428571429e-05, "loss": 1.6076, "mean_token_accuracy": 0.6067132025957107, "num_tokens": 255728.0, "step": 40 }, { "epoch": 0.07147962830593281, "grad_norm": 21.026283264160156, "learning_rate": 1.86e-05, "loss": 1.5728, "mean_token_accuracy": 0.612850959226489, "num_tokens": 319058.0, "step": 50 }, { "epoch": 0.08577555396711938, "grad_norm": 46.10198974609375, "learning_rate": 1.8314285714285714e-05, "loss": 1.5977, "mean_token_accuracy": 0.6111391615122557, "num_tokens": 384900.0, "step": 60 }, { "epoch": 0.10007147962830593, "grad_norm": 14.742942810058594, "learning_rate": 1.802857142857143e-05, "loss": 1.5649, "mean_token_accuracy": 0.6097237385809422, "num_tokens": 450346.0, "step": 70 }, { "epoch": 0.11436740528949249, "grad_norm": 43.62748718261719, "learning_rate": 1.7742857142857143e-05, "loss": 1.5184, "mean_token_accuracy": 0.6210372049361468, "num_tokens": 515018.0, "step": 80 }, { "epoch": 0.12866333095067906, "grad_norm": 15.469511032104492, "learning_rate": 1.745714285714286e-05, "loss": 1.4736, "mean_token_accuracy": 0.6270900748670101, "num_tokens": 576955.0, "step": 90 }, { "epoch": 0.14295925661186562, "grad_norm": 19.448793411254883, "learning_rate": 1.717142857142857e-05, "loss": 1.4637, "mean_token_accuracy": 0.6368957210332156, "num_tokens": 641295.0, "step": 100 }, { "epoch": 0.15725518227305219, "grad_norm": 37.31778335571289, "learning_rate": 1.6885714285714288e-05, "loss": 1.5303, "mean_token_accuracy": 0.6210926879197359, "num_tokens": 706683.0, "step": 110 }, { "epoch": 0.17155110793423875, "grad_norm": 9.722342491149902, "learning_rate": 1.66e-05, "loss": 1.4596, "mean_token_accuracy": 0.6298462159931659, "num_tokens": 771285.0, "step": 120 }, { "epoch": 0.18584703359542531, "grad_norm": 9.656769752502441, "learning_rate": 1.6314285714285716e-05, "loss": 1.5281, "mean_token_accuracy": 0.6251190695911646, "num_tokens": 840678.0, "step": 130 }, { "epoch": 0.20014295925661185, "grad_norm": 9.608354568481445, "learning_rate": 1.602857142857143e-05, "loss": 1.4438, "mean_token_accuracy": 0.6370445918291807, "num_tokens": 905832.0, "step": 140 }, { "epoch": 0.21443888491779842, "grad_norm": 9.842904090881348, "learning_rate": 1.5742857142857145e-05, "loss": 1.5379, "mean_token_accuracy": 0.6172758720815181, "num_tokens": 972946.0, "step": 150 }, { "epoch": 0.22873481057898498, "grad_norm": 18.17994499206543, "learning_rate": 1.545714285714286e-05, "loss": 1.4322, "mean_token_accuracy": 0.6351331725716591, "num_tokens": 1034427.0, "step": 160 }, { "epoch": 0.24303073624017155, "grad_norm": 8.876994132995605, "learning_rate": 1.5171428571428572e-05, "loss": 1.4343, "mean_token_accuracy": 0.6313620086759328, "num_tokens": 1101359.0, "step": 170 }, { "epoch": 0.2573266619013581, "grad_norm": 10.895979881286621, "learning_rate": 1.4885714285714288e-05, "loss": 1.4537, "mean_token_accuracy": 0.633945481479168, "num_tokens": 1166538.0, "step": 180 }, { "epoch": 0.27162258756254465, "grad_norm": 12.30453872680664, "learning_rate": 1.46e-05, "loss": 1.5363, "mean_token_accuracy": 0.6255367647856473, "num_tokens": 1235118.0, "step": 190 }, { "epoch": 0.28591851322373124, "grad_norm": 10.28065299987793, "learning_rate": 1.4314285714285717e-05, "loss": 1.4199, "mean_token_accuracy": 0.635765865072608, "num_tokens": 1300601.0, "step": 200 }, { "epoch": 0.3002144388849178, "grad_norm": 11.893675804138184, "learning_rate": 1.402857142857143e-05, "loss": 1.3881, "mean_token_accuracy": 0.6449477795511485, "num_tokens": 1362732.0, "step": 210 }, { "epoch": 0.31451036454610437, "grad_norm": 11.485602378845215, "learning_rate": 1.3742857142857144e-05, "loss": 1.3647, "mean_token_accuracy": 0.6486451178789139, "num_tokens": 1424780.0, "step": 220 }, { "epoch": 0.3288062902072909, "grad_norm": 8.882689476013184, "learning_rate": 1.3457142857142858e-05, "loss": 1.3915, "mean_token_accuracy": 0.6436454936861992, "num_tokens": 1490808.0, "step": 230 }, { "epoch": 0.3431022158684775, "grad_norm": 18.272981643676758, "learning_rate": 1.3171428571428573e-05, "loss": 1.4796, "mean_token_accuracy": 0.6283955980092287, "num_tokens": 1556933.0, "step": 240 }, { "epoch": 0.35739814152966404, "grad_norm": 11.947668075561523, "learning_rate": 1.2885714285714285e-05, "loss": 1.4398, "mean_token_accuracy": 0.638329004868865, "num_tokens": 1621053.0, "step": 250 }, { "epoch": 0.37169406719085063, "grad_norm": 23.547773361206055, "learning_rate": 1.2600000000000001e-05, "loss": 1.3801, "mean_token_accuracy": 0.6448216594755649, "num_tokens": 1686747.0, "step": 260 }, { "epoch": 0.38598999285203717, "grad_norm": 16.82425880432129, "learning_rate": 1.2314285714285716e-05, "loss": 1.3891, "mean_token_accuracy": 0.6475608512759209, "num_tokens": 1751946.0, "step": 270 }, { "epoch": 0.4002859185132237, "grad_norm": 11.931357383728027, "learning_rate": 1.202857142857143e-05, "loss": 1.3768, "mean_token_accuracy": 0.6439446356147528, "num_tokens": 1816123.0, "step": 280 }, { "epoch": 0.4145818441744103, "grad_norm": 14.375319480895996, "learning_rate": 1.1742857142857144e-05, "loss": 1.315, "mean_token_accuracy": 0.6517576463520527, "num_tokens": 1879227.0, "step": 290 }, { "epoch": 0.42887776983559683, "grad_norm": 10.699817657470703, "learning_rate": 1.1457142857142857e-05, "loss": 1.3519, "mean_token_accuracy": 0.6487406313419342, "num_tokens": 1944238.0, "step": 300 }, { "epoch": 0.4431736954967834, "grad_norm": 12.067941665649414, "learning_rate": 1.1171428571428573e-05, "loss": 1.2784, "mean_token_accuracy": 0.6627866499125957, "num_tokens": 2007629.0, "step": 310 }, { "epoch": 0.45746962115796996, "grad_norm": 15.550559997558594, "learning_rate": 1.0885714285714286e-05, "loss": 1.3495, "mean_token_accuracy": 0.6514371998608113, "num_tokens": 2076666.0, "step": 320 }, { "epoch": 0.47176554681915656, "grad_norm": 30.000173568725586, "learning_rate": 1.0600000000000002e-05, "loss": 1.3358, "mean_token_accuracy": 0.6507035464048385, "num_tokens": 2140860.0, "step": 330 }, { "epoch": 0.4860614724803431, "grad_norm": 7.962319850921631, "learning_rate": 1.0314285714285715e-05, "loss": 1.3231, "mean_token_accuracy": 0.6570919144898653, "num_tokens": 2204773.0, "step": 340 }, { "epoch": 0.5003573981415297, "grad_norm": 24.023008346557617, "learning_rate": 1.002857142857143e-05, "loss": 1.3936, "mean_token_accuracy": 0.6455658808350563, "num_tokens": 2270923.0, "step": 350 }, { "epoch": 0.5146533238027162, "grad_norm": 8.74783706665039, "learning_rate": 9.742857142857143e-06, "loss": 1.3383, "mean_token_accuracy": 0.6552599217742682, "num_tokens": 2337009.0, "step": 360 }, { "epoch": 0.5289492494639028, "grad_norm": 17.01344108581543, "learning_rate": 9.457142857142858e-06, "loss": 1.3524, "mean_token_accuracy": 0.6488417606800795, "num_tokens": 2405973.0, "step": 370 }, { "epoch": 0.5432451751250893, "grad_norm": 9.353411674499512, "learning_rate": 9.171428571428572e-06, "loss": 1.2638, "mean_token_accuracy": 0.6636200629174709, "num_tokens": 2469824.0, "step": 380 }, { "epoch": 0.557541100786276, "grad_norm": 13.265799522399902, "learning_rate": 8.885714285714286e-06, "loss": 1.2254, "mean_token_accuracy": 0.6689145911484957, "num_tokens": 2535167.0, "step": 390 }, { "epoch": 0.5718370264474625, "grad_norm": 19.46824836730957, "learning_rate": 8.6e-06, "loss": 1.3844, "mean_token_accuracy": 0.6483918268233537, "num_tokens": 2607759.0, "step": 400 }, { "epoch": 0.586132952108649, "grad_norm": 15.773782730102539, "learning_rate": 8.314285714285715e-06, "loss": 1.2708, "mean_token_accuracy": 0.6655503377318382, "num_tokens": 2670580.0, "step": 410 }, { "epoch": 0.6004288777698356, "grad_norm": 8.917901039123535, "learning_rate": 8.02857142857143e-06, "loss": 1.2726, "mean_token_accuracy": 0.6589578501880169, "num_tokens": 2737272.0, "step": 420 }, { "epoch": 0.6147248034310222, "grad_norm": 8.988587379455566, "learning_rate": 7.742857142857144e-06, "loss": 1.221, "mean_token_accuracy": 0.664219357818365, "num_tokens": 2803875.0, "step": 430 }, { "epoch": 0.6290207290922087, "grad_norm": 12.661059379577637, "learning_rate": 7.457142857142857e-06, "loss": 1.2658, "mean_token_accuracy": 0.662236025184393, "num_tokens": 2869457.0, "step": 440 }, { "epoch": 0.6433166547533953, "grad_norm": 8.545147895812988, "learning_rate": 7.1714285714285725e-06, "loss": 1.2778, "mean_token_accuracy": 0.6622273363173008, "num_tokens": 2931790.0, "step": 450 }, { "epoch": 0.6576125804145818, "grad_norm": 20.769514083862305, "learning_rate": 6.885714285714287e-06, "loss": 1.2951, "mean_token_accuracy": 0.6606701787561178, "num_tokens": 2997229.0, "step": 460 }, { "epoch": 0.6719085060757684, "grad_norm": 12.466110229492188, "learning_rate": 6.600000000000001e-06, "loss": 1.1754, "mean_token_accuracy": 0.6822692640125751, "num_tokens": 3063485.0, "step": 470 }, { "epoch": 0.686204431736955, "grad_norm": 8.45051383972168, "learning_rate": 6.314285714285715e-06, "loss": 1.2102, "mean_token_accuracy": 0.6759132348001003, "num_tokens": 3127984.0, "step": 480 }, { "epoch": 0.7005003573981415, "grad_norm": 12.029594421386719, "learning_rate": 6.028571428571429e-06, "loss": 1.3355, "mean_token_accuracy": 0.6649406619369984, "num_tokens": 3194219.0, "step": 490 }, { "epoch": 0.7147962830593281, "grad_norm": 8.824553489685059, "learning_rate": 5.742857142857143e-06, "loss": 1.2317, "mean_token_accuracy": 0.6705160938203335, "num_tokens": 3259068.0, "step": 500 }, { "epoch": 0.7290922087205146, "grad_norm": 16.150766372680664, "learning_rate": 5.457142857142858e-06, "loss": 1.1558, "mean_token_accuracy": 0.6850677601993084, "num_tokens": 3324070.0, "step": 510 }, { "epoch": 0.7433881343817013, "grad_norm": 7.721499919891357, "learning_rate": 5.171428571428571e-06, "loss": 1.168, "mean_token_accuracy": 0.6747931383550168, "num_tokens": 3386885.0, "step": 520 }, { "epoch": 0.7576840600428878, "grad_norm": 9.311972618103027, "learning_rate": 4.885714285714286e-06, "loss": 1.1645, "mean_token_accuracy": 0.6775478422641754, "num_tokens": 3448602.0, "step": 530 }, { "epoch": 0.7719799857040743, "grad_norm": 9.636552810668945, "learning_rate": 4.600000000000001e-06, "loss": 1.2542, "mean_token_accuracy": 0.6680241461843253, "num_tokens": 3516481.0, "step": 540 }, { "epoch": 0.7862759113652609, "grad_norm": 36.31599044799805, "learning_rate": 4.314285714285714e-06, "loss": 1.1866, "mean_token_accuracy": 0.6768352195620537, "num_tokens": 3580217.0, "step": 550 }, { "epoch": 0.8005718370264474, "grad_norm": 7.471230506896973, "learning_rate": 4.028571428571429e-06, "loss": 1.1705, "mean_token_accuracy": 0.6818295098841191, "num_tokens": 3643021.0, "step": 560 }, { "epoch": 0.8148677626876341, "grad_norm": 48.099830627441406, "learning_rate": 3.742857142857143e-06, "loss": 1.1602, "mean_token_accuracy": 0.6852999441325665, "num_tokens": 3710116.0, "step": 570 }, { "epoch": 0.8291636883488206, "grad_norm": 13.096914291381836, "learning_rate": 3.4571428571428574e-06, "loss": 1.1942, "mean_token_accuracy": 0.6752621583640576, "num_tokens": 3775926.0, "step": 580 }, { "epoch": 0.8434596140100071, "grad_norm": 11.580378532409668, "learning_rate": 3.1714285714285714e-06, "loss": 1.1277, "mean_token_accuracy": 0.6849311918020249, "num_tokens": 3840218.0, "step": 590 }, { "epoch": 0.8577555396711937, "grad_norm": 9.58252239227295, "learning_rate": 2.885714285714286e-06, "loss": 1.187, "mean_token_accuracy": 0.6740429483354091, "num_tokens": 3904300.0, "step": 600 }, { "epoch": 0.8720514653323803, "grad_norm": 9.778560638427734, "learning_rate": 2.6e-06, "loss": 1.2088, "mean_token_accuracy": 0.6759266927838326, "num_tokens": 3970409.0, "step": 610 }, { "epoch": 0.8863473909935669, "grad_norm": 9.931038856506348, "learning_rate": 2.3142857142857145e-06, "loss": 1.1766, "mean_token_accuracy": 0.6766778022050858, "num_tokens": 4038742.0, "step": 620 }, { "epoch": 0.9006433166547534, "grad_norm": 7.126023769378662, "learning_rate": 2.028571428571429e-06, "loss": 1.0968, "mean_token_accuracy": 0.6913008309900761, "num_tokens": 4103374.0, "step": 630 }, { "epoch": 0.9149392423159399, "grad_norm": 7.73612642288208, "learning_rate": 1.7428571428571432e-06, "loss": 1.1254, "mean_token_accuracy": 0.6863209947943687, "num_tokens": 4170239.0, "step": 640 }, { "epoch": 0.9292351679771265, "grad_norm": 6.532904148101807, "learning_rate": 1.4571428571428573e-06, "loss": 1.1586, "mean_token_accuracy": 0.6804635964334012, "num_tokens": 4237810.0, "step": 650 }, { "epoch": 0.9435310936383131, "grad_norm": 7.370081901550293, "learning_rate": 1.1714285714285715e-06, "loss": 1.174, "mean_token_accuracy": 0.6809860028326511, "num_tokens": 4302937.0, "step": 660 }, { "epoch": 0.9578270192994996, "grad_norm": 7.471885681152344, "learning_rate": 8.857142857142857e-07, "loss": 1.1755, "mean_token_accuracy": 0.6858656518161297, "num_tokens": 4368704.0, "step": 670 }, { "epoch": 0.9721229449606862, "grad_norm": 9.739863395690918, "learning_rate": 6.000000000000001e-07, "loss": 1.1052, "mean_token_accuracy": 0.6904201626777648, "num_tokens": 4431384.0, "step": 680 }, { "epoch": 0.9864188706218727, "grad_norm": 11.182050704956055, "learning_rate": 3.1428571428571433e-07, "loss": 1.1422, "mean_token_accuracy": 0.688240597397089, "num_tokens": 4500182.0, "step": 690 }, { "epoch": 1.0, "grad_norm": 11.066879272460938, "learning_rate": 2.8571428571428575e-08, "loss": 1.131, "mean_token_accuracy": 0.6868848518321389, "num_tokens": 4559091.0, "step": 700 } ], "logging_steps": 10, "max_steps": 700, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4791381278720.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }