{
  "best_metric": 2.4261486530303955,
  "best_model_checkpoint": "./output/training_results/C016_Meta-Llama-3-8B_pretrain_20240721_092214/checkpoint-11088",
  "epoch": 4.0,
  "eval_steps": 1232,
  "global_step": 12316,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003247807729782397,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 2.6721,
      "step": 1
    },
    {
      "epoch": 0.20006495615459566,
      "grad_norm": 1.9328745806142353,
      "learning_rate": 1.9805194805194805e-06,
      "loss": 2.592,
      "step": 616
    },
    {
      "epoch": 0.4001299123091913,
      "grad_norm": 2.0226924668887496,
      "learning_rate": 2.245175689219919e-06,
      "loss": 2.5057,
      "step": 1232
    },
    {
      "epoch": 0.4001299123091913,
      "eval_loss": 2.4827427864074707,
      "eval_runtime": 252.9711,
      "eval_samples_per_second": 86.543,
      "eval_steps_per_second": 0.68,
      "step": 1232
    },
    {
      "epoch": 0.6001948684637869,
      "grad_norm": 1.8512197610266496,
      "learning_rate": 1.2232016471327423e-06,
      "loss": 2.4683,
      "step": 1848
    },
    {
      "epoch": 0.8002598246183826,
      "grad_norm": 1.989781044650789,
      "learning_rate": 6.53049308175953e-07,
      "loss": 2.444,
      "step": 2464
    },
    {
      "epoch": 0.8002598246183826,
      "eval_loss": 2.440027952194214,
      "eval_runtime": 251.3004,
      "eval_samples_per_second": 87.119,
      "eval_steps_per_second": 0.684,
      "step": 2464
    },
    {
      "epoch": 1.0003247807729783,
      "grad_norm": 1.8956736763179278,
      "learning_rate": 3.466185770829244e-07,
      "loss": 2.4358,
      "step": 3080
    },
    {
      "epoch": 1.200389736927574,
      "grad_norm": 1.9111043304334017,
      "learning_rate": 1.8910445197889315e-07,
      "loss": 2.3648,
      "step": 3696
    },
    {
      "epoch": 1.200389736927574,
      "eval_loss": 2.431915760040283,
      "eval_runtime": 251.1577,
      "eval_samples_per_second": 87.168,
      "eval_steps_per_second": 0.685,
      "step": 3696
    },
    {
      "epoch": 1.4004546930821695,
      "grad_norm": 1.884967762986231,
      "learning_rate": 1.1168237259086467e-07,
      "loss": 2.3663,
      "step": 4312
    },
    {
      "epoch": 1.600519649236765,
      "grad_norm": 1.9873340014256546,
      "learning_rate": 7.563133304849047e-08,
      "loss": 2.372,
      "step": 4928
    },
    {
      "epoch": 1.600519649236765,
      "eval_loss": 2.4293837547302246,
      "eval_runtime": 251.386,
      "eval_samples_per_second": 87.089,
      "eval_steps_per_second": 0.684,
      "step": 4928
    },
    {
      "epoch": 1.8005846053913608,
      "grad_norm": 1.9124289441200373,
      "learning_rate": 5.98689431836726e-08,
      "loss": 2.3684,
      "step": 5544
    },
    {
      "epoch": 2.0006495615459565,
      "grad_norm": 1.9479679838446426,
      "learning_rate": 5.346405476547749e-08,
      "loss": 2.3667,
      "step": 6160
    },
    {
      "epoch": 2.0006495615459565,
      "eval_loss": 2.4281327724456787,
      "eval_runtime": 251.4248,
      "eval_samples_per_second": 87.076,
      "eval_steps_per_second": 0.684,
      "step": 6160
    },
    {
      "epoch": 2.2007145177005523,
      "grad_norm": 1.9022764488275758,
      "learning_rate": 5.109115615383696e-08,
      "loss": 2.3568,
      "step": 6776
    },
    {
      "epoch": 2.400779473855148,
      "grad_norm": 1.885664698080457,
      "learning_rate": 5.0300090028337e-08,
      "loss": 2.3573,
      "step": 7392
    },
    {
      "epoch": 2.400779473855148,
      "eval_loss": 2.4281272888183594,
      "eval_runtime": 250.732,
      "eval_samples_per_second": 87.316,
      "eval_steps_per_second": 0.686,
      "step": 7392
    },
    {
      "epoch": 2.6008444300097433,
      "grad_norm": 1.9790266878041507,
      "learning_rate": 5.006932020966859e-08,
      "loss": 2.3533,
      "step": 8008
    },
    {
      "epoch": 2.800909386164339,
      "grad_norm": 1.9987792342132904,
      "learning_rate": 5.0012816199435985e-08,
      "loss": 2.3603,
      "step": 8624
    },
    {
      "epoch": 2.800909386164339,
      "eval_loss": 2.4273290634155273,
      "eval_runtime": 251.237,
      "eval_samples_per_second": 87.141,
      "eval_steps_per_second": 0.685,
      "step": 8624
    },
    {
      "epoch": 3.000974342318935,
      "grad_norm": 1.892342522891005,
      "learning_rate": 5.0001737227175665e-08,
      "loss": 2.3575,
      "step": 9240
    },
    {
      "epoch": 3.2010392984735305,
      "grad_norm": 1.945849916471603,
      "learning_rate": 5.0000151498505686e-08,
      "loss": 2.3522,
      "step": 9856
    },
    {
      "epoch": 3.2010392984735305,
      "eval_loss": 2.4268627166748047,
      "eval_runtime": 251.3708,
      "eval_samples_per_second": 87.094,
      "eval_steps_per_second": 0.684,
      "step": 9856
    },
    {
      "epoch": 3.401104254628126,
      "grad_norm": 1.9950133710544578,
      "learning_rate": 5.0000006522774745e-08,
      "loss": 2.3532,
      "step": 10472
    },
    {
      "epoch": 3.6011692107827216,
      "grad_norm": 1.9873010505050475,
      "learning_rate": 5.000000007975414e-08,
      "loss": 2.353,
      "step": 11088
    },
    {
      "epoch": 3.6011692107827216,
      "eval_loss": 2.4261486530303955,
      "eval_runtime": 252.6429,
      "eval_samples_per_second": 86.656,
      "eval_steps_per_second": 0.681,
      "step": 11088
    },
    {
      "epoch": 3.8012341669373173,
      "grad_norm": 1.998167327050962,
      "learning_rate": 5.0000000000044414e-08,
      "loss": 2.3517,
      "step": 11704
    },
    {
      "epoch": 4.0,
      "step": 12316,
      "total_flos": 1287266123120640.0,
      "train_loss": 2.391189053524001,
      "train_runtime": 40307.7885,
      "train_samples_per_second": 19.553,
      "train_steps_per_second": 0.306
    }
  ],
  "logging_steps": 616,
  "max_steps": 12316,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 1232,
  "total_flos": 1287266123120640.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}