| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 10665, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.015002344116268168, | |
| "grad_norm": 21.375, | |
| "learning_rate": 2.905342080599813e-07, | |
| "loss": 0.9809, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.030004688232536336, | |
| "grad_norm": 22.75, | |
| "learning_rate": 5.904404873477039e-07, | |
| "loss": 0.9194, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.0450070323488045, | |
| "grad_norm": 21.875, | |
| "learning_rate": 8.903467666354265e-07, | |
| "loss": 0.8532, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.06000937646507267, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.1902530459231491e-06, | |
| "loss": 0.875, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.07501172058134084, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 1.4901593252108717e-06, | |
| "loss": 0.7929, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.090014064697609, | |
| "grad_norm": 28.125, | |
| "learning_rate": 1.7900656044985943e-06, | |
| "loss": 0.7529, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.10501640881387717, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 2.089971883786317e-06, | |
| "loss": 0.7193, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.12001875293014534, | |
| "grad_norm": 22.875, | |
| "learning_rate": 2.3898781630740394e-06, | |
| "loss": 0.6668, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.1350210970464135, | |
| "grad_norm": 20.25, | |
| "learning_rate": 2.689784442361762e-06, | |
| "loss": 0.6509, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.15002344116268168, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 2.9896907216494846e-06, | |
| "loss": 0.6039, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.16502578527894984, | |
| "grad_norm": 14.875, | |
| "learning_rate": 3.2895970009372076e-06, | |
| "loss": 0.5509, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.180028129395218, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 3.58950328022493e-06, | |
| "loss": 0.5364, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.19503047351148617, | |
| "grad_norm": 24.5, | |
| "learning_rate": 3.889409559512652e-06, | |
| "loss": 0.5291, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.21003281762775433, | |
| "grad_norm": 20.125, | |
| "learning_rate": 4.189315838800375e-06, | |
| "loss": 0.5017, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.2250351617440225, | |
| "grad_norm": 20.75, | |
| "learning_rate": 4.489222118088098e-06, | |
| "loss": 0.5199, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.24003750586029068, | |
| "grad_norm": 10.5, | |
| "learning_rate": 4.789128397375821e-06, | |
| "loss": 0.5377, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.2550398499765588, | |
| "grad_norm": 20.0, | |
| "learning_rate": 5.0890346766635435e-06, | |
| "loss": 0.4862, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.270042194092827, | |
| "grad_norm": 13.625, | |
| "learning_rate": 5.388940955951266e-06, | |
| "loss": 0.5037, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.28504453820909514, | |
| "grad_norm": 13.5, | |
| "learning_rate": 5.688847235238988e-06, | |
| "loss": 0.4857, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.30004688232536336, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 5.98875351452671e-06, | |
| "loss": 0.4565, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3150492264416315, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 6.288659793814433e-06, | |
| "loss": 0.4413, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.3300515705578997, | |
| "grad_norm": 10.25, | |
| "learning_rate": 6.588566073102156e-06, | |
| "loss": 0.4185, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.34505391467416785, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 6.888472352389879e-06, | |
| "loss": 0.4353, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.360056258790436, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 7.1883786316776015e-06, | |
| "loss": 0.3606, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.3750586029067042, | |
| "grad_norm": 16.75, | |
| "learning_rate": 7.488284910965324e-06, | |
| "loss": 0.4126, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.39006094702297234, | |
| "grad_norm": 13.5, | |
| "learning_rate": 7.788191190253046e-06, | |
| "loss": 0.4052, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.4050632911392405, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8.08809746954077e-06, | |
| "loss": 0.383, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 0.42006563525550866, | |
| "grad_norm": 14.8125, | |
| "learning_rate": 8.388003748828491e-06, | |
| "loss": 0.3557, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.4350679793717768, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8.687910028116214e-06, | |
| "loss": 0.3837, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 0.450070323488045, | |
| "grad_norm": 11.625, | |
| "learning_rate": 8.987816307403938e-06, | |
| "loss": 0.3317, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.46507266760431315, | |
| "grad_norm": 12.25, | |
| "learning_rate": 9.28772258669166e-06, | |
| "loss": 0.3331, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 0.48007501172058137, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 9.587628865979383e-06, | |
| "loss": 0.3751, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.49507735583684953, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 9.887535145267105e-06, | |
| "loss": 0.3289, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 0.5100796999531176, | |
| "grad_norm": 10.875, | |
| "learning_rate": 9.999892863685326e-06, | |
| "loss": 0.3429, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 0.5250820440693859, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 9.999275773410506e-06, | |
| "loss": 0.3186, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.540084388185654, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 9.998110227713216e-06, | |
| "loss": 0.3941, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 0.5550867323019222, | |
| "grad_norm": 25.375, | |
| "learning_rate": 9.996396354461945e-06, | |
| "loss": 0.3848, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 0.5700890764181903, | |
| "grad_norm": 13.125, | |
| "learning_rate": 9.994134341680546e-06, | |
| "loss": 0.348, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 0.5850914205344585, | |
| "grad_norm": 16.5, | |
| "learning_rate": 9.991324437527599e-06, | |
| "loss": 0.3511, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 0.6000937646507267, | |
| "grad_norm": 12.0, | |
| "learning_rate": 9.987966950269184e-06, | |
| "loss": 0.3655, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.6150961087669948, | |
| "grad_norm": 15.0, | |
| "learning_rate": 9.984062248245078e-06, | |
| "loss": 0.3657, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 0.630098452883263, | |
| "grad_norm": 14.375, | |
| "learning_rate": 9.979610759828324e-06, | |
| "loss": 0.298, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 0.6451007969995312, | |
| "grad_norm": 16.125, | |
| "learning_rate": 9.974612973378252e-06, | |
| "loss": 0.3793, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 0.6601031411157994, | |
| "grad_norm": 13.0, | |
| "learning_rate": 9.969069437186899e-06, | |
| "loss": 0.3425, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 0.6751054852320675, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 9.962980759418844e-06, | |
| "loss": 0.3424, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.6901078293483357, | |
| "grad_norm": 15.25, | |
| "learning_rate": 9.956347608044512e-06, | |
| "loss": 0.357, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 0.7051101734646038, | |
| "grad_norm": 23.75, | |
| "learning_rate": 9.949170710766875e-06, | |
| "loss": 0.3335, | |
| "step": 1504 | |
| }, | |
| { | |
| "epoch": 0.720112517580872, | |
| "grad_norm": 10.75, | |
| "learning_rate": 9.94145085494162e-06, | |
| "loss": 0.3344, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.7351148616971401, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 9.933188887490784e-06, | |
| "loss": 0.3206, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 0.7501172058134083, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 9.924385714809818e-06, | |
| "loss": 0.3673, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7651195499296765, | |
| "grad_norm": 8.75, | |
| "learning_rate": 9.91504230266817e-06, | |
| "loss": 0.3392, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 0.7801218940459447, | |
| "grad_norm": 18.625, | |
| "learning_rate": 9.905159676103322e-06, | |
| "loss": 0.3022, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 0.7951242381622129, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 9.89473891930834e-06, | |
| "loss": 0.3361, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 0.810126582278481, | |
| "grad_norm": 12.5, | |
| "learning_rate": 9.88378117551293e-06, | |
| "loss": 0.3299, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 0.8251289263947492, | |
| "grad_norm": 14.0, | |
| "learning_rate": 9.872287646858015e-06, | |
| "loss": 0.3304, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.8401312705110173, | |
| "grad_norm": 14.25, | |
| "learning_rate": 9.860259594263858e-06, | |
| "loss": 0.3219, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.8551336146272855, | |
| "grad_norm": 13.375, | |
| "learning_rate": 9.847698337291725e-06, | |
| "loss": 0.2956, | |
| "step": 1824 | |
| }, | |
| { | |
| "epoch": 0.8701359587435537, | |
| "grad_norm": 10.625, | |
| "learning_rate": 9.834605253999119e-06, | |
| "loss": 0.3375, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 0.8851383028598219, | |
| "grad_norm": 12.0, | |
| "learning_rate": 9.820981780788604e-06, | |
| "loss": 0.3464, | |
| "step": 1888 | |
| }, | |
| { | |
| "epoch": 0.90014064697609, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 9.806829412250215e-06, | |
| "loss": 0.3522, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.9151429910923582, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 9.792149700997492e-06, | |
| "loss": 0.328, | |
| "step": 1952 | |
| }, | |
| { | |
| "epoch": 0.9301453352086263, | |
| "grad_norm": 13.625, | |
| "learning_rate": 9.776944257497157e-06, | |
| "loss": 0.3549, | |
| "step": 1984 | |
| }, | |
| { | |
| "epoch": 0.9451476793248945, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 9.761214749892411e-06, | |
| "loss": 0.3402, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 0.9601500234411627, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 9.74496290381996e-06, | |
| "loss": 0.3362, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.9751523675574308, | |
| "grad_norm": 9.875, | |
| "learning_rate": 9.728190502220673e-06, | |
| "loss": 0.3825, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.9901547116736991, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 9.710899385143993e-06, | |
| "loss": 0.3536, | |
| "step": 2112 | |
| }, | |
| { | |
| "epoch": 1.0051570557899672, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 9.693091449546068e-06, | |
| "loss": 0.2984, | |
| "step": 2144 | |
| }, | |
| { | |
| "epoch": 1.0201593999062353, | |
| "grad_norm": 14.375, | |
| "learning_rate": 9.674768649081647e-06, | |
| "loss": 0.2614, | |
| "step": 2176 | |
| }, | |
| { | |
| "epoch": 1.0351617440225036, | |
| "grad_norm": 16.5, | |
| "learning_rate": 9.655932993889742e-06, | |
| "loss": 0.282, | |
| "step": 2208 | |
| }, | |
| { | |
| "epoch": 1.0501640881387717, | |
| "grad_norm": 12.25, | |
| "learning_rate": 9.636586550373105e-06, | |
| "loss": 0.2256, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.0651664322550398, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 9.616731440971536e-06, | |
| "loss": 0.2698, | |
| "step": 2272 | |
| }, | |
| { | |
| "epoch": 1.080168776371308, | |
| "grad_norm": 18.625, | |
| "learning_rate": 9.596369843929022e-06, | |
| "loss": 0.2477, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 1.0951711204875763, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 9.575503993054787e-06, | |
| "loss": 0.2369, | |
| "step": 2336 | |
| }, | |
| { | |
| "epoch": 1.1101734646038444, | |
| "grad_norm": 9.625, | |
| "learning_rate": 9.554136177478206e-06, | |
| "loss": 0.2443, | |
| "step": 2368 | |
| }, | |
| { | |
| "epoch": 1.1251758087201125, | |
| "grad_norm": 13.625, | |
| "learning_rate": 9.532268741397692e-06, | |
| "loss": 0.2789, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.1401781528363806, | |
| "grad_norm": 10.5, | |
| "learning_rate": 9.50990408382351e-06, | |
| "loss": 0.2721, | |
| "step": 2432 | |
| }, | |
| { | |
| "epoch": 1.155180496952649, | |
| "grad_norm": 8.75, | |
| "learning_rate": 9.487044658314585e-06, | |
| "loss": 0.2372, | |
| "step": 2464 | |
| }, | |
| { | |
| "epoch": 1.170182841068917, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 9.463692972709349e-06, | |
| "loss": 0.259, | |
| "step": 2496 | |
| }, | |
| { | |
| "epoch": 1.1851851851851851, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 9.439851588850586e-06, | |
| "loss": 0.2918, | |
| "step": 2528 | |
| }, | |
| { | |
| "epoch": 1.2001875293014534, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 9.4155231223044e-06, | |
| "loss": 0.2392, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.2151898734177216, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 9.390710242073265e-06, | |
| "loss": 0.2973, | |
| "step": 2592 | |
| }, | |
| { | |
| "epoch": 1.2301922175339897, | |
| "grad_norm": 6.5, | |
| "learning_rate": 9.365415670303214e-06, | |
| "loss": 0.2763, | |
| "step": 2624 | |
| }, | |
| { | |
| "epoch": 1.2451945616502578, | |
| "grad_norm": 11.375, | |
| "learning_rate": 9.339642181985196e-06, | |
| "loss": 0.2845, | |
| "step": 2656 | |
| }, | |
| { | |
| "epoch": 1.260196905766526, | |
| "grad_norm": 11.75, | |
| "learning_rate": 9.313392604650655e-06, | |
| "loss": 0.2532, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 1.2751992498827942, | |
| "grad_norm": 9.375, | |
| "learning_rate": 9.286669818061316e-06, | |
| "loss": 0.2647, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.2902015939990623, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 9.259476753893258e-06, | |
| "loss": 0.256, | |
| "step": 2752 | |
| }, | |
| { | |
| "epoch": 1.3052039381153304, | |
| "grad_norm": 10.0, | |
| "learning_rate": 9.231816395415294e-06, | |
| "loss": 0.2596, | |
| "step": 2784 | |
| }, | |
| { | |
| "epoch": 1.3202062822315987, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 9.20369177716168e-06, | |
| "loss": 0.2589, | |
| "step": 2816 | |
| }, | |
| { | |
| "epoch": 1.3352086263478669, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 9.17510598459921e-06, | |
| "loss": 0.2753, | |
| "step": 2848 | |
| }, | |
| { | |
| "epoch": 1.350210970464135, | |
| "grad_norm": 6.5, | |
| "learning_rate": 9.146062153788716e-06, | |
| "loss": 0.2512, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.3652133145804033, | |
| "grad_norm": 9.75, | |
| "learning_rate": 9.116563471041018e-06, | |
| "loss": 0.252, | |
| "step": 2912 | |
| }, | |
| { | |
| "epoch": 1.3802156586966714, | |
| "grad_norm": 18.75, | |
| "learning_rate": 9.086613172567368e-06, | |
| "loss": 0.2238, | |
| "step": 2944 | |
| }, | |
| { | |
| "epoch": 1.3952180028129395, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 9.056214544124414e-06, | |
| "loss": 0.2635, | |
| "step": 2976 | |
| }, | |
| { | |
| "epoch": 1.4102203469292076, | |
| "grad_norm": 10.125, | |
| "learning_rate": 9.025370920653723e-06, | |
| "loss": 0.2525, | |
| "step": 3008 | |
| }, | |
| { | |
| "epoch": 1.4252226910454757, | |
| "grad_norm": 11.25, | |
| "learning_rate": 8.994085685915934e-06, | |
| "loss": 0.2493, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.440225035161744, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8.962362272119504e-06, | |
| "loss": 0.2551, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 1.4552273792780122, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 8.930204159544208e-06, | |
| "loss": 0.2573, | |
| "step": 3104 | |
| }, | |
| { | |
| "epoch": 1.4702297233942803, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 8.89761487615929e-06, | |
| "loss": 0.2544, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 1.4852320675105486, | |
| "grad_norm": 15.125, | |
| "learning_rate": 8.864597997236454e-06, | |
| "loss": 0.2696, | |
| "step": 3168 | |
| }, | |
| { | |
| "epoch": 1.5002344116268167, | |
| "grad_norm": 18.875, | |
| "learning_rate": 8.831157144957612e-06, | |
| "loss": 0.243, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.5152367557430848, | |
| "grad_norm": 7.625, | |
| "learning_rate": 8.797295988017506e-06, | |
| "loss": 0.2689, | |
| "step": 3232 | |
| }, | |
| { | |
| "epoch": 1.5302390998593531, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.763018241221241e-06, | |
| "loss": 0.2405, | |
| "step": 3264 | |
| }, | |
| { | |
| "epoch": 1.5452414439756212, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 8.728327665076726e-06, | |
| "loss": 0.291, | |
| "step": 3296 | |
| }, | |
| { | |
| "epoch": 1.5602437880918893, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 8.693228065382131e-06, | |
| "loss": 0.2358, | |
| "step": 3328 | |
| }, | |
| { | |
| "epoch": 1.5752461322081577, | |
| "grad_norm": 15.375, | |
| "learning_rate": 8.657723292808365e-06, | |
| "loss": 0.2577, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.5902484763244256, | |
| "grad_norm": 14.25, | |
| "learning_rate": 8.621817242476626e-06, | |
| "loss": 0.2358, | |
| "step": 3392 | |
| }, | |
| { | |
| "epoch": 1.605250820440694, | |
| "grad_norm": 20.75, | |
| "learning_rate": 8.58551385353108e-06, | |
| "loss": 0.2879, | |
| "step": 3424 | |
| }, | |
| { | |
| "epoch": 1.620253164556962, | |
| "grad_norm": 12.125, | |
| "learning_rate": 8.548817108706714e-06, | |
| "loss": 0.2549, | |
| "step": 3456 | |
| }, | |
| { | |
| "epoch": 1.63525550867323, | |
| "grad_norm": 19.75, | |
| "learning_rate": 8.511731033892397e-06, | |
| "loss": 0.2679, | |
| "step": 3488 | |
| }, | |
| { | |
| "epoch": 1.6502578527894984, | |
| "grad_norm": 7.875, | |
| "learning_rate": 8.474259697689211e-06, | |
| "loss": 0.3122, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.6652601969057665, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 8.436407210964101e-06, | |
| "loss": 0.284, | |
| "step": 3552 | |
| }, | |
| { | |
| "epoch": 1.6802625410220347, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.398177726398887e-06, | |
| "loss": 0.2833, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 1.695264885138303, | |
| "grad_norm": 14.875, | |
| "learning_rate": 8.359575438034671e-06, | |
| "loss": 0.2927, | |
| "step": 3616 | |
| }, | |
| { | |
| "epoch": 1.7102672292545709, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 8.320604580811744e-06, | |
| "loss": 0.2367, | |
| "step": 3648 | |
| }, | |
| { | |
| "epoch": 1.7252695733708392, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8.281269430104965e-06, | |
| "loss": 0.2772, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.7402719174871075, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8.241574301254733e-06, | |
| "loss": 0.2424, | |
| "step": 3712 | |
| }, | |
| { | |
| "epoch": 1.7552742616033754, | |
| "grad_norm": 15.25, | |
| "learning_rate": 8.201523549093552e-06, | |
| "loss": 0.2595, | |
| "step": 3744 | |
| }, | |
| { | |
| "epoch": 1.7702766057196437, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 8.161121567468298e-06, | |
| "loss": 0.2717, | |
| "step": 3776 | |
| }, | |
| { | |
| "epoch": 1.7852789498359118, | |
| "grad_norm": 10.0, | |
| "learning_rate": 8.120372788758152e-06, | |
| "loss": 0.2582, | |
| "step": 3808 | |
| }, | |
| { | |
| "epoch": 1.80028129395218, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 8.079281683388368e-06, | |
| "loss": 0.2521, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.8152836380684483, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8.037852759339814e-06, | |
| "loss": 0.252, | |
| "step": 3872 | |
| }, | |
| { | |
| "epoch": 1.8302859821847164, | |
| "grad_norm": 11.875, | |
| "learning_rate": 7.99609056165443e-06, | |
| "loss": 0.2604, | |
| "step": 3904 | |
| }, | |
| { | |
| "epoch": 1.8452883263009845, | |
| "grad_norm": 15.625, | |
| "learning_rate": 7.953999671936591e-06, | |
| "loss": 0.2656, | |
| "step": 3936 | |
| }, | |
| { | |
| "epoch": 1.8602906704172528, | |
| "grad_norm": 11.875, | |
| "learning_rate": 7.911584707850487e-06, | |
| "loss": 0.2208, | |
| "step": 3968 | |
| }, | |
| { | |
| "epoch": 1.8752930145335207, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.868850322613525e-06, | |
| "loss": 0.2632, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.890295358649789, | |
| "grad_norm": 12.5, | |
| "learning_rate": 7.825801204485837e-06, | |
| "loss": 0.2528, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 1.9052977027660571, | |
| "grad_norm": 9.75, | |
| "learning_rate": 7.782442076255952e-06, | |
| "loss": 0.2539, | |
| "step": 4064 | |
| }, | |
| { | |
| "epoch": 1.9203000468823253, | |
| "grad_norm": 13.625, | |
| "learning_rate": 7.738777694722666e-06, | |
| "loss": 0.2846, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 1.9353023909985936, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 7.694812850173197e-06, | |
| "loss": 0.2536, | |
| "step": 4128 | |
| }, | |
| { | |
| "epoch": 1.9503047351148617, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 7.650552365857648e-06, | |
| "loss": 0.2522, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.9653070792311298, | |
| "grad_norm": 17.625, | |
| "learning_rate": 7.606001097459865e-06, | |
| "loss": 0.2744, | |
| "step": 4192 | |
| }, | |
| { | |
| "epoch": 1.9803094233473981, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.561163932564739e-06, | |
| "loss": 0.2725, | |
| "step": 4224 | |
| }, | |
| { | |
| "epoch": 1.9953117674636662, | |
| "grad_norm": 6.75, | |
| "learning_rate": 7.516045790122e-06, | |
| "loss": 0.2576, | |
| "step": 4256 | |
| }, | |
| { | |
| "epoch": 2.0103141115799343, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 7.470651619906574e-06, | |
| "loss": 0.1846, | |
| "step": 4288 | |
| }, | |
| { | |
| "epoch": 2.0253164556962027, | |
| "grad_norm": 13.5, | |
| "learning_rate": 7.424986401975561e-06, | |
| "loss": 0.2504, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.0403187998124706, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 7.379055146121884e-06, | |
| "loss": 0.1835, | |
| "step": 4352 | |
| }, | |
| { | |
| "epoch": 2.055321143928739, | |
| "grad_norm": 13.0, | |
| "learning_rate": 7.332862891324681e-06, | |
| "loss": 0.2298, | |
| "step": 4384 | |
| }, | |
| { | |
| "epoch": 2.070323488045007, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 7.286414705196499e-06, | |
| "loss": 0.2158, | |
| "step": 4416 | |
| }, | |
| { | |
| "epoch": 2.085325832161275, | |
| "grad_norm": 13.625, | |
| "learning_rate": 7.2397156834273295e-06, | |
| "loss": 0.2305, | |
| "step": 4448 | |
| }, | |
| { | |
| "epoch": 2.1003281762775434, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 7.192770949225591e-06, | |
| "loss": 0.1828, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 2.1153305203938118, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 7.1455856527560666e-06, | |
| "loss": 0.172, | |
| "step": 4512 | |
| }, | |
| { | |
| "epoch": 2.1303328645100796, | |
| "grad_norm": 14.25, | |
| "learning_rate": 7.0981649705748955e-06, | |
| "loss": 0.164, | |
| "step": 4544 | |
| }, | |
| { | |
| "epoch": 2.145335208626348, | |
| "grad_norm": 13.5, | |
| "learning_rate": 7.050514105061679e-06, | |
| "loss": 0.1857, | |
| "step": 4576 | |
| }, | |
| { | |
| "epoch": 2.160337552742616, | |
| "grad_norm": 7.375, | |
| "learning_rate": 7.002638283848726e-06, | |
| "loss": 0.1909, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 2.175339896858884, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 6.95454275924756e-06, | |
| "loss": 0.1902, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 2.1903422409751525, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 6.906232807672699e-06, | |
| "loss": 0.1778, | |
| "step": 4672 | |
| }, | |
| { | |
| "epoch": 2.2053445850914204, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 6.857713729062794e-06, | |
| "loss": 0.1802, | |
| "step": 4704 | |
| }, | |
| { | |
| "epoch": 2.2203469292076887, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 6.80899084629919e-06, | |
| "loss": 0.2209, | |
| "step": 4736 | |
| }, | |
| { | |
| "epoch": 2.235349273323957, | |
| "grad_norm": 15.625, | |
| "learning_rate": 6.760069504621971e-06, | |
| "loss": 0.2697, | |
| "step": 4768 | |
| }, | |
| { | |
| "epoch": 2.250351617440225, | |
| "grad_norm": 9.125, | |
| "learning_rate": 6.710955071043547e-06, | |
| "loss": 0.1916, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.2653539615564933, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 6.661652933759856e-06, | |
| "loss": 0.1851, | |
| "step": 4832 | |
| }, | |
| { | |
| "epoch": 2.280356305672761, | |
| "grad_norm": 18.125, | |
| "learning_rate": 6.612168501559242e-06, | |
| "loss": 0.2051, | |
| "step": 4864 | |
| }, | |
| { | |
| "epoch": 2.2953586497890295, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 6.5625072032290735e-06, | |
| "loss": 0.176, | |
| "step": 4896 | |
| }, | |
| { | |
| "epoch": 2.310360993905298, | |
| "grad_norm": 16.25, | |
| "learning_rate": 6.512674486960166e-06, | |
| "loss": 0.1753, | |
| "step": 4928 | |
| }, | |
| { | |
| "epoch": 2.3253633380215657, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 6.462675819749082e-06, | |
| "loss": 0.1666, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 2.340365682137834, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 6.412516686798354e-06, | |
| "loss": 0.1841, | |
| "step": 4992 | |
| }, | |
| { | |
| "epoch": 2.3553680262541024, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 6.362202590914728e-06, | |
| "loss": 0.2007, | |
| "step": 5024 | |
| }, | |
| { | |
| "epoch": 2.3703703703703702, | |
| "grad_norm": 19.5, | |
| "learning_rate": 6.311739051905468e-06, | |
| "loss": 0.1642, | |
| "step": 5056 | |
| }, | |
| { | |
| "epoch": 2.3853727144866386, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 6.261131605972785e-06, | |
| "loss": 0.1976, | |
| "step": 5088 | |
| }, | |
| { | |
| "epoch": 2.400375058602907, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 6.2103858051064915e-06, | |
| "loss": 0.2102, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 2.415377402719175, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 6.159507216474891e-06, | |
| "loss": 0.2282, | |
| "step": 5152 | |
| }, | |
| { | |
| "epoch": 2.430379746835443, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 6.108501421814039e-06, | |
| "loss": 0.2119, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 2.4453820909517114, | |
| "grad_norm": 15.125, | |
| "learning_rate": 6.057374016815376e-06, | |
| "loss": 0.2176, | |
| "step": 5216 | |
| }, | |
| { | |
| "epoch": 2.4603844350679793, | |
| "grad_norm": 13.125, | |
| "learning_rate": 6.0061306105118474e-06, | |
| "loss": 0.1872, | |
| "step": 5248 | |
| }, | |
| { | |
| "epoch": 2.4753867791842477, | |
| "grad_norm": 11.25, | |
| "learning_rate": 5.954776824662547e-06, | |
| "loss": 0.1978, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 2.4903891233005155, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 5.90331829313598e-06, | |
| "loss": 0.1864, | |
| "step": 5312 | |
| }, | |
| { | |
| "epoch": 2.505391467416784, | |
| "grad_norm": 6.625, | |
| "learning_rate": 5.851760661291977e-06, | |
| "loss": 0.2036, | |
| "step": 5344 | |
| }, | |
| { | |
| "epoch": 2.520393811533052, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 5.80010958536237e-06, | |
| "loss": 0.202, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 2.53539615564932, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 5.748370731830456e-06, | |
| "loss": 0.2186, | |
| "step": 5408 | |
| }, | |
| { | |
| "epoch": 2.5503984997655884, | |
| "grad_norm": 17.75, | |
| "learning_rate": 5.696549776809346e-06, | |
| "loss": 0.1919, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 2.5654008438818563, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 5.6446524054192605e-06, | |
| "loss": 0.2007, | |
| "step": 5472 | |
| }, | |
| { | |
| "epoch": 2.5804031879981246, | |
| "grad_norm": 15.9375, | |
| "learning_rate": 5.592684311163827e-06, | |
| "loss": 0.2096, | |
| "step": 5504 | |
| }, | |
| { | |
| "epoch": 2.595405532114393, | |
| "grad_norm": 11.25, | |
| "learning_rate": 5.540651195305464e-06, | |
| "loss": 0.2196, | |
| "step": 5536 | |
| }, | |
| { | |
| "epoch": 2.610407876230661, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 5.488558766239916e-06, | |
| "loss": 0.2207, | |
| "step": 5568 | |
| }, | |
| { | |
| "epoch": 2.625410220346929, | |
| "grad_norm": 19.75, | |
| "learning_rate": 5.436412738869995e-06, | |
| "loss": 0.1945, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.6404125644631975, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 5.384218833978626e-06, | |
| "loss": 0.1896, | |
| "step": 5632 | |
| }, | |
| { | |
| "epoch": 2.6554149085794654, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 5.331982777601228e-06, | |
| "loss": 0.2217, | |
| "step": 5664 | |
| }, | |
| { | |
| "epoch": 2.6704172526957337, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 5.279710300397537e-06, | |
| "loss": 0.1987, | |
| "step": 5696 | |
| }, | |
| { | |
| "epoch": 2.685419596812002, | |
| "grad_norm": 7.59375, | |
| "learning_rate": 5.227407137022902e-06, | |
| "loss": 0.232, | |
| "step": 5728 | |
| }, | |
| { | |
| "epoch": 2.70042194092827, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 5.175079025499163e-06, | |
| "loss": 0.1845, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 2.7154242850445383, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 5.1227317065851445e-06, | |
| "loss": 0.1973, | |
| "step": 5792 | |
| }, | |
| { | |
| "epoch": 2.7304266291608066, | |
| "grad_norm": 10.875, | |
| "learning_rate": 5.070370923146855e-06, | |
| "loss": 0.1819, | |
| "step": 5824 | |
| }, | |
| { | |
| "epoch": 2.7454289732770745, | |
| "grad_norm": 16.875, | |
| "learning_rate": 5.0180024195274555e-06, | |
| "loss": 0.1741, | |
| "step": 5856 | |
| }, | |
| { | |
| "epoch": 2.760431317393343, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 4.965631940917068e-06, | |
| "loss": 0.179, | |
| "step": 5888 | |
| }, | |
| { | |
| "epoch": 2.775433661509611, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 4.91326523272248e-06, | |
| "loss": 0.1901, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 2.790436005625879, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 4.860908039936839e-06, | |
| "loss": 0.2238, | |
| "step": 5952 | |
| }, | |
| { | |
| "epoch": 2.8054383497421473, | |
| "grad_norm": 8.375, | |
| "learning_rate": 4.80856610650939e-06, | |
| "loss": 0.1826, | |
| "step": 5984 | |
| }, | |
| { | |
| "epoch": 2.8204406938584152, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 4.756245174715315e-06, | |
| "loss": 0.2012, | |
| "step": 6016 | |
| }, | |
| { | |
| "epoch": 2.8354430379746836, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 4.703950984525774e-06, | |
| "loss": 0.2342, | |
| "step": 6048 | |
| }, | |
| { | |
| "epoch": 2.8504453820909514, | |
| "grad_norm": 10.0, | |
| "learning_rate": 4.6516892729781815e-06, | |
| "loss": 0.1805, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.8654477262072198, | |
| "grad_norm": 17.5, | |
| "learning_rate": 4.599465773546822e-06, | |
| "loss": 0.1987, | |
| "step": 6112 | |
| }, | |
| { | |
| "epoch": 2.880450070323488, | |
| "grad_norm": 16.625, | |
| "learning_rate": 4.547286215513846e-06, | |
| "loss": 0.1936, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 2.895452414439756, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.495156323340724e-06, | |
| "loss": 0.2244, | |
| "step": 6176 | |
| }, | |
| { | |
| "epoch": 2.9104547585560243, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 4.443081816040233e-06, | |
| "loss": 0.2456, | |
| "step": 6208 | |
| }, | |
| { | |
| "epoch": 2.9254571026722926, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 4.391068406549049e-06, | |
| "loss": 0.2125, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.9404594467885605, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 4.339121801100982e-06, | |
| "loss": 0.2383, | |
| "step": 6272 | |
| }, | |
| { | |
| "epoch": 2.955461790904829, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 4.287247698600987e-06, | |
| "loss": 0.1784, | |
| "step": 6304 | |
| }, | |
| { | |
| "epoch": 2.970464135021097, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 4.235451789999928e-06, | |
| "loss": 0.183, | |
| "step": 6336 | |
| }, | |
| { | |
| "epoch": 2.985466479137365, | |
| "grad_norm": 13.0, | |
| "learning_rate": 4.1837397576702576e-06, | |
| "loss": 0.2395, | |
| "step": 6368 | |
| }, | |
| { | |
| "epoch": 3.0004688232536334, | |
| "grad_norm": 14.375, | |
| "learning_rate": 4.132117274782616e-06, | |
| "loss": 0.2184, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 3.0154711673699017, | |
| "grad_norm": 13.0, | |
| "learning_rate": 4.0805900046834405e-06, | |
| "loss": 0.1968, | |
| "step": 6432 | |
| }, | |
| { | |
| "epoch": 3.0304735114861696, | |
| "grad_norm": 6.6875, | |
| "learning_rate": 4.0291636002736725e-06, | |
| "loss": 0.1868, | |
| "step": 6464 | |
| }, | |
| { | |
| "epoch": 3.045475855602438, | |
| "grad_norm": 14.875, | |
| "learning_rate": 3.977843703388572e-06, | |
| "loss": 0.1928, | |
| "step": 6496 | |
| }, | |
| { | |
| "epoch": 3.0604781997187063, | |
| "grad_norm": 18.375, | |
| "learning_rate": 3.926635944178788e-06, | |
| "loss": 0.1874, | |
| "step": 6528 | |
| }, | |
| { | |
| "epoch": 3.075480543834974, | |
| "grad_norm": 12.25, | |
| "learning_rate": 3.875545940492681e-06, | |
| "loss": 0.1743, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 3.0904828879512425, | |
| "grad_norm": 17.0, | |
| "learning_rate": 3.824579297260006e-06, | |
| "loss": 0.183, | |
| "step": 6592 | |
| }, | |
| { | |
| "epoch": 3.1054852320675104, | |
| "grad_norm": 15.3125, | |
| "learning_rate": 3.773741605877026e-06, | |
| "loss": 0.2052, | |
| "step": 6624 | |
| }, | |
| { | |
| "epoch": 3.1204875761837787, | |
| "grad_norm": 11.375, | |
| "learning_rate": 3.7230384435930785e-06, | |
| "loss": 0.1794, | |
| "step": 6656 | |
| }, | |
| { | |
| "epoch": 3.135489920300047, | |
| "grad_norm": 12.625, | |
| "learning_rate": 3.6724753728987206e-06, | |
| "loss": 0.1562, | |
| "step": 6688 | |
| }, | |
| { | |
| "epoch": 3.150492264416315, | |
| "grad_norm": 12.25, | |
| "learning_rate": 3.6220579409154888e-06, | |
| "loss": 0.1605, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 3.1654946085325832, | |
| "grad_norm": 10.125, | |
| "learning_rate": 3.571791678787332e-06, | |
| "loss": 0.2082, | |
| "step": 6752 | |
| }, | |
| { | |
| "epoch": 3.1804969526488516, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 3.521682101073818e-06, | |
| "loss": 0.1501, | |
| "step": 6784 | |
| }, | |
| { | |
| "epoch": 3.1954992967651195, | |
| "grad_norm": 18.0, | |
| "learning_rate": 3.471734705145138e-06, | |
| "loss": 0.1697, | |
| "step": 6816 | |
| }, | |
| { | |
| "epoch": 3.210501640881388, | |
| "grad_norm": 15.5, | |
| "learning_rate": 3.421954970579008e-06, | |
| "loss": 0.2038, | |
| "step": 6848 | |
| }, | |
| { | |
| "epoch": 3.2255039849976557, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.3723483585595256e-06, | |
| "loss": 0.1683, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 3.240506329113924, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 3.3229203112780382e-06, | |
| "loss": 0.2224, | |
| "step": 6912 | |
| }, | |
| { | |
| "epoch": 3.2555086732301923, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 3.2736762513360963e-06, | |
| "loss": 0.1734, | |
| "step": 6944 | |
| }, | |
| { | |
| "epoch": 3.27051101734646, | |
| "grad_norm": 17.625, | |
| "learning_rate": 3.224621581150553e-06, | |
| "loss": 0.1558, | |
| "step": 6976 | |
| }, | |
| { | |
| "epoch": 3.2855133614627285, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 3.175761682360885e-06, | |
| "loss": 0.1752, | |
| "step": 7008 | |
| }, | |
| { | |
| "epoch": 3.300515705578997, | |
| "grad_norm": 18.375, | |
| "learning_rate": 3.1271019152387917e-06, | |
| "loss": 0.1734, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 3.3155180496952648, | |
| "grad_norm": 16.875, | |
| "learning_rate": 3.0786476181001263e-06, | |
| "loss": 0.1868, | |
| "step": 7072 | |
| }, | |
| { | |
| "epoch": 3.330520393811533, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 3.030404106719259e-06, | |
| "loss": 0.1976, | |
| "step": 7104 | |
| }, | |
| { | |
| "epoch": 3.3455227379278014, | |
| "grad_norm": 16.25, | |
| "learning_rate": 2.982376673745887e-06, | |
| "loss": 0.2065, | |
| "step": 7136 | |
| }, | |
| { | |
| "epoch": 3.3605250820440693, | |
| "grad_norm": 6.25, | |
| "learning_rate": 2.934570588124399e-06, | |
| "loss": 0.1526, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 3.3755274261603376, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 2.8869910945158407e-06, | |
| "loss": 0.1765, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.390529770276606, | |
| "grad_norm": 7.375, | |
| "learning_rate": 2.839643412722525e-06, | |
| "loss": 0.1942, | |
| "step": 7232 | |
| }, | |
| { | |
| "epoch": 3.405532114392874, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 2.7925327371153998e-06, | |
| "loss": 0.1577, | |
| "step": 7264 | |
| }, | |
| { | |
| "epoch": 3.420534458509142, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 2.7456642360641772e-06, | |
| "loss": 0.2023, | |
| "step": 7296 | |
| }, | |
| { | |
| "epoch": 3.43553680262541, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 2.6990430513703316e-06, | |
| "loss": 0.2057, | |
| "step": 7328 | |
| }, | |
| { | |
| "epoch": 3.4505391467416784, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 2.6526742977030084e-06, | |
| "loss": 0.1727, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 3.4655414908579467, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 2.6065630620379062e-06, | |
| "loss": 0.1804, | |
| "step": 7392 | |
| }, | |
| { | |
| "epoch": 3.4805438349742146, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 2.5607144030992093e-06, | |
| "loss": 0.1589, | |
| "step": 7424 | |
| }, | |
| { | |
| "epoch": 3.495546179090483, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 2.515133350804598e-06, | |
| "loss": 0.173, | |
| "step": 7456 | |
| }, | |
| { | |
| "epoch": 3.510548523206751, | |
| "grad_norm": 9.375, | |
| "learning_rate": 2.4698249057134377e-06, | |
| "loss": 0.1657, | |
| "step": 7488 | |
| }, | |
| { | |
| "epoch": 3.525550867323019, | |
| "grad_norm": 19.125, | |
| "learning_rate": 2.4247940384781834e-06, | |
| "loss": 0.1833, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 3.5405532114392875, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 2.38004568929906e-06, | |
| "loss": 0.1743, | |
| "step": 7552 | |
| }, | |
| { | |
| "epoch": 3.5555555555555554, | |
| "grad_norm": 12.25, | |
| "learning_rate": 2.335584767382098e-06, | |
| "loss": 0.1886, | |
| "step": 7584 | |
| }, | |
| { | |
| "epoch": 3.5705578996718237, | |
| "grad_norm": 8.625, | |
| "learning_rate": 2.291416150400547e-06, | |
| "loss": 0.1891, | |
| "step": 7616 | |
| }, | |
| { | |
| "epoch": 3.585560243788092, | |
| "grad_norm": 11.375, | |
| "learning_rate": 2.247544683959767e-06, | |
| "loss": 0.1598, | |
| "step": 7648 | |
| }, | |
| { | |
| "epoch": 3.60056258790436, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 2.203975181065632e-06, | |
| "loss": 0.2002, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 3.6155649320206282, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 2.160712421596506e-06, | |
| "loss": 0.1845, | |
| "step": 7712 | |
| }, | |
| { | |
| "epoch": 3.6305672761368966, | |
| "grad_norm": 15.3125, | |
| "learning_rate": 2.1177611517788655e-06, | |
| "loss": 0.1477, | |
| "step": 7744 | |
| }, | |
| { | |
| "epoch": 3.6455696202531644, | |
| "grad_norm": 12.0, | |
| "learning_rate": 2.0751260836665947e-06, | |
| "loss": 0.1685, | |
| "step": 7776 | |
| }, | |
| { | |
| "epoch": 3.6605719643694328, | |
| "grad_norm": 7.75, | |
| "learning_rate": 2.0328118946240473e-06, | |
| "loss": 0.1954, | |
| "step": 7808 | |
| }, | |
| { | |
| "epoch": 3.675574308485701, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 1.9908232268129037e-06, | |
| "loss": 0.2078, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 3.690576652601969, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 1.9491646866828927e-06, | |
| "loss": 0.1719, | |
| "step": 7872 | |
| }, | |
| { | |
| "epoch": 3.7055789967182373, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.9078408444664417e-06, | |
| "loss": 0.1844, | |
| "step": 7904 | |
| }, | |
| { | |
| "epoch": 3.7205813408345056, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 1.8668562336772734e-06, | |
| "loss": 0.2377, | |
| "step": 7936 | |
| }, | |
| { | |
| "epoch": 3.7355836849507735, | |
| "grad_norm": 15.25, | |
| "learning_rate": 1.826215350613062e-06, | |
| "loss": 0.1571, | |
| "step": 7968 | |
| }, | |
| { | |
| "epoch": 3.750586029067042, | |
| "grad_norm": 9.625, | |
| "learning_rate": 1.7859226538621487e-06, | |
| "loss": 0.1891, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.7655883731833097, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 1.745982563814414e-06, | |
| "loss": 0.1671, | |
| "step": 8032 | |
| }, | |
| { | |
| "epoch": 3.780590717299578, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 1.7063994621763176e-06, | |
| "loss": 0.1584, | |
| "step": 8064 | |
| }, | |
| { | |
| "epoch": 3.795593061415846, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.6671776914902027e-06, | |
| "loss": 0.1865, | |
| "step": 8096 | |
| }, | |
| { | |
| "epoch": 3.8105954055321143, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.6283215546578862e-06, | |
| "loss": 0.2019, | |
| "step": 8128 | |
| }, | |
| { | |
| "epoch": 3.8255977496483826, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.5898353144686036e-06, | |
| "loss": 0.1802, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 3.8406000937646505, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.5517231931313454e-06, | |
| "loss": 0.1647, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 3.855602437880919, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 1.513989371811656e-06, | |
| "loss": 0.1205, | |
| "step": 8224 | |
| }, | |
| { | |
| "epoch": 3.870604781997187, | |
| "grad_norm": 9.5, | |
| "learning_rate": 1.4766379901729272e-06, | |
| "loss": 0.1919, | |
| "step": 8256 | |
| }, | |
| { | |
| "epoch": 3.885607126113455, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.4396731459222546e-06, | |
| "loss": 0.1581, | |
| "step": 8288 | |
| }, | |
| { | |
| "epoch": 3.9006094702297234, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.4030988943608826e-06, | |
| "loss": 0.2072, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 3.9156118143459917, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.3669192479393145e-06, | |
| "loss": 0.1677, | |
| "step": 8352 | |
| }, | |
| { | |
| "epoch": 3.9306141584622596, | |
| "grad_norm": 11.625, | |
| "learning_rate": 1.3311381758171165e-06, | |
| "loss": 0.1702, | |
| "step": 8384 | |
| }, | |
| { | |
| "epoch": 3.945616502578528, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 1.2957596034274732e-06, | |
| "loss": 0.1901, | |
| "step": 8416 | |
| }, | |
| { | |
| "epoch": 3.9606188466947962, | |
| "grad_norm": 11.25, | |
| "learning_rate": 1.2607874120465457e-06, | |
| "loss": 0.1804, | |
| "step": 8448 | |
| }, | |
| { | |
| "epoch": 3.975621190811064, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 1.2262254383676597e-06, | |
| "loss": 0.1606, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 3.9906235349273325, | |
| "grad_norm": 15.3125, | |
| "learning_rate": 1.192077474080398e-06, | |
| "loss": 0.1913, | |
| "step": 8512 | |
| }, | |
| { | |
| "epoch": 4.005625879043601, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.1583472654546257e-06, | |
| "loss": 0.1642, | |
| "step": 8544 | |
| }, | |
| { | |
| "epoch": 4.020628223159869, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 1.1250385129295005e-06, | |
| "loss": 0.1555, | |
| "step": 8576 | |
| }, | |
| { | |
| "epoch": 4.035630567276137, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1.0921548707075026e-06, | |
| "loss": 0.2104, | |
| "step": 8608 | |
| }, | |
| { | |
| "epoch": 4.050632911392405, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.059699946353549e-06, | |
| "loss": 0.1725, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 4.065635255508673, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 1.0276773003992157e-06, | |
| "loss": 0.1715, | |
| "step": 8672 | |
| }, | |
| { | |
| "epoch": 4.080637599624941, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 9.96090445952121e-07, | |
| "loss": 0.1721, | |
| "step": 8704 | |
| }, | |
| { | |
| "epoch": 4.09563994374121, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 9.649428483105204e-07, | |
| "loss": 0.1912, | |
| "step": 8736 | |
| }, | |
| { | |
| "epoch": 4.110642287857478, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 9.34237924583129e-07, | |
| "loss": 0.1479, | |
| "step": 8768 | |
| }, | |
| { | |
| "epoch": 4.125644631973746, | |
| "grad_norm": 14.25, | |
| "learning_rate": 9.039790433142481e-07, | |
| "loss": 0.2151, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.140646976090014, | |
| "grad_norm": 13.875, | |
| "learning_rate": 8.741695241142095e-07, | |
| "loss": 0.1768, | |
| "step": 8832 | |
| }, | |
| { | |
| "epoch": 4.155649320206282, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8.448126372951904e-07, | |
| "loss": 0.1616, | |
| "step": 8864 | |
| }, | |
| { | |
| "epoch": 4.17065166432255, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 8.159116035124431e-07, | |
| "loss": 0.1733, | |
| "step": 8896 | |
| }, | |
| { | |
| "epoch": 4.185654008438819, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 7.874695934109583e-07, | |
| "loss": 0.1801, | |
| "step": 8928 | |
| }, | |
| { | |
| "epoch": 4.200656352555087, | |
| "grad_norm": 12.375, | |
| "learning_rate": 7.594897272776275e-07, | |
| "loss": 0.1977, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 4.215658696671355, | |
| "grad_norm": 16.75, | |
| "learning_rate": 7.319750746989262e-07, | |
| "loss": 0.1982, | |
| "step": 8992 | |
| }, | |
| { | |
| "epoch": 4.2306610407876235, | |
| "grad_norm": 16.125, | |
| "learning_rate": 7.049286542241573e-07, | |
| "loss": 0.1527, | |
| "step": 9024 | |
| }, | |
| { | |
| "epoch": 4.245663384903891, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 6.783534330342984e-07, | |
| "loss": 0.1716, | |
| "step": 9056 | |
| }, | |
| { | |
| "epoch": 4.260665729020159, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 6.522523266164759e-07, | |
| "loss": 0.184, | |
| "step": 9088 | |
| }, | |
| { | |
| "epoch": 4.275668073136427, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 6.266281984441214e-07, | |
| "loss": 0.1396, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 4.290670417252696, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 6.014838596628225e-07, | |
| "loss": 0.1483, | |
| "step": 9152 | |
| }, | |
| { | |
| "epoch": 4.305672761368964, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 5.768220687819271e-07, | |
| "loss": 0.1754, | |
| "step": 9184 | |
| }, | |
| { | |
| "epoch": 4.320675105485232, | |
| "grad_norm": 11.125, | |
| "learning_rate": 5.526455313719126e-07, | |
| "loss": 0.1736, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 4.3356774496015005, | |
| "grad_norm": 18.125, | |
| "learning_rate": 5.289568997675643e-07, | |
| "loss": 0.1973, | |
| "step": 9248 | |
| }, | |
| { | |
| "epoch": 4.350679793717768, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 5.057587727769981e-07, | |
| "loss": 0.1786, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 4.365682137834036, | |
| "grad_norm": 6.6875, | |
| "learning_rate": 4.830536953965531e-07, | |
| "loss": 0.1549, | |
| "step": 9312 | |
| }, | |
| { | |
| "epoch": 4.380684481950305, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 4.6084415853158537e-07, | |
| "loss": 0.1411, | |
| "step": 9344 | |
| }, | |
| { | |
| "epoch": 4.395686826066573, | |
| "grad_norm": 26.125, | |
| "learning_rate": 4.391325987232037e-07, | |
| "loss": 0.2195, | |
| "step": 9376 | |
| }, | |
| { | |
| "epoch": 4.410689170182841, | |
| "grad_norm": 21.75, | |
| "learning_rate": 4.17921397880956e-07, | |
| "loss": 0.1898, | |
| "step": 9408 | |
| }, | |
| { | |
| "epoch": 4.42569151429911, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 3.9721288302152493e-07, | |
| "loss": 0.1947, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 4.4406938584153774, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 3.770093260134322e-07, | |
| "loss": 0.1821, | |
| "step": 9472 | |
| }, | |
| { | |
| "epoch": 4.455696202531645, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 3.573129433278011e-07, | |
| "loss": 0.2005, | |
| "step": 9504 | |
| }, | |
| { | |
| "epoch": 4.470698546647914, | |
| "grad_norm": 14.4375, | |
| "learning_rate": 3.381258957951983e-07, | |
| "loss": 0.1658, | |
| "step": 9536 | |
| }, | |
| { | |
| "epoch": 4.485700890764182, | |
| "grad_norm": 20.625, | |
| "learning_rate": 3.194502883685663e-07, | |
| "loss": 0.2, | |
| "step": 9568 | |
| }, | |
| { | |
| "epoch": 4.50070323488045, | |
| "grad_norm": 9.625, | |
| "learning_rate": 3.0128816989230315e-07, | |
| "loss": 0.1681, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.515705578996718, | |
| "grad_norm": 5.03125, | |
| "learning_rate": 2.836415328774872e-07, | |
| "loss": 0.1757, | |
| "step": 9632 | |
| }, | |
| { | |
| "epoch": 4.5307079231129865, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 2.665123132832842e-07, | |
| "loss": 0.1635, | |
| "step": 9664 | |
| }, | |
| { | |
| "epoch": 4.545710267229254, | |
| "grad_norm": 13.125, | |
| "learning_rate": 2.499023903045622e-07, | |
| "loss": 0.2215, | |
| "step": 9696 | |
| }, | |
| { | |
| "epoch": 4.560712611345522, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 2.3381358616572593e-07, | |
| "loss": 0.1952, | |
| "step": 9728 | |
| }, | |
| { | |
| "epoch": 4.575714955461791, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 2.1824766592080937e-07, | |
| "loss": 0.1618, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 4.590717299578059, | |
| "grad_norm": 20.625, | |
| "learning_rate": 2.0320633725983641e-07, | |
| "loss": 0.1983, | |
| "step": 9792 | |
| }, | |
| { | |
| "epoch": 4.605719643694327, | |
| "grad_norm": 14.25, | |
| "learning_rate": 1.8869125032147384e-07, | |
| "loss": 0.1992, | |
| "step": 9824 | |
| }, | |
| { | |
| "epoch": 4.620721987810596, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 1.747039975120035e-07, | |
| "loss": 0.1966, | |
| "step": 9856 | |
| }, | |
| { | |
| "epoch": 4.6357243319268635, | |
| "grad_norm": 15.125, | |
| "learning_rate": 1.6124611333062036e-07, | |
| "loss": 0.1652, | |
| "step": 9888 | |
| }, | |
| { | |
| "epoch": 4.650726676043131, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 1.4831907420108705e-07, | |
| "loss": 0.1459, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 4.6657290201594, | |
| "grad_norm": 11.125, | |
| "learning_rate": 1.3592429830976362e-07, | |
| "loss": 0.1759, | |
| "step": 9952 | |
| }, | |
| { | |
| "epoch": 4.680731364275668, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 1.2406314545001795e-07, | |
| "loss": 0.152, | |
| "step": 9984 | |
| }, | |
| { | |
| "epoch": 4.695733708391936, | |
| "grad_norm": 12.0, | |
| "learning_rate": 1.1273691687305299e-07, | |
| "loss": 0.1946, | |
| "step": 10016 | |
| }, | |
| { | |
| "epoch": 4.710736052508205, | |
| "grad_norm": 13.0, | |
| "learning_rate": 1.0194685514514302e-07, | |
| "loss": 0.1786, | |
| "step": 10048 | |
| }, | |
| { | |
| "epoch": 4.725738396624473, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 9.16941440113206e-08, | |
| "loss": 0.1709, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 4.7407407407407405, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 8.197990826551094e-08, | |
| "loss": 0.1554, | |
| "step": 10112 | |
| }, | |
| { | |
| "epoch": 4.755743084857009, | |
| "grad_norm": 14.125, | |
| "learning_rate": 7.280521362713122e-08, | |
| "loss": 0.1948, | |
| "step": 10144 | |
| }, | |
| { | |
| "epoch": 4.770745428973277, | |
| "grad_norm": 14.375, | |
| "learning_rate": 6.417106662417849e-08, | |
| "loss": 0.1493, | |
| "step": 10176 | |
| }, | |
| { | |
| "epoch": 4.785747773089545, | |
| "grad_norm": 13.0, | |
| "learning_rate": 5.607841448280194e-08, | |
| "loss": 0.1867, | |
| "step": 10208 | |
| }, | |
| { | |
| "epoch": 4.800750117205814, | |
| "grad_norm": 7.875, | |
| "learning_rate": 4.852814502338765e-08, | |
| "loss": 0.1832, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 4.815752461322082, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 4.1521086563159344e-08, | |
| "loss": 0.1793, | |
| "step": 10272 | |
| }, | |
| { | |
| "epoch": 4.83075480543835, | |
| "grad_norm": 8.125, | |
| "learning_rate": 3.5058007825303774e-08, | |
| "loss": 0.1727, | |
| "step": 10304 | |
| }, | |
| { | |
| "epoch": 4.845757149554618, | |
| "grad_norm": 9.75, | |
| "learning_rate": 2.9139617854639368e-08, | |
| "loss": 0.1745, | |
| "step": 10336 | |
| }, | |
| { | |
| "epoch": 4.860759493670886, | |
| "grad_norm": 15.3125, | |
| "learning_rate": 2.3766565939826734e-08, | |
| "loss": 0.1869, | |
| "step": 10368 | |
| }, | |
| { | |
| "epoch": 4.875761837787154, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 1.8939441542138448e-08, | |
| "loss": 0.1436, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 4.890764181903423, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 1.4658774230789653e-08, | |
| "loss": 0.1762, | |
| "step": 10432 | |
| }, | |
| { | |
| "epoch": 4.905766526019691, | |
| "grad_norm": 8.75, | |
| "learning_rate": 1.0925033624842874e-08, | |
| "loss": 0.1635, | |
| "step": 10464 | |
| }, | |
| { | |
| "epoch": 4.920768870135959, | |
| "grad_norm": 17.75, | |
| "learning_rate": 7.73862934168479e-09, | |
| "loss": 0.171, | |
| "step": 10496 | |
| }, | |
| { | |
| "epoch": 4.9357712142522265, | |
| "grad_norm": 18.875, | |
| "learning_rate": 5.099910952091059e-09, | |
| "loss": 0.1919, | |
| "step": 10528 | |
| }, | |
| { | |
| "epoch": 4.950773558368495, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 3.0091679418742248e-09, | |
| "loss": 0.171, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 4.965775902484763, | |
| "grad_norm": 13.0, | |
| "learning_rate": 1.4666296801252312e-09, | |
| "loss": 0.171, | |
| "step": 10592 | |
| }, | |
| { | |
| "epoch": 4.980778246601031, | |
| "grad_norm": 16.625, | |
| "learning_rate": 4.724653940513246e-10, | |
| "loss": 0.1232, | |
| "step": 10624 | |
| }, | |
| { | |
| "epoch": 4.9957805907173, | |
| "grad_norm": 8.75, | |
| "learning_rate": 2.6784150408132315e-11, | |
| "loss": 0.2079, | |
| "step": 10656 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 10665, | |
| "total_flos": 5.821442357117952e+16, | |
| "train_loss": 0.25197240614410416, | |
| "train_runtime": 2509.3883, | |
| "train_samples_per_second": 4.25, | |
| "train_steps_per_second": 4.25 | |
| } | |
| ], | |
| "logging_steps": 32, | |
| "max_steps": 10665, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.821442357117952e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |