| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.626957170077044, | |
| "eval_steps": 5000, | |
| "global_step": 20000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0003313727114572115, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 0.0, | |
| "loss": 4.1855, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.016568635572860577, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 9.017298490982701e-07, | |
| "loss": 4.4012, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03313727114572115, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1.8218623481781377e-06, | |
| "loss": 4.3839, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04970590671858172, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.741994847258005e-06, | |
| "loss": 4.345, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0662745422914423, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.662127346337873e-06, | |
| "loss": 4.2973, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08284317786430287, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 4.582259845417741e-06, | |
| "loss": 4.224, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.09941181343716345, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 5.502392344497608e-06, | |
| "loss": 4.144, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.11598044901002402, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 6.422524843577475e-06, | |
| "loss": 4.0929, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1325490845828846, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 7.342657342657343e-06, | |
| "loss": 4.0516, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.14911772015574518, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 8.26278984173721e-06, | |
| "loss": 4.025, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.16568635572860574, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 9.182922340817078e-06, | |
| "loss": 4.0113, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.18225499130146633, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.0103054839896946e-05, | |
| "loss": 4.0015, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1988236268743269, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.1023187338976813e-05, | |
| "loss": 3.9936, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.21539226244718748, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 1.1943319838056682e-05, | |
| "loss": 3.9778, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.23196089802004805, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.2863452337136547e-05, | |
| "loss": 3.971, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.24852953359290864, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.3783584836216415e-05, | |
| "loss": 3.9664, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2650981691657692, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 1.4703717335296282e-05, | |
| "loss": 3.9561, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.28166680473862976, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.562384983437615e-05, | |
| "loss": 3.9459, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.29823544031149035, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.6543982333456018e-05, | |
| "loss": 3.9438, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.31480407588435094, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.7464114832535886e-05, | |
| "loss": 3.9386, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3313727114572115, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.8384247331615755e-05, | |
| "loss": 3.9285, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.34794134703007207, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.930437983069562e-05, | |
| "loss": 3.9211, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.36450998260293266, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 2.022451232977549e-05, | |
| "loss": 3.9216, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.38107861817579325, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 2.1144644828855357e-05, | |
| "loss": 3.9118, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3976472537486538, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 2.2064777327935222e-05, | |
| "loss": 3.9059, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.4142158893215144, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 2.298490982701509e-05, | |
| "loss": 3.9019, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.43078452489437496, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 2.390504232609496e-05, | |
| "loss": 3.9035, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4473531604672355, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 2.4825174825174828e-05, | |
| "loss": 3.8937, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4639217960400961, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 2.5745307324254693e-05, | |
| "loss": 3.881, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4804904316129567, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 2.666543982333456e-05, | |
| "loss": 3.8808, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.49705906718581727, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 2.7585572322414427e-05, | |
| "loss": 3.8782, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5136277027586779, | |
| "grad_norm": 1.0, | |
| "learning_rate": 2.8505704821494296e-05, | |
| "loss": 3.8817, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5301963383315385, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 2.942583732057416e-05, | |
| "loss": 3.8718, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5467649739043989, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 3.034596981965403e-05, | |
| "loss": 3.8675, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5633336094772595, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 3.12661023187339e-05, | |
| "loss": 3.8699, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5799022450501201, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 3.2186234817813766e-05, | |
| "loss": 3.8645, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5964708806229807, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 3.3106367316893635e-05, | |
| "loss": 3.8601, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6130395161958413, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 3.4026499815973504e-05, | |
| "loss": 3.8513, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6296081517687019, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 3.4946632315053365e-05, | |
| "loss": 3.8583, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6461767873415625, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 3.5866764814133234e-05, | |
| "loss": 3.857, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.662745422914423, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 3.67868973132131e-05, | |
| "loss": 3.8488, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6793140584872835, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 3.770702981229297e-05, | |
| "loss": 3.843, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6958826940601441, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 3.862716231137284e-05, | |
| "loss": 3.8413, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7124513296330047, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 3.954729481045271e-05, | |
| "loss": 3.836, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.7290199652058653, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 4.046742730953258e-05, | |
| "loss": 3.8366, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7455886007787259, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 4.138755980861244e-05, | |
| "loss": 3.8347, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7621572363515865, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 4.230769230769231e-05, | |
| "loss": 3.8299, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.778725871924447, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 4.3227824806772176e-05, | |
| "loss": 3.8187, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7952945074973076, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 4.4147957305852044e-05, | |
| "loss": 3.8225, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8118631430701682, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 4.506808980493191e-05, | |
| "loss": 3.8229, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8284317786430287, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 4.598822230401178e-05, | |
| "loss": 3.8245, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8450004142158893, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 4.690835480309165e-05, | |
| "loss": 3.8109, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8615690497887499, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 4.782848730217152e-05, | |
| "loss": 3.8195, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8781376853616105, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 4.874861980125138e-05, | |
| "loss": 3.8125, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.894706320934471, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 4.966875230033125e-05, | |
| "loss": 3.8131, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9112749565073316, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 4.999998362078322e-05, | |
| "loss": 3.819, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9278435920801922, | |
| "grad_norm": 0.75, | |
| "learning_rate": 4.999989244747393e-05, | |
| "loss": 3.8082, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9444122276530528, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 4.9999721297876855e-05, | |
| "loss": 3.8146, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9609808632259134, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 4.999947017253951e-05, | |
| "loss": 3.8042, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.977549498798774, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 4.9999139072265274e-05, | |
| "loss": 3.8072, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9941181343716345, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 4.9998727998113335e-05, | |
| "loss": 3.8008, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0106039267666307, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 4.999823695139877e-05, | |
| "loss": 3.7924, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.0271725623394914, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.999766593369246e-05, | |
| "loss": 3.7963, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0437411979123519, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.999701494682112e-05, | |
| "loss": 3.7837, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.0603098334852126, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.999628399286731e-05, | |
| "loss": 3.7942, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.076878469058073, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.99954730741694e-05, | |
| "loss": 3.7819, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.0934471046309335, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.999458219332157e-05, | |
| "loss": 3.7868, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1100157402037942, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 4.9993611353173794e-05, | |
| "loss": 3.7924, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.1265843757766547, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 4.999256055683187e-05, | |
| "loss": 3.7884, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1431530113495154, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 4.999142980765736e-05, | |
| "loss": 3.7875, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.159721646922376, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.9990219109267596e-05, | |
| "loss": 3.7827, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1762902824952366, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 4.9988928465535686e-05, | |
| "loss": 3.7832, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.192858918068097, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.9987557880590486e-05, | |
| "loss": 3.7854, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2094275536409578, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.998610735881659e-05, | |
| "loss": 3.7765, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.2259961892138183, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 4.99845769048543e-05, | |
| "loss": 3.7835, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2425648247866787, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.998296652359965e-05, | |
| "loss": 3.7809, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.2591334603595394, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 4.9981276220204344e-05, | |
| "loss": 3.7849, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.2757020959324, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.997950600007578e-05, | |
| "loss": 3.7815, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.2922707315052606, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 4.997765586887702e-05, | |
| "loss": 3.7793, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.308839367078121, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.997572583252672e-05, | |
| "loss": 3.7729, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.3254080026509816, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 4.9973715897199226e-05, | |
| "loss": 3.7745, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3419766382238423, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.9971626069324435e-05, | |
| "loss": 3.7688, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.358545273796703, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 4.996945635558785e-05, | |
| "loss": 3.7748, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3751139093695635, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 4.996720676293052e-05, | |
| "loss": 3.7686, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.391682544942424, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 4.9964877298549045e-05, | |
| "loss": 3.7736, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4082511805152846, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.9962467969895535e-05, | |
| "loss": 3.7751, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.4248198160881451, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 4.995997878467758e-05, | |
| "loss": 3.7673, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4413884516610058, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 4.995740975085825e-05, | |
| "loss": 3.7742, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.4579570872338663, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 4.9954760876656056e-05, | |
| "loss": 3.7737, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4745257228067268, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 4.995203217054493e-05, | |
| "loss": 3.7704, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.4910943583795875, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 4.9949223641254156e-05, | |
| "loss": 3.7693, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.507662993952448, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.994633529776842e-05, | |
| "loss": 3.76, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.5242316295253087, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 4.994336714932771e-05, | |
| "loss": 3.7617, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.5408002650981691, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 4.9940319205427335e-05, | |
| "loss": 3.7737, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.5573689006710296, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 4.993719147581787e-05, | |
| "loss": 3.7699, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.5739375362438903, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.9933983970505116e-05, | |
| "loss": 3.7665, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.590506171816751, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.9930696699750095e-05, | |
| "loss": 3.7622, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6070748073896115, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 4.992732967406901e-05, | |
| "loss": 3.7572, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.623643442962472, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.992388290423318e-05, | |
| "loss": 3.7626, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.6402120785353325, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.9920356401269055e-05, | |
| "loss": 3.7626, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.6567807141081932, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 4.991675017645815e-05, | |
| "loss": 3.7626, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.6567807141081932, | |
| "eval_loss": 3.72454833984375, | |
| "eval_runtime": 7.9243, | |
| "eval_samples_per_second": 122.282, | |
| "eval_steps_per_second": 2.019, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.6733493496810539, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 4.991306424133701e-05, | |
| "loss": 3.762, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.6899179852539143, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.990929860769719e-05, | |
| "loss": 3.7576, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7064866208267748, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 4.990545328758518e-05, | |
| "loss": 3.7624, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.7230552563996355, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 4.990152829330243e-05, | |
| "loss": 3.757, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.7396238919724962, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 4.989752363740524e-05, | |
| "loss": 3.7655, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.7561925275453567, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.989343933270477e-05, | |
| "loss": 3.7575, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.7727611631182172, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.9889275392266984e-05, | |
| "loss": 3.7618, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.7893297986910777, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.988503182941259e-05, | |
| "loss": 3.7561, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.8058984342639384, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.988070865771702e-05, | |
| "loss": 3.7645, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.822467069836799, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 4.9876305891010385e-05, | |
| "loss": 3.7571, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.8390357054096595, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.987182354337744e-05, | |
| "loss": 3.7565, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.85560434098252, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 4.986726162915748e-05, | |
| "loss": 3.7604, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.8721729765553805, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 4.9862620162944386e-05, | |
| "loss": 3.7497, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.8887416121282412, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 4.9857899159586496e-05, | |
| "loss": 3.7498, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.905310247701102, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 4.9853098634186625e-05, | |
| "loss": 3.7488, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.9218788832739624, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 4.984821860210196e-05, | |
| "loss": 3.7517, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.9384475188468229, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.984325907894404e-05, | |
| "loss": 3.7454, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.9550161544196836, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 4.98382200805787e-05, | |
| "loss": 3.7446, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.9715847899925443, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 4.9833101623126034e-05, | |
| "loss": 3.751, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.9881534255654048, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.982790372296031e-05, | |
| "loss": 3.7485, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.004639217960401, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 4.9822626396709965e-05, | |
| "loss": 3.7467, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.0212078535332614, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 4.98172696612575e-05, | |
| "loss": 3.7434, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.0377764891061223, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.981183353373946e-05, | |
| "loss": 3.7451, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.054345124678983, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.980631803154638e-05, | |
| "loss": 3.7338, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.0709137602518433, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 4.98007231723227e-05, | |
| "loss": 3.7348, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.0874823958247037, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 4.979504897396675e-05, | |
| "loss": 3.7295, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.104051031397564, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.978929545463066e-05, | |
| "loss": 3.7357, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.120619666970425, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.97834626327203e-05, | |
| "loss": 3.7387, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.1371883025432856, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 4.9777550526895265e-05, | |
| "loss": 3.7359, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.153756938116146, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.977155915606877e-05, | |
| "loss": 3.7359, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.1703255736890066, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.9765488539407586e-05, | |
| "loss": 3.7333, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.186894209261867, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.975933869633202e-05, | |
| "loss": 3.7296, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.203462844834728, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.9753109646515814e-05, | |
| "loss": 3.7297, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.2200314804075885, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 4.974680140988612e-05, | |
| "loss": 3.7334, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.236600115980449, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 4.974041400662338e-05, | |
| "loss": 3.7342, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.2531687515533094, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.973394745716133e-05, | |
| "loss": 3.726, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.2697373871261703, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 4.972740178218688e-05, | |
| "loss": 3.7359, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.286306022699031, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 4.972077700264007e-05, | |
| "loss": 3.7298, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.3028746582718913, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 4.9714073139714004e-05, | |
| "loss": 3.731, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.319443293844752, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 4.970729021485476e-05, | |
| "loss": 3.7268, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.3360119294176123, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.9700428249761386e-05, | |
| "loss": 3.7249, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.352580564990473, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.969348726638574e-05, | |
| "loss": 3.7297, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.3691492005633337, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 4.968646728693248e-05, | |
| "loss": 3.728, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.385717836136194, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.967936833385898e-05, | |
| "loss": 3.7298, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.4022864717090546, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.9672190429875266e-05, | |
| "loss": 3.7228, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.4188551072819156, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.96649335979439e-05, | |
| "loss": 3.7254, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.435423742854776, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.9657597861279976e-05, | |
| "loss": 3.7284, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.4519923784276365, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 4.965018324335099e-05, | |
| "loss": 3.7213, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.468561014000497, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 4.964268976787679e-05, | |
| "loss": 3.7224, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.4851296495733575, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.9635117458829496e-05, | |
| "loss": 3.7222, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.501698285146218, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 4.962746634043341e-05, | |
| "loss": 3.7286, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.518266920719079, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 4.961973643716497e-05, | |
| "loss": 3.7287, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.5348355562919394, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 4.961192777375263e-05, | |
| "loss": 3.7208, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.5514041918648, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 4.9604040375176816e-05, | |
| "loss": 3.7305, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.5679728274376608, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.9596074266669844e-05, | |
| "loss": 3.7171, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.5845414630105212, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.95880294737158e-05, | |
| "loss": 3.721, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.6011100985833817, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 4.9579906022050517e-05, | |
| "loss": 3.7225, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.617678734156242, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 4.957170393766143e-05, | |
| "loss": 3.7215, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.6342473697291027, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 4.956342324678755e-05, | |
| "loss": 3.7237, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.650816005301963, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.9555063975919345e-05, | |
| "loss": 3.7189, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.667384640874824, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 4.954662615179868e-05, | |
| "loss": 3.7209, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.6839532764476846, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.953810980141869e-05, | |
| "loss": 3.7267, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.700521912020545, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.952951495202374e-05, | |
| "loss": 3.72, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.717090547593406, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.9520841631109315e-05, | |
| "loss": 3.7168, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.7336591831662664, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.951208986642194e-05, | |
| "loss": 3.7167, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.750227818739127, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.9503259685959074e-05, | |
| "loss": 3.7192, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.7667964543119874, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 4.949435111796905e-05, | |
| "loss": 3.7162, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.783365089884848, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.948536419095095e-05, | |
| "loss": 3.7161, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.7999337254577084, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.947629893365453e-05, | |
| "loss": 3.718, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.8165023610305693, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 4.9467155375080165e-05, | |
| "loss": 3.7152, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.8330709966034298, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 4.9457933544478684e-05, | |
| "loss": 3.7237, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.8496396321762902, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.944863347135132e-05, | |
| "loss": 3.7092, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.8662082677491507, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.943925518544962e-05, | |
| "loss": 3.7174, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.8827769033220116, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 4.942979871677532e-05, | |
| "loss": 3.7162, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.899345538894872, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.94202640955803e-05, | |
| "loss": 3.7152, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.9159141744677326, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.9410651352366435e-05, | |
| "loss": 3.7181, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.932482810040593, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 4.94009605178855e-05, | |
| "loss": 3.7177, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.9490514456134536, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 4.939119162313912e-05, | |
| "loss": 3.7216, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.965620081186314, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.9381344699378626e-05, | |
| "loss": 3.712, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.982188716759175, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 4.937141977810497e-05, | |
| "loss": 3.7199, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.9987573523320354, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 4.936141689106861e-05, | |
| "loss": 3.7058, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 3.0152431447270316, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 4.935133607026945e-05, | |
| "loss": 3.6969, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.0318117802998925, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 4.934117734795669e-05, | |
| "loss": 3.706, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 3.048380415872753, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 4.933094075662874e-05, | |
| "loss": 3.7006, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.0649490514456135, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 4.9320626329033134e-05, | |
| "loss": 3.703, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 3.081517687018474, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.9310234098166396e-05, | |
| "loss": 3.7084, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.0980863225913344, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 4.929976409727395e-05, | |
| "loss": 3.6991, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 3.1146549581641954, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 4.928921635985001e-05, | |
| "loss": 3.7057, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.131223593737056, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 4.9278590919637466e-05, | |
| "loss": 3.703, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 3.1477922293099163, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.9267887810627824e-05, | |
| "loss": 3.7103, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.164360864882777, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.9257107067061e-05, | |
| "loss": 3.7017, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 3.1809295004556373, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 4.924624872342531e-05, | |
| "loss": 3.6937, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.197498136028498, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 4.92353128144573e-05, | |
| "loss": 3.7063, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 3.2140667716013587, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.9224299375141656e-05, | |
| "loss": 3.6978, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.230635407174219, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 4.921320844071109e-05, | |
| "loss": 3.705, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 3.2472040427470796, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.920204004664624e-05, | |
| "loss": 3.704, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.26377267831994, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 4.91907942286755e-05, | |
| "loss": 3.703, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 3.280341313892801, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 4.917947102277499e-05, | |
| "loss": 3.7027, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.2969099494656615, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.916807046516838e-05, | |
| "loss": 3.6977, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 3.313478585038522, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.9156592592326814e-05, | |
| "loss": 3.6968, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.313478585038522, | |
| "eval_loss": 3.6868932247161865, | |
| "eval_runtime": 8.062, | |
| "eval_samples_per_second": 120.194, | |
| "eval_steps_per_second": 1.985, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.3300472206113825, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 4.9145037440968746e-05, | |
| "loss": 3.7025, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 3.3466158561842434, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 4.913340504805984e-05, | |
| "loss": 3.7034, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.363184491757104, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.912169545081292e-05, | |
| "loss": 3.7025, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 3.3797531273299644, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.910990868668772e-05, | |
| "loss": 3.6956, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.396321762902825, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 4.9098044793390876e-05, | |
| "loss": 3.7025, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.4128903984756853, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.908610380887576e-05, | |
| "loss": 3.6915, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.4294590340485462, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 4.9074085771342365e-05, | |
| "loss": 3.6987, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.4460276696214067, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 4.9061990719237175e-05, | |
| "loss": 3.6943, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.462596305194267, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 4.904981869125303e-05, | |
| "loss": 3.6961, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.4791649407671277, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 4.903756972632908e-05, | |
| "loss": 3.6985, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.4957335763399886, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 4.9025243863650545e-05, | |
| "loss": 3.7024, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.512302211912849, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.901284114264866e-05, | |
| "loss": 3.6989, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.5288708474857096, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.900036160300053e-05, | |
| "loss": 3.7015, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.54543948305857, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.898780528462904e-05, | |
| "loss": 3.6958, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.5620081186314305, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 4.8975172227702665e-05, | |
| "loss": 3.6988, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.578576754204291, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.896246247263537e-05, | |
| "loss": 3.6926, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.595145389777152, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.89496760600865e-05, | |
| "loss": 3.6903, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.6117140253500124, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 4.893681303096062e-05, | |
| "loss": 3.6945, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.628282660922873, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 4.892387342640739e-05, | |
| "loss": 3.6977, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.644851296495734, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.8910857287821465e-05, | |
| "loss": 3.6882, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.6614199320685943, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 4.889776465684232e-05, | |
| "loss": 3.6968, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.6779885676414548, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.888459557535413e-05, | |
| "loss": 3.6937, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.6945572032143152, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.8871350085485646e-05, | |
| "loss": 3.7016, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.7111258387871757, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 4.8858028229610055e-05, | |
| "loss": 3.6983, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.727694474360036, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.884463005034485e-05, | |
| "loss": 3.6995, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.744263109932897, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.8831155590551674e-05, | |
| "loss": 3.6978, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.7608317455057576, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.881760489333621e-05, | |
| "loss": 3.6981, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.777400381078618, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.880397800204801e-05, | |
| "loss": 3.6884, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.793969016651479, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.879027496028041e-05, | |
| "loss": 3.6957, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.8105376522243395, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 4.8776495811870315e-05, | |
| "loss": 3.6933, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.8271062877972, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.876264060089813e-05, | |
| "loss": 3.6857, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.8436749233700604, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.874870937168757e-05, | |
| "loss": 3.6998, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.860243558942921, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.873470216880557e-05, | |
| "loss": 3.6876, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.8768121945157814, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.8720619037062056e-05, | |
| "loss": 3.6933, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.8933808300886423, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 4.87064600215099e-05, | |
| "loss": 3.6993, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.909949465661503, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.869222516744473e-05, | |
| "loss": 3.6934, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.9265181012343633, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 4.867791452040475e-05, | |
| "loss": 3.6912, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.9430867368072238, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.866352812617066e-05, | |
| "loss": 3.6898, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.9596553723800847, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 4.864906603076549e-05, | |
| "loss": 3.6901, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.976224007952945, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.8634528280454404e-05, | |
| "loss": 3.6925, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.9927926435258057, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.861991492174464e-05, | |
| "loss": 3.6884, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 4.009278435920802, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 4.8605226001385256e-05, | |
| "loss": 3.685, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 4.025847071493662, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 4.85904615663671e-05, | |
| "loss": 3.6815, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 4.042415707066523, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 4.857562166392253e-05, | |
| "loss": 3.6793, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 4.058984342639383, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 4.8560706341525386e-05, | |
| "loss": 3.6799, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 4.075552978212245, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 4.854571564689075e-05, | |
| "loss": 3.6809, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 4.092121613785105, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 4.853064962797483e-05, | |
| "loss": 3.6788, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 4.108690249357966, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 4.851550833297481e-05, | |
| "loss": 3.6869, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 4.125258884930826, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 4.850029181032869e-05, | |
| "loss": 3.6864, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 4.1418275205036865, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 4.8485000108715094e-05, | |
| "loss": 3.6845, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.158396156076547, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 4.846963327705319e-05, | |
| "loss": 3.6772, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 4.1749647916494075, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 4.845419136450248e-05, | |
| "loss": 3.691, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 4.191533427222268, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 4.843867442046264e-05, | |
| "loss": 3.6889, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 4.208102062795128, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 4.842308249457339e-05, | |
| "loss": 3.6835, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 4.22467069836799, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 4.8407415636714315e-05, | |
| "loss": 3.6833, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 4.24123933394085, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 4.839167389700472e-05, | |
| "loss": 3.6782, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 4.257807969513711, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 4.837585732580345e-05, | |
| "loss": 3.6887, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 4.274376605086571, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 4.835996597370877e-05, | |
| "loss": 3.6855, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 4.290945240659432, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 4.8343999891558136e-05, | |
| "loss": 3.6828, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 4.307513876232292, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 4.832795913042809e-05, | |
| "loss": 3.6815, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.324082511805153, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 4.831184374163407e-05, | |
| "loss": 3.6836, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 4.340651147378013, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 4.829565377673026e-05, | |
| "loss": 3.6919, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 4.357219782950874, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 4.827938928750941e-05, | |
| "loss": 3.6815, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 4.373788418523734, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 4.82630503260027e-05, | |
| "loss": 3.6834, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 4.3903570540965955, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 4.824663694447952e-05, | |
| "loss": 3.6834, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 4.406925689669456, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 4.823014919544734e-05, | |
| "loss": 3.6814, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 4.4234943252423164, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 4.8213587131651566e-05, | |
| "loss": 3.6849, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 4.440062960815177, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 4.819695080607531e-05, | |
| "loss": 3.6795, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 4.456631596388037, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 4.8180240271939256e-05, | |
| "loss": 3.6823, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 4.473200231960898, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 4.8163455582701514e-05, | |
| "loss": 3.6798, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.489768867533758, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 4.8146596792057385e-05, | |
| "loss": 3.6811, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 4.506337503106619, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 4.812966395393924e-05, | |
| "loss": 3.6858, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 4.522906138679479, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 4.811265712251635e-05, | |
| "loss": 3.6776, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 4.539474774252341, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 4.809557635219465e-05, | |
| "loss": 3.6812, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.556043409825201, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 4.807842169761667e-05, | |
| "loss": 3.6772, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.572612045398062, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 4.806119321366124e-05, | |
| "loss": 3.6846, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.589180680970922, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 4.8043890955443436e-05, | |
| "loss": 3.6826, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.605749316543783, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 4.802651497831428e-05, | |
| "loss": 3.6789, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.622317952116643, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 4.8009065337860685e-05, | |
| "loss": 3.6776, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.638886587689504, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 4.799154208990518e-05, | |
| "loss": 3.6736, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.655455223262364, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 4.7973945290505766e-05, | |
| "loss": 3.6817, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.6720238588352245, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 4.7956274995955775e-05, | |
| "loss": 3.6843, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.688592494408086, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 4.793853126278361e-05, | |
| "loss": 3.6777, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.705161129980946, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 4.792071414775265e-05, | |
| "loss": 3.6788, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.721729765553807, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 4.7902823707861e-05, | |
| "loss": 3.6818, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.738298401126667, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 4.7884860000341344e-05, | |
| "loss": 3.6844, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.754867036699528, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 4.786682308266076e-05, | |
| "loss": 3.68, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.771435672272388, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 4.784871301252052e-05, | |
| "loss": 3.6782, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.788004307845249, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 4.783052984785593e-05, | |
| "loss": 3.6743, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.804572943418109, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 4.781227364683611e-05, | |
| "loss": 3.6864, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.82114157899097, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 4.7793944467863836e-05, | |
| "loss": 3.6766, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.837710214563831, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 4.777554236957537e-05, | |
| "loss": 3.676, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.854278850136692, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 4.775706741084019e-05, | |
| "loss": 3.6834, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.870847485709552, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 4.7738519650760935e-05, | |
| "loss": 3.679, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.8874161212824125, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 4.7719899148673074e-05, | |
| "loss": 3.6792, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.903984756855273, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 4.770120596414481e-05, | |
| "loss": 3.6783, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.9205533924281335, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 4.768244015697687e-05, | |
| "loss": 3.6849, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.937122028000994, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 4.7663601787202285e-05, | |
| "loss": 3.6749, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.9536906635738545, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 4.7644690915086245e-05, | |
| "loss": 3.6825, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.970259299146715, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 4.7625707601125845e-05, | |
| "loss": 3.6747, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.970259299146715, | |
| "eval_loss": 3.661933422088623, | |
| "eval_runtime": 7.9724, | |
| "eval_samples_per_second": 121.545, | |
| "eval_steps_per_second": 2.007, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.986827934719576, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 4.760665190604996e-05, | |
| "loss": 3.6823, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 5.0033137271145725, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 4.758752389081901e-05, | |
| "loss": 3.6759, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 5.019882362687433, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 4.756832361662473e-05, | |
| "loss": 3.6767, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 5.036450998260293, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 4.754905114489009e-05, | |
| "loss": 3.6778, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 5.053019633833154, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 4.752970653726896e-05, | |
| "loss": 3.6676, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 5.069588269406014, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.751028985564602e-05, | |
| "loss": 3.6659, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 5.086156904978875, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 4.7490801162136484e-05, | |
| "loss": 3.6648, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 5.102725540551735, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.7471240519085964e-05, | |
| "loss": 3.6797, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 5.119294176124596, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 4.745160798907023e-05, | |
| "loss": 3.6685, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 5.135862811697456, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 4.7431903634895034e-05, | |
| "loss": 3.6657, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 5.152431447270318, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 4.741212751959589e-05, | |
| "loss": 3.6699, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 5.169000082843178, | |
| "grad_norm": 0.466796875, | |
| "learning_rate": 4.739227970643787e-05, | |
| "loss": 3.6705, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 5.185568718416039, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 4.737236025891544e-05, | |
| "loss": 3.6692, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 5.202137353988899, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 4.73523692407522e-05, | |
| "loss": 3.6632, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 5.21870598956176, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 4.733230671590074e-05, | |
| "loss": 3.6764, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 5.23527462513462, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 4.731217274854236e-05, | |
| "loss": 3.6778, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 5.2518432607074805, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 4.729196740308696e-05, | |
| "loss": 3.6771, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 5.268411896280341, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 4.7271690744172745e-05, | |
| "loss": 3.6752, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 5.2849805318532015, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 4.725134283666607e-05, | |
| "loss": 3.6766, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 5.301549167426063, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 4.723092374566123e-05, | |
| "loss": 3.6753, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 5.318117802998923, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.721043353648021e-05, | |
| "loss": 3.6728, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 5.334686438571784, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 4.718987227467256e-05, | |
| "loss": 3.6722, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 5.351255074144644, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 4.716924002601507e-05, | |
| "loss": 3.6674, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 5.367823709717505, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.7148536856511655e-05, | |
| "loss": 3.6766, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 5.384392345290365, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 4.712776283239311e-05, | |
| "loss": 3.6643, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 5.400960980863226, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 4.71069180201169e-05, | |
| "loss": 3.6655, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 5.417529616436086, | |
| "grad_norm": 0.5, | |
| "learning_rate": 4.708600248636693e-05, | |
| "loss": 3.6734, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 5.434098252008947, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 4.7065016298053365e-05, | |
| "loss": 3.6684, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 5.450666887581807, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 4.7043959522312384e-05, | |
| "loss": 3.6661, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 5.4672355231546685, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 4.7022832226505996e-05, | |
| "loss": 3.6706, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 5.483804158727529, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.700163447822179e-05, | |
| "loss": 3.6701, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 5.5003727943003895, | |
| "grad_norm": 0.5, | |
| "learning_rate": 4.6980366345272756e-05, | |
| "loss": 3.6724, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 5.51694142987325, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 4.6959027895697026e-05, | |
| "loss": 3.6691, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 5.5335100654461105, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 4.693761919775771e-05, | |
| "loss": 3.6729, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 5.550078701018971, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 4.691614031994261e-05, | |
| "loss": 3.6766, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 5.566647336591831, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 4.6894591330964076e-05, | |
| "loss": 3.6744, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 5.583215972164692, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 4.6872972299758726e-05, | |
| "loss": 3.6675, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 5.599784607737552, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 4.685128329548725e-05, | |
| "loss": 3.6669, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 5.616353243310414, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 4.6829524387534184e-05, | |
| "loss": 3.6688, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 5.632921878883274, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 4.6807695645507704e-05, | |
| "loss": 3.6708, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 5.649490514456135, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 4.6785797139239376e-05, | |
| "loss": 3.669, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 5.666059150028995, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 4.676382893878395e-05, | |
| "loss": 3.6652, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 5.682627785601856, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 4.674179111441913e-05, | |
| "loss": 3.6658, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.699196421174716, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 4.671968373664536e-05, | |
| "loss": 3.6668, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.715765056747577, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 4.6697506876185586e-05, | |
| "loss": 3.6721, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.732333692320437, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 4.667526060398503e-05, | |
| "loss": 3.6696, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.748902327893298, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 4.665294499121097e-05, | |
| "loss": 3.6714, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.765470963466159, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 4.6630560109252496e-05, | |
| "loss": 3.665, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.782039599039019, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.660810602972032e-05, | |
| "loss": 3.6705, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.79860823461188, | |
| "grad_norm": 0.466796875, | |
| "learning_rate": 4.658558282444651e-05, | |
| "loss": 3.6658, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.81517687018474, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 4.6562990565484265e-05, | |
| "loss": 3.6624, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.831745505757601, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 4.654032932510769e-05, | |
| "loss": 3.6718, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.848314141330461, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 4.651759917581159e-05, | |
| "loss": 3.6682, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.864882776903322, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 4.649480019031117e-05, | |
| "loss": 3.6631, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.881451412476182, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 4.647193244154189e-05, | |
| "loss": 3.6697, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.898020048049043, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 4.644899600265917e-05, | |
| "loss": 3.6689, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.914588683621904, | |
| "grad_norm": 0.5, | |
| "learning_rate": 4.6425990947038175e-05, | |
| "loss": 3.6726, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.931157319194765, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 4.640291734827357e-05, | |
| "loss": 3.6715, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.947725954767625, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 4.637977528017931e-05, | |
| "loss": 3.67, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.964294590340486, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 4.63565648167884e-05, | |
| "loss": 3.6715, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.980863225913346, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 4.6333286032352594e-05, | |
| "loss": 3.6649, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.9974318614862066, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 4.630993900134227e-05, | |
| "loss": 3.6725, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 6.013917653881203, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.628652379844609e-05, | |
| "loss": 3.6574, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 6.030486289454063, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.626304049857081e-05, | |
| "loss": 3.6601, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 6.047054925026924, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 4.623948917684106e-05, | |
| "loss": 3.67, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 6.063623560599785, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.621586990859904e-05, | |
| "loss": 3.6611, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 6.0801921961726455, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.6192182769404335e-05, | |
| "loss": 3.6608, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 6.096760831745506, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.616842783503365e-05, | |
| "loss": 3.6631, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 6.1133294673183665, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.614460518148057e-05, | |
| "loss": 3.657, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 6.129898102891227, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 4.6120714884955316e-05, | |
| "loss": 3.6635, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 6.146466738464087, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.609675702188452e-05, | |
| "loss": 3.6595, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 6.163035374036948, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.6072731668910926e-05, | |
| "loss": 3.6676, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 6.179604009609808, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.604863890289322e-05, | |
| "loss": 3.6611, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 6.196172645182669, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.6024478800905735e-05, | |
| "loss": 3.6679, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 6.21274128075553, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.600025144023822e-05, | |
| "loss": 3.6612, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 6.229309916328391, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.597595689839556e-05, | |
| "loss": 3.6664, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 6.245878551901251, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.59515952530976e-05, | |
| "loss": 3.6644, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 6.262447187474112, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.592716658227884e-05, | |
| "loss": 3.6656, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 6.279015823046972, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.5902670964088165e-05, | |
| "loss": 3.6643, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 6.295584458619833, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.587810847688868e-05, | |
| "loss": 3.6589, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 6.312153094192693, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.585347919925737e-05, | |
| "loss": 3.6601, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 6.328721729765554, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.582878320998491e-05, | |
| "loss": 3.662, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 6.345290365338414, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.580402058807539e-05, | |
| "loss": 3.6678, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 6.3618590009112745, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 4.577919141274604e-05, | |
| "loss": 3.6652, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 6.378427636484136, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.575429576342701e-05, | |
| "loss": 3.6591, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 6.394996272056996, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 4.5729333719761124e-05, | |
| "loss": 3.6592, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 6.411564907629857, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.5704305361603585e-05, | |
| "loss": 3.66, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 6.428133543202717, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.567921076902173e-05, | |
| "loss": 3.6641, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 6.444702178775578, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.5654050022294805e-05, | |
| "loss": 3.6676, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 6.461270814348438, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 4.562882320191368e-05, | |
| "loss": 3.6594, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 6.477839449921299, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.560353038858062e-05, | |
| "loss": 3.6554, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 6.494408085494159, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.557817166320896e-05, | |
| "loss": 3.668, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 6.51097672106702, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.5552747106922924e-05, | |
| "loss": 3.6629, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 6.52754535663988, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 4.552725680105733e-05, | |
| "loss": 3.666, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 6.544113992212742, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 4.550170082715733e-05, | |
| "loss": 3.6651, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 6.560682627785602, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 4.5476079266978164e-05, | |
| "loss": 3.6653, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 6.5772512633584626, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 4.545039220248486e-05, | |
| "loss": 3.6644, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 6.593819898931323, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 4.542463971585201e-05, | |
| "loss": 3.6624, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 6.6103885345041835, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 4.539882188946352e-05, | |
| "loss": 3.6577, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 6.626957170077044, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.537293880591229e-05, | |
| "loss": 3.6674, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 6.626957170077044, | |
| "eval_loss": 3.6582796573638916, | |
| "eval_runtime": 7.9393, | |
| "eval_samples_per_second": 122.05, | |
| "eval_steps_per_second": 2.015, | |
| "step": 20000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 90540, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 30, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.903523641762932e+19, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |