{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.626957170077044, "eval_steps": 5000, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003313727114572115, "grad_norm": 2.828125, "learning_rate": 0.0, "loss": 4.1855, "step": 1 }, { "epoch": 0.016568635572860577, "grad_norm": 3.234375, "learning_rate": 9.017298490982701e-07, "loss": 4.4012, "step": 50 }, { "epoch": 0.03313727114572115, "grad_norm": 2.984375, "learning_rate": 1.8218623481781377e-06, "loss": 4.3839, "step": 100 }, { "epoch": 0.04970590671858172, "grad_norm": 2.265625, "learning_rate": 2.741994847258005e-06, "loss": 4.345, "step": 150 }, { "epoch": 0.0662745422914423, "grad_norm": 2.171875, "learning_rate": 3.662127346337873e-06, "loss": 4.2973, "step": 200 }, { "epoch": 0.08284317786430287, "grad_norm": 1.5234375, "learning_rate": 4.582259845417741e-06, "loss": 4.224, "step": 250 }, { "epoch": 0.09941181343716345, "grad_norm": 1.0234375, "learning_rate": 5.502392344497608e-06, "loss": 4.144, "step": 300 }, { "epoch": 0.11598044901002402, "grad_norm": 1.0390625, "learning_rate": 6.422524843577475e-06, "loss": 4.0929, "step": 350 }, { "epoch": 0.1325490845828846, "grad_norm": 1.1796875, "learning_rate": 7.342657342657343e-06, "loss": 4.0516, "step": 400 }, { "epoch": 0.14911772015574518, "grad_norm": 1.390625, "learning_rate": 8.26278984173721e-06, "loss": 4.025, "step": 450 }, { "epoch": 0.16568635572860574, "grad_norm": 1.515625, "learning_rate": 9.182922340817078e-06, "loss": 4.0113, "step": 500 }, { "epoch": 0.18225499130146633, "grad_norm": 1.4765625, "learning_rate": 1.0103054839896946e-05, "loss": 4.0015, "step": 550 }, { "epoch": 0.1988236268743269, "grad_norm": 1.40625, "learning_rate": 1.1023187338976813e-05, "loss": 3.9936, "step": 600 }, { "epoch": 0.21539226244718748, "grad_norm": 1.234375, "learning_rate": 1.1943319838056682e-05, "loss": 3.9778, "step": 650 }, { "epoch": 0.23196089802004805, "grad_norm": 1.3671875, "learning_rate": 1.2863452337136547e-05, "loss": 3.971, "step": 700 }, { "epoch": 0.24852953359290864, "grad_norm": 1.2265625, "learning_rate": 1.3783584836216415e-05, "loss": 3.9664, "step": 750 }, { "epoch": 0.2650981691657692, "grad_norm": 1.2109375, "learning_rate": 1.4703717335296282e-05, "loss": 3.9561, "step": 800 }, { "epoch": 0.28166680473862976, "grad_norm": 1.0859375, "learning_rate": 1.562384983437615e-05, "loss": 3.9459, "step": 850 }, { "epoch": 0.29823544031149035, "grad_norm": 1.0, "learning_rate": 1.6543982333456018e-05, "loss": 3.9438, "step": 900 }, { "epoch": 0.31480407588435094, "grad_norm": 1.0390625, "learning_rate": 1.7464114832535886e-05, "loss": 3.9386, "step": 950 }, { "epoch": 0.3313727114572115, "grad_norm": 1.0625, "learning_rate": 1.8384247331615755e-05, "loss": 3.9285, "step": 1000 }, { "epoch": 0.34794134703007207, "grad_norm": 1.1328125, "learning_rate": 1.930437983069562e-05, "loss": 3.9211, "step": 1050 }, { "epoch": 0.36450998260293266, "grad_norm": 1.0859375, "learning_rate": 2.022451232977549e-05, "loss": 3.9216, "step": 1100 }, { "epoch": 0.38107861817579325, "grad_norm": 0.9921875, "learning_rate": 2.1144644828855357e-05, "loss": 3.9118, "step": 1150 }, { "epoch": 0.3976472537486538, "grad_norm": 1.0078125, "learning_rate": 2.2064777327935222e-05, "loss": 3.9059, "step": 1200 }, { "epoch": 0.4142158893215144, "grad_norm": 1.03125, "learning_rate": 2.298490982701509e-05, "loss": 3.9019, "step": 1250 }, { "epoch": 
0.43078452489437496, "grad_norm": 1.0234375, "learning_rate": 2.390504232609496e-05, "loss": 3.9035, "step": 1300 }, { "epoch": 0.4473531604672355, "grad_norm": 0.8671875, "learning_rate": 2.4825174825174828e-05, "loss": 3.8937, "step": 1350 }, { "epoch": 0.4639217960400961, "grad_norm": 0.89453125, "learning_rate": 2.5745307324254693e-05, "loss": 3.881, "step": 1400 }, { "epoch": 0.4804904316129567, "grad_norm": 0.90234375, "learning_rate": 2.666543982333456e-05, "loss": 3.8808, "step": 1450 }, { "epoch": 0.49705906718581727, "grad_norm": 0.84375, "learning_rate": 2.7585572322414427e-05, "loss": 3.8782, "step": 1500 }, { "epoch": 0.5136277027586779, "grad_norm": 1.0, "learning_rate": 2.8505704821494296e-05, "loss": 3.8817, "step": 1550 }, { "epoch": 0.5301963383315385, "grad_norm": 0.94140625, "learning_rate": 2.942583732057416e-05, "loss": 3.8718, "step": 1600 }, { "epoch": 0.5467649739043989, "grad_norm": 0.80859375, "learning_rate": 3.034596981965403e-05, "loss": 3.8675, "step": 1650 }, { "epoch": 0.5633336094772595, "grad_norm": 0.75390625, "learning_rate": 3.12661023187339e-05, "loss": 3.8699, "step": 1700 }, { "epoch": 0.5799022450501201, "grad_norm": 0.77734375, "learning_rate": 3.2186234817813766e-05, "loss": 3.8645, "step": 1750 }, { "epoch": 0.5964708806229807, "grad_norm": 0.86328125, "learning_rate": 3.3106367316893635e-05, "loss": 3.8601, "step": 1800 }, { "epoch": 0.6130395161958413, "grad_norm": 0.85546875, "learning_rate": 3.4026499815973504e-05, "loss": 3.8513, "step": 1850 }, { "epoch": 0.6296081517687019, "grad_norm": 0.828125, "learning_rate": 3.4946632315053365e-05, "loss": 3.8583, "step": 1900 }, { "epoch": 0.6461767873415625, "grad_norm": 0.8359375, "learning_rate": 3.5866764814133234e-05, "loss": 3.857, "step": 1950 }, { "epoch": 0.662745422914423, "grad_norm": 0.796875, "learning_rate": 3.67868973132131e-05, "loss": 3.8488, "step": 2000 }, { "epoch": 0.6793140584872835, "grad_norm": 0.7734375, "learning_rate": 3.770702981229297e-05, "loss": 3.843, "step": 2050 }, { "epoch": 0.6958826940601441, "grad_norm": 0.79296875, "learning_rate": 3.862716231137284e-05, "loss": 3.8413, "step": 2100 }, { "epoch": 0.7124513296330047, "grad_norm": 0.8046875, "learning_rate": 3.954729481045271e-05, "loss": 3.836, "step": 2150 }, { "epoch": 0.7290199652058653, "grad_norm": 0.7578125, "learning_rate": 4.046742730953258e-05, "loss": 3.8366, "step": 2200 }, { "epoch": 0.7455886007787259, "grad_norm": 0.79296875, "learning_rate": 4.138755980861244e-05, "loss": 3.8347, "step": 2250 }, { "epoch": 0.7621572363515865, "grad_norm": 0.80078125, "learning_rate": 4.230769230769231e-05, "loss": 3.8299, "step": 2300 }, { "epoch": 0.778725871924447, "grad_norm": 0.75390625, "learning_rate": 4.3227824806772176e-05, "loss": 3.8187, "step": 2350 }, { "epoch": 0.7952945074973076, "grad_norm": 0.76953125, "learning_rate": 4.4147957305852044e-05, "loss": 3.8225, "step": 2400 }, { "epoch": 0.8118631430701682, "grad_norm": 0.81640625, "learning_rate": 4.506808980493191e-05, "loss": 3.8229, "step": 2450 }, { "epoch": 0.8284317786430287, "grad_norm": 0.77734375, "learning_rate": 4.598822230401178e-05, "loss": 3.8245, "step": 2500 }, { "epoch": 0.8450004142158893, "grad_norm": 0.78515625, "learning_rate": 4.690835480309165e-05, "loss": 3.8109, "step": 2550 }, { "epoch": 0.8615690497887499, "grad_norm": 0.78125, "learning_rate": 4.782848730217152e-05, "loss": 3.8195, "step": 2600 }, { "epoch": 0.8781376853616105, "grad_norm": 0.7890625, "learning_rate": 4.874861980125138e-05, "loss": 3.8125, "step": 2650 }, 
{ "epoch": 0.894706320934471, "grad_norm": 0.7578125, "learning_rate": 4.966875230033125e-05, "loss": 3.8131, "step": 2700 }, { "epoch": 0.9112749565073316, "grad_norm": 0.73046875, "learning_rate": 4.999998362078322e-05, "loss": 3.819, "step": 2750 }, { "epoch": 0.9278435920801922, "grad_norm": 0.75, "learning_rate": 4.999989244747393e-05, "loss": 3.8082, "step": 2800 }, { "epoch": 0.9444122276530528, "grad_norm": 0.79296875, "learning_rate": 4.9999721297876855e-05, "loss": 3.8146, "step": 2850 }, { "epoch": 0.9609808632259134, "grad_norm": 0.765625, "learning_rate": 4.999947017253951e-05, "loss": 3.8042, "step": 2900 }, { "epoch": 0.977549498798774, "grad_norm": 0.734375, "learning_rate": 4.9999139072265274e-05, "loss": 3.8072, "step": 2950 }, { "epoch": 0.9941181343716345, "grad_norm": 0.7578125, "learning_rate": 4.9998727998113335e-05, "loss": 3.8008, "step": 3000 }, { "epoch": 1.0106039267666307, "grad_norm": 0.31640625, "learning_rate": 4.999823695139877e-05, "loss": 3.7924, "step": 3050 }, { "epoch": 1.0271725623394914, "grad_norm": 0.306640625, "learning_rate": 4.999766593369246e-05, "loss": 3.7963, "step": 3100 }, { "epoch": 1.0437411979123519, "grad_norm": 0.33203125, "learning_rate": 4.999701494682112e-05, "loss": 3.7837, "step": 3150 }, { "epoch": 1.0603098334852126, "grad_norm": 0.298828125, "learning_rate": 4.999628399286731e-05, "loss": 3.7942, "step": 3200 }, { "epoch": 1.076878469058073, "grad_norm": 0.298828125, "learning_rate": 4.99954730741694e-05, "loss": 3.7819, "step": 3250 }, { "epoch": 1.0934471046309335, "grad_norm": 0.296875, "learning_rate": 4.999458219332157e-05, "loss": 3.7868, "step": 3300 }, { "epoch": 1.1100157402037942, "grad_norm": 0.294921875, "learning_rate": 4.9993611353173794e-05, "loss": 3.7924, "step": 3350 }, { "epoch": 1.1265843757766547, "grad_norm": 0.29296875, "learning_rate": 4.999256055683187e-05, "loss": 3.7884, "step": 3400 }, { "epoch": 1.1431530113495154, "grad_norm": 0.287109375, "learning_rate": 4.999142980765736e-05, "loss": 3.7875, "step": 3450 }, { "epoch": 1.159721646922376, "grad_norm": 0.3125, "learning_rate": 4.9990219109267596e-05, "loss": 3.7827, "step": 3500 }, { "epoch": 1.1762902824952366, "grad_norm": 0.314453125, "learning_rate": 4.9988928465535686e-05, "loss": 3.7832, "step": 3550 }, { "epoch": 1.192858918068097, "grad_norm": 0.28515625, "learning_rate": 4.9987557880590486e-05, "loss": 3.7854, "step": 3600 }, { "epoch": 1.2094275536409578, "grad_norm": 0.306640625, "learning_rate": 4.998610735881659e-05, "loss": 3.7765, "step": 3650 }, { "epoch": 1.2259961892138183, "grad_norm": 0.27734375, "learning_rate": 4.99845769048543e-05, "loss": 3.7835, "step": 3700 }, { "epoch": 1.2425648247866787, "grad_norm": 0.298828125, "learning_rate": 4.998296652359965e-05, "loss": 3.7809, "step": 3750 }, { "epoch": 1.2591334603595394, "grad_norm": 0.310546875, "learning_rate": 4.9981276220204344e-05, "loss": 3.7849, "step": 3800 }, { "epoch": 1.2757020959324, "grad_norm": 0.3125, "learning_rate": 4.997950600007578e-05, "loss": 3.7815, "step": 3850 }, { "epoch": 1.2922707315052606, "grad_norm": 0.302734375, "learning_rate": 4.997765586887702e-05, "loss": 3.7793, "step": 3900 }, { "epoch": 1.308839367078121, "grad_norm": 0.30859375, "learning_rate": 4.997572583252672e-05, "loss": 3.7729, "step": 3950 }, { "epoch": 1.3254080026509816, "grad_norm": 0.29296875, "learning_rate": 4.9973715897199226e-05, "loss": 3.7745, "step": 4000 }, { "epoch": 1.3419766382238423, "grad_norm": 0.32421875, "learning_rate": 4.9971626069324435e-05, "loss": 3.7688, 
"step": 4050 }, { "epoch": 1.358545273796703, "grad_norm": 0.275390625, "learning_rate": 4.996945635558785e-05, "loss": 3.7748, "step": 4100 }, { "epoch": 1.3751139093695635, "grad_norm": 0.291015625, "learning_rate": 4.996720676293052e-05, "loss": 3.7686, "step": 4150 }, { "epoch": 1.391682544942424, "grad_norm": 0.287109375, "learning_rate": 4.9964877298549045e-05, "loss": 3.7736, "step": 4200 }, { "epoch": 1.4082511805152846, "grad_norm": 0.306640625, "learning_rate": 4.9962467969895535e-05, "loss": 3.7751, "step": 4250 }, { "epoch": 1.4248198160881451, "grad_norm": 0.2890625, "learning_rate": 4.995997878467758e-05, "loss": 3.7673, "step": 4300 }, { "epoch": 1.4413884516610058, "grad_norm": 0.34375, "learning_rate": 4.995740975085825e-05, "loss": 3.7742, "step": 4350 }, { "epoch": 1.4579570872338663, "grad_norm": 0.294921875, "learning_rate": 4.9954760876656056e-05, "loss": 3.7737, "step": 4400 }, { "epoch": 1.4745257228067268, "grad_norm": 0.283203125, "learning_rate": 4.995203217054493e-05, "loss": 3.7704, "step": 4450 }, { "epoch": 1.4910943583795875, "grad_norm": 0.294921875, "learning_rate": 4.9949223641254156e-05, "loss": 3.7693, "step": 4500 }, { "epoch": 1.507662993952448, "grad_norm": 0.306640625, "learning_rate": 4.994633529776842e-05, "loss": 3.76, "step": 4550 }, { "epoch": 1.5242316295253087, "grad_norm": 0.27734375, "learning_rate": 4.994336714932771e-05, "loss": 3.7617, "step": 4600 }, { "epoch": 1.5408002650981691, "grad_norm": 0.287109375, "learning_rate": 4.9940319205427335e-05, "loss": 3.7737, "step": 4650 }, { "epoch": 1.5573689006710296, "grad_norm": 0.310546875, "learning_rate": 4.993719147581787e-05, "loss": 3.7699, "step": 4700 }, { "epoch": 1.5739375362438903, "grad_norm": 0.306640625, "learning_rate": 4.9933983970505116e-05, "loss": 3.7665, "step": 4750 }, { "epoch": 1.590506171816751, "grad_norm": 0.30859375, "learning_rate": 4.9930696699750095e-05, "loss": 3.7622, "step": 4800 }, { "epoch": 1.6070748073896115, "grad_norm": 0.287109375, "learning_rate": 4.992732967406901e-05, "loss": 3.7572, "step": 4850 }, { "epoch": 1.623643442962472, "grad_norm": 0.3125, "learning_rate": 4.992388290423318e-05, "loss": 3.7626, "step": 4900 }, { "epoch": 1.6402120785353325, "grad_norm": 0.328125, "learning_rate": 4.9920356401269055e-05, "loss": 3.7626, "step": 4950 }, { "epoch": 1.6567807141081932, "grad_norm": 0.31640625, "learning_rate": 4.991675017645815e-05, "loss": 3.7626, "step": 5000 }, { "epoch": 1.6567807141081932, "eval_loss": 3.72454833984375, "eval_runtime": 7.9243, "eval_samples_per_second": 122.282, "eval_steps_per_second": 2.019, "step": 5000 }, { "epoch": 1.6733493496810539, "grad_norm": 0.3046875, "learning_rate": 4.991306424133701e-05, "loss": 3.762, "step": 5050 }, { "epoch": 1.6899179852539143, "grad_norm": 0.296875, "learning_rate": 4.990929860769719e-05, "loss": 3.7576, "step": 5100 }, { "epoch": 1.7064866208267748, "grad_norm": 0.3203125, "learning_rate": 4.990545328758518e-05, "loss": 3.7624, "step": 5150 }, { "epoch": 1.7230552563996355, "grad_norm": 0.302734375, "learning_rate": 4.990152829330243e-05, "loss": 3.757, "step": 5200 }, { "epoch": 1.7396238919724962, "grad_norm": 0.283203125, "learning_rate": 4.989752363740524e-05, "loss": 3.7655, "step": 5250 }, { "epoch": 1.7561925275453567, "grad_norm": 0.298828125, "learning_rate": 4.989343933270477e-05, "loss": 3.7575, "step": 5300 }, { "epoch": 1.7727611631182172, "grad_norm": 0.30859375, "learning_rate": 4.9889275392266984e-05, "loss": 3.7618, "step": 5350 }, { "epoch": 1.7893297986910777, 
"grad_norm": 0.30859375, "learning_rate": 4.988503182941259e-05, "loss": 3.7561, "step": 5400 }, { "epoch": 1.8058984342639384, "grad_norm": 0.298828125, "learning_rate": 4.988070865771702e-05, "loss": 3.7645, "step": 5450 }, { "epoch": 1.822467069836799, "grad_norm": 0.294921875, "learning_rate": 4.9876305891010385e-05, "loss": 3.7571, "step": 5500 }, { "epoch": 1.8390357054096595, "grad_norm": 0.30859375, "learning_rate": 4.987182354337744e-05, "loss": 3.7565, "step": 5550 }, { "epoch": 1.85560434098252, "grad_norm": 0.310546875, "learning_rate": 4.986726162915748e-05, "loss": 3.7604, "step": 5600 }, { "epoch": 1.8721729765553805, "grad_norm": 0.28125, "learning_rate": 4.9862620162944386e-05, "loss": 3.7497, "step": 5650 }, { "epoch": 1.8887416121282412, "grad_norm": 0.291015625, "learning_rate": 4.9857899159586496e-05, "loss": 3.7498, "step": 5700 }, { "epoch": 1.905310247701102, "grad_norm": 0.302734375, "learning_rate": 4.9853098634186625e-05, "loss": 3.7488, "step": 5750 }, { "epoch": 1.9218788832739624, "grad_norm": 0.29296875, "learning_rate": 4.984821860210196e-05, "loss": 3.7517, "step": 5800 }, { "epoch": 1.9384475188468229, "grad_norm": 0.3125, "learning_rate": 4.984325907894404e-05, "loss": 3.7454, "step": 5850 }, { "epoch": 1.9550161544196836, "grad_norm": 0.291015625, "learning_rate": 4.98382200805787e-05, "loss": 3.7446, "step": 5900 }, { "epoch": 1.9715847899925443, "grad_norm": 0.30078125, "learning_rate": 4.9833101623126034e-05, "loss": 3.751, "step": 5950 }, { "epoch": 1.9881534255654048, "grad_norm": 0.296875, "learning_rate": 4.982790372296031e-05, "loss": 3.7485, "step": 6000 }, { "epoch": 2.004639217960401, "grad_norm": 0.341796875, "learning_rate": 4.9822626396709965e-05, "loss": 3.7467, "step": 6050 }, { "epoch": 2.0212078535332614, "grad_norm": 0.341796875, "learning_rate": 4.98172696612575e-05, "loss": 3.7434, "step": 6100 }, { "epoch": 2.0377764891061223, "grad_norm": 0.330078125, "learning_rate": 4.981183353373946e-05, "loss": 3.7451, "step": 6150 }, { "epoch": 2.054345124678983, "grad_norm": 0.328125, "learning_rate": 4.980631803154638e-05, "loss": 3.7338, "step": 6200 }, { "epoch": 2.0709137602518433, "grad_norm": 0.333984375, "learning_rate": 4.98007231723227e-05, "loss": 3.7348, "step": 6250 }, { "epoch": 2.0874823958247037, "grad_norm": 0.3359375, "learning_rate": 4.979504897396675e-05, "loss": 3.7295, "step": 6300 }, { "epoch": 2.104051031397564, "grad_norm": 0.365234375, "learning_rate": 4.978929545463066e-05, "loss": 3.7357, "step": 6350 }, { "epoch": 2.120619666970425, "grad_norm": 0.365234375, "learning_rate": 4.97834626327203e-05, "loss": 3.7387, "step": 6400 }, { "epoch": 2.1371883025432856, "grad_norm": 0.345703125, "learning_rate": 4.9777550526895265e-05, "loss": 3.7359, "step": 6450 }, { "epoch": 2.153756938116146, "grad_norm": 0.357421875, "learning_rate": 4.977155915606877e-05, "loss": 3.7359, "step": 6500 }, { "epoch": 2.1703255736890066, "grad_norm": 0.330078125, "learning_rate": 4.9765488539407586e-05, "loss": 3.7333, "step": 6550 }, { "epoch": 2.186894209261867, "grad_norm": 0.33203125, "learning_rate": 4.975933869633202e-05, "loss": 3.7296, "step": 6600 }, { "epoch": 2.203462844834728, "grad_norm": 0.3125, "learning_rate": 4.9753109646515814e-05, "loss": 3.7297, "step": 6650 }, { "epoch": 2.2200314804075885, "grad_norm": 0.361328125, "learning_rate": 4.974680140988612e-05, "loss": 3.7334, "step": 6700 }, { "epoch": 2.236600115980449, "grad_norm": 0.33984375, "learning_rate": 4.974041400662338e-05, "loss": 3.7342, "step": 6750 }, { 
"epoch": 2.2531687515533094, "grad_norm": 0.376953125, "learning_rate": 4.973394745716133e-05, "loss": 3.726, "step": 6800 }, { "epoch": 2.2697373871261703, "grad_norm": 0.35546875, "learning_rate": 4.972740178218688e-05, "loss": 3.7359, "step": 6850 }, { "epoch": 2.286306022699031, "grad_norm": 0.322265625, "learning_rate": 4.972077700264007e-05, "loss": 3.7298, "step": 6900 }, { "epoch": 2.3028746582718913, "grad_norm": 0.326171875, "learning_rate": 4.9714073139714004e-05, "loss": 3.731, "step": 6950 }, { "epoch": 2.319443293844752, "grad_norm": 0.337890625, "learning_rate": 4.970729021485476e-05, "loss": 3.7268, "step": 7000 }, { "epoch": 2.3360119294176123, "grad_norm": 0.33203125, "learning_rate": 4.9700428249761386e-05, "loss": 3.7249, "step": 7050 }, { "epoch": 2.352580564990473, "grad_norm": 0.33203125, "learning_rate": 4.969348726638574e-05, "loss": 3.7297, "step": 7100 }, { "epoch": 2.3691492005633337, "grad_norm": 0.34765625, "learning_rate": 4.968646728693248e-05, "loss": 3.728, "step": 7150 }, { "epoch": 2.385717836136194, "grad_norm": 0.328125, "learning_rate": 4.967936833385898e-05, "loss": 3.7298, "step": 7200 }, { "epoch": 2.4022864717090546, "grad_norm": 0.357421875, "learning_rate": 4.9672190429875266e-05, "loss": 3.7228, "step": 7250 }, { "epoch": 2.4188551072819156, "grad_norm": 0.3671875, "learning_rate": 4.96649335979439e-05, "loss": 3.7254, "step": 7300 }, { "epoch": 2.435423742854776, "grad_norm": 0.36328125, "learning_rate": 4.9657597861279976e-05, "loss": 3.7284, "step": 7350 }, { "epoch": 2.4519923784276365, "grad_norm": 0.31640625, "learning_rate": 4.965018324335099e-05, "loss": 3.7213, "step": 7400 }, { "epoch": 2.468561014000497, "grad_norm": 0.310546875, "learning_rate": 4.964268976787679e-05, "loss": 3.7224, "step": 7450 }, { "epoch": 2.4851296495733575, "grad_norm": 0.330078125, "learning_rate": 4.9635117458829496e-05, "loss": 3.7222, "step": 7500 }, { "epoch": 2.501698285146218, "grad_norm": 0.322265625, "learning_rate": 4.962746634043341e-05, "loss": 3.7286, "step": 7550 }, { "epoch": 2.518266920719079, "grad_norm": 0.37890625, "learning_rate": 4.961973643716497e-05, "loss": 3.7287, "step": 7600 }, { "epoch": 2.5348355562919394, "grad_norm": 0.326171875, "learning_rate": 4.961192777375263e-05, "loss": 3.7208, "step": 7650 }, { "epoch": 2.5514041918648, "grad_norm": 0.3515625, "learning_rate": 4.9604040375176816e-05, "loss": 3.7305, "step": 7700 }, { "epoch": 2.5679728274376608, "grad_norm": 0.330078125, "learning_rate": 4.9596074266669844e-05, "loss": 3.7171, "step": 7750 }, { "epoch": 2.5845414630105212, "grad_norm": 0.32421875, "learning_rate": 4.95880294737158e-05, "loss": 3.721, "step": 7800 }, { "epoch": 2.6011100985833817, "grad_norm": 0.310546875, "learning_rate": 4.9579906022050517e-05, "loss": 3.7225, "step": 7850 }, { "epoch": 2.617678734156242, "grad_norm": 0.322265625, "learning_rate": 4.957170393766143e-05, "loss": 3.7215, "step": 7900 }, { "epoch": 2.6342473697291027, "grad_norm": 0.345703125, "learning_rate": 4.956342324678755e-05, "loss": 3.7237, "step": 7950 }, { "epoch": 2.650816005301963, "grad_norm": 0.3125, "learning_rate": 4.9555063975919345e-05, "loss": 3.7189, "step": 8000 }, { "epoch": 2.667384640874824, "grad_norm": 0.314453125, "learning_rate": 4.954662615179868e-05, "loss": 3.7209, "step": 8050 }, { "epoch": 2.6839532764476846, "grad_norm": 0.328125, "learning_rate": 4.953810980141869e-05, "loss": 3.7267, "step": 8100 }, { "epoch": 2.700521912020545, "grad_norm": 0.32421875, "learning_rate": 4.952951495202374e-05, "loss": 
3.72, "step": 8150 }, { "epoch": 2.717090547593406, "grad_norm": 0.33203125, "learning_rate": 4.9520841631109315e-05, "loss": 3.7168, "step": 8200 }, { "epoch": 2.7336591831662664, "grad_norm": 0.33203125, "learning_rate": 4.951208986642194e-05, "loss": 3.7167, "step": 8250 }, { "epoch": 2.750227818739127, "grad_norm": 0.32421875, "learning_rate": 4.9503259685959074e-05, "loss": 3.7192, "step": 8300 }, { "epoch": 2.7667964543119874, "grad_norm": 0.3515625, "learning_rate": 4.949435111796905e-05, "loss": 3.7162, "step": 8350 }, { "epoch": 2.783365089884848, "grad_norm": 0.328125, "learning_rate": 4.948536419095095e-05, "loss": 3.7161, "step": 8400 }, { "epoch": 2.7999337254577084, "grad_norm": 0.32421875, "learning_rate": 4.947629893365453e-05, "loss": 3.718, "step": 8450 }, { "epoch": 2.8165023610305693, "grad_norm": 0.322265625, "learning_rate": 4.9467155375080165e-05, "loss": 3.7152, "step": 8500 }, { "epoch": 2.8330709966034298, "grad_norm": 0.34375, "learning_rate": 4.9457933544478684e-05, "loss": 3.7237, "step": 8550 }, { "epoch": 2.8496396321762902, "grad_norm": 0.330078125, "learning_rate": 4.944863347135132e-05, "loss": 3.7092, "step": 8600 }, { "epoch": 2.8662082677491507, "grad_norm": 0.328125, "learning_rate": 4.943925518544962e-05, "loss": 3.7174, "step": 8650 }, { "epoch": 2.8827769033220116, "grad_norm": 0.3359375, "learning_rate": 4.942979871677532e-05, "loss": 3.7162, "step": 8700 }, { "epoch": 2.899345538894872, "grad_norm": 0.359375, "learning_rate": 4.94202640955803e-05, "loss": 3.7152, "step": 8750 }, { "epoch": 2.9159141744677326, "grad_norm": 0.357421875, "learning_rate": 4.9410651352366435e-05, "loss": 3.7181, "step": 8800 }, { "epoch": 2.932482810040593, "grad_norm": 0.333984375, "learning_rate": 4.94009605178855e-05, "loss": 3.7177, "step": 8850 }, { "epoch": 2.9490514456134536, "grad_norm": 0.341796875, "learning_rate": 4.939119162313912e-05, "loss": 3.7216, "step": 8900 }, { "epoch": 2.965620081186314, "grad_norm": 0.3671875, "learning_rate": 4.9381344699378626e-05, "loss": 3.712, "step": 8950 }, { "epoch": 2.982188716759175, "grad_norm": 0.322265625, "learning_rate": 4.937141977810497e-05, "loss": 3.7199, "step": 9000 }, { "epoch": 2.9987573523320354, "grad_norm": 0.326171875, "learning_rate": 4.936141689106861e-05, "loss": 3.7058, "step": 9050 }, { "epoch": 3.0152431447270316, "grad_norm": 0.408203125, "learning_rate": 4.935133607026945e-05, "loss": 3.6969, "step": 9100 }, { "epoch": 3.0318117802998925, "grad_norm": 0.41796875, "learning_rate": 4.934117734795669e-05, "loss": 3.706, "step": 9150 }, { "epoch": 3.048380415872753, "grad_norm": 0.369140625, "learning_rate": 4.933094075662874e-05, "loss": 3.7006, "step": 9200 }, { "epoch": 3.0649490514456135, "grad_norm": 0.361328125, "learning_rate": 4.9320626329033134e-05, "loss": 3.703, "step": 9250 }, { "epoch": 3.081517687018474, "grad_norm": 0.376953125, "learning_rate": 4.9310234098166396e-05, "loss": 3.7084, "step": 9300 }, { "epoch": 3.0980863225913344, "grad_norm": 0.388671875, "learning_rate": 4.929976409727395e-05, "loss": 3.6991, "step": 9350 }, { "epoch": 3.1146549581641954, "grad_norm": 0.40234375, "learning_rate": 4.928921635985001e-05, "loss": 3.7057, "step": 9400 }, { "epoch": 3.131223593737056, "grad_norm": 0.392578125, "learning_rate": 4.9278590919637466e-05, "loss": 3.703, "step": 9450 }, { "epoch": 3.1477922293099163, "grad_norm": 0.36328125, "learning_rate": 4.9267887810627824e-05, "loss": 3.7103, "step": 9500 }, { "epoch": 3.164360864882777, "grad_norm": 0.359375, "learning_rate": 
4.9257107067061e-05, "loss": 3.7017, "step": 9550 }, { "epoch": 3.1809295004556373, "grad_norm": 0.3828125, "learning_rate": 4.924624872342531e-05, "loss": 3.6937, "step": 9600 }, { "epoch": 3.197498136028498, "grad_norm": 0.404296875, "learning_rate": 4.92353128144573e-05, "loss": 3.7063, "step": 9650 }, { "epoch": 3.2140667716013587, "grad_norm": 0.373046875, "learning_rate": 4.9224299375141656e-05, "loss": 3.6978, "step": 9700 }, { "epoch": 3.230635407174219, "grad_norm": 0.35546875, "learning_rate": 4.921320844071109e-05, "loss": 3.705, "step": 9750 }, { "epoch": 3.2472040427470796, "grad_norm": 0.36328125, "learning_rate": 4.920204004664624e-05, "loss": 3.704, "step": 9800 }, { "epoch": 3.26377267831994, "grad_norm": 0.38671875, "learning_rate": 4.91907942286755e-05, "loss": 3.703, "step": 9850 }, { "epoch": 3.280341313892801, "grad_norm": 0.37109375, "learning_rate": 4.917947102277499e-05, "loss": 3.7027, "step": 9900 }, { "epoch": 3.2969099494656615, "grad_norm": 0.376953125, "learning_rate": 4.916807046516838e-05, "loss": 3.6977, "step": 9950 }, { "epoch": 3.313478585038522, "grad_norm": 0.359375, "learning_rate": 4.9156592592326814e-05, "loss": 3.6968, "step": 10000 }, { "epoch": 3.313478585038522, "eval_loss": 3.6868932247161865, "eval_runtime": 8.062, "eval_samples_per_second": 120.194, "eval_steps_per_second": 1.985, "step": 10000 }, { "epoch": 3.3300472206113825, "grad_norm": 0.37890625, "learning_rate": 4.9145037440968746e-05, "loss": 3.7025, "step": 10050 }, { "epoch": 3.3466158561842434, "grad_norm": 0.388671875, "learning_rate": 4.913340504805984e-05, "loss": 3.7034, "step": 10100 }, { "epoch": 3.363184491757104, "grad_norm": 0.357421875, "learning_rate": 4.912169545081292e-05, "loss": 3.7025, "step": 10150 }, { "epoch": 3.3797531273299644, "grad_norm": 0.373046875, "learning_rate": 4.910990868668772e-05, "loss": 3.6956, "step": 10200 }, { "epoch": 3.396321762902825, "grad_norm": 0.3828125, "learning_rate": 4.9098044793390876e-05, "loss": 3.7025, "step": 10250 }, { "epoch": 3.4128903984756853, "grad_norm": 0.36328125, "learning_rate": 4.908610380887576e-05, "loss": 3.6915, "step": 10300 }, { "epoch": 3.4294590340485462, "grad_norm": 0.384765625, "learning_rate": 4.9074085771342365e-05, "loss": 3.6987, "step": 10350 }, { "epoch": 3.4460276696214067, "grad_norm": 0.38671875, "learning_rate": 4.9061990719237175e-05, "loss": 3.6943, "step": 10400 }, { "epoch": 3.462596305194267, "grad_norm": 0.380859375, "learning_rate": 4.904981869125303e-05, "loss": 3.6961, "step": 10450 }, { "epoch": 3.4791649407671277, "grad_norm": 0.408203125, "learning_rate": 4.903756972632908e-05, "loss": 3.6985, "step": 10500 }, { "epoch": 3.4957335763399886, "grad_norm": 0.3828125, "learning_rate": 4.9025243863650545e-05, "loss": 3.7024, "step": 10550 }, { "epoch": 3.512302211912849, "grad_norm": 0.365234375, "learning_rate": 4.901284114264866e-05, "loss": 3.6989, "step": 10600 }, { "epoch": 3.5288708474857096, "grad_norm": 0.376953125, "learning_rate": 4.900036160300053e-05, "loss": 3.7015, "step": 10650 }, { "epoch": 3.54543948305857, "grad_norm": 0.365234375, "learning_rate": 4.898780528462904e-05, "loss": 3.6958, "step": 10700 }, { "epoch": 3.5620081186314305, "grad_norm": 0.392578125, "learning_rate": 4.8975172227702665e-05, "loss": 3.6988, "step": 10750 }, { "epoch": 3.578576754204291, "grad_norm": 0.365234375, "learning_rate": 4.896246247263537e-05, "loss": 3.6926, "step": 10800 }, { "epoch": 3.595145389777152, "grad_norm": 0.373046875, "learning_rate": 4.89496760600865e-05, "loss": 3.6903, 
"step": 10850 }, { "epoch": 3.6117140253500124, "grad_norm": 0.384765625, "learning_rate": 4.893681303096062e-05, "loss": 3.6945, "step": 10900 }, { "epoch": 3.628282660922873, "grad_norm": 0.369140625, "learning_rate": 4.892387342640739e-05, "loss": 3.6977, "step": 10950 }, { "epoch": 3.644851296495734, "grad_norm": 0.365234375, "learning_rate": 4.8910857287821465e-05, "loss": 3.6882, "step": 11000 }, { "epoch": 3.6614199320685943, "grad_norm": 0.37109375, "learning_rate": 4.889776465684232e-05, "loss": 3.6968, "step": 11050 }, { "epoch": 3.6779885676414548, "grad_norm": 0.359375, "learning_rate": 4.888459557535413e-05, "loss": 3.6937, "step": 11100 }, { "epoch": 3.6945572032143152, "grad_norm": 0.375, "learning_rate": 4.8871350085485646e-05, "loss": 3.7016, "step": 11150 }, { "epoch": 3.7111258387871757, "grad_norm": 0.384765625, "learning_rate": 4.8858028229610055e-05, "loss": 3.6983, "step": 11200 }, { "epoch": 3.727694474360036, "grad_norm": 0.375, "learning_rate": 4.884463005034485e-05, "loss": 3.6995, "step": 11250 }, { "epoch": 3.744263109932897, "grad_norm": 0.373046875, "learning_rate": 4.8831155590551674e-05, "loss": 3.6978, "step": 11300 }, { "epoch": 3.7608317455057576, "grad_norm": 0.376953125, "learning_rate": 4.881760489333621e-05, "loss": 3.6981, "step": 11350 }, { "epoch": 3.777400381078618, "grad_norm": 0.375, "learning_rate": 4.880397800204801e-05, "loss": 3.6884, "step": 11400 }, { "epoch": 3.793969016651479, "grad_norm": 0.375, "learning_rate": 4.879027496028041e-05, "loss": 3.6957, "step": 11450 }, { "epoch": 3.8105376522243395, "grad_norm": 0.384765625, "learning_rate": 4.8776495811870315e-05, "loss": 3.6933, "step": 11500 }, { "epoch": 3.8271062877972, "grad_norm": 0.373046875, "learning_rate": 4.876264060089813e-05, "loss": 3.6857, "step": 11550 }, { "epoch": 3.8436749233700604, "grad_norm": 0.357421875, "learning_rate": 4.874870937168757e-05, "loss": 3.6998, "step": 11600 }, { "epoch": 3.860243558942921, "grad_norm": 0.373046875, "learning_rate": 4.873470216880557e-05, "loss": 3.6876, "step": 11650 }, { "epoch": 3.8768121945157814, "grad_norm": 0.375, "learning_rate": 4.8720619037062056e-05, "loss": 3.6933, "step": 11700 }, { "epoch": 3.8933808300886423, "grad_norm": 0.380859375, "learning_rate": 4.87064600215099e-05, "loss": 3.6993, "step": 11750 }, { "epoch": 3.909949465661503, "grad_norm": 0.376953125, "learning_rate": 4.869222516744473e-05, "loss": 3.6934, "step": 11800 }, { "epoch": 3.9265181012343633, "grad_norm": 0.392578125, "learning_rate": 4.867791452040475e-05, "loss": 3.6912, "step": 11850 }, { "epoch": 3.9430867368072238, "grad_norm": 0.359375, "learning_rate": 4.866352812617066e-05, "loss": 3.6898, "step": 11900 }, { "epoch": 3.9596553723800847, "grad_norm": 0.388671875, "learning_rate": 4.864906603076549e-05, "loss": 3.6901, "step": 11950 }, { "epoch": 3.976224007952945, "grad_norm": 0.3671875, "learning_rate": 4.8634528280454404e-05, "loss": 3.6925, "step": 12000 }, { "epoch": 3.9927926435258057, "grad_norm": 0.365234375, "learning_rate": 4.861991492174464e-05, "loss": 3.6884, "step": 12050 }, { "epoch": 4.009278435920802, "grad_norm": 0.421875, "learning_rate": 4.8605226001385256e-05, "loss": 3.685, "step": 12100 }, { "epoch": 4.025847071493662, "grad_norm": 0.421875, "learning_rate": 4.85904615663671e-05, "loss": 3.6815, "step": 12150 }, { "epoch": 4.042415707066523, "grad_norm": 0.43359375, "learning_rate": 4.857562166392253e-05, "loss": 3.6793, "step": 12200 }, { "epoch": 4.058984342639383, "grad_norm": 0.443359375, "learning_rate": 
4.8560706341525386e-05, "loss": 3.6799, "step": 12250 }, { "epoch": 4.075552978212245, "grad_norm": 0.435546875, "learning_rate": 4.854571564689075e-05, "loss": 3.6809, "step": 12300 }, { "epoch": 4.092121613785105, "grad_norm": 0.431640625, "learning_rate": 4.853064962797483e-05, "loss": 3.6788, "step": 12350 }, { "epoch": 4.108690249357966, "grad_norm": 0.419921875, "learning_rate": 4.851550833297481e-05, "loss": 3.6869, "step": 12400 }, { "epoch": 4.125258884930826, "grad_norm": 0.455078125, "learning_rate": 4.850029181032869e-05, "loss": 3.6864, "step": 12450 }, { "epoch": 4.1418275205036865, "grad_norm": 0.4453125, "learning_rate": 4.8485000108715094e-05, "loss": 3.6845, "step": 12500 }, { "epoch": 4.158396156076547, "grad_norm": 0.4375, "learning_rate": 4.846963327705319e-05, "loss": 3.6772, "step": 12550 }, { "epoch": 4.1749647916494075, "grad_norm": 0.462890625, "learning_rate": 4.845419136450248e-05, "loss": 3.691, "step": 12600 }, { "epoch": 4.191533427222268, "grad_norm": 0.421875, "learning_rate": 4.843867442046264e-05, "loss": 3.6889, "step": 12650 }, { "epoch": 4.208102062795128, "grad_norm": 0.435546875, "learning_rate": 4.842308249457339e-05, "loss": 3.6835, "step": 12700 }, { "epoch": 4.22467069836799, "grad_norm": 0.4296875, "learning_rate": 4.8407415636714315e-05, "loss": 3.6833, "step": 12750 }, { "epoch": 4.24123933394085, "grad_norm": 0.404296875, "learning_rate": 4.839167389700472e-05, "loss": 3.6782, "step": 12800 }, { "epoch": 4.257807969513711, "grad_norm": 0.41796875, "learning_rate": 4.837585732580345e-05, "loss": 3.6887, "step": 12850 }, { "epoch": 4.274376605086571, "grad_norm": 0.416015625, "learning_rate": 4.835996597370877e-05, "loss": 3.6855, "step": 12900 }, { "epoch": 4.290945240659432, "grad_norm": 0.421875, "learning_rate": 4.8343999891558136e-05, "loss": 3.6828, "step": 12950 }, { "epoch": 4.307513876232292, "grad_norm": 0.44140625, "learning_rate": 4.832795913042809e-05, "loss": 3.6815, "step": 13000 }, { "epoch": 4.324082511805153, "grad_norm": 0.4296875, "learning_rate": 4.831184374163407e-05, "loss": 3.6836, "step": 13050 }, { "epoch": 4.340651147378013, "grad_norm": 0.42578125, "learning_rate": 4.829565377673026e-05, "loss": 3.6919, "step": 13100 }, { "epoch": 4.357219782950874, "grad_norm": 0.455078125, "learning_rate": 4.827938928750941e-05, "loss": 3.6815, "step": 13150 }, { "epoch": 4.373788418523734, "grad_norm": 0.431640625, "learning_rate": 4.82630503260027e-05, "loss": 3.6834, "step": 13200 }, { "epoch": 4.3903570540965955, "grad_norm": 0.439453125, "learning_rate": 4.824663694447952e-05, "loss": 3.6834, "step": 13250 }, { "epoch": 4.406925689669456, "grad_norm": 0.4296875, "learning_rate": 4.823014919544734e-05, "loss": 3.6814, "step": 13300 }, { "epoch": 4.4234943252423164, "grad_norm": 0.431640625, "learning_rate": 4.8213587131651566e-05, "loss": 3.6849, "step": 13350 }, { "epoch": 4.440062960815177, "grad_norm": 0.4453125, "learning_rate": 4.819695080607531e-05, "loss": 3.6795, "step": 13400 }, { "epoch": 4.456631596388037, "grad_norm": 0.4375, "learning_rate": 4.8180240271939256e-05, "loss": 3.6823, "step": 13450 }, { "epoch": 4.473200231960898, "grad_norm": 0.439453125, "learning_rate": 4.8163455582701514e-05, "loss": 3.6798, "step": 13500 }, { "epoch": 4.489768867533758, "grad_norm": 0.416015625, "learning_rate": 4.8146596792057385e-05, "loss": 3.6811, "step": 13550 }, { "epoch": 4.506337503106619, "grad_norm": 0.42578125, "learning_rate": 4.812966395393924e-05, "loss": 3.6858, "step": 13600 }, { "epoch": 4.522906138679479, 
"grad_norm": 0.44140625, "learning_rate": 4.811265712251635e-05, "loss": 3.6776, "step": 13650 }, { "epoch": 4.539474774252341, "grad_norm": 0.40625, "learning_rate": 4.809557635219465e-05, "loss": 3.6812, "step": 13700 }, { "epoch": 4.556043409825201, "grad_norm": 0.416015625, "learning_rate": 4.807842169761667e-05, "loss": 3.6772, "step": 13750 }, { "epoch": 4.572612045398062, "grad_norm": 0.43359375, "learning_rate": 4.806119321366124e-05, "loss": 3.6846, "step": 13800 }, { "epoch": 4.589180680970922, "grad_norm": 0.439453125, "learning_rate": 4.8043890955443436e-05, "loss": 3.6826, "step": 13850 }, { "epoch": 4.605749316543783, "grad_norm": 0.447265625, "learning_rate": 4.802651497831428e-05, "loss": 3.6789, "step": 13900 }, { "epoch": 4.622317952116643, "grad_norm": 0.41015625, "learning_rate": 4.8009065337860685e-05, "loss": 3.6776, "step": 13950 }, { "epoch": 4.638886587689504, "grad_norm": 0.435546875, "learning_rate": 4.799154208990518e-05, "loss": 3.6736, "step": 14000 }, { "epoch": 4.655455223262364, "grad_norm": 0.421875, "learning_rate": 4.7973945290505766e-05, "loss": 3.6817, "step": 14050 }, { "epoch": 4.6720238588352245, "grad_norm": 0.46484375, "learning_rate": 4.7956274995955775e-05, "loss": 3.6843, "step": 14100 }, { "epoch": 4.688592494408086, "grad_norm": 0.443359375, "learning_rate": 4.793853126278361e-05, "loss": 3.6777, "step": 14150 }, { "epoch": 4.705161129980946, "grad_norm": 0.421875, "learning_rate": 4.792071414775265e-05, "loss": 3.6788, "step": 14200 }, { "epoch": 4.721729765553807, "grad_norm": 0.42578125, "learning_rate": 4.7902823707861e-05, "loss": 3.6818, "step": 14250 }, { "epoch": 4.738298401126667, "grad_norm": 0.451171875, "learning_rate": 4.7884860000341344e-05, "loss": 3.6844, "step": 14300 }, { "epoch": 4.754867036699528, "grad_norm": 0.4375, "learning_rate": 4.786682308266076e-05, "loss": 3.68, "step": 14350 }, { "epoch": 4.771435672272388, "grad_norm": 0.4296875, "learning_rate": 4.784871301252052e-05, "loss": 3.6782, "step": 14400 }, { "epoch": 4.788004307845249, "grad_norm": 0.435546875, "learning_rate": 4.783052984785593e-05, "loss": 3.6743, "step": 14450 }, { "epoch": 4.804572943418109, "grad_norm": 0.435546875, "learning_rate": 4.781227364683611e-05, "loss": 3.6864, "step": 14500 }, { "epoch": 4.82114157899097, "grad_norm": 0.423828125, "learning_rate": 4.7793944467863836e-05, "loss": 3.6766, "step": 14550 }, { "epoch": 4.837710214563831, "grad_norm": 0.41796875, "learning_rate": 4.777554236957537e-05, "loss": 3.676, "step": 14600 }, { "epoch": 4.854278850136692, "grad_norm": 0.408203125, "learning_rate": 4.775706741084019e-05, "loss": 3.6834, "step": 14650 }, { "epoch": 4.870847485709552, "grad_norm": 0.42578125, "learning_rate": 4.7738519650760935e-05, "loss": 3.679, "step": 14700 }, { "epoch": 4.8874161212824125, "grad_norm": 0.41796875, "learning_rate": 4.7719899148673074e-05, "loss": 3.6792, "step": 14750 }, { "epoch": 4.903984756855273, "grad_norm": 0.431640625, "learning_rate": 4.770120596414481e-05, "loss": 3.6783, "step": 14800 }, { "epoch": 4.9205533924281335, "grad_norm": 0.44921875, "learning_rate": 4.768244015697687e-05, "loss": 3.6849, "step": 14850 }, { "epoch": 4.937122028000994, "grad_norm": 0.462890625, "learning_rate": 4.7663601787202285e-05, "loss": 3.6749, "step": 14900 }, { "epoch": 4.9536906635738545, "grad_norm": 0.42578125, "learning_rate": 4.7644690915086245e-05, "loss": 3.6825, "step": 14950 }, { "epoch": 4.970259299146715, "grad_norm": 0.404296875, "learning_rate": 4.7625707601125845e-05, "loss": 3.6747, "step": 
15000 }, { "epoch": 4.970259299146715, "eval_loss": 3.661933422088623, "eval_runtime": 7.9724, "eval_samples_per_second": 121.545, "eval_steps_per_second": 2.007, "step": 15000 }, { "epoch": 4.986827934719576, "grad_norm": 0.408203125, "learning_rate": 4.760665190604996e-05, "loss": 3.6823, "step": 15050 }, { "epoch": 5.0033137271145725, "grad_norm": 0.47265625, "learning_rate": 4.758752389081901e-05, "loss": 3.6759, "step": 15100 }, { "epoch": 5.019882362687433, "grad_norm": 0.4609375, "learning_rate": 4.756832361662473e-05, "loss": 3.6767, "step": 15150 }, { "epoch": 5.036450998260293, "grad_norm": 0.486328125, "learning_rate": 4.754905114489009e-05, "loss": 3.6778, "step": 15200 }, { "epoch": 5.053019633833154, "grad_norm": 0.470703125, "learning_rate": 4.752970653726896e-05, "loss": 3.6676, "step": 15250 }, { "epoch": 5.069588269406014, "grad_norm": 0.478515625, "learning_rate": 4.751028985564602e-05, "loss": 3.6659, "step": 15300 }, { "epoch": 5.086156904978875, "grad_norm": 0.455078125, "learning_rate": 4.7490801162136484e-05, "loss": 3.6648, "step": 15350 }, { "epoch": 5.102725540551735, "grad_norm": 0.478515625, "learning_rate": 4.7471240519085964e-05, "loss": 3.6797, "step": 15400 }, { "epoch": 5.119294176124596, "grad_norm": 0.4765625, "learning_rate": 4.745160798907023e-05, "loss": 3.6685, "step": 15450 }, { "epoch": 5.135862811697456, "grad_norm": 0.48046875, "learning_rate": 4.7431903634895034e-05, "loss": 3.6657, "step": 15500 }, { "epoch": 5.152431447270318, "grad_norm": 0.484375, "learning_rate": 4.741212751959589e-05, "loss": 3.6699, "step": 15550 }, { "epoch": 5.169000082843178, "grad_norm": 0.466796875, "learning_rate": 4.739227970643787e-05, "loss": 3.6705, "step": 15600 }, { "epoch": 5.185568718416039, "grad_norm": 0.4921875, "learning_rate": 4.737236025891544e-05, "loss": 3.6692, "step": 15650 }, { "epoch": 5.202137353988899, "grad_norm": 0.4921875, "learning_rate": 4.73523692407522e-05, "loss": 3.6632, "step": 15700 }, { "epoch": 5.21870598956176, "grad_norm": 0.470703125, "learning_rate": 4.733230671590074e-05, "loss": 3.6764, "step": 15750 }, { "epoch": 5.23527462513462, "grad_norm": 0.4921875, "learning_rate": 4.731217274854236e-05, "loss": 3.6778, "step": 15800 }, { "epoch": 5.2518432607074805, "grad_norm": 0.482421875, "learning_rate": 4.729196740308696e-05, "loss": 3.6771, "step": 15850 }, { "epoch": 5.268411896280341, "grad_norm": 0.47265625, "learning_rate": 4.7271690744172745e-05, "loss": 3.6752, "step": 15900 }, { "epoch": 5.2849805318532015, "grad_norm": 0.4609375, "learning_rate": 4.725134283666607e-05, "loss": 3.6766, "step": 15950 }, { "epoch": 5.301549167426063, "grad_norm": 0.486328125, "learning_rate": 4.723092374566123e-05, "loss": 3.6753, "step": 16000 }, { "epoch": 5.318117802998923, "grad_norm": 0.478515625, "learning_rate": 4.721043353648021e-05, "loss": 3.6728, "step": 16050 }, { "epoch": 5.334686438571784, "grad_norm": 0.4921875, "learning_rate": 4.718987227467256e-05, "loss": 3.6722, "step": 16100 }, { "epoch": 5.351255074144644, "grad_norm": 0.486328125, "learning_rate": 4.716924002601507e-05, "loss": 3.6674, "step": 16150 }, { "epoch": 5.367823709717505, "grad_norm": 0.478515625, "learning_rate": 4.7148536856511655e-05, "loss": 3.6766, "step": 16200 }, { "epoch": 5.384392345290365, "grad_norm": 0.4609375, "learning_rate": 4.712776283239311e-05, "loss": 3.6643, "step": 16250 }, { "epoch": 5.400960980863226, "grad_norm": 0.49609375, "learning_rate": 4.71069180201169e-05, "loss": 3.6655, "step": 16300 }, { "epoch": 5.417529616436086, 
"grad_norm": 0.5, "learning_rate": 4.708600248636693e-05, "loss": 3.6734, "step": 16350 }, { "epoch": 5.434098252008947, "grad_norm": 0.498046875, "learning_rate": 4.7065016298053365e-05, "loss": 3.6684, "step": 16400 }, { "epoch": 5.450666887581807, "grad_norm": 0.47265625, "learning_rate": 4.7043959522312384e-05, "loss": 3.6661, "step": 16450 }, { "epoch": 5.4672355231546685, "grad_norm": 0.462890625, "learning_rate": 4.7022832226505996e-05, "loss": 3.6706, "step": 16500 }, { "epoch": 5.483804158727529, "grad_norm": 0.478515625, "learning_rate": 4.700163447822179e-05, "loss": 3.6701, "step": 16550 }, { "epoch": 5.5003727943003895, "grad_norm": 0.5, "learning_rate": 4.6980366345272756e-05, "loss": 3.6724, "step": 16600 }, { "epoch": 5.51694142987325, "grad_norm": 0.46875, "learning_rate": 4.6959027895697026e-05, "loss": 3.6691, "step": 16650 }, { "epoch": 5.5335100654461105, "grad_norm": 0.486328125, "learning_rate": 4.693761919775771e-05, "loss": 3.6729, "step": 16700 }, { "epoch": 5.550078701018971, "grad_norm": 0.484375, "learning_rate": 4.691614031994261e-05, "loss": 3.6766, "step": 16750 }, { "epoch": 5.566647336591831, "grad_norm": 0.458984375, "learning_rate": 4.6894591330964076e-05, "loss": 3.6744, "step": 16800 }, { "epoch": 5.583215972164692, "grad_norm": 0.46875, "learning_rate": 4.6872972299758726e-05, "loss": 3.6675, "step": 16850 }, { "epoch": 5.599784607737552, "grad_norm": 0.498046875, "learning_rate": 4.685128329548725e-05, "loss": 3.6669, "step": 16900 }, { "epoch": 5.616353243310414, "grad_norm": 0.46875, "learning_rate": 4.6829524387534184e-05, "loss": 3.6688, "step": 16950 }, { "epoch": 5.632921878883274, "grad_norm": 0.474609375, "learning_rate": 4.6807695645507704e-05, "loss": 3.6708, "step": 17000 }, { "epoch": 5.649490514456135, "grad_norm": 0.4765625, "learning_rate": 4.6785797139239376e-05, "loss": 3.669, "step": 17050 }, { "epoch": 5.666059150028995, "grad_norm": 0.494140625, "learning_rate": 4.676382893878395e-05, "loss": 3.6652, "step": 17100 }, { "epoch": 5.682627785601856, "grad_norm": 0.482421875, "learning_rate": 4.674179111441913e-05, "loss": 3.6658, "step": 17150 }, { "epoch": 5.699196421174716, "grad_norm": 0.48828125, "learning_rate": 4.671968373664536e-05, "loss": 3.6668, "step": 17200 }, { "epoch": 5.715765056747577, "grad_norm": 0.4609375, "learning_rate": 4.6697506876185586e-05, "loss": 3.6721, "step": 17250 }, { "epoch": 5.732333692320437, "grad_norm": 0.48828125, "learning_rate": 4.667526060398503e-05, "loss": 3.6696, "step": 17300 }, { "epoch": 5.748902327893298, "grad_norm": 0.474609375, "learning_rate": 4.665294499121097e-05, "loss": 3.6714, "step": 17350 }, { "epoch": 5.765470963466159, "grad_norm": 0.494140625, "learning_rate": 4.6630560109252496e-05, "loss": 3.665, "step": 17400 }, { "epoch": 5.782039599039019, "grad_norm": 0.478515625, "learning_rate": 4.660810602972032e-05, "loss": 3.6705, "step": 17450 }, { "epoch": 5.79860823461188, "grad_norm": 0.466796875, "learning_rate": 4.658558282444651e-05, "loss": 3.6658, "step": 17500 }, { "epoch": 5.81517687018474, "grad_norm": 0.474609375, "learning_rate": 4.6562990565484265e-05, "loss": 3.6624, "step": 17550 }, { "epoch": 5.831745505757601, "grad_norm": 0.46484375, "learning_rate": 4.654032932510769e-05, "loss": 3.6718, "step": 17600 }, { "epoch": 5.848314141330461, "grad_norm": 0.47265625, "learning_rate": 4.651759917581159e-05, "loss": 3.6682, "step": 17650 }, { "epoch": 5.864882776903322, "grad_norm": 0.4609375, "learning_rate": 4.649480019031117e-05, "loss": 3.6631, "step": 17700 }, { 
"epoch": 5.881451412476182, "grad_norm": 0.48828125, "learning_rate": 4.647193244154189e-05, "loss": 3.6697, "step": 17750 }, { "epoch": 5.898020048049043, "grad_norm": 0.45703125, "learning_rate": 4.644899600265917e-05, "loss": 3.6689, "step": 17800 }, { "epoch": 5.914588683621904, "grad_norm": 0.5, "learning_rate": 4.6425990947038175e-05, "loss": 3.6726, "step": 17850 }, { "epoch": 5.931157319194765, "grad_norm": 0.46875, "learning_rate": 4.640291734827357e-05, "loss": 3.6715, "step": 17900 }, { "epoch": 5.947725954767625, "grad_norm": 0.478515625, "learning_rate": 4.637977528017931e-05, "loss": 3.67, "step": 17950 }, { "epoch": 5.964294590340486, "grad_norm": 0.46484375, "learning_rate": 4.63565648167884e-05, "loss": 3.6715, "step": 18000 }, { "epoch": 5.980863225913346, "grad_norm": 0.4765625, "learning_rate": 4.6333286032352594e-05, "loss": 3.6649, "step": 18050 }, { "epoch": 5.9974318614862066, "grad_norm": 0.482421875, "learning_rate": 4.630993900134227e-05, "loss": 3.6725, "step": 18100 }, { "epoch": 6.013917653881203, "grad_norm": 0.53515625, "learning_rate": 4.628652379844609e-05, "loss": 3.6574, "step": 18150 }, { "epoch": 6.030486289454063, "grad_norm": 0.53125, "learning_rate": 4.626304049857081e-05, "loss": 3.6601, "step": 18200 }, { "epoch": 6.047054925026924, "grad_norm": 0.5703125, "learning_rate": 4.623948917684106e-05, "loss": 3.67, "step": 18250 }, { "epoch": 6.063623560599785, "grad_norm": 0.5625, "learning_rate": 4.621586990859904e-05, "loss": 3.6611, "step": 18300 }, { "epoch": 6.0801921961726455, "grad_norm": 0.53125, "learning_rate": 4.6192182769404335e-05, "loss": 3.6608, "step": 18350 }, { "epoch": 6.096760831745506, "grad_norm": 0.53125, "learning_rate": 4.616842783503365e-05, "loss": 3.6631, "step": 18400 }, { "epoch": 6.1133294673183665, "grad_norm": 0.53125, "learning_rate": 4.614460518148057e-05, "loss": 3.657, "step": 18450 }, { "epoch": 6.129898102891227, "grad_norm": 0.5234375, "learning_rate": 4.6120714884955316e-05, "loss": 3.6635, "step": 18500 }, { "epoch": 6.146466738464087, "grad_norm": 0.546875, "learning_rate": 4.609675702188452e-05, "loss": 3.6595, "step": 18550 }, { "epoch": 6.163035374036948, "grad_norm": 0.55078125, "learning_rate": 4.6072731668910926e-05, "loss": 3.6676, "step": 18600 }, { "epoch": 6.179604009609808, "grad_norm": 0.5625, "learning_rate": 4.604863890289322e-05, "loss": 3.6611, "step": 18650 }, { "epoch": 6.196172645182669, "grad_norm": 0.546875, "learning_rate": 4.6024478800905735e-05, "loss": 3.6679, "step": 18700 }, { "epoch": 6.21274128075553, "grad_norm": 0.54296875, "learning_rate": 4.600025144023822e-05, "loss": 3.6612, "step": 18750 }, { "epoch": 6.229309916328391, "grad_norm": 0.56640625, "learning_rate": 4.597595689839556e-05, "loss": 3.6664, "step": 18800 }, { "epoch": 6.245878551901251, "grad_norm": 0.546875, "learning_rate": 4.59515952530976e-05, "loss": 3.6644, "step": 18850 }, { "epoch": 6.262447187474112, "grad_norm": 0.52734375, "learning_rate": 4.592716658227884e-05, "loss": 3.6656, "step": 18900 }, { "epoch": 6.279015823046972, "grad_norm": 0.53515625, "learning_rate": 4.5902670964088165e-05, "loss": 3.6643, "step": 18950 }, { "epoch": 6.295584458619833, "grad_norm": 0.55859375, "learning_rate": 4.587810847688868e-05, "loss": 3.6589, "step": 19000 }, { "epoch": 6.312153094192693, "grad_norm": 0.55078125, "learning_rate": 4.585347919925737e-05, "loss": 3.6601, "step": 19050 }, { "epoch": 6.328721729765554, "grad_norm": 0.54296875, "learning_rate": 4.582878320998491e-05, "loss": 3.662, "step": 19100 }, { 
"epoch": 6.345290365338414, "grad_norm": 0.52734375, "learning_rate": 4.580402058807539e-05, "loss": 3.6678, "step": 19150 }, { "epoch": 6.3618590009112745, "grad_norm": 0.51953125, "learning_rate": 4.577919141274604e-05, "loss": 3.6652, "step": 19200 }, { "epoch": 6.378427636484136, "grad_norm": 0.53515625, "learning_rate": 4.575429576342701e-05, "loss": 3.6591, "step": 19250 }, { "epoch": 6.394996272056996, "grad_norm": 0.55078125, "learning_rate": 4.5729333719761124e-05, "loss": 3.6592, "step": 19300 }, { "epoch": 6.411564907629857, "grad_norm": 0.546875, "learning_rate": 4.5704305361603585e-05, "loss": 3.66, "step": 19350 }, { "epoch": 6.428133543202717, "grad_norm": 0.52734375, "learning_rate": 4.567921076902173e-05, "loss": 3.6641, "step": 19400 }, { "epoch": 6.444702178775578, "grad_norm": 0.54296875, "learning_rate": 4.5654050022294805e-05, "loss": 3.6676, "step": 19450 }, { "epoch": 6.461270814348438, "grad_norm": 0.55859375, "learning_rate": 4.562882320191368e-05, "loss": 3.6594, "step": 19500 }, { "epoch": 6.477839449921299, "grad_norm": 0.52734375, "learning_rate": 4.560353038858062e-05, "loss": 3.6554, "step": 19550 }, { "epoch": 6.494408085494159, "grad_norm": 0.52734375, "learning_rate": 4.557817166320896e-05, "loss": 3.668, "step": 19600 }, { "epoch": 6.51097672106702, "grad_norm": 0.546875, "learning_rate": 4.5552747106922924e-05, "loss": 3.6629, "step": 19650 }, { "epoch": 6.52754535663988, "grad_norm": 0.515625, "learning_rate": 4.552725680105733e-05, "loss": 3.666, "step": 19700 }, { "epoch": 6.544113992212742, "grad_norm": 0.5390625, "learning_rate": 4.550170082715733e-05, "loss": 3.6651, "step": 19750 }, { "epoch": 6.560682627785602, "grad_norm": 0.5546875, "learning_rate": 4.5476079266978164e-05, "loss": 3.6653, "step": 19800 }, { "epoch": 6.5772512633584626, "grad_norm": 0.5546875, "learning_rate": 4.545039220248486e-05, "loss": 3.6644, "step": 19850 }, { "epoch": 6.593819898931323, "grad_norm": 0.546875, "learning_rate": 4.542463971585201e-05, "loss": 3.6624, "step": 19900 }, { "epoch": 6.6103885345041835, "grad_norm": 0.53125, "learning_rate": 4.539882188946352e-05, "loss": 3.6577, "step": 19950 }, { "epoch": 6.626957170077044, "grad_norm": 0.53515625, "learning_rate": 4.537293880591229e-05, "loss": 3.6674, "step": 20000 }, { "epoch": 6.626957170077044, "eval_loss": 3.6582796573638916, "eval_runtime": 7.9393, "eval_samples_per_second": 122.05, "eval_steps_per_second": 2.015, "step": 20000 } ], "logging_steps": 50, "max_steps": 90540, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.903523641762932e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }