{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.011105003359263517, "eval_steps": 1000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 240.56250762939453, "epoch": 2.776250839815879e-06, "grad_norm": 0.06479538977146149, "kl": 0.0, "learning_rate": 8.310249307479225e-09, "loss": -0.0006, "reward": 0.16875001788139343, "reward_std": 0.19138706848025322, "rewards/countdown_reward_func": 0.16875001788139343, "step": 1, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 5.552501679631758e-06, "grad_norm": 0.055109903216362, "kl": 0.0, "learning_rate": 1.662049861495845e-08, "loss": -0.0006, "step": 2 }, { "clip_ratio": 0.0, "epoch": 8.328752519447637e-06, "grad_norm": 0.0616462379693985, "kl": 0.0006676262128166854, "learning_rate": 2.4930747922437675e-08, "loss": -0.0006, "step": 3 }, { "clip_ratio": 0.0, "epoch": 1.1105003359263517e-05, "grad_norm": 0.061320219188928604, "kl": 0.0007026523817330599, "learning_rate": 3.32409972299169e-08, "loss": -0.0005, "step": 4 }, { "clip_ratio": 0.0002470490289852023, "epoch": 1.3881254199079395e-05, "grad_norm": 0.059856366366147995, "kl": 0.0007550643058493733, "learning_rate": 4.155124653739612e-08, "loss": -0.0006, "step": 5 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 1.6657505038895273e-05, "grad_norm": 0.09312308579683304, "kl": 0.0007920752104837447, "learning_rate": 4.986149584487535e-08, "loss": -0.0009, "step": 6 }, { "clip_ratio": 0.0, "epoch": 1.9433755878711153e-05, "grad_norm": 0.0637163445353508, "kl": 0.0006995745643507689, "learning_rate": 5.8171745152354567e-08, "loss": -0.0007, "step": 7 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 2.2210006718527033e-05, "grad_norm": 0.055853065103292465, "kl": 0.0008612782985437661, "learning_rate": 6.64819944598338e-08, "loss": -0.0002, "step": 8 }, { "clip_ratio": 0.0, "epoch": 2.498625755834291e-05, "grad_norm": 0.060049429535865784, "kl": 0.00074524967931211, "learning_rate": 7.479224376731302e-08, "loss": -0.0007, "step": 9 }, { "clip_ratio": 0.0, "epoch": 2.776250839815879e-05, "grad_norm": 0.059901271015405655, "kl": 0.0008222198521252722, "learning_rate": 8.310249307479224e-08, "loss": -0.0004, "step": 10 }, { "clip_ratio": 0.00016566881095059216, "epoch": 3.0538759237974666e-05, "grad_norm": 0.05985812842845917, "kl": 0.0007750319491606206, "learning_rate": 9.141274238227148e-08, "loss": -0.0006, "step": 11 }, { "clip_ratio": 0.0, "epoch": 3.3315010077790546e-05, "grad_norm": 0.09739804267883301, "kl": 0.0008287512173410505, "learning_rate": 9.97229916897507e-08, "loss": -0.0006, "step": 12 }, { "clip_ratio": 8.928571332944557e-05, "completion_length": 224.6666717529297, "epoch": 3.6091260917606426e-05, "grad_norm": 0.19317923486232758, "kl": 0.0007415811996906996, "learning_rate": 1.0803324099722992e-07, "loss": 0.0222, "reward": 0.2645833343267441, "reward_std": 0.2765769511461258, "rewards/countdown_reward_func": 0.2645833268761635, "step": 13, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.827683632262051e-05, "epoch": 3.8867511757422306e-05, "grad_norm": 0.07646733522415161, "kl": 0.0008568160701543093, "learning_rate": 1.1634349030470913e-07, "loss": 0.0217, "step": 14 }, { "clip_ratio": 0.00019336564582772553, "epoch": 4.1643762597238186e-05, "grad_norm": 0.10118044912815094, "kl": 0.0007392842962872237, "learning_rate": 1.2465373961218836e-07, "loss": 0.0214, "step": 15 }, { "clip_ratio": 0.0004612546181306243, "epoch": 4.4420013437054066e-05, "grad_norm": 0.09648283571004868, "kl": 0.0007288659107871354, "learning_rate": 1.329639889196676e-07, "loss": 0.0215, "step": 16 }, { "clip_ratio": 0.0, "epoch": 4.7196264276869946e-05, "grad_norm": 0.10432377457618713, "kl": 0.0007946545956656337, "learning_rate": 1.4127423822714683e-07, "loss": 0.021, "step": 17 }, { "clip_ratio": 0.0, "epoch": 4.997251511668582e-05, "grad_norm": 0.0799316018819809, "kl": 0.0007485100359190255, "learning_rate": 1.4958448753462604e-07, "loss": 0.0217, "step": 18 }, { "clip_ratio": 8.480325777782127e-05, "epoch": 5.27487659565017e-05, "grad_norm": 0.20232859253883362, "kl": 0.0009814091317821294, "learning_rate": 1.5789473684210525e-07, "loss": 0.022, "step": 19 }, { "clip_ratio": 0.00027768261497840285, "epoch": 5.552501679631758e-05, "grad_norm": 0.07837554812431335, "kl": 0.000779987225541845, "learning_rate": 1.6620498614958448e-07, "loss": 0.0217, "step": 20 }, { "clip_ratio": 0.00010407993249827996, "epoch": 5.830126763613346e-05, "grad_norm": 0.09943106770515442, "kl": 0.0007467044633813202, "learning_rate": 1.7451523545706372e-07, "loss": 0.0214, "step": 21 }, { "clip_ratio": 0.0005535055533982813, "epoch": 6.107751847594933e-05, "grad_norm": 0.09467501193284988, "kl": 0.0007356623827945441, "learning_rate": 1.8282548476454296e-07, "loss": 0.0217, "step": 22 }, { "clip_ratio": 8.979885024018586e-05, "epoch": 6.385376931576521e-05, "grad_norm": 0.1044834554195404, "kl": 0.0007783641340211034, "learning_rate": 1.9113573407202217e-07, "loss": 0.0216, "step": 23 }, { "clip_ratio": 0.0, "epoch": 6.663002015558109e-05, "grad_norm": 0.08091197907924652, "kl": 0.0007477364561054856, "learning_rate": 1.994459833795014e-07, "loss": 0.0214, "step": 24 }, { "clip_ratio": 0.000327225134242326, "completion_length": 223.89583587646484, "epoch": 6.940627099539697e-05, "grad_norm": 0.07894178479909897, "kl": 0.0009490847878623754, "learning_rate": 2.0775623268698064e-07, "loss": 0.0032, "reward": 0.24583334475755692, "reward_std": 0.2639523148536682, "rewards/countdown_reward_func": 0.24583334475755692, "step": 25, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00018698579515330493, "epoch": 7.218252183521285e-05, "grad_norm": 0.07660482078790665, "kl": 0.0009208305855281651, "learning_rate": 2.1606648199445985e-07, "loss": 0.0032, "step": 26 }, { "clip_ratio": 0.0, "epoch": 7.495877267502873e-05, "grad_norm": 0.10856402665376663, "kl": 0.0008324629161506891, "learning_rate": 2.2437673130193906e-07, "loss": 0.0029, "step": 27 }, { "clip_ratio": 0.0, "epoch": 7.773502351484461e-05, "grad_norm": 0.10259876400232315, "kl": 0.0007138713845051825, "learning_rate": 2.3268698060941827e-07, "loss": 0.0028, "step": 28 }, { "clip_ratio": 0.0005700875190086663, "epoch": 8.051127435466049e-05, "grad_norm": 0.07817202061414719, "kl": 0.0009188149124383926, "learning_rate": 2.409972299168975e-07, "loss": 0.003, "step": 29 }, { "clip_ratio": 8.18062835605815e-05, "epoch": 8.328752519447637e-05, "grad_norm": 0.09154564142227173, "kl": 0.000773191568441689, "learning_rate": 2.493074792243767e-07, "loss": 0.0026, "step": 30 }, { "clip_ratio": 0.00010195758659392595, "epoch": 8.606377603429225e-05, "grad_norm": 0.09764699637889862, "kl": 0.0008510929765179753, "learning_rate": 2.57617728531856e-07, "loss": 0.0032, "step": 31 }, { "clip_ratio": 0.0, "epoch": 8.884002687410813e-05, "grad_norm": 0.07986056804656982, "kl": 0.0009318325319327414, "learning_rate": 2.659279778393352e-07, "loss": 0.0025, "step": 32 }, { "clip_ratio": 0.00019616441568359733, "epoch": 9.161627771392401e-05, "grad_norm": 0.10944823920726776, "kl": 0.0008964207954704762, "learning_rate": 2.742382271468144e-07, "loss": 0.0024, "step": 33 }, { "clip_ratio": 0.0, "epoch": 9.439252855373989e-05, "grad_norm": 0.08562328666448593, "kl": 0.000787771336035803, "learning_rate": 2.8254847645429366e-07, "loss": 0.0027, "step": 34 }, { "clip_ratio": 0.00016318648704327643, "epoch": 9.716877939355577e-05, "grad_norm": 0.06373903900384903, "kl": 0.0009429152996744961, "learning_rate": 2.9085872576177287e-07, "loss": 0.0028, "step": 35 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 9.994503023337164e-05, "grad_norm": 0.0951249971985817, "kl": 0.0008729175024200231, "learning_rate": 2.991689750692521e-07, "loss": 0.0029, "step": 36 }, { "clip_ratio": 0.00010088780982187018, "completion_length": 234.20833587646484, "epoch": 0.00010272128107318752, "grad_norm": 0.0882072001695633, "kl": 0.001087184005882591, "learning_rate": 3.0747922437673134e-07, "loss": -0.0108, "reward": 0.24791668355464935, "reward_std": 0.2163795679807663, "rewards/countdown_reward_func": 0.24791668355464935, "step": 37, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0001054975319130034, "grad_norm": 0.08608131855726242, "kl": 0.0007613546913489699, "learning_rate": 3.157894736842105e-07, "loss": -0.0112, "step": 38 }, { "clip_ratio": 0.0, "epoch": 0.00010827378275281928, "grad_norm": 0.081035315990448, "kl": 0.0008077043457888067, "learning_rate": 3.2409972299168976e-07, "loss": -0.0113, "step": 39 }, { "clip_ratio": 0.0, "epoch": 0.00011105003359263516, "grad_norm": 0.07537035644054413, "kl": 0.0007854337454773486, "learning_rate": 3.3240997229916897e-07, "loss": -0.0118, "step": 40 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00011382628443245104, "grad_norm": 0.08525540679693222, "kl": 0.0007534388569183648, "learning_rate": 3.407202216066482e-07, "loss": -0.0112, "step": 41 }, { "clip_ratio": 0.0003339054746902548, "epoch": 0.00011660253527226692, "grad_norm": 0.1856469362974167, "kl": 0.000957256241235882, "learning_rate": 3.4903047091412744e-07, "loss": -0.0105, "step": 42 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0001193787861120828, "grad_norm": 0.06687930226325989, "kl": 0.0008277066808659583, "learning_rate": 3.5734072022160665e-07, "loss": -0.011, "step": 43 }, { "clip_ratio": 0.0, "epoch": 0.00012215503695189867, "grad_norm": 0.07720962911844254, "kl": 0.0007938558992464095, "learning_rate": 3.656509695290859e-07, "loss": -0.0115, "step": 44 }, { "clip_ratio": 0.0, "epoch": 0.00012493128779171456, "grad_norm": 0.08875006437301636, "kl": 0.0007698170084040612, "learning_rate": 3.739612188365651e-07, "loss": -0.011, "step": 45 }, { "clip_ratio": 0.0, "epoch": 0.00012770753863153043, "grad_norm": 0.07545354217290878, "kl": 0.0008280337788164616, "learning_rate": 3.8227146814404433e-07, "loss": -0.0114, "step": 46 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00013048378947134632, "grad_norm": 0.07932056486606598, "kl": 0.000836798019008711, "learning_rate": 3.905817174515236e-07, "loss": -0.0109, "step": 47 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00013326004031116219, "grad_norm": 0.11963900178670883, "kl": 0.0009241281659342349, "learning_rate": 3.988919667590028e-07, "loss": -0.011, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 218.25000762939453, "epoch": 0.00013603629115097808, "grad_norm": 0.06625409424304962, "kl": 0.0008119716076180339, "learning_rate": 4.07202216066482e-07, "loss": 0.0113, "reward": 0.24166668951511383, "reward_std": 0.18230264633893967, "rewards/countdown_reward_func": 0.24166668206453323, "step": 49, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.00013881254199079395, "grad_norm": 0.07639951258897781, "kl": 0.0009117976296693087, "learning_rate": 4.155124653739613e-07, "loss": 0.0117, "step": 50 }, { "clip_ratio": 0.00010530749568715692, "epoch": 0.00014158879283060984, "grad_norm": 0.0806988924741745, "kl": 0.0008828463032841682, "learning_rate": 4.238227146814405e-07, "loss": 0.0111, "step": 51 }, { "clip_ratio": 0.0, "epoch": 0.0001443650436704257, "grad_norm": 0.07537931948900223, "kl": 0.0007655246299691498, "learning_rate": 4.321329639889197e-07, "loss": 0.0111, "step": 52 }, { "clip_ratio": 0.0, "epoch": 0.0001471412945102416, "grad_norm": 0.06568142026662827, "kl": 0.0007329773507080972, "learning_rate": 4.4044321329639896e-07, "loss": 0.0112, "step": 53 }, { "clip_ratio": 0.0, "epoch": 0.00014991754535005747, "grad_norm": 0.08050818741321564, "kl": 0.0007917624025139958, "learning_rate": 4.487534626038781e-07, "loss": 0.0115, "step": 54 }, { "clip_ratio": 0.0, "epoch": 0.00015269379618987336, "grad_norm": 0.07148700952529907, "kl": 0.0008034448546823114, "learning_rate": 4.570637119113573e-07, "loss": 0.0113, "step": 55 }, { "clip_ratio": 0.00021934154210612178, "epoch": 0.00015547004702968923, "grad_norm": 0.07921700179576874, "kl": 0.0008946515154093504, "learning_rate": 4.6537396121883653e-07, "loss": 0.0113, "step": 56 }, { "clip_ratio": 8.692628762219101e-05, "epoch": 0.0001582462978695051, "grad_norm": 0.07876580208539963, "kl": 0.0009213399316649884, "learning_rate": 4.736842105263158e-07, "loss": 0.011, "step": 57 }, { "clip_ratio": 0.0002706017985474318, "epoch": 0.00016102254870932099, "grad_norm": 0.06707654148340225, "kl": 0.0008947286114562303, "learning_rate": 4.81994459833795e-07, "loss": 0.0109, "step": 58 }, { "clip_ratio": 0.0, "epoch": 0.00016379879954913685, "grad_norm": 0.08683553338050842, "kl": 0.0007533867028541863, "learning_rate": 4.903047091412742e-07, "loss": 0.0113, "step": 59 }, { "clip_ratio": 0.0, "epoch": 0.00016657505038895275, "grad_norm": 0.07259709388017654, "kl": 0.0008512145723216236, "learning_rate": 4.986149584487534e-07, "loss": 0.0115, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 242.2291717529297, "epoch": 0.0001693513012287686, "grad_norm": 0.10226056724786758, "kl": 0.0007961629016790539, "learning_rate": 5.069252077562327e-07, "loss": 0.0107, "reward": 0.3229166716337204, "reward_std": 0.3417354077100754, "rewards/countdown_reward_func": 0.3229166716337204, "step": 61, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0001721275520685845, "grad_norm": 0.1283038705587387, "kl": 0.0008402638195548207, "learning_rate": 5.15235457063712e-07, "loss": 0.0097, "step": 62 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00017490380290840037, "grad_norm": 0.10426519066095352, "kl": 0.0008127306937240064, "learning_rate": 5.235457063711912e-07, "loss": 0.0104, "step": 63 }, { "clip_ratio": 0.0, "epoch": 0.00017768005374821627, "grad_norm": 0.09159515053033829, "kl": 0.0008558765111956745, "learning_rate": 5.318559556786704e-07, "loss": 0.0104, "step": 64 }, { "clip_ratio": 0.0, "epoch": 0.00018045630458803213, "grad_norm": 0.09254452586174011, "kl": 0.00104894393007271, "learning_rate": 5.401662049861496e-07, "loss": 0.0107, "step": 65 }, { "clip_ratio": 0.00045153952669352293, "epoch": 0.00018323255542784803, "grad_norm": 0.10837826132774353, "kl": 0.0008054885256569833, "learning_rate": 5.484764542936288e-07, "loss": 0.0104, "step": 66 }, { "clip_ratio": 0.0, "epoch": 0.0001860088062676639, "grad_norm": 0.09977608174085617, "kl": 0.0007916014001239091, "learning_rate": 5.567867036011081e-07, "loss": 0.011, "step": 67 }, { "clip_ratio": 0.0, "epoch": 0.00018878505710747979, "grad_norm": 0.1362018585205078, "kl": 0.0008146155159920454, "learning_rate": 5.650969529085873e-07, "loss": 0.0102, "step": 68 }, { "clip_ratio": 0.0, "epoch": 0.00019156130794729565, "grad_norm": 0.09612994641065598, "kl": 0.0008106507884804159, "learning_rate": 5.734072022160665e-07, "loss": 0.0099, "step": 69 }, { "clip_ratio": 0.00017850531003205106, "epoch": 0.00019433755878711155, "grad_norm": 0.10862355679273605, "kl": 0.0008707395172677934, "learning_rate": 5.817174515235457e-07, "loss": 0.0095, "step": 70 }, { "clip_ratio": 0.0, "epoch": 0.0001971138096269274, "grad_norm": 0.09866558760404587, "kl": 0.0008735440496820956, "learning_rate": 5.900277008310249e-07, "loss": 0.0106, "step": 71 }, { "clip_ratio": 9.110787505051121e-05, "epoch": 0.00019989006046674328, "grad_norm": 0.09790490567684174, "kl": 0.0008786998223513365, "learning_rate": 5.983379501385042e-07, "loss": 0.0105, "step": 72 }, { "clip_ratio": 0.0001945525291375816, "completion_length": 232.2291717529297, "epoch": 0.00020266631130655917, "grad_norm": 0.09406734257936478, "kl": 0.0010395684512332082, "learning_rate": 6.066481994459835e-07, "loss": 0.0024, "reward": 0.23750000447034836, "reward_std": 0.2697988599538803, "rewards/countdown_reward_func": 0.23750000447034836, "step": 73, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00020544256214637504, "grad_norm": 0.11244494467973709, "kl": 0.0007681146962568164, "learning_rate": 6.149584487534627e-07, "loss": 0.0034, "step": 74 }, { "clip_ratio": 0.0, "epoch": 0.00020821881298619093, "grad_norm": 0.15458862483501434, "kl": 0.0007610543980263174, "learning_rate": 6.232686980609418e-07, "loss": 0.0025, "step": 75 }, { "clip_ratio": 0.0, "epoch": 0.0002109950638260068, "grad_norm": 0.07957284897565842, "kl": 0.0009243077947758138, "learning_rate": 6.31578947368421e-07, "loss": 0.0031, "step": 76 }, { "clip_ratio": 0.0, "epoch": 0.0002137713146658227, "grad_norm": 0.1434016078710556, "kl": 0.0009022459562402219, "learning_rate": 6.398891966759003e-07, "loss": 0.0033, "step": 77 }, { "clip_ratio": 0.0002997264382429421, "epoch": 0.00021654756550563856, "grad_norm": 0.07511159032583237, "kl": 0.0009360458934679627, "learning_rate": 6.481994459833795e-07, "loss": 0.0025, "step": 78 }, { "clip_ratio": 0.0002821488888002932, "epoch": 0.00021932381634545445, "grad_norm": 0.10179822146892548, "kl": 0.0009871571965049952, "learning_rate": 6.565096952908587e-07, "loss": 0.0025, "step": 79 }, { "clip_ratio": 9.72762645687908e-05, "epoch": 0.00022210006718527032, "grad_norm": 0.10691147297620773, "kl": 0.0007517710037063807, "learning_rate": 6.648199445983379e-07, "loss": 0.0032, "step": 80 }, { "clip_ratio": 0.0, "epoch": 0.0002248763180250862, "grad_norm": 0.1533641219139099, "kl": 0.0008048239687923342, "learning_rate": 6.731301939058171e-07, "loss": 0.0022, "step": 81 }, { "clip_ratio": 0.0, "epoch": 0.00022765256886490208, "grad_norm": 0.08312182873487473, "kl": 0.0009179475018754601, "learning_rate": 6.814404432132964e-07, "loss": 0.0032, "step": 82 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00023042881970471797, "grad_norm": 0.11414949595928192, "kl": 0.0008502569980919361, "learning_rate": 6.897506925207757e-07, "loss": 0.0024, "step": 83 }, { "clip_ratio": 0.0001669251942075789, "epoch": 0.00023320507054453384, "grad_norm": 0.07714688032865524, "kl": 0.0008958573453128338, "learning_rate": 6.980609418282549e-07, "loss": 0.0027, "step": 84 }, { "clip_ratio": 0.0002005309797823429, "completion_length": 219.5416717529297, "epoch": 0.00023598132138434973, "grad_norm": 0.08387522399425507, "kl": 0.0009274522599298507, "learning_rate": 7.063711911357341e-07, "loss": 0.0284, "reward": 0.1666666716337204, "reward_std": 0.16661179438233376, "rewards/countdown_reward_func": 0.1666666641831398, "step": 85, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.077705180970952e-05, "epoch": 0.0002387575722241656, "grad_norm": 0.07613258063793182, "kl": 0.0010448671237099916, "learning_rate": 7.146814404432133e-07, "loss": 0.028, "step": 86 }, { "clip_ratio": 0.0, "epoch": 0.00024153382306398146, "grad_norm": 0.11073552072048187, "kl": 0.000871124560944736, "learning_rate": 7.229916897506925e-07, "loss": 0.0284, "step": 87 }, { "clip_ratio": 0.0, "epoch": 0.00024431007390379733, "grad_norm": 0.08452224731445312, "kl": 0.0010483066726010293, "learning_rate": 7.313019390581718e-07, "loss": 0.0288, "step": 88 }, { "clip_ratio": 0.00011130899656563997, "epoch": 0.0002470863247436132, "grad_norm": 0.0861300677061081, "kl": 0.0009267764107789844, "learning_rate": 7.39612188365651e-07, "loss": 0.028, "step": 89 }, { "clip_ratio": 0.0, "epoch": 0.0002498625755834291, "grad_norm": 0.07036253809928894, "kl": 0.0008911852783057839, "learning_rate": 7.479224376731302e-07, "loss": 0.028, "step": 90 }, { "clip_ratio": 0.0003118399763479829, "epoch": 0.000252638826423245, "grad_norm": 0.09484658390283585, "kl": 0.0009728774311952293, "learning_rate": 7.562326869806093e-07, "loss": 0.0283, "step": 91 }, { "clip_ratio": 0.00011130899656563997, "epoch": 0.00025541507726306085, "grad_norm": 0.07554644346237183, "kl": 0.0008746770035941154, "learning_rate": 7.645429362880887e-07, "loss": 0.0278, "step": 92 }, { "clip_ratio": 0.0, "epoch": 0.00025819132810287674, "grad_norm": 0.1146901547908783, "kl": 0.0008961272542364895, "learning_rate": 7.728531855955679e-07, "loss": 0.0286, "step": 93 }, { "clip_ratio": 8.191350207198411e-05, "epoch": 0.00026096757894269264, "grad_norm": 0.08794571459293365, "kl": 0.0010003510979004204, "learning_rate": 7.811634349030472e-07, "loss": 0.0281, "step": 94 }, { "clip_ratio": 0.00023231577506521717, "epoch": 0.00026374382978250853, "grad_norm": 0.08675868809223175, "kl": 0.0009201938519254327, "learning_rate": 7.894736842105263e-07, "loss": 0.028, "step": 95 }, { "clip_ratio": 0.0, "epoch": 0.00026652008062232437, "grad_norm": 0.07415983825922012, "kl": 0.0008623613393865526, "learning_rate": 7.977839335180056e-07, "loss": 0.028, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 240.6041717529297, "epoch": 0.00026929633146214026, "grad_norm": 0.05498122051358223, "kl": 0.000935161137022078, "learning_rate": 8.060941828254847e-07, "loss": 0.0019, "reward": 0.18958335369825363, "reward_std": 0.20226776599884033, "rewards/countdown_reward_func": 0.18958335369825363, "step": 97, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.741259080125019e-05, "epoch": 0.00027207258230195616, "grad_norm": 0.0805509015917778, "kl": 0.0009494524856563658, "learning_rate": 8.14404432132964e-07, "loss": 0.0017, "step": 98 }, { "clip_ratio": 0.0, "epoch": 0.00027484883314177205, "grad_norm": 0.06182171031832695, "kl": 0.0010764948674477637, "learning_rate": 8.227146814404432e-07, "loss": 0.0018, "step": 99 }, { "clip_ratio": 0.0, "epoch": 0.0002776250839815879, "grad_norm": 0.05677403509616852, "kl": 0.000932041322812438, "learning_rate": 8.310249307479226e-07, "loss": 0.0022, "step": 100 }, { "clip_ratio": 0.0, "epoch": 0.0002804013348214038, "grad_norm": 0.06866513192653656, "kl": 0.0009856745309662074, "learning_rate": 8.393351800554017e-07, "loss": 0.0018, "step": 101 }, { "clip_ratio": 0.00016465802764287218, "epoch": 0.0002831775856612197, "grad_norm": 0.0790778174996376, "kl": 0.0009655553149059415, "learning_rate": 8.47645429362881e-07, "loss": 0.0018, "step": 102 }, { "clip_ratio": 0.0, "epoch": 0.0002859538365010355, "grad_norm": 0.06253324449062347, "kl": 0.0010523943637963384, "learning_rate": 8.559556786703601e-07, "loss": 0.0017, "step": 103 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0002887300873408514, "grad_norm": 0.08336114138364792, "kl": 0.0009701263043098152, "learning_rate": 8.642659279778394e-07, "loss": 0.002, "step": 104 }, { "clip_ratio": 0.0, "epoch": 0.0002915063381806673, "grad_norm": 0.05647846683859825, "kl": 0.0010485479142516851, "learning_rate": 8.725761772853186e-07, "loss": 0.0017, "step": 105 }, { "clip_ratio": 0.0, "epoch": 0.0002942825890204832, "grad_norm": 0.07278945297002792, "kl": 0.0010502847435418516, "learning_rate": 8.808864265927979e-07, "loss": 0.0019, "step": 106 }, { "clip_ratio": 0.0, "epoch": 0.00029705883986029904, "grad_norm": 0.06493644416332245, "kl": 0.0009983716008719057, "learning_rate": 8.89196675900277e-07, "loss": 0.0018, "step": 107 }, { "clip_ratio": 0.00026072279433719814, "epoch": 0.00029983509070011493, "grad_norm": 0.07456071674823761, "kl": 0.0010671942727640271, "learning_rate": 8.975069252077562e-07, "loss": 0.0021, "step": 108 }, { "clip_ratio": 0.00034843204775825143, "completion_length": 220.12500762939453, "epoch": 0.0003026113415399308, "grad_norm": 0.09195102006196976, "kl": 0.001304664183408022, "learning_rate": 9.058171745152355e-07, "loss": -0.0043, "reward": 0.260416679084301, "reward_std": 0.29160892963409424, "rewards/countdown_reward_func": 0.2604166641831398, "step": 109, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00044802867341786623, "epoch": 0.0003053875923797467, "grad_norm": 0.11659281700849533, "kl": 0.001220056030433625, "learning_rate": 9.141274238227146e-07, "loss": -0.0037, "step": 110 }, { "clip_ratio": 0.0, "epoch": 0.00030816384321956256, "grad_norm": 0.11090529710054398, "kl": 0.0011154624517075717, "learning_rate": 9.22437673130194e-07, "loss": -0.0036, "step": 111 }, { "clip_ratio": 0.0, "epoch": 0.00031094009405937845, "grad_norm": 0.11202968657016754, "kl": 0.0010931476717814803, "learning_rate": 9.307479224376731e-07, "loss": -0.0036, "step": 112 }, { "clip_ratio": 0.0, "epoch": 0.00031371634489919434, "grad_norm": 0.11670778691768646, "kl": 0.001031600550049916, "learning_rate": 9.390581717451524e-07, "loss": -0.0045, "step": 113 }, { "clip_ratio": 8.710801193956286e-05, "epoch": 0.0003164925957390102, "grad_norm": 0.10784825682640076, "kl": 0.0010588545119389892, "learning_rate": 9.473684210526316e-07, "loss": -0.0044, "step": 114 }, { "clip_ratio": 0.00035092979669570923, "epoch": 0.0003192688465788261, "grad_norm": 0.12178794294595718, "kl": 0.0014504955615848303, "learning_rate": 9.55678670360111e-07, "loss": -0.0033, "step": 115 }, { "clip_ratio": 0.00010288065823260695, "epoch": 0.00032204509741864197, "grad_norm": 0.092310331761837, "kl": 0.0012508891522884369, "learning_rate": 9.6398891966759e-07, "loss": -0.0034, "step": 116 }, { "clip_ratio": 0.0, "epoch": 0.00032482134825845786, "grad_norm": 0.10470046103000641, "kl": 0.0011784560047090054, "learning_rate": 9.722991689750693e-07, "loss": -0.0037, "step": 117 }, { "clip_ratio": 8.710801193956286e-05, "epoch": 0.0003275975990982737, "grad_norm": 0.10416633635759354, "kl": 0.0012174599687568843, "learning_rate": 9.806094182825484e-07, "loss": -0.0036, "step": 118 }, { "clip_ratio": 0.0, "epoch": 0.0003303738499380896, "grad_norm": 0.12231718003749847, "kl": 0.0010898615000769496, "learning_rate": 9.889196675900277e-07, "loss": -0.0035, "step": 119 }, { "clip_ratio": 0.00019340052676852793, "epoch": 0.0003331501007779055, "grad_norm": 0.11489807814359665, "kl": 0.001152284734416753, "learning_rate": 9.972299168975068e-07, "loss": -0.0045, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 221.6041717529297, "epoch": 0.0003359263516177214, "grad_norm": 0.05373651906847954, "kl": 0.0010495753376744688, "learning_rate": 1.0055401662049862e-06, "loss": -0.014, "reward": 0.19166668504476547, "reward_std": 0.15476077795028687, "rewards/countdown_reward_func": 0.19166666641831398, "step": 121, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0, "epoch": 0.0003387026024575372, "grad_norm": 0.07888471335172653, "kl": 0.0014498817035928369, "learning_rate": 1.0138504155124655e-06, "loss": -0.0136, "step": 122 }, { "clip_ratio": 9.491267701378092e-05, "epoch": 0.0003414788532973531, "grad_norm": 0.11604306101799011, "kl": 0.0012312422040849924, "learning_rate": 1.0221606648199446e-06, "loss": -0.0136, "step": 123 }, { "clip_ratio": 0.0, "epoch": 0.000344255104137169, "grad_norm": 0.059598926454782486, "kl": 0.001186943962238729, "learning_rate": 1.030470914127424e-06, "loss": -0.0142, "step": 124 }, { "clip_ratio": 9.307520667789504e-05, "epoch": 0.0003470313549769849, "grad_norm": 0.08821563422679901, "kl": 0.0014322291244752705, "learning_rate": 1.0387811634349032e-06, "loss": -0.0145, "step": 125 }, { "clip_ratio": 9.491267701378092e-05, "epoch": 0.00034980760581680074, "grad_norm": 0.08867970108985901, "kl": 0.0012800320982933044, "learning_rate": 1.0470914127423823e-06, "loss": -0.014, "step": 126 }, { "clip_ratio": 0.0, "epoch": 0.00035258385665661664, "grad_norm": 0.05335162952542305, "kl": 0.0010596674110274762, "learning_rate": 1.0554016620498616e-06, "loss": -0.0142, "step": 127 }, { "clip_ratio": 0.00040634788456372917, "epoch": 0.00035536010749643253, "grad_norm": 0.08012160658836365, "kl": 0.0014414938050322235, "learning_rate": 1.0637119113573407e-06, "loss": -0.0138, "step": 128 }, { "clip_ratio": 0.0, "epoch": 0.00035813635833624837, "grad_norm": 0.13474638760089874, "kl": 0.0013173840707167983, "learning_rate": 1.0720221606648198e-06, "loss": -0.0142, "step": 129 }, { "clip_ratio": 0.00018023690790869296, "epoch": 0.00036091260917606426, "grad_norm": 0.05427733436226845, "kl": 0.0011062846169807017, "learning_rate": 1.0803324099722992e-06, "loss": -0.0142, "step": 130 }, { "clip_ratio": 9.307520667789504e-05, "epoch": 0.00036368886001588016, "grad_norm": 0.0898558497428894, "kl": 0.0013623088016174734, "learning_rate": 1.0886426592797783e-06, "loss": -0.0141, "step": 131 }, { "clip_ratio": 0.0, "epoch": 0.00036646511085569605, "grad_norm": 0.07763069868087769, "kl": 0.0013295715907588601, "learning_rate": 1.0969529085872576e-06, "loss": -0.0142, "step": 132 }, { "clip_ratio": 0.00016427145601483062, "completion_length": 248.33333587646484, "epoch": 0.0003692413616955119, "grad_norm": 0.07501526921987534, "kl": 0.0011433768086135387, "learning_rate": 1.1052631578947369e-06, "loss": 0.0024, "reward": 0.2458333671092987, "reward_std": 0.23012542724609375, "rewards/countdown_reward_func": 0.2458333522081375, "step": 133, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0001751334930304438, "epoch": 0.0003720176125353278, "grad_norm": 0.12185314297676086, "kl": 0.0012197635951451957, "learning_rate": 1.1135734072022162e-06, "loss": 0.0026, "step": 134 }, { "clip_ratio": 0.0, "epoch": 0.0003747938633751437, "grad_norm": 0.08777681738138199, "kl": 0.0011197520070709288, "learning_rate": 1.1218836565096953e-06, "loss": 0.0021, "step": 135 }, { "clip_ratio": 0.0, "epoch": 0.00037757011421495957, "grad_norm": 0.08989999443292618, "kl": 0.0010376300197094679, "learning_rate": 1.1301939058171746e-06, "loss": 0.0023, "step": 136 }, { "clip_ratio": 8.223684562835842e-05, "epoch": 0.0003803463650547754, "grad_norm": 0.08074529469013214, "kl": 0.0010572479804977775, "learning_rate": 1.1385041551246537e-06, "loss": 0.0019, "step": 137 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0003831226158945913, "grad_norm": 0.08292272686958313, "kl": 0.0011392622254788876, "learning_rate": 1.146814404432133e-06, "loss": 0.0022, "step": 138 }, { "clip_ratio": 8.289124525617808e-05, "epoch": 0.0003858988667344072, "grad_norm": 0.0767727866768837, "kl": 0.001106983982026577, "learning_rate": 1.1551246537396122e-06, "loss": 0.0019, "step": 139 }, { "clip_ratio": 0.0, "epoch": 0.0003886751175742231, "grad_norm": 0.0795488953590393, "kl": 0.0012870717328041792, "learning_rate": 1.1634349030470915e-06, "loss": 0.003, "step": 140 }, { "clip_ratio": 0.0, "epoch": 0.00039145136841403893, "grad_norm": 0.0856185331940651, "kl": 0.001159544801339507, "learning_rate": 1.1717451523545706e-06, "loss": 0.0018, "step": 141 }, { "clip_ratio": 0.0, "epoch": 0.0003942276192538548, "grad_norm": 0.08445297926664352, "kl": 0.0011639554286375642, "learning_rate": 1.1800554016620499e-06, "loss": 0.0025, "step": 142 }, { "clip_ratio": 8.223684562835842e-05, "epoch": 0.0003970038700936707, "grad_norm": 0.07916339486837387, "kl": 0.0011218629661016166, "learning_rate": 1.188365650969529e-06, "loss": 0.0025, "step": 143 }, { "clip_ratio": 0.00024620501790195704, "epoch": 0.00039978012093348656, "grad_norm": 0.0729660838842392, "kl": 0.0012778243399225175, "learning_rate": 1.1966759002770083e-06, "loss": 0.0024, "step": 144 }, { "clip_ratio": 8.967001485871151e-05, "completion_length": 232.02083587646484, "epoch": 0.00040255637177330245, "grad_norm": 0.16677485406398773, "kl": 0.0013882183120585978, "learning_rate": 1.2049861495844876e-06, "loss": 0.014, "reward": 0.30416667461395264, "reward_std": 0.36280614137649536, "rewards/countdown_reward_func": 0.30416667461395264, "step": 145, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00010926573304459453, "epoch": 0.00040533262261311834, "grad_norm": 0.17154261469841003, "kl": 0.0015535103739239275, "learning_rate": 1.213296398891967e-06, "loss": 0.0145, "step": 146 }, { "clip_ratio": 8.526603050995618e-05, "epoch": 0.00040810887345293424, "grad_norm": 0.11442252993583679, "kl": 0.001147409901022911, "learning_rate": 1.221606648199446e-06, "loss": 0.015, "step": 147 }, { "clip_ratio": 8.967001485871151e-05, "epoch": 0.0004108851242927501, "grad_norm": 0.11827956140041351, "kl": 0.0013873514835722744, "learning_rate": 1.2299168975069254e-06, "loss": 0.0147, "step": 148 }, { "clip_ratio": 0.0008980906713986769, "epoch": 0.00041366137513256597, "grad_norm": 0.10520009696483612, "kl": 0.001313833869062364, "learning_rate": 1.2382271468144045e-06, "loss": 0.0145, "step": 149 }, { "clip_ratio": 0.0, "epoch": 0.00041643762597238186, "grad_norm": 0.11322156339883804, "kl": 0.0012296418426558375, "learning_rate": 1.2465373961218836e-06, "loss": 0.0146, "step": 150 }, { "clip_ratio": 0.00025922466011252254, "epoch": 0.00041921387681219776, "grad_norm": 0.16956517100334167, "kl": 0.0012730106245726347, "learning_rate": 1.2548476454293629e-06, "loss": 0.0134, "step": 151 }, { "clip_ratio": 0.00010926573304459453, "epoch": 0.0004219901276520136, "grad_norm": 0.1768382489681244, "kl": 0.0016011170810088515, "learning_rate": 1.263157894736842e-06, "loss": 0.0138, "step": 152 }, { "clip_ratio": 8.428860746789724e-05, "epoch": 0.0004247663784918295, "grad_norm": 0.11108796298503876, "kl": 0.0012242573429830372, "learning_rate": 1.2714681440443213e-06, "loss": 0.0142, "step": 153 }, { "clip_ratio": 0.00016696537932148203, "epoch": 0.0004275426293316454, "grad_norm": 0.13032646477222443, "kl": 0.0015075987321324646, "learning_rate": 1.2797783933518006e-06, "loss": 0.014, "step": 154 }, { "clip_ratio": 0.0, "epoch": 0.0004303188801714613, "grad_norm": 0.09640121459960938, "kl": 0.0013893723953515291, "learning_rate": 1.28808864265928e-06, "loss": 0.0138, "step": 155 }, { "clip_ratio": 0.0, "epoch": 0.0004330951310112771, "grad_norm": 0.10653083771467209, "kl": 0.0013818400911986828, "learning_rate": 1.296398891966759e-06, "loss": 0.014, "step": 156 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 235.75, "epoch": 0.000435871381851093, "grad_norm": 0.05519864708185196, "kl": 0.001432242221198976, "learning_rate": 1.3047091412742383e-06, "loss": 0.0092, "reward": 0.24583333730697632, "reward_std": 0.12264448031783104, "rewards/countdown_reward_func": 0.24583333730697632, "step": 157, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0, "epoch": 0.0004386476326909089, "grad_norm": 0.09035968035459518, "kl": 0.0013828090741299093, "learning_rate": 1.3130193905817175e-06, "loss": 0.0092, "step": 158 }, { "clip_ratio": 0.0005962533032288775, "epoch": 0.00044142388353072474, "grad_norm": 0.1271812468767166, "kl": 0.0021160971373319626, "learning_rate": 1.3213296398891968e-06, "loss": 0.0093, "step": 159 }, { "clip_ratio": 0.0, "epoch": 0.00044420013437054064, "grad_norm": 0.0863696038722992, "kl": 0.001588326005730778, "learning_rate": 1.3296398891966759e-06, "loss": 0.0096, "step": 160 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.00044697638521035653, "grad_norm": 0.061348333954811096, "kl": 0.0016502806101925671, "learning_rate": 1.3379501385041552e-06, "loss": 0.0092, "step": 161 }, { "clip_ratio": 0.0, "epoch": 0.0004497526360501724, "grad_norm": 0.04854971170425415, "kl": 0.0016185293206945062, "learning_rate": 1.3462603878116343e-06, "loss": 0.0094, "step": 162 }, { "clip_ratio": 0.0, "epoch": 0.00045252888688998826, "grad_norm": 0.060902826488018036, "kl": 0.0025005778297781944, "learning_rate": 1.3545706371191136e-06, "loss": 0.009, "step": 163 }, { "clip_ratio": 0.00011261261533945799, "epoch": 0.00045530513772980416, "grad_norm": 0.08311949670314789, "kl": 0.0018348235171288252, "learning_rate": 1.3628808864265927e-06, "loss": 0.0094, "step": 164 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00045808138856962005, "grad_norm": 0.1301003396511078, "kl": 0.0029168009059503675, "learning_rate": 1.371191135734072e-06, "loss": 0.01, "step": 165 }, { "clip_ratio": 0.0, "epoch": 0.00046085763940943594, "grad_norm": 0.07615955919027328, "kl": 0.002113637514412403, "learning_rate": 1.3795013850415513e-06, "loss": 0.009, "step": 166 }, { "clip_ratio": 9.67492233030498e-05, "epoch": 0.0004636338902492518, "grad_norm": 0.061112839728593826, "kl": 0.001958071778062731, "learning_rate": 1.3878116343490307e-06, "loss": 0.0092, "step": 167 }, { "clip_ratio": 0.00017946927982848138, "epoch": 0.0004664101410890677, "grad_norm": 0.04938596114516258, "kl": 0.002263071248307824, "learning_rate": 1.3961218836565098e-06, "loss": 0.0092, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 212.87500762939453, "epoch": 0.00046918639192888357, "grad_norm": 0.08904451131820679, "kl": 0.0021521300077438354, "learning_rate": 1.404432132963989e-06, "loss": -0.0054, "reward": 0.3541666716337204, "reward_std": 0.3927241712808609, "rewards/countdown_reward_func": 0.3541666716337204, "step": 169, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.000244140625, "epoch": 0.00047196264276869946, "grad_norm": 0.11747531592845917, "kl": 0.0023746880469843745, "learning_rate": 1.4127423822714682e-06, "loss": -0.0057, "step": 170 }, { "clip_ratio": 0.0, "epoch": 0.0004747388936085153, "grad_norm": 0.10242423415184021, "kl": 0.0022907587699592113, "learning_rate": 1.4210526315789473e-06, "loss": -0.0055, "step": 171 }, { "clip_ratio": 0.0, "epoch": 0.0004775151444483312, "grad_norm": 0.09900445491075516, "kl": 0.0021405539009720087, "learning_rate": 1.4293628808864266e-06, "loss": -0.0054, "step": 172 }, { "clip_ratio": 0.0, "epoch": 0.0004802913952881471, "grad_norm": 0.13611117005348206, "kl": 0.0026756724109873176, "learning_rate": 1.4376731301939057e-06, "loss": -0.0062, "step": 173 }, { "clip_ratio": 0.0, "epoch": 0.00048306764612796293, "grad_norm": 0.11884411424398422, "kl": 0.0026626097969710827, "learning_rate": 1.445983379501385e-06, "loss": -0.0055, "step": 174 }, { "clip_ratio": 0.0, "epoch": 0.0004858438969677788, "grad_norm": 0.09348145127296448, "kl": 0.002659423043951392, "learning_rate": 1.4542936288088643e-06, "loss": -0.0048, "step": 175 }, { "clip_ratio": 0.000244140625, "epoch": 0.0004886201478075947, "grad_norm": 0.11439940333366394, "kl": 0.0028921624179929495, "learning_rate": 1.4626038781163436e-06, "loss": -0.0065, "step": 176 }, { "clip_ratio": 9.476876584812999e-05, "epoch": 0.0004913963986474106, "grad_norm": 0.21524833142757416, "kl": 0.002880470361560583, "learning_rate": 1.4709141274238228e-06, "loss": -0.0057, "step": 177 }, { "clip_ratio": 0.0, "epoch": 0.0004941726494872264, "grad_norm": 0.10695669800043106, "kl": 0.0028178965440019965, "learning_rate": 1.479224376731302e-06, "loss": -0.006, "step": 178 }, { "clip_ratio": 0.0, "epoch": 0.0004969489003270424, "grad_norm": 0.12360948324203491, "kl": 0.0034589068964123726, "learning_rate": 1.4875346260387812e-06, "loss": -0.006, "step": 179 }, { "clip_ratio": 0.0, "epoch": 0.0004997251511668582, "grad_norm": 0.1427268236875534, "kl": 0.0033028185134753585, "learning_rate": 1.4958448753462605e-06, "loss": -0.006, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 237.1041717529297, "epoch": 0.0005025014020066741, "grad_norm": 0.06570961326360703, "kl": 0.0035237747943028808, "learning_rate": 1.5041551246537396e-06, "loss": 0.0151, "reward": 0.166666679084301, "reward_std": 0.19649019464850426, "rewards/countdown_reward_func": 0.1666666716337204, "step": 181, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.00050527765284649, "grad_norm": 0.07061938941478729, "kl": 0.003632499254308641, "learning_rate": 1.5124653739612187e-06, "loss": 0.0151, "step": 182 }, { "clip_ratio": 0.00017770155682228506, "epoch": 0.0005080539036863059, "grad_norm": 0.09591417759656906, "kl": 0.004101799800992012, "learning_rate": 1.5207756232686982e-06, "loss": 0.0149, "step": 183 }, { "clip_ratio": 8.169934881152585e-05, "epoch": 0.0005108301545261217, "grad_norm": 0.0730750560760498, "kl": 0.003928871126845479, "learning_rate": 1.5290858725761773e-06, "loss": 0.015, "step": 184 }, { "clip_ratio": 0.0, "epoch": 0.0005136064053659377, "grad_norm": 0.08014266192913055, "kl": 0.003984199371188879, "learning_rate": 1.5373961218836564e-06, "loss": 0.0144, "step": 185 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0005163826562057535, "grad_norm": 0.08353135734796524, "kl": 0.004139116033911705, "learning_rate": 1.5457063711911357e-06, "loss": 0.0146, "step": 186 }, { "clip_ratio": 0.00018080235895467922, "epoch": 0.0005191589070455693, "grad_norm": 0.07052478194236755, "kl": 0.004334121011197567, "learning_rate": 1.554016620498615e-06, "loss": 0.0151, "step": 187 }, { "clip_ratio": 0.0, "epoch": 0.0005219351578853853, "grad_norm": 0.12264339625835419, "kl": 0.004643098916858435, "learning_rate": 1.5623268698060944e-06, "loss": 0.0148, "step": 188 }, { "clip_ratio": 8.7596352386754e-05, "epoch": 0.0005247114087252011, "grad_norm": 0.10199768096208572, "kl": 0.00538562866859138, "learning_rate": 1.5706371191135735e-06, "loss": 0.014, "step": 189 }, { "clip_ratio": 8.169934881152585e-05, "epoch": 0.0005274876595650171, "grad_norm": 0.07175012677907944, "kl": 0.005290654953569174, "learning_rate": 1.5789473684210526e-06, "loss": 0.0147, "step": 190 }, { "clip_ratio": 0.00017524592112749815, "epoch": 0.0005302639104048329, "grad_norm": 0.08007644861936569, "kl": 0.0052713199984282255, "learning_rate": 1.5872576177285321e-06, "loss": 0.0146, "step": 191 }, { "clip_ratio": 8.538251131540164e-05, "epoch": 0.0005330401612446487, "grad_norm": 0.08058490604162216, "kl": 0.005752292228862643, "learning_rate": 1.5955678670360112e-06, "loss": 0.0142, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 224.25000762939453, "epoch": 0.0005358164120844647, "grad_norm": 0.07866574823856354, "kl": 0.006615105550736189, "learning_rate": 1.6038781163434903e-06, "loss": -0.005, "reward": 0.1458333507180214, "reward_std": 0.1595480851829052, "rewards/countdown_reward_func": 0.1458333432674408, "step": 193, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0005385926629242805, "grad_norm": 0.10931258648633957, "kl": 0.006460321601480246, "learning_rate": 1.6121883656509694e-06, "loss": -0.0049, "step": 194 }, { "clip_ratio": 8.37240440887399e-05, "epoch": 0.0005413689137640964, "grad_norm": 0.08174652606248856, "kl": 0.0075355947483330965, "learning_rate": 1.6204986149584487e-06, "loss": -0.0045, "step": 195 }, { "clip_ratio": 8.37240440887399e-05, "epoch": 0.0005441451646039123, "grad_norm": 0.08151189982891083, "kl": 0.006791774649173021, "learning_rate": 1.628808864265928e-06, "loss": -0.0046, "step": 196 }, { "clip_ratio": 9.077705180970952e-05, "epoch": 0.0005469214154437282, "grad_norm": 0.06075465306639671, "kl": 0.007927711587399244, "learning_rate": 1.6371191135734074e-06, "loss": -0.005, "step": 197 }, { "clip_ratio": 0.0, "epoch": 0.0005496976662835441, "grad_norm": 0.06800781190395355, "kl": 0.007068223087117076, "learning_rate": 1.6454293628808865e-06, "loss": -0.005, "step": 198 }, { "clip_ratio": 0.0, "epoch": 0.0005524739171233599, "grad_norm": 0.08043520897626877, "kl": 0.008283360861241817, "learning_rate": 1.6537396121883656e-06, "loss": -0.0051, "step": 199 }, { "clip_ratio": 0.0, "epoch": 0.0005552501679631758, "grad_norm": 0.08258956670761108, "kl": 0.007602859288454056, "learning_rate": 1.662049861495845e-06, "loss": -0.0056, "step": 200 }, { "clip_ratio": 8.37240440887399e-05, "epoch": 0.0005580264188029917, "grad_norm": 0.08197806775569916, "kl": 0.009075871668756008, "learning_rate": 1.6703601108033242e-06, "loss": -0.0042, "step": 201 }, { "clip_ratio": 0.0, "epoch": 0.0005608026696428076, "grad_norm": 0.08293389528989792, "kl": 0.0072558128740638494, "learning_rate": 1.6786703601108033e-06, "loss": -0.0044, "step": 202 }, { "clip_ratio": 0.00018364480638410896, "epoch": 0.0005635789204826234, "grad_norm": 0.0675339549779892, "kl": 0.008414975367486477, "learning_rate": 1.6869806094182824e-06, "loss": -0.0051, "step": 203 }, { "clip_ratio": 0.0, "epoch": 0.0005663551713224394, "grad_norm": 0.0665825828909874, "kl": 0.007153096608817577, "learning_rate": 1.695290858725762e-06, "loss": -0.0048, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 216.5416717529297, "epoch": 0.0005691314221622552, "grad_norm": 0.0898197814822197, "kl": 0.0062689268961548805, "learning_rate": 1.703601108033241e-06, "loss": -0.0212, "reward": 0.24375002086162567, "reward_std": 0.23638741672039032, "rewards/countdown_reward_func": 0.24375001341104507, "step": 205, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0007762892200844362, "epoch": 0.000571907673002071, "grad_norm": 0.11292041838169098, "kl": 0.0070912938099354506, "learning_rate": 1.7119113573407201e-06, "loss": -0.0204, "step": 206 }, { "clip_ratio": 0.0, "epoch": 0.000574683923841887, "grad_norm": 0.1421259045600891, "kl": 0.006881088018417358, "learning_rate": 1.7202216066481995e-06, "loss": -0.0206, "step": 207 }, { "clip_ratio": 0.00012413108197506517, "epoch": 0.0005774601746817028, "grad_norm": 0.12615558505058289, "kl": 0.006374599179252982, "learning_rate": 1.7285318559556788e-06, "loss": -0.022, "step": 208 }, { "clip_ratio": 0.0001857691750046797, "epoch": 0.0005802364255215188, "grad_norm": 0.11529010534286499, "kl": 0.007537527941167355, "learning_rate": 1.736842105263158e-06, "loss": -0.0217, "step": 209 }, { "clip_ratio": 0.0, "epoch": 0.0005830126763613346, "grad_norm": 0.12297216802835464, "kl": 0.006081829313188791, "learning_rate": 1.7451523545706372e-06, "loss": -0.0219, "step": 210 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0005857889272011504, "grad_norm": 0.09268545359373093, "kl": 0.005436737323179841, "learning_rate": 1.7534626038781163e-06, "loss": -0.0217, "step": 211 }, { "clip_ratio": 0.00048481223348062485, "epoch": 0.0005885651780409664, "grad_norm": 0.09842047840356827, "kl": 0.005601341603323817, "learning_rate": 1.7617728531855958e-06, "loss": -0.021, "step": 212 }, { "clip_ratio": 0.00019936203898396343, "epoch": 0.0005913414288807822, "grad_norm": 0.13735024631023407, "kl": 0.005223254906013608, "learning_rate": 1.770083102493075e-06, "loss": -0.022, "step": 213 }, { "clip_ratio": 0.00022600556985707954, "epoch": 0.0005941176797205981, "grad_norm": 0.11505624651908875, "kl": 0.004508420126512647, "learning_rate": 1.778393351800554e-06, "loss": -0.0228, "step": 214 }, { "clip_ratio": 0.0, "epoch": 0.000596893930560414, "grad_norm": 0.10706788301467896, "kl": 0.005065717967227101, "learning_rate": 1.7867036011080331e-06, "loss": -0.022, "step": 215 }, { "clip_ratio": 8.6088155512698e-05, "epoch": 0.0005996701814002299, "grad_norm": 0.11780065298080444, "kl": 0.004196330206468701, "learning_rate": 1.7950138504155125e-06, "loss": -0.0226, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 229.7291717529297, "epoch": 0.0006024464322400457, "grad_norm": 0.07949130237102509, "kl": 0.004571998957544565, "learning_rate": 1.8033240997229918e-06, "loss": 0.0072, "reward": 0.24583334475755692, "reward_std": 0.22264151275157928, "rewards/countdown_reward_func": 0.24583334475755692, "step": 217, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0006052226830798616, "grad_norm": 0.10650928318500519, "kl": 0.004555474733933806, "learning_rate": 1.811634349030471e-06, "loss": 0.0074, "step": 218 }, { "clip_ratio": 9.95222944766283e-05, "epoch": 0.0006079989339196775, "grad_norm": 0.10155003517866135, "kl": 0.0035600599367171526, "learning_rate": 1.8199445983379502e-06, "loss": 0.0071, "step": 219 }, { "clip_ratio": 0.00018531362002249807, "epoch": 0.0006107751847594934, "grad_norm": 0.09107557684183121, "kl": 0.0034167662961408496, "learning_rate": 1.8282548476454293e-06, "loss": 0.0078, "step": 220 }, { "clip_ratio": 9.95222944766283e-05, "epoch": 0.0006135514355993093, "grad_norm": 0.13129286468029022, "kl": 0.003628110629506409, "learning_rate": 1.8365650969529088e-06, "loss": 0.008, "step": 221 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0006163276864391251, "grad_norm": 0.07515281438827515, "kl": 0.0033600571332499385, "learning_rate": 1.844875346260388e-06, "loss": 0.0079, "step": 222 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0006191039372789411, "grad_norm": 0.07991602271795273, "kl": 0.0036217205924913287, "learning_rate": 1.853185595567867e-06, "loss": 0.0073, "step": 223 }, { "clip_ratio": 0.0, "epoch": 0.0006218801881187569, "grad_norm": 0.1082308441400528, "kl": 0.0038258974673226476, "learning_rate": 1.8614958448753461e-06, "loss": 0.0074, "step": 224 }, { "clip_ratio": 0.00037994710146449506, "epoch": 0.0006246564389585727, "grad_norm": 0.09276928007602692, "kl": 0.0031863467302173376, "learning_rate": 1.8698060941828257e-06, "loss": 0.0073, "step": 225 }, { "clip_ratio": 8.25082533992827e-05, "epoch": 0.0006274326897983887, "grad_norm": 0.09415683150291443, "kl": 0.002969763823784888, "learning_rate": 1.8781163434903048e-06, "loss": 0.0079, "step": 226 }, { "clip_ratio": 0.00026924171834252775, "epoch": 0.0006302089406382045, "grad_norm": 0.12183138728141785, "kl": 0.003270600689575076, "learning_rate": 1.8864265927977839e-06, "loss": 0.0078, "step": 227 }, { "clip_ratio": 0.0, "epoch": 0.0006329851914780204, "grad_norm": 0.07732009142637253, "kl": 0.002818679786287248, "learning_rate": 1.8947368421052632e-06, "loss": 0.0079, "step": 228 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 227.75000762939453, "epoch": 0.0006357614423178363, "grad_norm": 0.10973645746707916, "kl": 0.003377788234502077, "learning_rate": 1.9030470914127425e-06, "loss": 0.0024, "reward": 0.2458333522081375, "reward_std": 0.3024734854698181, "rewards/countdown_reward_func": 0.24583334475755692, "step": 229, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0001726210830383934, "epoch": 0.0006385376931576522, "grad_norm": 0.0772981122136116, "kl": 0.003124785842373967, "learning_rate": 1.911357340720222e-06, "loss": 0.0019, "step": 230 }, { "clip_ratio": 9.286775457439944e-05, "epoch": 0.0006413139439974681, "grad_norm": 0.2130415439605713, "kl": 0.003285923390649259, "learning_rate": 1.919667590027701e-06, "loss": 0.003, "step": 231 }, { "clip_ratio": 0.0, "epoch": 0.0006440901948372839, "grad_norm": 0.09580881893634796, "kl": 0.002900973428040743, "learning_rate": 1.92797783933518e-06, "loss": 0.0023, "step": 232 }, { "clip_ratio": 0.0, "epoch": 0.0006468664456770998, "grad_norm": 0.09705420583486557, "kl": 0.003031375934369862, "learning_rate": 1.9362880886426595e-06, "loss": 0.002, "step": 233 }, { "clip_ratio": 9.177679748972878e-05, "epoch": 0.0006496426965169157, "grad_norm": 0.08564502000808716, "kl": 0.0033782614627853036, "learning_rate": 1.9445983379501387e-06, "loss": 0.0024, "step": 234 }, { "clip_ratio": 0.0, "epoch": 0.0006524189473567316, "grad_norm": 0.11757489293813705, "kl": 0.003013497218489647, "learning_rate": 1.9529085872576178e-06, "loss": 0.0018, "step": 235 }, { "clip_ratio": 0.0, "epoch": 0.0006551951981965474, "grad_norm": 0.08156201988458633, "kl": 0.0028692481573671103, "learning_rate": 1.961218836565097e-06, "loss": 0.0022, "step": 236 }, { "clip_ratio": 9.124087227974087e-05, "epoch": 0.0006579714490363634, "grad_norm": 0.20828205347061157, "kl": 0.00310861156322062, "learning_rate": 1.969529085872576e-06, "loss": 0.0024, "step": 237 }, { "clip_ratio": 0.0011520737316459417, "epoch": 0.0006607476998761792, "grad_norm": 0.09965456277132034, "kl": 0.0033284203382208943, "learning_rate": 1.9778393351800555e-06, "loss": 0.0024, "step": 238 }, { "clip_ratio": 0.0, "epoch": 0.0006635239507159951, "grad_norm": 0.08843494206666946, "kl": 0.0030406563309952617, "learning_rate": 1.9861495844875346e-06, "loss": 0.0018, "step": 239 }, { "clip_ratio": 0.00018355359497945756, "epoch": 0.000666300201555811, "grad_norm": 0.08795179426670074, "kl": 0.0030062462901696563, "learning_rate": 1.9944598337950137e-06, "loss": 0.0022, "step": 240 }, { "clip_ratio": 8.698677993379533e-05, "completion_length": 239.81250762939453, "epoch": 0.0006690764523956268, "grad_norm": 0.06673049926757812, "kl": 0.002854794613085687, "learning_rate": 2.002770083102493e-06, "loss": -0.0041, "reward": 0.1875000149011612, "reward_std": 0.16496698185801506, "rewards/countdown_reward_func": 0.1875000149011612, "step": 241, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0006718527032354428, "grad_norm": 0.05837683007121086, "kl": 0.0029842500807717443, "learning_rate": 2.0110803324099723e-06, "loss": -0.0044, "step": 242 }, { "clip_ratio": 8.698677993379533e-05, "epoch": 0.0006746289540752586, "grad_norm": 0.05153853818774223, "kl": 0.0027709035202860832, "learning_rate": 2.0193905817174514e-06, "loss": -0.0041, "step": 243 }, { "clip_ratio": 0.00016836699796840549, "epoch": 0.0006774052049150744, "grad_norm": 0.09485623240470886, "kl": 0.003231698414310813, "learning_rate": 2.027700831024931e-06, "loss": -0.0041, "step": 244 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.0006801814557548904, "grad_norm": 0.06381552666425705, "kl": 0.003178777056746185, "learning_rate": 2.03601108033241e-06, "loss": -0.0044, "step": 245 }, { "clip_ratio": 0.0, "epoch": 0.0006829577065947062, "grad_norm": 0.06970212608575821, "kl": 0.0039439547108486295, "learning_rate": 2.044321329639889e-06, "loss": -0.0044, "step": 246 }, { "clip_ratio": 0.0007053310982882977, "epoch": 0.0006857339574345221, "grad_norm": 0.05486287549138069, "kl": 0.0030296844197437167, "learning_rate": 2.0526315789473687e-06, "loss": -0.0041, "step": 247 }, { "clip_ratio": 0.0, "epoch": 0.000688510208274338, "grad_norm": 0.05788696929812431, "kl": 0.0034874266711995006, "learning_rate": 2.060941828254848e-06, "loss": -0.0043, "step": 248 }, { "clip_ratio": 0.0, "epoch": 0.0006912864591141539, "grad_norm": 0.05053585022687912, "kl": 0.003134583937935531, "learning_rate": 2.069252077562327e-06, "loss": -0.0045, "step": 249 }, { "clip_ratio": 0.0, "epoch": 0.0006940627099539698, "grad_norm": 0.08815469592809677, "kl": 0.0033259709598496556, "learning_rate": 2.0775623268698064e-06, "loss": -0.0048, "step": 250 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.0006968389607937856, "grad_norm": 0.06786980479955673, "kl": 0.00334134662989527, "learning_rate": 2.0858725761772855e-06, "loss": -0.0048, "step": 251 }, { "clip_ratio": 0.0, "epoch": 0.0006996152116336015, "grad_norm": 0.06675417721271515, "kl": 0.0038671550573781133, "learning_rate": 2.0941828254847646e-06, "loss": -0.0048, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 234.33334350585938, "epoch": 0.0007023914624734174, "grad_norm": 0.10852495580911636, "kl": 0.003565722843632102, "learning_rate": 2.1024930747922437e-06, "loss": 0.0025, "reward": 0.24791669100522995, "reward_std": 0.22883932292461395, "rewards/countdown_reward_func": 0.24791669100522995, "step": 253, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00017266585200559348, "epoch": 0.0007051677133132333, "grad_norm": 0.10487343370914459, "kl": 0.0033710359130054712, "learning_rate": 2.1108033240997233e-06, "loss": 0.0034, "step": 254 }, { "clip_ratio": 0.00030323654209496453, "epoch": 0.0007079439641530491, "grad_norm": 0.10246977210044861, "kl": 0.003350336686708033, "learning_rate": 2.1191135734072024e-06, "loss": 0.0024, "step": 255 }, { "clip_ratio": 0.0002672308764886111, "epoch": 0.0007107202149928651, "grad_norm": 0.11946409195661545, "kl": 0.0036674703005701303, "learning_rate": 2.1274238227146815e-06, "loss": 0.0023, "step": 256 }, { "clip_ratio": 0.0001684882226982154, "epoch": 0.0007134964658326809, "grad_norm": 0.0756080374121666, "kl": 0.0034829488722607493, "learning_rate": 2.1357340720221606e-06, "loss": 0.0024, "step": 257 }, { "clip_ratio": 0.00016693805810064077, "epoch": 0.0007162727166724967, "grad_norm": 0.11272068321704865, "kl": 0.0037394753890112042, "learning_rate": 2.1440443213296397e-06, "loss": 0.0026, "step": 258 }, { "clip_ratio": 0.0, "epoch": 0.0007190489675123127, "grad_norm": 0.10757721215486526, "kl": 0.003679752117022872, "learning_rate": 2.152354570637119e-06, "loss": 0.0022, "step": 259 }, { "clip_ratio": 0.00017266585200559348, "epoch": 0.0007218252183521285, "grad_norm": 0.07714775204658508, "kl": 0.003283996251411736, "learning_rate": 2.1606648199445983e-06, "loss": 0.0029, "step": 260 }, { "clip_ratio": 0.00020028267317684367, "epoch": 0.0007246014691919445, "grad_norm": 0.08071676641702652, "kl": 0.0033569439547136426, "learning_rate": 2.1689750692520774e-06, "loss": 0.0022, "step": 261 }, { "clip_ratio": 0.0005443122354336083, "epoch": 0.0007273777200317603, "grad_norm": 0.11737988144159317, "kl": 0.004172776127234101, "learning_rate": 2.1772853185595565e-06, "loss": 0.0027, "step": 262 }, { "clip_ratio": 0.0, "epoch": 0.0007301539708715762, "grad_norm": 0.0743941143155098, "kl": 0.003631085390225053, "learning_rate": 2.185595567867036e-06, "loss": 0.0024, "step": 263 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0007329302217113921, "grad_norm": 0.10840493440628052, "kl": 0.003976256353780627, "learning_rate": 2.193905817174515e-06, "loss": 0.0021, "step": 264 }, { "clip_ratio": 0.00016276042151730508, "completion_length": 217.4791717529297, "epoch": 0.0007357064725512079, "grad_norm": 0.08730115741491318, "kl": 0.0036497570108622313, "learning_rate": 2.2022160664819947e-06, "loss": -0.0134, "reward": 0.24375002086162567, "reward_std": 0.19520405679941177, "rewards/countdown_reward_func": 0.24375002086162567, "step": 265, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0007384827233910238, "grad_norm": 0.07678765058517456, "kl": 0.003502070438116789, "learning_rate": 2.2105263157894738e-06, "loss": -0.0136, "step": 266 }, { "clip_ratio": 0.0, "epoch": 0.0007412589742308397, "grad_norm": 0.10239151120185852, "kl": 0.004613874014467001, "learning_rate": 2.218836565096953e-06, "loss": -0.0133, "step": 267 }, { "clip_ratio": 0.0, "epoch": 0.0007440352250706556, "grad_norm": 0.09318691492080688, "kl": 0.003522416460327804, "learning_rate": 2.2271468144044324e-06, "loss": -0.0137, "step": 268 }, { "clip_ratio": 0.00012007684563286602, "epoch": 0.0007468114759104715, "grad_norm": 0.10666222870349884, "kl": 0.0034163352102041245, "learning_rate": 2.2354570637119115e-06, "loss": -0.0136, "step": 269 }, { "clip_ratio": 0.00018601190822664648, "epoch": 0.0007495877267502874, "grad_norm": 0.07045263051986694, "kl": 0.003822776139713824, "learning_rate": 2.2437673130193906e-06, "loss": -0.0135, "step": 270 }, { "clip_ratio": 0.00016324006719514728, "epoch": 0.0007523639775901032, "grad_norm": 0.08979963511228561, "kl": 0.003612265922129154, "learning_rate": 2.25207756232687e-06, "loss": -0.0136, "step": 271 }, { "clip_ratio": 9.300595411332324e-05, "epoch": 0.0007551402284299191, "grad_norm": 0.07281750440597534, "kl": 0.0034800268476828933, "learning_rate": 2.2603878116343493e-06, "loss": -0.014, "step": 272 }, { "clip_ratio": 0.0, "epoch": 0.000757916479269735, "grad_norm": 0.1016596257686615, "kl": 0.004669018788263202, "learning_rate": 2.2686980609418284e-06, "loss": -0.0135, "step": 273 }, { "clip_ratio": 0.00021114865376148373, "epoch": 0.0007606927301095508, "grad_norm": 0.09079304337501526, "kl": 0.0033616855507716537, "learning_rate": 2.2770083102493075e-06, "loss": -0.0134, "step": 274 }, { "clip_ratio": 0.00019769607024500147, "epoch": 0.0007634689809493668, "grad_norm": 0.11171045154333115, "kl": 0.0033946483163163066, "learning_rate": 2.285318559556787e-06, "loss": -0.0136, "step": 275 }, { "clip_ratio": 0.00032530390308238566, "epoch": 0.0007662452317891826, "grad_norm": 0.06686889380216599, "kl": 0.004112225957214832, "learning_rate": 2.293628808864266e-06, "loss": -0.0133, "step": 276 }, { "clip_ratio": 0.000663017388433218, "completion_length": 232.50000762939453, "epoch": 0.0007690214826289984, "grad_norm": 0.09006670117378235, "kl": 0.003324171178974211, "learning_rate": 2.301939058171745e-06, "loss": 0.0007, "reward": 0.3541666865348816, "reward_std": 0.34993430972099304, "rewards/countdown_reward_func": 0.3541666865348816, "step": 277, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0007717977334688144, "grad_norm": 0.10477007925510406, "kl": 0.003485492314212024, "learning_rate": 2.3102493074792243e-06, "loss": -0.0001, "step": 278 }, { "clip_ratio": 0.0001145737842307426, "epoch": 0.0007745739843086302, "grad_norm": 0.11382175236940384, "kl": 0.003511702874675393, "learning_rate": 2.3185595567867034e-06, "loss": 0.0, "step": 279 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0007773502351484462, "grad_norm": 0.09704895317554474, "kl": 0.0034232381731271744, "learning_rate": 2.326869806094183e-06, "loss": 0.0006, "step": 280 }, { "clip_ratio": 0.0, "epoch": 0.000780126485988262, "grad_norm": 0.11391977220773697, "kl": 0.0033790983725339174, "learning_rate": 2.335180055401662e-06, "loss": 0.0002, "step": 281 }, { "clip_ratio": 0.0, "epoch": 0.0007829027368280779, "grad_norm": 0.3949395418167114, "kl": 0.0036244012881070375, "learning_rate": 2.343490304709141e-06, "loss": 0.0005, "step": 282 }, { "clip_ratio": 0.0006535947904922068, "epoch": 0.0007856789876678938, "grad_norm": 0.10197897255420685, "kl": 0.00319200346712023, "learning_rate": 2.3518005540166202e-06, "loss": 0.0001, "step": 283 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0007884552385077096, "grad_norm": 0.10884040594100952, "kl": 0.0036248579854145646, "learning_rate": 2.3601108033240998e-06, "loss": -0.0003, "step": 284 }, { "clip_ratio": 0.0, "epoch": 0.0007912314893475255, "grad_norm": 0.1121264100074768, "kl": 0.003715887665748596, "learning_rate": 2.368421052631579e-06, "loss": -0.0012, "step": 285 }, { "clip_ratio": 0.0, "epoch": 0.0007940077401873414, "grad_norm": 0.10213881731033325, "kl": 0.003711581346578896, "learning_rate": 2.376731301939058e-06, "loss": -0.0, "step": 286 }, { "clip_ratio": 0.0, "epoch": 0.0007967839910271573, "grad_norm": 0.15096428990364075, "kl": 0.004820869769901037, "learning_rate": 2.3850415512465375e-06, "loss": -0.0011, "step": 287 }, { "clip_ratio": 0.0001633986976230517, "epoch": 0.0007995602418669731, "grad_norm": 0.12257353216409683, "kl": 0.004069758579134941, "learning_rate": 2.3933518005540166e-06, "loss": -0.0001, "step": 288 }, { "clip_ratio": 0.00025150904548354447, "completion_length": 229.6041717529297, "epoch": 0.0008023364927067891, "grad_norm": 0.1052643209695816, "kl": 0.004225482931360602, "learning_rate": 2.401662049861496e-06, "loss": 0.0125, "reward": 0.22500000149011612, "reward_std": 0.2582135424017906, "rewards/countdown_reward_func": 0.22500000149011612, "step": 289, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0008051127435466049, "grad_norm": 0.08905075490474701, "kl": 0.004490195075049996, "learning_rate": 2.4099722991689752e-06, "loss": 0.0132, "step": 290 }, { "clip_ratio": 0.0005068937316536903, "epoch": 0.0008078889943864208, "grad_norm": 0.07550480216741562, "kl": 0.0038030524738132954, "learning_rate": 2.4182825484764543e-06, "loss": 0.012, "step": 291 }, { "clip_ratio": 0.00012575452274177223, "epoch": 0.0008106652452262367, "grad_norm": 0.08875437080860138, "kl": 0.00425003154668957, "learning_rate": 2.426592797783934e-06, "loss": 0.0121, "step": 292 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0008134414960660525, "grad_norm": 0.08870893716812134, "kl": 0.004312893608585, "learning_rate": 2.434903047091413e-06, "loss": 0.0117, "step": 293 }, { "clip_ratio": 0.0008138020639307797, "epoch": 0.0008162177469058685, "grad_norm": 0.08667095005512238, "kl": 0.004525796044617891, "learning_rate": 2.443213296398892e-06, "loss": 0.0117, "step": 294 }, { "clip_ratio": 0.0, "epoch": 0.0008189939977456843, "grad_norm": 0.10581125319004059, "kl": 0.004761378280818462, "learning_rate": 2.451523545706371e-06, "loss": 0.0118, "step": 295 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0008217702485855002, "grad_norm": 0.08822281658649445, "kl": 0.005203166743740439, "learning_rate": 2.4598337950138507e-06, "loss": 0.0123, "step": 296 }, { "clip_ratio": 0.0010381632746430114, "epoch": 0.0008245464994253161, "grad_norm": 0.07447604089975357, "kl": 0.00508764130063355, "learning_rate": 2.46814404432133e-06, "loss": 0.012, "step": 297 }, { "clip_ratio": 0.00037726358277723193, "epoch": 0.0008273227502651319, "grad_norm": 0.087750643491745, "kl": 0.005412213504314423, "learning_rate": 2.476454293628809e-06, "loss": 0.0111, "step": 298 }, { "clip_ratio": 0.0, "epoch": 0.0008300990011049479, "grad_norm": 0.08693079650402069, "kl": 0.005746564595028758, "learning_rate": 2.484764542936288e-06, "loss": 0.0114, "step": 299 }, { "clip_ratio": 0.0007324218604480848, "epoch": 0.0008328752519447637, "grad_norm": 0.09118447452783585, "kl": 0.0058568131644278765, "learning_rate": 2.493074792243767e-06, "loss": 0.0116, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 223.0, "epoch": 0.0008356515027845796, "grad_norm": 0.09235308319330215, "kl": 0.006070051807910204, "learning_rate": 2.5013850415512467e-06, "loss": -0.0069, "reward": 0.20000001043081284, "reward_std": 0.2610399127006531, "rewards/countdown_reward_func": 0.20000001043081284, "step": 301, "zero_std_ratio": 0.0 }, { "clip_ratio": 8.361203799722716e-05, "epoch": 0.0008384277536243955, "grad_norm": 0.08406054228544235, "kl": 0.0060172725934535265, "learning_rate": 2.5096952908587258e-06, "loss": -0.0069, "step": 302 }, { "clip_ratio": 0.0, "epoch": 0.0008412040044642114, "grad_norm": 0.0954718291759491, "kl": 0.006230462109670043, "learning_rate": 2.518005540166205e-06, "loss": -0.0066, "step": 303 }, { "clip_ratio": 0.00010575295891612768, "epoch": 0.0008439802553040272, "grad_norm": 0.09710827469825745, "kl": 0.00680284365080297, "learning_rate": 2.526315789473684e-06, "loss": -0.007, "step": 304 }, { "clip_ratio": 0.0002602904787636362, "epoch": 0.0008467565061438431, "grad_norm": 0.09757263213396072, "kl": 0.007144181756302714, "learning_rate": 2.5346260387811635e-06, "loss": -0.0062, "step": 305 }, { "clip_ratio": 0.0, "epoch": 0.000849532756983659, "grad_norm": 0.07679014652967453, "kl": 0.008339704247191548, "learning_rate": 2.5429362880886426e-06, "loss": -0.0076, "step": 306 }, { "clip_ratio": 8.361203799722716e-05, "epoch": 0.0008523090078234748, "grad_norm": 0.09126242995262146, "kl": 0.0085701416246593, "learning_rate": 2.5512465373961217e-06, "loss": -0.0071, "step": 307 }, { "clip_ratio": 8.361203799722716e-05, "epoch": 0.0008550852586632908, "grad_norm": 0.08423109352588654, "kl": 0.007637211121618748, "learning_rate": 2.5595567867036012e-06, "loss": -0.0073, "step": 308 }, { "clip_ratio": 8.833922038320452e-05, "epoch": 0.0008578615095031066, "grad_norm": 0.09705601632595062, "kl": 0.007593115558847785, "learning_rate": 2.5678670360110803e-06, "loss": -0.0067, "step": 309 }, { "clip_ratio": 0.0, "epoch": 0.0008606377603429226, "grad_norm": 0.09621502459049225, "kl": 0.008113941876217723, "learning_rate": 2.57617728531856e-06, "loss": -0.0075, "step": 310 }, { "clip_ratio": 0.00020451023010537028, "epoch": 0.0008634140111827384, "grad_norm": 0.10358273983001709, "kl": 0.008504221215844154, "learning_rate": 2.584487534626039e-06, "loss": -0.0073, "step": 311 }, { "clip_ratio": 0.00016722407599445432, "epoch": 0.0008661902620225542, "grad_norm": 0.08098059147596359, "kl": 0.009506989270448685, "learning_rate": 2.592797783933518e-06, "loss": -0.0075, "step": 312 }, { "clip_ratio": 0.00016914748994167894, "completion_length": 226.37500762939453, "epoch": 0.0008689665128623702, "grad_norm": 0.0626518577337265, "kl": 0.008252686355262995, "learning_rate": 2.6011080332409976e-06, "loss": 0.0071, "reward": 0.2291666716337204, "reward_std": 0.21667250245809555, "rewards/countdown_reward_func": 0.2291666641831398, "step": 313, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.000871742763702186, "grad_norm": 0.0984596535563469, "kl": 0.010352411307394505, "learning_rate": 2.6094182825484767e-06, "loss": 0.0072, "step": 314 }, { "clip_ratio": 0.0, "epoch": 0.0008745190145420019, "grad_norm": 0.09516298025846481, "kl": 0.011239033192396164, "learning_rate": 2.617728531855956e-06, "loss": 0.0077, "step": 315 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.0008772952653818178, "grad_norm": 0.10682272166013718, "kl": 0.010467468295246363, "learning_rate": 2.626038781163435e-06, "loss": 0.0069, "step": 316 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0008800715162216336, "grad_norm": 0.08754424750804901, "kl": 0.010327389929443598, "learning_rate": 2.6343490304709144e-06, "loss": 0.0072, "step": 317 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.0008828477670614495, "grad_norm": 0.12904123961925507, "kl": 0.010680126026272774, "learning_rate": 2.6426592797783935e-06, "loss": 0.0072, "step": 318 }, { "clip_ratio": 0.00025372125674039125, "epoch": 0.0008856240179012654, "grad_norm": 0.06477981805801392, "kl": 0.009791890624910593, "learning_rate": 2.6509695290858726e-06, "loss": 0.0066, "step": 319 }, { "clip_ratio": 0.00040258895023725927, "epoch": 0.0008884002687410813, "grad_norm": 0.185244619846344, "kl": 0.010858574416488409, "learning_rate": 2.6592797783933517e-06, "loss": 0.0068, "step": 320 }, { "clip_ratio": 9.968101949198171e-05, "epoch": 0.0008911765195808972, "grad_norm": 0.08614979684352875, "kl": 0.014307011850178242, "learning_rate": 2.667590027700831e-06, "loss": 0.0067, "step": 321 }, { "clip_ratio": 0.00027272728038951755, "epoch": 0.0008939527704207131, "grad_norm": 0.10753199458122253, "kl": 0.013140483759343624, "learning_rate": 2.6759002770083104e-06, "loss": 0.0054, "step": 322 }, { "clip_ratio": 0.00029475959308911115, "epoch": 0.0008967290212605289, "grad_norm": 0.10408802330493927, "kl": 0.01228410704061389, "learning_rate": 2.6842105263157895e-06, "loss": 0.0061, "step": 323 }, { "clip_ratio": 0.00019936203898396343, "epoch": 0.0008995052721003448, "grad_norm": 0.12455835938453674, "kl": 0.014058600179851055, "learning_rate": 2.6925207756232686e-06, "loss": 0.0061, "step": 324 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 237.64583587646484, "epoch": 0.0009022815229401607, "grad_norm": 0.08178848773241043, "kl": 0.011353591922670603, "learning_rate": 2.7008310249307477e-06, "loss": 0.0089, "reward": 0.1937500163912773, "reward_std": 0.1958785615861416, "rewards/countdown_reward_func": 0.1937500163912773, "step": 325, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0006667007837677374, "epoch": 0.0009050577737799765, "grad_norm": 0.08745010197162628, "kl": 0.015723693184554577, "learning_rate": 2.709141274238227e-06, "loss": 0.0086, "step": 326 }, { "clip_ratio": 0.00018234216258861125, "epoch": 0.0009078340246197925, "grad_norm": 0.08209048956632614, "kl": 0.013638208620250225, "learning_rate": 2.7174515235457063e-06, "loss": 0.0081, "step": 327 }, { "clip_ratio": 8.741259080125019e-05, "epoch": 0.0009106102754596083, "grad_norm": 0.07701068371534348, "kl": 0.014465087559074163, "learning_rate": 2.7257617728531854e-06, "loss": 0.0088, "step": 328 }, { "clip_ratio": 0.00025351856311317533, "epoch": 0.0009133865262994242, "grad_norm": 0.08792005479335785, "kl": 0.014732198789715767, "learning_rate": 2.734072022160665e-06, "loss": 0.0089, "step": 329 }, { "clip_ratio": 0.0, "epoch": 0.0009161627771392401, "grad_norm": 0.07523926347494125, "kl": 0.014703777618706226, "learning_rate": 2.742382271468144e-06, "loss": 0.0082, "step": 330 }, { "clip_ratio": 0.0, "epoch": 0.0009189390279790559, "grad_norm": 0.08062107861042023, "kl": 0.013506517745554447, "learning_rate": 2.7506925207756236e-06, "loss": 0.0086, "step": 331 }, { "clip_ratio": 0.0006983240018598735, "epoch": 0.0009217152788188719, "grad_norm": 0.07194234430789948, "kl": 0.019093542359769344, "learning_rate": 2.7590027700831027e-06, "loss": 0.0082, "step": 332 }, { "clip_ratio": 0.00025065265799639747, "epoch": 0.0009244915296586877, "grad_norm": 0.08615561574697495, "kl": 0.015138544142246246, "learning_rate": 2.7673130193905818e-06, "loss": 0.0077, "step": 333 }, { "clip_ratio": 8.741259080125019e-05, "epoch": 0.0009272677804985036, "grad_norm": 0.09492068737745285, "kl": 0.01689472934231162, "learning_rate": 2.7756232686980613e-06, "loss": 0.0082, "step": 334 }, { "clip_ratio": 0.0004922889638692141, "epoch": 0.0009300440313383195, "grad_norm": 0.10922621935606003, "kl": 0.017271476797759533, "learning_rate": 2.7839335180055404e-06, "loss": 0.007, "step": 335 }, { "clip_ratio": 0.00010048231342807412, "epoch": 0.0009328202821781354, "grad_norm": 0.0761309564113617, "kl": 0.01699853641912341, "learning_rate": 2.7922437673130195e-06, "loss": 0.0081, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 227.08334350585938, "epoch": 0.0009355965330179512, "grad_norm": 0.05785919353365898, "kl": 0.01666484959423542, "learning_rate": 2.8005540166204986e-06, "loss": 0.01, "reward": 0.15416667610406876, "reward_std": 0.1091257855296135, "rewards/countdown_reward_func": 0.15416667610406876, "step": 337, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00018317173817194998, "epoch": 0.0009383727838577671, "grad_norm": 0.05232284963130951, "kl": 0.01712951622903347, "learning_rate": 2.808864265927978e-06, "loss": 0.0099, "step": 338 }, { "clip_ratio": 0.0, "epoch": 0.000941149034697583, "grad_norm": 0.09636948257684708, "kl": 0.019584951922297478, "learning_rate": 2.8171745152354573e-06, "loss": 0.0101, "step": 339 }, { "clip_ratio": 0.0, "epoch": 0.0009439252855373989, "grad_norm": 0.06418822705745697, "kl": 0.0171346515417099, "learning_rate": 2.8254847645429364e-06, "loss": 0.0094, "step": 340 }, { "clip_ratio": 0.000244140625, "epoch": 0.0009467015363772148, "grad_norm": 0.05186130478978157, "kl": 0.018314712680876255, "learning_rate": 2.8337950138504155e-06, "loss": 0.0096, "step": 341 }, { "clip_ratio": 0.0, "epoch": 0.0009494777872170306, "grad_norm": 0.05359478294849396, "kl": 0.022239549085497856, "learning_rate": 2.8421052631578946e-06, "loss": 0.0094, "step": 342 }, { "clip_ratio": 8.60289073898457e-05, "epoch": 0.0009522540380568466, "grad_norm": 0.05925218388438225, "kl": 0.020320788025856018, "learning_rate": 2.850415512465374e-06, "loss": 0.0096, "step": 343 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0009550302888966624, "grad_norm": 0.05631886422634125, "kl": 0.02036405447870493, "learning_rate": 2.858725761772853e-06, "loss": 0.0094, "step": 344 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0009578065397364782, "grad_norm": 0.1005195826292038, "kl": 0.023968802765011787, "learning_rate": 2.8670360110803323e-06, "loss": 0.0093, "step": 345 }, { "clip_ratio": 0.0, "epoch": 0.0009605827905762942, "grad_norm": 0.06793226301670074, "kl": 0.02041178196668625, "learning_rate": 2.8753462603878114e-06, "loss": 0.0087, "step": 346 }, { "clip_ratio": 0.0, "epoch": 0.00096335904141611, "grad_norm": 0.048542365431785583, "kl": 0.021388364024460316, "learning_rate": 2.883656509695291e-06, "loss": 0.0093, "step": 347 }, { "clip_ratio": 0.0002645519489306025, "epoch": 0.0009661352922559259, "grad_norm": 0.08362864702939987, "kl": 0.027280261740088463, "learning_rate": 2.89196675900277e-06, "loss": 0.0093, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 228.02084350585938, "epoch": 0.0009689115430957418, "grad_norm": 0.0892510712146759, "kl": 0.02360483817756176, "learning_rate": 2.900277008310249e-06, "loss": -0.0109, "reward": 0.16875001043081284, "reward_std": 0.1314988099038601, "rewards/countdown_reward_func": 0.16875001043081284, "step": 349, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0009716877939355576, "grad_norm": 0.09001202881336212, "kl": 0.023638173937797546, "learning_rate": 2.9085872576177287e-06, "loss": -0.0107, "step": 350 }, { "clip_ratio": 0.0001959434594027698, "epoch": 0.0009744640447753736, "grad_norm": 0.113184355199337, "kl": 0.02315935306251049, "learning_rate": 2.9168975069252078e-06, "loss": -0.0112, "step": 351 }, { "clip_ratio": 0.0, "epoch": 0.0009772402956151893, "grad_norm": 0.07550473511219025, "kl": 0.02309281285852194, "learning_rate": 2.9252077562326873e-06, "loss": -0.0108, "step": 352 }, { "clip_ratio": 0.0, "epoch": 0.0009800165464550054, "grad_norm": 0.11282236129045486, "kl": 0.024476034566760063, "learning_rate": 2.9335180055401664e-06, "loss": -0.0112, "step": 353 }, { "clip_ratio": 0.00018355359497945756, "epoch": 0.0009827927972948212, "grad_norm": 0.11352071166038513, "kl": 0.02151069976389408, "learning_rate": 2.9418282548476455e-06, "loss": -0.0114, "step": 354 }, { "clip_ratio": 0.0, "epoch": 0.000985569048134637, "grad_norm": 0.08133033663034439, "kl": 0.021808774210512638, "learning_rate": 2.950138504155125e-06, "loss": -0.0114, "step": 355 }, { "clip_ratio": 0.00010416666918899864, "epoch": 0.000988345298974453, "grad_norm": 0.08895107358694077, "kl": 0.02102847583591938, "learning_rate": 2.958448753462604e-06, "loss": -0.0113, "step": 356 }, { "clip_ratio": 9.177679748972878e-05, "epoch": 0.0009911215498142687, "grad_norm": 0.11091752350330353, "kl": 0.01951050851494074, "learning_rate": 2.9667590027700832e-06, "loss": -0.0118, "step": 357 }, { "clip_ratio": 0.00028880476020276546, "epoch": 0.0009938978006540848, "grad_norm": 0.08083935081958771, "kl": 0.019494274631142616, "learning_rate": 2.9750692520775623e-06, "loss": -0.0118, "step": 358 }, { "clip_ratio": 0.00018355359497945756, "epoch": 0.0009966740514939006, "grad_norm": 0.09999022632837296, "kl": 0.019018066115677357, "learning_rate": 2.983379501385042e-06, "loss": -0.012, "step": 359 }, { "clip_ratio": 0.0, "epoch": 0.0009994503023337165, "grad_norm": 0.09490374475717545, "kl": 0.01723548863083124, "learning_rate": 2.991689750692521e-06, "loss": -0.013, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 226.5625, "epoch": 0.0010022265531735323, "grad_norm": 0.09549269825220108, "kl": 0.019508808851242065, "learning_rate": 3e-06, "loss": 0.0015, "reward": 0.2250000238418579, "reward_std": 0.23222807794809341, "rewards/countdown_reward_func": 0.22500000894069672, "step": 361, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00017572575598023832, "epoch": 0.0010050028040133482, "grad_norm": 0.10312195122241974, "kl": 0.01787219289690256, "learning_rate": 3e-06, "loss": 0.0021, "step": 362 }, { "clip_ratio": 0.0007824726053513587, "epoch": 0.001007779054853164, "grad_norm": 0.12467264384031296, "kl": 0.017863546032458544, "learning_rate": 3e-06, "loss": 0.0019, "step": 363 }, { "clip_ratio": 8.37240440887399e-05, "epoch": 0.00101055530569298, "grad_norm": 0.2627834379673004, "kl": 0.01767959538847208, "learning_rate": 3e-06, "loss": 0.0021, "step": 364 }, { "clip_ratio": 9.038322605192661e-05, "epoch": 0.0010133315565327959, "grad_norm": 0.0790000930428505, "kl": 0.01718911249190569, "learning_rate": 3e-06, "loss": 0.0016, "step": 365 }, { "clip_ratio": 0.0001808318484108895, "epoch": 0.0010161078073726117, "grad_norm": 0.07705968618392944, "kl": 0.01755279116332531, "learning_rate": 3e-06, "loss": 0.0022, "step": 366 }, { "clip_ratio": 0.0001741072628647089, "epoch": 0.0010188840582124276, "grad_norm": 0.10953141748905182, "kl": 0.01750816684216261, "learning_rate": 3e-06, "loss": 0.0021, "step": 367 }, { "clip_ratio": 0.0004197447851765901, "epoch": 0.0010216603090522434, "grad_norm": 0.2015552967786789, "kl": 0.017061928287148476, "learning_rate": 3e-06, "loss": 0.0016, "step": 368 }, { "clip_ratio": 0.000489045400172472, "epoch": 0.0010244365598920595, "grad_norm": 0.1368420273065567, "kl": 0.017408848274499178, "learning_rate": 3e-06, "loss": 0.0013, "step": 369 }, { "clip_ratio": 0.00018153311975765973, "epoch": 0.0010272128107318753, "grad_norm": 0.09191857278347015, "kl": 0.016698247753083706, "learning_rate": 3e-06, "loss": 0.0017, "step": 370 }, { "clip_ratio": 8.394895849050954e-05, "epoch": 0.0010299890615716911, "grad_norm": 0.07735387235879898, "kl": 0.017519176937639713, "learning_rate": 3e-06, "loss": 0.0017, "step": 371 }, { "clip_ratio": 0.0002578312996774912, "epoch": 0.001032765312411507, "grad_norm": 0.07390379160642624, "kl": 0.018353909254074097, "learning_rate": 3e-06, "loss": 0.0019, "step": 372 }, { "clip_ratio": 8.196721319109201e-05, "completion_length": 228.68750762939453, "epoch": 0.0010355415632513228, "grad_norm": 0.10989905893802643, "kl": 0.017692445777356625, "learning_rate": 3e-06, "loss": -0.0103, "reward": 0.2500000223517418, "reward_std": 0.2080453634262085, "rewards/countdown_reward_func": 0.2500000223517418, "step": 373, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00010279605339746922, "epoch": 0.0010383178140911387, "grad_norm": 0.15112467110157013, "kl": 0.03264212794601917, "learning_rate": 3e-06, "loss": -0.0084, "step": 374 }, { "clip_ratio": 0.0011789826894528233, "epoch": 0.0010410940649309547, "grad_norm": 0.1216062679886818, "kl": 0.021678635850548744, "learning_rate": 3e-06, "loss": -0.0091, "step": 375 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.0010438703157707706, "grad_norm": 0.08477499336004257, "kl": 0.016693778336048126, "learning_rate": 3e-06, "loss": -0.0099, "step": 376 }, { "clip_ratio": 0.0002832566970027983, "epoch": 0.0010466465666105864, "grad_norm": 0.10167127847671509, "kl": 0.017779462039470673, "learning_rate": 3e-06, "loss": -0.0093, "step": 377 }, { "clip_ratio": 9.184423106489703e-05, "epoch": 0.0010494228174504022, "grad_norm": 0.09984228760004044, "kl": 0.017343958839774132, "learning_rate": 3e-06, "loss": -0.0096, "step": 378 }, { "clip_ratio": 0.00017322444182354957, "epoch": 0.001052199068290218, "grad_norm": 0.10843722522258759, "kl": 0.018563530407845974, "learning_rate": 3e-06, "loss": -0.0104, "step": 379 }, { "clip_ratio": 0.0003475236881058663, "epoch": 0.0010549753191300341, "grad_norm": 0.137297123670578, "kl": 0.03769372217357159, "learning_rate": 3e-06, "loss": -0.0097, "step": 380 }, { "clip_ratio": 0.0006742323748767376, "epoch": 0.00105775156996985, "grad_norm": 0.13010026514530182, "kl": 0.024436946026980877, "learning_rate": 3e-06, "loss": -0.0105, "step": 381 }, { "clip_ratio": 0.0, "epoch": 0.0010605278208096658, "grad_norm": 0.09109175205230713, "kl": 0.01653394289314747, "learning_rate": 3e-06, "loss": -0.0106, "step": 382 }, { "clip_ratio": 0.0001830161054385826, "epoch": 0.0010633040716494816, "grad_norm": 0.08129965513944626, "kl": 0.017563740722835064, "learning_rate": 3e-06, "loss": -0.01, "step": 383 }, { "clip_ratio": 0.000265068665612489, "epoch": 0.0010660803224892975, "grad_norm": 0.09801369905471802, "kl": 0.01689669769257307, "learning_rate": 3e-06, "loss": -0.0115, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 240.64583587646484, "epoch": 0.0010688565733291133, "grad_norm": 0.07832996547222137, "kl": 0.02487264759838581, "learning_rate": 3e-06, "loss": 0.0146, "reward": 0.19166668504476547, "reward_std": 0.15476077422499657, "rewards/countdown_reward_func": 0.19166668504476547, "step": 385, "zero_std_ratio": 0.625 }, { "clip_ratio": 8.716875890968367e-05, "epoch": 0.0010716328241689294, "grad_norm": 0.07169502228498459, "kl": 0.016012447886168957, "learning_rate": 3e-06, "loss": 0.0138, "step": 386 }, { "clip_ratio": 9.005763422464952e-05, "epoch": 0.0010744090750087452, "grad_norm": 0.08705933392047882, "kl": 0.016524842474609613, "learning_rate": 3e-06, "loss": 0.0143, "step": 387 }, { "clip_ratio": 9.61538462433964e-05, "epoch": 0.001077185325848561, "grad_norm": 0.0732106938958168, "kl": 0.015910383313894272, "learning_rate": 3e-06, "loss": 0.0142, "step": 388 }, { "clip_ratio": 0.0002589142677607015, "epoch": 0.001079961576688377, "grad_norm": 0.07911605387926102, "kl": 0.01897307112812996, "learning_rate": 3e-06, "loss": 0.0151, "step": 389 }, { "clip_ratio": 0.00017143784498330206, "epoch": 0.0010827378275281927, "grad_norm": 0.0664311945438385, "kl": 0.016379999462515116, "learning_rate": 3e-06, "loss": 0.0145, "step": 390 }, { "clip_ratio": 0.0, "epoch": 0.0010855140783680088, "grad_norm": 0.08451702445745468, "kl": 0.0259824451059103, "learning_rate": 3e-06, "loss": 0.0146, "step": 391 }, { "clip_ratio": 9.005763422464952e-05, "epoch": 0.0010882903292078246, "grad_norm": 0.07148966938257217, "kl": 0.016325827687978745, "learning_rate": 3e-06, "loss": 0.0138, "step": 392 }, { "clip_ratio": 0.0, "epoch": 0.0010910665800476405, "grad_norm": 0.09877816587686539, "kl": 0.01728895679116249, "learning_rate": 3e-06, "loss": 0.0141, "step": 393 }, { "clip_ratio": 0.0001923076924867928, "epoch": 0.0010938428308874563, "grad_norm": 0.08034543693065643, "kl": 0.016465777531266212, "learning_rate": 3e-06, "loss": 0.0138, "step": 394 }, { "clip_ratio": 0.0, "epoch": 0.0010966190817272722, "grad_norm": 0.0796988308429718, "kl": 0.020429577212780714, "learning_rate": 3e-06, "loss": 0.0139, "step": 395 }, { "clip_ratio": 0.0005143135640537366, "epoch": 0.0010993953325670882, "grad_norm": 0.06786444783210754, "kl": 0.01705406652763486, "learning_rate": 3e-06, "loss": 0.013, "step": 396 }, { "clip_ratio": 0.00019317050464451313, "completion_length": 222.3541717529297, "epoch": 0.001102171583406904, "grad_norm": 0.1809883564710617, "kl": 0.02103044930845499, "learning_rate": 3e-06, "loss": 0.0201, "reward": 0.32500000298023224, "reward_std": 0.28887902945280075, "rewards/countdown_reward_func": 0.32499998807907104, "step": 397, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003668378631118685, "epoch": 0.0011049478342467199, "grad_norm": 0.13064327836036682, "kl": 0.019325342029333115, "learning_rate": 3e-06, "loss": 0.0188, "step": 398 }, { "clip_ratio": 0.0, "epoch": 0.0011077240850865357, "grad_norm": 0.134730726480484, "kl": 0.018743818625807762, "learning_rate": 3e-06, "loss": 0.018, "step": 399 }, { "clip_ratio": 0.0, "epoch": 0.0011105003359263516, "grad_norm": 0.11716154217720032, "kl": 0.03218943625688553, "learning_rate": 3e-06, "loss": 0.0188, "step": 400 }, { "clip_ratio": 0.0003666364573291503, "epoch": 0.0011132765867661674, "grad_norm": 0.1279788464307785, "kl": 0.02369430661201477, "learning_rate": 3e-06, "loss": 0.0183, "step": 401 }, { "clip_ratio": 0.00018321751849725842, "epoch": 0.0011160528376059835, "grad_norm": 0.1514863669872284, "kl": 0.02853427268564701, "learning_rate": 3e-06, "loss": 0.0172, "step": 402 }, { "clip_ratio": 0.0005473888304550201, "epoch": 0.0011188290884457993, "grad_norm": 0.17688550055027008, "kl": 0.02879752404987812, "learning_rate": 3e-06, "loss": 0.0159, "step": 403 }, { "clip_ratio": 0.0003765894507523626, "epoch": 0.0011216053392856151, "grad_norm": 0.1264808475971222, "kl": 0.028363430872559547, "learning_rate": 3e-06, "loss": 0.0154, "step": 404 }, { "clip_ratio": 0.0002071251074085012, "epoch": 0.001124381590125431, "grad_norm": 0.13313868641853333, "kl": 0.028233185410499573, "learning_rate": 3e-06, "loss": 0.0148, "step": 405 }, { "clip_ratio": 0.00029883457318646833, "epoch": 0.0011271578409652468, "grad_norm": 0.11772707849740982, "kl": 0.0557091049849987, "learning_rate": 3e-06, "loss": 0.0158, "step": 406 }, { "clip_ratio": 0.0008593437087256461, "epoch": 0.0011299340918050629, "grad_norm": 0.1197652816772461, "kl": 0.03832645434886217, "learning_rate": 3e-06, "loss": 0.0146, "step": 407 }, { "clip_ratio": 0.0008315023733302951, "epoch": 0.0011327103426448787, "grad_norm": 0.11358743906021118, "kl": 0.049091488122940063, "learning_rate": 3e-06, "loss": 0.0139, "step": 408 }, { "clip_ratio": 0.00010860121983569115, "completion_length": 212.20833587646484, "epoch": 0.0011354865934846946, "grad_norm": 0.13000725209712982, "kl": 0.03431596327573061, "learning_rate": 3e-06, "loss": -0.0011, "reward": 0.239583358168602, "reward_std": 0.3074723482131958, "rewards/countdown_reward_func": 0.2395833507180214, "step": 409, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.00010860121983569115, "epoch": 0.0011382628443245104, "grad_norm": 0.12621691823005676, "kl": 0.03901347145438194, "learning_rate": 3e-06, "loss": -0.0018, "step": 410 }, { "clip_ratio": 0.0, "epoch": 0.0011410390951643262, "grad_norm": 0.15552707016468048, "kl": 0.05270291492342949, "learning_rate": 3e-06, "loss": -0.0012, "step": 411 }, { "clip_ratio": 0.0003161708955303766, "epoch": 0.001143815346004142, "grad_norm": 0.1499052792787552, "kl": 0.04181257076561451, "learning_rate": 3e-06, "loss": -0.0025, "step": 412 }, { "clip_ratio": 0.0, "epoch": 0.0011465915968439581, "grad_norm": 0.11208264529705048, "kl": 0.04561302810907364, "learning_rate": 3e-06, "loss": -0.0011, "step": 413 }, { "clip_ratio": 8.514986257068813e-05, "epoch": 0.001149367847683774, "grad_norm": 0.117032989859581, "kl": 0.0457566250115633, "learning_rate": 3e-06, "loss": -0.002, "step": 414 }, { "clip_ratio": 0.0, "epoch": 0.0011521440985235898, "grad_norm": 0.1544887125492096, "kl": 0.04844648204743862, "learning_rate": 3e-06, "loss": -0.0009, "step": 415 }, { "clip_ratio": 0.00010860121983569115, "epoch": 0.0011549203493634056, "grad_norm": 0.1259896457195282, "kl": 0.05372907593846321, "learning_rate": 3e-06, "loss": -0.0026, "step": 416 }, { "clip_ratio": 0.0, "epoch": 0.0011576966002032215, "grad_norm": 0.15457068383693695, "kl": 0.06770433485507965, "learning_rate": 3e-06, "loss": -0.0032, "step": 417 }, { "clip_ratio": 0.00022890920809004456, "epoch": 0.0011604728510430375, "grad_norm": 0.1197003573179245, "kl": 0.0525658018887043, "learning_rate": 3e-06, "loss": -0.0043, "step": 418 }, { "clip_ratio": 0.0, "epoch": 0.0011632491018828534, "grad_norm": 0.10684943944215775, "kl": 0.053456977009773254, "learning_rate": 3e-06, "loss": -0.0021, "step": 419 }, { "clip_ratio": 0.00021355503849918023, "epoch": 0.0011660253527226692, "grad_norm": 0.10173308104276657, "kl": 0.05415925942361355, "learning_rate": 3e-06, "loss": -0.0027, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 237.62500762939453, "epoch": 0.001168801603562485, "grad_norm": 0.11933105438947678, "kl": 0.06486517563462257, "learning_rate": 3e-06, "loss": 0.0027, "reward": 0.27916668355464935, "reward_std": 0.29565654695034027, "rewards/countdown_reward_func": 0.27916668355464935, "step": 421, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.001171577854402301, "grad_norm": 0.20322704315185547, "kl": 0.08543343096971512, "learning_rate": 3e-06, "loss": 0.0032, "step": 422 }, { "clip_ratio": 0.0, "epoch": 0.0011743541052421167, "grad_norm": 0.19525644183158875, "kl": 0.13222120702266693, "learning_rate": 3e-06, "loss": 0.0045, "step": 423 }, { "clip_ratio": 0.0002695980292628519, "epoch": 0.0011771303560819328, "grad_norm": 0.13731662929058075, "kl": 0.0626319907605648, "learning_rate": 3e-06, "loss": 0.0031, "step": 424 }, { "clip_ratio": 0.0, "epoch": 0.0011799066069217486, "grad_norm": 0.13071469962596893, "kl": 0.05833687447011471, "learning_rate": 3e-06, "loss": 0.0014, "step": 425 }, { "clip_ratio": 0.0, "epoch": 0.0011826828577615645, "grad_norm": 0.23276741802692413, "kl": 0.06872095167636871, "learning_rate": 3e-06, "loss": 0.0012, "step": 426 }, { "clip_ratio": 0.0, "epoch": 0.0011854591086013803, "grad_norm": 0.11540886759757996, "kl": 0.053551726043224335, "learning_rate": 3e-06, "loss": 0.0002, "step": 427 }, { "clip_ratio": 0.00016356298874597996, "epoch": 0.0011882353594411962, "grad_norm": 0.19618336856365204, "kl": 0.06640568189322948, "learning_rate": 3e-06, "loss": -0.0006, "step": 428 }, { "clip_ratio": 8.218277798732743e-05, "epoch": 0.0011910116102810122, "grad_norm": 0.18861764669418335, "kl": 0.1051701121032238, "learning_rate": 3e-06, "loss": 0.0011, "step": 429 }, { "clip_ratio": 0.000244140625, "epoch": 0.001193787861120828, "grad_norm": 0.14318421483039856, "kl": 0.04604991525411606, "learning_rate": 3e-06, "loss": 0.0016, "step": 430 }, { "clip_ratio": 0.0003451758166193031, "epoch": 0.0011965641119606439, "grad_norm": 0.14576469361782074, "kl": 0.04200077801942825, "learning_rate": 3e-06, "loss": -0.0002, "step": 431 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0011993403628004597, "grad_norm": 0.2222023755311966, "kl": 0.04660602658987045, "learning_rate": 3e-06, "loss": -0.004, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 236.81250762939453, "epoch": 0.0012021166136402756, "grad_norm": 0.08921542763710022, "kl": 0.04309102147817612, "learning_rate": 3e-06, "loss": -0.0024, "reward": 0.15208333730697632, "reward_std": 0.11422888934612274, "rewards/countdown_reward_func": 0.15208332985639572, "step": 433, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0012048928644800914, "grad_norm": 0.1347406804561615, "kl": 0.04063236340880394, "learning_rate": 3e-06, "loss": -0.0032, "step": 434 }, { "clip_ratio": 0.0002881626787711866, "epoch": 0.0012076691153199075, "grad_norm": 0.11881622672080994, "kl": 0.0401871744543314, "learning_rate": 3e-06, "loss": -0.0035, "step": 435 }, { "clip_ratio": 0.0, "epoch": 0.0012104453661597233, "grad_norm": 0.06465653330087662, "kl": 0.03999011963605881, "learning_rate": 3e-06, "loss": -0.003, "step": 436 }, { "clip_ratio": 9.13075273274444e-05, "epoch": 0.0012132216169995391, "grad_norm": 0.07897289842367172, "kl": 0.03400629200041294, "learning_rate": 3e-06, "loss": -0.0031, "step": 437 }, { "clip_ratio": 0.00045705895172432065, "epoch": 0.001215997867839355, "grad_norm": 0.07360372692346573, "kl": 0.03901753947138786, "learning_rate": 3e-06, "loss": -0.0032, "step": 438 }, { "clip_ratio": 0.000265190057689324, "epoch": 0.0012187741186791708, "grad_norm": 0.07743567228317261, "kl": 0.030041148886084557, "learning_rate": 3e-06, "loss": -0.0041, "step": 439 }, { "clip_ratio": 0.000703962694387883, "epoch": 0.0012215503695189869, "grad_norm": 0.13309620320796967, "kl": 0.029125919565558434, "learning_rate": 3e-06, "loss": -0.0056, "step": 440 }, { "clip_ratio": 0.0005505228764377534, "epoch": 0.0012243266203588027, "grad_norm": 0.0934651717543602, "kl": 0.028855517506599426, "learning_rate": 3e-06, "loss": -0.0049, "step": 441 }, { "clip_ratio": 0.00017098593525588512, "epoch": 0.0012271028711986186, "grad_norm": 0.060086771845817566, "kl": 0.028184207156300545, "learning_rate": 3e-06, "loss": -0.0043, "step": 442 }, { "clip_ratio": 0.002318943908903748, "epoch": 0.0012298791220384344, "grad_norm": 0.07434255629777908, "kl": 0.024150204844772816, "learning_rate": 3e-06, "loss": -0.0043, "step": 443 }, { "clip_ratio": 0.0017171713116113096, "epoch": 0.0012326553728782502, "grad_norm": 0.06684679538011551, "kl": 0.028608722612261772, "learning_rate": 3e-06, "loss": -0.0045, "step": 444 }, { "clip_ratio": 0.00026747502852231264, "completion_length": 234.14584350585938, "epoch": 0.001235431623718066, "grad_norm": 0.07466662675142288, "kl": 0.024345185607671738, "learning_rate": 3e-06, "loss": 0.0042, "reward": 0.16875001043081284, "reward_std": 0.1619665026664734, "rewards/countdown_reward_func": 0.16875001043081284, "step": 445, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0012382078745578821, "grad_norm": 0.0632314682006836, "kl": 0.02845650538802147, "learning_rate": 3e-06, "loss": 0.0042, "step": 446 }, { "clip_ratio": 0.0, "epoch": 0.001240984125397698, "grad_norm": 0.064049132168293, "kl": 0.0239101629704237, "learning_rate": 3e-06, "loss": 0.0045, "step": 447 }, { "clip_ratio": 0.0002663619234226644, "epoch": 0.0012437603762375138, "grad_norm": 0.11490896344184875, "kl": 0.02280531730502844, "learning_rate": 3e-06, "loss": 0.0038, "step": 448 }, { "clip_ratio": 0.00027501455042511225, "epoch": 0.0012465366270773296, "grad_norm": 0.0676225870847702, "kl": 0.02224800456315279, "learning_rate": 3e-06, "loss": 0.0042, "step": 449 }, { "clip_ratio": 0.000460914452560246, "epoch": 0.0012493128779171455, "grad_norm": 0.07294591516256332, "kl": 0.023516996763646603, "learning_rate": 3e-06, "loss": 0.0047, "step": 450 }, { "clip_ratio": 0.00017831669538281858, "epoch": 0.0012520891287569615, "grad_norm": 0.07453124225139618, "kl": 0.021801339462399483, "learning_rate": 3e-06, "loss": 0.0039, "step": 451 }, { "clip_ratio": 0.0003460512016317807, "epoch": 0.0012548653795967774, "grad_norm": 0.06052190437912941, "kl": 0.026570623740553856, "learning_rate": 3e-06, "loss": 0.004, "step": 452 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0012576416304365932, "grad_norm": 0.06681171804666519, "kl": 0.022533809766173363, "learning_rate": 3e-06, "loss": 0.004, "step": 453 }, { "clip_ratio": 0.00017356310854665935, "epoch": 0.001260417881276409, "grad_norm": 0.1189865693449974, "kl": 0.02223300840705633, "learning_rate": 3e-06, "loss": 0.0031, "step": 454 }, { "clip_ratio": 0.00018221575010102242, "epoch": 0.001263194132116225, "grad_norm": 0.06577113270759583, "kl": 0.021816913969814777, "learning_rate": 3e-06, "loss": 0.0037, "step": 455 }, { "clip_ratio": 0.0005418355867732316, "epoch": 0.0012659703829560407, "grad_norm": 0.13180898129940033, "kl": 0.0227721044793725, "learning_rate": 3e-06, "loss": 0.0036, "step": 456 }, { "clip_ratio": 0.00020764119108207524, "completion_length": 221.70833587646484, "epoch": 0.0012687466337958568, "grad_norm": 0.109416663646698, "kl": 0.022198159247636795, "learning_rate": 3e-06, "loss": 0.0299, "reward": 0.32500001788139343, "reward_std": 0.28183095902204514, "rewards/countdown_reward_func": 0.32500001788139343, "step": 457, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0012715228846356726, "grad_norm": 0.12022383511066437, "kl": 0.02293353620916605, "learning_rate": 3e-06, "loss": 0.0297, "step": 458 }, { "clip_ratio": 0.0, "epoch": 0.0012742991354754885, "grad_norm": 0.10432354360818863, "kl": 0.024400770664215088, "learning_rate": 3e-06, "loss": 0.0295, "step": 459 }, { "clip_ratio": 9.13075273274444e-05, "epoch": 0.0012770753863153043, "grad_norm": 0.10826432704925537, "kl": 0.022371195256710052, "learning_rate": 3e-06, "loss": 0.0295, "step": 460 }, { "clip_ratio": 0.00018136516155209392, "epoch": 0.0012798516371551201, "grad_norm": 0.1814108043909073, "kl": 0.02964179776608944, "learning_rate": 3e-06, "loss": 0.029, "step": 461 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0012826278879949362, "grad_norm": 0.1098814606666565, "kl": 0.02524806559085846, "learning_rate": 3e-06, "loss": 0.0288, "step": 462 }, { "clip_ratio": 0.00020764119108207524, "epoch": 0.001285404138834752, "grad_norm": 0.10370974242687225, "kl": 0.023861460387706757, "learning_rate": 3e-06, "loss": 0.0284, "step": 463 }, { "clip_ratio": 0.00010382059554103762, "epoch": 0.0012881803896745679, "grad_norm": 0.1190827265381813, "kl": 0.024492272175848484, "learning_rate": 3e-06, "loss": 0.0278, "step": 464 }, { "clip_ratio": 0.00046262028627097607, "epoch": 0.0012909566405143837, "grad_norm": 0.10446801036596298, "kl": 0.027533382177352905, "learning_rate": 3e-06, "loss": 0.0282, "step": 465 }, { "clip_ratio": 0.0005552970251301304, "epoch": 0.0012937328913541996, "grad_norm": 0.10887821763753891, "kl": 0.024930739775300026, "learning_rate": 3e-06, "loss": 0.0271, "step": 466 }, { "clip_ratio": 0.0009011498477775604, "epoch": 0.0012965091421940156, "grad_norm": 0.15684081614017487, "kl": 0.03675767965614796, "learning_rate": 3e-06, "loss": 0.026, "step": 467 }, { "clip_ratio": 0.0004027693357784301, "epoch": 0.0012992853930338315, "grad_norm": 0.11377524584531784, "kl": 0.03045613970607519, "learning_rate": 3e-06, "loss": 0.0264, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 239.1875, "epoch": 0.0013020616438736473, "grad_norm": 0.10519543290138245, "kl": 0.03355495072901249, "learning_rate": 3e-06, "loss": 0.0046, "reward": 0.20625001192092896, "reward_std": 0.2200612723827362, "rewards/countdown_reward_func": 0.20625001192092896, "step": 469, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0013048378947134631, "grad_norm": 0.09295057505369186, "kl": 0.03722470626235008, "learning_rate": 3e-06, "loss": 0.0037, "step": 470 }, { "clip_ratio": 0.0, "epoch": 0.001307614145553279, "grad_norm": 0.07510315626859665, "kl": 0.035189252346754074, "learning_rate": 3e-06, "loss": 0.0038, "step": 471 }, { "clip_ratio": 0.000260960339801386, "epoch": 0.0013103903963930948, "grad_norm": 0.1088418960571289, "kl": 0.03208626061677933, "learning_rate": 3e-06, "loss": 0.0039, "step": 472 }, { "clip_ratio": 0.0, "epoch": 0.0013131666472329109, "grad_norm": 0.08343033492565155, "kl": 0.03320677764713764, "learning_rate": 3e-06, "loss": 0.0031, "step": 473 }, { "clip_ratio": 0.0002764667078736238, "epoch": 0.0013159428980727267, "grad_norm": 0.11299256235361099, "kl": 0.0330524817109108, "learning_rate": 3e-06, "loss": 0.0035, "step": 474 }, { "clip_ratio": 0.00027988169313175604, "epoch": 0.0013187191489125426, "grad_norm": 0.10915958881378174, "kl": 0.04382970742881298, "learning_rate": 3e-06, "loss": 0.0033, "step": 475 }, { "clip_ratio": 0.0, "epoch": 0.0013214953997523584, "grad_norm": 0.08877313882112503, "kl": 0.04925047419965267, "learning_rate": 3e-06, "loss": 0.0027, "step": 476 }, { "clip_ratio": 0.00017536517407279462, "epoch": 0.0013242716505921742, "grad_norm": 0.08160615712404251, "kl": 0.0415896400809288, "learning_rate": 3e-06, "loss": 0.0036, "step": 477 }, { "clip_ratio": 0.0, "epoch": 0.0013270479014319903, "grad_norm": 0.14993789792060852, "kl": 0.036379581317305565, "learning_rate": 3e-06, "loss": 0.0034, "step": 478 }, { "clip_ratio": 0.0, "epoch": 0.0013298241522718061, "grad_norm": 0.08193667978048325, "kl": 0.03808531537652016, "learning_rate": 3e-06, "loss": 0.0028, "step": 479 }, { "clip_ratio": 0.0002638619553181343, "epoch": 0.001332600403111622, "grad_norm": 0.10145576298236847, "kl": 0.0362465288490057, "learning_rate": 3e-06, "loss": 0.0028, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 232.0625, "epoch": 0.0013353766539514378, "grad_norm": 0.11346805095672607, "kl": 0.038885802030563354, "learning_rate": 3e-06, "loss": 0.0224, "reward": 0.19166667014360428, "reward_std": 0.20098165795207024, "rewards/countdown_reward_func": 0.19166667014360428, "step": 481, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0013381529047912536, "grad_norm": 0.0885414257645607, "kl": 0.03647511638700962, "learning_rate": 3e-06, "loss": 0.0223, "step": 482 }, { "clip_ratio": 0.0002071251074085012, "epoch": 0.0013409291556310695, "grad_norm": 0.09309430420398712, "kl": 0.03728037141263485, "learning_rate": 3e-06, "loss": 0.0219, "step": 483 }, { "clip_ratio": 0.00018494276446290314, "epoch": 0.0013437054064708855, "grad_norm": 0.08568699657917023, "kl": 0.053651321679353714, "learning_rate": 3e-06, "loss": 0.0231, "step": 484 }, { "clip_ratio": 0.00026833474839804694, "epoch": 0.0013464816573107014, "grad_norm": 0.12132450193166733, "kl": 0.04722470976412296, "learning_rate": 3e-06, "loss": 0.0222, "step": 485 }, { "clip_ratio": 8.532423089491203e-05, "epoch": 0.0013492579081505172, "grad_norm": 0.09739983081817627, "kl": 0.03846907988190651, "learning_rate": 3e-06, "loss": 0.0218, "step": 486 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.001352034158990333, "grad_norm": 0.1051330417394638, "kl": 0.04565967619419098, "learning_rate": 3e-06, "loss": 0.0213, "step": 487 }, { "clip_ratio": 0.0003686312338686548, "epoch": 0.001354810409830149, "grad_norm": 0.08285031467676163, "kl": 0.04506035894155502, "learning_rate": 3e-06, "loss": 0.0212, "step": 488 }, { "clip_ratio": 0.0008285004296340048, "epoch": 0.001357586660669965, "grad_norm": 0.0878533124923706, "kl": 0.04655962623655796, "learning_rate": 3e-06, "loss": 0.0206, "step": 489 }, { "clip_ratio": 0.0, "epoch": 0.0013603629115097808, "grad_norm": 0.07896918803453445, "kl": 0.06781511753797531, "learning_rate": 3e-06, "loss": 0.0215, "step": 490 }, { "clip_ratio": 0.0, "epoch": 0.0013631391623495966, "grad_norm": 0.10892756283283234, "kl": 0.06482304260134697, "learning_rate": 3e-06, "loss": 0.0204, "step": 491 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0013659154131894125, "grad_norm": 0.09697525948286057, "kl": 0.052130842581391335, "learning_rate": 3e-06, "loss": 0.0204, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 230.4791717529297, "epoch": 0.0013686916640292283, "grad_norm": 0.12033633142709732, "kl": 0.05308363772928715, "learning_rate": 3e-06, "loss": -0.0028, "reward": 0.21041666716337204, "reward_std": 0.17074457183480263, "rewards/countdown_reward_func": 0.21041666716337204, "step": 493, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0003349836479173973, "epoch": 0.0013714679148690441, "grad_norm": 0.09749346226453781, "kl": 0.05684215575456619, "learning_rate": 3e-06, "loss": -0.0034, "step": 494 }, { "clip_ratio": 9.025270992424339e-05, "epoch": 0.0013742441657088602, "grad_norm": 0.08872581273317337, "kl": 0.05968480557203293, "learning_rate": 3e-06, "loss": -0.003, "step": 495 }, { "clip_ratio": 0.00028852681134594604, "epoch": 0.001377020416548676, "grad_norm": 0.11848699301481247, "kl": 0.06252800300717354, "learning_rate": 3e-06, "loss": -0.0026, "step": 496 }, { "clip_ratio": 0.0, "epoch": 0.0013797966673884919, "grad_norm": 0.24924242496490479, "kl": 0.06530779227614403, "learning_rate": 3e-06, "loss": -0.0012, "step": 497 }, { "clip_ratio": 0.0003621914656832814, "epoch": 0.0013825729182283077, "grad_norm": 0.11978671699762344, "kl": 0.06549267843365669, "learning_rate": 3e-06, "loss": -0.0024, "step": 498 }, { "clip_ratio": 0.0, "epoch": 0.0013853491690681236, "grad_norm": 0.08640985190868378, "kl": 0.06214660406112671, "learning_rate": 3e-06, "loss": -0.0034, "step": 499 }, { "clip_ratio": 0.0, "epoch": 0.0013881254199079396, "grad_norm": 0.10044127702713013, "kl": 0.061088208109140396, "learning_rate": 3e-06, "loss": -0.0035, "step": 500 }, { "clip_ratio": 9.025270992424339e-05, "epoch": 0.0013909016707477555, "grad_norm": 0.09724526852369308, "kl": 0.06105843186378479, "learning_rate": 3e-06, "loss": -0.0036, "step": 501 }, { "clip_ratio": 0.00047674778033979237, "epoch": 0.0013936779215875713, "grad_norm": 0.1106707900762558, "kl": 0.05963746830821037, "learning_rate": 3e-06, "loss": -0.0037, "step": 502 }, { "clip_ratio": 0.0, "epoch": 0.0013964541724273871, "grad_norm": 0.28701546788215637, "kl": 0.058546338230371475, "learning_rate": 3e-06, "loss": -0.0043, "step": 503 }, { "clip_ratio": 0.0002860495515051298, "epoch": 0.001399230423267203, "grad_norm": 0.11430586874485016, "kl": 0.05745413526892662, "learning_rate": 3e-06, "loss": -0.0037, "step": 504 }, { "clip_ratio": 0.0004231043276377022, "completion_length": 207.95833587646484, "epoch": 0.0014020066741070188, "grad_norm": 0.1265731304883957, "kl": 0.06725966557860374, "learning_rate": 3e-06, "loss": -0.0148, "reward": 0.10833334177732468, "reward_std": 0.0599165465682745, "rewards/countdown_reward_func": 0.10833334177732468, "step": 505, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0001892800792120397, "epoch": 0.0014047829249468349, "grad_norm": 0.07790236175060272, "kl": 0.06213269755244255, "learning_rate": 3e-06, "loss": -0.0152, "step": 506 }, { "clip_ratio": 0.0, "epoch": 0.0014075591757866507, "grad_norm": 0.10863670706748962, "kl": 0.05851333029568195, "learning_rate": 3e-06, "loss": -0.0152, "step": 507 }, { "clip_ratio": 0.00011870844900840893, "epoch": 0.0014103354266264666, "grad_norm": 0.11995543539524078, "kl": 0.0762905403971672, "learning_rate": 3e-06, "loss": -0.0152, "step": 508 }, { "clip_ratio": 0.0, "epoch": 0.0014131116774662824, "grad_norm": 0.10893931984901428, "kl": 0.07419506646692753, "learning_rate": 3e-06, "loss": -0.0151, "step": 509 }, { "clip_ratio": 0.0002796600092551671, "epoch": 0.0014158879283060982, "grad_norm": 0.11752152442932129, "kl": 0.05459407716989517, "learning_rate": 3e-06, "loss": -0.0165, "step": 510 }, { "clip_ratio": 0.00028186137205921113, "epoch": 0.0014186641791459143, "grad_norm": 0.10197150707244873, "kl": 0.0478304848074913, "learning_rate": 3e-06, "loss": -0.0171, "step": 511 }, { "clip_ratio": 0.0010716330725699663, "epoch": 0.0014214404299857301, "grad_norm": 0.06922204792499542, "kl": 0.042900703847408295, "learning_rate": 3e-06, "loss": -0.0171, "step": 512 }, { "clip_ratio": 0.00039922940777614713, "epoch": 0.001424216680825546, "grad_norm": 0.09703180193901062, "kl": 0.03880976140499115, "learning_rate": 3e-06, "loss": -0.0185, "step": 513 }, { "clip_ratio": 0.0022654032800346613, "epoch": 0.0014269929316653618, "grad_norm": 0.10023833066225052, "kl": 0.04965279810130596, "learning_rate": 3e-06, "loss": -0.0183, "step": 514 }, { "clip_ratio": 0.0015507703792536631, "epoch": 0.0014297691825051776, "grad_norm": 0.0990394875407219, "kl": 0.04965635947883129, "learning_rate": 3e-06, "loss": -0.0188, "step": 515 }, { "clip_ratio": 0.0020738598541356623, "epoch": 0.0014325454333449935, "grad_norm": 0.1003248319029808, "kl": 0.03512744698673487, "learning_rate": 3e-06, "loss": -0.0204, "step": 516 }, { "clip_ratio": 0.00010129659494850785, "completion_length": 233.4166717529297, "epoch": 0.0014353216841848095, "grad_norm": 0.09848001599311829, "kl": 0.026790697127580643, "learning_rate": 3e-06, "loss": -0.0064, "reward": 0.2250000238418579, "reward_std": 0.2915927767753601, "rewards/countdown_reward_func": 0.22500000894069672, "step": 517, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0002025931898970157, "epoch": 0.0014380979350246254, "grad_norm": 0.10145186632871628, "kl": 0.024240675382316113, "learning_rate": 3e-06, "loss": -0.0064, "step": 518 }, { "clip_ratio": 0.0, "epoch": 0.0014408741858644412, "grad_norm": 0.09955999255180359, "kl": 0.02282124198973179, "learning_rate": 3e-06, "loss": -0.0068, "step": 519 }, { "clip_ratio": 0.0001803751802071929, "epoch": 0.001443650436704257, "grad_norm": 0.11280816793441772, "kl": 0.024487541057169437, "learning_rate": 3e-06, "loss": -0.0059, "step": 520 }, { "clip_ratio": 0.0003646711993496865, "epoch": 0.001446426687544073, "grad_norm": 0.10844244807958603, "kl": 0.025233074091374874, "learning_rate": 3e-06, "loss": -0.0072, "step": 521 }, { "clip_ratio": 8.722958591533825e-05, "epoch": 0.001449202938383889, "grad_norm": 0.16373442113399506, "kl": 0.024108163081109524, "learning_rate": 3e-06, "loss": -0.0069, "step": 522 }, { "clip_ratio": 0.0006587411771761253, "epoch": 0.0014519791892237048, "grad_norm": 0.09821134060621262, "kl": 0.022494456730782986, "learning_rate": 3e-06, "loss": -0.0072, "step": 523 }, { "clip_ratio": 0.0, "epoch": 0.0014547554400635206, "grad_norm": 0.1120600625872612, "kl": 0.021051418967545033, "learning_rate": 3e-06, "loss": -0.0086, "step": 524 }, { "clip_ratio": 0.00027575576677918434, "epoch": 0.0014575316909033365, "grad_norm": 0.09990929067134857, "kl": 0.020683609880506992, "learning_rate": 3e-06, "loss": -0.0081, "step": 525 }, { "clip_ratio": 0.0008699278841959313, "epoch": 0.0014603079417431523, "grad_norm": 0.1135360524058342, "kl": 0.02212886419147253, "learning_rate": 3e-06, "loss": -0.0077, "step": 526 }, { "clip_ratio": 0.0009819942351896316, "epoch": 0.0014630841925829681, "grad_norm": 0.09932764619588852, "kl": 0.02397053875029087, "learning_rate": 3e-06, "loss": -0.008, "step": 527 }, { "clip_ratio": 0.0010553163010627031, "epoch": 0.0014658604434227842, "grad_norm": 0.17817182838916779, "kl": 0.02317346353083849, "learning_rate": 3e-06, "loss": -0.008, "step": 528 }, { "clip_ratio": 0.00024488919734722003, "completion_length": 238.56250762939453, "epoch": 0.0014686366942626, "grad_norm": 0.06560882925987244, "kl": 0.025266059674322605, "learning_rate": 3e-06, "loss": -0.0058, "reward": 0.19166668504476547, "reward_std": 0.15476077422499657, "rewards/countdown_reward_func": 0.19166668504476547, "step": 529, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0, "epoch": 0.0014714129451024159, "grad_norm": 0.06739655882120132, "kl": 0.02375541441142559, "learning_rate": 3e-06, "loss": -0.0055, "step": 530 }, { "clip_ratio": 0.00017831669538281858, "epoch": 0.0014741891959422317, "grad_norm": 0.10260723531246185, "kl": 0.028817658312618732, "learning_rate": 3e-06, "loss": -0.0052, "step": 531 }, { "clip_ratio": 0.00017053855845006183, "epoch": 0.0014769654467820476, "grad_norm": 0.0644664391875267, "kl": 0.028914256952703, "learning_rate": 3e-06, "loss": -0.0053, "step": 532 }, { "clip_ratio": 9.03179170563817e-05, "epoch": 0.0014797416976218636, "grad_norm": 0.11235247552394867, "kl": 0.024057154543697834, "learning_rate": 3e-06, "loss": -0.0055, "step": 533 }, { "clip_ratio": 0.0001695947503321804, "epoch": 0.0014825179484616795, "grad_norm": 0.20861512422561646, "kl": 0.027661575004458427, "learning_rate": 3e-06, "loss": -0.0074, "step": 534 }, { "clip_ratio": 0.00025097496109083295, "epoch": 0.0014852941993014953, "grad_norm": 0.06489825993776321, "kl": 0.026025223545730114, "learning_rate": 3e-06, "loss": -0.0062, "step": 535 }, { "clip_ratio": 0.0007896195165812969, "epoch": 0.0014880704501413111, "grad_norm": 0.07131274044513702, "kl": 0.025113885290920734, "learning_rate": 3e-06, "loss": -0.0059, "step": 536 }, { "clip_ratio": 0.000509865116328001, "epoch": 0.001490846700981127, "grad_norm": 0.10190945118665695, "kl": 0.028591866604983807, "learning_rate": 3e-06, "loss": -0.0073, "step": 537 }, { "clip_ratio": 0.0011356433387845755, "epoch": 0.001493622951820943, "grad_norm": 0.07074970752000809, "kl": 0.027634769678115845, "learning_rate": 3e-06, "loss": -0.0058, "step": 538 }, { "clip_ratio": 0.000357331897248514, "epoch": 0.0014963992026607589, "grad_norm": 0.09467112272977829, "kl": 0.02522993925958872, "learning_rate": 3e-06, "loss": -0.0074, "step": 539 }, { "clip_ratio": 0.0012593485007528216, "epoch": 0.0014991754535005747, "grad_norm": 0.22211527824401855, "kl": 0.027548625133931637, "learning_rate": 3e-06, "loss": -0.0112, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 235.39584350585938, "epoch": 0.0015019517043403905, "grad_norm": 0.07119705528020859, "kl": 0.02431309036910534, "learning_rate": 3e-06, "loss": 0.0065, "reward": 0.30000002682209015, "reward_std": 0.22795327007770538, "rewards/countdown_reward_func": 0.30000001192092896, "step": 541, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00016297261754516512, "epoch": 0.0015047279551802064, "grad_norm": 0.09539937227964401, "kl": 0.025972411036491394, "learning_rate": 3e-06, "loss": 0.006, "step": 542 }, { "clip_ratio": 0.0, "epoch": 0.0015075042060200222, "grad_norm": 0.1664063185453415, "kl": 0.024256471544504166, "learning_rate": 3e-06, "loss": 0.0062, "step": 543 }, { "clip_ratio": 0.0, "epoch": 0.0015102804568598383, "grad_norm": 0.07563801109790802, "kl": 0.02507698815315962, "learning_rate": 3e-06, "loss": 0.0052, "step": 544 }, { "clip_ratio": 0.0, "epoch": 0.0015130567076996541, "grad_norm": 0.07607442140579224, "kl": 0.02424045093357563, "learning_rate": 3e-06, "loss": 0.0058, "step": 545 }, { "clip_ratio": 0.0006382791907526553, "epoch": 0.00151583295853947, "grad_norm": 0.15590515732765198, "kl": 0.02763733733445406, "learning_rate": 3e-06, "loss": 0.0064, "step": 546 }, { "clip_ratio": 0.0003597766626626253, "epoch": 0.0015186092093792858, "grad_norm": 0.07136835902929306, "kl": 0.025670908391475677, "learning_rate": 3e-06, "loss": 0.0057, "step": 547 }, { "clip_ratio": 0.0004282992740627378, "epoch": 0.0015213854602191016, "grad_norm": 0.08294139057397842, "kl": 0.027309386059641838, "learning_rate": 3e-06, "loss": 0.0059, "step": 548 }, { "clip_ratio": 0.0, "epoch": 0.0015241617110589177, "grad_norm": 0.1601613610982895, "kl": 0.025316720828413963, "learning_rate": 3e-06, "loss": 0.0056, "step": 549 }, { "clip_ratio": 0.0002441406322759576, "epoch": 0.0015269379618987335, "grad_norm": 0.0745542123913765, "kl": 0.025896431878209114, "learning_rate": 3e-06, "loss": 0.0051, "step": 550 }, { "clip_ratio": 0.0, "epoch": 0.0015297142127385494, "grad_norm": 0.06619153916835785, "kl": 0.024953342974185944, "learning_rate": 3e-06, "loss": 0.005, "step": 551 }, { "clip_ratio": 0.000708240841049701, "epoch": 0.0015324904635783652, "grad_norm": 0.13466133177280426, "kl": 0.028307722881436348, "learning_rate": 3e-06, "loss": 0.0054, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 234.81250762939453, "epoch": 0.001535266714418181, "grad_norm": 0.09829782694578171, "kl": 0.02855612989515066, "learning_rate": 3e-06, "loss": 0.0059, "reward": 0.19166667014360428, "reward_std": 0.19716466218233109, "rewards/countdown_reward_func": 0.19166666269302368, "step": 553, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00018876852846005931, "epoch": 0.001538042965257997, "grad_norm": 0.07418268173933029, "kl": 0.026963720098137856, "learning_rate": 3e-06, "loss": 0.0054, "step": 554 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.001540819216097813, "grad_norm": 0.07878899574279785, "kl": 0.028570099733769894, "learning_rate": 3e-06, "loss": 0.006, "step": 555 }, { "clip_ratio": 0.0, "epoch": 0.0015435954669376288, "grad_norm": 0.10893654823303223, "kl": 0.027858249843120575, "learning_rate": 3e-06, "loss": 0.0057, "step": 556 }, { "clip_ratio": 0.0, "epoch": 0.0015463717177774446, "grad_norm": 0.07773488014936447, "kl": 0.02769723255187273, "learning_rate": 3e-06, "loss": 0.0057, "step": 557 }, { "clip_ratio": 0.00019869584502885118, "epoch": 0.0015491479686172605, "grad_norm": 0.08286737650632858, "kl": 0.02732114028185606, "learning_rate": 3e-06, "loss": 0.0053, "step": 558 }, { "clip_ratio": 0.0, "epoch": 0.0015519242194570763, "grad_norm": 0.08795492351055145, "kl": 0.027023627422749996, "learning_rate": 3e-06, "loss": 0.0049, "step": 559 }, { "clip_ratio": 0.0, "epoch": 0.0015547004702968924, "grad_norm": 0.07500362396240234, "kl": 0.02461559884250164, "learning_rate": 3e-06, "loss": 0.0044, "step": 560 }, { "clip_ratio": 0.00018876852846005931, "epoch": 0.0015574767211367082, "grad_norm": 0.08417128771543503, "kl": 0.026552588678896427, "learning_rate": 3e-06, "loss": 0.0052, "step": 561 }, { "clip_ratio": 0.0, "epoch": 0.001560252971976524, "grad_norm": 0.07278092950582504, "kl": 0.024690870195627213, "learning_rate": 3e-06, "loss": 0.0046, "step": 562 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0015630292228163399, "grad_norm": 0.07505667209625244, "kl": 0.02481877151876688, "learning_rate": 3e-06, "loss": 0.0054, "step": 563 }, { "clip_ratio": 0.00010738831770140678, "epoch": 0.0015658054736561557, "grad_norm": 0.08289463818073273, "kl": 0.024328703992068768, "learning_rate": 3e-06, "loss": 0.0044, "step": 564 }, { "clip_ratio": 0.00019328358030179515, "completion_length": 228.62500762939453, "epoch": 0.0015685817244959716, "grad_norm": 0.09321510046720505, "kl": 0.02534264326095581, "learning_rate": 3e-06, "loss": -0.0105, "reward": 0.27916668355464935, "reward_std": 0.28499096632003784, "rewards/countdown_reward_func": 0.27916668355464935, "step": 565, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0015713579753357876, "grad_norm": 0.09789825975894928, "kl": 0.023500449024140835, "learning_rate": 3e-06, "loss": -0.0099, "step": 566 }, { "clip_ratio": 0.0, "epoch": 0.0015741342261756035, "grad_norm": 0.12602142989635468, "kl": 0.024465853348374367, "learning_rate": 3e-06, "loss": -0.01, "step": 567 }, { "clip_ratio": 0.00026942871045321226, "epoch": 0.0015769104770154193, "grad_norm": 0.07588708400726318, "kl": 0.021980220451951027, "learning_rate": 3e-06, "loss": -0.0094, "step": 568 }, { "clip_ratio": 0.0, "epoch": 0.0015796867278552351, "grad_norm": 0.10751640051603317, "kl": 0.02414284460246563, "learning_rate": 3e-06, "loss": -0.0093, "step": 569 }, { "clip_ratio": 0.0, "epoch": 0.001582462978695051, "grad_norm": 0.12433457374572754, "kl": 0.02209012396633625, "learning_rate": 3e-06, "loss": -0.0097, "step": 570 }, { "clip_ratio": 0.0, "epoch": 0.001585239229534867, "grad_norm": 0.1603914499282837, "kl": 0.02457761950790882, "learning_rate": 3e-06, "loss": -0.0102, "step": 571 }, { "clip_ratio": 0.0, "epoch": 0.0015880154803746829, "grad_norm": 0.09691650420427322, "kl": 0.023993924260139465, "learning_rate": 3e-06, "loss": -0.0106, "step": 572 }, { "clip_ratio": 0.0, "epoch": 0.0015907917312144987, "grad_norm": 0.13051147758960724, "kl": 0.02514663338661194, "learning_rate": 3e-06, "loss": -0.0105, "step": 573 }, { "clip_ratio": 0.00018027036276180297, "epoch": 0.0015935679820543145, "grad_norm": 0.14570623636245728, "kl": 0.023281600326299667, "learning_rate": 3e-06, "loss": -0.0104, "step": 574 }, { "clip_ratio": 8.698677993379533e-05, "epoch": 0.0015963442328941304, "grad_norm": 0.10116558521986008, "kl": 0.026070833206176758, "learning_rate": 3e-06, "loss": -0.0109, "step": 575 }, { "clip_ratio": 8.698677993379533e-05, "epoch": 0.0015991204837339462, "grad_norm": 0.1298682689666748, "kl": 0.02464818675071001, "learning_rate": 3e-06, "loss": -0.0109, "step": 576 }, { "clip_ratio": 0.0005351027357392013, "completion_length": 240.83333587646484, "epoch": 0.0016018967345737623, "grad_norm": 0.05101247504353523, "kl": 0.02285961899906397, "learning_rate": 3e-06, "loss": 0.0031, "reward": 0.2083333507180214, "reward_std": 0.16015682369470596, "rewards/countdown_reward_func": 0.2083333507180214, "step": 577, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0016046729854135781, "grad_norm": 0.07094918191432953, "kl": 0.023334643803536892, "learning_rate": 3e-06, "loss": 0.0029, "step": 578 }, { "clip_ratio": 0.0, "epoch": 0.001607449236253394, "grad_norm": 0.047830481082201004, "kl": 0.0244672242552042, "learning_rate": 3e-06, "loss": 0.003, "step": 579 }, { "clip_ratio": 0.0, "epoch": 0.0016102254870932098, "grad_norm": 0.06464120745658875, "kl": 0.024777245707809925, "learning_rate": 3e-06, "loss": 0.0036, "step": 580 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0016130017379330256, "grad_norm": 0.062469832599163055, "kl": 0.02657880913466215, "learning_rate": 3e-06, "loss": 0.0032, "step": 581 }, { "clip_ratio": 0.0007138257715268992, "epoch": 0.0016157779887728417, "grad_norm": 0.07282475382089615, "kl": 0.026876126416027546, "learning_rate": 3e-06, "loss": 0.0027, "step": 582 }, { "clip_ratio": 0.00042808218859136105, "epoch": 0.0016185542396126575, "grad_norm": 0.057990189641714096, "kl": 0.025540747679769993, "learning_rate": 3e-06, "loss": 0.0029, "step": 583 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0016213304904524734, "grad_norm": 0.07290374487638474, "kl": 0.02545579057186842, "learning_rate": 3e-06, "loss": 0.0028, "step": 584 }, { "clip_ratio": 0.0, "epoch": 0.0016241067412922892, "grad_norm": 0.04659357666969299, "kl": 0.026573507115244865, "learning_rate": 3e-06, "loss": 0.0028, "step": 585 }, { "clip_ratio": 0.0006510416860692203, "epoch": 0.001626882992132105, "grad_norm": 0.05872870981693268, "kl": 0.02712887153029442, "learning_rate": 3e-06, "loss": 0.0031, "step": 586 }, { "clip_ratio": 0.0, "epoch": 0.001629659242971921, "grad_norm": 0.05923086777329445, "kl": 0.02828708291053772, "learning_rate": 3e-06, "loss": 0.0029, "step": 587 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.001632435493811737, "grad_norm": 0.07497607171535492, "kl": 0.02832121029496193, "learning_rate": 3e-06, "loss": 0.002, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 240.33333587646484, "epoch": 0.0016352117446515528, "grad_norm": 0.12389404326677322, "kl": 0.028777985833585262, "learning_rate": 3e-06, "loss": 0.0163, "reward": 0.3583333417773247, "reward_std": 0.36252936720848083, "rewards/countdown_reward_func": 0.3583333268761635, "step": 589, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0016379879954913686, "grad_norm": 0.16146422922611237, "kl": 0.029465525411069393, "learning_rate": 3e-06, "loss": 0.0157, "step": 590 }, { "clip_ratio": 0.0002460629912093282, "epoch": 0.0016407642463311845, "grad_norm": 0.11740246415138245, "kl": 0.02861588355153799, "learning_rate": 3e-06, "loss": 0.0157, "step": 591 }, { "clip_ratio": 8.674531272845343e-05, "epoch": 0.0016435404971710003, "grad_norm": 0.11579007655382156, "kl": 0.02733410894870758, "learning_rate": 3e-06, "loss": 0.0159, "step": 592 }, { "clip_ratio": 0.0, "epoch": 0.0016463167480108164, "grad_norm": 0.09282111376523972, "kl": 0.029231702908873558, "learning_rate": 3e-06, "loss": 0.0151, "step": 593 }, { "clip_ratio": 0.0002602359454613179, "epoch": 0.0016490929988506322, "grad_norm": 0.15394672751426697, "kl": 0.029871191829442978, "learning_rate": 3e-06, "loss": 0.0157, "step": 594 }, { "clip_ratio": 8.202099706977606e-05, "epoch": 0.001651869249690448, "grad_norm": 0.12174469977617264, "kl": 0.027274416759610176, "learning_rate": 3e-06, "loss": 0.0142, "step": 595 }, { "clip_ratio": 0.0, "epoch": 0.0016546455005302639, "grad_norm": 0.20378051698207855, "kl": 0.026893497444689274, "learning_rate": 3e-06, "loss": 0.0129, "step": 596 }, { "clip_ratio": 0.00016404199413955212, "epoch": 0.0016574217513700797, "grad_norm": 0.11612839996814728, "kl": 0.027020023204386234, "learning_rate": 3e-06, "loss": 0.0138, "step": 597 }, { "clip_ratio": 8.555784006603062e-05, "epoch": 0.0016601980022098958, "grad_norm": 0.10161249339580536, "kl": 0.025793558917939663, "learning_rate": 3e-06, "loss": 0.013, "step": 598 }, { "clip_ratio": 8.555784006603062e-05, "epoch": 0.0016629742530497116, "grad_norm": 0.09491390734910965, "kl": 0.027327225543558598, "learning_rate": 3e-06, "loss": 0.0133, "step": 599 }, { "clip_ratio": 0.000265891409071628, "epoch": 0.0016657505038895275, "grad_norm": 0.13563629984855652, "kl": 0.027993053197860718, "learning_rate": 3e-06, "loss": 0.0133, "step": 600 }, { "clip_ratio": 8.322236681124195e-05, "completion_length": 238.75000762939453, "epoch": 0.0016685267547293433, "grad_norm": 0.09000701457262039, "kl": 0.028913519345223904, "learning_rate": 3e-06, "loss": 0.0272, "reward": 0.2083333432674408, "reward_std": 0.24819570034742355, "rewards/countdown_reward_func": 0.2083333283662796, "step": 601, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0016713030055691591, "grad_norm": 0.08314099907875061, "kl": 0.025995067320764065, "learning_rate": 3e-06, "loss": 0.0262, "step": 602 }, { "clip_ratio": 0.0, "epoch": 0.001674079256408975, "grad_norm": 0.1207355260848999, "kl": 0.03479589242488146, "learning_rate": 3e-06, "loss": 0.0271, "step": 603 }, { "clip_ratio": 0.0, "epoch": 0.001676855507248791, "grad_norm": 0.11306949704885483, "kl": 0.030153939500451088, "learning_rate": 3e-06, "loss": 0.0267, "step": 604 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0016796317580886069, "grad_norm": 0.08935578912496567, "kl": 0.028079458512365818, "learning_rate": 3e-06, "loss": 0.0258, "step": 605 }, { "clip_ratio": 0.00024782493710517883, "epoch": 0.0016824080089284227, "grad_norm": 0.0978025496006012, "kl": 0.029070213437080383, "learning_rate": 3e-06, "loss": 0.0253, "step": 606 }, { "clip_ratio": 0.0005404279145295732, "epoch": 0.0016851842597682385, "grad_norm": 0.0878811702132225, "kl": 0.03100433386862278, "learning_rate": 3e-06, "loss": 0.025, "step": 607 }, { "clip_ratio": 0.00018288222781848162, "epoch": 0.0016879605106080544, "grad_norm": 0.09215917438268661, "kl": 0.02795230783522129, "learning_rate": 3e-06, "loss": 0.0248, "step": 608 }, { "clip_ratio": 9.144111390924081e-05, "epoch": 0.0016907367614478704, "grad_norm": 0.12471023947000504, "kl": 0.03957472741603851, "learning_rate": 3e-06, "loss": 0.0252, "step": 609 }, { "clip_ratio": 8.322236681124195e-05, "epoch": 0.0016935130122876863, "grad_norm": 0.1100461557507515, "kl": 0.0361151285469532, "learning_rate": 3e-06, "loss": 0.0234, "step": 610 }, { "clip_ratio": 0.0, "epoch": 0.0016962892631275021, "grad_norm": 0.08403483778238297, "kl": 0.03325536102056503, "learning_rate": 3e-06, "loss": 0.0233, "step": 611 }, { "clip_ratio": 0.0007063246885081753, "epoch": 0.001699065513967318, "grad_norm": 0.09051875025033951, "kl": 0.034811416640877724, "learning_rate": 3e-06, "loss": 0.0225, "step": 612 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 243.9791717529297, "epoch": 0.0017018417648071338, "grad_norm": 0.09752815216779709, "kl": 0.0324931014329195, "learning_rate": 3e-06, "loss": 0.0035, "reward": 0.20625001937150955, "reward_std": 0.2455231510102749, "rewards/countdown_reward_func": 0.20625000447034836, "step": 613, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00018611777340993285, "epoch": 0.0017046180156469496, "grad_norm": 0.10532078891992569, "kl": 0.03395223990082741, "learning_rate": 3e-06, "loss": 0.0041, "step": 614 }, { "clip_ratio": 0.0, "epoch": 0.0017073942664867657, "grad_norm": 0.0800807774066925, "kl": 0.034072598442435265, "learning_rate": 3e-06, "loss": 0.0038, "step": 615 }, { "clip_ratio": 0.00025160193035844713, "epoch": 0.0017101705173265815, "grad_norm": 0.11874743551015854, "kl": 0.03573352470993996, "learning_rate": 3e-06, "loss": 0.0044, "step": 616 }, { "clip_ratio": 8.164597966242582e-05, "epoch": 0.0017129467681663974, "grad_norm": 0.08836469054222107, "kl": 0.06380523927509785, "learning_rate": 3e-06, "loss": 0.0045, "step": 617 }, { "clip_ratio": 0.0, "epoch": 0.0017157230190062132, "grad_norm": 0.08665862679481506, "kl": 0.045055605471134186, "learning_rate": 3e-06, "loss": 0.0032, "step": 618 }, { "clip_ratio": 0.0003530426474753767, "epoch": 0.001718499269846029, "grad_norm": 0.12727470695972443, "kl": 0.040752191096544266, "learning_rate": 3e-06, "loss": 0.004, "step": 619 }, { "clip_ratio": 0.0, "epoch": 0.0017212755206858451, "grad_norm": 0.11716008186340332, "kl": 0.042310649529099464, "learning_rate": 3e-06, "loss": 0.0028, "step": 620 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.001724051771525661, "grad_norm": 0.07381536811590195, "kl": 0.041947562247514725, "learning_rate": 3e-06, "loss": 0.0031, "step": 621 }, { "clip_ratio": 0.0001702217195997946, "epoch": 0.0017268280223654768, "grad_norm": 0.09706465154886246, "kl": 0.042636461555957794, "learning_rate": 3e-06, "loss": 0.0047, "step": 622 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0017296042732052926, "grad_norm": 0.08298773318529129, "kl": 0.07458402588963509, "learning_rate": 3e-06, "loss": 0.0036, "step": 623 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0017323805240451085, "grad_norm": 0.08479683846235275, "kl": 0.05383196100592613, "learning_rate": 3e-06, "loss": 0.0025, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 227.9166717529297, "epoch": 0.0017351567748849243, "grad_norm": 0.09818287193775177, "kl": 0.049956170842051506, "learning_rate": 3e-06, "loss": 0.0324, "reward": 0.20625001937150955, "reward_std": 0.20693624019622803, "rewards/countdown_reward_func": 0.20625000447034836, "step": 625, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003646973054856062, "epoch": 0.0017379330257247404, "grad_norm": 0.11653796583414078, "kl": 0.0570333506911993, "learning_rate": 3e-06, "loss": 0.0323, "step": 626 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0017407092765645562, "grad_norm": 0.10893271118402481, "kl": 0.05227653309702873, "learning_rate": 3e-06, "loss": 0.0315, "step": 627 }, { "clip_ratio": 0.00045520756975747645, "epoch": 0.001743485527404372, "grad_norm": 0.1160278171300888, "kl": 0.05303039960563183, "learning_rate": 3e-06, "loss": 0.0313, "step": 628 }, { "clip_ratio": 8.159269054885954e-05, "epoch": 0.0017462617782441879, "grad_norm": 0.10207342356443405, "kl": 0.05501401796936989, "learning_rate": 3e-06, "loss": 0.0314, "step": 629 }, { "clip_ratio": 0.0, "epoch": 0.0017490380290840037, "grad_norm": 0.08500054478645325, "kl": 0.06589866057038307, "learning_rate": 3e-06, "loss": 0.0306, "step": 630 }, { "clip_ratio": 9.117432637140155e-05, "epoch": 0.0017518142799238198, "grad_norm": 0.09498704224824905, "kl": 0.061905499547719955, "learning_rate": 3e-06, "loss": 0.0309, "step": 631 }, { "clip_ratio": 0.00027352297911420465, "epoch": 0.0017545905307636356, "grad_norm": 0.10427255183458328, "kl": 0.07243792712688446, "learning_rate": 3e-06, "loss": 0.0298, "step": 632 }, { "clip_ratio": 0.00017255453713005409, "epoch": 0.0017573667816034515, "grad_norm": 0.11053306609392166, "kl": 0.07044854387640953, "learning_rate": 3e-06, "loss": 0.0295, "step": 633 }, { "clip_ratio": 0.00010434056457597762, "epoch": 0.0017601430324432673, "grad_norm": 0.11823248863220215, "kl": 0.07041215151548386, "learning_rate": 3e-06, "loss": 0.0289, "step": 634 }, { "clip_ratio": 0.0005901292752241716, "epoch": 0.0017629192832830831, "grad_norm": 0.09512235224246979, "kl": 0.07614094018936157, "learning_rate": 3e-06, "loss": 0.0288, "step": 635 }, { "clip_ratio": 0.00029720802558586, "epoch": 0.001765695534122899, "grad_norm": 0.07722548395395279, "kl": 0.09275055304169655, "learning_rate": 3e-06, "loss": 0.0284, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 237.89583587646484, "epoch": 0.001768471784962715, "grad_norm": 0.0991094708442688, "kl": 0.10344970971345901, "learning_rate": 3e-06, "loss": 0.0205, "reward": 0.19166666269302368, "reward_std": 0.1672205552458763, "rewards/countdown_reward_func": 0.19166666269302368, "step": 637, "zero_std_ratio": 0.5 }, { "clip_ratio": 8.896797226043418e-05, "epoch": 0.0017712480358025309, "grad_norm": 0.10992806404829025, "kl": 0.10585097968578339, "learning_rate": 3e-06, "loss": 0.0203, "step": 638 }, { "clip_ratio": 0.0, "epoch": 0.0017740242866423467, "grad_norm": 0.07519710063934326, "kl": 0.1175805889070034, "learning_rate": 3e-06, "loss": 0.0194, "step": 639 }, { "clip_ratio": 0.00017600550199858844, "epoch": 0.0017768005374821625, "grad_norm": 0.15158335864543915, "kl": 0.1538476049900055, "learning_rate": 3e-06, "loss": 0.0215, "step": 640 }, { "clip_ratio": 9.462528396397829e-05, "epoch": 0.0017795767883219784, "grad_norm": 0.07322803884744644, "kl": 0.14574335515499115, "learning_rate": 3e-06, "loss": 0.021, "step": 641 }, { "clip_ratio": 0.0005492736818268895, "epoch": 0.0017823530391617944, "grad_norm": 0.12571430206298828, "kl": 0.15892429277300835, "learning_rate": 3e-06, "loss": 0.0215, "step": 642 }, { "clip_ratio": 0.0, "epoch": 0.0017851292900016103, "grad_norm": 0.0937129482626915, "kl": 0.14000995457172394, "learning_rate": 3e-06, "loss": 0.0194, "step": 643 }, { "clip_ratio": 0.00017034818301908672, "epoch": 0.0017879055408414261, "grad_norm": 0.10581913590431213, "kl": 0.13933642953634262, "learning_rate": 3e-06, "loss": 0.0194, "step": 644 }, { "clip_ratio": 0.00018925056792795658, "epoch": 0.001790681791681242, "grad_norm": 0.09151846915483475, "kl": 0.14892030507326126, "learning_rate": 3e-06, "loss": 0.0194, "step": 645 }, { "clip_ratio": 0.0002573857200331986, "epoch": 0.0017934580425210578, "grad_norm": 0.16274204850196838, "kl": 0.1857452318072319, "learning_rate": 3e-06, "loss": 0.0201, "step": 646 }, { "clip_ratio": 0.0, "epoch": 0.0017962342933608736, "grad_norm": 0.07416386157274246, "kl": 0.17207730561494827, "learning_rate": 3e-06, "loss": 0.0196, "step": 647 }, { "clip_ratio": 0.0008967332323663868, "epoch": 0.0017990105442006897, "grad_norm": 0.11084357649087906, "kl": 0.17409475147724152, "learning_rate": 3e-06, "loss": 0.02, "step": 648 }, { "clip_ratio": 9.999999747378752e-05, "completion_length": 226.64584350585938, "epoch": 0.0018017867950405055, "grad_norm": 0.15701425075531006, "kl": 0.16254248470067978, "learning_rate": 3e-06, "loss": 0.0062, "reward": 0.2770833596587181, "reward_std": 0.3268149420619011, "rewards/countdown_reward_func": 0.2770833596587181, "step": 649, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.999999747378752e-05, "epoch": 0.0018045630458803214, "grad_norm": 0.14769934117794037, "kl": 0.15972411632537842, "learning_rate": 3e-06, "loss": 0.0047, "step": 650 }, { "clip_ratio": 0.0, "epoch": 0.0018073392967201372, "grad_norm": 0.15065068006515503, "kl": 0.15974202752113342, "learning_rate": 3e-06, "loss": 0.0055, "step": 651 }, { "clip_ratio": 0.00018544086924521253, "epoch": 0.001810115547559953, "grad_norm": 0.25048333406448364, "kl": 0.1500309482216835, "learning_rate": 3e-06, "loss": 0.0031, "step": 652 }, { "clip_ratio": 8.638562576379627e-05, "epoch": 0.0018128917983997691, "grad_norm": 0.16509313881397247, "kl": 0.14627444744110107, "learning_rate": 3e-06, "loss": 0.0038, "step": 653 }, { "clip_ratio": 0.00018638561596162617, "epoch": 0.001815668049239585, "grad_norm": 0.2661731243133545, "kl": 0.1369135081768036, "learning_rate": 3e-06, "loss": 0.0038, "step": 654 }, { "clip_ratio": 9.999999747378752e-05, "epoch": 0.0018184443000794008, "grad_norm": 0.14947877824306488, "kl": 0.13618455082178116, "learning_rate": 3e-06, "loss": 0.0042, "step": 655 }, { "clip_ratio": 0.00019307520415168256, "epoch": 0.0018212205509192166, "grad_norm": 0.16721273958683014, "kl": 0.12803955748677254, "learning_rate": 3e-06, "loss": 0.0031, "step": 656 }, { "clip_ratio": 0.00010548523277975619, "epoch": 0.0018239968017590325, "grad_norm": 0.13844816386699677, "kl": 0.12379540503025055, "learning_rate": 3e-06, "loss": 0.0029, "step": 657 }, { "clip_ratio": 0.0003572673595044762, "epoch": 0.0018267730525988483, "grad_norm": 0.24204890429973602, "kl": 0.10973387956619263, "learning_rate": 3e-06, "loss": -0.003, "step": 658 }, { "clip_ratio": 0.0006522906478494406, "epoch": 0.0018295493034386644, "grad_norm": 0.1522047072649002, "kl": 0.10548713058233261, "learning_rate": 3e-06, "loss": -0.0003, "step": 659 }, { "clip_ratio": 0.001921730930916965, "epoch": 0.0018323255542784802, "grad_norm": 0.25724494457244873, "kl": 0.09767122194170952, "learning_rate": 3e-06, "loss": -0.0013, "step": 660 }, { "clip_ratio": 9.051412052940577e-05, "completion_length": 218.7916717529297, "epoch": 0.001835101805118296, "grad_norm": 0.11165298521518707, "kl": 0.0836155079305172, "learning_rate": 3e-06, "loss": 0.007, "reward": 0.2604166716337204, "reward_std": 0.26486171036958694, "rewards/countdown_reward_func": 0.2604166716337204, "step": 661, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0018378780559581119, "grad_norm": 0.10689469426870346, "kl": 0.09136947244405746, "learning_rate": 3e-06, "loss": 0.0088, "step": 662 }, { "clip_ratio": 0.00019087179680354893, "epoch": 0.0018406543067979277, "grad_norm": 0.12443461269140244, "kl": 0.07553870230913162, "learning_rate": 3e-06, "loss": 0.0075, "step": 663 }, { "clip_ratio": 8.383634849451482e-05, "epoch": 0.0018434305576377438, "grad_norm": 0.1088162511587143, "kl": 0.07327612116932869, "learning_rate": 3e-06, "loss": 0.0079, "step": 664 }, { "clip_ratio": 0.0001824945939006284, "epoch": 0.0018462068084775596, "grad_norm": 0.09485284239053726, "kl": 0.07073798030614853, "learning_rate": 3e-06, "loss": 0.0066, "step": 665 }, { "clip_ratio": 0.00030053697992116213, "epoch": 0.0018489830593173755, "grad_norm": 0.13278953731060028, "kl": 0.06900656968355179, "learning_rate": 3e-06, "loss": 0.0067, "step": 666 }, { "clip_ratio": 9.865824540611356e-05, "epoch": 0.0018517593101571913, "grad_norm": 0.10497219115495682, "kl": 0.06802161037921906, "learning_rate": 3e-06, "loss": 0.0067, "step": 667 }, { "clip_ratio": 0.00021725406986661255, "epoch": 0.0018545355609970071, "grad_norm": 0.11255109310150146, "kl": 0.07535571232438087, "learning_rate": 3e-06, "loss": 0.0089, "step": 668 }, { "clip_ratio": 0.0005888359155505896, "epoch": 0.0018573118118368232, "grad_norm": 0.10095056146383286, "kl": 0.06463643535971642, "learning_rate": 3e-06, "loss": 0.0064, "step": 669 }, { "clip_ratio": 0.0005477168742800131, "epoch": 0.001860088062676639, "grad_norm": 0.09998640418052673, "kl": 0.06509601883590221, "learning_rate": 3e-06, "loss": 0.007, "step": 670 }, { "clip_ratio": 0.0001824945939006284, "epoch": 0.0018628643135164549, "grad_norm": 0.1530102789402008, "kl": 0.06189018301665783, "learning_rate": 3e-06, "loss": 0.0052, "step": 671 }, { "clip_ratio": 0.0002959747507702559, "epoch": 0.0018656405643562707, "grad_norm": 0.13745903968811035, "kl": 0.06357544660568237, "learning_rate": 3e-06, "loss": 0.0064, "step": 672 }, { "clip_ratio": 8.486082515446469e-05, "completion_length": 220.12500762939453, "epoch": 0.0018684168151960865, "grad_norm": 0.17126166820526123, "kl": 0.060750387609004974, "learning_rate": 3e-06, "loss": -0.0094, "reward": 0.26250001788139343, "reward_std": 0.2778630629181862, "rewards/countdown_reward_func": 0.26250001043081284, "step": 673, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0018711930660359024, "grad_norm": 0.11247212439775467, "kl": 0.059839921072125435, "learning_rate": 3e-06, "loss": -0.0094, "step": 674 }, { "clip_ratio": 0.0, "epoch": 0.0018739693168757184, "grad_norm": 0.2667171359062195, "kl": 0.06426364183425903, "learning_rate": 3e-06, "loss": -0.0092, "step": 675 }, { "clip_ratio": 9.36329597607255e-05, "epoch": 0.0018767455677155343, "grad_norm": 0.1640663743019104, "kl": 0.06390438973903656, "learning_rate": 3e-06, "loss": -0.0087, "step": 676 }, { "clip_ratio": 0.0001101321613532491, "epoch": 0.0018795218185553501, "grad_norm": 0.11860151588916779, "kl": 0.054045552387833595, "learning_rate": 3e-06, "loss": -0.0108, "step": 677 }, { "clip_ratio": 0.00019816032727248967, "epoch": 0.001882298069395166, "grad_norm": 0.1296575963497162, "kl": 0.059720540419220924, "learning_rate": 3e-06, "loss": -0.0096, "step": 678 }, { "clip_ratio": 0.00033944330061785877, "epoch": 0.0018850743202349818, "grad_norm": 0.1466340571641922, "kl": 0.05133458413183689, "learning_rate": 3e-06, "loss": -0.0108, "step": 679 }, { "clip_ratio": 8.486082515446469e-05, "epoch": 0.0018878505710747979, "grad_norm": 0.10148210823535919, "kl": 0.049093831330537796, "learning_rate": 3e-06, "loss": -0.0103, "step": 680 }, { "clip_ratio": 9.476876584812999e-05, "epoch": 0.0018906268219146137, "grad_norm": 0.20377697050571442, "kl": 0.05094917304813862, "learning_rate": 3e-06, "loss": -0.0118, "step": 681 }, { "clip_ratio": 0.0003458706341916695, "epoch": 0.0018934030727544295, "grad_norm": 0.15611247718334198, "kl": 0.050493909046053886, "learning_rate": 3e-06, "loss": -0.0106, "step": 682 }, { "clip_ratio": 0.0002511018610675819, "epoch": 0.0018961793235942454, "grad_norm": 0.1299794614315033, "kl": 0.04327445663511753, "learning_rate": 3e-06, "loss": -0.0112, "step": 683 }, { "clip_ratio": 0.0003479021688690409, "epoch": 0.0018989555744340612, "grad_norm": 0.12855900824069977, "kl": 0.04604472406208515, "learning_rate": 3e-06, "loss": -0.0124, "step": 684 }, { "clip_ratio": 0.00042530003702268004, "completion_length": 235.7916717529297, "epoch": 0.001901731825273877, "grad_norm": 0.13944292068481445, "kl": 0.04754863306879997, "learning_rate": 3e-06, "loss": 0.0191, "reward": 0.24791667610406876, "reward_std": 0.23748211562633514, "rewards/countdown_reward_func": 0.24791667610406876, "step": 685, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.001904508076113693, "grad_norm": 0.10677601397037506, "kl": 0.041842538863420486, "learning_rate": 3e-06, "loss": 0.0195, "step": 686 }, { "clip_ratio": 0.0, "epoch": 0.001907284326953509, "grad_norm": 0.08806996792554855, "kl": 0.04203086532652378, "learning_rate": 3e-06, "loss": 0.019, "step": 687 }, { "clip_ratio": 0.00018215480668004602, "epoch": 0.0019100605777933248, "grad_norm": 0.1156989261507988, "kl": 0.042531004175543785, "learning_rate": 3e-06, "loss": 0.0186, "step": 688 }, { "clip_ratio": 0.00018691782315727323, "epoch": 0.0019128368286331406, "grad_norm": 0.09394989162683487, "kl": 0.041263919323682785, "learning_rate": 3e-06, "loss": 0.0194, "step": 689 }, { "clip_ratio": 0.00026606619940139353, "epoch": 0.0019156130794729565, "grad_norm": 0.10816143453121185, "kl": 0.041256049647927284, "learning_rate": 3e-06, "loss": 0.0201, "step": 690 }, { "clip_ratio": 0.0005038198505644687, "epoch": 0.0019183893303127725, "grad_norm": 0.16718047857284546, "kl": 0.04300625994801521, "learning_rate": 3e-06, "loss": 0.0206, "step": 691 }, { "clip_ratio": 0.0001649922487558797, "epoch": 0.0019211655811525884, "grad_norm": 0.10943221300840378, "kl": 0.04007406160235405, "learning_rate": 3e-06, "loss": 0.0195, "step": 692 }, { "clip_ratio": 0.0, "epoch": 0.0019239418319924042, "grad_norm": 0.08448076248168945, "kl": 0.040019482374191284, "learning_rate": 3e-06, "loss": 0.0188, "step": 693 }, { "clip_ratio": 0.00025433551491005346, "epoch": 0.00192671808283222, "grad_norm": 0.11321469396352768, "kl": 0.04307270236313343, "learning_rate": 3e-06, "loss": 0.0183, "step": 694 }, { "clip_ratio": 0.0, "epoch": 0.0019294943336720359, "grad_norm": 0.09902040660381317, "kl": 0.042439911514520645, "learning_rate": 3e-06, "loss": 0.0188, "step": 695 }, { "clip_ratio": 0.0001846859959186986, "epoch": 0.0019322705845118517, "grad_norm": 0.11367753148078918, "kl": 0.04280451126396656, "learning_rate": 3e-06, "loss": 0.0197, "step": 696 }, { "clip_ratio": 0.00010229132749373093, "completion_length": 230.8541717529297, "epoch": 0.0019350468353516678, "grad_norm": 0.13595792651176453, "kl": 0.04034610837697983, "learning_rate": 3e-06, "loss": 0.0264, "reward": 0.26250001788139343, "reward_std": 0.2784983515739441, "rewards/countdown_reward_func": 0.26250000298023224, "step": 697, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0019378230861914836, "grad_norm": 0.1038818508386612, "kl": 0.040640873834490776, "learning_rate": 3e-06, "loss": 0.0261, "step": 698 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0019405993370312995, "grad_norm": 0.0944518968462944, "kl": 0.04026073403656483, "learning_rate": 3e-06, "loss": 0.0269, "step": 699 }, { "clip_ratio": 0.0, "epoch": 0.0019433755878711153, "grad_norm": 0.10623365640640259, "kl": 0.03956114687025547, "learning_rate": 3e-06, "loss": 0.0262, "step": 700 }, { "clip_ratio": 0.0, "epoch": 0.0019461518387109311, "grad_norm": 0.09136571735143661, "kl": 0.044885121285915375, "learning_rate": 3e-06, "loss": 0.0263, "step": 701 }, { "clip_ratio": 8.915834769140929e-05, "epoch": 0.0019489280895507472, "grad_norm": 0.12446179240942001, "kl": 0.04785134270787239, "learning_rate": 3e-06, "loss": 0.0254, "step": 702 }, { "clip_ratio": 0.0003792484931182116, "epoch": 0.001951704340390563, "grad_norm": 0.1446102112531662, "kl": 0.04690235108137131, "learning_rate": 3e-06, "loss": 0.0251, "step": 703 }, { "clip_ratio": 8.915834769140929e-05, "epoch": 0.0019544805912303786, "grad_norm": 0.11323653906583786, "kl": 0.048219822347164154, "learning_rate": 3e-06, "loss": 0.0248, "step": 704 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.001957256842070195, "grad_norm": 0.09300023317337036, "kl": 0.04829951003193855, "learning_rate": 3e-06, "loss": 0.0253, "step": 705 }, { "clip_ratio": 0.0, "epoch": 0.0019600330929100108, "grad_norm": 0.09304996579885483, "kl": 0.04665645770728588, "learning_rate": 3e-06, "loss": 0.0243, "step": 706 }, { "clip_ratio": 0.0004711005021817982, "epoch": 0.0019628093437498266, "grad_norm": 0.10053187608718872, "kl": 0.056041302159428596, "learning_rate": 3e-06, "loss": 0.0249, "step": 707 }, { "clip_ratio": 8.915834769140929e-05, "epoch": 0.0019655855945896424, "grad_norm": 0.11689490079879761, "kl": 0.05928555130958557, "learning_rate": 3e-06, "loss": 0.023, "step": 708 }, { "clip_ratio": 0.0001020408162730746, "completion_length": 217.87500762939453, "epoch": 0.0019683618454294583, "grad_norm": 0.12107165902853012, "kl": 0.059857327491045, "learning_rate": 3e-06, "loss": 0.0018, "reward": 0.2875000312924385, "reward_std": 0.26614009588956833, "rewards/countdown_reward_func": 0.2875000312924385, "step": 709, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.001971138096269274, "grad_norm": 0.1230737566947937, "kl": 0.06688201427459717, "learning_rate": 3e-06, "loss": 0.0023, "step": 710 }, { "clip_ratio": 0.0003445422335062176, "epoch": 0.00197391434710909, "grad_norm": 0.09549138695001602, "kl": 0.06407090276479721, "learning_rate": 3e-06, "loss": 0.0025, "step": 711 }, { "clip_ratio": 0.00010593220213195309, "epoch": 0.001976690597948906, "grad_norm": 0.1953248232603073, "kl": 0.06954507902264595, "learning_rate": 3e-06, "loss": 0.0026, "step": 712 }, { "clip_ratio": 0.000314092067128513, "epoch": 0.0019794668487887216, "grad_norm": 0.09191843122243881, "kl": 0.0671706348657608, "learning_rate": 3e-06, "loss": 0.0025, "step": 713 }, { "clip_ratio": 9.191176650347188e-05, "epoch": 0.0019822430996285375, "grad_norm": 0.12914390861988068, "kl": 0.06849226728081703, "learning_rate": 3e-06, "loss": 0.0013, "step": 714 }, { "clip_ratio": 0.0, "epoch": 0.0019850193504683533, "grad_norm": 0.1148599311709404, "kl": 0.06904346123337746, "learning_rate": 3e-06, "loss": 0.0014, "step": 715 }, { "clip_ratio": 0.0, "epoch": 0.0019877956013081696, "grad_norm": 0.12667132914066315, "kl": 0.07613038271665573, "learning_rate": 3e-06, "loss": 0.002, "step": 716 }, { "clip_ratio": 0.00017349517293041572, "epoch": 0.0019905718521479854, "grad_norm": 0.09524544328451157, "kl": 0.07107832655310631, "learning_rate": 3e-06, "loss": 0.0013, "step": 717 }, { "clip_ratio": 0.00010593220213195309, "epoch": 0.0019933481029878013, "grad_norm": 0.13948820531368256, "kl": 0.07579323649406433, "learning_rate": 3e-06, "loss": 0.0023, "step": 718 }, { "clip_ratio": 0.00030192390113370493, "epoch": 0.001996124353827617, "grad_norm": 0.0964130386710167, "kl": 0.07115180417895317, "learning_rate": 3e-06, "loss": 0.0018, "step": 719 }, { "clip_ratio": 0.00019251657067798078, "epoch": 0.001998900604667433, "grad_norm": 0.13039907813072205, "kl": 0.07175230234861374, "learning_rate": 3e-06, "loss": 0.0002, "step": 720 }, { "clip_ratio": 0.00028670569736277685, "completion_length": 223.6875, "epoch": 0.002001676855507249, "grad_norm": 0.22225071489810944, "kl": 0.07658716291189194, "learning_rate": 3e-06, "loss": 0.0017, "reward": 0.15208333730697632, "reward_std": 0.14035604149103165, "rewards/countdown_reward_func": 0.15208333730697632, "step": 721, "zero_std_ratio": 0.625 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0020044531063470646, "grad_norm": 0.07475583255290985, "kl": 0.06551392935216427, "learning_rate": 3e-06, "loss": 0.0021, "step": 722 }, { "clip_ratio": 9.742790280142799e-05, "epoch": 0.0020072293571868805, "grad_norm": 0.08553145080804825, "kl": 0.06234482862055302, "learning_rate": 3e-06, "loss": 0.0017, "step": 723 }, { "clip_ratio": 0.00019808593060588464, "epoch": 0.0020100056080266963, "grad_norm": 0.05904084071516991, "kl": 0.06513764709234238, "learning_rate": 3e-06, "loss": 0.0017, "step": 724 }, { "clip_ratio": 0.0, "epoch": 0.002012781858866512, "grad_norm": 0.07201807200908661, "kl": 0.06482937932014465, "learning_rate": 3e-06, "loss": 0.0019, "step": 725 }, { "clip_ratio": 9.984025382436812e-05, "epoch": 0.002015558109706328, "grad_norm": 0.07085428386926651, "kl": 0.06466732174158096, "learning_rate": 3e-06, "loss": 0.0018, "step": 726 }, { "clip_ratio": 0.00018783042469294742, "epoch": 0.0020183343605461443, "grad_norm": 0.24391846358776093, "kl": 0.06686633080244064, "learning_rate": 3e-06, "loss": -0.001, "step": 727 }, { "clip_ratio": 0.00016469038382638246, "epoch": 0.00202111061138596, "grad_norm": 0.08265461027622223, "kl": 0.056683897972106934, "learning_rate": 3e-06, "loss": 0.0008, "step": 728 }, { "clip_ratio": 0.0, "epoch": 0.002023886862225776, "grad_norm": 0.08346160501241684, "kl": 0.05314911529421806, "learning_rate": 3e-06, "loss": 0.0008, "step": 729 }, { "clip_ratio": 0.00019712094945134595, "epoch": 0.0020266631130655918, "grad_norm": 0.057865407317876816, "kl": 0.05542047321796417, "learning_rate": 3e-06, "loss": 0.0012, "step": 730 }, { "clip_ratio": 0.0002478523238096386, "epoch": 0.0020294393639054076, "grad_norm": 0.07208742946386337, "kl": 0.054563652724027634, "learning_rate": 3e-06, "loss": 0.0008, "step": 731 }, { "clip_ratio": 0.0003381839778739959, "epoch": 0.0020322156147452235, "grad_norm": 0.06954418867826462, "kl": 0.05391604080796242, "learning_rate": 3e-06, "loss": 0.0007, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 209.27083587646484, "epoch": 0.0020349918655850393, "grad_norm": 0.12923622131347656, "kl": 0.054487695917487144, "learning_rate": 3e-06, "loss": 0.0177, "reward": 0.24791669100522995, "reward_std": 0.3050043284893036, "rewards/countdown_reward_func": 0.24791667610406876, "step": 733, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0001197317978949286, "epoch": 0.002037768116424855, "grad_norm": 0.10722116380929947, "kl": 0.05108374170958996, "learning_rate": 3e-06, "loss": 0.0178, "step": 734 }, { "clip_ratio": 0.0001197317978949286, "epoch": 0.002040544367264671, "grad_norm": 0.11042473465204239, "kl": 0.052645549178123474, "learning_rate": 3e-06, "loss": 0.0183, "step": 735 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.002043320618104487, "grad_norm": 0.1352359801530838, "kl": 0.049059027805924416, "learning_rate": 3e-06, "loss": 0.0169, "step": 736 }, { "clip_ratio": 0.0, "epoch": 0.0020460968689443026, "grad_norm": 0.11228242516517639, "kl": 0.056157197803258896, "learning_rate": 3e-06, "loss": 0.0182, "step": 737 }, { "clip_ratio": 0.00029813701985403895, "epoch": 0.002048873119784119, "grad_norm": 0.09875979274511337, "kl": 0.04752412252128124, "learning_rate": 3e-06, "loss": 0.0174, "step": 738 }, { "clip_ratio": 8.922198321670294e-05, "epoch": 0.0020516493706239348, "grad_norm": 0.12920497357845306, "kl": 0.04895847663283348, "learning_rate": 3e-06, "loss": 0.0167, "step": 739 }, { "clip_ratio": 0.0003326056757941842, "epoch": 0.0020544256214637506, "grad_norm": 0.10440492630004883, "kl": 0.04799576476216316, "learning_rate": 3e-06, "loss": 0.0173, "step": 740 }, { "clip_ratio": 0.00021150236716493964, "epoch": 0.0020572018723035664, "grad_norm": 0.10824400186538696, "kl": 0.050282422453165054, "learning_rate": 3e-06, "loss": 0.0179, "step": 741 }, { "clip_ratio": 0.0002083023136947304, "epoch": 0.0020599781231433823, "grad_norm": 0.17329491674900055, "kl": 0.04743002541363239, "learning_rate": 3e-06, "loss": 0.0154, "step": 742 }, { "clip_ratio": 0.0, "epoch": 0.002062754373983198, "grad_norm": 0.11108177900314331, "kl": 0.05471383221447468, "learning_rate": 3e-06, "loss": 0.017, "step": 743 }, { "clip_ratio": 0.00021150236716493964, "epoch": 0.002065530624823014, "grad_norm": 0.09756101667881012, "kl": 0.048802973702549934, "learning_rate": 3e-06, "loss": 0.0166, "step": 744 }, { "clip_ratio": 8.650519157527015e-05, "completion_length": 231.375, "epoch": 0.00206830687566283, "grad_norm": 0.11797621846199036, "kl": 0.05034934915602207, "learning_rate": 3e-06, "loss": 0.0265, "reward": 0.32500001043081284, "reward_std": 0.33128294348716736, "rewards/countdown_reward_func": 0.32500001043081284, "step": 745, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0020710831265026456, "grad_norm": 0.1360631287097931, "kl": 0.049324722960591316, "learning_rate": 3e-06, "loss": 0.0259, "step": 746 }, { "clip_ratio": 0.0, "epoch": 0.0020738593773424615, "grad_norm": 0.11022765189409256, "kl": 0.04972606897354126, "learning_rate": 3e-06, "loss": 0.0263, "step": 747 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0020766356281822773, "grad_norm": 0.11825685203075409, "kl": 0.05200809799134731, "learning_rate": 3e-06, "loss": 0.0255, "step": 748 }, { "clip_ratio": 0.0, "epoch": 0.0020794118790220936, "grad_norm": 0.15757912397384644, "kl": 0.05278201401233673, "learning_rate": 3e-06, "loss": 0.0252, "step": 749 }, { "clip_ratio": 9.335325012216344e-05, "epoch": 0.0020821881298619094, "grad_norm": 0.12597443163394928, "kl": 0.05840662308037281, "learning_rate": 3e-06, "loss": 0.0252, "step": 750 }, { "clip_ratio": 0.0, "epoch": 0.0020849643807017253, "grad_norm": 0.12119497358798981, "kl": 0.061403946951031685, "learning_rate": 3e-06, "loss": 0.0255, "step": 751 }, { "clip_ratio": 0.0, "epoch": 0.002087740631541541, "grad_norm": 0.15548032522201538, "kl": 0.06312582828104496, "learning_rate": 3e-06, "loss": 0.024, "step": 752 }, { "clip_ratio": 0.0, "epoch": 0.002090516882381357, "grad_norm": 0.09560614824295044, "kl": 0.06370921805500984, "learning_rate": 3e-06, "loss": 0.0251, "step": 753 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.002093293133221173, "grad_norm": 0.11291124671697617, "kl": 0.06948983296751976, "learning_rate": 3e-06, "loss": 0.023, "step": 754 }, { "clip_ratio": 9.328358282800764e-05, "epoch": 0.0020960693840609886, "grad_norm": 0.14481361210346222, "kl": 0.07263422012329102, "learning_rate": 3e-06, "loss": 0.0218, "step": 755 }, { "clip_ratio": 9.13075273274444e-05, "epoch": 0.0020988456349008045, "grad_norm": 0.11197732388973236, "kl": 0.08130589872598648, "learning_rate": 3e-06, "loss": 0.0229, "step": 756 }, { "clip_ratio": 9.144111390924081e-05, "completion_length": 233.5416717529297, "epoch": 0.0021016218857406203, "grad_norm": 0.08032705634832382, "kl": 0.0910085029900074, "learning_rate": 3e-06, "loss": 0.0084, "reward": 0.20625000447034836, "reward_std": 0.245523139834404, "rewards/countdown_reward_func": 0.20625000447034836, "step": 757, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00019638649246189743, "epoch": 0.002104398136580436, "grad_norm": 0.08426212519407272, "kl": 0.0998266153037548, "learning_rate": 3e-06, "loss": 0.0084, "step": 758 }, { "clip_ratio": 0.0001748835711623542, "epoch": 0.002107174387420252, "grad_norm": 0.09099453687667847, "kl": 0.09878429025411606, "learning_rate": 3e-06, "loss": 0.0081, "step": 759 }, { "clip_ratio": 0.0, "epoch": 0.0021099506382600683, "grad_norm": 0.10690543055534363, "kl": 0.10915743559598923, "learning_rate": 3e-06, "loss": 0.0094, "step": 760 }, { "clip_ratio": 0.00026571148191578686, "epoch": 0.002112726889099884, "grad_norm": 0.08678600192070007, "kl": 0.11478978767991066, "learning_rate": 3e-06, "loss": 0.0084, "step": 761 }, { "clip_ratio": 0.0001645819575060159, "epoch": 0.0021155031399397, "grad_norm": 0.09271499514579773, "kl": 0.1079954281449318, "learning_rate": 3e-06, "loss": 0.0083, "step": 762 }, { "clip_ratio": 9.144111390924081e-05, "epoch": 0.0021182793907795158, "grad_norm": 0.08949034661054611, "kl": 0.11407085880637169, "learning_rate": 3e-06, "loss": 0.0076, "step": 763 }, { "clip_ratio": 0.00028337843832559884, "epoch": 0.0021210556416193316, "grad_norm": 0.09156984835863113, "kl": 0.12308426946401596, "learning_rate": 3e-06, "loss": 0.0082, "step": 764 }, { "clip_ratio": 0.0001748835711623542, "epoch": 0.0021238318924591475, "grad_norm": 0.08866655081510544, "kl": 0.11775670573115349, "learning_rate": 3e-06, "loss": 0.0079, "step": 765 }, { "clip_ratio": 9.819324623094872e-05, "epoch": 0.0021266081432989633, "grad_norm": 0.10422717034816742, "kl": 0.12401585280895233, "learning_rate": 3e-06, "loss": 0.0084, "step": 766 }, { "clip_ratio": 9.259259240934625e-05, "epoch": 0.002129384394138779, "grad_norm": 0.08140372484922409, "kl": 0.12763554602861404, "learning_rate": 3e-06, "loss": 0.008, "step": 767 }, { "clip_ratio": 0.000270773540250957, "epoch": 0.002132160644978595, "grad_norm": 0.09283492714166641, "kl": 0.11555681005120277, "learning_rate": 3e-06, "loss": 0.0076, "step": 768 }, { "clip_ratio": 0.00017576066602487117, "completion_length": 214.00000762939453, "epoch": 0.002134936895818411, "grad_norm": 0.1142767146229744, "kl": 0.10155479237437248, "learning_rate": 3e-06, "loss": 0.0015, "reward": 0.20625000447034836, "reward_std": 0.20823679491877556, "rewards/countdown_reward_func": 0.20625000447034836, "step": 769, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0021377131466582266, "grad_norm": 0.12313186377286911, "kl": 0.11426293849945068, "learning_rate": 3e-06, "loss": 0.001, "step": 770 }, { "clip_ratio": 0.0, "epoch": 0.002140489397498043, "grad_norm": 0.10338300466537476, "kl": 0.10406426340341568, "learning_rate": 3e-06, "loss": 0.0004, "step": 771 }, { "clip_ratio": 0.0, "epoch": 0.0021432656483378588, "grad_norm": 0.12133070081472397, "kl": 0.09835099801421165, "learning_rate": 3e-06, "loss": 0.0006, "step": 772 }, { "clip_ratio": 0.00040839538269210607, "epoch": 0.0021460418991776746, "grad_norm": 0.13222798705101013, "kl": 0.10666431114077568, "learning_rate": 3e-06, "loss": 0.0006, "step": 773 }, { "clip_ratio": 0.0, "epoch": 0.0021488181500174904, "grad_norm": 0.1232038140296936, "kl": 0.09488913416862488, "learning_rate": 3e-06, "loss": 0.0003, "step": 774 }, { "clip_ratio": 9.164222865365446e-05, "epoch": 0.0021515944008573063, "grad_norm": 0.10721003264188766, "kl": 0.08875302597880363, "learning_rate": 3e-06, "loss": -0.0009, "step": 775 }, { "clip_ratio": 0.0, "epoch": 0.002154370651697122, "grad_norm": 0.12701751291751862, "kl": 0.09637927263975143, "learning_rate": 3e-06, "loss": -0.0006, "step": 776 }, { "clip_ratio": 0.0003659786016214639, "epoch": 0.002157146902536938, "grad_norm": 0.10145313292741776, "kl": 0.08755913749337196, "learning_rate": 3e-06, "loss": -0.001, "step": 777 }, { "clip_ratio": 0.00018341893155593425, "epoch": 0.002159923153376754, "grad_norm": 0.11642103642225266, "kl": 0.08162051066756248, "learning_rate": 3e-06, "loss": -0.0017, "step": 778 }, { "clip_ratio": 0.0006324425921775401, "epoch": 0.0021626994042165696, "grad_norm": 0.1388908177614212, "kl": 0.08449938148260117, "learning_rate": 3e-06, "loss": -0.0006, "step": 779 }, { "clip_ratio": 0.00041412835707888007, "epoch": 0.0021654756550563855, "grad_norm": 0.12610669434070587, "kl": 0.07518958300352097, "learning_rate": 3e-06, "loss": -0.001, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 227.58333587646484, "epoch": 0.0021682519058962017, "grad_norm": 0.11886297166347504, "kl": 0.08037593588232994, "learning_rate": 3e-06, "loss": 0.0063, "reward": 0.26250000298023224, "reward_std": 0.27733949571847916, "rewards/countdown_reward_func": 0.26249999552965164, "step": 781, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00010702054714784026, "epoch": 0.0021710281567360176, "grad_norm": 0.14289286732673645, "kl": 0.07106167078018188, "learning_rate": 3e-06, "loss": 0.0047, "step": 782 }, { "clip_ratio": 0.0001884007579064928, "epoch": 0.0021738044075758334, "grad_norm": 0.12329453229904175, "kl": 0.07392378151416779, "learning_rate": 3e-06, "loss": 0.0059, "step": 783 }, { "clip_ratio": 0.000244140625, "epoch": 0.0021765806584156493, "grad_norm": 0.11597783863544464, "kl": 0.06427717953920364, "learning_rate": 3e-06, "loss": 0.0049, "step": 784 }, { "clip_ratio": 8.37801635498181e-05, "epoch": 0.002179356909255465, "grad_norm": 0.12029743194580078, "kl": 0.06854793429374695, "learning_rate": 3e-06, "loss": 0.0063, "step": 785 }, { "clip_ratio": 0.0002638931109686382, "epoch": 0.002182133160095281, "grad_norm": 0.11125701665878296, "kl": 0.06303357519209385, "learning_rate": 3e-06, "loss": 0.0061, "step": 786 }, { "clip_ratio": 0.0005720614135498181, "epoch": 0.002184909410935097, "grad_norm": 0.11483973264694214, "kl": 0.06457911431789398, "learning_rate": 3e-06, "loss": 0.0044, "step": 787 }, { "clip_ratio": 0.0, "epoch": 0.0021876856617749126, "grad_norm": 0.1489003449678421, "kl": 0.057757457718253136, "learning_rate": 3e-06, "loss": 0.0044, "step": 788 }, { "clip_ratio": 0.00045818173384759575, "epoch": 0.0021904619126147285, "grad_norm": 0.1282450258731842, "kl": 0.06039944291114807, "learning_rate": 3e-06, "loss": 0.0042, "step": 789 }, { "clip_ratio": 0.00048828125, "epoch": 0.0021932381634545443, "grad_norm": 0.12330767512321472, "kl": 0.055448392406105995, "learning_rate": 3e-06, "loss": 0.0036, "step": 790 }, { "clip_ratio": 0.0007235735538415611, "epoch": 0.00219601441429436, "grad_norm": 0.12403135746717453, "kl": 0.058060334995388985, "learning_rate": 3e-06, "loss": 0.0048, "step": 791 }, { "clip_ratio": 0.00037331361090764403, "epoch": 0.0021987906651341764, "grad_norm": 0.10532835870981216, "kl": 0.05376404523849487, "learning_rate": 3e-06, "loss": 0.0047, "step": 792 }, { "clip_ratio": 0.00010984182881657034, "completion_length": 222.31250762939453, "epoch": 0.0022015669159739923, "grad_norm": 0.04651214927434921, "kl": 0.05401436612010002, "learning_rate": 3e-06, "loss": 0.003, "reward": 0.15000001341104507, "reward_std": 0.11558077484369278, "rewards/countdown_reward_func": 0.15000000223517418, "step": 793, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.000742982214433141, "epoch": 0.002204343166813808, "grad_norm": 0.07730413973331451, "kl": 0.0580199658870697, "learning_rate": 3e-06, "loss": 0.0033, "step": 794 }, { "clip_ratio": 0.00010984182881657034, "epoch": 0.002207119417653624, "grad_norm": 0.051809556782245636, "kl": 0.057794030755758286, "learning_rate": 3e-06, "loss": 0.0035, "step": 795 }, { "clip_ratio": 0.0007241156417876482, "epoch": 0.0022098956684934398, "grad_norm": 0.0728415995836258, "kl": 0.05362066254019737, "learning_rate": 3e-06, "loss": 0.0025, "step": 796 }, { "clip_ratio": 0.0002787023695418611, "epoch": 0.0022126719193332556, "grad_norm": 0.06317047029733658, "kl": 0.05362400598824024, "learning_rate": 3e-06, "loss": 0.0032, "step": 797 }, { "clip_ratio": 0.00017235546692973003, "epoch": 0.0022154481701730715, "grad_norm": 0.06358745694160461, "kl": 0.05262857303023338, "learning_rate": 3e-06, "loss": 0.0024, "step": 798 }, { "clip_ratio": 9.097525617107749e-05, "epoch": 0.0022182244210128873, "grad_norm": 0.05143192410469055, "kl": 0.049039315432310104, "learning_rate": 3e-06, "loss": 0.0028, "step": 799 }, { "clip_ratio": 0.00010984182881657034, "epoch": 0.002221000671852703, "grad_norm": 0.0856727808713913, "kl": 0.052018070593476295, "learning_rate": 3e-06, "loss": 0.0027, "step": 800 }, { "clip_ratio": 0.00038732589746359736, "epoch": 0.002223776922692519, "grad_norm": 0.05116529390215874, "kl": 0.05213000252842903, "learning_rate": 3e-06, "loss": 0.003, "step": 801 }, { "clip_ratio": 0.0006221811127034016, "epoch": 0.002226553173532335, "grad_norm": 0.07559063285589218, "kl": 0.04888819716870785, "learning_rate": 3e-06, "loss": 0.0019, "step": 802 }, { "clip_ratio": 0.0006752755725756288, "epoch": 0.002229329424372151, "grad_norm": 0.06661748886108398, "kl": 0.04938088357448578, "learning_rate": 3e-06, "loss": 0.0025, "step": 803 }, { "clip_ratio": 0.0006201023061294109, "epoch": 0.002232105675211967, "grad_norm": 0.06247713044285774, "kl": 0.04884421452879906, "learning_rate": 3e-06, "loss": 0.0018, "step": 804 }, { "clip_ratio": 0.00010399334132671356, "completion_length": 228.52083587646484, "epoch": 0.0022348819260517828, "grad_norm": 0.10135756433010101, "kl": 0.043619923293590546, "learning_rate": 3e-06, "loss": 0.0141, "reward": 0.2875000014901161, "reward_std": 0.28183095902204514, "rewards/countdown_reward_func": 0.2875000014901161, "step": 805, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003937007859349251, "epoch": 0.0022376581768915986, "grad_norm": 0.10308714210987091, "kl": 0.0447169654071331, "learning_rate": 3e-06, "loss": 0.014, "step": 806 }, { "clip_ratio": 0.0, "epoch": 0.0022404344277314144, "grad_norm": 0.12397963553667068, "kl": 0.04130372405052185, "learning_rate": 3e-06, "loss": 0.0146, "step": 807 }, { "clip_ratio": 0.0, "epoch": 0.0022432106785712303, "grad_norm": 0.0904117226600647, "kl": 0.04326653108000755, "learning_rate": 3e-06, "loss": 0.0137, "step": 808 }, { "clip_ratio": 8.747375977691263e-05, "epoch": 0.002245986929411046, "grad_norm": 0.11636728793382645, "kl": 0.04562844708561897, "learning_rate": 3e-06, "loss": 0.0135, "step": 809 }, { "clip_ratio": 0.0, "epoch": 0.002248763180250862, "grad_norm": 0.10902263969182968, "kl": 0.04578346759080887, "learning_rate": 3e-06, "loss": 0.0137, "step": 810 }, { "clip_ratio": 8.747375977691263e-05, "epoch": 0.002251539431090678, "grad_norm": 0.10831674933433533, "kl": 0.04260227829217911, "learning_rate": 3e-06, "loss": 0.0132, "step": 811 }, { "clip_ratio": 0.0003889085492119193, "epoch": 0.0022543156819304936, "grad_norm": 0.11185412853956223, "kl": 0.04456242546439171, "learning_rate": 3e-06, "loss": 0.0142, "step": 812 }, { "clip_ratio": 0.00047818864550208673, "epoch": 0.0022570919327703095, "grad_norm": 0.10440458357334137, "kl": 0.04222998023033142, "learning_rate": 3e-06, "loss": 0.0138, "step": 813 }, { "clip_ratio": 0.00018712585733737797, "epoch": 0.0022598681836101257, "grad_norm": 0.0939682200551033, "kl": 0.04461575858294964, "learning_rate": 3e-06, "loss": 0.0124, "step": 814 }, { "clip_ratio": 0.0, "epoch": 0.0022626444344499416, "grad_norm": 0.09211880713701248, "kl": 0.047047100961208344, "learning_rate": 3e-06, "loss": 0.0121, "step": 815 }, { "clip_ratio": 0.00036837265361100435, "epoch": 0.0022654206852897574, "grad_norm": 0.12892583012580872, "kl": 0.04755326174199581, "learning_rate": 3e-06, "loss": 0.0134, "step": 816 }, { "clip_ratio": 0.00047789004747755826, "completion_length": 220.1666717529297, "epoch": 0.0022681969361295733, "grad_norm": 0.18188029527664185, "kl": 0.04710717685520649, "learning_rate": 3e-06, "loss": 0.0458, "reward": 0.34166669845581055, "reward_std": 0.27476726472377777, "rewards/countdown_reward_func": 0.34166666865348816, "step": 817, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00017529455362819135, "epoch": 0.002270973186969389, "grad_norm": 0.1454269289970398, "kl": 0.05038350075483322, "learning_rate": 3e-06, "loss": 0.0453, "step": 818 }, { "clip_ratio": 0.0, "epoch": 0.002273749437809205, "grad_norm": 0.13133502006530762, "kl": 0.04751100763678551, "learning_rate": 3e-06, "loss": 0.045, "step": 819 }, { "clip_ratio": 0.0, "epoch": 0.002276525688649021, "grad_norm": 0.1250462830066681, "kl": 0.04578425548970699, "learning_rate": 3e-06, "loss": 0.0449, "step": 820 }, { "clip_ratio": 0.0001035625537042506, "epoch": 0.0022793019394888366, "grad_norm": 0.14770175516605377, "kl": 0.05231819488108158, "learning_rate": 3e-06, "loss": 0.0442, "step": 821 }, { "clip_ratio": 0.0004608447488863021, "epoch": 0.0022820781903286525, "grad_norm": 0.14028239250183105, "kl": 0.05149639584124088, "learning_rate": 3e-06, "loss": 0.0435, "step": 822 }, { "clip_ratio": 0.0, "epoch": 0.0022848544411684683, "grad_norm": 0.17355775833129883, "kl": 0.056207796558737755, "learning_rate": 3e-06, "loss": 0.0419, "step": 823 }, { "clip_ratio": 0.0001035625537042506, "epoch": 0.002287630692008284, "grad_norm": 0.14540952444076538, "kl": 0.060877278447151184, "learning_rate": 3e-06, "loss": 0.042, "step": 824 }, { "clip_ratio": 0.00018522187019698322, "epoch": 0.0022904069428481004, "grad_norm": 0.1283130794763565, "kl": 0.060043616220355034, "learning_rate": 3e-06, "loss": 0.042, "step": 825 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.0022931831936879163, "grad_norm": 0.13025355339050293, "kl": 0.06011705473065376, "learning_rate": 3e-06, "loss": 0.0415, "step": 826 }, { "clip_ratio": 0.0006889307842357084, "epoch": 0.002295959444527732, "grad_norm": 0.1317429095506668, "kl": 0.07226631790399551, "learning_rate": 3e-06, "loss": 0.0392, "step": 827 }, { "clip_ratio": 0.0011769172851927578, "epoch": 0.002298735695367548, "grad_norm": 0.12668536603450775, "kl": 0.07085844874382019, "learning_rate": 3e-06, "loss": 0.0392, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 217.25, "epoch": 0.0023015119462073638, "grad_norm": 0.0859948992729187, "kl": 0.07980459555983543, "learning_rate": 3e-06, "loss": 0.0234, "reward": 0.21250000596046448, "reward_std": 0.2080453746020794, "rewards/countdown_reward_func": 0.21250000596046448, "step": 829, "zero_std_ratio": 0.5 }, { "clip_ratio": 9.593246068106964e-05, "epoch": 0.0023042881970471796, "grad_norm": 0.08892592787742615, "kl": 0.08402768895030022, "learning_rate": 3e-06, "loss": 0.0227, "step": 830 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.0023070644478869955, "grad_norm": 0.07939447462558746, "kl": 0.08712485805153847, "learning_rate": 3e-06, "loss": 0.0224, "step": 831 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.0023098406987268113, "grad_norm": 0.09546811133623123, "kl": 0.09581883251667023, "learning_rate": 3e-06, "loss": 0.0229, "step": 832 }, { "clip_ratio": 0.0, "epoch": 0.002312616949566627, "grad_norm": 0.1163158193230629, "kl": 0.10002047568559647, "learning_rate": 3e-06, "loss": 0.0228, "step": 833 }, { "clip_ratio": 0.00019186492136213928, "epoch": 0.002315393200406443, "grad_norm": 0.1022767499089241, "kl": 0.10018961131572723, "learning_rate": 3e-06, "loss": 0.0225, "step": 834 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.002318169451246259, "grad_norm": 0.08190451562404633, "kl": 0.11375463381409645, "learning_rate": 3e-06, "loss": 0.0222, "step": 835 }, { "clip_ratio": 0.0001830161054385826, "epoch": 0.002320945702086075, "grad_norm": 0.1015833392739296, "kl": 0.11530355364084244, "learning_rate": 3e-06, "loss": 0.0219, "step": 836 }, { "clip_ratio": 0.00027506323385750875, "epoch": 0.002323721952925891, "grad_norm": 0.08156981319189072, "kl": 0.11573269963264465, "learning_rate": 3e-06, "loss": 0.0224, "step": 837 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.0023264982037657068, "grad_norm": 0.08199756592512131, "kl": 0.12695779651403427, "learning_rate": 3e-06, "loss": 0.0217, "step": 838 }, { "clip_ratio": 0.0, "epoch": 0.0023292744546055226, "grad_norm": 0.1132536232471466, "kl": 0.12991363927721977, "learning_rate": 3e-06, "loss": 0.0228, "step": 839 }, { "clip_ratio": 0.00036751566221937537, "epoch": 0.0023320507054453384, "grad_norm": 0.10609925538301468, "kl": 0.1261560171842575, "learning_rate": 3e-06, "loss": 0.0212, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 216.02083587646484, "epoch": 0.0023348269562851543, "grad_norm": 0.13725493848323822, "kl": 0.12262994423508644, "learning_rate": 3e-06, "loss": 0.0071, "reward": 0.24375002086162567, "reward_std": 0.26503098011016846, "rewards/countdown_reward_func": 0.24375002086162567, "step": 841, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.00233760320712497, "grad_norm": 0.13868065178394318, "kl": 0.13203585147857666, "learning_rate": 3e-06, "loss": 0.007, "step": 842 }, { "clip_ratio": 9.03179170563817e-05, "epoch": 0.002340379457964786, "grad_norm": 0.13435383141040802, "kl": 0.13662400841712952, "learning_rate": 3e-06, "loss": 0.007, "step": 843 }, { "clip_ratio": 9.038322605192661e-05, "epoch": 0.002343155708804602, "grad_norm": 0.158269003033638, "kl": 0.12493979930877686, "learning_rate": 3e-06, "loss": 0.0079, "step": 844 }, { "clip_ratio": 8.662508480483666e-05, "epoch": 0.0023459319596444176, "grad_norm": 0.13201403617858887, "kl": 0.14266318082809448, "learning_rate": 3e-06, "loss": 0.0071, "step": 845 }, { "clip_ratio": 9.03179170563817e-05, "epoch": 0.0023487082104842335, "grad_norm": 0.11519859731197357, "kl": 0.13292869180440903, "learning_rate": 3e-06, "loss": 0.0078, "step": 846 }, { "clip_ratio": 0.0, "epoch": 0.0023514844613240497, "grad_norm": 0.1352914720773697, "kl": 0.12666785717010498, "learning_rate": 3e-06, "loss": 0.0055, "step": 847 }, { "clip_ratio": 9.07111752894707e-05, "epoch": 0.0023542607121638656, "grad_norm": 0.13024933636188507, "kl": 0.12938842177391052, "learning_rate": 3e-06, "loss": 0.0058, "step": 848 }, { "clip_ratio": 0.0003490790259093046, "epoch": 0.0023570369630036814, "grad_norm": 0.13579486310482025, "kl": 0.13479531556367874, "learning_rate": 3e-06, "loss": 0.0064, "step": 849 }, { "clip_ratio": 0.0009073439359781332, "epoch": 0.0023598132138434973, "grad_norm": 0.15496942400932312, "kl": 0.1218239888548851, "learning_rate": 3e-06, "loss": 0.0063, "step": 850 }, { "clip_ratio": 0.0004516682820394635, "epoch": 0.002362589464683313, "grad_norm": 0.13003304600715637, "kl": 0.13300446420907974, "learning_rate": 3e-06, "loss": 0.0044, "step": 851 }, { "clip_ratio": 0.00022212152543943375, "epoch": 0.002365365715523129, "grad_norm": 0.11777352541685104, "kl": 0.12342452257871628, "learning_rate": 3e-06, "loss": 0.0056, "step": 852 }, { "clip_ratio": 0.00018712575547397137, "completion_length": 218.52083587646484, "epoch": 0.0023681419663629448, "grad_norm": 0.12623701989650726, "kl": 0.12132728099822998, "learning_rate": 3e-06, "loss": -0.0039, "reward": 0.16875000298023224, "reward_std": 0.16144294291734695, "rewards/countdown_reward_func": 0.16874999552965164, "step": 853, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.238728671334684e-05, "epoch": 0.0023709182172027606, "grad_norm": 0.12842930853366852, "kl": 0.11158213764429092, "learning_rate": 3e-06, "loss": -0.0044, "step": 854 }, { "clip_ratio": 0.0005675574648194015, "epoch": 0.0023736944680425765, "grad_norm": 0.16828866302967072, "kl": 0.10891519114375114, "learning_rate": 3e-06, "loss": -0.0048, "step": 855 }, { "clip_ratio": 9.245562250725925e-05, "epoch": 0.0023764707188823923, "grad_norm": 0.15191693603992462, "kl": 0.11035206541419029, "learning_rate": 3e-06, "loss": -0.0052, "step": 856 }, { "clip_ratio": 0.0, "epoch": 0.002379246969722208, "grad_norm": 0.09676827490329742, "kl": 0.10043661668896675, "learning_rate": 3e-06, "loss": -0.0061, "step": 857 }, { "clip_ratio": 0.00019535439787432551, "epoch": 0.0023820232205620244, "grad_norm": 0.13413938879966736, "kl": 0.0950070358812809, "learning_rate": 3e-06, "loss": -0.0066, "step": 858 }, { "clip_ratio": 0.00039087486220523715, "epoch": 0.0023847994714018403, "grad_norm": 0.12939363718032837, "kl": 0.09167426824569702, "learning_rate": 3e-06, "loss": -0.0072, "step": 859 }, { "clip_ratio": 0.0017128197359852493, "epoch": 0.002387575722241656, "grad_norm": 0.11775387078523636, "kl": 0.08235512301325798, "learning_rate": 3e-06, "loss": -0.0086, "step": 860 }, { "clip_ratio": 0.0017219405272044241, "epoch": 0.002390351973081472, "grad_norm": 0.10675527900457382, "kl": 0.07996315136551857, "learning_rate": 3e-06, "loss": -0.0083, "step": 861 }, { "clip_ratio": 0.0017481384566053748, "epoch": 0.0023931282239212878, "grad_norm": 0.14627479016780853, "kl": 0.07891392335295677, "learning_rate": 3e-06, "loss": -0.0086, "step": 862 }, { "clip_ratio": 0.003889568499289453, "epoch": 0.0023959044747611036, "grad_norm": 0.09009343385696411, "kl": 0.07284262031316757, "learning_rate": 3e-06, "loss": -0.0076, "step": 863 }, { "clip_ratio": 0.006949112517759204, "epoch": 0.0023986807256009194, "grad_norm": 0.11177929490804672, "kl": 0.06860150396823883, "learning_rate": 3e-06, "loss": -0.0095, "step": 864 }, { "clip_ratio": 8.520791016053408e-05, "completion_length": 221.2916717529297, "epoch": 0.0024014569764407353, "grad_norm": 0.0992504358291626, "kl": 0.07187891751527786, "learning_rate": 3e-06, "loss": 0.024, "reward": 0.24791669100522995, "reward_std": 0.22883931919932365, "rewards/countdown_reward_func": 0.24791667610406876, "step": 865, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.002404233227280551, "grad_norm": 0.10073084384202957, "kl": 0.06338691711425781, "learning_rate": 3e-06, "loss": 0.0226, "step": 866 }, { "clip_ratio": 9.811617201194167e-05, "epoch": 0.002407009478120367, "grad_norm": 0.10182993113994598, "kl": 0.061187781393527985, "learning_rate": 3e-06, "loss": 0.0235, "step": 867 }, { "clip_ratio": 0.00030962208984419703, "epoch": 0.002409785728960183, "grad_norm": 0.1328115165233612, "kl": 0.06472436338663101, "learning_rate": 3e-06, "loss": 0.0248, "step": 868 }, { "clip_ratio": 9.811617201194167e-05, "epoch": 0.002412561979799999, "grad_norm": 0.12169165909290314, "kl": 0.06141612492501736, "learning_rate": 3e-06, "loss": 0.0244, "step": 869 }, { "clip_ratio": 0.00019770623475778848, "epoch": 0.002415338230639815, "grad_norm": 0.11152414232492447, "kl": 0.06216576509177685, "learning_rate": 3e-06, "loss": 0.0251, "step": 870 }, { "clip_ratio": 8.520791016053408e-05, "epoch": 0.0024181144814796308, "grad_norm": 0.08779774606227875, "kl": 0.06478393822908401, "learning_rate": 3e-06, "loss": 0.0248, "step": 871 }, { "clip_ratio": 0.00018243287922814488, "epoch": 0.0024208907323194466, "grad_norm": 0.10080928355455399, "kl": 0.05965164303779602, "learning_rate": 3e-06, "loss": 0.0239, "step": 872 }, { "clip_ratio": 0.0002776125547825359, "epoch": 0.0024236669831592624, "grad_norm": 0.09574668854475021, "kl": 0.05859038606286049, "learning_rate": 3e-06, "loss": 0.0238, "step": 873 }, { "clip_ratio": 0.00010575295891612768, "epoch": 0.0024264432339990783, "grad_norm": 0.1512366235256195, "kl": 0.06465988978743553, "learning_rate": 3e-06, "loss": 0.0241, "step": 874 }, { "clip_ratio": 0.0002814402541844174, "epoch": 0.002429219484838894, "grad_norm": 0.11698296666145325, "kl": 0.06264219433069229, "learning_rate": 3e-06, "loss": 0.0225, "step": 875 }, { "clip_ratio": 0.0004044586094096303, "epoch": 0.00243199573567871, "grad_norm": 0.1288166046142578, "kl": 0.06408961862325668, "learning_rate": 3e-06, "loss": 0.0247, "step": 876 }, { "clip_ratio": 9.765625145519152e-05, "completion_length": 228.87500762939453, "epoch": 0.002434771986518526, "grad_norm": 0.12193048000335693, "kl": 0.06197246536612511, "learning_rate": 3e-06, "loss": -0.0093, "reward": 0.3229166716337204, "reward_std": 0.3930432200431824, "rewards/countdown_reward_func": 0.3229166716337204, "step": 877, "zero_std_ratio": 0.0 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0024375482373583416, "grad_norm": 0.10932080447673798, "kl": 0.06703280657529831, "learning_rate": 3e-06, "loss": -0.0094, "step": 878 }, { "clip_ratio": 0.00034134124871343374, "epoch": 0.0024403244881981575, "grad_norm": 0.13418442010879517, "kl": 0.06639807298779488, "learning_rate": 3e-06, "loss": -0.0095, "step": 879 }, { "clip_ratio": 0.0, "epoch": 0.0024431007390379737, "grad_norm": 0.15062019228935242, "kl": 0.06595133803784847, "learning_rate": 3e-06, "loss": -0.0089, "step": 880 }, { "clip_ratio": 0.0, "epoch": 0.0024458769898777896, "grad_norm": 0.16575033962726593, "kl": 0.0656869113445282, "learning_rate": 3e-06, "loss": -0.0098, "step": 881 }, { "clip_ratio": 0.000244140625, "epoch": 0.0024486532407176054, "grad_norm": 0.1262388676404953, "kl": 0.06567086651921272, "learning_rate": 3e-06, "loss": -0.0092, "step": 882 }, { "clip_ratio": 0.00017017313075484708, "epoch": 0.0024514294915574213, "grad_norm": 0.11113395541906357, "kl": 0.06466436572372913, "learning_rate": 3e-06, "loss": -0.0093, "step": 883 }, { "clip_ratio": 0.0, "epoch": 0.002454205742397237, "grad_norm": 0.10662338137626648, "kl": 0.0688115581870079, "learning_rate": 3e-06, "loss": -0.0096, "step": 884 }, { "clip_ratio": 0.0, "epoch": 0.002456981993237053, "grad_norm": 0.1345973163843155, "kl": 0.06801817566156387, "learning_rate": 3e-06, "loss": -0.0096, "step": 885 }, { "clip_ratio": 9.720062371343374e-05, "epoch": 0.0024597582440768688, "grad_norm": 0.144433856010437, "kl": 0.06568440422415733, "learning_rate": 3e-06, "loss": -0.0107, "step": 886 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0024625344949166846, "grad_norm": 0.16164414584636688, "kl": 0.06407869979739189, "learning_rate": 3e-06, "loss": -0.0106, "step": 887 }, { "clip_ratio": 0.0, "epoch": 0.0024653107457565005, "grad_norm": 0.11676425486803055, "kl": 0.06534116342663765, "learning_rate": 3e-06, "loss": -0.0106, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 220.02084350585938, "epoch": 0.0024680869965963163, "grad_norm": 0.1673632115125656, "kl": 0.061935342848300934, "learning_rate": 3e-06, "loss": -0.0087, "reward": 0.22291667014360428, "reward_std": 0.26279305666685104, "rewards/countdown_reward_func": 0.22291666269302368, "step": 889, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00020538756507448852, "epoch": 0.002470863247436132, "grad_norm": 0.11077067255973816, "kl": 0.06396340392529964, "learning_rate": 3e-06, "loss": -0.0089, "step": 890 }, { "clip_ratio": 0.00019290123600512743, "epoch": 0.0024736394982759484, "grad_norm": 0.15075094997882843, "kl": 0.06460290402173996, "learning_rate": 3e-06, "loss": -0.0083, "step": 891 }, { "clip_ratio": 9.272996976505965e-05, "epoch": 0.0024764157491157643, "grad_norm": 0.10630524158477783, "kl": 0.0635167695581913, "learning_rate": 3e-06, "loss": -0.0087, "step": 892 }, { "clip_ratio": 0.0002878382147173397, "epoch": 0.00247919199995558, "grad_norm": 0.11911377310752869, "kl": 0.06044987216591835, "learning_rate": 3e-06, "loss": -0.0086, "step": 893 }, { "clip_ratio": 0.00030706560937687755, "epoch": 0.002481968250795396, "grad_norm": 0.11326951533555984, "kl": 0.06032133474946022, "learning_rate": 3e-06, "loss": -0.0098, "step": 894 }, { "clip_ratio": 0.00010088780982187018, "epoch": 0.0024847445016352118, "grad_norm": 0.17850473523139954, "kl": 0.05723920278251171, "learning_rate": 3e-06, "loss": -0.0116, "step": 895 }, { "clip_ratio": 0.00030706560937687755, "epoch": 0.0024875207524750276, "grad_norm": 0.11720696091651917, "kl": 0.05883798375725746, "learning_rate": 3e-06, "loss": -0.0106, "step": 896 }, { "clip_ratio": 0.0002937890458269976, "epoch": 0.0024902970033148434, "grad_norm": 0.1409841924905777, "kl": 0.0597931444644928, "learning_rate": 3e-06, "loss": -0.0107, "step": 897 }, { "clip_ratio": 0.00028926064987899736, "epoch": 0.0024930732541546593, "grad_norm": 0.1104314997792244, "kl": 0.05723441019654274, "learning_rate": 3e-06, "loss": -0.0106, "step": 898 }, { "clip_ratio": 0.0003814154479186982, "epoch": 0.002495849504994475, "grad_norm": 0.1183939203619957, "kl": 0.05589612200856209, "learning_rate": 3e-06, "loss": -0.0112, "step": 899 }, { "clip_ratio": 0.0010607101139612496, "epoch": 0.002498625755834291, "grad_norm": 0.12353586405515671, "kl": 0.05687661096453667, "learning_rate": 3e-06, "loss": -0.0124, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 232.25, "epoch": 0.002501402006674107, "grad_norm": 0.09452740103006363, "kl": 0.06010809168219566, "learning_rate": 3e-06, "loss": 0.0025, "reward": 0.3020833358168602, "reward_std": 0.2922677993774414, "rewards/countdown_reward_func": 0.3020833358168602, "step": 901, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.002504178257513923, "grad_norm": 0.11706841737031937, "kl": 0.06478729844093323, "learning_rate": 3e-06, "loss": 0.0024, "step": 902 }, { "clip_ratio": 8.680555765749887e-05, "epoch": 0.002506954508353739, "grad_norm": 0.09865949302911758, "kl": 0.062279969453811646, "learning_rate": 3e-06, "loss": 0.0019, "step": 903 }, { "clip_ratio": 0.00018655490566743538, "epoch": 0.0025097307591935548, "grad_norm": 0.13041819632053375, "kl": 0.05919046886265278, "learning_rate": 3e-06, "loss": 0.0029, "step": 904 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.0025125070100333706, "grad_norm": 0.14748121798038483, "kl": 0.059419430792331696, "learning_rate": 3e-06, "loss": 0.0023, "step": 905 }, { "clip_ratio": 0.0, "epoch": 0.0025152832608731864, "grad_norm": 0.09640306979417801, "kl": 0.05946239456534386, "learning_rate": 3e-06, "loss": 0.0024, "step": 906 }, { "clip_ratio": 0.00036380960227688774, "epoch": 0.0025180595117130023, "grad_norm": 0.08679016679525375, "kl": 0.05758332274854183, "learning_rate": 3e-06, "loss": 0.0021, "step": 907 }, { "clip_ratio": 0.0004546546551864594, "epoch": 0.002520835762552818, "grad_norm": 0.10903292149305344, "kl": 0.06176324933767319, "learning_rate": 3e-06, "loss": 0.0007, "step": 908 }, { "clip_ratio": 0.0001810974645195529, "epoch": 0.002523612013392634, "grad_norm": 0.10386759042739868, "kl": 0.06048583798110485, "learning_rate": 3e-06, "loss": 0.0009, "step": 909 }, { "clip_ratio": 0.0005492813070304692, "epoch": 0.00252638826423245, "grad_norm": 0.10254993289709091, "kl": 0.057429682463407516, "learning_rate": 3e-06, "loss": 0.0013, "step": 910 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.0025291645150722656, "grad_norm": 0.14101967215538025, "kl": 0.05812258459627628, "learning_rate": 3e-06, "loss": 0.0015, "step": 911 }, { "clip_ratio": 0.0004883037181571126, "epoch": 0.0025319407659120815, "grad_norm": 0.08682620525360107, "kl": 0.059369875118136406, "learning_rate": 3e-06, "loss": 0.0012, "step": 912 }, { "clip_ratio": 0.0005338219925761223, "completion_length": 228.77083587646484, "epoch": 0.0025347170167518977, "grad_norm": 0.11657971143722534, "kl": 0.05467422492802143, "learning_rate": 3e-06, "loss": 0.0358, "reward": 0.23125001043081284, "reward_std": 0.2115694098174572, "rewards/countdown_reward_func": 0.23125001043081284, "step": 913, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0025374932675917136, "grad_norm": 0.12091764807701111, "kl": 0.05531150847673416, "learning_rate": 3e-06, "loss": 0.0353, "step": 914 }, { "clip_ratio": 0.0012001376890111715, "epoch": 0.0025402695184315294, "grad_norm": 0.13323377072811127, "kl": 0.05716773122549057, "learning_rate": 3e-06, "loss": 0.0353, "step": 915 }, { "clip_ratio": 0.0001666462339926511, "epoch": 0.0025430457692713453, "grad_norm": 0.08699194341897964, "kl": 0.058137066662311554, "learning_rate": 3e-06, "loss": 0.0353, "step": 916 }, { "clip_ratio": 0.0005200195519137196, "epoch": 0.002545822020111161, "grad_norm": 0.10792747139930725, "kl": 0.06085832789540291, "learning_rate": 3e-06, "loss": 0.0362, "step": 917 }, { "clip_ratio": 8.60289073898457e-05, "epoch": 0.002548598270950977, "grad_norm": 0.08918231725692749, "kl": 0.05700236186385155, "learning_rate": 3e-06, "loss": 0.0347, "step": 918 }, { "clip_ratio": 0.0, "epoch": 0.0025513745217907928, "grad_norm": 0.11331060528755188, "kl": 0.05938470549881458, "learning_rate": 3e-06, "loss": 0.035, "step": 919 }, { "clip_ratio": 0.0, "epoch": 0.0025541507726306086, "grad_norm": 0.12028547376394272, "kl": 0.060254018753767014, "learning_rate": 3e-06, "loss": 0.0339, "step": 920 }, { "clip_ratio": 0.0003332924679853022, "epoch": 0.0025569270234704245, "grad_norm": 0.12606388330459595, "kl": 0.06682027131319046, "learning_rate": 3e-06, "loss": 0.0338, "step": 921 }, { "clip_ratio": 8.526603050995618e-05, "epoch": 0.0025597032743102403, "grad_norm": 0.08522091805934906, "kl": 0.06515257433056831, "learning_rate": 3e-06, "loss": 0.0341, "step": 922 }, { "clip_ratio": 0.0005214190459810197, "epoch": 0.0025624795251500566, "grad_norm": 0.10933967679738998, "kl": 0.06955453753471375, "learning_rate": 3e-06, "loss": 0.0342, "step": 923 }, { "clip_ratio": 8.60289073898457e-05, "epoch": 0.0025652557759898724, "grad_norm": 0.08516346663236618, "kl": 0.06566111743450165, "learning_rate": 3e-06, "loss": 0.0332, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 238.70834350585938, "epoch": 0.0025680320268296883, "grad_norm": 0.051000598818063736, "kl": 0.06130129098892212, "learning_rate": 3e-06, "loss": 0.008, "reward": 0.19166667759418488, "reward_std": 0.1091257855296135, "rewards/countdown_reward_func": 0.19166666269302368, "step": 925, "zero_std_ratio": 0.625 }, { "clip_ratio": 9.704969124868512e-05, "epoch": 0.002570808277669504, "grad_norm": 0.06743042171001434, "kl": 0.06677010655403137, "learning_rate": 3e-06, "loss": 0.0085, "step": 926 }, { "clip_ratio": 0.0002635217970237136, "epoch": 0.00257358452850932, "grad_norm": 0.05554405599832535, "kl": 0.06589578464627266, "learning_rate": 3e-06, "loss": 0.0079, "step": 927 }, { "clip_ratio": 8.520791016053408e-05, "epoch": 0.0025763607793491358, "grad_norm": 0.06437760591506958, "kl": 0.06517062336206436, "learning_rate": 3e-06, "loss": 0.0083, "step": 928 }, { "clip_ratio": 0.00024796833167783916, "epoch": 0.0025791370301889516, "grad_norm": 0.056508298963308334, "kl": 0.0656033419072628, "learning_rate": 3e-06, "loss": 0.0079, "step": 929 }, { "clip_ratio": 0.0002620808663778007, "epoch": 0.0025819132810287674, "grad_norm": 0.05479772761464119, "kl": 0.07185834646224976, "learning_rate": 3e-06, "loss": 0.0084, "step": 930 }, { "clip_ratio": 9.704969124868512e-05, "epoch": 0.0025846895318685833, "grad_norm": 0.05194047465920448, "kl": 0.06922575458884239, "learning_rate": 3e-06, "loss": 0.008, "step": 931 }, { "clip_ratio": 0.0, "epoch": 0.002587465782708399, "grad_norm": 0.06964042782783508, "kl": 0.07410523667931557, "learning_rate": 3e-06, "loss": 0.008, "step": 932 }, { "clip_ratio": 9.704969124868512e-05, "epoch": 0.002590242033548215, "grad_norm": 0.04855266585946083, "kl": 0.07128038257360458, "learning_rate": 3e-06, "loss": 0.0079, "step": 933 }, { "clip_ratio": 0.0, "epoch": 0.0025930182843880312, "grad_norm": 0.07284444570541382, "kl": 0.0707533061504364, "learning_rate": 3e-06, "loss": 0.0085, "step": 934 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.002595794535227847, "grad_norm": 0.06308530271053314, "kl": 0.07013512030243874, "learning_rate": 3e-06, "loss": 0.0075, "step": 935 }, { "clip_ratio": 0.00040690103196538985, "epoch": 0.002598570786067663, "grad_norm": 0.05616437643766403, "kl": 0.0736420564353466, "learning_rate": 3e-06, "loss": 0.0074, "step": 936 }, { "clip_ratio": 0.00017452477186452597, "completion_length": 236.02084350585938, "epoch": 0.0026013470369074788, "grad_norm": 0.09799813479185104, "kl": 0.08096648007631302, "learning_rate": 3e-06, "loss": -0.0021, "reward": 0.24583333730697632, "reward_std": 0.214500330388546, "rewards/countdown_reward_func": 0.24583332985639572, "step": 937, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00016611294995527714, "epoch": 0.0026041232877472946, "grad_norm": 0.19633734226226807, "kl": 0.08069871366024017, "learning_rate": 3e-06, "loss": -0.0012, "step": 938 }, { "clip_ratio": 0.00033192152477568015, "epoch": 0.0026068995385871104, "grad_norm": 0.09509781002998352, "kl": 0.07796993106603622, "learning_rate": 3e-06, "loss": -0.0019, "step": 939 }, { "clip_ratio": 9.865824540611356e-05, "epoch": 0.0026096757894269263, "grad_norm": 0.09252092242240906, "kl": 0.08357620239257812, "learning_rate": 3e-06, "loss": -0.002, "step": 940 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.002612452040266742, "grad_norm": 0.1239117830991745, "kl": 0.0816221609711647, "learning_rate": 3e-06, "loss": -0.002, "step": 941 }, { "clip_ratio": 0.00018341893155593425, "epoch": 0.002615228291106558, "grad_norm": 0.09028197079896927, "kl": 0.0803113654255867, "learning_rate": 3e-06, "loss": -0.0015, "step": 942 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.002618004541946374, "grad_norm": 0.09007646143436432, "kl": 0.07762451097369194, "learning_rate": 3e-06, "loss": -0.0025, "step": 943 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0026207807927861896, "grad_norm": 0.21453428268432617, "kl": 0.07529747486114502, "learning_rate": 3e-06, "loss": -0.004, "step": 944 }, { "clip_ratio": 0.0001691611105343327, "epoch": 0.002623557043626006, "grad_norm": 0.08801465481519699, "kl": 0.07271482422947884, "learning_rate": 3e-06, "loss": -0.002, "step": 945 }, { "clip_ratio": 0.00018643914518179372, "epoch": 0.0026263332944658217, "grad_norm": 0.09710396826267242, "kl": 0.0768899917602539, "learning_rate": 3e-06, "loss": -0.003, "step": 946 }, { "clip_ratio": 0.0, "epoch": 0.0026291095453056376, "grad_norm": 0.1289989948272705, "kl": 0.07622705027461052, "learning_rate": 3e-06, "loss": -0.0033, "step": 947 }, { "clip_ratio": 0.0005273459973977879, "epoch": 0.0026318857961454534, "grad_norm": 0.08834261447191238, "kl": 0.07354174181818962, "learning_rate": 3e-06, "loss": -0.0024, "step": 948 }, { "clip_ratio": 8.890469325706363e-05, "completion_length": 230.70834350585938, "epoch": 0.0026346620469852693, "grad_norm": 0.07417230308055878, "kl": 0.06710755452513695, "learning_rate": 3e-06, "loss": -0.0056, "reward": 0.16875001788139343, "reward_std": 0.15433254092931747, "rewards/countdown_reward_func": 0.16875001788139343, "step": 949, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00036710136919282377, "epoch": 0.002637438297825085, "grad_norm": 0.10319241136312485, "kl": 0.0669807717204094, "learning_rate": 3e-06, "loss": -0.0052, "step": 950 }, { "clip_ratio": 8.890469325706363e-05, "epoch": 0.002640214548664901, "grad_norm": 0.08117877691984177, "kl": 0.06458613276481628, "learning_rate": 3e-06, "loss": -0.0062, "step": 951 }, { "clip_ratio": 0.0001982553512789309, "epoch": 0.0026429907995047168, "grad_norm": 0.08867120742797852, "kl": 0.06354731135070324, "learning_rate": 3e-06, "loss": -0.0049, "step": 952 }, { "clip_ratio": 9.65996878221631e-05, "epoch": 0.0026457670503445326, "grad_norm": 0.07009001821279526, "kl": 0.06276779621839523, "learning_rate": 3e-06, "loss": -0.0057, "step": 953 }, { "clip_ratio": 0.0001789265952538699, "epoch": 0.0026485433011843485, "grad_norm": 0.08066268265247345, "kl": 0.061625886708498, "learning_rate": 3e-06, "loss": -0.0056, "step": 954 }, { "clip_ratio": 0.00026640027499524876, "epoch": 0.0026513195520241643, "grad_norm": 0.07269750535488129, "kl": 0.058654628694057465, "learning_rate": 3e-06, "loss": -0.0063, "step": 955 }, { "clip_ratio": 0.0012421588180586696, "epoch": 0.0026540958028639806, "grad_norm": 0.08015119284391403, "kl": 0.05745803192257881, "learning_rate": 3e-06, "loss": -0.0057, "step": 956 }, { "clip_ratio": 0.0, "epoch": 0.0026568720537037964, "grad_norm": 0.07885020971298218, "kl": 0.056764453649520874, "learning_rate": 3e-06, "loss": -0.007, "step": 957 }, { "clip_ratio": 0.00027389177557779476, "epoch": 0.0026596483045436123, "grad_norm": 0.08463863283395767, "kl": 0.056341877207159996, "learning_rate": 3e-06, "loss": -0.0064, "step": 958 }, { "clip_ratio": 0.001056087960023433, "epoch": 0.002662424555383428, "grad_norm": 0.07922295480966568, "kl": 0.05454845167696476, "learning_rate": 3e-06, "loss": -0.007, "step": 959 }, { "clip_ratio": 0.0007927687838673592, "epoch": 0.002665200806223244, "grad_norm": 0.0849594920873642, "kl": 0.05426880158483982, "learning_rate": 3e-06, "loss": -0.006, "step": 960 }, { "clip_ratio": 0.00043250381713733077, "completion_length": 235.6666717529297, "epoch": 0.0026679770570630598, "grad_norm": 0.09037365019321442, "kl": 0.05271290987730026, "learning_rate": 3e-06, "loss": 0.0065, "reward": 0.229166679084301, "reward_std": 0.18291139230132103, "rewards/countdown_reward_func": 0.229166679084301, "step": 961, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0026707533079028756, "grad_norm": 0.10632222890853882, "kl": 0.05197305604815483, "learning_rate": 3e-06, "loss": 0.0064, "step": 962 }, { "clip_ratio": 0.0001750104856910184, "epoch": 0.0026735295587426914, "grad_norm": 0.06896176934242249, "kl": 0.04834933951497078, "learning_rate": 3e-06, "loss": 0.0068, "step": 963 }, { "clip_ratio": 0.0002540506684454158, "epoch": 0.0026763058095825073, "grad_norm": 0.07026878744363785, "kl": 0.051994968205690384, "learning_rate": 3e-06, "loss": 0.007, "step": 964 }, { "clip_ratio": 8.821453957352787e-05, "epoch": 0.002679082060422323, "grad_norm": 0.06770803779363632, "kl": 0.05004582740366459, "learning_rate": 3e-06, "loss": 0.0068, "step": 965 }, { "clip_ratio": 8.722958591533825e-05, "epoch": 0.002681858311262139, "grad_norm": 0.06422404199838638, "kl": 0.04954391345381737, "learning_rate": 3e-06, "loss": 0.0067, "step": 966 }, { "clip_ratio": 8.821453957352787e-05, "epoch": 0.0026846345621019552, "grad_norm": 0.09136969596147537, "kl": 0.04910329729318619, "learning_rate": 3e-06, "loss": 0.006, "step": 967 }, { "clip_ratio": 0.00027080931613454595, "epoch": 0.002687410812941771, "grad_norm": 0.10625612735748291, "kl": 0.047768834978342056, "learning_rate": 3e-06, "loss": 0.0058, "step": 968 }, { "clip_ratio": 0.0003528630913933739, "epoch": 0.002690187063781587, "grad_norm": 0.06184415891766548, "kl": 0.04515623487532139, "learning_rate": 3e-06, "loss": 0.0063, "step": 969 }, { "clip_ratio": 0.00016860979667399079, "epoch": 0.0026929633146214028, "grad_norm": 0.06915763020515442, "kl": 0.0480178352445364, "learning_rate": 3e-06, "loss": 0.006, "step": 970 }, { "clip_ratio": 0.0, "epoch": 0.0026957395654612186, "grad_norm": 0.06491398066282272, "kl": 0.047293346375226974, "learning_rate": 3e-06, "loss": 0.0062, "step": 971 }, { "clip_ratio": 0.00016860979667399079, "epoch": 0.0026985158163010344, "grad_norm": 0.07768521457910538, "kl": 0.04707910679280758, "learning_rate": 3e-06, "loss": 0.0061, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 241.56250762939453, "epoch": 0.0027012920671408503, "grad_norm": 0.10156456381082535, "kl": 0.04707324132323265, "learning_rate": 3e-06, "loss": 0.0114, "reward": 0.2666666805744171, "reward_std": 0.2661244869232178, "rewards/countdown_reward_func": 0.2666666731238365, "step": 973, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00025393527175765485, "epoch": 0.002704068317980666, "grad_norm": 0.08211185038089752, "kl": 0.044957129284739494, "learning_rate": 3e-06, "loss": 0.0122, "step": 974 }, { "clip_ratio": 9.391435014549643e-05, "epoch": 0.002706844568820482, "grad_norm": 0.080048106610775, "kl": 0.047641387209296227, "learning_rate": 3e-06, "loss": 0.0124, "step": 975 }, { "clip_ratio": 0.00024569814559072256, "epoch": 0.002709620819660298, "grad_norm": 0.1037638708949089, "kl": 0.04482552409172058, "learning_rate": 3e-06, "loss": 0.0119, "step": 976 }, { "clip_ratio": 0.00017228929937118664, "epoch": 0.0027123970705001136, "grad_norm": 0.08106601238250732, "kl": 0.04870855435729027, "learning_rate": 3e-06, "loss": 0.0119, "step": 977 }, { "clip_ratio": 0.0002454323766869493, "epoch": 0.00271517332133993, "grad_norm": 0.09446461498737335, "kl": 0.04764538258314133, "learning_rate": 3e-06, "loss": 0.0117, "step": 978 }, { "clip_ratio": 0.0, "epoch": 0.0027179495721797457, "grad_norm": 0.0833747461438179, "kl": 0.04668613523244858, "learning_rate": 3e-06, "loss": 0.0109, "step": 979 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.0027207258230195616, "grad_norm": 0.07825154811143875, "kl": 0.045210424810647964, "learning_rate": 3e-06, "loss": 0.0113, "step": 980 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0027235020738593774, "grad_norm": 0.0840124562382698, "kl": 0.04799755476415157, "learning_rate": 3e-06, "loss": 0.0107, "step": 981 }, { "clip_ratio": 0.0004264606104698032, "epoch": 0.0027262783246991933, "grad_norm": 0.11435186117887497, "kl": 0.04499867558479309, "learning_rate": 3e-06, "loss": 0.0109, "step": 982 }, { "clip_ratio": 0.00037225715641397983, "epoch": 0.002729054575539009, "grad_norm": 0.08535240590572357, "kl": 0.049062151461839676, "learning_rate": 3e-06, "loss": 0.011, "step": 983 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.002731830826378825, "grad_norm": 0.09356574714183807, "kl": 0.04856777377426624, "learning_rate": 3e-06, "loss": 0.0105, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 222.45833587646484, "epoch": 0.0027346070772186408, "grad_norm": 0.11254774034023285, "kl": 0.048314955085515976, "learning_rate": 3e-06, "loss": 0.0166, "reward": 0.22291667759418488, "reward_std": 0.26681750267744064, "rewards/countdown_reward_func": 0.22291667759418488, "step": 985, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0027373833280584566, "grad_norm": 0.128098264336586, "kl": 0.0460913497954607, "learning_rate": 3e-06, "loss": 0.0151, "step": 986 }, { "clip_ratio": 0.00010339123400626704, "epoch": 0.0027401595788982725, "grad_norm": 0.09424342215061188, "kl": 0.0468437634408474, "learning_rate": 3e-06, "loss": 0.0159, "step": 987 }, { "clip_ratio": 0.0, "epoch": 0.0027429358297380883, "grad_norm": 0.11079585552215576, "kl": 0.04501481167972088, "learning_rate": 3e-06, "loss": 0.0165, "step": 988 }, { "clip_ratio": 0.0026023527534562163, "epoch": 0.0027457120805779046, "grad_norm": 0.08553663641214371, "kl": 0.04914248362183571, "learning_rate": 3e-06, "loss": 0.0171, "step": 989 }, { "clip_ratio": 0.0, "epoch": 0.0027484883314177204, "grad_norm": 0.0737449899315834, "kl": 0.04859176091849804, "learning_rate": 3e-06, "loss": 0.016, "step": 990 }, { "clip_ratio": 0.0, "epoch": 0.0027512645822575362, "grad_norm": 0.1108751967549324, "kl": 0.05102384462952614, "learning_rate": 3e-06, "loss": 0.0155, "step": 991 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.002754040833097352, "grad_norm": 0.10651170462369919, "kl": 0.04962940514087677, "learning_rate": 3e-06, "loss": 0.0151, "step": 992 }, { "clip_ratio": 0.00029216046823421493, "epoch": 0.002756817083937168, "grad_norm": 0.08815775066614151, "kl": 0.050901319831609726, "learning_rate": 3e-06, "loss": 0.0147, "step": 993 }, { "clip_ratio": 0.0, "epoch": 0.0027595933347769838, "grad_norm": 0.1047065481543541, "kl": 0.04865125194191933, "learning_rate": 3e-06, "loss": 0.0157, "step": 994 }, { "clip_ratio": 0.0027102040985482745, "epoch": 0.0027623695856167996, "grad_norm": 0.08856725692749023, "kl": 0.053303858265280724, "learning_rate": 3e-06, "loss": 0.0166, "step": 995 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0027651458364566154, "grad_norm": 0.07977961748838425, "kl": 0.0532270185649395, "learning_rate": 3e-06, "loss": 0.0156, "step": 996 }, { "clip_ratio": 0.00010254306835122406, "completion_length": 225.8541717529297, "epoch": 0.0027679220872964313, "grad_norm": 0.11937547475099564, "kl": 0.0554725993424654, "learning_rate": 3e-06, "loss": 0.0133, "reward": 0.30000001937150955, "reward_std": 0.27453966438770294, "rewards/countdown_reward_func": 0.30000001937150955, "step": 997, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.002770698338136247, "grad_norm": 0.45567768812179565, "kl": 0.05745168775320053, "learning_rate": 3e-06, "loss": 0.0133, "step": 998 }, { "clip_ratio": 0.0001995211496250704, "epoch": 0.002773474588976063, "grad_norm": 0.1074514389038086, "kl": 0.060051169246435165, "learning_rate": 3e-06, "loss": 0.0124, "step": 999 }, { "epoch": 0.0027762508398158792, "grad_norm": 0.12824182212352753, "learning_rate": 3e-06, "loss": 0.012, "step": 1000 }, { "clip_ratio": 0.00012577813322423026, "epoch": 0.002779027090655695, "grad_norm": 0.09932482987642288, "kl": 0.060510776937007904, "learning_rate": 3e-06, "loss": 0.0125, "step": 1001 }, { "clip_ratio": 0.0, "epoch": 0.002781803341495511, "grad_norm": 0.09447323530912399, "kl": 0.06021983176469803, "learning_rate": 3e-06, "loss": 0.0126, "step": 1002 }, { "clip_ratio": 0.0002693641581572592, "epoch": 0.0027845795923353268, "grad_norm": 0.11810337752103806, "kl": 0.06065908633172512, "learning_rate": 3e-06, "loss": 0.0114, "step": 1003 }, { "clip_ratio": 9.97605748125352e-05, "epoch": 0.0027873558431751426, "grad_norm": 0.15187525749206543, "kl": 0.06130600720643997, "learning_rate": 3e-06, "loss": 0.012, "step": 1004 }, { "clip_ratio": 0.00019688567408593372, "epoch": 0.0027901320940149584, "grad_norm": 0.09934880584478378, "kl": 0.06536610797047615, "learning_rate": 3e-06, "loss": 0.0109, "step": 1005 }, { "clip_ratio": 0.0002693641581572592, "epoch": 0.0027929083448547743, "grad_norm": 0.1290806382894516, "kl": 0.06462856568396091, "learning_rate": 3e-06, "loss": 0.0118, "step": 1006 }, { "clip_ratio": 0.00017373176524415612, "epoch": 0.00279568459569459, "grad_norm": 0.09871116280555725, "kl": 0.06735832616686821, "learning_rate": 3e-06, "loss": 0.0116, "step": 1007 }, { "clip_ratio": 9.97605748125352e-05, "epoch": 0.002798460846534406, "grad_norm": 0.09537402540445328, "kl": 0.06406117230653763, "learning_rate": 3e-06, "loss": 0.0114, "step": 1008 }, { "clip_ratio": 0.00011488970631035045, "completion_length": 231.37500762939453, "epoch": 0.002801237097374222, "grad_norm": 0.3545832335948944, "kl": 0.05917428806424141, "learning_rate": 3e-06, "loss": 0.0063, "reward": 0.208333358168602, "reward_std": 0.18449045717716217, "rewards/countdown_reward_func": 0.2083333507180214, "step": 1009, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0004897922772215679, "epoch": 0.0028040133482140376, "grad_norm": 0.0879272073507309, "kl": 0.06689955294132233, "learning_rate": 3e-06, "loss": 0.0063, "step": 1010 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.002806789599053854, "grad_norm": 0.06225398927927017, "kl": 0.06652659550309181, "learning_rate": 3e-06, "loss": 0.007, "step": 1011 }, { "clip_ratio": 0.0002790517173707485, "epoch": 0.0028095658498936697, "grad_norm": 0.0772099494934082, "kl": 0.07098489999771118, "learning_rate": 3e-06, "loss": 0.0065, "step": 1012 }, { "clip_ratio": 0.00016578249051235616, "epoch": 0.0028123421007334856, "grad_norm": 0.09716866910457611, "kl": 0.06839455664157867, "learning_rate": 3e-06, "loss": 0.007, "step": 1013 }, { "clip_ratio": 0.0, "epoch": 0.0028151183515733014, "grad_norm": 0.08656153082847595, "kl": 0.06978728249669075, "learning_rate": 3e-06, "loss": 0.0069, "step": 1014 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0028178946024131173, "grad_norm": 0.07657860964536667, "kl": 0.0637618862092495, "learning_rate": 3e-06, "loss": 0.0061, "step": 1015 }, { "clip_ratio": 0.00025825316697591916, "epoch": 0.002820670853252933, "grad_norm": 0.09020351618528366, "kl": 0.0716327540576458, "learning_rate": 3e-06, "loss": 0.0058, "step": 1016 }, { "clip_ratio": 0.0004928143753204495, "epoch": 0.002823447104092749, "grad_norm": 0.07356920838356018, "kl": 0.06976194307208061, "learning_rate": 3e-06, "loss": 0.0068, "step": 1017 }, { "clip_ratio": 0.00035517166543286294, "epoch": 0.0028262233549325648, "grad_norm": 0.08907874673604965, "kl": 0.07387241721153259, "learning_rate": 3e-06, "loss": 0.0067, "step": 1018 }, { "clip_ratio": 0.00017687295621726662, "epoch": 0.0028289996057723806, "grad_norm": 0.08614590018987656, "kl": 0.07145676389336586, "learning_rate": 3e-06, "loss": 0.0071, "step": 1019 }, { "clip_ratio": 0.0001792114635463804, "epoch": 0.0028317758566121965, "grad_norm": 0.10110598802566528, "kl": 0.07314455509185791, "learning_rate": 3e-06, "loss": 0.0059, "step": 1020 }, { "clip_ratio": 0.00019379844889044762, "completion_length": 232.77084350585938, "epoch": 0.0028345521074520123, "grad_norm": 0.07683458924293518, "kl": 0.07031066715717316, "learning_rate": 3e-06, "loss": 0.0068, "reward": 0.2291666716337204, "reward_std": 0.21667250245809555, "rewards/countdown_reward_func": 0.2291666641831398, "step": 1021, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0002766043471638113, "epoch": 0.0028373283582918286, "grad_norm": 0.08281517773866653, "kl": 0.07224087789654732, "learning_rate": 3e-06, "loss": 0.0075, "step": 1022 }, { "clip_ratio": 0.0003614198212744668, "epoch": 0.0028401046091316444, "grad_norm": 0.13180024921894073, "kl": 0.07020625099539757, "learning_rate": 3e-06, "loss": 0.0074, "step": 1023 }, { "clip_ratio": 0.0, "epoch": 0.0028428808599714602, "grad_norm": 0.09534606337547302, "kl": 0.07347037643194199, "learning_rate": 3e-06, "loss": 0.0071, "step": 1024 }, { "clip_ratio": 0.0003760801919270307, "epoch": 0.002845657110811276, "grad_norm": 0.11282859742641449, "kl": 0.07506215572357178, "learning_rate": 3e-06, "loss": 0.0069, "step": 1025 }, { "clip_ratio": 9.124087227974087e-05, "epoch": 0.002848433361651092, "grad_norm": 0.08544313162565231, "kl": 0.07066556811332703, "learning_rate": 3e-06, "loss": 0.0062, "step": 1026 }, { "clip_ratio": 0.0, "epoch": 0.0028512096124909078, "grad_norm": 0.07569853216409683, "kl": 0.07081609964370728, "learning_rate": 3e-06, "loss": 0.0069, "step": 1027 }, { "clip_ratio": 0.00036229201941750944, "epoch": 0.0028539858633307236, "grad_norm": 0.09049994498491287, "kl": 0.07069279998540878, "learning_rate": 3e-06, "loss": 0.0068, "step": 1028 }, { "clip_ratio": 0.0009208531701005995, "epoch": 0.0028567621141705394, "grad_norm": 0.15014901757240295, "kl": 0.06979522109031677, "learning_rate": 3e-06, "loss": 0.0065, "step": 1029 }, { "clip_ratio": 0.00026231101946905255, "epoch": 0.0028595383650103553, "grad_norm": 0.10039456933736801, "kl": 0.0724063590168953, "learning_rate": 3e-06, "loss": 0.006, "step": 1030 }, { "clip_ratio": 0.0007212661657831632, "epoch": 0.002862314615850171, "grad_norm": 0.10597745329141617, "kl": 0.07448813319206238, "learning_rate": 3e-06, "loss": 0.005, "step": 1031 }, { "clip_ratio": 0.00026949287712341174, "epoch": 0.002865090866689987, "grad_norm": 0.08808151632547379, "kl": 0.06819097325205803, "learning_rate": 3e-06, "loss": 0.0057, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 220.4791717529297, "epoch": 0.0028678671175298032, "grad_norm": 0.12043464183807373, "kl": 0.06623350828886032, "learning_rate": 3e-06, "loss": -0.0042, "reward": 0.35625001788139343, "reward_std": 0.29714028537273407, "rewards/countdown_reward_func": 0.35625001788139343, "step": 1033, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.448223863728344e-05, "epoch": 0.002870643368369619, "grad_norm": 0.10195807367563248, "kl": 0.07157684862613678, "learning_rate": 3e-06, "loss": -0.0033, "step": 1034 }, { "clip_ratio": 0.0, "epoch": 0.002873419619209435, "grad_norm": 0.09431300312280655, "kl": 0.06604013964533806, "learning_rate": 3e-06, "loss": -0.0042, "step": 1035 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0028761958700492508, "grad_norm": 0.10502111911773682, "kl": 0.06586654111742973, "learning_rate": 3e-06, "loss": -0.0037, "step": 1036 }, { "clip_ratio": 0.0, "epoch": 0.0028789721208890666, "grad_norm": 0.17106521129608154, "kl": 0.06683254614472389, "learning_rate": 3e-06, "loss": -0.0046, "step": 1037 }, { "clip_ratio": 0.0, "epoch": 0.0028817483717288824, "grad_norm": 0.1259961724281311, "kl": 0.06468446552753448, "learning_rate": 3e-06, "loss": -0.0038, "step": 1038 }, { "clip_ratio": 9.448223863728344e-05, "epoch": 0.0028845246225686983, "grad_norm": 0.12149322032928467, "kl": 0.06319839507341385, "learning_rate": 3e-06, "loss": -0.0049, "step": 1039 }, { "clip_ratio": 9.448223863728344e-05, "epoch": 0.002887300873408514, "grad_norm": 0.09481094032526016, "kl": 0.0666964054107666, "learning_rate": 3e-06, "loss": -0.0039, "step": 1040 }, { "clip_ratio": 8.491847984259948e-05, "epoch": 0.00289007712424833, "grad_norm": 0.09099525958299637, "kl": 0.061311766505241394, "learning_rate": 3e-06, "loss": -0.005, "step": 1041 }, { "clip_ratio": 0.00018680129142012447, "epoch": 0.002892853375088146, "grad_norm": 0.11290211975574493, "kl": 0.061817897483706474, "learning_rate": 3e-06, "loss": -0.0053, "step": 1042 }, { "clip_ratio": 0.00027388295711716637, "epoch": 0.0028956296259279616, "grad_norm": 0.13663621246814728, "kl": 0.06613102555274963, "learning_rate": 3e-06, "loss": -0.0062, "step": 1043 }, { "clip_ratio": 0.0, "epoch": 0.002898405876767778, "grad_norm": 0.10643167048692703, "kl": 0.06023329682648182, "learning_rate": 3e-06, "loss": -0.0055, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 232.56250762939453, "epoch": 0.0029011821276075937, "grad_norm": 0.07880929112434387, "kl": 0.06694615818560123, "learning_rate": 3e-06, "loss": 0.0099, "reward": 0.21250002086162567, "reward_std": 0.16564146801829338, "rewards/countdown_reward_func": 0.21250002086162567, "step": 1045, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00028686892619589344, "epoch": 0.0029039583784474096, "grad_norm": 0.08016741275787354, "kl": 0.06679723039269447, "learning_rate": 3e-06, "loss": 0.0107, "step": 1046 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0029067346292872254, "grad_norm": 0.07099456340074539, "kl": 0.06291750818490982, "learning_rate": 3e-06, "loss": 0.0097, "step": 1047 }, { "clip_ratio": 0.00010056315659312531, "epoch": 0.0029095108801270413, "grad_norm": 0.07919389754533768, "kl": 0.06845093332231045, "learning_rate": 3e-06, "loss": 0.0109, "step": 1048 }, { "clip_ratio": 0.00010080645006382838, "epoch": 0.002912287130966857, "grad_norm": 0.07597494125366211, "kl": 0.06439798139035702, "learning_rate": 3e-06, "loss": 0.0106, "step": 1049 }, { "clip_ratio": 0.00017322444182354957, "epoch": 0.002915063381806673, "grad_norm": 0.09772368520498276, "kl": 0.0649341493844986, "learning_rate": 3e-06, "loss": 0.0107, "step": 1050 }, { "clip_ratio": 0.001496979035437107, "epoch": 0.0029178396326464888, "grad_norm": 0.08392099291086197, "kl": 0.06803803145885468, "learning_rate": 3e-06, "loss": 0.0101, "step": 1051 }, { "clip_ratio": 0.000669743909384124, "epoch": 0.0029206158834863046, "grad_norm": 0.08624348044395447, "kl": 0.06706851720809937, "learning_rate": 3e-06, "loss": 0.01, "step": 1052 }, { "clip_ratio": 0.001187364658107981, "epoch": 0.0029233921343261205, "grad_norm": 0.07946142554283142, "kl": 0.06465382128953934, "learning_rate": 3e-06, "loss": 0.0091, "step": 1053 }, { "clip_ratio": 0.0002712814530241303, "epoch": 0.0029261683851659363, "grad_norm": 0.1127048060297966, "kl": 0.07115714997053146, "learning_rate": 3e-06, "loss": 0.0101, "step": 1054 }, { "clip_ratio": 0.00016687953029759228, "epoch": 0.0029289446360057526, "grad_norm": 0.07184568047523499, "kl": 0.06631535664200783, "learning_rate": 3e-06, "loss": 0.0091, "step": 1055 }, { "clip_ratio": 0.00020161290012765676, "epoch": 0.0029317208868455684, "grad_norm": 0.107190802693367, "kl": 0.06775633245706558, "learning_rate": 3e-06, "loss": 0.0092, "step": 1056 }, { "clip_ratio": 0.00026758435706142336, "completion_length": 230.25000762939453, "epoch": 0.0029344971376853842, "grad_norm": 0.08019760996103287, "kl": 0.07465282827615738, "learning_rate": 3e-06, "loss": 0.0048, "reward": 0.22708334773778915, "reward_std": 0.17232364416122437, "rewards/countdown_reward_func": 0.22708334773778915, "step": 1057, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.294625149574131e-05, "epoch": 0.0029372733885252, "grad_norm": 0.08832976967096329, "kl": 0.07460768148303032, "learning_rate": 3e-06, "loss": 0.0056, "step": 1058 }, { "clip_ratio": 0.00026728439843282104, "epoch": 0.002940049639365016, "grad_norm": 0.07996435463428497, "kl": 0.07490583881735802, "learning_rate": 3e-06, "loss": 0.0055, "step": 1059 }, { "clip_ratio": 0.0, "epoch": 0.0029428258902048318, "grad_norm": 0.06566757708787918, "kl": 0.07495003193616867, "learning_rate": 3e-06, "loss": 0.0056, "step": 1060 }, { "clip_ratio": 0.0001822225167416036, "epoch": 0.0029456021410446476, "grad_norm": 0.07333634048700333, "kl": 0.07876036688685417, "learning_rate": 3e-06, "loss": 0.0054, "step": 1061 }, { "clip_ratio": 9.72762645687908e-05, "epoch": 0.0029483783918844634, "grad_norm": 0.08647799491882324, "kl": 0.0771314725279808, "learning_rate": 3e-06, "loss": 0.005, "step": 1062 }, { "clip_ratio": 8.928571332944557e-05, "epoch": 0.0029511546427242793, "grad_norm": 0.07851536571979523, "kl": 0.07513091340661049, "learning_rate": 3e-06, "loss": 0.0045, "step": 1063 }, { "clip_ratio": 0.0002651687682373449, "epoch": 0.002953930893564095, "grad_norm": 0.12982326745986938, "kl": 0.07557458430528641, "learning_rate": 3e-06, "loss": 0.0051, "step": 1064 }, { "clip_ratio": 0.00026728439843282104, "epoch": 0.0029567071444039114, "grad_norm": 0.07488778233528137, "kl": 0.07366466149687767, "learning_rate": 3e-06, "loss": 0.0045, "step": 1065 }, { "clip_ratio": 0.00026775128208100796, "epoch": 0.0029594833952437272, "grad_norm": 0.07061980664730072, "kl": 0.0725603848695755, "learning_rate": 3e-06, "loss": 0.0048, "step": 1066 }, { "clip_ratio": 0.00024727272102609277, "epoch": 0.002962259646083543, "grad_norm": 0.07645444571971893, "kl": 0.07657236978411674, "learning_rate": 3e-06, "loss": 0.0046, "step": 1067 }, { "clip_ratio": 0.0003571428533177823, "epoch": 0.002965035896923359, "grad_norm": 0.07127691805362701, "kl": 0.07353055477142334, "learning_rate": 3e-06, "loss": 0.0045, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 222.81250762939453, "epoch": 0.0029678121477631748, "grad_norm": 0.0665845200419426, "kl": 0.06168382987380028, "learning_rate": 3e-06, "loss": 0.0032, "reward": 0.17500000447034836, "reward_std": 0.14995060861110687, "rewards/countdown_reward_func": 0.17500000074505806, "step": 1069, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00035589729668572545, "epoch": 0.0029705883986029906, "grad_norm": 0.08601254224777222, "kl": 0.05869180150330067, "learning_rate": 3e-06, "loss": 0.0034, "step": 1070 }, { "clip_ratio": 0.0, "epoch": 0.0029733646494428064, "grad_norm": 0.08124065399169922, "kl": 0.05769502557814121, "learning_rate": 3e-06, "loss": 0.0034, "step": 1071 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0029761409002826223, "grad_norm": 0.062196120619773865, "kl": 0.0583453718572855, "learning_rate": 3e-06, "loss": 0.0034, "step": 1072 }, { "clip_ratio": 0.0002649337984621525, "epoch": 0.002978917151122438, "grad_norm": 0.05779939889907837, "kl": 0.056453775614500046, "learning_rate": 3e-06, "loss": 0.0038, "step": 1073 }, { "clip_ratio": 0.00030795554630458355, "epoch": 0.002981693401962254, "grad_norm": 0.05982575565576553, "kl": 0.055925050750374794, "learning_rate": 3e-06, "loss": 0.0027, "step": 1074 }, { "clip_ratio": 0.00030165042699081823, "epoch": 0.00298446965280207, "grad_norm": 0.07068102061748505, "kl": 0.057197773829102516, "learning_rate": 3e-06, "loss": 0.0028, "step": 1075 }, { "clip_ratio": 0.0004265254028723575, "epoch": 0.002987245903641886, "grad_norm": 0.09287840127944946, "kl": 0.05506822466850281, "learning_rate": 3e-06, "loss": 0.0024, "step": 1076 }, { "clip_ratio": 0.0004502255469560623, "epoch": 0.002990022154481702, "grad_norm": 0.07072468101978302, "kl": 0.053443435579538345, "learning_rate": 3e-06, "loss": 0.0028, "step": 1077 }, { "clip_ratio": 0.00017390426364727318, "epoch": 0.0029927984053215177, "grad_norm": 0.07123330235481262, "kl": 0.05358114279806614, "learning_rate": 3e-06, "loss": 0.003, "step": 1078 }, { "clip_ratio": 0.0006227242993190885, "epoch": 0.0029955746561613336, "grad_norm": 0.063522107899189, "kl": 0.052388763055205345, "learning_rate": 3e-06, "loss": 0.0032, "step": 1079 }, { "clip_ratio": 0.0002694505383260548, "epoch": 0.0029983509070011494, "grad_norm": 0.058882538229227066, "kl": 0.05159541964530945, "learning_rate": 3e-06, "loss": 0.0024, "step": 1080 }, { "clip_ratio": 0.00018450184143148363, "completion_length": 226.64584350585938, "epoch": 0.0030011271578409653, "grad_norm": 0.1080564484000206, "kl": 0.05418804846704006, "learning_rate": 3e-06, "loss": -0.0019, "reward": 0.28333334624767303, "reward_std": 0.3219813033938408, "rewards/countdown_reward_func": 0.28333333879709244, "step": 1081, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0006435593895730563, "epoch": 0.003003903408680781, "grad_norm": 0.12120220065116882, "kl": 0.05108523927628994, "learning_rate": 3e-06, "loss": -0.0019, "step": 1082 }, { "clip_ratio": 0.0006201210926519707, "epoch": 0.003006679659520597, "grad_norm": 0.10109229385852814, "kl": 0.052261438220739365, "learning_rate": 3e-06, "loss": -0.0024, "step": 1083 }, { "clip_ratio": 0.00017445541743654758, "epoch": 0.0030094559103604128, "grad_norm": 0.09882961213588715, "kl": 0.05041038617491722, "learning_rate": 3e-06, "loss": -0.0026, "step": 1084 }, { "clip_ratio": 0.00020145705639151856, "epoch": 0.0030122321612002286, "grad_norm": 0.10909520089626312, "kl": 0.0491649005562067, "learning_rate": 3e-06, "loss": -0.0027, "step": 1085 }, { "clip_ratio": 0.0, "epoch": 0.0030150084120400445, "grad_norm": 0.1672067940235138, "kl": 0.049838531762361526, "learning_rate": 3e-06, "loss": -0.0029, "step": 1086 }, { "clip_ratio": 0.0, "epoch": 0.0030177846628798607, "grad_norm": 0.12344881147146225, "kl": 0.05135510489344597, "learning_rate": 3e-06, "loss": -0.0028, "step": 1087 }, { "clip_ratio": 0.001279033807804808, "epoch": 0.0030205609137196766, "grad_norm": 0.11550983041524887, "kl": 0.04587686434388161, "learning_rate": 3e-06, "loss": -0.0032, "step": 1088 }, { "clip_ratio": 0.0004374117561383173, "epoch": 0.0030233371645594924, "grad_norm": 0.10007118433713913, "kl": 0.048211125656962395, "learning_rate": 3e-06, "loss": -0.0032, "step": 1089 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0030261134153993082, "grad_norm": 0.10254620760679245, "kl": 0.04644571617245674, "learning_rate": 3e-06, "loss": -0.004, "step": 1090 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.003028889666239124, "grad_norm": 0.11051363497972488, "kl": 0.044875843450427055, "learning_rate": 3e-06, "loss": -0.0043, "step": 1091 }, { "clip_ratio": 0.0014370106218848377, "epoch": 0.00303166591707894, "grad_norm": 0.15505895018577576, "kl": 0.0460149310529232, "learning_rate": 3e-06, "loss": -0.0041, "step": 1092 }, { "clip_ratio": 8.698677993379533e-05, "completion_length": 237.5416717529297, "epoch": 0.0030344421679187558, "grad_norm": 0.10442124307155609, "kl": 0.04337725602090359, "learning_rate": 3e-06, "loss": 0.0063, "reward": 0.2854166701436043, "reward_std": 0.31687821447849274, "rewards/countdown_reward_func": 0.2854166701436043, "step": 1093, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0001680653658695519, "epoch": 0.0030372184187585716, "grad_norm": 0.1659352034330368, "kl": 0.04275266453623772, "learning_rate": 3e-06, "loss": 0.0053, "step": 1094 }, { "clip_ratio": 0.00017313426360487938, "epoch": 0.0030399946695983874, "grad_norm": 0.13962288200855255, "kl": 0.04703891836106777, "learning_rate": 3e-06, "loss": 0.0056, "step": 1095 }, { "clip_ratio": 0.00017009561270242557, "epoch": 0.0030427709204382033, "grad_norm": 0.13137014210224152, "kl": 0.044395437464118004, "learning_rate": 3e-06, "loss": 0.0057, "step": 1096 }, { "clip_ratio": 0.0002661462058313191, "epoch": 0.003045547171278019, "grad_norm": 0.10747912526130676, "kl": 0.04033567197620869, "learning_rate": 3e-06, "loss": 0.0051, "step": 1097 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0030483234221178354, "grad_norm": 0.10946369171142578, "kl": 0.042647797614336014, "learning_rate": 3e-06, "loss": 0.0051, "step": 1098 }, { "clip_ratio": 9.704969124868512e-05, "epoch": 0.0030510996729576512, "grad_norm": 0.10199002921581268, "kl": 0.04471134953200817, "learning_rate": 3e-06, "loss": 0.0047, "step": 1099 }, { "clip_ratio": 0.0002724502555793151, "epoch": 0.003053875923797467, "grad_norm": 0.10533545166254044, "kl": 0.044932011514902115, "learning_rate": 3e-06, "loss": 0.0036, "step": 1100 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.003056652174637283, "grad_norm": 0.14148595929145813, "kl": 0.047867994755506516, "learning_rate": 3e-06, "loss": 0.0031, "step": 1101 }, { "clip_ratio": 0.00046344404108822346, "epoch": 0.0030594284254770988, "grad_norm": 0.12489910423755646, "kl": 0.047186482697725296, "learning_rate": 3e-06, "loss": 0.0034, "step": 1102 }, { "clip_ratio": 0.0002588110146461986, "epoch": 0.0030622046763169146, "grad_norm": 0.13311858475208282, "kl": 0.04366219788789749, "learning_rate": 3e-06, "loss": 0.0042, "step": 1103 }, { "clip_ratio": 9.704969124868512e-05, "epoch": 0.0030649809271567304, "grad_norm": 0.09973665326833725, "kl": 0.045309677720069885, "learning_rate": 3e-06, "loss": 0.0039, "step": 1104 }, { "clip_ratio": 0.0007995735504664481, "completion_length": 233.06250762939453, "epoch": 0.0030677571779965463, "grad_norm": 0.12320923805236816, "kl": 0.04813423752784729, "learning_rate": 3e-06, "loss": 0.0091, "reward": 0.24791669100522995, "reward_std": 0.22883931919932365, "rewards/countdown_reward_func": 0.24791667610406876, "step": 1105, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.884150884114206e-05, "epoch": 0.003070533428836362, "grad_norm": 0.09392733871936798, "kl": 0.04717006906867027, "learning_rate": 3e-06, "loss": 0.0082, "step": 1106 }, { "clip_ratio": 0.00026870113651966676, "epoch": 0.003073309679676178, "grad_norm": 0.10014389455318451, "kl": 0.05198243260383606, "learning_rate": 3e-06, "loss": 0.0081, "step": 1107 }, { "clip_ratio": 8.934953802963719e-05, "epoch": 0.003076085930515994, "grad_norm": 0.06945673376321793, "kl": 0.048388589173555374, "learning_rate": 3e-06, "loss": 0.0083, "step": 1108 }, { "clip_ratio": 8.350033749593422e-05, "epoch": 0.00307886218135581, "grad_norm": 0.07839835435152054, "kl": 0.05126038379967213, "learning_rate": 3e-06, "loss": 0.0088, "step": 1109 }, { "clip_ratio": 0.0, "epoch": 0.003081638432195626, "grad_norm": 0.07928033918142319, "kl": 0.05163071118295193, "learning_rate": 3e-06, "loss": 0.0077, "step": 1110 }, { "clip_ratio": 0.00010382059554103762, "epoch": 0.0030844146830354417, "grad_norm": 0.09869455546140671, "kl": 0.05214657075703144, "learning_rate": 3e-06, "loss": 0.008, "step": 1111 }, { "clip_ratio": 0.00017819104687077925, "epoch": 0.0030871909338752576, "grad_norm": 0.08661158382892609, "kl": 0.05166606977581978, "learning_rate": 3e-06, "loss": 0.0079, "step": 1112 }, { "clip_ratio": 0.00043943087075604126, "epoch": 0.0030899671847150734, "grad_norm": 0.08715176582336426, "kl": 0.05564386770129204, "learning_rate": 3e-06, "loss": 0.0077, "step": 1113 }, { "clip_ratio": 0.0001715863836579956, "epoch": 0.0030927434355548893, "grad_norm": 0.06843938678503036, "kl": 0.0525389164686203, "learning_rate": 3e-06, "loss": 0.0073, "step": 1114 }, { "clip_ratio": 0.00018732093303697184, "epoch": 0.003095519686394705, "grad_norm": 0.08035042136907578, "kl": 0.05591611564159393, "learning_rate": 3e-06, "loss": 0.008, "step": 1115 }, { "clip_ratio": 8.934953802963719e-05, "epoch": 0.003098295937234521, "grad_norm": 0.0761677697300911, "kl": 0.056424250826239586, "learning_rate": 3e-06, "loss": 0.0069, "step": 1116 }, { "clip_ratio": 0.00025893703423207626, "completion_length": 221.83333587646484, "epoch": 0.0031010721880743368, "grad_norm": 0.08327441662549973, "kl": 0.05536172166466713, "learning_rate": 3e-06, "loss": -0.0039, "reward": 0.2645833492279053, "reward_std": 0.21911749243736267, "rewards/countdown_reward_func": 0.2645833343267441, "step": 1117, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0031038484389141526, "grad_norm": 0.100023552775383, "kl": 0.05755512788891792, "learning_rate": 3e-06, "loss": -0.004, "step": 1118 }, { "clip_ratio": 0.00018813963106367737, "epoch": 0.0031066246897539685, "grad_norm": 0.139785036444664, "kl": 0.05882943049073219, "learning_rate": 3e-06, "loss": -0.0037, "step": 1119 }, { "clip_ratio": 0.0001152073746197857, "epoch": 0.0031094009405937847, "grad_norm": 0.09004988521337509, "kl": 0.055969417095184326, "learning_rate": 3e-06, "loss": -0.0038, "step": 1120 }, { "clip_ratio": 9.191176650347188e-05, "epoch": 0.0031121771914336006, "grad_norm": 0.06995224207639694, "kl": 0.05503300577402115, "learning_rate": 3e-06, "loss": -0.0039, "step": 1121 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.0031149534422734164, "grad_norm": 0.060606665909290314, "kl": 0.05465872026979923, "learning_rate": 3e-06, "loss": -0.0036, "step": 1122 }, { "clip_ratio": 0.00017755682347342372, "epoch": 0.0031177296931132322, "grad_norm": 0.07822634279727936, "kl": 0.056107934564352036, "learning_rate": 3e-06, "loss": -0.0044, "step": 1123 }, { "clip_ratio": 0.0001152073746197857, "epoch": 0.003120505943953048, "grad_norm": 0.08555705845355988, "kl": 0.05655817873775959, "learning_rate": 3e-06, "loss": -0.0041, "step": 1124 }, { "clip_ratio": 0.00019658758537843823, "epoch": 0.003123282194792864, "grad_norm": 0.13074277341365814, "kl": 0.05860498920083046, "learning_rate": 3e-06, "loss": -0.005, "step": 1125 }, { "clip_ratio": 0.00038114040944492444, "epoch": 0.0031260584456326798, "grad_norm": 0.0899885818362236, "kl": 0.054890843108296394, "learning_rate": 3e-06, "loss": -0.005, "step": 1126 }, { "clip_ratio": 0.0006433823728002608, "epoch": 0.0031288346964724956, "grad_norm": 0.06713841110467911, "kl": 0.05289384722709656, "learning_rate": 3e-06, "loss": -0.005, "step": 1127 }, { "clip_ratio": 0.00027796779613709077, "epoch": 0.0031316109473123114, "grad_norm": 0.06459382176399231, "kl": 0.05130323953926563, "learning_rate": 3e-06, "loss": -0.0041, "step": 1128 }, { "clip_ratio": 0.0010648140450939536, "completion_length": 236.02083587646484, "epoch": 0.0031343871981521273, "grad_norm": 0.13045701384544373, "kl": 0.052938977256417274, "learning_rate": 3e-06, "loss": 0.0017, "reward": 0.2500000149011612, "reward_std": 0.29990123212337494, "rewards/countdown_reward_func": 0.2500000149011612, "step": 1129, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00026675376284401864, "epoch": 0.003137163448991943, "grad_norm": 0.09997519105672836, "kl": 0.05317997932434082, "learning_rate": 3e-06, "loss": 0.0011, "step": 1130 }, { "clip_ratio": 0.0, "epoch": 0.0031399396998317594, "grad_norm": 0.08752911537885666, "kl": 0.052106352522969246, "learning_rate": 3e-06, "loss": 0.001, "step": 1131 }, { "clip_ratio": 0.00019378491560928524, "epoch": 0.0031427159506715752, "grad_norm": 0.11155866086483002, "kl": 0.052127305418252945, "learning_rate": 3e-06, "loss": 0.0009, "step": 1132 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.003145492201511391, "grad_norm": 0.0927848219871521, "kl": 0.053360870108008385, "learning_rate": 3e-06, "loss": 0.0009, "step": 1133 }, { "clip_ratio": 0.000244140625, "epoch": 0.003148268452351207, "grad_norm": 0.09402063488960266, "kl": 0.053240761160850525, "learning_rate": 3e-06, "loss": 0.001, "step": 1134 }, { "clip_ratio": 0.0004069010537932627, "epoch": 0.0031510447031910228, "grad_norm": 0.12245853245258331, "kl": 0.050540367141366005, "learning_rate": 3e-06, "loss": 0.0015, "step": 1135 }, { "clip_ratio": 0.0005173567624296993, "epoch": 0.0031538209540308386, "grad_norm": 0.10049139708280563, "kl": 0.05162609927356243, "learning_rate": 3e-06, "loss": 0.0006, "step": 1136 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0031565972048706544, "grad_norm": 0.10036790370941162, "kl": 0.051185326650738716, "learning_rate": 3e-06, "loss": -0.0004, "step": 1137 }, { "clip_ratio": 0.00037915847497060895, "epoch": 0.0031593734557104703, "grad_norm": 0.10827629268169403, "kl": 0.05169490538537502, "learning_rate": 3e-06, "loss": 0.001, "step": 1138 }, { "clip_ratio": 0.0004089097637915984, "epoch": 0.003162149706550286, "grad_norm": 0.0900021344423294, "kl": 0.052112411707639694, "learning_rate": 3e-06, "loss": 0.0002, "step": 1139 }, { "clip_ratio": 0.0002506030141375959, "epoch": 0.003164925957390102, "grad_norm": 0.09494884312152863, "kl": 0.05369776301085949, "learning_rate": 3e-06, "loss": -0.0003, "step": 1140 }, { "clip_ratio": 8.316699677379802e-05, "completion_length": 241.08334350585938, "epoch": 0.003167702208229918, "grad_norm": 0.14724688231945038, "kl": 0.050434716045856476, "learning_rate": 3e-06, "loss": 0.0052, "reward": 0.3750000149011612, "reward_std": 0.40321892499923706, "rewards/countdown_reward_func": 0.375, "step": 1141, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.0008169302745955065, "epoch": 0.003170478459069734, "grad_norm": 0.10081308335065842, "kl": 0.05200190842151642, "learning_rate": 3e-06, "loss": 0.005, "step": 1142 }, { "clip_ratio": 0.0002557880652602762, "epoch": 0.00317325470990955, "grad_norm": 0.11964285373687744, "kl": 0.05052047781646252, "learning_rate": 3e-06, "loss": 0.0048, "step": 1143 }, { "clip_ratio": 0.0005074728833278641, "epoch": 0.0031760309607493657, "grad_norm": 0.20048139989376068, "kl": 0.056764667853713036, "learning_rate": 3e-06, "loss": 0.006, "step": 1144 }, { "clip_ratio": 0.00027372263139113784, "epoch": 0.0031788072115891816, "grad_norm": 0.10537584125995636, "kl": 0.04961631819605827, "learning_rate": 3e-06, "loss": 0.0046, "step": 1145 }, { "clip_ratio": 0.00033895507658598945, "epoch": 0.0031815834624289974, "grad_norm": 0.11174456030130386, "kl": 0.0519944503903389, "learning_rate": 3e-06, "loss": 0.0037, "step": 1146 }, { "clip_ratio": 0.0, "epoch": 0.0031843597132688133, "grad_norm": 0.11552406847476959, "kl": 0.05385969392955303, "learning_rate": 3e-06, "loss": 0.0032, "step": 1147 }, { "clip_ratio": 0.0011523118009790778, "epoch": 0.003187135964108629, "grad_norm": 0.10118366032838821, "kl": 0.05519482307136059, "learning_rate": 3e-06, "loss": 0.0034, "step": 1148 }, { "clip_ratio": 0.0002575748658273369, "epoch": 0.003189912214948445, "grad_norm": 0.12076838314533234, "kl": 0.05428927019238472, "learning_rate": 3e-06, "loss": 0.0032, "step": 1149 }, { "clip_ratio": 0.0005272936105029657, "epoch": 0.0031926884657882608, "grad_norm": 0.1505252718925476, "kl": 0.06170363910496235, "learning_rate": 3e-06, "loss": 0.004, "step": 1150 }, { "clip_ratio": 0.00017786595708457753, "epoch": 0.0031954647166280766, "grad_norm": 0.0998905599117279, "kl": 0.054063014686107635, "learning_rate": 3e-06, "loss": 0.0033, "step": 1151 }, { "clip_ratio": 0.0005830957088619471, "epoch": 0.0031982409674678925, "grad_norm": 0.11353659629821777, "kl": 0.055830128490924835, "learning_rate": 3e-06, "loss": 0.003, "step": 1152 }, { "clip_ratio": 8.26719551696442e-05, "completion_length": 232.8541717529297, "epoch": 0.0032010172183077087, "grad_norm": 0.12211279571056366, "kl": 0.07155881449580193, "learning_rate": 3e-06, "loss": 0.0302, "reward": 0.3229166865348816, "reward_std": 0.24453017860651016, "rewards/countdown_reward_func": 0.3229166865348816, "step": 1153, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00018164265202358365, "epoch": 0.0032037934691475246, "grad_norm": 0.12927870452404022, "kl": 0.064654640853405, "learning_rate": 3e-06, "loss": 0.0298, "step": 1154 }, { "clip_ratio": 0.0001653439103392884, "epoch": 0.0032065697199873404, "grad_norm": 0.10473229736089706, "kl": 0.07196308299899101, "learning_rate": 3e-06, "loss": 0.0299, "step": 1155 }, { "clip_ratio": 0.00024557957658544183, "epoch": 0.0032093459708271562, "grad_norm": 0.1307552605867386, "kl": 0.06787708401679993, "learning_rate": 3e-06, "loss": 0.0292, "step": 1156 }, { "clip_ratio": 0.00028894259594380856, "epoch": 0.003212122221666972, "grad_norm": 0.1252485066652298, "kl": 0.07557385787367821, "learning_rate": 3e-06, "loss": 0.03, "step": 1157 }, { "clip_ratio": 0.0008342704823007807, "epoch": 0.003214898472506788, "grad_norm": 0.11512560397386551, "kl": 0.07205545529723167, "learning_rate": 3e-06, "loss": 0.0282, "step": 1158 }, { "clip_ratio": 8.406186680076644e-05, "epoch": 0.0032176747233466038, "grad_norm": 0.13281333446502686, "kl": 0.07855755090713501, "learning_rate": 3e-06, "loss": 0.0279, "step": 1159 }, { "clip_ratio": 0.0001690331264398992, "epoch": 0.0032204509741864196, "grad_norm": 0.12156619131565094, "kl": 0.07363288104534149, "learning_rate": 3e-06, "loss": 0.0271, "step": 1160 }, { "clip_ratio": 0.00041335978312417865, "epoch": 0.0032232272250262354, "grad_norm": 0.10087471455335617, "kl": 0.08428899571299553, "learning_rate": 3e-06, "loss": 0.0275, "step": 1161 }, { "clip_ratio": 0.0, "epoch": 0.0032260034758660513, "grad_norm": 0.14251810312271118, "kl": 0.07908787578344345, "learning_rate": 3e-06, "loss": 0.0252, "step": 1162 }, { "clip_ratio": 0.0003094059356953949, "epoch": 0.003228779726705867, "grad_norm": 0.11256741732358932, "kl": 0.08613381907343864, "learning_rate": 3e-06, "loss": 0.026, "step": 1163 }, { "clip_ratio": 0.00026637538394425064, "epoch": 0.0032315559775456834, "grad_norm": 0.1097400113940239, "kl": 0.08562665432691574, "learning_rate": 3e-06, "loss": 0.0253, "step": 1164 }, { "clip_ratio": 0.0001998317675315775, "completion_length": 213.89584350585938, "epoch": 0.0032343322283854992, "grad_norm": 0.087900809943676, "kl": 0.0852268636226654, "learning_rate": 3e-06, "loss": 0.0202, "reward": 0.4166666865348816, "reward_std": 0.3476388454437256, "rewards/countdown_reward_func": 0.4166666567325592, "step": 1165, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.003237108479225315, "grad_norm": 0.1092759519815445, "kl": 0.08935586363077164, "learning_rate": 3e-06, "loss": 0.02, "step": 1166 }, { "clip_ratio": 0.00037480452738236636, "epoch": 0.003239884730065131, "grad_norm": 0.1183471605181694, "kl": 0.09479113668203354, "learning_rate": 3e-06, "loss": 0.0208, "step": 1167 }, { "clip_ratio": 0.0002687285596039146, "epoch": 0.0032426609809049468, "grad_norm": 0.09400281310081482, "kl": 0.09442057460546494, "learning_rate": 3e-06, "loss": 0.0193, "step": 1168 }, { "clip_ratio": 0.00017308967653661966, "epoch": 0.0032454372317447626, "grad_norm": 0.14759588241577148, "kl": 0.09654445201158524, "learning_rate": 3e-06, "loss": 0.0188, "step": 1169 }, { "clip_ratio": 0.00020166092144791037, "epoch": 0.0032482134825845784, "grad_norm": 0.10944969207048416, "kl": 0.09353556111454964, "learning_rate": 3e-06, "loss": 0.0197, "step": 1170 }, { "clip_ratio": 0.00010879024921450764, "epoch": 0.0032509897334243943, "grad_norm": 0.10797934979200363, "kl": 0.10146218538284302, "learning_rate": 3e-06, "loss": 0.0192, "step": 1171 }, { "clip_ratio": 0.0008672036346979439, "epoch": 0.00325376598426421, "grad_norm": 0.11213571578264236, "kl": 0.1059374175965786, "learning_rate": 3e-06, "loss": 0.0183, "step": 1172 }, { "clip_ratio": 0.0018254909082315862, "epoch": 0.003256542235104026, "grad_norm": 0.1145535483956337, "kl": 0.1167490966618061, "learning_rate": 3e-06, "loss": 0.0203, "step": 1173 }, { "clip_ratio": 0.0010834443964995444, "epoch": 0.003259318485943842, "grad_norm": 0.08255753666162491, "kl": 0.11234157159924507, "learning_rate": 3e-06, "loss": 0.0177, "step": 1174 }, { "clip_ratio": 0.0012724358239211142, "epoch": 0.003262094736783658, "grad_norm": 0.10409360378980637, "kl": 0.11117475107312202, "learning_rate": 3e-06, "loss": 0.0181, "step": 1175 }, { "clip_ratio": 0.0022241021506488323, "epoch": 0.003264870987623474, "grad_norm": 0.09691368043422699, "kl": 0.10797763615846634, "learning_rate": 3e-06, "loss": 0.018, "step": 1176 }, { "clip_ratio": 0.0002880159590858966, "completion_length": 233.7291717529297, "epoch": 0.0032676472384632897, "grad_norm": 0.08466751873493195, "kl": 0.12909483164548874, "learning_rate": 3e-06, "loss": 0.0086, "reward": 0.2291666939854622, "reward_std": 0.2128555178642273, "rewards/countdown_reward_func": 0.2291666865348816, "step": 1177, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0003561055054888129, "epoch": 0.0032704234893031056, "grad_norm": 0.11577208340167999, "kl": 0.12361054494976997, "learning_rate": 3e-06, "loss": 0.0078, "step": 1178 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0032731997401429214, "grad_norm": 0.09205642342567444, "kl": 0.13038898259401321, "learning_rate": 3e-06, "loss": 0.0089, "step": 1179 }, { "clip_ratio": 0.000333176227286458, "epoch": 0.0032759759909827373, "grad_norm": 0.10060107707977295, "kl": 0.13378220051527023, "learning_rate": 3e-06, "loss": 0.0083, "step": 1180 }, { "clip_ratio": 0.00010113268945133314, "epoch": 0.003278752241822553, "grad_norm": 0.09801634401082993, "kl": 0.1360284462571144, "learning_rate": 3e-06, "loss": 0.0089, "step": 1181 }, { "clip_ratio": 0.0002441406322759576, "epoch": 0.003281528492662369, "grad_norm": 0.08726264536380768, "kl": 0.131295807659626, "learning_rate": 3e-06, "loss": 0.0089, "step": 1182 }, { "clip_ratio": 0.00017773196304915473, "epoch": 0.0032843047435021848, "grad_norm": 0.08691044896841049, "kl": 0.13674616813659668, "learning_rate": 3e-06, "loss": 0.0085, "step": 1183 }, { "clip_ratio": 0.00018315018678549677, "epoch": 0.0032870809943420006, "grad_norm": 0.11604570597410202, "kl": 0.1295301541686058, "learning_rate": 3e-06, "loss": 0.0073, "step": 1184 }, { "clip_ratio": 0.0007019351178314537, "epoch": 0.0032898572451818165, "grad_norm": 0.09062675386667252, "kl": 0.13159853219985962, "learning_rate": 3e-06, "loss": 0.0084, "step": 1185 }, { "clip_ratio": 0.0004526742995949462, "epoch": 0.0032926334960216327, "grad_norm": 0.10843745619058609, "kl": 0.13263515383005142, "learning_rate": 3e-06, "loss": 0.0079, "step": 1186 }, { "clip_ratio": 0.0006641855870839208, "epoch": 0.0032954097468614486, "grad_norm": 0.08221055567264557, "kl": 0.13184264674782753, "learning_rate": 3e-06, "loss": 0.0083, "step": 1187 }, { "clip_ratio": 0.0004921089857816696, "epoch": 0.0032981859977012644, "grad_norm": 0.08339028805494308, "kl": 0.1247389204800129, "learning_rate": 3e-06, "loss": 0.0072, "step": 1188 }, { "clip_ratio": 0.00020850708824582398, "completion_length": 234.0416717529297, "epoch": 0.0033009622485410802, "grad_norm": 0.11418038606643677, "kl": 0.10565116629004478, "learning_rate": 3e-06, "loss": -0.0015, "reward": 0.3437500298023224, "reward_std": 0.28535500913858414, "rewards/countdown_reward_func": 0.3437500149011612, "step": 1189, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.003303738499380896, "grad_norm": 0.12243412435054779, "kl": 0.09964673221111298, "learning_rate": 3e-06, "loss": -0.0021, "step": 1190 }, { "clip_ratio": 0.00028036253206664696, "epoch": 0.003306514750220712, "grad_norm": 0.17603905498981476, "kl": 0.10419896617531776, "learning_rate": 3e-06, "loss": -0.0001, "step": 1191 }, { "clip_ratio": 0.0001840942568378523, "epoch": 0.0033092910010605278, "grad_norm": 0.11945787817239761, "kl": 0.09894903376698494, "learning_rate": 3e-06, "loss": -0.0027, "step": 1192 }, { "clip_ratio": 8.406186680076644e-05, "epoch": 0.0033120672519003436, "grad_norm": 0.14571459591388702, "kl": 0.09924932196736336, "learning_rate": 3e-06, "loss": -0.0023, "step": 1193 }, { "clip_ratio": 0.000245529918174725, "epoch": 0.0033148435027401594, "grad_norm": 0.1262568235397339, "kl": 0.09315666556358337, "learning_rate": 3e-06, "loss": -0.0041, "step": 1194 }, { "clip_ratio": 0.00029078290390316397, "epoch": 0.0033176197535799753, "grad_norm": 0.12053867429494858, "kl": 0.09638616442680359, "learning_rate": 3e-06, "loss": -0.0028, "step": 1195 }, { "clip_ratio": 0.0008103728177957237, "epoch": 0.0033203960044197916, "grad_norm": 0.14339543879032135, "kl": 0.089625783264637, "learning_rate": 3e-06, "loss": -0.0034, "step": 1196 }, { "clip_ratio": 0.0003617427501012571, "epoch": 0.0033231722552596074, "grad_norm": 0.17292562127113342, "kl": 0.09256411343812943, "learning_rate": 3e-06, "loss": -0.0038, "step": 1197 }, { "clip_ratio": 0.0008070902986219153, "epoch": 0.0033259485060994232, "grad_norm": 0.11857297271490097, "kl": 0.08773430064320564, "learning_rate": 3e-06, "loss": -0.0032, "step": 1198 }, { "clip_ratio": 0.00044789022649638355, "epoch": 0.003328724756939239, "grad_norm": 0.1319698989391327, "kl": 0.08959746360778809, "learning_rate": 3e-06, "loss": -0.0056, "step": 1199 }, { "clip_ratio": 0.0012999755563214421, "epoch": 0.003331501007779055, "grad_norm": 0.1194002777338028, "kl": 0.08413466811180115, "learning_rate": 3e-06, "loss": -0.0052, "step": 1200 }, { "clip_ratio": 0.00024771419703029096, "completion_length": 225.64584350585938, "epoch": 0.0033342772586188708, "grad_norm": 0.07703583687543869, "kl": 0.09083598852157593, "learning_rate": 3e-06, "loss": 0.0078, "reward": 0.2291666939854622, "reward_std": 0.18291139975190163, "rewards/countdown_reward_func": 0.2291666865348816, "step": 1201, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00020563387079164386, "epoch": 0.0033370535094586866, "grad_norm": 0.0965060293674469, "kl": 0.09030065685510635, "learning_rate": 3e-06, "loss": 0.0065, "step": 1202 }, { "clip_ratio": 0.00019009599782293662, "epoch": 0.0033398297602985024, "grad_norm": 0.07806777209043503, "kl": 0.09229482710361481, "learning_rate": 3e-06, "loss": 0.0073, "step": 1203 }, { "clip_ratio": 0.00033928067568922415, "epoch": 0.0033426060111383183, "grad_norm": 0.08718756586313248, "kl": 0.08507254719734192, "learning_rate": 3e-06, "loss": 0.0068, "step": 1204 }, { "clip_ratio": 0.0, "epoch": 0.003345382261978134, "grad_norm": 0.08955029398202896, "kl": 0.08796864375472069, "learning_rate": 3e-06, "loss": 0.0063, "step": 1205 }, { "clip_ratio": 8.316699677379802e-05, "epoch": 0.00334815851281795, "grad_norm": 0.11132470518350601, "kl": 0.09032916277647018, "learning_rate": 3e-06, "loss": 0.007, "step": 1206 }, { "clip_ratio": 0.0002561136716394685, "epoch": 0.0033509347636577662, "grad_norm": 0.08454351872205734, "kl": 0.08703029155731201, "learning_rate": 3e-06, "loss": 0.0064, "step": 1207 }, { "clip_ratio": 9.137426968663931e-05, "epoch": 0.003353711014497582, "grad_norm": 0.08685325086116791, "kl": 0.08563976734876633, "learning_rate": 3e-06, "loss": 0.006, "step": 1208 }, { "clip_ratio": 0.0001069290010491386, "epoch": 0.003356487265337398, "grad_norm": 0.08751443773508072, "kl": 0.0876334123313427, "learning_rate": 3e-06, "loss": 0.007, "step": 1209 }, { "clip_ratio": 0.00042423446575412527, "epoch": 0.0033592635161772137, "grad_norm": 0.07404050976037979, "kl": 0.07978618890047073, "learning_rate": 3e-06, "loss": 0.0063, "step": 1210 }, { "clip_ratio": 0.0005211606621742249, "epoch": 0.0033620397670170296, "grad_norm": 0.1123705729842186, "kl": 0.08384502306580544, "learning_rate": 3e-06, "loss": 0.006, "step": 1211 }, { "clip_ratio": 0.00043046276550740004, "epoch": 0.0033648160178568454, "grad_norm": 0.09377264976501465, "kl": 0.08539973199367523, "learning_rate": 3e-06, "loss": 0.0065, "step": 1212 }, { "clip_ratio": 0.00017872503667604178, "completion_length": 238.9375, "epoch": 0.0033675922686966613, "grad_norm": 0.08966858685016632, "kl": 0.08921418339014053, "learning_rate": 3e-06, "loss": 0.0032, "reward": 0.26458335667848587, "reward_std": 0.2634518966078758, "rewards/countdown_reward_func": 0.26458335667848587, "step": 1213, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003323842174722813, "epoch": 0.003370368519536477, "grad_norm": 0.11603780835866928, "kl": 0.09003105014562607, "learning_rate": 3e-06, "loss": 0.0033, "step": 1214 }, { "clip_ratio": 9.505703201284632e-05, "epoch": 0.003373144770376293, "grad_norm": 0.12451639026403427, "kl": 0.08492600917816162, "learning_rate": 3e-06, "loss": 0.0038, "step": 1215 }, { "clip_ratio": 8.234519191319123e-05, "epoch": 0.0033759210212161088, "grad_norm": 0.09685535728931427, "kl": 0.08527249097824097, "learning_rate": 3e-06, "loss": 0.0028, "step": 1216 }, { "clip_ratio": 0.0005166697083041072, "epoch": 0.0033786972720559246, "grad_norm": 0.1449529230594635, "kl": 0.08535202592611313, "learning_rate": 3e-06, "loss": 0.0034, "step": 1217 }, { "clip_ratio": 0.0, "epoch": 0.003381473522895741, "grad_norm": 0.08282797783613205, "kl": 0.0859798900783062, "learning_rate": 3e-06, "loss": 0.0036, "step": 1218 }, { "clip_ratio": 0.00018502863531466573, "epoch": 0.0033842497737355567, "grad_norm": 0.09159551560878754, "kl": 0.08557010814547539, "learning_rate": 3e-06, "loss": 0.0027, "step": 1219 }, { "clip_ratio": 0.0002536034444347024, "epoch": 0.0033870260245753726, "grad_norm": 0.10851403325796127, "kl": 0.08524026349186897, "learning_rate": 3e-06, "loss": 0.0024, "step": 1220 }, { "clip_ratio": 0.0005241364997345954, "epoch": 0.0033898022754151884, "grad_norm": 0.1287156045436859, "kl": 0.08104551210999489, "learning_rate": 3e-06, "loss": 0.0025, "step": 1221 }, { "clip_ratio": 0.00017740222392603755, "epoch": 0.0033925785262550042, "grad_norm": 0.10037072002887726, "kl": 0.08082518354058266, "learning_rate": 3e-06, "loss": 0.002, "step": 1222 }, { "clip_ratio": 0.00017451102030463517, "epoch": 0.00339535477709482, "grad_norm": 0.13647525012493134, "kl": 0.08178085088729858, "learning_rate": 3e-06, "loss": 0.0009, "step": 1223 }, { "clip_ratio": 0.000411814013205003, "epoch": 0.003398131027934636, "grad_norm": 0.08645734190940857, "kl": 0.08200101554393768, "learning_rate": 3e-06, "loss": 0.0031, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 241.62500762939453, "epoch": 0.0034009072787744518, "grad_norm": 0.10560040175914764, "kl": 0.07796743884682655, "learning_rate": 3e-06, "loss": 0.0136, "reward": 0.26875001937150955, "reward_std": 0.2272602580487728, "rewards/countdown_reward_func": 0.26875001937150955, "step": 1225, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0034036835296142676, "grad_norm": 0.15348504483699799, "kl": 0.07507862150669098, "learning_rate": 3e-06, "loss": 0.014, "step": 1226 }, { "clip_ratio": 0.00033518215059302747, "epoch": 0.0034064597804540834, "grad_norm": 0.08251035213470459, "kl": 0.08217736333608627, "learning_rate": 3e-06, "loss": 0.015, "step": 1227 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0034092360312938993, "grad_norm": 0.11358001828193665, "kl": 0.07504938542842865, "learning_rate": 3e-06, "loss": 0.0148, "step": 1228 }, { "clip_ratio": 0.0003684598486870527, "epoch": 0.0034120122821337156, "grad_norm": 0.1041935384273529, "kl": 0.08010926097631454, "learning_rate": 3e-06, "loss": 0.0143, "step": 1229 }, { "clip_ratio": 0.0005087452591396868, "epoch": 0.0034147885329735314, "grad_norm": 0.09497429430484772, "kl": 0.07363765686750412, "learning_rate": 3e-06, "loss": 0.0139, "step": 1230 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0034175647838133472, "grad_norm": 0.08915600925683975, "kl": 0.07781890407204628, "learning_rate": 3e-06, "loss": 0.0131, "step": 1231 }, { "clip_ratio": 0.0003355148946866393, "epoch": 0.003420341034653163, "grad_norm": 0.0972413569688797, "kl": 0.07709966227412224, "learning_rate": 3e-06, "loss": 0.0131, "step": 1232 }, { "clip_ratio": 0.000244140625, "epoch": 0.003423117285492979, "grad_norm": 0.08852770924568176, "kl": 0.08478394895792007, "learning_rate": 3e-06, "loss": 0.0147, "step": 1233 }, { "clip_ratio": 0.0008937545935623348, "epoch": 0.0034258935363327947, "grad_norm": 0.11425919830799103, "kl": 0.07852787896990776, "learning_rate": 3e-06, "loss": 0.0144, "step": 1234 }, { "clip_ratio": 0.000438432558439672, "epoch": 0.0034286697871726106, "grad_norm": 0.09492716938257217, "kl": 0.08431283012032509, "learning_rate": 3e-06, "loss": 0.013, "step": 1235 }, { "clip_ratio": 0.0004982753162039444, "epoch": 0.0034314460380124264, "grad_norm": 0.10918039828538895, "kl": 0.07764238864183426, "learning_rate": 3e-06, "loss": 0.012, "step": 1236 }, { "clip_ratio": 8.821453957352787e-05, "completion_length": 239.8541717529297, "epoch": 0.0034342222888522423, "grad_norm": 0.1400684416294098, "kl": 0.0789659395813942, "learning_rate": 3e-06, "loss": 0.0171, "reward": 0.4375000149011612, "reward_std": 0.398020476102829, "rewards/countdown_reward_func": 0.4375000149011612, "step": 1237, "zero_std_ratio": 0.125 }, { "clip_ratio": 8.790435822447762e-05, "epoch": 0.003436998539692058, "grad_norm": 0.117515429854393, "kl": 0.08153605833649635, "learning_rate": 3e-06, "loss": 0.0183, "step": 1238 }, { "clip_ratio": 0.0001731122611090541, "epoch": 0.003439774790531874, "grad_norm": 0.12731020152568817, "kl": 0.08211733773350716, "learning_rate": 3e-06, "loss": 0.0186, "step": 1239 }, { "clip_ratio": 0.00017580871644895524, "epoch": 0.0034425510413716902, "grad_norm": 0.1385928988456726, "kl": 0.07959434390068054, "learning_rate": 3e-06, "loss": 0.0168, "step": 1240 }, { "clip_ratio": 0.00017041582032106817, "epoch": 0.003445327292211506, "grad_norm": 0.12391113489866257, "kl": 0.08285382017493248, "learning_rate": 3e-06, "loss": 0.0182, "step": 1241 }, { "clip_ratio": 0.0, "epoch": 0.003448103543051322, "grad_norm": 0.10620970278978348, "kl": 0.0919383093714714, "learning_rate": 3e-06, "loss": 0.0182, "step": 1242 }, { "clip_ratio": 8.821453957352787e-05, "epoch": 0.0034508797938911377, "grad_norm": 0.1453152298927307, "kl": 0.08631188422441483, "learning_rate": 3e-06, "loss": 0.0174, "step": 1243 }, { "clip_ratio": 8.884150884114206e-05, "epoch": 0.0034536560447309536, "grad_norm": 0.11560267210006714, "kl": 0.08962936699390411, "learning_rate": 3e-06, "loss": 0.0167, "step": 1244 }, { "clip_ratio": 0.000705406149791088, "epoch": 0.0034564322955707694, "grad_norm": 0.13734284043312073, "kl": 0.09047690033912659, "learning_rate": 3e-06, "loss": 0.0165, "step": 1245 }, { "clip_ratio": 8.234519191319123e-05, "epoch": 0.0034592085464105853, "grad_norm": 0.11958130449056625, "kl": 0.08676190301775932, "learning_rate": 3e-06, "loss": 0.0159, "step": 1246 }, { "clip_ratio": 0.00025863035989459604, "epoch": 0.003461984797250401, "grad_norm": 0.13450902700424194, "kl": 0.09278702363371849, "learning_rate": 3e-06, "loss": 0.0169, "step": 1247 }, { "clip_ratio": 8.884150884114206e-05, "epoch": 0.003464761048090217, "grad_norm": 0.11801480501890182, "kl": 0.1030571274459362, "learning_rate": 3e-06, "loss": 0.0163, "step": 1248 }, { "clip_ratio": 9.231905278284103e-05, "completion_length": 225.1041717529297, "epoch": 0.0034675372989300328, "grad_norm": 0.13714726269245148, "kl": 0.09993145614862442, "learning_rate": 3e-06, "loss": 0.0016, "reward": 0.30000002682209015, "reward_std": 0.30601368844509125, "rewards/countdown_reward_func": 0.30000000447034836, "step": 1249, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00034739852708298713, "epoch": 0.0034703135497698486, "grad_norm": 0.11648391932249069, "kl": 0.10220260173082352, "learning_rate": 3e-06, "loss": 0.0002, "step": 1250 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.003473089800609665, "grad_norm": 0.0945407897233963, "kl": 0.11158350110054016, "learning_rate": 3e-06, "loss": 0.0013, "step": 1251 }, { "clip_ratio": 0.00010683760774554685, "epoch": 0.0034758660514494807, "grad_norm": 0.1068953201174736, "kl": 0.1054844819009304, "learning_rate": 3e-06, "loss": 0.001, "step": 1252 }, { "clip_ratio": 0.00010416666918899864, "epoch": 0.0034786423022892966, "grad_norm": 0.1203155517578125, "kl": 0.11958234012126923, "learning_rate": 3e-06, "loss": 0.0009, "step": 1253 }, { "clip_ratio": 0.00018602220370667055, "epoch": 0.0034814185531291124, "grad_norm": 0.13715901970863342, "kl": 0.11173819750547409, "learning_rate": 3e-06, "loss": 0.0016, "step": 1254 }, { "clip_ratio": 9.231905278284103e-05, "epoch": 0.0034841948039689282, "grad_norm": 0.14036564528942108, "kl": 0.10752937197685242, "learning_rate": 3e-06, "loss": 0.0002, "step": 1255 }, { "clip_ratio": 0.0004339037259342149, "epoch": 0.003486971054808744, "grad_norm": 0.1094101294875145, "kl": 0.10794253274798393, "learning_rate": 3e-06, "loss": -0.0001, "step": 1256 }, { "clip_ratio": 0.0002743252844084054, "epoch": 0.00348974730564856, "grad_norm": 0.1030493676662445, "kl": 0.11747561022639275, "learning_rate": 3e-06, "loss": 0.0004, "step": 1257 }, { "clip_ratio": 0.0, "epoch": 0.0034925235564883758, "grad_norm": 0.09440533816814423, "kl": 0.11023146659135818, "learning_rate": 3e-06, "loss": 0.0007, "step": 1258 }, { "clip_ratio": 0.000180825540155638, "epoch": 0.0034952998073281916, "grad_norm": 0.11672715842723846, "kl": 0.1223120354115963, "learning_rate": 3e-06, "loss": -0.0004, "step": 1259 }, { "clip_ratio": 0.00018602220370667055, "epoch": 0.0034980760581680074, "grad_norm": 0.14074575901031494, "kl": 0.11307071894407272, "learning_rate": 3e-06, "loss": -0.0001, "step": 1260 }, { "clip_ratio": 0.00016666666488163173, "completion_length": 237.3125, "epoch": 0.0035008523090078233, "grad_norm": 0.10606394708156586, "kl": 0.10512935742735863, "learning_rate": 3e-06, "loss": -0.0002, "reward": 0.23125001043081284, "reward_std": 0.2202121764421463, "rewards/countdown_reward_func": 0.23125001043081284, "step": 1261, "zero_std_ratio": 0.5 }, { "clip_ratio": 9.412650251761079e-05, "epoch": 0.0035036285598476396, "grad_norm": 0.11450087279081345, "kl": 0.10360938310623169, "learning_rate": 3e-06, "loss": 0.0004, "step": 1262 }, { "clip_ratio": 0.0, "epoch": 0.0035064048106874554, "grad_norm": 0.09911279380321503, "kl": 0.11000372096896172, "learning_rate": 3e-06, "loss": -0.0002, "step": 1263 }, { "clip_ratio": 0.00035296654095873237, "epoch": 0.0035091810615272712, "grad_norm": 0.09363103657960892, "kl": 0.10553918406367302, "learning_rate": 3e-06, "loss": -0.0005, "step": 1264 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.003511957312367087, "grad_norm": 0.1504822075366974, "kl": 0.10735897347331047, "learning_rate": 3e-06, "loss": 0.0005, "step": 1265 }, { "clip_ratio": 0.00025724637089297175, "epoch": 0.003514733563206903, "grad_norm": 0.08245550096035004, "kl": 0.10223409533500671, "learning_rate": 3e-06, "loss": 0.0002, "step": 1266 }, { "clip_ratio": 0.00024871622008504346, "epoch": 0.0035175098140467187, "grad_norm": 0.09745687991380692, "kl": 0.1023399755358696, "learning_rate": 3e-06, "loss": -0.001, "step": 1267 }, { "clip_ratio": 0.0002794273314066231, "epoch": 0.0035202860648865346, "grad_norm": 0.09371484816074371, "kl": 0.09988303855061531, "learning_rate": 3e-06, "loss": -0.0003, "step": 1268 }, { "clip_ratio": 8.896797226043418e-05, "epoch": 0.0035230623157263504, "grad_norm": 0.12038465589284897, "kl": 0.10891411453485489, "learning_rate": 3e-06, "loss": -0.0019, "step": 1269 }, { "clip_ratio": 0.0005968022960587405, "epoch": 0.0035258385665661663, "grad_norm": 0.10312320291996002, "kl": 0.1005222424864769, "learning_rate": 3e-06, "loss": -0.0011, "step": 1270 }, { "clip_ratio": 0.0006010157812852412, "epoch": 0.003528614817405982, "grad_norm": 0.16399547457695007, "kl": 0.10354287177324295, "learning_rate": 3e-06, "loss": -0.0024, "step": 1271 }, { "clip_ratio": 0.00042980091529898345, "epoch": 0.003531391068245798, "grad_norm": 0.08301949501037598, "kl": 0.09836991503834724, "learning_rate": 3e-06, "loss": -0.0004, "step": 1272 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 243.81250762939453, "epoch": 0.0035341673190856142, "grad_norm": 0.08151868730783463, "kl": 0.09731181338429451, "learning_rate": 3e-06, "loss": 0.0097, "reward": 0.19375000149011612, "reward_std": 0.15347465127706528, "rewards/countdown_reward_func": 0.19375000149011612, "step": 1273, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.00353694356992543, "grad_norm": 0.08378417789936066, "kl": 0.09494262933731079, "learning_rate": 3e-06, "loss": 0.0091, "step": 1274 }, { "clip_ratio": 0.00034852556564146653, "epoch": 0.003539719820765246, "grad_norm": 0.06453592330217361, "kl": 0.09252246841788292, "learning_rate": 3e-06, "loss": 0.009, "step": 1275 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0035424960716050617, "grad_norm": 0.059850361198186874, "kl": 0.09488331153988838, "learning_rate": 3e-06, "loss": 0.009, "step": 1276 }, { "clip_ratio": 0.0005926662124693394, "epoch": 0.0035452723224448776, "grad_norm": 0.08670902252197266, "kl": 0.09074336290359497, "learning_rate": 3e-06, "loss": 0.0087, "step": 1277 }, { "clip_ratio": 0.0004440398042788729, "epoch": 0.0035480485732846934, "grad_norm": 0.14446234703063965, "kl": 0.09088549762964249, "learning_rate": 3e-06, "loss": 0.0092, "step": 1278 }, { "clip_ratio": 0.0, "epoch": 0.0035508248241245093, "grad_norm": 0.07895587384700775, "kl": 0.09484604373574257, "learning_rate": 3e-06, "loss": 0.0086, "step": 1279 }, { "clip_ratio": 0.000244140625, "epoch": 0.003553601074964325, "grad_norm": 0.07360237091779709, "kl": 0.09300564974546432, "learning_rate": 3e-06, "loss": 0.0093, "step": 1280 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.003556377325804141, "grad_norm": 0.06718261539936066, "kl": 0.09006435796618462, "learning_rate": 3e-06, "loss": 0.0091, "step": 1281 }, { "clip_ratio": 0.00025739153352333233, "epoch": 0.0035591535766439568, "grad_norm": 0.06159327179193497, "kl": 0.09368356689810753, "learning_rate": 3e-06, "loss": 0.0087, "step": 1282 }, { "clip_ratio": 0.00044258125126361847, "epoch": 0.0035619298274837726, "grad_norm": 0.1282200962305069, "kl": 0.08870528638362885, "learning_rate": 3e-06, "loss": 0.0088, "step": 1283 }, { "clip_ratio": 0.0005040145115344785, "epoch": 0.003564706078323589, "grad_norm": 0.13320301473140717, "kl": 0.0886475220322609, "learning_rate": 3e-06, "loss": 0.0076, "step": 1284 }, { "clip_ratio": 0.0005532470531761646, "completion_length": 223.64584350585938, "epoch": 0.0035674823291634047, "grad_norm": 0.11603424698114395, "kl": 0.08919220045208931, "learning_rate": 3e-06, "loss": -0.0095, "reward": 0.27916668355464935, "reward_std": 0.3674810379743576, "rewards/countdown_reward_func": 0.27916668355464935, "step": 1285, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.0001726210757624358, "epoch": 0.0035702585800032206, "grad_norm": 0.11266878992319107, "kl": 0.09426239505410194, "learning_rate": 3e-06, "loss": -0.0086, "step": 1286 }, { "clip_ratio": 0.00018248174455948174, "epoch": 0.0035730348308430364, "grad_norm": 0.1493123471736908, "kl": 0.10469094663858414, "learning_rate": 3e-06, "loss": -0.0097, "step": 1287 }, { "clip_ratio": 9.272996976505965e-05, "epoch": 0.0035758110816828522, "grad_norm": 0.1302638202905655, "kl": 0.09487400203943253, "learning_rate": 3e-06, "loss": -0.0106, "step": 1288 }, { "clip_ratio": 0.0001050420178216882, "epoch": 0.003578587332522668, "grad_norm": 0.11855723708868027, "kl": 0.09268858656287193, "learning_rate": 3e-06, "loss": -0.0108, "step": 1289 }, { "clip_ratio": 9.124087227974087e-05, "epoch": 0.003581363583362484, "grad_norm": 0.0906294658780098, "kl": 0.10145439580082893, "learning_rate": 3e-06, "loss": -0.009, "step": 1290 }, { "clip_ratio": 0.00035833738365909085, "epoch": 0.0035841398342022998, "grad_norm": 0.10448364913463593, "kl": 0.0870097205042839, "learning_rate": 3e-06, "loss": -0.0114, "step": 1291 }, { "clip_ratio": 0.0001726210757624358, "epoch": 0.0035869160850421156, "grad_norm": 0.12192343175411224, "kl": 0.09259336069226265, "learning_rate": 3e-06, "loss": -0.0107, "step": 1292 }, { "clip_ratio": 0.00018248174455948174, "epoch": 0.0035896923358819314, "grad_norm": 0.12048960477113724, "kl": 0.10295253619551659, "learning_rate": 3e-06, "loss": -0.0115, "step": 1293 }, { "clip_ratio": 9.272996976505965e-05, "epoch": 0.0035924685867217473, "grad_norm": 0.12977741658687592, "kl": 0.0927329771220684, "learning_rate": 3e-06, "loss": -0.0141, "step": 1294 }, { "clip_ratio": 0.0003834426242974587, "epoch": 0.0035952448375615636, "grad_norm": 0.1129441037774086, "kl": 0.09166432544589043, "learning_rate": 3e-06, "loss": -0.0121, "step": 1295 }, { "clip_ratio": 0.0004474218876566738, "epoch": 0.0035980210884013794, "grad_norm": 0.10428842157125473, "kl": 0.10321545228362083, "learning_rate": 3e-06, "loss": -0.0114, "step": 1296 }, { "clip_ratio": 0.00017595020472072065, "completion_length": 240.0625, "epoch": 0.0036007973392411952, "grad_norm": 0.11313264816999435, "kl": 0.08579898625612259, "learning_rate": 3e-06, "loss": 0.0105, "reward": 0.18958333879709244, "reward_std": 0.20279134064912796, "rewards/countdown_reward_func": 0.18958333134651184, "step": 1297, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00016954136663116515, "epoch": 0.003603573590081011, "grad_norm": 0.08754347264766693, "kl": 0.07732414454221725, "learning_rate": 3e-06, "loss": 0.0108, "step": 1298 }, { "clip_ratio": 0.00021079258294776082, "epoch": 0.003606349840920827, "grad_norm": 0.09487965703010559, "kl": 0.07977383211255074, "learning_rate": 3e-06, "loss": 0.0106, "step": 1299 }, { "clip_ratio": 8.468834857922047e-05, "epoch": 0.0036091260917606427, "grad_norm": 0.08399857580661774, "kl": 0.08700349926948547, "learning_rate": 3e-06, "loss": 0.0118, "step": 1300 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0036119023426004586, "grad_norm": 0.06855960935354233, "kl": 0.07815397530794144, "learning_rate": 3e-06, "loss": 0.0115, "step": 1301 }, { "clip_ratio": 0.0002462810225551948, "epoch": 0.0036146785934402744, "grad_norm": 0.0689474567770958, "kl": 0.07884856685996056, "learning_rate": 3e-06, "loss": 0.0115, "step": 1302 }, { "clip_ratio": 0.00018677650223253295, "epoch": 0.0036174548442800903, "grad_norm": 0.10885361582040787, "kl": 0.08625783771276474, "learning_rate": 3e-06, "loss": 0.0108, "step": 1303 }, { "clip_ratio": 0.00017632621165830642, "epoch": 0.003620231095119906, "grad_norm": 0.07666079699993134, "kl": 0.07669586688280106, "learning_rate": 3e-06, "loss": 0.01, "step": 1304 }, { "clip_ratio": 0.0002925453591160476, "epoch": 0.003623007345959722, "grad_norm": 0.08154989778995514, "kl": 0.07938069105148315, "learning_rate": 3e-06, "loss": 0.01, "step": 1305 }, { "clip_ratio": 0.0, "epoch": 0.0036257835967995382, "grad_norm": 0.0733746886253357, "kl": 0.08602719753980637, "learning_rate": 3e-06, "loss": 0.0106, "step": 1306 }, { "clip_ratio": 0.0002507569151930511, "epoch": 0.003628559847639354, "grad_norm": 0.06607773900032043, "kl": 0.07838872075080872, "learning_rate": 3e-06, "loss": 0.0107, "step": 1307 }, { "clip_ratio": 0.0004108092689421028, "epoch": 0.00363133609847917, "grad_norm": 0.07465820014476776, "kl": 0.07868605107069016, "learning_rate": 3e-06, "loss": 0.0115, "step": 1308 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 249.0416717529297, "epoch": 0.0036341123493189857, "grad_norm": 0.10679054260253906, "kl": 0.07724452763795853, "learning_rate": 3e-06, "loss": -0.0055, "reward": 0.32500002533197403, "reward_std": 0.27318819612264633, "rewards/countdown_reward_func": 0.32499999552965164, "step": 1309, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.645061800256371e-05, "epoch": 0.0036368886001588016, "grad_norm": 0.08431645482778549, "kl": 0.07454115152359009, "learning_rate": 3e-06, "loss": -0.0059, "step": 1310 }, { "clip_ratio": 0.00041751094977371395, "epoch": 0.0036396648509986174, "grad_norm": 0.10150210559368134, "kl": 0.07631824165582657, "learning_rate": 3e-06, "loss": -0.0058, "step": 1311 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0036424411018384333, "grad_norm": 0.08061017096042633, "kl": 0.07825561240315437, "learning_rate": 3e-06, "loss": -0.0056, "step": 1312 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.003645217352678249, "grad_norm": 0.1347588747739792, "kl": 0.07661932334303856, "learning_rate": 3e-06, "loss": -0.0053, "step": 1313 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.003647993603518065, "grad_norm": 0.09667013585567474, "kl": 0.07734496891498566, "learning_rate": 3e-06, "loss": -0.0057, "step": 1314 }, { "clip_ratio": 0.0, "epoch": 0.0036507698543578808, "grad_norm": 0.11161034554243088, "kl": 0.07669401913881302, "learning_rate": 3e-06, "loss": -0.0062, "step": 1315 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0036535461051976966, "grad_norm": 0.08601905405521393, "kl": 0.07428323850035667, "learning_rate": 3e-06, "loss": -0.0066, "step": 1316 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.003656322356037513, "grad_norm": 0.11961396038532257, "kl": 0.07602384686470032, "learning_rate": 3e-06, "loss": -0.0068, "step": 1317 }, { "clip_ratio": 0.0009102527255890891, "epoch": 0.0036590986068773287, "grad_norm": 0.08503416925668716, "kl": 0.0780373215675354, "learning_rate": 3e-06, "loss": -0.0059, "step": 1318 }, { "clip_ratio": 0.000244140625, "epoch": 0.0036618748577171446, "grad_norm": 0.13413764536380768, "kl": 0.07871726900339127, "learning_rate": 3e-06, "loss": -0.0072, "step": 1319 }, { "clip_ratio": 0.00017783083603717387, "epoch": 0.0036646511085569604, "grad_norm": 0.16159453988075256, "kl": 0.07949946075677872, "learning_rate": 3e-06, "loss": -0.0074, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 224.3541717529297, "epoch": 0.0036674273593967762, "grad_norm": 0.12504050135612488, "kl": 0.08029628917574883, "learning_rate": 3e-06, "loss": 0.0344, "reward": 0.3583333492279053, "reward_std": 0.3268071115016937, "rewards/countdown_reward_func": 0.3583333492279053, "step": 1321, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0005550031710299663, "epoch": 0.003670203610236592, "grad_norm": 0.14056633412837982, "kl": 0.08753347396850586, "learning_rate": 3e-06, "loss": 0.0343, "step": 1322 }, { "clip_ratio": 0.0009008992274175398, "epoch": 0.003672979861076408, "grad_norm": 0.1351320594549179, "kl": 0.0846715196967125, "learning_rate": 3e-06, "loss": 0.0351, "step": 1323 }, { "clip_ratio": 0.00020169082563370466, "epoch": 0.0036757561119162238, "grad_norm": 0.16348907351493835, "kl": 0.07923517376184464, "learning_rate": 3e-06, "loss": 0.0341, "step": 1324 }, { "clip_ratio": 0.0011630582448560745, "epoch": 0.0036785323627560396, "grad_norm": 0.1238764300942421, "kl": 0.07697004824876785, "learning_rate": 3e-06, "loss": 0.0332, "step": 1325 }, { "clip_ratio": 0.00018680757784750313, "epoch": 0.0036813086135958554, "grad_norm": 0.13604602217674255, "kl": 0.08331404998898506, "learning_rate": 3e-06, "loss": 0.0335, "step": 1326 }, { "clip_ratio": 0.0, "epoch": 0.0036840848644356713, "grad_norm": 0.12275619804859161, "kl": 0.08377550542354584, "learning_rate": 3e-06, "loss": 0.0324, "step": 1327 }, { "clip_ratio": 0.0008797913324087858, "epoch": 0.0036868611152754876, "grad_norm": 0.13052096962928772, "kl": 0.09020379930734634, "learning_rate": 3e-06, "loss": 0.032, "step": 1328 }, { "clip_ratio": 0.0005085435113869607, "epoch": 0.0036896373661153034, "grad_norm": 0.1253458559513092, "kl": 0.0886102207005024, "learning_rate": 3e-06, "loss": 0.0316, "step": 1329 }, { "clip_ratio": 0.000303602428175509, "epoch": 0.0036924136169551192, "grad_norm": 0.16655124723911285, "kl": 0.08304303884506226, "learning_rate": 3e-06, "loss": 0.0295, "step": 1330 }, { "clip_ratio": 0.0007008319735177793, "epoch": 0.003695189867794935, "grad_norm": 0.11321963369846344, "kl": 0.08307855576276779, "learning_rate": 3e-06, "loss": 0.0305, "step": 1331 }, { "clip_ratio": 0.0001017087051877752, "epoch": 0.003697966118634751, "grad_norm": 0.12860439717769623, "kl": 0.09165726974606514, "learning_rate": 3e-06, "loss": 0.0293, "step": 1332 }, { "clip_ratio": 0.00017229790682904422, "completion_length": 233.58333587646484, "epoch": 0.0037007423694745667, "grad_norm": 0.12028127908706665, "kl": 0.10655560344457626, "learning_rate": 3e-06, "loss": 0.0049, "reward": 0.2666666731238365, "reward_std": 0.2709502577781677, "rewards/countdown_reward_func": 0.2666666731238365, "step": 1333, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0037035186203143826, "grad_norm": 0.09744270890951157, "kl": 0.11018062755465508, "learning_rate": 3e-06, "loss": 0.0061, "step": 1334 }, { "clip_ratio": 8.579272252973169e-05, "epoch": 0.0037062948711541984, "grad_norm": 0.09601444751024246, "kl": 0.10575228929519653, "learning_rate": 3e-06, "loss": 0.0058, "step": 1335 }, { "clip_ratio": 0.0, "epoch": 0.0037090711219940143, "grad_norm": 0.11352216452360153, "kl": 0.10562450438737869, "learning_rate": 3e-06, "loss": 0.0053, "step": 1336 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00371184737283383, "grad_norm": 0.0934673398733139, "kl": 0.11037517711520195, "learning_rate": 3e-06, "loss": 0.0042, "step": 1337 }, { "clip_ratio": 0.0005623652250505984, "epoch": 0.0037146236236736464, "grad_norm": 0.09364868700504303, "kl": 0.10772473365068436, "learning_rate": 3e-06, "loss": 0.0057, "step": 1338 }, { "clip_ratio": 9.377344395034015e-05, "epoch": 0.0037173998745134622, "grad_norm": 0.2115117907524109, "kl": 0.12028898671269417, "learning_rate": 3e-06, "loss": 0.0033, "step": 1339 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.003720176125353278, "grad_norm": 0.10891567170619965, "kl": 0.12114686891436577, "learning_rate": 3e-06, "loss": 0.0054, "step": 1340 }, { "clip_ratio": 8.650519157527015e-05, "epoch": 0.003722952376193094, "grad_norm": 0.09488851577043533, "kl": 0.11187062785029411, "learning_rate": 3e-06, "loss": 0.0057, "step": 1341 }, { "clip_ratio": 0.000176635745447129, "epoch": 0.0037257286270329097, "grad_norm": 0.1026405617594719, "kl": 0.10968057066202164, "learning_rate": 3e-06, "loss": 0.0054, "step": 1342 }, { "clip_ratio": 0.0003423265879973769, "epoch": 0.0037285048778727256, "grad_norm": 0.10193421691656113, "kl": 0.11749966815114021, "learning_rate": 3e-06, "loss": 0.0035, "step": 1343 }, { "clip_ratio": 0.0011813296005129814, "epoch": 0.0037312811287125414, "grad_norm": 0.09785063564777374, "kl": 0.10901345312595367, "learning_rate": 3e-06, "loss": 0.0051, "step": 1344 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 224.70834350585938, "epoch": 0.0037340573795523573, "grad_norm": 0.15201179683208466, "kl": 0.11200324073433876, "learning_rate": 3e-06, "loss": 0.0035, "reward": 0.33750002086162567, "reward_std": 0.29471276700496674, "rewards/countdown_reward_func": 0.33750002086162567, "step": 1345, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.003736833630392173, "grad_norm": 0.12798374891281128, "kl": 0.11272676661610603, "learning_rate": 3e-06, "loss": 0.0036, "step": 1346 }, { "clip_ratio": 0.0, "epoch": 0.003739609881231989, "grad_norm": 0.12021691352128983, "kl": 0.11317825317382812, "learning_rate": 3e-06, "loss": 0.0036, "step": 1347 }, { "clip_ratio": 9.448223863728344e-05, "epoch": 0.0037423861320718048, "grad_norm": 0.11852817982435226, "kl": 0.11661011353135109, "learning_rate": 3e-06, "loss": 0.0027, "step": 1348 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.003745162382911621, "grad_norm": 0.15811295807361603, "kl": 0.1139235608279705, "learning_rate": 3e-06, "loss": 0.0029, "step": 1349 }, { "clip_ratio": 0.0, "epoch": 0.003747938633751437, "grad_norm": 0.14151591062545776, "kl": 0.10737059265375137, "learning_rate": 3e-06, "loss": 0.0033, "step": 1350 }, { "clip_ratio": 0.0002975761817651801, "epoch": 0.0037507148845912527, "grad_norm": 0.11291375011205673, "kl": 0.10967541486024857, "learning_rate": 3e-06, "loss": 0.0032, "step": 1351 }, { "clip_ratio": 0.0, "epoch": 0.0037534911354310686, "grad_norm": 0.1173899918794632, "kl": 0.10674367100000381, "learning_rate": 3e-06, "loss": 0.0013, "step": 1352 }, { "clip_ratio": 0.0, "epoch": 0.0037562673862708844, "grad_norm": 0.13352568447589874, "kl": 0.10739367455244064, "learning_rate": 3e-06, "loss": 0.0019, "step": 1353 }, { "clip_ratio": 8.229097875300795e-05, "epoch": 0.0037590436371107002, "grad_norm": 0.10833993554115295, "kl": 0.11009395867586136, "learning_rate": 3e-06, "loss": 0.0012, "step": 1354 }, { "clip_ratio": 0.0004950694419676438, "epoch": 0.003761819887950516, "grad_norm": 0.1272728592157364, "kl": 0.106673963367939, "learning_rate": 3e-06, "loss": 0.0015, "step": 1355 }, { "clip_ratio": 0.0003583999059628695, "epoch": 0.003764596138790332, "grad_norm": 0.13600032031536102, "kl": 0.10148491337895393, "learning_rate": 3e-06, "loss": 0.0016, "step": 1356 }, { "clip_ratio": 0.00035957014188170433, "completion_length": 235.6875, "epoch": 0.0037673723896301478, "grad_norm": 0.1315588802099228, "kl": 0.11271170154213905, "learning_rate": 3e-06, "loss": 0.0092, "reward": 0.2854166775941849, "reward_std": 0.23748210817575455, "rewards/countdown_reward_func": 0.2854166775941849, "step": 1357, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.37240440887399e-05, "epoch": 0.0037701486404699636, "grad_norm": 0.10195645689964294, "kl": 0.11273347586393356, "learning_rate": 3e-06, "loss": 0.0096, "step": 1358 }, { "clip_ratio": 0.000245323171839118, "epoch": 0.0037729248913097794, "grad_norm": 0.10797201097011566, "kl": 0.10941316187381744, "learning_rate": 3e-06, "loss": 0.0089, "step": 1359 }, { "clip_ratio": 0.00018649760022526607, "epoch": 0.0037757011421495957, "grad_norm": 0.092195063829422, "kl": 0.10438445582985878, "learning_rate": 3e-06, "loss": 0.0092, "step": 1360 }, { "clip_ratio": 0.0003678085922729224, "epoch": 0.0037784773929894115, "grad_norm": 0.1453506350517273, "kl": 0.10365356132388115, "learning_rate": 3e-06, "loss": 0.0098, "step": 1361 }, { "clip_ratio": 0.00017496491636848077, "epoch": 0.0037812536438292274, "grad_norm": 0.12429957091808319, "kl": 0.10129589587450027, "learning_rate": 3e-06, "loss": 0.0095, "step": 1362 }, { "clip_ratio": 9.272996976505965e-05, "epoch": 0.0037840298946690432, "grad_norm": 0.2589426636695862, "kl": 0.10026075318455696, "learning_rate": 3e-06, "loss": 0.008, "step": 1363 }, { "clip_ratio": 0.00017496491636848077, "epoch": 0.003786806145508859, "grad_norm": 0.09787080436944962, "kl": 0.09990701079368591, "learning_rate": 3e-06, "loss": 0.009, "step": 1364 }, { "clip_ratio": 0.0006807727040722966, "epoch": 0.003789582396348675, "grad_norm": 0.1061621904373169, "kl": 0.09625846892595291, "learning_rate": 3e-06, "loss": 0.0082, "step": 1365 }, { "clip_ratio": 0.0002560440043453127, "epoch": 0.0037923586471884907, "grad_norm": 0.09652417153120041, "kl": 0.0922749936580658, "learning_rate": 3e-06, "loss": 0.0085, "step": 1366 }, { "clip_ratio": 0.0006315374630503356, "epoch": 0.0037951348980283066, "grad_norm": 0.1492643803358078, "kl": 0.09287701919674873, "learning_rate": 3e-06, "loss": 0.0089, "step": 1367 }, { "clip_ratio": 0.0005454363708849996, "epoch": 0.0037979111488681224, "grad_norm": 0.12842383980751038, "kl": 0.0905197411775589, "learning_rate": 3e-06, "loss": 0.0081, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 230.39583587646484, "epoch": 0.0038006873997079383, "grad_norm": 0.11756516993045807, "kl": 0.08745963126420975, "learning_rate": 3e-06, "loss": -0.01, "reward": 0.3229166865348816, "reward_std": 0.29132401943206787, "rewards/countdown_reward_func": 0.3229166865348816, "step": 1369, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00017399911303073168, "epoch": 0.003803463650547754, "grad_norm": 0.12292972207069397, "kl": 0.09213906899094582, "learning_rate": 3e-06, "loss": -0.0102, "step": 1370 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0038062399013875704, "grad_norm": 0.10542229562997818, "kl": 0.08154939487576485, "learning_rate": 3e-06, "loss": -0.0099, "step": 1371 }, { "clip_ratio": 0.0002737229297054, "epoch": 0.003809016152227386, "grad_norm": 0.13350439071655273, "kl": 0.08433039486408234, "learning_rate": 3e-06, "loss": -0.0102, "step": 1372 }, { "clip_ratio": 0.000268786505330354, "epoch": 0.003811792403067202, "grad_norm": 0.11377749592065811, "kl": 0.08954215422272682, "learning_rate": 3e-06, "loss": -0.0088, "step": 1373 }, { "clip_ratio": 0.00017508336168248206, "epoch": 0.003814568653907018, "grad_norm": 0.1519603431224823, "kl": 0.07980604469776154, "learning_rate": 3e-06, "loss": -0.0121, "step": 1374 }, { "clip_ratio": 0.00018208303663413972, "epoch": 0.0038173449047468337, "grad_norm": 0.11631568521261215, "kl": 0.07712772488594055, "learning_rate": 3e-06, "loss": -0.0112, "step": 1375 }, { "clip_ratio": 0.00018214939336758107, "epoch": 0.0038201211555866496, "grad_norm": 0.10698658227920532, "kl": 0.08292505145072937, "learning_rate": 3e-06, "loss": -0.0116, "step": 1376 }, { "clip_ratio": 0.0005369195641833358, "epoch": 0.0038228974064264654, "grad_norm": 0.1011744886636734, "kl": 0.07199421525001526, "learning_rate": 3e-06, "loss": -0.0121, "step": 1377 }, { "clip_ratio": 0.000721444986993447, "epoch": 0.0038256736572662813, "grad_norm": 0.11945533007383347, "kl": 0.07399719953536987, "learning_rate": 3e-06, "loss": -0.0122, "step": 1378 }, { "clip_ratio": 0.0008790974534349516, "epoch": 0.003828449908106097, "grad_norm": 0.11251705139875412, "kl": 0.07932459190487862, "learning_rate": 3e-06, "loss": -0.0119, "step": 1379 }, { "clip_ratio": 0.000354837131453678, "epoch": 0.003831226158945913, "grad_norm": 0.15868762135505676, "kl": 0.07149028778076172, "learning_rate": 3e-06, "loss": -0.0146, "step": 1380 }, { "clip_ratio": 0.00017755682347342372, "completion_length": 225.33333587646484, "epoch": 0.0038340024097857288, "grad_norm": 0.11194837838411331, "kl": 0.07465841248631477, "learning_rate": 3e-06, "loss": -0.0171, "reward": 0.28541669249534607, "reward_std": 0.31687821447849274, "rewards/countdown_reward_func": 0.2854166850447655, "step": 1381, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0005127792246639729, "epoch": 0.003836778660625545, "grad_norm": 0.1955379843711853, "kl": 0.07209612429141998, "learning_rate": 3e-06, "loss": -0.0169, "step": 1382 }, { "clip_ratio": 8.877841173671186e-05, "epoch": 0.003839554911465361, "grad_norm": 0.11438043415546417, "kl": 0.06967338174581528, "learning_rate": 3e-06, "loss": -0.0187, "step": 1383 }, { "clip_ratio": 0.00017607717745704576, "epoch": 0.0038423311623051767, "grad_norm": 0.11129593104124069, "kl": 0.07491909712553024, "learning_rate": 3e-06, "loss": -0.0179, "step": 1384 }, { "clip_ratio": 0.0, "epoch": 0.0038451074131449926, "grad_norm": 0.0992979034781456, "kl": 0.07310023903846741, "learning_rate": 3e-06, "loss": -0.0178, "step": 1385 }, { "clip_ratio": 0.0002843922993633896, "epoch": 0.0038478836639848084, "grad_norm": 0.12313024699687958, "kl": 0.0734870582818985, "learning_rate": 3e-06, "loss": -0.0174, "step": 1386 }, { "clip_ratio": 0.0005009273590985686, "epoch": 0.0038506599148246242, "grad_norm": 0.12103430181741714, "kl": 0.06940187513828278, "learning_rate": 3e-06, "loss": -0.0189, "step": 1387 }, { "clip_ratio": 0.0008541795250494033, "epoch": 0.00385343616566444, "grad_norm": 0.20041851699352264, "kl": 0.06606732122600079, "learning_rate": 3e-06, "loss": -0.0202, "step": 1388 }, { "clip_ratio": 0.00030615586729254574, "epoch": 0.003856212416504256, "grad_norm": 0.09863422811031342, "kl": 0.06485064327716827, "learning_rate": 3e-06, "loss": -0.0203, "step": 1389 }, { "clip_ratio": 0.0005214700649958104, "epoch": 0.0038589886673440718, "grad_norm": 0.11281978338956833, "kl": 0.07150644063949585, "learning_rate": 3e-06, "loss": -0.021, "step": 1390 }, { "clip_ratio": 0.0021090117224957794, "epoch": 0.0038617649181838876, "grad_norm": 0.09994053095579147, "kl": 0.07051489502191544, "learning_rate": 3e-06, "loss": -0.0197, "step": 1391 }, { "clip_ratio": 0.0022545086685568094, "epoch": 0.0038645411690237034, "grad_norm": 0.12088467925786972, "kl": 0.07186911255121231, "learning_rate": 3e-06, "loss": -0.0191, "step": 1392 }, { "clip_ratio": 0.0001826150546548888, "completion_length": 240.52083587646484, "epoch": 0.0038673174198635197, "grad_norm": 0.0785083919763565, "kl": 0.06684068590402603, "learning_rate": 3e-06, "loss": 0.0088, "reward": 0.21250000596046448, "reward_std": 0.2080453708767891, "rewards/countdown_reward_func": 0.21250000596046448, "step": 1393, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.000263084439211525, "epoch": 0.0038700936707033355, "grad_norm": 0.07907012850046158, "kl": 0.062989991158247, "learning_rate": 3e-06, "loss": 0.0086, "step": 1394 }, { "clip_ratio": 0.0002763880111160688, "epoch": 0.0038728699215431514, "grad_norm": 0.070924311876297, "kl": 0.07118378207087517, "learning_rate": 3e-06, "loss": 0.0093, "step": 1395 }, { "clip_ratio": 0.0004090342263225466, "epoch": 0.0038756461723829672, "grad_norm": 0.06483456492424011, "kl": 0.062327247112989426, "learning_rate": 3e-06, "loss": 0.0086, "step": 1396 }, { "clip_ratio": 0.0008118404075503349, "epoch": 0.003878422423222783, "grad_norm": 0.06826819479465485, "kl": 0.06643109023571014, "learning_rate": 3e-06, "loss": 0.009, "step": 1397 }, { "clip_ratio": 0.00035210512578487396, "epoch": 0.003881198674062599, "grad_norm": 0.07230556011199951, "kl": 0.0697788167744875, "learning_rate": 3e-06, "loss": 0.009, "step": 1398 }, { "clip_ratio": 0.0006263371469685808, "epoch": 0.0038839749249024147, "grad_norm": 0.08306799083948135, "kl": 0.06783925369381905, "learning_rate": 3e-06, "loss": 0.009, "step": 1399 }, { "clip_ratio": 0.00025315712264273316, "epoch": 0.0038867511757422306, "grad_norm": 0.08525695651769638, "kl": 0.06320414319634438, "learning_rate": 3e-06, "loss": 0.0081, "step": 1400 }, { "clip_ratio": 0.00045900307304691523, "epoch": 0.0038895274265820464, "grad_norm": 0.09048361331224442, "kl": 0.07014712691307068, "learning_rate": 3e-06, "loss": 0.0092, "step": 1401 }, { "clip_ratio": 0.0009449293720535934, "epoch": 0.0038923036774218623, "grad_norm": 0.06213811784982681, "kl": 0.06285431608557701, "learning_rate": 3e-06, "loss": 0.0082, "step": 1402 }, { "clip_ratio": 0.0002441406322759576, "epoch": 0.003895079928261678, "grad_norm": 0.061364803463220596, "kl": 0.0679735615849495, "learning_rate": 3e-06, "loss": 0.009, "step": 1403 }, { "clip_ratio": 0.0007923201483208686, "epoch": 0.0038978561791014944, "grad_norm": 0.07901846617460251, "kl": 0.06937158480286598, "learning_rate": 3e-06, "loss": 0.0082, "step": 1404 }, { "clip_ratio": 0.00017844396643340588, "completion_length": 237.95833587646484, "epoch": 0.00390063242994131, "grad_norm": 0.09061950445175171, "kl": 0.06311015225946903, "learning_rate": 3e-06, "loss": 0.0072, "reward": 0.2083333507180214, "reward_std": 0.21443458646535873, "rewards/countdown_reward_func": 0.2083333432674408, "step": 1405, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00017060219397535548, "epoch": 0.003903408680781126, "grad_norm": 0.06206053867936134, "kl": 0.055555472150444984, "learning_rate": 3e-06, "loss": 0.007, "step": 1406 }, { "clip_ratio": 9.266123379347846e-05, "epoch": 0.003906184931620942, "grad_norm": 0.10189791768789291, "kl": 0.05937515199184418, "learning_rate": 3e-06, "loss": 0.0071, "step": 1407 }, { "clip_ratio": 0.0004294630925869569, "epoch": 0.003908961182460757, "grad_norm": 0.07696424424648285, "kl": 0.060661833733320236, "learning_rate": 3e-06, "loss": 0.0079, "step": 1408 }, { "clip_ratio": 0.0, "epoch": 0.003911737433300574, "grad_norm": 0.0714978277683258, "kl": 0.06452694535255432, "learning_rate": 3e-06, "loss": 0.0071, "step": 1409 }, { "clip_ratio": 0.000443236087448895, "epoch": 0.00391451368414039, "grad_norm": 0.1540229171514511, "kl": 0.06495703011751175, "learning_rate": 3e-06, "loss": 0.007, "step": 1410 }, { "clip_ratio": 9.266123379347846e-05, "epoch": 0.003917289934980205, "grad_norm": 0.09125962853431702, "kl": 0.06371764466166496, "learning_rate": 3e-06, "loss": 0.0066, "step": 1411 }, { "clip_ratio": 0.0002598241771920584, "epoch": 0.0039200661858200215, "grad_norm": 0.0668187364935875, "kl": 0.05546574853360653, "learning_rate": 3e-06, "loss": 0.0062, "step": 1412 }, { "clip_ratio": 9.266123379347846e-05, "epoch": 0.003922842436659837, "grad_norm": 0.1012551337480545, "kl": 0.05864310637116432, "learning_rate": 3e-06, "loss": 0.0062, "step": 1413 }, { "clip_ratio": 9.266123379347846e-05, "epoch": 0.003925618687499653, "grad_norm": 0.07103273272514343, "kl": 0.06015484221279621, "learning_rate": 3e-06, "loss": 0.0071, "step": 1414 }, { "clip_ratio": 0.0005025817663408816, "epoch": 0.003928394938339469, "grad_norm": 0.06943207234144211, "kl": 0.06314868479967117, "learning_rate": 3e-06, "loss": 0.0062, "step": 1415 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.003931171189179285, "grad_norm": 0.10475575923919678, "kl": 0.06454218924045563, "learning_rate": 3e-06, "loss": 0.0056, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 236.2291717529297, "epoch": 0.0039339474400191, "grad_norm": 0.0840207114815712, "kl": 0.06996531412005424, "learning_rate": 3e-06, "loss": -0.0023, "reward": 0.24583332985639572, "reward_std": 0.3068140149116516, "rewards/countdown_reward_func": 0.24583332985639572, "step": 1417, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.000174452448845841, "epoch": 0.0039367236908589166, "grad_norm": 0.09196259081363678, "kl": 0.08129844069480896, "learning_rate": 3e-06, "loss": -0.0026, "step": 1418 }, { "clip_ratio": 0.0006801302515668795, "epoch": 0.003939499941698732, "grad_norm": 0.08758815377950668, "kl": 0.0710601955652237, "learning_rate": 3e-06, "loss": -0.0027, "step": 1419 }, { "clip_ratio": 0.0006429070708691142, "epoch": 0.003942276192538548, "grad_norm": 0.07932908087968826, "kl": 0.06780187785625458, "learning_rate": 3e-06, "loss": -0.0034, "step": 1420 }, { "clip_ratio": 0.00027745863917516544, "epoch": 0.0039450524433783645, "grad_norm": 0.10550674796104431, "kl": 0.0699809230864048, "learning_rate": 3e-06, "loss": -0.003, "step": 1421 }, { "clip_ratio": 0.0, "epoch": 0.00394782869421818, "grad_norm": 0.1297944188117981, "kl": 0.0661407969892025, "learning_rate": 3e-06, "loss": -0.0031, "step": 1422 }, { "clip_ratio": 0.00017994409427046776, "epoch": 0.003950604945057996, "grad_norm": 0.08687513321638107, "kl": 0.06938713788986206, "learning_rate": 3e-06, "loss": -0.0033, "step": 1423 }, { "clip_ratio": 8.185985643649474e-05, "epoch": 0.003953381195897812, "grad_norm": 0.09864569455385208, "kl": 0.08142375946044922, "learning_rate": 3e-06, "loss": -0.0026, "step": 1424 }, { "clip_ratio": 0.00047662161523476243, "epoch": 0.003956157446737628, "grad_norm": 0.10956618189811707, "kl": 0.07189195230603218, "learning_rate": 3e-06, "loss": -0.0031, "step": 1425 }, { "clip_ratio": 0.000174703003722243, "epoch": 0.003958933697577443, "grad_norm": 0.08014890551567078, "kl": 0.0699024386703968, "learning_rate": 3e-06, "loss": -0.0037, "step": 1426 }, { "clip_ratio": 0.0002667709268280305, "epoch": 0.0039617099484172595, "grad_norm": 0.10431994497776031, "kl": 0.07111196964979172, "learning_rate": 3e-06, "loss": -0.0033, "step": 1427 }, { "clip_ratio": 8.73515018611215e-05, "epoch": 0.003964486199257075, "grad_norm": 0.13337118923664093, "kl": 0.06857022643089294, "learning_rate": 3e-06, "loss": -0.0047, "step": 1428 }, { "clip_ratio": 0.00010064412344945595, "completion_length": 244.58334350585938, "epoch": 0.003967262450096891, "grad_norm": 0.07497075945138931, "kl": 0.058519456535577774, "learning_rate": 3e-06, "loss": -0.0042, "reward": 0.22708335518836975, "reward_std": 0.25706911087036133, "rewards/countdown_reward_func": 0.22708334028720856, "step": 1429, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003266488783992827, "epoch": 0.003970038700936707, "grad_norm": 0.0831906795501709, "kl": 0.06262272223830223, "learning_rate": 3e-06, "loss": -0.0035, "step": 1430 }, { "clip_ratio": 0.000244140625, "epoch": 0.003972814951776523, "grad_norm": 0.13976958394050598, "kl": 0.05951959639787674, "learning_rate": 3e-06, "loss": -0.0041, "step": 1431 }, { "clip_ratio": 0.0022837779542896897, "epoch": 0.003975591202616339, "grad_norm": 0.09820695966482162, "kl": 0.059285424649715424, "learning_rate": 3e-06, "loss": -0.0036, "step": 1432 }, { "clip_ratio": 0.0, "epoch": 0.003978367453456155, "grad_norm": 0.08077849447727203, "kl": 0.05986793525516987, "learning_rate": 3e-06, "loss": -0.0047, "step": 1433 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.003981143704295971, "grad_norm": 0.091177798807621, "kl": 0.06356241181492805, "learning_rate": 3e-06, "loss": -0.0043, "step": 1434 }, { "clip_ratio": 0.0005404362455010414, "epoch": 0.003983919955135786, "grad_norm": 0.08302978426218033, "kl": 0.06063215062022209, "learning_rate": 3e-06, "loss": -0.0047, "step": 1435 }, { "clip_ratio": 0.0004089395733899437, "epoch": 0.0039866962059756025, "grad_norm": 0.08915877342224121, "kl": 0.0654546320438385, "learning_rate": 3e-06, "loss": -0.0043, "step": 1436 }, { "clip_ratio": 0.000328657595673576, "epoch": 0.003989472456815418, "grad_norm": 0.1467614471912384, "kl": 0.062131211161613464, "learning_rate": 3e-06, "loss": -0.0064, "step": 1437 }, { "clip_ratio": 0.0012416826793923974, "epoch": 0.003992248707655234, "grad_norm": 0.11402739584445953, "kl": 0.06069890968501568, "learning_rate": 3e-06, "loss": -0.0043, "step": 1438 }, { "clip_ratio": 0.00041950895683839917, "epoch": 0.00399502495849505, "grad_norm": 0.08940764516592026, "kl": 0.06148812361061573, "learning_rate": 3e-06, "loss": -0.0055, "step": 1439 }, { "clip_ratio": 0.0012557760928757489, "epoch": 0.003997801209334866, "grad_norm": 0.08653771132230759, "kl": 0.06552222743630409, "learning_rate": 3e-06, "loss": -0.0057, "step": 1440 }, { "clip_ratio": 0.00010434056457597762, "completion_length": 232.9791717529297, "epoch": 0.004000577460174681, "grad_norm": 0.11438572406768799, "kl": 0.07905929908156395, "learning_rate": 3e-06, "loss": 0.0201, "reward": 0.4375, "reward_std": 0.3626646399497986, "rewards/countdown_reward_func": 0.4375, "step": 1441, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.004003353711014498, "grad_norm": 0.13407845795154572, "kl": 0.07975272089242935, "learning_rate": 3e-06, "loss": 0.0212, "step": 1442 }, { "clip_ratio": 0.0002699867181945592, "epoch": 0.004006129961854314, "grad_norm": 0.18041589856147766, "kl": 0.08294541388750076, "learning_rate": 3e-06, "loss": 0.0206, "step": 1443 }, { "clip_ratio": 0.00037823428283445537, "epoch": 0.004008906212694129, "grad_norm": 0.10812884569168091, "kl": 0.07889501005411148, "learning_rate": 3e-06, "loss": 0.0205, "step": 1444 }, { "clip_ratio": 0.00018572076805867255, "epoch": 0.0040116824635339455, "grad_norm": 0.10284543037414551, "kl": 0.07939217984676361, "learning_rate": 3e-06, "loss": 0.0199, "step": 1445 }, { "clip_ratio": 0.000244140625, "epoch": 0.004014458714373761, "grad_norm": 0.1508864164352417, "kl": 0.07976921647787094, "learning_rate": 3e-06, "loss": 0.02, "step": 1446 }, { "clip_ratio": 0.0001915093234856613, "epoch": 0.004017234965213577, "grad_norm": 0.11611857265233994, "kl": 0.08126603439450264, "learning_rate": 3e-06, "loss": 0.0194, "step": 1447 }, { "clip_ratio": 0.00018572077533463016, "epoch": 0.004020011216053393, "grad_norm": 0.1259177178144455, "kl": 0.08240155875682831, "learning_rate": 3e-06, "loss": 0.0193, "step": 1448 }, { "clip_ratio": 0.0006606255192309618, "epoch": 0.004022787466893209, "grad_norm": 0.15441936254501343, "kl": 0.08467500656843185, "learning_rate": 3e-06, "loss": 0.0179, "step": 1449 }, { "clip_ratio": 0.00018383923452347517, "epoch": 0.004025563717733024, "grad_norm": 0.11314117908477783, "kl": 0.0805022120475769, "learning_rate": 3e-06, "loss": 0.0187, "step": 1450 }, { "clip_ratio": 0.0006165863596834242, "epoch": 0.0040283399685728406, "grad_norm": 0.11447932571172714, "kl": 0.08336110040545464, "learning_rate": 3e-06, "loss": 0.0186, "step": 1451 }, { "clip_ratio": 0.0005323204240994528, "epoch": 0.004031116219412656, "grad_norm": 0.15156546235084534, "kl": 0.0839230939745903, "learning_rate": 3e-06, "loss": 0.0175, "step": 1452 }, { "clip_ratio": 8.278145833173767e-05, "completion_length": 235.14583587646484, "epoch": 0.004033892470252472, "grad_norm": 0.1193864643573761, "kl": 0.08672641590237617, "learning_rate": 3e-06, "loss": 0.0317, "reward": 0.35625001788139343, "reward_std": 0.3211328834295273, "rewards/countdown_reward_func": 0.35625001788139343, "step": 1453, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0040366687210922885, "grad_norm": 0.12705117464065552, "kl": 0.08588536828756332, "learning_rate": 3e-06, "loss": 0.0305, "step": 1454 }, { "clip_ratio": 0.0, "epoch": 0.004039444971932104, "grad_norm": 0.10421835631132126, "kl": 0.08824677020311356, "learning_rate": 3e-06, "loss": 0.0301, "step": 1455 }, { "clip_ratio": 0.0, "epoch": 0.00404222122277192, "grad_norm": 0.12364216893911362, "kl": 0.08488886058330536, "learning_rate": 3e-06, "loss": 0.0296, "step": 1456 }, { "clip_ratio": 9.170946577796713e-05, "epoch": 0.004044997473611736, "grad_norm": 0.11285992711782455, "kl": 0.08128321543335915, "learning_rate": 3e-06, "loss": 0.0301, "step": 1457 }, { "clip_ratio": 9.137426968663931e-05, "epoch": 0.004047773724451552, "grad_norm": 0.09293954819440842, "kl": 0.0865498036146164, "learning_rate": 3e-06, "loss": 0.0286, "step": 1458 }, { "clip_ratio": 0.00026412875740788877, "epoch": 0.004050549975291367, "grad_norm": 0.11421343684196472, "kl": 0.09296088293194771, "learning_rate": 3e-06, "loss": 0.0293, "step": 1459 }, { "clip_ratio": 0.0, "epoch": 0.0040533262261311835, "grad_norm": 0.11324277520179749, "kl": 0.09277210384607315, "learning_rate": 3e-06, "loss": 0.0278, "step": 1460 }, { "clip_ratio": 0.0001641616690903902, "epoch": 0.004056102476970999, "grad_norm": 0.11188522726297379, "kl": 0.09390882402658463, "learning_rate": 3e-06, "loss": 0.0275, "step": 1461 }, { "clip_ratio": 0.0001641616690903902, "epoch": 0.004058878727810815, "grad_norm": 0.11014936119318008, "kl": 0.09254594147205353, "learning_rate": 3e-06, "loss": 0.0265, "step": 1462 }, { "clip_ratio": 0.0001744909241097048, "epoch": 0.004061654978650631, "grad_norm": 0.11003927886486053, "kl": 0.08989466726779938, "learning_rate": 3e-06, "loss": 0.0281, "step": 1463 }, { "clip_ratio": 8.532423089491203e-05, "epoch": 0.004064431229490447, "grad_norm": 0.10190403461456299, "kl": 0.09649864211678505, "learning_rate": 3e-06, "loss": 0.026, "step": 1464 }, { "clip_ratio": 0.00018395879305899143, "completion_length": 239.625, "epoch": 0.004067207480330263, "grad_norm": 0.09303663671016693, "kl": 0.09736296534538269, "learning_rate": 3e-06, "loss": 0.0134, "reward": 0.30625003576278687, "reward_std": 0.2696641683578491, "rewards/countdown_reward_func": 0.30625003576278687, "step": 1465, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.211496217176318e-05, "epoch": 0.004069983731170079, "grad_norm": 0.1130358949303627, "kl": 0.08961380645632744, "learning_rate": 3e-06, "loss": 0.0126, "step": 1466 }, { "clip_ratio": 0.00017335961456410587, "epoch": 0.004072759982009895, "grad_norm": 0.0847916454076767, "kl": 0.10012925043702126, "learning_rate": 3e-06, "loss": 0.013, "step": 1467 }, { "clip_ratio": 0.0, "epoch": 0.00407553623284971, "grad_norm": 0.09342540800571442, "kl": 0.09443873539566994, "learning_rate": 3e-06, "loss": 0.0127, "step": 1468 }, { "clip_ratio": 0.000447636324679479, "epoch": 0.0040783124836895265, "grad_norm": 0.10130415111780167, "kl": 0.09730666130781174, "learning_rate": 3e-06, "loss": 0.0124, "step": 1469 }, { "clip_ratio": 8.60289073898457e-05, "epoch": 0.004081088734529342, "grad_norm": 0.09560680389404297, "kl": 0.10042464733123779, "learning_rate": 3e-06, "loss": 0.0124, "step": 1470 }, { "clip_ratio": 0.00025938851467799395, "epoch": 0.004083864985369158, "grad_norm": 0.1030803993344307, "kl": 0.10246840491890907, "learning_rate": 3e-06, "loss": 0.0117, "step": 1471 }, { "clip_ratio": 0.00017924292478710413, "epoch": 0.004086641236208974, "grad_norm": 0.10079611092805862, "kl": 0.09707974642515182, "learning_rate": 3e-06, "loss": 0.0117, "step": 1472 }, { "clip_ratio": 0.0, "epoch": 0.00408941748704879, "grad_norm": 0.0831371396780014, "kl": 0.10519356653094292, "learning_rate": 3e-06, "loss": 0.0114, "step": 1473 }, { "clip_ratio": 0.0, "epoch": 0.004092193737888605, "grad_norm": 0.1005701944231987, "kl": 0.09982095658779144, "learning_rate": 3e-06, "loss": 0.0109, "step": 1474 }, { "clip_ratio": 0.0009021220030263066, "epoch": 0.004094969988728422, "grad_norm": 0.08770573884248734, "kl": 0.10152734071016312, "learning_rate": 3e-06, "loss": 0.0107, "step": 1475 }, { "clip_ratio": 0.0006180326745379716, "epoch": 0.004097746239568238, "grad_norm": 0.10259686410427094, "kl": 0.10511034727096558, "learning_rate": 3e-06, "loss": 0.0115, "step": 1476 }, { "clip_ratio": 8.747375977691263e-05, "completion_length": 225.6875, "epoch": 0.004100522490408053, "grad_norm": 0.11141236126422882, "kl": 0.10264391079545021, "learning_rate": 3e-06, "loss": 0.0114, "reward": 0.26875001937150955, "reward_std": 0.2696641534566879, "rewards/countdown_reward_func": 0.26875001937150955, "step": 1477, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0041032987412478695, "grad_norm": 0.10607676953077316, "kl": 0.10159710422158241, "learning_rate": 3e-06, "loss": 0.0115, "step": 1478 }, { "clip_ratio": 0.00045908106403658167, "epoch": 0.004106074992087685, "grad_norm": 0.12285490334033966, "kl": 0.09724065288901329, "learning_rate": 3e-06, "loss": 0.0121, "step": 1479 }, { "clip_ratio": 8.915834769140929e-05, "epoch": 0.004108851242927501, "grad_norm": 0.10112053155899048, "kl": 0.09738883748650551, "learning_rate": 3e-06, "loss": 0.0111, "step": 1480 }, { "clip_ratio": 0.000543587921129074, "epoch": 0.004111627493767317, "grad_norm": 0.11109442263841629, "kl": 0.10089835524559021, "learning_rate": 3e-06, "loss": 0.0117, "step": 1481 }, { "clip_ratio": 8.7596352386754e-05, "epoch": 0.004114403744607133, "grad_norm": 0.09819518029689789, "kl": 0.10074819624423981, "learning_rate": 3e-06, "loss": 0.0116, "step": 1482 }, { "clip_ratio": 0.00010557432688074186, "epoch": 0.004117179995446948, "grad_norm": 0.10844285786151886, "kl": 0.1009376123547554, "learning_rate": 3e-06, "loss": 0.0101, "step": 1483 }, { "clip_ratio": 0.0, "epoch": 0.0041199562462867646, "grad_norm": 0.1014895886182785, "kl": 0.1020418331027031, "learning_rate": 3e-06, "loss": 0.0103, "step": 1484 }, { "clip_ratio": 0.0003863678648485802, "epoch": 0.00412273249712658, "grad_norm": 0.11585193872451782, "kl": 0.09769553691148758, "learning_rate": 3e-06, "loss": 0.0107, "step": 1485 }, { "clip_ratio": 0.0004875522281508893, "epoch": 0.004125508747966396, "grad_norm": 0.09965824335813522, "kl": 0.09749152138829231, "learning_rate": 3e-06, "loss": 0.0099, "step": 1486 }, { "clip_ratio": 0.0005232414114288986, "epoch": 0.0041282849988062125, "grad_norm": 0.11992790549993515, "kl": 0.09825573861598969, "learning_rate": 3e-06, "loss": 0.0098, "step": 1487 }, { "clip_ratio": 0.00016897656314540654, "epoch": 0.004131061249646028, "grad_norm": 0.09515371173620224, "kl": 0.09882111474871635, "learning_rate": 3e-06, "loss": 0.0099, "step": 1488 }, { "clip_ratio": 9.170946577796713e-05, "completion_length": 212.7916717529297, "epoch": 0.004133837500485844, "grad_norm": 0.06438077986240387, "kl": 0.1108875423669815, "learning_rate": 3e-06, "loss": 0.005, "reward": 0.16874999925494194, "reward_std": 0.11910479329526424, "rewards/countdown_reward_func": 0.16874999925494194, "step": 1489, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0001817244410631247, "epoch": 0.00413661375132566, "grad_norm": 0.07629207521677017, "kl": 0.1037943847477436, "learning_rate": 3e-06, "loss": 0.0059, "step": 1490 }, { "clip_ratio": 9.170946577796713e-05, "epoch": 0.004139390002165476, "grad_norm": 0.07570668309926987, "kl": 0.09897951036691666, "learning_rate": 3e-06, "loss": 0.0049, "step": 1491 }, { "clip_ratio": 0.0, "epoch": 0.004142166253005291, "grad_norm": 0.06962047517299652, "kl": 0.1040000282227993, "learning_rate": 3e-06, "loss": 0.0055, "step": 1492 }, { "clip_ratio": 0.0, "epoch": 0.0041449425038451075, "grad_norm": 0.07831595093011856, "kl": 0.09726639091968536, "learning_rate": 3e-06, "loss": 0.0045, "step": 1493 }, { "clip_ratio": 0.0001838923490140587, "epoch": 0.004147718754684923, "grad_norm": 0.06859088689088821, "kl": 0.10122519358992577, "learning_rate": 3e-06, "loss": 0.0049, "step": 1494 }, { "clip_ratio": 0.0007797929574735463, "epoch": 0.004150495005524739, "grad_norm": 0.07352989166975021, "kl": 0.10665607079863548, "learning_rate": 3e-06, "loss": 0.0048, "step": 1495 }, { "clip_ratio": 8.954155055107549e-05, "epoch": 0.004153271256364555, "grad_norm": 0.0793951153755188, "kl": 0.09884294867515564, "learning_rate": 3e-06, "loss": 0.0047, "step": 1496 }, { "clip_ratio": 0.0, "epoch": 0.004156047507204371, "grad_norm": 0.07391367107629776, "kl": 0.09311042353510857, "learning_rate": 3e-06, "loss": 0.0046, "step": 1497 }, { "clip_ratio": 0.0002756018075160682, "epoch": 0.004158823758044187, "grad_norm": 0.06575078517198563, "kl": 0.09845591336488724, "learning_rate": 3e-06, "loss": 0.0047, "step": 1498 }, { "clip_ratio": 0.00036945813917554915, "epoch": 0.004161600008884003, "grad_norm": 0.08721313625574112, "kl": 0.09197269007563591, "learning_rate": 3e-06, "loss": 0.0037, "step": 1499 }, { "clip_ratio": 0.0002712659916142002, "epoch": 0.004164376259723819, "grad_norm": 0.07341030240058899, "kl": 0.09440727904438972, "learning_rate": 3e-06, "loss": 0.0046, "step": 1500 }, { "clip_ratio": 0.00010064412344945595, "completion_length": 220.5416717529297, "epoch": 0.004167152510563634, "grad_norm": 0.11328040063381195, "kl": 0.09348515421152115, "learning_rate": 3e-06, "loss": 0.0053, "reward": 0.35625001788139343, "reward_std": 0.31346985697746277, "rewards/countdown_reward_func": 0.35625001788139343, "step": 1501, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0041699287614034505, "grad_norm": 0.11099179089069366, "kl": 0.09527323395013809, "learning_rate": 3e-06, "loss": 0.0064, "step": 1502 }, { "clip_ratio": 0.00023474179033655673, "epoch": 0.004172705012243266, "grad_norm": 0.10442166030406952, "kl": 0.08588381856679916, "learning_rate": 3e-06, "loss": 0.0054, "step": 1503 }, { "clip_ratio": 8.992805669549853e-05, "epoch": 0.004175481263083082, "grad_norm": 0.13162118196487427, "kl": 0.08620287477970123, "learning_rate": 3e-06, "loss": 0.005, "step": 1504 }, { "clip_ratio": 0.000326107838191092, "epoch": 0.004178257513922898, "grad_norm": 0.11255912482738495, "kl": 0.08432386443018913, "learning_rate": 3e-06, "loss": 0.0055, "step": 1505 }, { "clip_ratio": 0.0004599874228006229, "epoch": 0.004181033764762714, "grad_norm": 0.11156965047121048, "kl": 0.08439022302627563, "learning_rate": 3e-06, "loss": 0.0056, "step": 1506 }, { "clip_ratio": 9.750390017870814e-05, "epoch": 0.004183810015602529, "grad_norm": 0.11413870751857758, "kl": 0.08592484891414642, "learning_rate": 3e-06, "loss": 0.0051, "step": 1507 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004186586266442346, "grad_norm": 0.12828348577022552, "kl": 0.08880098164081573, "learning_rate": 3e-06, "loss": 0.0044, "step": 1508 }, { "clip_ratio": 0.0002807183118420653, "epoch": 0.004189362517282162, "grad_norm": 0.10022372007369995, "kl": 0.07961970940232277, "learning_rate": 3e-06, "loss": 0.0039, "step": 1509 }, { "clip_ratio": 0.0004508365091169253, "epoch": 0.004192138768121977, "grad_norm": 0.1420833021402359, "kl": 0.08037307858467102, "learning_rate": 3e-06, "loss": 0.0048, "step": 1510 }, { "clip_ratio": 0.0011079075193265453, "epoch": 0.0041949150189617935, "grad_norm": 0.11328054964542389, "kl": 0.07894135639071465, "learning_rate": 3e-06, "loss": 0.0045, "step": 1511 }, { "clip_ratio": 0.0002834866172634065, "epoch": 0.004197691269801609, "grad_norm": 0.12972882390022278, "kl": 0.07961436733603477, "learning_rate": 3e-06, "loss": 0.004, "step": 1512 }, { "clip_ratio": 0.0002615062694530934, "completion_length": 231.58333587646484, "epoch": 0.004200467520641425, "grad_norm": 0.09790017455816269, "kl": 0.08236554637551308, "learning_rate": 3e-06, "loss": 0.0149, "reward": 0.229166679084301, "reward_std": 0.19716466218233109, "rewards/countdown_reward_func": 0.229166679084301, "step": 1513, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00018543919577496126, "epoch": 0.004203243771481241, "grad_norm": 0.07614653557538986, "kl": 0.09001684933900833, "learning_rate": 3e-06, "loss": 0.0156, "step": 1514 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.004206020022321057, "grad_norm": 0.08829134702682495, "kl": 0.0819174312055111, "learning_rate": 3e-06, "loss": 0.0153, "step": 1515 }, { "clip_ratio": 0.0001734273391775787, "epoch": 0.004208796273160872, "grad_norm": 0.05850941315293312, "kl": 0.0837816521525383, "learning_rate": 3e-06, "loss": 0.0154, "step": 1516 }, { "clip_ratio": 0.00029112402262398973, "epoch": 0.0042115725240006886, "grad_norm": 0.07008534669876099, "kl": 0.08121475949883461, "learning_rate": 3e-06, "loss": 0.0152, "step": 1517 }, { "clip_ratio": 0.0001695139508228749, "epoch": 0.004214348774840504, "grad_norm": 0.07618328183889389, "kl": 0.08276839926838875, "learning_rate": 3e-06, "loss": 0.0155, "step": 1518 }, { "clip_ratio": 0.00034776486427290365, "epoch": 0.00421712502568032, "grad_norm": 0.10261756926774979, "kl": 0.082290880382061, "learning_rate": 3e-06, "loss": 0.0155, "step": 1519 }, { "clip_ratio": 0.00016469038382638246, "epoch": 0.0042199012765201365, "grad_norm": 0.06574001908302307, "kl": 0.09078622981905937, "learning_rate": 3e-06, "loss": 0.0156, "step": 1520 }, { "clip_ratio": 0.0, "epoch": 0.004222677527359952, "grad_norm": 0.08486269414424896, "kl": 0.08278439193964005, "learning_rate": 3e-06, "loss": 0.0151, "step": 1521 }, { "clip_ratio": 0.0001796506403479725, "epoch": 0.004225453778199768, "grad_norm": 0.05522237718105316, "kl": 0.08544733002781868, "learning_rate": 3e-06, "loss": 0.0154, "step": 1522 }, { "clip_ratio": 0.00026453185273567215, "epoch": 0.004228230029039584, "grad_norm": 0.08137688040733337, "kl": 0.08243484795093536, "learning_rate": 3e-06, "loss": 0.0151, "step": 1523 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0042310062798794, "grad_norm": 0.06071337312459946, "kl": 0.08487970381975174, "learning_rate": 3e-06, "loss": 0.0142, "step": 1524 }, { "clip_ratio": 0.00026819700724445283, "completion_length": 232.87500762939453, "epoch": 0.004233782530719215, "grad_norm": 0.066123828291893, "kl": 0.08086292445659637, "learning_rate": 3e-06, "loss": -0.0006, "reward": 0.17083334922790527, "reward_std": 0.11393594369292259, "rewards/countdown_reward_func": 0.17083334922790527, "step": 1525, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00016953254089457914, "epoch": 0.0042365587815590315, "grad_norm": 0.19338181614875793, "kl": 0.08114173635840416, "learning_rate": 3e-06, "loss": -0.0, "step": 1526 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004239335032398847, "grad_norm": 0.08377951383590698, "kl": 0.07633576914668083, "learning_rate": 3e-06, "loss": -0.0006, "step": 1527 }, { "clip_ratio": 0.0001913186424644664, "epoch": 0.004242111283238663, "grad_norm": 0.06688714772462845, "kl": 0.0815008170902729, "learning_rate": 3e-06, "loss": -0.0004, "step": 1528 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004244887534078479, "grad_norm": 0.05774759501218796, "kl": 0.07802558317780495, "learning_rate": 3e-06, "loss": -0.0003, "step": 1529 }, { "clip_ratio": 0.0004640174884116277, "epoch": 0.004247663784918295, "grad_norm": 0.09573258459568024, "kl": 0.07392871007323265, "learning_rate": 3e-06, "loss": -0.001, "step": 1530 }, { "clip_ratio": 0.00025327454204671085, "epoch": 0.004250440035758111, "grad_norm": 0.07546308636665344, "kl": 0.07837391272187233, "learning_rate": 3e-06, "loss": -0.0008, "step": 1531 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004253216286597927, "grad_norm": 0.21055151522159576, "kl": 0.07768003270030022, "learning_rate": 3e-06, "loss": -0.0019, "step": 1532 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004255992537437743, "grad_norm": 0.0746055543422699, "kl": 0.07167375087738037, "learning_rate": 3e-06, "loss": -0.0011, "step": 1533 }, { "clip_ratio": 0.0003103909839410335, "epoch": 0.004258768788277558, "grad_norm": 0.07092703133821487, "kl": 0.076376061886549, "learning_rate": 3e-06, "loss": -0.0013, "step": 1534 }, { "clip_ratio": 0.0007744758331682533, "epoch": 0.0042615450391173745, "grad_norm": 0.058808572590351105, "kl": 0.07276593148708344, "learning_rate": 3e-06, "loss": -0.0009, "step": 1535 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00426432128995719, "grad_norm": 0.07845474034547806, "kl": 0.06798744946718216, "learning_rate": 3e-06, "loss": -0.0021, "step": 1536 }, { "clip_ratio": 0.0003383140719961375, "completion_length": 234.5, "epoch": 0.004267097540797006, "grad_norm": 0.11504442989826202, "kl": 0.06899916008114815, "learning_rate": 3e-06, "loss": 0.0036, "reward": 0.2854166775941849, "reward_std": 0.2782912850379944, "rewards/countdown_reward_func": 0.2854166552424431, "step": 1537, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00025050100521184504, "epoch": 0.004269873791636822, "grad_norm": 0.08852005749940872, "kl": 0.06766302511096, "learning_rate": 3e-06, "loss": 0.0039, "step": 1538 }, { "clip_ratio": 0.0, "epoch": 0.004272650042476638, "grad_norm": 0.08253919333219528, "kl": 0.06684885919094086, "learning_rate": 3e-06, "loss": 0.0035, "step": 1539 }, { "clip_ratio": 0.0003363504074513912, "epoch": 0.004275426293316453, "grad_norm": 0.09681417047977448, "kl": 0.062055185437202454, "learning_rate": 3e-06, "loss": 0.0033, "step": 1540 }, { "clip_ratio": 0.0004321035448811017, "epoch": 0.00427820254415627, "grad_norm": 0.09291265904903412, "kl": 0.06612430512905121, "learning_rate": 3e-06, "loss": 0.0039, "step": 1541 }, { "clip_ratio": 8.520791016053408e-05, "epoch": 0.004280978794996086, "grad_norm": 0.09506966918706894, "kl": 0.06933099403977394, "learning_rate": 3e-06, "loss": 0.0042, "step": 1542 }, { "clip_ratio": 0.0, "epoch": 0.004283755045835901, "grad_norm": 0.11389172077178955, "kl": 0.0656692385673523, "learning_rate": 3e-06, "loss": 0.0028, "step": 1543 }, { "clip_ratio": 0.00033449956390541047, "epoch": 0.0042865312966757175, "grad_norm": 0.08597692847251892, "kl": 0.0642948318272829, "learning_rate": 3e-06, "loss": 0.0026, "step": 1544 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004289307547515533, "grad_norm": 0.09447018057107925, "kl": 0.06415480002760887, "learning_rate": 3e-06, "loss": 0.0023, "step": 1545 }, { "clip_ratio": 0.0008052970515564084, "epoch": 0.004292083798355349, "grad_norm": 0.08938866853713989, "kl": 0.06041167676448822, "learning_rate": 3e-06, "loss": 0.0018, "step": 1546 }, { "clip_ratio": 0.00062086500111036, "epoch": 0.004294860049195165, "grad_norm": 0.5281029939651489, "kl": 0.06464917957782745, "learning_rate": 3e-06, "loss": 0.0026, "step": 1547 }, { "clip_ratio": 0.0005138094347785227, "epoch": 0.004297636300034981, "grad_norm": 0.08887960761785507, "kl": 0.06742081791162491, "learning_rate": 3e-06, "loss": 0.0035, "step": 1548 }, { "clip_ratio": 0.0002517917600926012, "completion_length": 237.14584350585938, "epoch": 0.004300412550874796, "grad_norm": 0.10000678896903992, "kl": 0.05254969373345375, "learning_rate": 3e-06, "loss": 0.0067, "reward": 0.2666666954755783, "reward_std": 0.2623074799776077, "rewards/countdown_reward_func": 0.2666666880249977, "step": 1549, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003343000134918839, "epoch": 0.0043031888017146126, "grad_norm": 0.09186454117298126, "kl": 0.056838370859622955, "learning_rate": 3e-06, "loss": 0.0056, "step": 1550 }, { "clip_ratio": 0.0007483240551664494, "epoch": 0.004305965052554429, "grad_norm": 0.09988920390605927, "kl": 0.05866162106394768, "learning_rate": 3e-06, "loss": 0.0075, "step": 1551 }, { "clip_ratio": 0.00024526867491658777, "epoch": 0.004308741303394244, "grad_norm": 0.07087462395429611, "kl": 0.055921848863363266, "learning_rate": 3e-06, "loss": 0.0053, "step": 1552 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0043115175542340605, "grad_norm": 0.07110070437192917, "kl": 0.05466669611632824, "learning_rate": 3e-06, "loss": 0.0062, "step": 1553 }, { "clip_ratio": 0.00032777692831587046, "epoch": 0.004314293805073876, "grad_norm": 0.09251809120178223, "kl": 0.05664430558681488, "learning_rate": 3e-06, "loss": 0.0071, "step": 1554 }, { "clip_ratio": 0.00032552083575865254, "epoch": 0.004317070055913692, "grad_norm": 0.09662747383117676, "kl": 0.05329587869346142, "learning_rate": 3e-06, "loss": 0.0064, "step": 1555 }, { "clip_ratio": 0.00032552083575865254, "epoch": 0.004319846306753508, "grad_norm": 0.10306031256914139, "kl": 0.056288715451955795, "learning_rate": 3e-06, "loss": 0.0048, "step": 1556 }, { "clip_ratio": 0.0003680418885778636, "epoch": 0.004322622557593324, "grad_norm": 0.08361591398715973, "kl": 0.06024627387523651, "learning_rate": 3e-06, "loss": 0.0068, "step": 1557 }, { "clip_ratio": 0.00032552083575865254, "epoch": 0.004325398808433139, "grad_norm": 0.06961996108293533, "kl": 0.05571894347667694, "learning_rate": 3e-06, "loss": 0.0052, "step": 1558 }, { "clip_ratio": 9.051412052940577e-05, "epoch": 0.0043281750592729555, "grad_norm": 0.20629696547985077, "kl": 0.05542530678212643, "learning_rate": 3e-06, "loss": 0.0058, "step": 1559 }, { "clip_ratio": 0.0005872198089491576, "epoch": 0.004330951310112771, "grad_norm": 0.09624534100294113, "kl": 0.05719461478292942, "learning_rate": 3e-06, "loss": 0.0056, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 233.52084350585938, "epoch": 0.004333727560952587, "grad_norm": 0.07015831768512726, "kl": 0.06173841841518879, "learning_rate": 3e-06, "loss": -0.0015, "reward": 0.2666666954755783, "reward_std": 0.22149831801652908, "rewards/countdown_reward_func": 0.2666666805744171, "step": 1561, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00018502863531466573, "epoch": 0.0043365038117924035, "grad_norm": 0.07204274088144302, "kl": 0.0598264392465353, "learning_rate": 3e-06, "loss": -0.0018, "step": 1562 }, { "clip_ratio": 0.00016556291666347533, "epoch": 0.004339280062632219, "grad_norm": 0.08235902339220047, "kl": 0.06111270561814308, "learning_rate": 3e-06, "loss": -0.0016, "step": 1563 }, { "clip_ratio": 0.000179314476554282, "epoch": 0.004342056313472035, "grad_norm": 0.08542285859584808, "kl": 0.06232334300875664, "learning_rate": 3e-06, "loss": -0.002, "step": 1564 }, { "clip_ratio": 0.0001975396226043813, "epoch": 0.004344832564311851, "grad_norm": 0.08785772323608398, "kl": 0.06148502230644226, "learning_rate": 3e-06, "loss": -0.0023, "step": 1565 }, { "clip_ratio": 0.0001065643664333038, "epoch": 0.004347608815151667, "grad_norm": 0.06917145103216171, "kl": 0.06312652491033077, "learning_rate": 3e-06, "loss": -0.0017, "step": 1566 }, { "clip_ratio": 0.0, "epoch": 0.004350385065991482, "grad_norm": 0.07437288761138916, "kl": 0.0617559514939785, "learning_rate": 3e-06, "loss": -0.0019, "step": 1567 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0043531613168312985, "grad_norm": 0.07470710575580597, "kl": 0.0611141100525856, "learning_rate": 3e-06, "loss": -0.0026, "step": 1568 }, { "clip_ratio": 0.00016971942386589944, "epoch": 0.004355937567671114, "grad_norm": 0.10094926506280899, "kl": 0.06114407069981098, "learning_rate": 3e-06, "loss": -0.0028, "step": 1569 }, { "clip_ratio": 0.0, "epoch": 0.00435871381851093, "grad_norm": 0.08059901744127274, "kl": 0.06223343499004841, "learning_rate": 3e-06, "loss": -0.0026, "step": 1570 }, { "clip_ratio": 0.0002885148787754588, "epoch": 0.004361490069350746, "grad_norm": 0.07476944476366043, "kl": 0.06176266819238663, "learning_rate": 3e-06, "loss": -0.0027, "step": 1571 }, { "clip_ratio": 0.0001713502424536273, "epoch": 0.004364266320190562, "grad_norm": 0.07194018363952637, "kl": 0.06364806741476059, "learning_rate": 3e-06, "loss": -0.0024, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 238.9791717529297, "epoch": 0.004367042571030378, "grad_norm": 0.06975971907377243, "kl": 0.07021701335906982, "learning_rate": 3e-06, "loss": 0.0159, "reward": 0.22500000894069672, "reward_std": 0.18065783381462097, "rewards/countdown_reward_func": 0.22500000149011612, "step": 1573, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.004369818821870194, "grad_norm": 0.06149615719914436, "kl": 0.07013114541769028, "learning_rate": 3e-06, "loss": 0.0162, "step": 1574 }, { "clip_ratio": 0.0001724217290757224, "epoch": 0.00437259507271001, "grad_norm": 0.06460988521575928, "kl": 0.07087487727403641, "learning_rate": 3e-06, "loss": 0.0164, "step": 1575 }, { "clip_ratio": 0.0001953798928298056, "epoch": 0.004375371323549825, "grad_norm": 0.08817639201879501, "kl": 0.07189563289284706, "learning_rate": 3e-06, "loss": 0.0164, "step": 1576 }, { "clip_ratio": 0.00010486577230039984, "epoch": 0.0043781475743896415, "grad_norm": 0.0764622911810875, "kl": 0.06396012753248215, "learning_rate": 3e-06, "loss": 0.0164, "step": 1577 }, { "clip_ratio": 0.0002441406322759576, "epoch": 0.004380923825229457, "grad_norm": 0.06788492202758789, "kl": 0.06740446761250496, "learning_rate": 3e-06, "loss": 0.0158, "step": 1578 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.004383700076069273, "grad_norm": 0.06214107945561409, "kl": 0.07128945738077164, "learning_rate": 3e-06, "loss": 0.0165, "step": 1579 }, { "clip_ratio": 0.00024570666573708877, "epoch": 0.004386476326909089, "grad_norm": 0.06276597827672958, "kl": 0.0706591084599495, "learning_rate": 3e-06, "loss": 0.0157, "step": 1580 }, { "clip_ratio": 0.00033518214331706986, "epoch": 0.004389252577748905, "grad_norm": 0.06644298136234283, "kl": 0.07160716131329536, "learning_rate": 3e-06, "loss": 0.0156, "step": 1581 }, { "clip_ratio": 0.00033465474552940577, "epoch": 0.00439202882858872, "grad_norm": 0.09485163539648056, "kl": 0.07267293706536293, "learning_rate": 3e-06, "loss": 0.0154, "step": 1582 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.0043948050794285366, "grad_norm": 0.07193972915410995, "kl": 0.06612110696732998, "learning_rate": 3e-06, "loss": 0.0152, "step": 1583 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004397581330268353, "grad_norm": 0.06304839253425598, "kl": 0.06994717195630074, "learning_rate": 3e-06, "loss": 0.0155, "step": 1584 }, { "clip_ratio": 0.00011627907224465162, "completion_length": 229.9166717529297, "epoch": 0.004400357581108168, "grad_norm": 0.16353268921375275, "kl": 0.07109616324305534, "learning_rate": 3e-06, "loss": 0.012, "reward": 0.3437500298023224, "reward_std": 0.33480696380138397, "rewards/countdown_reward_func": 0.3437500298023224, "step": 1585, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0044031338319479845, "grad_norm": 0.09060167521238327, "kl": 0.06814149767160416, "learning_rate": 3e-06, "loss": 0.0127, "step": 1586 }, { "clip_ratio": 0.00019944607629440725, "epoch": 0.0044059100827878, "grad_norm": 0.11436658352613449, "kl": 0.06723786145448685, "learning_rate": 3e-06, "loss": 0.0122, "step": 1587 }, { "clip_ratio": 0.00029042133246548474, "epoch": 0.004408686333627616, "grad_norm": 0.09583216160535812, "kl": 0.07507549971342087, "learning_rate": 3e-06, "loss": 0.0116, "step": 1588 }, { "clip_ratio": 0.0, "epoch": 0.004411462584467432, "grad_norm": 0.08748263865709305, "kl": 0.07277511805295944, "learning_rate": 3e-06, "loss": 0.0118, "step": 1589 }, { "clip_ratio": 0.0005822835373692214, "epoch": 0.004414238835307248, "grad_norm": 0.10275944322347641, "kl": 0.06728723272681236, "learning_rate": 3e-06, "loss": 0.0114, "step": 1590 }, { "clip_ratio": 0.00011627907224465162, "epoch": 0.004417015086147063, "grad_norm": 0.15305157005786896, "kl": 0.0735839381814003, "learning_rate": 3e-06, "loss": 0.0102, "step": 1591 }, { "clip_ratio": 0.00043021741294069216, "epoch": 0.0044197913369868795, "grad_norm": 0.1069241613149643, "kl": 0.07472379878163338, "learning_rate": 3e-06, "loss": 0.0111, "step": 1592 }, { "clip_ratio": 0.0, "epoch": 0.004422567587826695, "grad_norm": 0.09179497510194778, "kl": 0.07055623456835747, "learning_rate": 3e-06, "loss": 0.0105, "step": 1593 }, { "clip_ratio": 0.00019944607629440725, "epoch": 0.004425343838666511, "grad_norm": 0.09448972344398499, "kl": 0.08183940127491951, "learning_rate": 3e-06, "loss": 0.0114, "step": 1594 }, { "clip_ratio": 8.316699677379802e-05, "epoch": 0.0044281200895063275, "grad_norm": 0.12228698283433914, "kl": 0.07783747464418411, "learning_rate": 3e-06, "loss": 0.01, "step": 1595 }, { "clip_ratio": 0.0004985033592674881, "epoch": 0.004430896340346143, "grad_norm": 0.13479016721248627, "kl": 0.07179780304431915, "learning_rate": 3e-06, "loss": 0.0101, "step": 1596 }, { "clip_ratio": 0.0002777284535113722, "completion_length": 230.43750762939453, "epoch": 0.004433672591185959, "grad_norm": 0.11454746127128601, "kl": 0.06979512795805931, "learning_rate": 3e-06, "loss": 0.0316, "reward": 0.30416668951511383, "reward_std": 0.2991009056568146, "rewards/countdown_reward_func": 0.30416667461395264, "step": 1597, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00035156837839167565, "epoch": 0.004436448842025775, "grad_norm": 0.14501981437206268, "kl": 0.07220923155546188, "learning_rate": 3e-06, "loss": 0.0309, "step": 1598 }, { "clip_ratio": 9.484066686127335e-05, "epoch": 0.004439225092865591, "grad_norm": 0.09493423253297806, "kl": 0.07610296085476875, "learning_rate": 3e-06, "loss": 0.0313, "step": 1599 }, { "clip_ratio": 0.0005380346046877094, "epoch": 0.004442001343705406, "grad_norm": 0.10318285971879959, "kl": 0.07600558176636696, "learning_rate": 3e-06, "loss": 0.0309, "step": 1600 }, { "clip_ratio": 0.0, "epoch": 0.0044447775945452225, "grad_norm": 0.1465279459953308, "kl": 0.07435224205255508, "learning_rate": 3e-06, "loss": 0.0301, "step": 1601 }, { "clip_ratio": 0.0, "epoch": 0.004447553845385038, "grad_norm": 0.132258802652359, "kl": 0.07526106759905815, "learning_rate": 3e-06, "loss": 0.0296, "step": 1602 }, { "clip_ratio": 0.0003190559073118493, "epoch": 0.004450330096224854, "grad_norm": 0.11824331432580948, "kl": 0.07644139230251312, "learning_rate": 3e-06, "loss": 0.0287, "step": 1603 }, { "clip_ratio": 0.00036518805427476764, "epoch": 0.00445310634706467, "grad_norm": 0.12642835080623627, "kl": 0.08045900240540504, "learning_rate": 3e-06, "loss": 0.028, "step": 1604 }, { "clip_ratio": 9.484066686127335e-05, "epoch": 0.004455882597904486, "grad_norm": 0.08899746090173721, "kl": 0.08476192504167557, "learning_rate": 3e-06, "loss": 0.029, "step": 1605 }, { "clip_ratio": 0.00035244174068793654, "epoch": 0.004458658848744302, "grad_norm": 0.09743326157331467, "kl": 0.08727932721376419, "learning_rate": 3e-06, "loss": 0.0277, "step": 1606 }, { "clip_ratio": 0.0, "epoch": 0.004461435099584118, "grad_norm": 0.13737818598747253, "kl": 0.08706852421164513, "learning_rate": 3e-06, "loss": 0.0263, "step": 1607 }, { "clip_ratio": 0.0005535711825359613, "epoch": 0.004464211350423934, "grad_norm": 0.12821559607982635, "kl": 0.08946230262517929, "learning_rate": 3e-06, "loss": 0.0255, "step": 1608 }, { "clip_ratio": 0.0003540786055964418, "completion_length": 227.70833587646484, "epoch": 0.004466987601263749, "grad_norm": 0.11783099174499512, "kl": 0.10899167135357857, "learning_rate": 3e-06, "loss": -0.0008, "reward": 0.30416668951511383, "reward_std": 0.3719724863767624, "rewards/countdown_reward_func": 0.30416668951511383, "step": 1609, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.704969124868512e-05, "epoch": 0.0044697638521035655, "grad_norm": 0.10847090929746628, "kl": 0.10389678180217743, "learning_rate": 3e-06, "loss": -0.0007, "step": 1610 }, { "clip_ratio": 0.0006049876101315022, "epoch": 0.004472540102943381, "grad_norm": 0.11665836721658707, "kl": 0.10452697053551674, "learning_rate": 3e-06, "loss": -0.0001, "step": 1611 }, { "clip_ratio": 0.000750809209421277, "epoch": 0.004475316353783197, "grad_norm": 0.1398657262325287, "kl": 0.12372316420078278, "learning_rate": 3e-06, "loss": -0.0002, "step": 1612 }, { "clip_ratio": 0.0, "epoch": 0.004478092604623013, "grad_norm": 0.09913130849599838, "kl": 0.11288458853960037, "learning_rate": 3e-06, "loss": -0.0017, "step": 1613 }, { "clip_ratio": 0.00017424796533305198, "epoch": 0.004480868855462829, "grad_norm": 0.1439204066991806, "kl": 0.1121748648583889, "learning_rate": 3e-06, "loss": -0.0008, "step": 1614 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.004483645106302644, "grad_norm": 0.11864045262336731, "kl": 0.12259047850966454, "learning_rate": 3e-06, "loss": 0.0006, "step": 1615 }, { "clip_ratio": 0.0, "epoch": 0.0044864213571424606, "grad_norm": 0.10626986622810364, "kl": 0.11897166818380356, "learning_rate": 3e-06, "loss": -0.0003, "step": 1616 }, { "clip_ratio": 0.0007439094333676621, "epoch": 0.004489197607982277, "grad_norm": 0.10574115067720413, "kl": 0.1181485652923584, "learning_rate": 3e-06, "loss": 0.0004, "step": 1617 }, { "clip_ratio": 0.000587511618505232, "epoch": 0.004491973858822092, "grad_norm": 0.12936261296272278, "kl": 0.13641805946826935, "learning_rate": 3e-06, "loss": -0.0005, "step": 1618 }, { "clip_ratio": 0.0001871362328529358, "epoch": 0.0044947501096619085, "grad_norm": 0.09758912771940231, "kl": 0.12104825302958488, "learning_rate": 3e-06, "loss": -0.0026, "step": 1619 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.004497526360501724, "grad_norm": 0.10167786478996277, "kl": 0.11872199177742004, "learning_rate": 3e-06, "loss": -0.0008, "step": 1620 }, { "clip_ratio": 0.0004541333037195727, "completion_length": 230.33333587646484, "epoch": 0.00450030261134154, "grad_norm": 0.09969217330217361, "kl": 0.13528293371200562, "learning_rate": 3e-06, "loss": 0.0064, "reward": 0.2875000238418579, "reward_std": 0.2661401033401489, "rewards/countdown_reward_func": 0.2875000163912773, "step": 1621, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.004503078862181356, "grad_norm": 0.11825183779001236, "kl": 0.11860142275691032, "learning_rate": 3e-06, "loss": 0.0048, "step": 1622 }, { "clip_ratio": 0.0, "epoch": 0.004505855113021172, "grad_norm": 0.09762841463088989, "kl": 0.1295238509774208, "learning_rate": 3e-06, "loss": 0.0057, "step": 1623 }, { "clip_ratio": 0.00018274853937327862, "epoch": 0.004508631363860987, "grad_norm": 0.1245664656162262, "kl": 0.12989997118711472, "learning_rate": 3e-06, "loss": 0.0048, "step": 1624 }, { "clip_ratio": 0.0004756918060593307, "epoch": 0.0045114076147008035, "grad_norm": 0.09959601610898972, "kl": 0.13353952765464783, "learning_rate": 3e-06, "loss": 0.0055, "step": 1625 }, { "clip_ratio": 0.0004383840714581311, "epoch": 0.004514183865540619, "grad_norm": 0.10265988856554031, "kl": 0.1284654214978218, "learning_rate": 3e-06, "loss": 0.0058, "step": 1626 }, { "clip_ratio": 0.0007191235781647265, "epoch": 0.004516960116380435, "grad_norm": 0.09630168974399567, "kl": 0.13352051377296448, "learning_rate": 3e-06, "loss": 0.0057, "step": 1627 }, { "clip_ratio": 0.0001932617014972493, "epoch": 0.0045197363672202515, "grad_norm": 0.1130925789475441, "kl": 0.11547379568219185, "learning_rate": 3e-06, "loss": 0.0042, "step": 1628 }, { "clip_ratio": 0.00018102824105881155, "epoch": 0.004522512618060067, "grad_norm": 0.10465371608734131, "kl": 0.12602712959051132, "learning_rate": 3e-06, "loss": 0.0042, "step": 1629 }, { "clip_ratio": 0.0002716227754717693, "epoch": 0.004525288868899883, "grad_norm": 0.11785195767879486, "kl": 0.12726370617747307, "learning_rate": 3e-06, "loss": 0.003, "step": 1630 }, { "clip_ratio": 0.00018102824105881155, "epoch": 0.004528065119739699, "grad_norm": 0.1045258566737175, "kl": 0.12971790879964828, "learning_rate": 3e-06, "loss": 0.0042, "step": 1631 }, { "clip_ratio": 0.0006421779398806393, "epoch": 0.004530841370579515, "grad_norm": 0.10235866159200668, "kl": 0.12507328763604164, "learning_rate": 3e-06, "loss": 0.004, "step": 1632 }, { "clip_ratio": 9.61538462433964e-05, "completion_length": 237.31250762939453, "epoch": 0.00453361762141933, "grad_norm": 0.10965386778116226, "kl": 0.11101473122835159, "learning_rate": 3e-06, "loss": 0.0247, "reward": 0.34166668355464935, "reward_std": 0.3452594429254532, "rewards/countdown_reward_func": 0.34166668355464935, "step": 1633, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0002707348467083648, "epoch": 0.0045363938722591465, "grad_norm": 0.10254738479852676, "kl": 0.11892389506101608, "learning_rate": 3e-06, "loss": 0.0251, "step": 1634 }, { "clip_ratio": 0.00018317173817194998, "epoch": 0.004539170123098962, "grad_norm": 0.1211012601852417, "kl": 0.11001782864332199, "learning_rate": 3e-06, "loss": 0.0254, "step": 1635 }, { "clip_ratio": 0.0007125435367925093, "epoch": 0.004541946373938778, "grad_norm": 0.10486630350351334, "kl": 0.11037512868642807, "learning_rate": 3e-06, "loss": 0.025, "step": 1636 }, { "clip_ratio": 0.0001923076924867928, "epoch": 0.004544722624778594, "grad_norm": 0.11679045855998993, "kl": 0.11175461858510971, "learning_rate": 3e-06, "loss": 0.025, "step": 1637 }, { "clip_ratio": 0.00017780938651412725, "epoch": 0.00454749887561841, "grad_norm": 0.11241600662469864, "kl": 0.11684262380003929, "learning_rate": 3e-06, "loss": 0.0248, "step": 1638 }, { "clip_ratio": 8.383634849451482e-05, "epoch": 0.004550275126458226, "grad_norm": 0.12549988925457, "kl": 0.1113254725933075, "learning_rate": 3e-06, "loss": 0.0236, "step": 1639 }, { "clip_ratio": 0.0003387995529919863, "epoch": 0.004553051377298042, "grad_norm": 0.09110260754823685, "kl": 0.11747504025697708, "learning_rate": 3e-06, "loss": 0.0238, "step": 1640 }, { "clip_ratio": 0.00026678377616917714, "epoch": 0.004555827628137858, "grad_norm": 0.12658299505710602, "kl": 0.11034082993865013, "learning_rate": 3e-06, "loss": 0.0242, "step": 1641 }, { "clip_ratio": 0.0003627651822171174, "epoch": 0.004558603878977673, "grad_norm": 0.1093655377626419, "kl": 0.11209141090512276, "learning_rate": 3e-06, "loss": 0.0244, "step": 1642 }, { "clip_ratio": 9.61538462433964e-05, "epoch": 0.0045613801298174895, "grad_norm": 0.11235036700963974, "kl": 0.11257147789001465, "learning_rate": 3e-06, "loss": 0.0237, "step": 1643 }, { "clip_ratio": 8.890469325706363e-05, "epoch": 0.004564156380657305, "grad_norm": 0.11150780320167542, "kl": 0.11817445978522301, "learning_rate": 3e-06, "loss": 0.024, "step": 1644 }, { "clip_ratio": 0.00017182034207507968, "completion_length": 218.52083587646484, "epoch": 0.004566932631497121, "grad_norm": 0.11840204149484634, "kl": 0.13194195181131363, "learning_rate": 3e-06, "loss": 0.0075, "reward": 0.3791666626930237, "reward_std": 0.36280614137649536, "rewards/countdown_reward_func": 0.3791666626930237, "step": 1645, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0001797049117158167, "epoch": 0.004569708882336937, "grad_norm": 0.10418499261140823, "kl": 0.13568759709596634, "learning_rate": 3e-06, "loss": 0.0073, "step": 1646 }, { "clip_ratio": 0.00017531556659378111, "epoch": 0.004572485133176753, "grad_norm": 0.13569895923137665, "kl": 0.1283918097615242, "learning_rate": 3e-06, "loss": 0.0067, "step": 1647 }, { "clip_ratio": 0.00020443140238057822, "epoch": 0.004575261384016568, "grad_norm": 0.13669107854366302, "kl": 0.13637378439307213, "learning_rate": 3e-06, "loss": 0.0073, "step": 1648 }, { "clip_ratio": 0.0, "epoch": 0.0045780376348563846, "grad_norm": 0.10260222107172012, "kl": 0.1373545378446579, "learning_rate": 3e-06, "loss": 0.0077, "step": 1649 }, { "clip_ratio": 0.0004510040016612038, "epoch": 0.004580813885696201, "grad_norm": 0.1269139051437378, "kl": 0.14053868502378464, "learning_rate": 3e-06, "loss": 0.0081, "step": 1650 }, { "clip_ratio": 0.0006953472329769284, "epoch": 0.004583590136536016, "grad_norm": 0.11335631459951401, "kl": 0.12832976132631302, "learning_rate": 3e-06, "loss": 0.0068, "step": 1651 }, { "clip_ratio": 0.0004522776580415666, "epoch": 0.0045863663873758325, "grad_norm": 0.12118861079216003, "kl": 0.13261255621910095, "learning_rate": 3e-06, "loss": 0.0063, "step": 1652 }, { "clip_ratio": 0.0, "epoch": 0.004589142638215648, "grad_norm": 0.130536749958992, "kl": 0.12578046321868896, "learning_rate": 3e-06, "loss": 0.0049, "step": 1653 }, { "clip_ratio": 0.00020579837291734293, "epoch": 0.004591918889055464, "grad_norm": 0.13175055384635925, "kl": 0.13097375631332397, "learning_rate": 3e-06, "loss": 0.0062, "step": 1654 }, { "clip_ratio": 0.00038550328463315964, "epoch": 0.00459469513989528, "grad_norm": 0.0984969213604927, "kl": 0.13125114515423775, "learning_rate": 3e-06, "loss": 0.0059, "step": 1655 }, { "clip_ratio": 0.0004875610757153481, "epoch": 0.004597471390735096, "grad_norm": 0.11173491179943085, "kl": 0.13282914832234383, "learning_rate": 3e-06, "loss": 0.0056, "step": 1656 }, { "clip_ratio": 0.0001720578147796914, "completion_length": 235.7916717529297, "epoch": 0.004600247641574911, "grad_norm": 0.13065429031848907, "kl": 0.10866416990756989, "learning_rate": 3e-06, "loss": 0.015, "reward": 0.30625002831220627, "reward_std": 0.31206804513931274, "rewards/countdown_reward_func": 0.3062500134110451, "step": 1657, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.650519157527015e-05, "epoch": 0.0046030238924147275, "grad_norm": 0.09855504333972931, "kl": 0.10450971871614456, "learning_rate": 3e-06, "loss": 0.0145, "step": 1658 }, { "clip_ratio": 0.0006019261782057583, "epoch": 0.004605800143254543, "grad_norm": 0.08144383132457733, "kl": 0.11103589460253716, "learning_rate": 3e-06, "loss": 0.0156, "step": 1659 }, { "clip_ratio": 0.00030096308910287917, "epoch": 0.004608576394094359, "grad_norm": 0.09432277828454971, "kl": 0.10549774393439293, "learning_rate": 3e-06, "loss": 0.0161, "step": 1660 }, { "clip_ratio": 0.0004300602595321834, "epoch": 0.0046113526449341755, "grad_norm": 0.09225940704345703, "kl": 0.108955267816782, "learning_rate": 3e-06, "loss": 0.0163, "step": 1661 }, { "clip_ratio": 0.00045322794176172465, "epoch": 0.004614128895773991, "grad_norm": 0.12361875921487808, "kl": 0.1058424860239029, "learning_rate": 3e-06, "loss": 0.0149, "step": 1662 }, { "clip_ratio": 8.60289073898457e-05, "epoch": 0.004616905146613807, "grad_norm": 0.11309605091810226, "kl": 0.10573908686637878, "learning_rate": 3e-06, "loss": 0.0146, "step": 1663 }, { "clip_ratio": 0.00016788540233392268, "epoch": 0.004619681397453623, "grad_norm": 0.09809679538011551, "kl": 0.10098403319716454, "learning_rate": 3e-06, "loss": 0.0145, "step": 1664 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.004622457648293439, "grad_norm": 0.07421303540468216, "kl": 0.10688856989145279, "learning_rate": 3e-06, "loss": 0.0152, "step": 1665 }, { "clip_ratio": 0.00010032102727564052, "epoch": 0.004625233899133254, "grad_norm": 0.09634650498628616, "kl": 0.10281075537204742, "learning_rate": 3e-06, "loss": 0.0153, "step": 1666 }, { "clip_ratio": 0.0004270849240128882, "epoch": 0.0046280101499730705, "grad_norm": 0.10299887508153915, "kl": 0.10465852543711662, "learning_rate": 3e-06, "loss": 0.0147, "step": 1667 }, { "clip_ratio": 0.0006324783898890018, "epoch": 0.004630786400812886, "grad_norm": 0.11212204396724701, "kl": 0.10365309193730354, "learning_rate": 3e-06, "loss": 0.0127, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 232.75, "epoch": 0.004633562651652702, "grad_norm": 0.06845124810934067, "kl": 0.09933054447174072, "learning_rate": 3e-06, "loss": -0.008, "reward": 0.15416667610406876, "reward_std": 0.1091257855296135, "rewards/countdown_reward_func": 0.15416667610406876, "step": 1669, "zero_std_ratio": 0.625 }, { "clip_ratio": 8.45165632199496e-05, "epoch": 0.004636338902492518, "grad_norm": 0.0753096267580986, "kl": 0.10704569146037102, "learning_rate": 3e-06, "loss": -0.0078, "step": 1670 }, { "clip_ratio": 0.00035820446646539494, "epoch": 0.004639115153332334, "grad_norm": 0.056882914155721664, "kl": 0.10202248394489288, "learning_rate": 3e-06, "loss": -0.008, "step": 1671 }, { "clip_ratio": 0.0, "epoch": 0.00464189140417215, "grad_norm": 0.0649522915482521, "kl": 0.0977204330265522, "learning_rate": 3e-06, "loss": -0.008, "step": 1672 }, { "clip_ratio": 0.0, "epoch": 0.004644667655011966, "grad_norm": 0.06856367737054825, "kl": 0.09061568230390549, "learning_rate": 3e-06, "loss": -0.0085, "step": 1673 }, { "clip_ratio": 0.0005575706818490289, "epoch": 0.004647443905851782, "grad_norm": 0.09739033877849579, "kl": 0.09403853490948677, "learning_rate": 3e-06, "loss": -0.0085, "step": 1674 }, { "clip_ratio": 9.412650251761079e-05, "epoch": 0.004650220156691597, "grad_norm": 0.06736251711845398, "kl": 0.09165050461888313, "learning_rate": 3e-06, "loss": -0.0087, "step": 1675 }, { "clip_ratio": 0.00017174614913528785, "epoch": 0.0046529964075314135, "grad_norm": 0.0773087590932846, "kl": 0.09735730290412903, "learning_rate": 3e-06, "loss": -0.0086, "step": 1676 }, { "clip_ratio": 0.001088498393073678, "epoch": 0.004655772658371229, "grad_norm": 0.05607219412922859, "kl": 0.0904586911201477, "learning_rate": 3e-06, "loss": -0.009, "step": 1677 }, { "clip_ratio": 0.0009852994699031115, "epoch": 0.004658548909211045, "grad_norm": 0.07360868901014328, "kl": 0.08624210581183434, "learning_rate": 3e-06, "loss": -0.0091, "step": 1678 }, { "clip_ratio": 0.0013295641401782632, "epoch": 0.004661325160050861, "grad_norm": 0.06193244829773903, "kl": 0.07950897514820099, "learning_rate": 3e-06, "loss": -0.0099, "step": 1679 }, { "clip_ratio": 0.001959454733878374, "epoch": 0.004664101410890677, "grad_norm": 0.06660173088312149, "kl": 0.0817050151526928, "learning_rate": 3e-06, "loss": -0.0093, "step": 1680 }, { "clip_ratio": 0.00019872814300470054, "completion_length": 224.27083587646484, "epoch": 0.004666877661730492, "grad_norm": 0.11621461063623428, "kl": 0.09354111552238464, "learning_rate": 3e-06, "loss": -0.0011, "reward": 0.3020833432674408, "reward_std": 0.2798703759908676, "rewards/countdown_reward_func": 0.3020833358168602, "step": 1681, "zero_std_ratio": 0.125 }, { "clip_ratio": 8.644536865176633e-05, "epoch": 0.0046696539125703086, "grad_norm": 0.09669879823923111, "kl": 0.07669191434979439, "learning_rate": 3e-06, "loss": -0.0009, "step": 1682 }, { "clip_ratio": 0.0003749770621652715, "epoch": 0.004672430163410125, "grad_norm": 0.10819410532712936, "kl": 0.08729588240385056, "learning_rate": 3e-06, "loss": -0.001, "step": 1683 }, { "clip_ratio": 8.716875890968367e-05, "epoch": 0.00467520641424994, "grad_norm": 0.08888690173625946, "kl": 0.07762591540813446, "learning_rate": 3e-06, "loss": -0.0003, "step": 1684 }, { "clip_ratio": 8.704735228093341e-05, "epoch": 0.0046779826650897565, "grad_norm": 0.08127215504646301, "kl": 0.08648081123828888, "learning_rate": 3e-06, "loss": -0.0002, "step": 1685 }, { "clip_ratio": 8.704735228093341e-05, "epoch": 0.004680758915929572, "grad_norm": 0.08273155987262726, "kl": 0.0694781243801117, "learning_rate": 3e-06, "loss": -0.0011, "step": 1686 }, { "clip_ratio": 0.0006585983792319894, "epoch": 0.004683535166769388, "grad_norm": 0.1248389482498169, "kl": 0.08379602804780006, "learning_rate": 3e-06, "loss": -0.0014, "step": 1687 }, { "clip_ratio": 0.00037935634463792667, "epoch": 0.004686311417609204, "grad_norm": 0.09269773215055466, "kl": 0.06929580494761467, "learning_rate": 3e-06, "loss": -0.0026, "step": 1688 }, { "clip_ratio": 0.0001979566877707839, "epoch": 0.00468908766844902, "grad_norm": 0.10713022947311401, "kl": 0.08045705035328865, "learning_rate": 3e-06, "loss": -0.0023, "step": 1689 }, { "clip_ratio": 0.0003845375686069019, "epoch": 0.004691863919288835, "grad_norm": 0.093922920525074, "kl": 0.07109234854578972, "learning_rate": 3e-06, "loss": -0.001, "step": 1690 }, { "clip_ratio": 0.000683231744915247, "epoch": 0.0046946401701286515, "grad_norm": 0.08436852693557739, "kl": 0.07955209910869598, "learning_rate": 3e-06, "loss": -0.0011, "step": 1691 }, { "clip_ratio": 0.0004670836788136512, "epoch": 0.004697416420968467, "grad_norm": 0.07506747543811798, "kl": 0.06490103155374527, "learning_rate": 3e-06, "loss": -0.0018, "step": 1692 }, { "clip_ratio": 0.0004444956357474439, "completion_length": 228.9791717529297, "epoch": 0.004700192671808283, "grad_norm": 0.08811832219362259, "kl": 0.07729529216885567, "learning_rate": 3e-06, "loss": -0.0021, "reward": 0.2645833492279053, "reward_std": 0.23099135607481003, "rewards/countdown_reward_func": 0.2645833417773247, "step": 1693, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0047029689226480995, "grad_norm": 0.13022494316101074, "kl": 0.07385160028934479, "learning_rate": 3e-06, "loss": -0.0017, "step": 1694 }, { "clip_ratio": 0.0014592632069252431, "epoch": 0.004705745173487915, "grad_norm": 0.18172861635684967, "kl": 0.07337892800569534, "learning_rate": 3e-06, "loss": -0.0003, "step": 1695 }, { "clip_ratio": 0.0005964136798866093, "epoch": 0.004708521424327731, "grad_norm": 0.0807371437549591, "kl": 0.07234909385442734, "learning_rate": 3e-06, "loss": -0.0011, "step": 1696 }, { "clip_ratio": 0.0004422512211021967, "epoch": 0.004711297675167547, "grad_norm": 0.07396223396062851, "kl": 0.07509288936853409, "learning_rate": 3e-06, "loss": -0.002, "step": 1697 }, { "clip_ratio": 0.0003952159022446722, "epoch": 0.004714073926007363, "grad_norm": 0.09103484451770782, "kl": 0.08038389682769775, "learning_rate": 3e-06, "loss": -0.0014, "step": 1698 }, { "clip_ratio": 9.873617818811908e-05, "epoch": 0.004716850176847178, "grad_norm": 0.1258394420146942, "kl": 0.07613859698176384, "learning_rate": 3e-06, "loss": -0.0025, "step": 1699 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0047196264276869945, "grad_norm": 0.12652093172073364, "kl": 0.07367661595344543, "learning_rate": 3e-06, "loss": -0.0034, "step": 1700 }, { "clip_ratio": 0.0015260791988112032, "epoch": 0.00472240267852681, "grad_norm": 0.07796800136566162, "kl": 0.07423160970211029, "learning_rate": 3e-06, "loss": -0.0021, "step": 1701 }, { "clip_ratio": 0.00018615041335579008, "epoch": 0.004725178929366626, "grad_norm": 0.0833539217710495, "kl": 0.07227658852934837, "learning_rate": 3e-06, "loss": -0.0026, "step": 1702 }, { "clip_ratio": 0.0006381928396876901, "epoch": 0.004727955180206442, "grad_norm": 0.08434394001960754, "kl": 0.07486193627119064, "learning_rate": 3e-06, "loss": -0.0027, "step": 1703 }, { "clip_ratio": 0.0013060531928204, "epoch": 0.004730731431046258, "grad_norm": 0.07230468839406967, "kl": 0.08221058547496796, "learning_rate": 3e-06, "loss": -0.0026, "step": 1704 }, { "clip_ratio": 0.0002603422835818492, "completion_length": 229.4166717529297, "epoch": 0.004733507681886074, "grad_norm": 0.10037482529878616, "kl": 0.06833513081073761, "learning_rate": 3e-06, "loss": 0.0286, "reward": 0.36249999701976776, "reward_std": 0.33992572128772736, "rewards/countdown_reward_func": 0.36249999701976776, "step": 1705, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0047362839327258896, "grad_norm": 0.10078221559524536, "kl": 0.06954888440668583, "learning_rate": 3e-06, "loss": 0.0288, "step": 1706 }, { "clip_ratio": 0.00036663911305367947, "epoch": 0.004739060183565706, "grad_norm": 0.08536067605018616, "kl": 0.07179216295480728, "learning_rate": 3e-06, "loss": 0.028, "step": 1707 }, { "clip_ratio": 0.0001773409530869685, "epoch": 0.004741836434405521, "grad_norm": 0.11112114787101746, "kl": 0.061546871438622475, "learning_rate": 3e-06, "loss": 0.0281, "step": 1708 }, { "clip_ratio": 0.00025751072098501027, "epoch": 0.0047446126852453375, "grad_norm": 0.24011927843093872, "kl": 0.07169570401310921, "learning_rate": 3e-06, "loss": 0.0287, "step": 1709 }, { "clip_ratio": 0.0001065643664333038, "epoch": 0.004747388936085153, "grad_norm": 0.09349286556243896, "kl": 0.07777471095323563, "learning_rate": 3e-06, "loss": 0.029, "step": 1710 }, { "clip_ratio": 9.433962259208784e-05, "epoch": 0.004750165186924969, "grad_norm": 0.10448563098907471, "kl": 0.07240425795316696, "learning_rate": 3e-06, "loss": 0.0277, "step": 1711 }, { "clip_ratio": 9.211496217176318e-05, "epoch": 0.004752941437764785, "grad_norm": 0.09998632222414017, "kl": 0.07552265375852585, "learning_rate": 3e-06, "loss": 0.0273, "step": 1712 }, { "clip_ratio": 9.433962259208784e-05, "epoch": 0.004755717688604601, "grad_norm": 0.09559866786003113, "kl": 0.07743804901838303, "learning_rate": 3e-06, "loss": 0.0268, "step": 1713 }, { "clip_ratio": 0.0007562700993730687, "epoch": 0.004758493939444416, "grad_norm": 0.10372816026210785, "kl": 0.06693735346198082, "learning_rate": 3e-06, "loss": 0.0273, "step": 1714 }, { "clip_ratio": 0.0005361997973523103, "epoch": 0.0047612701902842326, "grad_norm": 0.13456197082996368, "kl": 0.07971575111150742, "learning_rate": 3e-06, "loss": 0.0266, "step": 1715 }, { "clip_ratio": 0.00019023237109649926, "epoch": 0.004764046441124049, "grad_norm": 0.10750555992126465, "kl": 0.08734529465436935, "learning_rate": 3e-06, "loss": 0.0272, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 239.375, "epoch": 0.004766822691963864, "grad_norm": 0.10056161880493164, "kl": 0.0764843225479126, "learning_rate": 3e-06, "loss": 0.0052, "reward": 0.3229166939854622, "reward_std": 0.3206951767206192, "rewards/countdown_reward_func": 0.3229166939854622, "step": 1717, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00016971943114185706, "epoch": 0.0047695989428036805, "grad_norm": 0.0818033367395401, "kl": 0.0784323550760746, "learning_rate": 3e-06, "loss": 0.0056, "step": 1718 }, { "clip_ratio": 0.000244140625, "epoch": 0.004772375193643496, "grad_norm": 0.11514274030923843, "kl": 0.08092424646019936, "learning_rate": 3e-06, "loss": 0.0055, "step": 1719 }, { "clip_ratio": 0.00044265526230446994, "epoch": 0.004775151444483312, "grad_norm": 0.08406610041856766, "kl": 0.07746245339512825, "learning_rate": 3e-06, "loss": 0.0053, "step": 1720 }, { "clip_ratio": 0.00016971943114185706, "epoch": 0.004777927695323128, "grad_norm": 0.0868329182267189, "kl": 0.08470290526747704, "learning_rate": 3e-06, "loss": 0.0054, "step": 1721 }, { "clip_ratio": 9.07111752894707e-05, "epoch": 0.004780703946162944, "grad_norm": 0.11065360903739929, "kl": 0.08721869066357613, "learning_rate": 3e-06, "loss": 0.0062, "step": 1722 }, { "clip_ratio": 0.0, "epoch": 0.004783480197002759, "grad_norm": 0.09971947222948074, "kl": 0.08457087725400925, "learning_rate": 3e-06, "loss": 0.0049, "step": 1723 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0047862564478425755, "grad_norm": 0.0916123166680336, "kl": 0.08624141290783882, "learning_rate": 3e-06, "loss": 0.0043, "step": 1724 }, { "clip_ratio": 0.0, "epoch": 0.004789032698682391, "grad_norm": 0.09791970252990723, "kl": 0.08757534250617027, "learning_rate": 3e-06, "loss": 0.0052, "step": 1725 }, { "clip_ratio": 0.00026201604487141594, "epoch": 0.004791808949522207, "grad_norm": 0.11371166259050369, "kl": 0.08409228920936584, "learning_rate": 3e-06, "loss": 0.0047, "step": 1726 }, { "clip_ratio": 8.833922038320452e-05, "epoch": 0.0047945852003620235, "grad_norm": 0.08852825313806534, "kl": 0.09211602807044983, "learning_rate": 3e-06, "loss": 0.0046, "step": 1727 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.004797361451201839, "grad_norm": 0.08812572062015533, "kl": 0.0936342217028141, "learning_rate": 3e-06, "loss": 0.005, "step": 1728 }, { "clip_ratio": 8.333333244081587e-05, "completion_length": 226.58334350585938, "epoch": 0.004800137702041655, "grad_norm": 0.08161304146051407, "kl": 0.10271313786506653, "learning_rate": 3e-06, "loss": 0.0168, "reward": 0.28333336114883423, "reward_std": 0.21825158596038818, "rewards/countdown_reward_func": 0.28333333134651184, "step": 1729, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00017384745297022164, "epoch": 0.004802913952881471, "grad_norm": 0.09780708700418472, "kl": 0.0883183516561985, "learning_rate": 3e-06, "loss": 0.0166, "step": 1730 }, { "clip_ratio": 0.0, "epoch": 0.004805690203721287, "grad_norm": 0.18682555854320526, "kl": 0.09598564729094505, "learning_rate": 3e-06, "loss": 0.0164, "step": 1731 }, { "clip_ratio": 0.0, "epoch": 0.004808466454561102, "grad_norm": 0.06634990125894547, "kl": 0.1002919152379036, "learning_rate": 3e-06, "loss": 0.0164, "step": 1732 }, { "clip_ratio": 8.947744936449453e-05, "epoch": 0.0048112427054009185, "grad_norm": 0.07831425219774246, "kl": 0.10307565331459045, "learning_rate": 3e-06, "loss": 0.0172, "step": 1733 }, { "clip_ratio": 0.0004471343127079308, "epoch": 0.004814018956240734, "grad_norm": 0.09252594411373138, "kl": 0.09812160581350327, "learning_rate": 3e-06, "loss": 0.0166, "step": 1734 }, { "clip_ratio": 0.00027552236133487895, "epoch": 0.00481679520708055, "grad_norm": 0.08623359352350235, "kl": 0.10652950406074524, "learning_rate": 3e-06, "loss": 0.0157, "step": 1735 }, { "clip_ratio": 0.0, "epoch": 0.004819571457920366, "grad_norm": 0.08916810154914856, "kl": 0.0917881578207016, "learning_rate": 3e-06, "loss": 0.016, "step": 1736 }, { "clip_ratio": 0.0, "epoch": 0.004822347708760182, "grad_norm": 0.11515611410140991, "kl": 0.09990720450878143, "learning_rate": 3e-06, "loss": 0.0151, "step": 1737 }, { "clip_ratio": 0.0, "epoch": 0.004825123959599998, "grad_norm": 0.06613731384277344, "kl": 0.10350186005234718, "learning_rate": 3e-06, "loss": 0.0154, "step": 1738 }, { "clip_ratio": 9.051412052940577e-05, "epoch": 0.0048279002104398136, "grad_norm": 0.08245322108268738, "kl": 0.10670360550284386, "learning_rate": 3e-06, "loss": 0.0161, "step": 1739 }, { "clip_ratio": 8.947744936449453e-05, "epoch": 0.00483067646127963, "grad_norm": 0.08258913457393646, "kl": 0.10113772377371788, "learning_rate": 3e-06, "loss": 0.0151, "step": 1740 }, { "clip_ratio": 9.999999747378752e-05, "completion_length": 238.39584350585938, "epoch": 0.004833452712119445, "grad_norm": 0.08632933348417282, "kl": 0.10816295444965363, "learning_rate": 3e-06, "loss": 0.0146, "reward": 0.3395833671092987, "reward_std": 0.3079586625099182, "rewards/countdown_reward_func": 0.3395833522081375, "step": 1741, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.000183780153747648, "epoch": 0.0048362289629592615, "grad_norm": 0.10708604753017426, "kl": 0.10778484493494034, "learning_rate": 3e-06, "loss": 0.0143, "step": 1742 }, { "clip_ratio": 8.468834857922047e-05, "epoch": 0.004839005213799077, "grad_norm": 0.11992800235748291, "kl": 0.10616076365113258, "learning_rate": 3e-06, "loss": 0.0148, "step": 1743 }, { "clip_ratio": 0.0, "epoch": 0.004841781464638893, "grad_norm": 0.08565586805343628, "kl": 0.11514625698328018, "learning_rate": 3e-06, "loss": 0.0148, "step": 1744 }, { "clip_ratio": 0.0, "epoch": 0.004844557715478709, "grad_norm": 0.10972116142511368, "kl": 0.11541270092129707, "learning_rate": 3e-06, "loss": 0.0141, "step": 1745 }, { "clip_ratio": 0.0, "epoch": 0.004847333966318525, "grad_norm": 0.11946596950292587, "kl": 0.11952852457761765, "learning_rate": 3e-06, "loss": 0.0145, "step": 1746 }, { "clip_ratio": 0.0003465483241598122, "epoch": 0.00485011021715834, "grad_norm": 0.1060066968202591, "kl": 0.11223845556378365, "learning_rate": 3e-06, "loss": 0.0138, "step": 1747 }, { "clip_ratio": 8.468834857922047e-05, "epoch": 0.0048528864679981566, "grad_norm": 0.09957956522703171, "kl": 0.11351840943098068, "learning_rate": 3e-06, "loss": 0.0139, "step": 1748 }, { "clip_ratio": 0.0, "epoch": 0.004855662718837973, "grad_norm": 0.10374125838279724, "kl": 0.11228137835860252, "learning_rate": 3e-06, "loss": 0.0126, "step": 1749 }, { "clip_ratio": 0.0001720578147796914, "epoch": 0.004858438969677788, "grad_norm": 0.08856089413166046, "kl": 0.121010672301054, "learning_rate": 3e-06, "loss": 0.0141, "step": 1750 }, { "clip_ratio": 0.0001767400826793164, "epoch": 0.0048612152205176045, "grad_norm": 0.13084955513477325, "kl": 0.12098020315170288, "learning_rate": 3e-06, "loss": 0.0122, "step": 1751 }, { "clip_ratio": 0.00024654832668602467, "epoch": 0.00486399147135742, "grad_norm": 0.10557901859283447, "kl": 0.12543104588985443, "learning_rate": 3e-06, "loss": 0.0119, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 231.7916717529297, "epoch": 0.004866767722197236, "grad_norm": 0.09326691925525665, "kl": 0.1041104681789875, "learning_rate": 3e-06, "loss": -0.0005, "reward": 0.3229166865348816, "reward_std": 0.2270909696817398, "rewards/countdown_reward_func": 0.3229166716337204, "step": 1753, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00018162409833166748, "epoch": 0.004869543973037052, "grad_norm": 0.1563946008682251, "kl": 0.10441932827234268, "learning_rate": 3e-06, "loss": 0.001, "step": 1754 }, { "clip_ratio": 0.0, "epoch": 0.004872320223876868, "grad_norm": 0.09415509551763535, "kl": 0.10655347630381584, "learning_rate": 3e-06, "loss": 0.0006, "step": 1755 }, { "clip_ratio": 0.0001735860641929321, "epoch": 0.004875096474716683, "grad_norm": 0.07791784405708313, "kl": 0.11189163848757744, "learning_rate": 3e-06, "loss": 0.001, "step": 1756 }, { "clip_ratio": 0.0005474452627822757, "epoch": 0.0048778727255564995, "grad_norm": 0.07700037211179733, "kl": 0.11374571919441223, "learning_rate": 3e-06, "loss": 0.0006, "step": 1757 }, { "clip_ratio": 0.0004572612378979102, "epoch": 0.004880648976396315, "grad_norm": 0.09134937077760696, "kl": 0.1076822318136692, "learning_rate": 3e-06, "loss": -0.0005, "step": 1758 }, { "clip_ratio": 0.0005507961686816998, "epoch": 0.004883425227236131, "grad_norm": 0.11843130737543106, "kl": 0.1056605763733387, "learning_rate": 3e-06, "loss": -0.0003, "step": 1759 }, { "clip_ratio": 0.0, "epoch": 0.0048862014780759475, "grad_norm": 0.16378162801265717, "kl": 0.10637029260396957, "learning_rate": 3e-06, "loss": 0.0001, "step": 1760 }, { "clip_ratio": 0.00017188674246426672, "epoch": 0.004888977728915763, "grad_norm": 0.09075536578893661, "kl": 0.10630379244685173, "learning_rate": 3e-06, "loss": -0.0009, "step": 1761 }, { "clip_ratio": 0.00036519287823466584, "epoch": 0.004891753979755579, "grad_norm": 0.07516534626483917, "kl": 0.11119216680526733, "learning_rate": 3e-06, "loss": -0.0004, "step": 1762 }, { "clip_ratio": 0.00018491214723326266, "epoch": 0.004894530230595395, "grad_norm": 0.07346532493829727, "kl": 0.1081731989979744, "learning_rate": 3e-06, "loss": -0.0006, "step": 1763 }, { "clip_ratio": 0.0012658692721743137, "epoch": 0.004897306481435211, "grad_norm": 0.09767357259988785, "kl": 0.10378069430589676, "learning_rate": 3e-06, "loss": -0.0013, "step": 1764 }, { "clip_ratio": 0.000739818497095257, "completion_length": 228.83333587646484, "epoch": 0.004900082732275026, "grad_norm": 0.04857170954346657, "kl": 0.11081210523843765, "learning_rate": 3e-06, "loss": 0.0008, "reward": 0.13333334028720856, "reward_std": 0.09824509173631668, "rewards/countdown_reward_func": 0.13333333283662796, "step": 1765, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.001252438290975988, "epoch": 0.0049028589831148425, "grad_norm": 0.06208095699548721, "kl": 0.11198806017637253, "learning_rate": 3e-06, "loss": 0.0006, "step": 1766 }, { "clip_ratio": 0.00036127932253293693, "epoch": 0.004905635233954658, "grad_norm": 0.06156253442168236, "kl": 0.11192040145397186, "learning_rate": 3e-06, "loss": 0.0009, "step": 1767 }, { "clip_ratio": 0.0002693784481380135, "epoch": 0.004908411484794474, "grad_norm": 0.05420385301113129, "kl": 0.10709425061941147, "learning_rate": 3e-06, "loss": 0.001, "step": 1768 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00491118773563429, "grad_norm": 0.054946351796388626, "kl": 0.10676027089357376, "learning_rate": 3e-06, "loss": 0.0004, "step": 1769 }, { "clip_ratio": 0.0008511265332344919, "epoch": 0.004913963986474106, "grad_norm": 0.03839809074997902, "kl": 0.10732469335198402, "learning_rate": 3e-06, "loss": 0.0007, "step": 1770 }, { "clip_ratio": 0.0003610541461966932, "epoch": 0.004916740237313922, "grad_norm": 0.046104252338409424, "kl": 0.10469045117497444, "learning_rate": 3e-06, "loss": 0.0, "step": 1771 }, { "clip_ratio": 0.0015279441722668707, "epoch": 0.0049195164881537376, "grad_norm": 0.06267226487398148, "kl": 0.10644160956144333, "learning_rate": 3e-06, "loss": 0.0, "step": 1772 }, { "clip_ratio": 0.0001966938652913086, "epoch": 0.004922292738993554, "grad_norm": 0.05547748878598213, "kl": 0.10656990110874176, "learning_rate": 3e-06, "loss": 0.0001, "step": 1773 }, { "clip_ratio": 0.0008054111385717988, "epoch": 0.004925068989833369, "grad_norm": 0.0485399030148983, "kl": 0.10373036935925484, "learning_rate": 3e-06, "loss": 0.0, "step": 1774 }, { "clip_ratio": 0.0011505663569550961, "epoch": 0.0049278452406731855, "grad_norm": 0.09875226765871048, "kl": 0.10132811218500137, "learning_rate": 3e-06, "loss": -0.0, "step": 1775 }, { "clip_ratio": 0.0020636909175664186, "epoch": 0.004930621491513001, "grad_norm": 0.042701300233602524, "kl": 0.10125269740819931, "learning_rate": 3e-06, "loss": 0.0002, "step": 1776 }, { "clip_ratio": 0.0004316127669881098, "completion_length": 223.7291717529297, "epoch": 0.004933397742352817, "grad_norm": 0.37167254090309143, "kl": 0.10485289990901947, "learning_rate": 3e-06, "loss": 0.0251, "reward": 0.3812500238418579, "reward_std": 0.350497841835022, "rewards/countdown_reward_func": 0.3812499940395355, "step": 1777, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.004936173993192633, "grad_norm": 0.1220475360751152, "kl": 0.10859641060233116, "learning_rate": 3e-06, "loss": 0.0243, "step": 1778 }, { "clip_ratio": 0.0, "epoch": 0.004938950244032449, "grad_norm": 0.11971476674079895, "kl": 0.10970620810985565, "learning_rate": 3e-06, "loss": 0.0248, "step": 1779 }, { "clip_ratio": 0.0, "epoch": 0.004941726494872264, "grad_norm": 0.11429105699062347, "kl": 0.09786089137196541, "learning_rate": 3e-06, "loss": 0.0239, "step": 1780 }, { "clip_ratio": 0.0, "epoch": 0.0049445027457120806, "grad_norm": 0.10652303695678711, "kl": 0.10274695977568626, "learning_rate": 3e-06, "loss": 0.0252, "step": 1781 }, { "clip_ratio": 0.000254739832598716, "epoch": 0.004947278996551897, "grad_norm": 0.12241549789905548, "kl": 0.10112031176686287, "learning_rate": 3e-06, "loss": 0.0244, "step": 1782 }, { "clip_ratio": 0.00017335961456410587, "epoch": 0.004950055247391712, "grad_norm": 0.1391787976026535, "kl": 0.10613827779889107, "learning_rate": 3e-06, "loss": 0.0237, "step": 1783 }, { "clip_ratio": 0.0, "epoch": 0.0049528314982315285, "grad_norm": 0.17388495802879333, "kl": 0.10883147642016411, "learning_rate": 3e-06, "loss": 0.0232, "step": 1784 }, { "clip_ratio": 0.00029968634044053033, "epoch": 0.004955607749071344, "grad_norm": 0.11082387715578079, "kl": 0.11147769913077354, "learning_rate": 3e-06, "loss": 0.0236, "step": 1785 }, { "clip_ratio": 0.0001846859959186986, "epoch": 0.00495838399991116, "grad_norm": 0.11685911566019058, "kl": 0.09964372962713242, "learning_rate": 3e-06, "loss": 0.0226, "step": 1786 }, { "clip_ratio": 0.00011281588376732543, "epoch": 0.004961160250750976, "grad_norm": 0.09485632926225662, "kl": 0.10617783665657043, "learning_rate": 3e-06, "loss": 0.0236, "step": 1787 }, { "clip_ratio": 0.00043700785317923874, "epoch": 0.004963936501590792, "grad_norm": 0.11837552487850189, "kl": 0.10385986790060997, "learning_rate": 3e-06, "loss": 0.0227, "step": 1788 }, { "clip_ratio": 0.00016897656314540654, "completion_length": 237.0416717529297, "epoch": 0.004966712752430607, "grad_norm": 0.12939836084842682, "kl": 0.1072714664041996, "learning_rate": 3e-06, "loss": 0.0052, "reward": 0.3437500298023224, "reward_std": 0.37016281485557556, "rewards/countdown_reward_func": 0.3437500298023224, "step": 1789, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0049694890032704235, "grad_norm": 0.11611343175172806, "kl": 0.11076124012470245, "learning_rate": 3e-06, "loss": 0.0048, "step": 1790 }, { "clip_ratio": 0.0003420523353270255, "epoch": 0.004972265254110239, "grad_norm": 0.11028125882148743, "kl": 0.11113661155104637, "learning_rate": 3e-06, "loss": 0.0043, "step": 1791 }, { "clip_ratio": 0.00017754628788679838, "epoch": 0.004975041504950055, "grad_norm": 0.09910908341407776, "kl": 0.11355148255825043, "learning_rate": 3e-06, "loss": 0.0056, "step": 1792 }, { "clip_ratio": 0.00017655367264524102, "epoch": 0.0049778177557898715, "grad_norm": 0.0909920260310173, "kl": 0.115181814879179, "learning_rate": 3e-06, "loss": 0.0045, "step": 1793 }, { "clip_ratio": 0.0, "epoch": 0.004980594006629687, "grad_norm": 0.08922834694385529, "kl": 0.10475178062915802, "learning_rate": 3e-06, "loss": 0.0044, "step": 1794 }, { "clip_ratio": 8.7596352386754e-05, "epoch": 0.004983370257469503, "grad_norm": 0.08417384326457977, "kl": 0.11183957010507584, "learning_rate": 3e-06, "loss": 0.0046, "step": 1795 }, { "clip_ratio": 8.7596352386754e-05, "epoch": 0.004986146508309319, "grad_norm": 0.11513616889715195, "kl": 0.11391586065292358, "learning_rate": 3e-06, "loss": 0.0035, "step": 1796 }, { "clip_ratio": 0.00017655367264524102, "epoch": 0.004988922759149135, "grad_norm": 0.09690085053443909, "kl": 0.1138366088271141, "learning_rate": 3e-06, "loss": 0.0038, "step": 1797 }, { "clip_ratio": 0.0008805023971945047, "epoch": 0.00499169900998895, "grad_norm": 0.1030769795179367, "kl": 0.11408108472824097, "learning_rate": 3e-06, "loss": 0.0052, "step": 1798 }, { "clip_ratio": 0.0005165053516975604, "epoch": 0.0049944752608287665, "grad_norm": 0.08906270563602448, "kl": 0.11490156501531601, "learning_rate": 3e-06, "loss": 0.0045, "step": 1799 }, { "clip_ratio": 8.827683632262051e-05, "epoch": 0.004997251511668582, "grad_norm": 0.09603746235370636, "kl": 0.10548309236764908, "learning_rate": 3e-06, "loss": 0.0031, "step": 1800 }, { "clip_ratio": 0.0002600554726086557, "completion_length": 222.20834350585938, "epoch": 0.005000027762508398, "grad_norm": 0.0949837937951088, "kl": 0.10473751276731491, "learning_rate": 3e-06, "loss": 0.0252, "reward": 0.302083358168602, "reward_std": 0.2531573101878166, "rewards/countdown_reward_func": 0.302083358168602, "step": 1801, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.716875890968367e-05, "epoch": 0.005002804013348214, "grad_norm": 0.13353735208511353, "kl": 0.09882612898945808, "learning_rate": 3e-06, "loss": 0.0252, "step": 1802 }, { "clip_ratio": 0.0, "epoch": 0.00500558026418803, "grad_norm": 0.12187909334897995, "kl": 0.10175449773669243, "learning_rate": 3e-06, "loss": 0.0253, "step": 1803 }, { "clip_ratio": 0.0, "epoch": 0.005008356515027846, "grad_norm": 0.13179193437099457, "kl": 0.10488756746053696, "learning_rate": 3e-06, "loss": 0.0246, "step": 1804 }, { "clip_ratio": 0.0, "epoch": 0.0050111327658676616, "grad_norm": 0.10637587308883667, "kl": 0.10015484690666199, "learning_rate": 3e-06, "loss": 0.0246, "step": 1805 }, { "clip_ratio": 8.668516238685697e-05, "epoch": 0.005013909016707478, "grad_norm": 0.10470699518918991, "kl": 0.10087545961141586, "learning_rate": 3e-06, "loss": 0.0246, "step": 1806 }, { "clip_ratio": 0.0, "epoch": 0.005016685267547293, "grad_norm": 0.1398010551929474, "kl": 0.10762974619865417, "learning_rate": 3e-06, "loss": 0.0242, "step": 1807 }, { "clip_ratio": 0.00020646479970309883, "epoch": 0.0050194615183871095, "grad_norm": 0.19034115970134735, "kl": 0.10274334251880646, "learning_rate": 3e-06, "loss": 0.0227, "step": 1808 }, { "clip_ratio": 0.0, "epoch": 0.005022237769226925, "grad_norm": 0.11217880249023438, "kl": 0.10773766785860062, "learning_rate": 3e-06, "loss": 0.023, "step": 1809 }, { "clip_ratio": 0.00010822511103469878, "epoch": 0.005025014020066741, "grad_norm": 0.1465480774641037, "kl": 0.11147905141115189, "learning_rate": 3e-06, "loss": 0.0215, "step": 1810 }, { "clip_ratio": 0.0004162504119449295, "epoch": 0.005027790270906557, "grad_norm": 0.0986005887389183, "kl": 0.10899289697408676, "learning_rate": 3e-06, "loss": 0.0235, "step": 1811 }, { "clip_ratio": 9.788566967472434e-05, "epoch": 0.005030566521746373, "grad_norm": 0.13088902831077576, "kl": 0.1097114235162735, "learning_rate": 3e-06, "loss": 0.023, "step": 1812 }, { "clip_ratio": 0.0002538930275477469, "completion_length": 213.83333587646484, "epoch": 0.005033342772586188, "grad_norm": 0.09580644220113754, "kl": 0.10691358521580696, "learning_rate": 3e-06, "loss": -0.001, "reward": 0.3958333432674408, "reward_std": 0.2887437641620636, "rewards/countdown_reward_func": 0.3958333283662796, "step": 1813, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0050361190234260046, "grad_norm": 0.08536209911108017, "kl": 0.11967135220766068, "learning_rate": 3e-06, "loss": -0.001, "step": 1814 }, { "clip_ratio": 0.00019878626335412264, "epoch": 0.005038895274265821, "grad_norm": 0.0959966704249382, "kl": 0.11832546070218086, "learning_rate": 3e-06, "loss": -0.0003, "step": 1815 }, { "clip_ratio": 0.0021427306346595287, "epoch": 0.005041671525105636, "grad_norm": 0.11329105496406555, "kl": 0.12149709090590477, "learning_rate": 3e-06, "loss": 0.0015, "step": 1816 }, { "clip_ratio": 0.00026538767997408286, "epoch": 0.0050444477759454525, "grad_norm": 0.12283118814229965, "kl": 0.12353919818997383, "learning_rate": 3e-06, "loss": -0.0004, "step": 1817 }, { "clip_ratio": 0.0004591475590132177, "epoch": 0.005047224026785268, "grad_norm": 0.11809934675693512, "kl": 0.11520658433437347, "learning_rate": 3e-06, "loss": -0.002, "step": 1818 }, { "clip_ratio": 0.00027188926469534636, "epoch": 0.005050000277625084, "grad_norm": 0.10198795795440674, "kl": 0.11487729102373123, "learning_rate": 3e-06, "loss": -0.001, "step": 1819 }, { "clip_ratio": 0.0, "epoch": 0.0050527765284649, "grad_norm": 0.09649267792701721, "kl": 0.12504612654447556, "learning_rate": 3e-06, "loss": -0.0014, "step": 1820 }, { "clip_ratio": 0.0004089476424269378, "epoch": 0.005055552779304716, "grad_norm": 0.2582434117794037, "kl": 0.12240016460418701, "learning_rate": 3e-06, "loss": -0.0011, "step": 1821 }, { "clip_ratio": 0.0019799493966274895, "epoch": 0.005058329030144531, "grad_norm": 0.12102940678596497, "kl": 0.12317222356796265, "learning_rate": 3e-06, "loss": -0.0002, "step": 1822 }, { "clip_ratio": 0.00018400746921543032, "epoch": 0.0050611052809843475, "grad_norm": 0.12739579379558563, "kl": 0.12464824318885803, "learning_rate": 3e-06, "loss": -0.0008, "step": 1823 }, { "clip_ratio": 0.00020525451691355556, "epoch": 0.005063881531824163, "grad_norm": 0.11777213960886002, "kl": 0.11386393383145332, "learning_rate": 3e-06, "loss": -0.0031, "step": 1824 }, { "clip_ratio": 0.0003442906090640463, "completion_length": 233.89584350585938, "epoch": 0.005066657782663979, "grad_norm": 0.07550974935293198, "kl": 0.13742565363645554, "learning_rate": 3e-06, "loss": 0.014, "reward": 0.24791669100522995, "reward_std": 0.2587834596633911, "rewards/countdown_reward_func": 0.24791667610406876, "step": 1825, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00044292007805779576, "epoch": 0.0050694340335037955, "grad_norm": 0.09571371972560883, "kl": 0.13907401263713837, "learning_rate": 3e-06, "loss": 0.0135, "step": 1826 }, { "clip_ratio": 0.00018455334065947682, "epoch": 0.005072210284343611, "grad_norm": 0.11037315428256989, "kl": 0.13096477836370468, "learning_rate": 3e-06, "loss": 0.0139, "step": 1827 }, { "clip_ratio": 0.0003636363544501364, "epoch": 0.005074986535183427, "grad_norm": 0.08234788477420807, "kl": 0.12375519797205925, "learning_rate": 3e-06, "loss": 0.0135, "step": 1828 }, { "clip_ratio": 0.0, "epoch": 0.005077762786023243, "grad_norm": 0.12358090281486511, "kl": 0.1380329132080078, "learning_rate": 3e-06, "loss": 0.0136, "step": 1829 }, { "clip_ratio": 0.0005509999464266002, "epoch": 0.005080539036863059, "grad_norm": 0.10022760182619095, "kl": 0.12769127637147903, "learning_rate": 3e-06, "loss": 0.0132, "step": 1830 }, { "clip_ratio": 0.00044850878475699574, "epoch": 0.005083315287702874, "grad_norm": 0.07287811487913132, "kl": 0.1334082931280136, "learning_rate": 3e-06, "loss": 0.0134, "step": 1831 }, { "clip_ratio": 0.0004338581129559316, "epoch": 0.0050860915385426905, "grad_norm": 0.07464426755905151, "kl": 0.13454636931419373, "learning_rate": 3e-06, "loss": 0.0137, "step": 1832 }, { "clip_ratio": 0.00045034874347038567, "epoch": 0.005088867789382506, "grad_norm": 0.12014682590961456, "kl": 0.12987623363733292, "learning_rate": 3e-06, "loss": 0.0129, "step": 1833 }, { "clip_ratio": 0.00018235020252177492, "epoch": 0.005091644040222322, "grad_norm": 0.07914389669895172, "kl": 0.12284610792994499, "learning_rate": 3e-06, "loss": 0.0123, "step": 1834 }, { "clip_ratio": 0.00017130826745415106, "epoch": 0.0050944202910621385, "grad_norm": 0.11240369826555252, "kl": 0.13621504604816437, "learning_rate": 3e-06, "loss": 0.0131, "step": 1835 }, { "clip_ratio": 0.0006353274511639029, "epoch": 0.005097196541901954, "grad_norm": 0.09824000298976898, "kl": 0.12700487673282623, "learning_rate": 3e-06, "loss": 0.0133, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 230.31250762939453, "epoch": 0.00509997279274177, "grad_norm": 0.12020771205425262, "kl": 0.1342645138502121, "learning_rate": 3e-06, "loss": -0.0, "reward": 0.35833336412906647, "reward_std": 0.37625907361507416, "rewards/countdown_reward_func": 0.3583333492279053, "step": 1837, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0051027490435815856, "grad_norm": 0.17950089275836945, "kl": 0.1457306444644928, "learning_rate": 3e-06, "loss": 0.0, "step": 1838 }, { "clip_ratio": 9.968101949198171e-05, "epoch": 0.005105525294421402, "grad_norm": 0.150423064827919, "kl": 0.1225435845553875, "learning_rate": 3e-06, "loss": -0.0004, "step": 1839 }, { "clip_ratio": 0.0003465738918748684, "epoch": 0.005108301545261217, "grad_norm": 0.17598502337932587, "kl": 0.12997641041874886, "learning_rate": 3e-06, "loss": -0.0014, "step": 1840 }, { "clip_ratio": 0.0003827196196652949, "epoch": 0.0051110777961010335, "grad_norm": 0.13662053644657135, "kl": 0.12417108565568924, "learning_rate": 3e-06, "loss": -0.0021, "step": 1841 }, { "clip_ratio": 9.904913167702034e-05, "epoch": 0.005113854046940849, "grad_norm": 0.1616424322128296, "kl": 0.12268723547458649, "learning_rate": 3e-06, "loss": -0.0027, "step": 1842 }, { "clip_ratio": 0.0006147185195004568, "epoch": 0.005116630297780665, "grad_norm": 0.1300649344921112, "kl": 0.12857786938548088, "learning_rate": 3e-06, "loss": -0.0024, "step": 1843 }, { "clip_ratio": 0.00027205952210351825, "epoch": 0.005119406548620481, "grad_norm": 0.1779082864522934, "kl": 0.14096860587596893, "learning_rate": 3e-06, "loss": -0.0021, "step": 1844 }, { "clip_ratio": 0.00035519967786967754, "epoch": 0.005122182799460297, "grad_norm": 0.13828246295452118, "kl": 0.1195240318775177, "learning_rate": 3e-06, "loss": -0.004, "step": 1845 }, { "clip_ratio": 0.001379472087137401, "epoch": 0.005124959050300113, "grad_norm": 0.16142214834690094, "kl": 0.12471278756856918, "learning_rate": 3e-06, "loss": -0.0059, "step": 1846 }, { "clip_ratio": 0.001573986024595797, "epoch": 0.0051277353011399286, "grad_norm": 0.1403919756412506, "kl": 0.11798427999019623, "learning_rate": 3e-06, "loss": -0.0067, "step": 1847 }, { "clip_ratio": 0.0015037042612675577, "epoch": 0.005130511551979745, "grad_norm": 0.1678413301706314, "kl": 0.1188434436917305, "learning_rate": 3e-06, "loss": -0.0067, "step": 1848 }, { "clip_ratio": 0.0005114346859045327, "completion_length": 231.4375, "epoch": 0.00513328780281956, "grad_norm": 0.06092618405818939, "kl": 0.11399786919355392, "learning_rate": 3e-06, "loss": -0.0017, "reward": 0.1937500163912773, "reward_std": 0.15347465500235558, "rewards/countdown_reward_func": 0.1937500163912773, "step": 1849, "zero_std_ratio": 0.625 }, { "clip_ratio": 8.802816591924056e-05, "epoch": 0.0051360640536593765, "grad_norm": 0.08002404868602753, "kl": 0.10806835442781448, "learning_rate": 3e-06, "loss": -0.0025, "step": 1850 }, { "clip_ratio": 0.0001694083766778931, "epoch": 0.005138840304499192, "grad_norm": 0.07181860506534576, "kl": 0.10447893664240837, "learning_rate": 3e-06, "loss": -0.0033, "step": 1851 }, { "clip_ratio": 0.0006369501352310181, "epoch": 0.005141616555339008, "grad_norm": 0.0680135115981102, "kl": 0.10594388097524643, "learning_rate": 3e-06, "loss": -0.0029, "step": 1852 }, { "clip_ratio": 0.0009738111984916031, "epoch": 0.005144392806178824, "grad_norm": 0.05742673948407173, "kl": 0.10426938533782959, "learning_rate": 3e-06, "loss": -0.0026, "step": 1853 }, { "clip_ratio": 0.0006258258072193712, "epoch": 0.00514716905701864, "grad_norm": 0.06563463062047958, "kl": 0.10713209956884384, "learning_rate": 3e-06, "loss": -0.003, "step": 1854 }, { "clip_ratio": 0.001724798115901649, "epoch": 0.005149945307858455, "grad_norm": 0.05824752897024155, "kl": 0.10814163088798523, "learning_rate": 3e-06, "loss": -0.0022, "step": 1855 }, { "clip_ratio": 0.0009082746400963515, "epoch": 0.0051527215586982715, "grad_norm": 0.08871683478355408, "kl": 0.10223837941884995, "learning_rate": 3e-06, "loss": -0.0039, "step": 1856 }, { "clip_ratio": 0.0010515126050449908, "epoch": 0.005155497809538088, "grad_norm": 0.0660201832652092, "kl": 0.10071967542171478, "learning_rate": 3e-06, "loss": -0.004, "step": 1857 }, { "clip_ratio": 0.002384283463470638, "epoch": 0.005158274060377903, "grad_norm": 0.0644756555557251, "kl": 0.10329937189817429, "learning_rate": 3e-06, "loss": -0.0041, "step": 1858 }, { "clip_ratio": 0.0010630089382175356, "epoch": 0.0051610503112177195, "grad_norm": 0.05238967016339302, "kl": 0.09916985407471657, "learning_rate": 3e-06, "loss": -0.0034, "step": 1859 }, { "clip_ratio": 0.0028576954500749707, "epoch": 0.005163826562057535, "grad_norm": 0.06547994166612625, "kl": 0.1038946770131588, "learning_rate": 3e-06, "loss": -0.0043, "step": 1860 }, { "clip_ratio": 8.46310067572631e-05, "completion_length": 236.08333587646484, "epoch": 0.005166602812897351, "grad_norm": 0.1372431516647339, "kl": 0.1432003378868103, "learning_rate": 3e-06, "loss": 0.0302, "reward": 0.3041666969656944, "reward_std": 0.27210913598537445, "rewards/countdown_reward_func": 0.3041666969656944, "step": 1861, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.46310067572631e-05, "epoch": 0.005169379063737167, "grad_norm": 0.08329330384731293, "kl": 0.12102905288338661, "learning_rate": 3e-06, "loss": 0.0297, "step": 1862 }, { "clip_ratio": 0.00017888411093736067, "epoch": 0.005172155314576983, "grad_norm": 0.10118658095598221, "kl": 0.13169753551483154, "learning_rate": 3e-06, "loss": 0.0301, "step": 1863 }, { "clip_ratio": 0.0002753450389718637, "epoch": 0.005174931565416798, "grad_norm": 0.11060193926095963, "kl": 0.13022061809897423, "learning_rate": 3e-06, "loss": 0.0292, "step": 1864 }, { "clip_ratio": 0.00017968667816603556, "epoch": 0.0051777078162566145, "grad_norm": 0.10579267144203186, "kl": 0.12778804078698158, "learning_rate": 3e-06, "loss": 0.0302, "step": 1865 }, { "clip_ratio": 8.46310067572631e-05, "epoch": 0.00518048406709643, "grad_norm": 0.0976695865392685, "kl": 0.11770972609519958, "learning_rate": 3e-06, "loss": 0.0288, "step": 1866 }, { "clip_ratio": 0.0003618215851020068, "epoch": 0.005183260317936246, "grad_norm": 0.09765175729990005, "kl": 0.14232248812913895, "learning_rate": 3e-06, "loss": 0.0292, "step": 1867 }, { "clip_ratio": 0.00016601121751591563, "epoch": 0.0051860365687760625, "grad_norm": 0.0782550796866417, "kl": 0.1213303953409195, "learning_rate": 3e-06, "loss": 0.0291, "step": 1868 }, { "clip_ratio": 0.0002776276451186277, "epoch": 0.005188812819615878, "grad_norm": 0.10285110771656036, "kl": 0.13297457247972488, "learning_rate": 3e-06, "loss": 0.0283, "step": 1869 }, { "clip_ratio": 0.00018646800162969157, "epoch": 0.005191589070455694, "grad_norm": 0.11395300924777985, "kl": 0.1316339075565338, "learning_rate": 3e-06, "loss": 0.0278, "step": 1870 }, { "clip_ratio": 0.0005108660116093233, "epoch": 0.0051943653212955096, "grad_norm": 0.0980798751115799, "kl": 0.12793593108654022, "learning_rate": 3e-06, "loss": 0.0287, "step": 1871 }, { "clip_ratio": 9.097525617107749e-05, "epoch": 0.005197141572135326, "grad_norm": 0.09456781297922134, "kl": 0.12113503366708755, "learning_rate": 3e-06, "loss": 0.0272, "step": 1872 }, { "clip_ratio": 8.85896515683271e-05, "completion_length": 233.06250762939453, "epoch": 0.005199917822975141, "grad_norm": 0.15110257267951965, "kl": 0.11747819557785988, "learning_rate": 3e-06, "loss": 0.0183, "reward": 0.3437500298023224, "reward_std": 0.31206804513931274, "rewards/countdown_reward_func": 0.3437500149011612, "step": 1873, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.897070412989706e-05, "epoch": 0.0052026940738149575, "grad_norm": 0.11430589109659195, "kl": 0.1168769858777523, "learning_rate": 3e-06, "loss": 0.0183, "step": 1874 }, { "clip_ratio": 8.90313385752961e-05, "epoch": 0.005205470324654773, "grad_norm": 0.09721416234970093, "kl": 0.11751173809170723, "learning_rate": 3e-06, "loss": 0.0188, "step": 1875 }, { "clip_ratio": 0.0004218236426822841, "epoch": 0.005208246575494589, "grad_norm": 0.09081301838159561, "kl": 0.11737838387489319, "learning_rate": 3e-06, "loss": 0.0181, "step": 1876 }, { "clip_ratio": 0.0003606221798690967, "epoch": 0.005211022826334405, "grad_norm": 0.08658932149410248, "kl": 0.12061323970556259, "learning_rate": 3e-06, "loss": 0.0184, "step": 1877 }, { "clip_ratio": 0.0004417505406308919, "epoch": 0.005213799077174221, "grad_norm": 0.13034342229366302, "kl": 0.11869553104043007, "learning_rate": 3e-06, "loss": 0.0181, "step": 1878 }, { "clip_ratio": 0.0003327302838442847, "epoch": 0.005216575328014037, "grad_norm": 0.09448829293251038, "kl": 0.12387342751026154, "learning_rate": 3e-06, "loss": 0.0167, "step": 1879 }, { "clip_ratio": 0.0003734165584319271, "epoch": 0.0052193515788538525, "grad_norm": 0.10919336974620819, "kl": 0.12406548857688904, "learning_rate": 3e-06, "loss": 0.0164, "step": 1880 }, { "clip_ratio": 8.85896515683271e-05, "epoch": 0.005222127829693669, "grad_norm": 0.09010238200426102, "kl": 0.1251124069094658, "learning_rate": 3e-06, "loss": 0.0178, "step": 1881 }, { "clip_ratio": 0.0004870776829193346, "epoch": 0.005224904080533484, "grad_norm": 0.08950203657150269, "kl": 0.1254933625459671, "learning_rate": 3e-06, "loss": 0.016, "step": 1882 }, { "clip_ratio": 0.0005361769872251898, "epoch": 0.0052276803313733005, "grad_norm": 0.1491861492395401, "kl": 0.13046596199274063, "learning_rate": 3e-06, "loss": 0.0163, "step": 1883 }, { "clip_ratio": 0.0005558350239880383, "epoch": 0.005230456582213116, "grad_norm": 0.12711560726165771, "kl": 0.12579327076673508, "learning_rate": 3e-06, "loss": 0.0155, "step": 1884 }, { "clip_ratio": 0.0005470910182339139, "completion_length": 228.58333587646484, "epoch": 0.005233232833052932, "grad_norm": 0.18066106736660004, "kl": 0.14193446934223175, "learning_rate": 3e-06, "loss": 0.0218, "reward": 0.34166671335697174, "reward_std": 0.3784969747066498, "rewards/countdown_reward_func": 0.34166669845581055, "step": 1885, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.000280211737845093, "epoch": 0.005236009083892748, "grad_norm": 0.1274838149547577, "kl": 0.13073249906301498, "learning_rate": 3e-06, "loss": 0.0227, "step": 1886 }, { "clip_ratio": 0.00027726277767214924, "epoch": 0.005238785334732564, "grad_norm": 0.11818882077932358, "kl": 0.14046981185674667, "learning_rate": 3e-06, "loss": 0.0234, "step": 1887 }, { "clip_ratio": 0.0, "epoch": 0.005241561585572379, "grad_norm": 0.12307964265346527, "kl": 0.13810279220342636, "learning_rate": 3e-06, "loss": 0.022, "step": 1888 }, { "clip_ratio": 0.0, "epoch": 0.0052443378364121955, "grad_norm": 0.12825541198253632, "kl": 0.13892988115549088, "learning_rate": 3e-06, "loss": 0.0217, "step": 1889 }, { "clip_ratio": 9.170946577796713e-05, "epoch": 0.005247114087252012, "grad_norm": 0.14815275371074677, "kl": 0.15306030213832855, "learning_rate": 3e-06, "loss": 0.0225, "step": 1890 }, { "clip_ratio": 8.890469325706363e-05, "epoch": 0.005249890338091827, "grad_norm": 0.12983007729053497, "kl": 0.1546284407377243, "learning_rate": 3e-06, "loss": 0.0203, "step": 1891 }, { "clip_ratio": 0.00017390426364727318, "epoch": 0.0052526665889316435, "grad_norm": 0.20150499045848846, "kl": 0.13978148996829987, "learning_rate": 3e-06, "loss": 0.0197, "step": 1892 }, { "clip_ratio": 0.0015431393767357804, "epoch": 0.005255442839771459, "grad_norm": 0.12668222188949585, "kl": 0.14928098767995834, "learning_rate": 3e-06, "loss": 0.0219, "step": 1893 }, { "clip_ratio": 0.00017780938651412725, "epoch": 0.005258219090611275, "grad_norm": 0.1326785534620285, "kl": 0.14815984666347504, "learning_rate": 3e-06, "loss": 0.0205, "step": 1894 }, { "clip_ratio": 0.000360042235115543, "epoch": 0.005260995341451091, "grad_norm": 0.11318658292293549, "kl": 0.14500422030687332, "learning_rate": 3e-06, "loss": 0.0193, "step": 1895 }, { "clip_ratio": 0.00046394005767069757, "epoch": 0.005263771592290907, "grad_norm": 0.17353534698486328, "kl": 0.1581486538052559, "learning_rate": 3e-06, "loss": 0.0198, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 227.6666717529297, "epoch": 0.005266547843130722, "grad_norm": 0.09441768378019333, "kl": 0.14793231338262558, "learning_rate": 3e-06, "loss": 0.0268, "reward": 0.19375000149011612, "reward_std": 0.16211743652820587, "rewards/countdown_reward_func": 0.19375000149011612, "step": 1897, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00019291546777822077, "epoch": 0.0052693240939705385, "grad_norm": 0.06794694811105728, "kl": 0.15445201843976974, "learning_rate": 3e-06, "loss": 0.0273, "step": 1898 }, { "clip_ratio": 9.084302291739732e-05, "epoch": 0.005272100344810354, "grad_norm": 0.07045197486877441, "kl": 0.15664274990558624, "learning_rate": 3e-06, "loss": 0.0278, "step": 1899 }, { "clip_ratio": 0.0002881958498619497, "epoch": 0.00527487659565017, "grad_norm": 0.0634378045797348, "kl": 0.15410824120044708, "learning_rate": 3e-06, "loss": 0.0281, "step": 1900 }, { "clip_ratio": 0.0, "epoch": 0.0052776528464899865, "grad_norm": 0.07607164233922958, "kl": 0.14435160160064697, "learning_rate": 3e-06, "loss": 0.027, "step": 1901 }, { "clip_ratio": 0.0004623479617293924, "epoch": 0.005280429097329802, "grad_norm": 0.08263766020536423, "kl": 0.14478224515914917, "learning_rate": 3e-06, "loss": 0.0261, "step": 1902 }, { "clip_ratio": 0.0003508084482746199, "epoch": 0.005283205348169618, "grad_norm": 0.08324983716011047, "kl": 0.15262345969676971, "learning_rate": 3e-06, "loss": 0.0265, "step": 1903 }, { "clip_ratio": 0.0002769144412013702, "epoch": 0.0052859815990094336, "grad_norm": 0.0647958442568779, "kl": 0.16446733474731445, "learning_rate": 3e-06, "loss": 0.0268, "step": 1904 }, { "clip_ratio": 0.0002789492136798799, "epoch": 0.00528875784984925, "grad_norm": 0.06370534002780914, "kl": 0.165434792637825, "learning_rate": 3e-06, "loss": 0.0274, "step": 1905 }, { "clip_ratio": 0.0009251071896869689, "epoch": 0.005291534100689065, "grad_norm": 0.06624593585729599, "kl": 0.1653437316417694, "learning_rate": 3e-06, "loss": 0.0266, "step": 1906 }, { "clip_ratio": 0.0005657712754327804, "epoch": 0.0052943103515288815, "grad_norm": 0.06858285516500473, "kl": 0.15655828267335892, "learning_rate": 3e-06, "loss": 0.0265, "step": 1907 }, { "clip_ratio": 0.0008327158866450191, "epoch": 0.005297086602368697, "grad_norm": 0.08369874209165573, "kl": 0.15721774101257324, "learning_rate": 3e-06, "loss": 0.0254, "step": 1908 }, { "clip_ratio": 8.239947055699304e-05, "completion_length": 220.08334350585938, "epoch": 0.005299862853208513, "grad_norm": 0.07524816691875458, "kl": 0.16231046617031097, "learning_rate": 3e-06, "loss": 0.0145, "reward": 0.1937500163912773, "reward_std": 0.1958785466849804, "rewards/countdown_reward_func": 0.1937500163912773, "step": 1909, "zero_std_ratio": 0.5 }, { "clip_ratio": 9.057971328729764e-05, "epoch": 0.005302639104048329, "grad_norm": 0.06150933727622032, "kl": 0.17525289952754974, "learning_rate": 3e-06, "loss": 0.0154, "step": 1910 }, { "clip_ratio": 0.00010024057701230049, "epoch": 0.005305415354888145, "grad_norm": 0.06644804775714874, "kl": 0.17246182262897491, "learning_rate": 3e-06, "loss": 0.0141, "step": 1911 }, { "clip_ratio": 0.0, "epoch": 0.005308191605727961, "grad_norm": 0.11811656504869461, "kl": 0.18016871809959412, "learning_rate": 3e-06, "loss": 0.0154, "step": 1912 }, { "clip_ratio": 0.0, "epoch": 0.0053109678565677765, "grad_norm": 0.06376418471336365, "kl": 0.16709844022989273, "learning_rate": 3e-06, "loss": 0.0137, "step": 1913 }, { "clip_ratio": 0.00010469011613167822, "epoch": 0.005313744107407593, "grad_norm": 0.055688947439193726, "kl": 0.16722818464040756, "learning_rate": 3e-06, "loss": 0.0141, "step": 1914 }, { "clip_ratio": 0.0, "epoch": 0.005316520358247408, "grad_norm": 0.0814003273844719, "kl": 0.16963288933038712, "learning_rate": 3e-06, "loss": 0.0142, "step": 1915 }, { "clip_ratio": 0.00029551039915531874, "epoch": 0.0053192966090872245, "grad_norm": 0.06685936450958252, "kl": 0.18109215795993805, "learning_rate": 3e-06, "loss": 0.0147, "step": 1916 }, { "clip_ratio": 0.00018481432925909758, "epoch": 0.00532207285992704, "grad_norm": 0.07375549525022507, "kl": 0.17562759667634964, "learning_rate": 3e-06, "loss": 0.0138, "step": 1917 }, { "clip_ratio": 0.0001017087051877752, "epoch": 0.005324849110766856, "grad_norm": 0.12521782517433167, "kl": 0.18078292161226273, "learning_rate": 3e-06, "loss": 0.0154, "step": 1918 }, { "clip_ratio": 0.00018628245015861467, "epoch": 0.005327625361606672, "grad_norm": 0.06385212391614914, "kl": 0.1704089492559433, "learning_rate": 3e-06, "loss": 0.0135, "step": 1919 }, { "clip_ratio": 0.00010469011613167822, "epoch": 0.005330401612446488, "grad_norm": 0.05945051088929176, "kl": 0.16913989931344986, "learning_rate": 3e-06, "loss": 0.014, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 226.56250762939453, "epoch": 0.005333177863286303, "grad_norm": 0.06853771954774857, "kl": 0.1720345988869667, "learning_rate": 3e-06, "loss": 0.0216, "reward": 0.2500000149011612, "reward_std": 0.2080453597009182, "rewards/countdown_reward_func": 0.2500000149011612, "step": 1921, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00026880551740759984, "epoch": 0.0053359541141261195, "grad_norm": 0.08787719905376434, "kl": 0.16743547469377518, "learning_rate": 3e-06, "loss": 0.0207, "step": 1922 }, { "clip_ratio": 8.567512122681364e-05, "epoch": 0.005338730364965936, "grad_norm": 0.1024947464466095, "kl": 0.16995961219072342, "learning_rate": 3e-06, "loss": 0.0217, "step": 1923 }, { "clip_ratio": 0.0003480182640487328, "epoch": 0.005341506615805751, "grad_norm": 0.0839357003569603, "kl": 0.18032675236463547, "learning_rate": 3e-06, "loss": 0.0214, "step": 1924 }, { "clip_ratio": 0.0002771032159216702, "epoch": 0.0053442828666455675, "grad_norm": 0.08104819059371948, "kl": 0.18133548647165298, "learning_rate": 3e-06, "loss": 0.0211, "step": 1925 }, { "clip_ratio": 0.00025895826547639444, "epoch": 0.005347059117485383, "grad_norm": 0.10513396561145782, "kl": 0.16761326789855957, "learning_rate": 3e-06, "loss": 0.0211, "step": 1926 }, { "clip_ratio": 0.0, "epoch": 0.005349835368325199, "grad_norm": 0.0935124084353447, "kl": 0.17295867204666138, "learning_rate": 3e-06, "loss": 0.0209, "step": 1927 }, { "clip_ratio": 0.000300093786790967, "epoch": 0.005352611619165015, "grad_norm": 0.090276338160038, "kl": 0.16908128559589386, "learning_rate": 3e-06, "loss": 0.0201, "step": 1928 }, { "clip_ratio": 8.555784006603062e-05, "epoch": 0.005355387870004831, "grad_norm": 0.09899937361478806, "kl": 0.17253682017326355, "learning_rate": 3e-06, "loss": 0.0213, "step": 1929 }, { "clip_ratio": 0.0, "epoch": 0.005358164120844646, "grad_norm": 0.09567968547344208, "kl": 0.18297704309225082, "learning_rate": 3e-06, "loss": 0.0212, "step": 1930 }, { "clip_ratio": 0.0, "epoch": 0.0053609403716844625, "grad_norm": 0.07854413986206055, "kl": 0.18571214377880096, "learning_rate": 3e-06, "loss": 0.0203, "step": 1931 }, { "clip_ratio": 0.0004300739456084557, "epoch": 0.005363716622524278, "grad_norm": 0.08686288446187973, "kl": 0.17234565317630768, "learning_rate": 3e-06, "loss": 0.0197, "step": 1932 }, { "clip_ratio": 0.00018758241640171036, "completion_length": 219.52083587646484, "epoch": 0.005366492873364094, "grad_norm": 0.11458230018615723, "kl": 0.18364715576171875, "learning_rate": 3e-06, "loss": 0.0036, "reward": 0.21250002086162567, "reward_std": 0.2080453634262085, "rewards/countdown_reward_func": 0.21250002086162567, "step": 1933, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00010620220564305782, "epoch": 0.0053692691242039105, "grad_norm": 0.22106598317623138, "kl": 0.1838528960943222, "learning_rate": 3e-06, "loss": 0.0045, "step": 1934 }, { "clip_ratio": 0.0, "epoch": 0.005372045375043726, "grad_norm": 0.11077378690242767, "kl": 0.18076221644878387, "learning_rate": 3e-06, "loss": 0.0035, "step": 1935 }, { "clip_ratio": 0.0006542036862811074, "epoch": 0.005374821625883542, "grad_norm": 0.10001291334629059, "kl": 0.17611156404018402, "learning_rate": 3e-06, "loss": 0.0038, "step": 1936 }, { "clip_ratio": 0.0, "epoch": 0.0053775978767233576, "grad_norm": 0.09241759777069092, "kl": 0.17259087413549423, "learning_rate": 3e-06, "loss": 0.0032, "step": 1937 }, { "clip_ratio": 0.00039494471275247633, "epoch": 0.005380374127563174, "grad_norm": 0.11034689098596573, "kl": 0.17264589667320251, "learning_rate": 3e-06, "loss": 0.0028, "step": 1938 }, { "clip_ratio": 8.196721319109201e-05, "epoch": 0.005383150378402989, "grad_norm": 0.11598069220781326, "kl": 0.1707867681980133, "learning_rate": 3e-06, "loss": 0.0018, "step": 1939 }, { "clip_ratio": 0.0003757518425118178, "epoch": 0.0053859266292428055, "grad_norm": 0.23157401382923126, "kl": 0.16873134672641754, "learning_rate": 3e-06, "loss": 0.002, "step": 1940 }, { "clip_ratio": 0.0005139116401551291, "epoch": 0.005388702880082621, "grad_norm": 0.11090226471424103, "kl": 0.1620839387178421, "learning_rate": 3e-06, "loss": 0.0012, "step": 1941 }, { "clip_ratio": 0.0010267609904985875, "epoch": 0.005391479130922437, "grad_norm": 0.09474062919616699, "kl": 0.15658225119113922, "learning_rate": 3e-06, "loss": 0.0019, "step": 1942 }, { "clip_ratio": 0.000244140625, "epoch": 0.005394255381762253, "grad_norm": 0.08938335627317429, "kl": 0.1493806093931198, "learning_rate": 3e-06, "loss": 0.0012, "step": 1943 }, { "clip_ratio": 0.0003643684176495299, "epoch": 0.005397031632602069, "grad_norm": 0.14408725500106812, "kl": 0.15142924338579178, "learning_rate": 3e-06, "loss": 0.0002, "step": 1944 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 229.0416717529297, "epoch": 0.005399807883441885, "grad_norm": 0.12846609950065613, "kl": 0.14687072485685349, "learning_rate": 3e-06, "loss": 0.0124, "reward": 0.3020833730697632, "reward_std": 0.2922677993774414, "rewards/countdown_reward_func": 0.3020833432674408, "step": 1945, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0054025841342817005, "grad_norm": 0.10064707696437836, "kl": 0.13952406495809555, "learning_rate": 3e-06, "loss": 0.012, "step": 1946 }, { "clip_ratio": 0.0007594309572596103, "epoch": 0.005405360385121517, "grad_norm": 0.10994094610214233, "kl": 0.13837341219186783, "learning_rate": 3e-06, "loss": 0.0116, "step": 1947 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005408136635961332, "grad_norm": 0.09539451450109482, "kl": 0.1353205442428589, "learning_rate": 3e-06, "loss": 0.0122, "step": 1948 }, { "clip_ratio": 0.00025521605130052194, "epoch": 0.0054109128868011485, "grad_norm": 0.11715108156204224, "kl": 0.13513853400945663, "learning_rate": 3e-06, "loss": 0.0127, "step": 1949 }, { "clip_ratio": 0.0005876675131730735, "epoch": 0.005413689137640964, "grad_norm": 0.11635179072618484, "kl": 0.12926241755485535, "learning_rate": 3e-06, "loss": 0.0114, "step": 1950 }, { "clip_ratio": 0.00033659624023130164, "epoch": 0.00541646538848078, "grad_norm": 0.13250043988227844, "kl": 0.1317070946097374, "learning_rate": 3e-06, "loss": 0.0108, "step": 1951 }, { "clip_ratio": 0.0002992119698319584, "epoch": 0.005419241639320596, "grad_norm": 0.10431980341672897, "kl": 0.12609682232141495, "learning_rate": 3e-06, "loss": 0.0105, "step": 1952 }, { "clip_ratio": 0.0017799893976189196, "epoch": 0.005422017890160412, "grad_norm": 0.1222839429974556, "kl": 0.12637890875339508, "learning_rate": 3e-06, "loss": 0.0103, "step": 1953 }, { "clip_ratio": 0.0005579154822044075, "epoch": 0.005424794141000227, "grad_norm": 0.09429076313972473, "kl": 0.12555726990103722, "learning_rate": 3e-06, "loss": 0.0103, "step": 1954 }, { "clip_ratio": 0.0017903645930346102, "epoch": 0.0054275703918400435, "grad_norm": 0.11087554693222046, "kl": 0.1258564032614231, "learning_rate": 3e-06, "loss": 0.0102, "step": 1955 }, { "clip_ratio": 0.002133891510311514, "epoch": 0.00543034664267986, "grad_norm": 0.13041195273399353, "kl": 0.12211589515209198, "learning_rate": 3e-06, "loss": 0.0104, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 222.1041717529297, "epoch": 0.005433122893519675, "grad_norm": 0.09448617696762085, "kl": 0.11988238245248795, "learning_rate": 3e-06, "loss": 0.0148, "reward": 0.2666666805744171, "reward_std": 0.22970523685216904, "rewards/countdown_reward_func": 0.2666666656732559, "step": 1957, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00017508336168248206, "epoch": 0.0054358991443594915, "grad_norm": 0.09333960711956024, "kl": 0.12271144986152649, "learning_rate": 3e-06, "loss": 0.0156, "step": 1958 }, { "clip_ratio": 0.0, "epoch": 0.005438675395199307, "grad_norm": 0.10355954617261887, "kl": 0.11948385834693909, "learning_rate": 3e-06, "loss": 0.0148, "step": 1959 }, { "clip_ratio": 0.00018209777772426605, "epoch": 0.005441451646039123, "grad_norm": 0.10874044895172119, "kl": 0.12741681933403015, "learning_rate": 3e-06, "loss": 0.0161, "step": 1960 }, { "clip_ratio": 0.0004606101065292023, "epoch": 0.005444227896878939, "grad_norm": 0.11931112408638, "kl": 0.11942564323544502, "learning_rate": 3e-06, "loss": 0.0144, "step": 1961 }, { "clip_ratio": 0.00018340400856686756, "epoch": 0.005447004147718755, "grad_norm": 0.13169850409030914, "kl": 0.11675844341516495, "learning_rate": 3e-06, "loss": 0.0149, "step": 1962 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00544978039855857, "grad_norm": 0.09862768650054932, "kl": 0.12140379846096039, "learning_rate": 3e-06, "loss": 0.0147, "step": 1963 }, { "clip_ratio": 0.0002775423854473047, "epoch": 0.0054525566493983865, "grad_norm": 0.10945676267147064, "kl": 0.12424385920166969, "learning_rate": 3e-06, "loss": 0.0146, "step": 1964 }, { "clip_ratio": 0.0005308387917466462, "epoch": 0.005455332900238202, "grad_norm": 0.09942898899316788, "kl": 0.12193005159497261, "learning_rate": 3e-06, "loss": 0.0137, "step": 1965 }, { "clip_ratio": 0.00037315295776352286, "epoch": 0.005458109151078018, "grad_norm": 0.1435389667749405, "kl": 0.12972797825932503, "learning_rate": 3e-06, "loss": 0.0155, "step": 1966 }, { "clip_ratio": 0.0011096491580246948, "epoch": 0.0054608854019178345, "grad_norm": 0.118125319480896, "kl": 0.12346281483769417, "learning_rate": 3e-06, "loss": 0.0133, "step": 1967 }, { "clip_ratio": 0.00108197086956352, "epoch": 0.00546366165275765, "grad_norm": 0.13901278376579285, "kl": 0.12123304605484009, "learning_rate": 3e-06, "loss": 0.0131, "step": 1968 }, { "clip_ratio": 0.00017531556659378111, "completion_length": 226.37500762939453, "epoch": 0.005466437903597466, "grad_norm": 0.1171063482761383, "kl": 0.11619845405220985, "learning_rate": 3e-06, "loss": 0.0091, "reward": 0.3395833522081375, "reward_std": 0.35036255419254303, "rewards/countdown_reward_func": 0.3395833373069763, "step": 1969, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.645061800256371e-05, "epoch": 0.0054692141544372816, "grad_norm": 0.1672900915145874, "kl": 0.12332340329885483, "learning_rate": 3e-06, "loss": 0.0087, "step": 1970 }, { "clip_ratio": 0.0, "epoch": 0.005471990405277098, "grad_norm": 0.12007147073745728, "kl": 0.12126722559332848, "learning_rate": 3e-06, "loss": 0.0096, "step": 1971 }, { "clip_ratio": 9.645061800256371e-05, "epoch": 0.005474766656116913, "grad_norm": 0.1385733038187027, "kl": 0.11890603601932526, "learning_rate": 3e-06, "loss": 0.0084, "step": 1972 }, { "clip_ratio": 0.0, "epoch": 0.0054775429069567295, "grad_norm": 0.122285395860672, "kl": 0.12061040103435516, "learning_rate": 3e-06, "loss": 0.0079, "step": 1973 }, { "clip_ratio": 8.765778329689056e-05, "epoch": 0.005480319157796545, "grad_norm": 0.11869718879461288, "kl": 0.12436644360423088, "learning_rate": 3e-06, "loss": 0.0088, "step": 1974 }, { "clip_ratio": 8.765778329689056e-05, "epoch": 0.005483095408636361, "grad_norm": 0.11378361284732819, "kl": 0.11800763756036758, "learning_rate": 3e-06, "loss": 0.007, "step": 1975 }, { "clip_ratio": 0.00017234613187611103, "epoch": 0.005485871659476177, "grad_norm": 0.1709638386964798, "kl": 0.1264483742415905, "learning_rate": 3e-06, "loss": 0.0078, "step": 1976 }, { "clip_ratio": 0.0, "epoch": 0.005488647910315993, "grad_norm": 0.118524931371212, "kl": 0.12333150953054428, "learning_rate": 3e-06, "loss": 0.0078, "step": 1977 }, { "clip_ratio": 9.645061800256371e-05, "epoch": 0.005491424161155809, "grad_norm": 0.15198098123073578, "kl": 0.12151569873094559, "learning_rate": 3e-06, "loss": 0.0068, "step": 1978 }, { "clip_ratio": 0.0, "epoch": 0.0054942004119956245, "grad_norm": 0.13697625696659088, "kl": 0.12277953699231148, "learning_rate": 3e-06, "loss": 0.0065, "step": 1979 }, { "clip_ratio": 0.0002635204582475126, "epoch": 0.005496976662835441, "grad_norm": 0.1165957823395729, "kl": 0.12548280879855156, "learning_rate": 3e-06, "loss": 0.0077, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 220.08334350585938, "epoch": 0.005499752913675256, "grad_norm": 0.1365557312965393, "kl": 0.11501205712556839, "learning_rate": 3e-06, "loss": -0.0015, "reward": 0.36250002682209015, "reward_std": 0.33128294348716736, "rewards/countdown_reward_func": 0.36250001192092896, "step": 1981, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00026728439843282104, "epoch": 0.0055025291645150725, "grad_norm": 0.1341644525527954, "kl": 0.11507071927189827, "learning_rate": 3e-06, "loss": -0.0015, "step": 1982 }, { "clip_ratio": 0.00010775862028822303, "epoch": 0.005505305415354888, "grad_norm": 0.1273004114627838, "kl": 0.12233321368694305, "learning_rate": 3e-06, "loss": -0.0006, "step": 1983 }, { "clip_ratio": 0.0005858765362063423, "epoch": 0.005508081666194704, "grad_norm": 0.12228009104728699, "kl": 0.12594753503799438, "learning_rate": 3e-06, "loss": -0.0006, "step": 1984 }, { "clip_ratio": 0.00029870675643905997, "epoch": 0.00551085791703452, "grad_norm": 0.1282241940498352, "kl": 0.12114127725362778, "learning_rate": 3e-06, "loss": -0.002, "step": 1985 }, { "clip_ratio": 0.0, "epoch": 0.005513634167874336, "grad_norm": 0.18000636994838715, "kl": 0.12271250784397125, "learning_rate": 3e-06, "loss": -0.0024, "step": 1986 }, { "clip_ratio": 0.0003066251229029149, "epoch": 0.005516410418714151, "grad_norm": 0.11781095713376999, "kl": 0.11188013106584549, "learning_rate": 3e-06, "loss": -0.0023, "step": 1987 }, { "clip_ratio": 0.0005269177490845323, "epoch": 0.0055191866695539675, "grad_norm": 0.13404454290866852, "kl": 0.1126963309943676, "learning_rate": 3e-06, "loss": -0.0036, "step": 1988 }, { "clip_ratio": 0.0003066251229029149, "epoch": 0.005521962920393784, "grad_norm": 0.11764881014823914, "kl": 0.11812717467546463, "learning_rate": 3e-06, "loss": -0.0026, "step": 1989 }, { "clip_ratio": 0.0011090568150393665, "epoch": 0.005524739171233599, "grad_norm": 0.14475159347057343, "kl": 0.12093603238463402, "learning_rate": 3e-06, "loss": -0.0037, "step": 1990 }, { "clip_ratio": 0.0010156702192034572, "epoch": 0.0055275154220734155, "grad_norm": 0.11484024673700333, "kl": 0.11511994898319244, "learning_rate": 3e-06, "loss": -0.0032, "step": 1991 }, { "clip_ratio": 0.0007695165404584259, "epoch": 0.005530291672913231, "grad_norm": 0.18602021038532257, "kl": 0.11601589620113373, "learning_rate": 3e-06, "loss": -0.0049, "step": 1992 }, { "clip_ratio": 0.0005422846879810095, "completion_length": 225.81250762939453, "epoch": 0.005533067923753047, "grad_norm": 0.10769283771514893, "kl": 0.11326644197106361, "learning_rate": 3e-06, "loss": 0.0156, "reward": 0.2854166850447655, "reward_std": 0.24453017115592957, "rewards/countdown_reward_func": 0.2854166701436043, "step": 1993, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003762734049814753, "epoch": 0.005535844174592863, "grad_norm": 0.12362723052501678, "kl": 0.11308754608035088, "learning_rate": 3e-06, "loss": 0.0153, "step": 1994 }, { "clip_ratio": 0.0005234857235336676, "epoch": 0.005538620425432679, "grad_norm": 0.0922532007098198, "kl": 0.10114821046590805, "learning_rate": 3e-06, "loss": 0.0145, "step": 1995 }, { "clip_ratio": 8.73515018611215e-05, "epoch": 0.005541396676272494, "grad_norm": 0.10349724441766739, "kl": 0.10615367442369461, "learning_rate": 3e-06, "loss": 0.016, "step": 1996 }, { "clip_ratio": 0.00021079258294776082, "epoch": 0.0055441729271123105, "grad_norm": 0.13311462104320526, "kl": 0.10724882781505585, "learning_rate": 3e-06, "loss": 0.0144, "step": 1997 }, { "clip_ratio": 0.0001151012911577709, "epoch": 0.005546949177952126, "grad_norm": 0.09960220754146576, "kl": 0.10265998914837837, "learning_rate": 3e-06, "loss": 0.0153, "step": 1998 }, { "clip_ratio": 0.0007085143588483334, "epoch": 0.005549725428791942, "grad_norm": 0.10948970168828964, "kl": 0.10900342464447021, "learning_rate": 3e-06, "loss": 0.0154, "step": 1999 }, { "epoch": 0.0055525016796317585, "grad_norm": 0.09006191045045853, "learning_rate": 3e-06, "loss": 0.0145, "step": 2000 }, { "clip_ratio": 0.0005361156981962267, "epoch": 0.005555277930471574, "grad_norm": 0.10735763609409332, "kl": 0.1035256776958704, "learning_rate": 3e-06, "loss": 0.0144, "step": 2001 }, { "clip_ratio": 0.0004768256621900946, "epoch": 0.00555805418131139, "grad_norm": 0.09122443944215775, "kl": 0.10310684517025948, "learning_rate": 3e-06, "loss": 0.0147, "step": 2002 }, { "clip_ratio": 0.0020118614193052053, "epoch": 0.0055608304321512056, "grad_norm": 0.21058741211891174, "kl": 0.10488555207848549, "learning_rate": 3e-06, "loss": 0.0131, "step": 2003 }, { "clip_ratio": 0.0006240379370865412, "epoch": 0.005563606682991022, "grad_norm": 0.09514111280441284, "kl": 0.10193825513124466, "learning_rate": 3e-06, "loss": 0.0136, "step": 2004 }, { "clip_ratio": 0.0002617926656967029, "completion_length": 223.08333587646484, "epoch": 0.005566382933830837, "grad_norm": 0.08725622296333313, "kl": 0.11390889436006546, "learning_rate": 3e-06, "loss": 0.0051, "reward": 0.2291666641831398, "reward_std": 0.16722052544355392, "rewards/countdown_reward_func": 0.2291666641831398, "step": 2005, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00020610057981684804, "epoch": 0.0055691591846706535, "grad_norm": 0.08379297703504562, "kl": 0.11956554651260376, "learning_rate": 3e-06, "loss": 0.0049, "step": 2006 }, { "clip_ratio": 0.0002910737384809181, "epoch": 0.005571935435510469, "grad_norm": 0.0656660795211792, "kl": 0.10992233455181122, "learning_rate": 3e-06, "loss": 0.0051, "step": 2007 }, { "clip_ratio": 8.840169903123751e-05, "epoch": 0.005574711686350285, "grad_norm": 0.07707367092370987, "kl": 0.11566202342510223, "learning_rate": 3e-06, "loss": 0.0054, "step": 2008 }, { "clip_ratio": 0.0005598522620857693, "epoch": 0.005577487937190101, "grad_norm": 0.0825631394982338, "kl": 0.11144465208053589, "learning_rate": 3e-06, "loss": 0.0042, "step": 2009 }, { "clip_ratio": 8.333333244081587e-05, "epoch": 0.005580264188029917, "grad_norm": 0.07131849229335785, "kl": 0.11648586019873619, "learning_rate": 3e-06, "loss": 0.0044, "step": 2010 }, { "clip_ratio": 9.204712841892615e-05, "epoch": 0.005583040438869733, "grad_norm": 0.07822870463132858, "kl": 0.11300336197018623, "learning_rate": 3e-06, "loss": 0.0043, "step": 2011 }, { "clip_ratio": 0.00019509741832735017, "epoch": 0.0055858166897095485, "grad_norm": 0.08241821080446243, "kl": 0.11744438856840134, "learning_rate": 3e-06, "loss": 0.0046, "step": 2012 }, { "clip_ratio": 0.0005299464974086732, "epoch": 0.005588592940549365, "grad_norm": 0.07519717514514923, "kl": 0.1080629974603653, "learning_rate": 3e-06, "loss": 0.0044, "step": 2013 }, { "clip_ratio": 9.578544268151745e-05, "epoch": 0.00559136919138918, "grad_norm": 0.07436896860599518, "kl": 0.11372564733028412, "learning_rate": 3e-06, "loss": 0.0053, "step": 2014 }, { "clip_ratio": 0.0011513839708641171, "epoch": 0.0055941454422289965, "grad_norm": 0.07369540631771088, "kl": 0.10970757156610489, "learning_rate": 3e-06, "loss": 0.0035, "step": 2015 }, { "clip_ratio": 0.0004363836196716875, "epoch": 0.005596921693068812, "grad_norm": 0.06370438635349274, "kl": 0.1145344153046608, "learning_rate": 3e-06, "loss": 0.0037, "step": 2016 }, { "clip_ratio": 0.0002770941355265677, "completion_length": 221.2291717529297, "epoch": 0.005599697943908628, "grad_norm": 0.07984435558319092, "kl": 0.10508502274751663, "learning_rate": 3e-06, "loss": 0.0025, "reward": 0.21250000596046448, "reward_std": 0.17428425326943398, "rewards/countdown_reward_func": 0.21250000596046448, "step": 2017, "zero_std_ratio": 0.625 }, { "clip_ratio": 9.21828905120492e-05, "epoch": 0.005602474194748444, "grad_norm": 0.07923725992441177, "kl": 0.11465127393603325, "learning_rate": 3e-06, "loss": 0.0021, "step": 2018 }, { "clip_ratio": 0.00029520990210585296, "epoch": 0.00560525044558826, "grad_norm": 0.08113577216863632, "kl": 0.10587036609649658, "learning_rate": 3e-06, "loss": 0.0026, "step": 2019 }, { "clip_ratio": 0.00039488900802098215, "epoch": 0.005608026696428075, "grad_norm": 0.09565461426973343, "kl": 0.11312683299183846, "learning_rate": 3e-06, "loss": 0.0021, "step": 2020 }, { "clip_ratio": 0.0, "epoch": 0.0056108029472678915, "grad_norm": 0.10134981572628021, "kl": 0.10707443952560425, "learning_rate": 3e-06, "loss": 0.0027, "step": 2021 }, { "clip_ratio": 0.000533617683686316, "epoch": 0.005613579198107708, "grad_norm": 0.0789194330573082, "kl": 0.1022312305867672, "learning_rate": 3e-06, "loss": 0.0019, "step": 2022 }, { "clip_ratio": 0.0005019560339860618, "epoch": 0.005616355448947523, "grad_norm": 0.08137885481119156, "kl": 0.10183565318584442, "learning_rate": 3e-06, "loss": 0.0022, "step": 2023 }, { "clip_ratio": 9.21828905120492e-05, "epoch": 0.0056191316997873395, "grad_norm": 0.0739808902144432, "kl": 0.11150963604450226, "learning_rate": 3e-06, "loss": 0.0012, "step": 2024 }, { "clip_ratio": 0.0001953362807398662, "epoch": 0.005621907950627155, "grad_norm": 0.07564988732337952, "kl": 0.10209690034389496, "learning_rate": 3e-06, "loss": 0.0017, "step": 2025 }, { "clip_ratio": 0.0004712695226771757, "epoch": 0.005624684201466971, "grad_norm": 0.14252831041812897, "kl": 0.10544009134173393, "learning_rate": 3e-06, "loss": 0.0015, "step": 2026 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005627460452306787, "grad_norm": 0.0803026333451271, "kl": 0.10230910778045654, "learning_rate": 3e-06, "loss": 0.0012, "step": 2027 }, { "clip_ratio": 0.0007070228894008324, "epoch": 0.005630236703146603, "grad_norm": 0.08244466781616211, "kl": 0.0964125283062458, "learning_rate": 3e-06, "loss": 0.001, "step": 2028 }, { "clip_ratio": 0.0004911591531708837, "completion_length": 220.95833587646484, "epoch": 0.005633012953986418, "grad_norm": 0.10779600590467453, "kl": 0.11588352173566818, "learning_rate": 3e-06, "loss": 0.0112, "reward": 0.30416668206453323, "reward_std": 0.3633297234773636, "rewards/countdown_reward_func": 0.30416668206453323, "step": 2029, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00043572409049374983, "epoch": 0.0056357892048262345, "grad_norm": 0.11869516223669052, "kl": 0.09700796753168106, "learning_rate": 3e-06, "loss": 0.0108, "step": 2030 }, { "clip_ratio": 0.00020404949464136735, "epoch": 0.00563856545566605, "grad_norm": 0.09410764276981354, "kl": 0.10727613791823387, "learning_rate": 3e-06, "loss": 0.0105, "step": 2031 }, { "clip_ratio": 0.00020356984896352515, "epoch": 0.005641341706505866, "grad_norm": 0.11350057274103165, "kl": 0.10674090310931206, "learning_rate": 3e-06, "loss": 0.0096, "step": 2032 }, { "clip_ratio": 0.0002443792764097452, "epoch": 0.0056441179573456825, "grad_norm": 0.12127961963415146, "kl": 0.10875796526670456, "learning_rate": 3e-06, "loss": 0.0101, "step": 2033 }, { "clip_ratio": 0.0003601440985221416, "epoch": 0.005646894208185498, "grad_norm": 0.1095552071928978, "kl": 0.09716677665710449, "learning_rate": 3e-06, "loss": 0.01, "step": 2034 }, { "clip_ratio": 0.0002456332149449736, "epoch": 0.005649670459025314, "grad_norm": 0.12184404581785202, "kl": 0.11015952378511429, "learning_rate": 3e-06, "loss": 0.01, "step": 2035 }, { "clip_ratio": 8.191350207198411e-05, "epoch": 0.0056524467098651296, "grad_norm": 0.11499220877885818, "kl": 0.09228203445672989, "learning_rate": 3e-06, "loss": 0.0097, "step": 2036 }, { "clip_ratio": 0.0001221896382048726, "epoch": 0.005655222960704946, "grad_norm": 0.12085867673158646, "kl": 0.10267635807394981, "learning_rate": 3e-06, "loss": 0.0094, "step": 2037 }, { "clip_ratio": 0.0004097962155356072, "epoch": 0.005657999211544761, "grad_norm": 0.12743261456489563, "kl": 0.10104519873857498, "learning_rate": 3e-06, "loss": 0.0086, "step": 2038 }, { "clip_ratio": 0.0006971483817324042, "epoch": 0.0056607754623845775, "grad_norm": 0.12285710871219635, "kl": 0.10486527159810066, "learning_rate": 3e-06, "loss": 0.0099, "step": 2039 }, { "clip_ratio": 0.0005238101875875145, "epoch": 0.005663551713224393, "grad_norm": 0.1023198664188385, "kl": 0.09349857643246651, "learning_rate": 3e-06, "loss": 0.0078, "step": 2040 }, { "clip_ratio": 0.0009150805417448282, "completion_length": 226.89583587646484, "epoch": 0.005666327964064209, "grad_norm": 0.09265701472759247, "kl": 0.09547650068998337, "learning_rate": 3e-06, "loss": 0.0079, "reward": 0.26875001937150955, "reward_std": 0.15347465127706528, "rewards/countdown_reward_func": 0.26875000447034836, "step": 2041, "zero_std_ratio": 0.625 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005669104214904025, "grad_norm": 0.050366487354040146, "kl": 0.10091326385736465, "learning_rate": 3e-06, "loss": 0.0086, "step": 2042 }, { "clip_ratio": 0.0, "epoch": 0.005671880465743841, "grad_norm": 0.06937897950410843, "kl": 0.10959598422050476, "learning_rate": 3e-06, "loss": 0.0086, "step": 2043 }, { "clip_ratio": 0.0001739728031679988, "epoch": 0.005674656716583657, "grad_norm": 0.06653722375631332, "kl": 0.09630914404988289, "learning_rate": 3e-06, "loss": 0.0084, "step": 2044 }, { "clip_ratio": 0.00018076645210385323, "epoch": 0.0056774329674234725, "grad_norm": 0.04911844804883003, "kl": 0.08973881602287292, "learning_rate": 3e-06, "loss": 0.0084, "step": 2045 }, { "clip_ratio": 0.0002787739722407423, "epoch": 0.005680209218263289, "grad_norm": 0.08200076222419739, "kl": 0.0952768363058567, "learning_rate": 3e-06, "loss": 0.0077, "step": 2046 }, { "clip_ratio": 0.0003649073769338429, "epoch": 0.005682985469103104, "grad_norm": 0.06790746748447418, "kl": 0.09479077905416489, "learning_rate": 3e-06, "loss": 0.0078, "step": 2047 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.0056857617199429205, "grad_norm": 0.05466790497303009, "kl": 0.10251379758119583, "learning_rate": 3e-06, "loss": 0.0084, "step": 2048 }, { "clip_ratio": 0.0002632715040817857, "epoch": 0.005688537970782736, "grad_norm": 0.06806403398513794, "kl": 0.1096310168504715, "learning_rate": 3e-06, "loss": 0.0081, "step": 2049 }, { "clip_ratio": 0.0001739728031679988, "epoch": 0.005691314221622552, "grad_norm": 0.06310102343559265, "kl": 0.09831372275948524, "learning_rate": 3e-06, "loss": 0.0084, "step": 2050 }, { "clip_ratio": 0.0001840161858126521, "epoch": 0.005694090472462368, "grad_norm": 0.05473968759179115, "kl": 0.09245896711945534, "learning_rate": 3e-06, "loss": 0.0077, "step": 2051 }, { "clip_ratio": 9.36329597607255e-05, "epoch": 0.005696866723302184, "grad_norm": 0.06946438550949097, "kl": 0.0970330685377121, "learning_rate": 3e-06, "loss": 0.0074, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 230.62500762939453, "epoch": 0.005699642974141999, "grad_norm": 0.1020989716053009, "kl": 0.09304828196763992, "learning_rate": 3e-06, "loss": 0.0123, "reward": 0.3395833522081375, "reward_std": 0.2922678142786026, "rewards/countdown_reward_func": 0.3395833522081375, "step": 2053, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0057024192249818155, "grad_norm": 0.08666249364614487, "kl": 0.09421010315418243, "learning_rate": 3e-06, "loss": 0.0129, "step": 2054 }, { "clip_ratio": 0.00016469038382638246, "epoch": 0.005705195475821632, "grad_norm": 0.11592990159988403, "kl": 0.09231939166784286, "learning_rate": 3e-06, "loss": 0.0128, "step": 2055 }, { "clip_ratio": 0.0002482597410562448, "epoch": 0.005707971726661447, "grad_norm": 0.1035403460264206, "kl": 0.09664532542228699, "learning_rate": 3e-06, "loss": 0.012, "step": 2056 }, { "clip_ratio": 0.00016687953029759228, "epoch": 0.0057107479775012635, "grad_norm": 0.1996307075023651, "kl": 0.09553533047437668, "learning_rate": 3e-06, "loss": 0.0118, "step": 2057 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005713524228341079, "grad_norm": 0.09972159564495087, "kl": 0.10145417600870132, "learning_rate": 3e-06, "loss": 0.013, "step": 2058 }, { "clip_ratio": 0.0, "epoch": 0.005716300479180895, "grad_norm": 0.10573241114616394, "kl": 0.09803595021367073, "learning_rate": 3e-06, "loss": 0.0114, "step": 2059 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005719076730020711, "grad_norm": 0.08910799026489258, "kl": 0.09924589842557907, "learning_rate": 3e-06, "loss": 0.012, "step": 2060 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005721852980860527, "grad_norm": 0.11156439036130905, "kl": 0.09876829758286476, "learning_rate": 3e-06, "loss": 0.0116, "step": 2061 }, { "clip_ratio": 0.00035182230203645304, "epoch": 0.005724629231700342, "grad_norm": 0.11116062849760056, "kl": 0.10230807960033417, "learning_rate": 3e-06, "loss": 0.0107, "step": 2062 }, { "clip_ratio": 0.00016687953029759228, "epoch": 0.0057274054825401585, "grad_norm": 0.11640466749668121, "kl": 0.10202505066990852, "learning_rate": 3e-06, "loss": 0.0104, "step": 2063 }, { "clip_ratio": 0.0002663229824975133, "epoch": 0.005730181733379974, "grad_norm": 0.20945513248443604, "kl": 0.10810398682951927, "learning_rate": 3e-06, "loss": 0.0113, "step": 2064 }, { "clip_ratio": 8.223684562835842e-05, "completion_length": 233.02084350585938, "epoch": 0.00573295798421979, "grad_norm": 0.10434425622224808, "kl": 0.11239994317293167, "learning_rate": 3e-06, "loss": -0.0028, "reward": 0.32083335518836975, "reward_std": 0.3441803753376007, "rewards/countdown_reward_func": 0.32083334028720856, "step": 2065, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0057357342350596065, "grad_norm": 0.11738111078739166, "kl": 0.11531775444746017, "learning_rate": 3e-06, "loss": -0.0023, "step": 2066 }, { "clip_ratio": 0.0006976053700782359, "epoch": 0.005738510485899422, "grad_norm": 0.09144604206085205, "kl": 0.11997786909341812, "learning_rate": 3e-06, "loss": -0.002, "step": 2067 }, { "clip_ratio": 0.0002564237511251122, "epoch": 0.005741286736739238, "grad_norm": 0.15023760497570038, "kl": 0.11250534653663635, "learning_rate": 3e-06, "loss": -0.0033, "step": 2068 }, { "clip_ratio": 0.0003439874417381361, "epoch": 0.0057440629875790536, "grad_norm": 0.0950041189789772, "kl": 0.1112917885184288, "learning_rate": 3e-06, "loss": -0.0029, "step": 2069 }, { "clip_ratio": 8.327781688421965e-05, "epoch": 0.00574683923841887, "grad_norm": 0.1445959061384201, "kl": 0.12204555794596672, "learning_rate": 3e-06, "loss": -0.0021, "step": 2070 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.005749615489258685, "grad_norm": 0.10922154039144516, "kl": 0.1177988089621067, "learning_rate": 3e-06, "loss": -0.0032, "step": 2071 }, { "clip_ratio": 0.0002721360942814499, "epoch": 0.0057523917400985015, "grad_norm": 0.10157906264066696, "kl": 0.11939038708806038, "learning_rate": 3e-06, "loss": -0.0028, "step": 2072 }, { "clip_ratio": 0.0011802471126429737, "epoch": 0.005755167990938317, "grad_norm": 0.10021565854549408, "kl": 0.12260624766349792, "learning_rate": 3e-06, "loss": -0.0032, "step": 2073 }, { "clip_ratio": 0.0005078699323348701, "epoch": 0.005757944241778133, "grad_norm": 0.14854243397712708, "kl": 0.11457186192274094, "learning_rate": 3e-06, "loss": -0.0052, "step": 2074 }, { "clip_ratio": 0.0005989633646095172, "epoch": 0.005760720492617949, "grad_norm": 0.10022560507059097, "kl": 0.11323418468236923, "learning_rate": 3e-06, "loss": -0.0044, "step": 2075 }, { "clip_ratio": 0.0011352297442499548, "epoch": 0.005763496743457765, "grad_norm": 0.14965884387493134, "kl": 0.12618472799658775, "learning_rate": 3e-06, "loss": -0.0045, "step": 2076 }, { "clip_ratio": 9.689922444522381e-05, "completion_length": 221.6041717529297, "epoch": 0.005766272994297581, "grad_norm": 0.11252445727586746, "kl": 0.11330411210656166, "learning_rate": 3e-06, "loss": 0.0338, "reward": 0.26875001937150955, "reward_std": 0.2272602580487728, "rewards/countdown_reward_func": 0.26875001937150955, "step": 2077, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0002688652166398242, "epoch": 0.0057690492451373965, "grad_norm": 0.11611206829547882, "kl": 0.11921777203679085, "learning_rate": 3e-06, "loss": 0.0351, "step": 2078 }, { "clip_ratio": 0.00021403797290986404, "epoch": 0.005771825495977213, "grad_norm": 0.09081734716892242, "kl": 0.11672532185912132, "learning_rate": 3e-06, "loss": 0.0339, "step": 2079 }, { "clip_ratio": 0.0006553223647642881, "epoch": 0.005774601746817028, "grad_norm": 0.11462453752756119, "kl": 0.12759307771921158, "learning_rate": 3e-06, "loss": 0.0341, "step": 2080 }, { "clip_ratio": 0.0002815326370182447, "epoch": 0.0057773779976568445, "grad_norm": 0.11553678661584854, "kl": 0.11894625052809715, "learning_rate": 3e-06, "loss": 0.0344, "step": 2081 }, { "clip_ratio": 0.00038389285327866673, "epoch": 0.00578015424849666, "grad_norm": 0.13438205420970917, "kl": 0.13819441944360733, "learning_rate": 3e-06, "loss": 0.0343, "step": 2082 }, { "clip_ratio": 0.00019379844889044762, "epoch": 0.005782930499336476, "grad_norm": 0.09481407701969147, "kl": 0.12022696807980537, "learning_rate": 3e-06, "loss": 0.0335, "step": 2083 }, { "clip_ratio": 0.00046928575466154143, "epoch": 0.005785706750176292, "grad_norm": 0.11112938076257706, "kl": 0.12938351184129715, "learning_rate": 3e-06, "loss": 0.034, "step": 2084 }, { "clip_ratio": 0.0005152427984285168, "epoch": 0.005788483001016108, "grad_norm": 0.08531668782234192, "kl": 0.12835099548101425, "learning_rate": 3e-06, "loss": 0.0329, "step": 2085 }, { "clip_ratio": 0.0012749898305628449, "epoch": 0.005791259251855923, "grad_norm": 0.14292016625404358, "kl": 0.14164124429225922, "learning_rate": 3e-06, "loss": 0.0331, "step": 2086 }, { "clip_ratio": 0.0004016064340248704, "epoch": 0.0057940355026957395, "grad_norm": 0.10637033730745316, "kl": 0.1343921273946762, "learning_rate": 3e-06, "loss": 0.0319, "step": 2087 }, { "clip_ratio": 0.0011857394711114466, "epoch": 0.005796811753535556, "grad_norm": 0.10809402167797089, "kl": 0.15764878690242767, "learning_rate": 3e-06, "loss": 0.0319, "step": 2088 }, { "clip_ratio": 9.689922444522381e-05, "completion_length": 241.83334350585938, "epoch": 0.005799588004375371, "grad_norm": 0.16788090765476227, "kl": 0.1235123947262764, "learning_rate": 3e-06, "loss": 0.016, "reward": 0.23125001043081284, "reward_std": 0.2202121838927269, "rewards/countdown_reward_func": 0.23125001043081284, "step": 2089, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00040937707672128454, "epoch": 0.0058023642552151875, "grad_norm": 0.08114562183618546, "kl": 0.14093566685914993, "learning_rate": 3e-06, "loss": 0.0153, "step": 2090 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005805140506055003, "grad_norm": 0.09554962068796158, "kl": 0.1442393809556961, "learning_rate": 3e-06, "loss": 0.0148, "step": 2091 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.005807916756894819, "grad_norm": 0.090549536049366, "kl": 0.14015483856201172, "learning_rate": 3e-06, "loss": 0.0169, "step": 2092 }, { "clip_ratio": 0.0002713593712542206, "epoch": 0.005810693007734635, "grad_norm": 0.10590094327926636, "kl": 0.1370547115802765, "learning_rate": 3e-06, "loss": 0.0161, "step": 2093 }, { "clip_ratio": 0.00025648381415521726, "epoch": 0.005813469258574451, "grad_norm": 0.1045352965593338, "kl": 0.14967551827430725, "learning_rate": 3e-06, "loss": 0.016, "step": 2094 }, { "clip_ratio": 0.00045060378033667803, "epoch": 0.005816245509414266, "grad_norm": 0.08050543814897537, "kl": 0.13956885784864426, "learning_rate": 3e-06, "loss": 0.0143, "step": 2095 }, { "clip_ratio": 0.0002441406322759576, "epoch": 0.0058190217602540825, "grad_norm": 0.14037755131721497, "kl": 0.15493950247764587, "learning_rate": 3e-06, "loss": 0.015, "step": 2096 }, { "clip_ratio": 0.0, "epoch": 0.005821798011093898, "grad_norm": 0.09420851618051529, "kl": 0.15672826766967773, "learning_rate": 3e-06, "loss": 0.0139, "step": 2097 }, { "clip_ratio": 0.0003684598486870527, "epoch": 0.005824574261933714, "grad_norm": 0.08682415634393692, "kl": 0.15357310324907303, "learning_rate": 3e-06, "loss": 0.016, "step": 2098 }, { "clip_ratio": 0.0007170586031861603, "epoch": 0.0058273505127735305, "grad_norm": 0.09509247541427612, "kl": 0.149859219789505, "learning_rate": 3e-06, "loss": 0.0145, "step": 2099 }, { "clip_ratio": 0.0007866056985221803, "epoch": 0.005830126763613346, "grad_norm": 0.1083296537399292, "kl": 0.16195228695869446, "learning_rate": 3e-06, "loss": 0.0151, "step": 2100 }, { "clip_ratio": 0.00034024709020741284, "completion_length": 231.875, "epoch": 0.005832903014453162, "grad_norm": 0.06713063269853592, "kl": 0.16988279670476913, "learning_rate": 3e-06, "loss": 0.0083, "reward": 0.1937500163912773, "reward_std": 0.16211743280291557, "rewards/countdown_reward_func": 0.1937500163912773, "step": 2101, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0007102232775650918, "epoch": 0.0058356792652929776, "grad_norm": 0.0946163609623909, "kl": 0.17318201810121536, "learning_rate": 3e-06, "loss": 0.0086, "step": 2102 }, { "clip_ratio": 0.000451006053481251, "epoch": 0.005838455516132794, "grad_norm": 0.08715200424194336, "kl": 0.17537237703800201, "learning_rate": 3e-06, "loss": 0.0084, "step": 2103 }, { "clip_ratio": 0.0002783698437269777, "epoch": 0.005841231766972609, "grad_norm": 0.09208168089389801, "kl": 0.18494432419538498, "learning_rate": 3e-06, "loss": 0.009, "step": 2104 }, { "clip_ratio": 0.0006052821408957243, "epoch": 0.0058440080178124255, "grad_norm": 0.08401837199926376, "kl": 0.18430516123771667, "learning_rate": 3e-06, "loss": 0.0085, "step": 2105 }, { "clip_ratio": 0.0005398763241828419, "epoch": 0.005846784268652241, "grad_norm": 0.09048160165548325, "kl": 0.17285944521427155, "learning_rate": 3e-06, "loss": 0.0083, "step": 2106 }, { "clip_ratio": 0.001236416690517217, "epoch": 0.005849560519492057, "grad_norm": 0.06877080351114273, "kl": 0.16914395987987518, "learning_rate": 3e-06, "loss": 0.0078, "step": 2107 }, { "clip_ratio": 0.0004467820399440825, "epoch": 0.005852336770331873, "grad_norm": 0.08058128505945206, "kl": 0.16808201372623444, "learning_rate": 3e-06, "loss": 0.0074, "step": 2108 }, { "clip_ratio": 0.0016149040893651545, "epoch": 0.005855113021171689, "grad_norm": 0.0833841860294342, "kl": 0.1688889116048813, "learning_rate": 3e-06, "loss": 0.0083, "step": 2109 }, { "clip_ratio": 0.0009857022087089717, "epoch": 0.005857889272011505, "grad_norm": 0.07885358482599258, "kl": 0.17396697402000427, "learning_rate": 3e-06, "loss": 0.0075, "step": 2110 }, { "clip_ratio": 0.0015709067229181528, "epoch": 0.0058606655228513205, "grad_norm": 0.07803882658481598, "kl": 0.17317739129066467, "learning_rate": 3e-06, "loss": 0.0079, "step": 2111 }, { "clip_ratio": 0.0007821381441317499, "epoch": 0.005863441773691137, "grad_norm": 0.09485685080289841, "kl": 0.1609647423028946, "learning_rate": 3e-06, "loss": 0.0074, "step": 2112 }, { "clip_ratio": 0.0007151653699111193, "completion_length": 230.875, "epoch": 0.005866218024530952, "grad_norm": 0.11236539483070374, "kl": 0.1636364832520485, "learning_rate": 3e-06, "loss": 0.0213, "reward": 0.40000002086162567, "reward_std": 0.4050685316324234, "rewards/countdown_reward_func": 0.40000002086162567, "step": 2113, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00017586752073839307, "epoch": 0.0058689942753707685, "grad_norm": 0.1399078369140625, "kl": 0.14752335101366043, "learning_rate": 3e-06, "loss": 0.0204, "step": 2114 }, { "clip_ratio": 0.000263152651314158, "epoch": 0.005871770526210584, "grad_norm": 0.13787932693958282, "kl": 0.1409480944275856, "learning_rate": 3e-06, "loss": 0.0213, "step": 2115 }, { "clip_ratio": 0.0, "epoch": 0.0058745467770504, "grad_norm": 0.12428930401802063, "kl": 0.14609377086162567, "learning_rate": 3e-06, "loss": 0.0202, "step": 2116 }, { "clip_ratio": 0.0003564371290849522, "epoch": 0.005877323027890216, "grad_norm": 0.1103990450501442, "kl": 0.1507614701986313, "learning_rate": 3e-06, "loss": 0.0211, "step": 2117 }, { "clip_ratio": 0.00045055238297209144, "epoch": 0.005880099278730032, "grad_norm": 0.13074712455272675, "kl": 0.14322231709957123, "learning_rate": 3e-06, "loss": 0.0215, "step": 2118 }, { "clip_ratio": 0.0006283415132202208, "epoch": 0.005882875529569848, "grad_norm": 0.11959907412528992, "kl": 0.1588028073310852, "learning_rate": 3e-06, "loss": 0.0208, "step": 2119 }, { "clip_ratio": 0.0003582945646485314, "epoch": 0.0058856517804096635, "grad_norm": 0.1354365348815918, "kl": 0.1482446938753128, "learning_rate": 3e-06, "loss": 0.0203, "step": 2120 }, { "clip_ratio": 0.00043741075205616653, "epoch": 0.00588842803124948, "grad_norm": 0.14199601113796234, "kl": 0.14358460903167725, "learning_rate": 3e-06, "loss": 0.02, "step": 2121 }, { "clip_ratio": 0.0, "epoch": 0.005891204282089295, "grad_norm": 0.11802490055561066, "kl": 0.15160782635211945, "learning_rate": 3e-06, "loss": 0.0193, "step": 2122 }, { "clip_ratio": 0.00018325866403756663, "epoch": 0.0058939805329291115, "grad_norm": 0.1145329475402832, "kl": 0.15987689793109894, "learning_rate": 3e-06, "loss": 0.0203, "step": 2123 }, { "clip_ratio": 0.0003541165206115693, "epoch": 0.005896756783768927, "grad_norm": 0.13508586585521698, "kl": 0.15324489772319794, "learning_rate": 3e-06, "loss": 0.0196, "step": 2124 }, { "clip_ratio": 0.0005099133632029407, "completion_length": 218.33333587646484, "epoch": 0.005899533034608743, "grad_norm": 0.11340119689702988, "kl": 0.1783110797405243, "learning_rate": 3e-06, "loss": 0.0107, "reward": 0.30416667461395264, "reward_std": 0.29580747336149216, "rewards/countdown_reward_func": 0.30416665971279144, "step": 2125, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00028971848951186985, "epoch": 0.005902309285448559, "grad_norm": 0.11806675046682358, "kl": 0.19692577421665192, "learning_rate": 3e-06, "loss": 0.013, "step": 2126 }, { "clip_ratio": 0.0, "epoch": 0.005905085536288375, "grad_norm": 0.1320490837097168, "kl": 0.18967118114233017, "learning_rate": 3e-06, "loss": 0.0124, "step": 2127 }, { "clip_ratio": 0.00026122476992895827, "epoch": 0.00590786178712819, "grad_norm": 0.11485249549150467, "kl": 0.1766699254512787, "learning_rate": 3e-06, "loss": 0.0114, "step": 2128 }, { "clip_ratio": 0.0, "epoch": 0.0059106380379680065, "grad_norm": 0.10251068323850632, "kl": 0.175718754529953, "learning_rate": 3e-06, "loss": 0.0119, "step": 2129 }, { "clip_ratio": 0.0002794336760416627, "epoch": 0.005913414288807823, "grad_norm": 0.1114218533039093, "kl": 0.1810135394334793, "learning_rate": 3e-06, "loss": 0.011, "step": 2130 }, { "clip_ratio": 9.314456110587344e-05, "epoch": 0.005916190539647638, "grad_norm": 0.12015023082494736, "kl": 0.1831328272819519, "learning_rate": 3e-06, "loss": 0.0098, "step": 2131 }, { "clip_ratio": 0.0003007580089615658, "epoch": 0.0059189667904874545, "grad_norm": 0.11531781405210495, "kl": 0.1972024142742157, "learning_rate": 3e-06, "loss": 0.0125, "step": 2132 }, { "clip_ratio": 0.0002856182763935067, "epoch": 0.00592174304132727, "grad_norm": 0.11725442856550217, "kl": 0.18396782875061035, "learning_rate": 3e-06, "loss": 0.01, "step": 2133 }, { "clip_ratio": 0.00017714993737172335, "epoch": 0.005924519292167086, "grad_norm": 0.11699375510215759, "kl": 0.17351438850164413, "learning_rate": 3e-06, "loss": 0.0097, "step": 2134 }, { "clip_ratio": 0.0002972428919747472, "epoch": 0.0059272955430069016, "grad_norm": 0.12814870476722717, "kl": 0.16928401589393616, "learning_rate": 3e-06, "loss": 0.0101, "step": 2135 }, { "clip_ratio": 0.00038728527579223737, "epoch": 0.005930071793846718, "grad_norm": 0.10801739990711212, "kl": 0.17271753400564194, "learning_rate": 3e-06, "loss": 0.0089, "step": 2136 }, { "clip_ratio": 0.0008804201061138883, "completion_length": 240.1666717529297, "epoch": 0.005932848044686533, "grad_norm": 0.11421734094619751, "kl": 0.169437974691391, "learning_rate": 3e-06, "loss": 0.0286, "reward": 0.30416667461395264, "reward_std": 0.2904581278562546, "rewards/countdown_reward_func": 0.30416667461395264, "step": 2137, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0059356242955263495, "grad_norm": 0.1658470183610916, "kl": 0.16381431370973587, "learning_rate": 3e-06, "loss": 0.0283, "step": 2138 }, { "clip_ratio": 8.765778329689056e-05, "epoch": 0.005938400546366165, "grad_norm": 0.1286601573228836, "kl": 0.16420385986566544, "learning_rate": 3e-06, "loss": 0.029, "step": 2139 }, { "clip_ratio": 0.000244140625, "epoch": 0.005941176797205981, "grad_norm": 0.09617311507463455, "kl": 0.15908657014369965, "learning_rate": 3e-06, "loss": 0.0279, "step": 2140 }, { "clip_ratio": 0.0004251676582498476, "epoch": 0.0059439530480457975, "grad_norm": 0.10428610444068909, "kl": 0.1709904745221138, "learning_rate": 3e-06, "loss": 0.029, "step": 2141 }, { "clip_ratio": 0.000519495370099321, "epoch": 0.005946729298885613, "grad_norm": 0.08956306427717209, "kl": 0.17576873302459717, "learning_rate": 3e-06, "loss": 0.0287, "step": 2142 }, { "clip_ratio": 0.00035832179128192365, "epoch": 0.005949505549725429, "grad_norm": 0.09938909113407135, "kl": 0.1722540631890297, "learning_rate": 3e-06, "loss": 0.0279, "step": 2143 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0059522818005652445, "grad_norm": 0.0916515365242958, "kl": 0.17020156979560852, "learning_rate": 3e-06, "loss": 0.0275, "step": 2144 }, { "clip_ratio": 0.00017531556659378111, "epoch": 0.005955058051405061, "grad_norm": 0.1481482982635498, "kl": 0.17428260296583176, "learning_rate": 3e-06, "loss": 0.0269, "step": 2145 }, { "clip_ratio": 0.0007600796525366604, "epoch": 0.005957834302244876, "grad_norm": 0.10551691800355911, "kl": 0.17008347064256668, "learning_rate": 3e-06, "loss": 0.0264, "step": 2146 }, { "clip_ratio": 0.00026695779524743557, "epoch": 0.0059606105530846925, "grad_norm": 0.1174376830458641, "kl": 0.18421828001737595, "learning_rate": 3e-06, "loss": 0.0284, "step": 2147 }, { "clip_ratio": 0.000535756757017225, "epoch": 0.005963386803924508, "grad_norm": 0.08853857964277267, "kl": 0.19289961457252502, "learning_rate": 3e-06, "loss": 0.0272, "step": 2148 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 232.875, "epoch": 0.005966163054764324, "grad_norm": 0.04596855491399765, "kl": 0.18686389178037643, "learning_rate": 3e-06, "loss": 0.0206, "reward": 0.17083335667848587, "reward_std": 0.11509480327367783, "rewards/countdown_reward_func": 0.17083334922790527, "step": 2149, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.000768062163842842, "epoch": 0.00596893930560414, "grad_norm": 0.0615745410323143, "kl": 0.1883329302072525, "learning_rate": 3e-06, "loss": 0.0202, "step": 2150 }, { "clip_ratio": 0.0002582559172878973, "epoch": 0.005971715556443956, "grad_norm": 0.060142528265714645, "kl": 0.2043200358748436, "learning_rate": 3e-06, "loss": 0.0209, "step": 2151 }, { "clip_ratio": 0.00044469654676504433, "epoch": 0.005974491807283772, "grad_norm": 0.066607765853405, "kl": 0.22076967358589172, "learning_rate": 3e-06, "loss": 0.0211, "step": 2152 }, { "clip_ratio": 0.0004701242069131695, "epoch": 0.0059772680581235875, "grad_norm": 0.08139292895793915, "kl": 0.20817560702562332, "learning_rate": 3e-06, "loss": 0.0203, "step": 2153 }, { "clip_ratio": 0.0001826150546548888, "epoch": 0.005980044308963404, "grad_norm": 0.07108417898416519, "kl": 0.20678973197937012, "learning_rate": 3e-06, "loss": 0.02, "step": 2154 }, { "clip_ratio": 0.00027385010616853833, "epoch": 0.005982820559803219, "grad_norm": 0.044102661311626434, "kl": 0.2047039493918419, "learning_rate": 3e-06, "loss": 0.0201, "step": 2155 }, { "clip_ratio": 0.00040690103196538985, "epoch": 0.0059855968106430355, "grad_norm": 0.054487258195877075, "kl": 0.20282930880784988, "learning_rate": 3e-06, "loss": 0.0196, "step": 2156 }, { "clip_ratio": 0.00010032102727564052, "epoch": 0.005988373061482851, "grad_norm": 0.05873025581240654, "kl": 0.22172988951206207, "learning_rate": 3e-06, "loss": 0.0205, "step": 2157 }, { "clip_ratio": 0.00020064205455128103, "epoch": 0.005991149312322667, "grad_norm": 0.06830353289842606, "kl": 0.2312830686569214, "learning_rate": 3e-06, "loss": 0.0203, "step": 2158 }, { "clip_ratio": 0.00020064205455128103, "epoch": 0.005993925563162483, "grad_norm": 0.06709025055170059, "kl": 0.21746045351028442, "learning_rate": 3e-06, "loss": 0.0191, "step": 2159 }, { "clip_ratio": 0.0, "epoch": 0.005996701814002299, "grad_norm": 0.07886490970849991, "kl": 0.21650362014770508, "learning_rate": 3e-06, "loss": 0.0196, "step": 2160 }, { "clip_ratio": 9.391435014549643e-05, "completion_length": 214.43750762939453, "epoch": 0.005999478064842114, "grad_norm": 0.11006475239992142, "kl": 0.21128001064062119, "learning_rate": 3e-06, "loss": 0.0006, "reward": 0.28333335369825363, "reward_std": 0.2853127121925354, "rewards/countdown_reward_func": 0.28333333879709244, "step": 2161, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0001065643664333038, "epoch": 0.0060022543156819305, "grad_norm": 0.17413948476314545, "kl": 0.20346222817897797, "learning_rate": 3e-06, "loss": -0.0006, "step": 2162 }, { "clip_ratio": 0.00018050541984848678, "epoch": 0.006005030566521747, "grad_norm": 0.11314403265714645, "kl": 0.19823309034109116, "learning_rate": 3e-06, "loss": 0.0001, "step": 2163 }, { "clip_ratio": 0.0005011356552131474, "epoch": 0.006007806817361562, "grad_norm": 0.1388438194990158, "kl": 0.1954411193728447, "learning_rate": 3e-06, "loss": 0.0001, "step": 2164 }, { "clip_ratio": 0.00022549449931830168, "epoch": 0.0060105830682013785, "grad_norm": 0.1569499969482422, "kl": 0.18387842923402786, "learning_rate": 3e-06, "loss": -0.0012, "step": 2165 }, { "clip_ratio": 0.0001065643664333038, "epoch": 0.006013359319041194, "grad_norm": 0.14936763048171997, "kl": 0.19631676375865936, "learning_rate": 3e-06, "loss": -0.0009, "step": 2166 }, { "clip_ratio": 0.0001065643664333038, "epoch": 0.00601613556988101, "grad_norm": 0.10569910705089569, "kl": 0.19845695048570633, "learning_rate": 3e-06, "loss": -0.0009, "step": 2167 }, { "clip_ratio": 0.00045912877249065787, "epoch": 0.0060189118207208256, "grad_norm": 0.1597263067960739, "kl": 0.18819713592529297, "learning_rate": 3e-06, "loss": -0.0028, "step": 2168 }, { "clip_ratio": 0.000762759504141286, "epoch": 0.006021688071560642, "grad_norm": 0.1164880320429802, "kl": 0.17806652933359146, "learning_rate": 3e-06, "loss": -0.0014, "step": 2169 }, { "clip_ratio": 0.0007057104958221316, "epoch": 0.006024464322400457, "grad_norm": 0.12073780596256256, "kl": 0.17337379604578018, "learning_rate": 3e-06, "loss": -0.0033, "step": 2170 }, { "clip_ratio": 0.00038431792927440256, "epoch": 0.0060272405732402735, "grad_norm": 0.15236780047416687, "kl": 0.1608799397945404, "learning_rate": 3e-06, "loss": -0.0041, "step": 2171 }, { "clip_ratio": 0.0005894555361010134, "epoch": 0.006030016824080089, "grad_norm": 0.1417692005634308, "kl": 0.17029716074466705, "learning_rate": 3e-06, "loss": -0.0044, "step": 2172 }, { "clip_ratio": 0.00017841139197116718, "completion_length": 235.6041717529297, "epoch": 0.006032793074919905, "grad_norm": 0.09551423043012619, "kl": 0.1639445647597313, "learning_rate": 3e-06, "loss": -0.0062, "reward": 0.21250000596046448, "reward_std": 0.11618950217962265, "rewards/countdown_reward_func": 0.21250000596046448, "step": 2173, "zero_std_ratio": 0.75 }, { "clip_ratio": 0.0006238899368327111, "epoch": 0.0060355693257597215, "grad_norm": 0.11371457576751709, "kl": 0.15197165310382843, "learning_rate": 3e-06, "loss": -0.0061, "step": 2174 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.006038345576599537, "grad_norm": 0.09170381724834442, "kl": 0.15608438104391098, "learning_rate": 3e-06, "loss": -0.0057, "step": 2175 }, { "clip_ratio": 0.0005786644760519266, "epoch": 0.006041121827439353, "grad_norm": 0.08310035616159439, "kl": 0.15356775373220444, "learning_rate": 3e-06, "loss": -0.0068, "step": 2176 }, { "clip_ratio": 8.802816591924056e-05, "epoch": 0.0060438980782791685, "grad_norm": 0.10131369531154633, "kl": 0.148365817964077, "learning_rate": 3e-06, "loss": -0.0066, "step": 2177 }, { "clip_ratio": 0.0007334884430747479, "epoch": 0.006046674329118985, "grad_norm": 0.0910923108458519, "kl": 0.14135953783988953, "learning_rate": 3e-06, "loss": -0.0067, "step": 2178 }, { "clip_ratio": 0.0007884440710768104, "epoch": 0.0060494505799588, "grad_norm": 0.09182950854301453, "kl": 0.1402914896607399, "learning_rate": 3e-06, "loss": -0.0073, "step": 2179 }, { "clip_ratio": 0.0012513441615737975, "epoch": 0.0060522268307986165, "grad_norm": 0.11456513404846191, "kl": 0.13089902698993683, "learning_rate": 3e-06, "loss": -0.0089, "step": 2180 }, { "clip_ratio": 0.0005369534774217755, "epoch": 0.006055003081638432, "grad_norm": 0.09350922703742981, "kl": 0.1340053342282772, "learning_rate": 3e-06, "loss": -0.0084, "step": 2181 }, { "clip_ratio": 0.0014489490713458508, "epoch": 0.006057779332478248, "grad_norm": 0.08015953004360199, "kl": 0.13297784700989723, "learning_rate": 3e-06, "loss": -0.0091, "step": 2182 }, { "clip_ratio": 0.0017656179843470454, "epoch": 0.006060555583318064, "grad_norm": 0.08231103420257568, "kl": 0.130483016371727, "learning_rate": 3e-06, "loss": -0.0095, "step": 2183 }, { "clip_ratio": 0.002995648537762463, "epoch": 0.00606333183415788, "grad_norm": 0.0673069879412651, "kl": 0.12187189608812332, "learning_rate": 3e-06, "loss": -0.009, "step": 2184 }, { "clip_ratio": 0.0004267231997800991, "completion_length": 233.3541717529297, "epoch": 0.006066108084997696, "grad_norm": 0.07689964771270752, "kl": 0.1193951666355133, "learning_rate": 3e-06, "loss": 0.0086, "reward": 0.21250000596046448, "reward_std": 0.2418064922094345, "rewards/countdown_reward_func": 0.21250000596046448, "step": 2185, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0002802690723910928, "epoch": 0.0060688843358375115, "grad_norm": 0.07742653042078018, "kl": 0.1159132868051529, "learning_rate": 3e-06, "loss": 0.0085, "step": 2186 }, { "clip_ratio": 0.0004338699218351394, "epoch": 0.006071660586677328, "grad_norm": 0.11565269529819489, "kl": 0.11845757439732552, "learning_rate": 3e-06, "loss": 0.0084, "step": 2187 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.006074436837517143, "grad_norm": 0.07725950330495834, "kl": 0.11740844696760178, "learning_rate": 3e-06, "loss": 0.0082, "step": 2188 }, { "clip_ratio": 0.00018731241289060563, "epoch": 0.0060772130883569595, "grad_norm": 0.0754290521144867, "kl": 0.11156599596142769, "learning_rate": 3e-06, "loss": 0.008, "step": 2189 }, { "clip_ratio": 0.000675759933074005, "epoch": 0.006079989339196775, "grad_norm": 0.10546611994504929, "kl": 0.11662887409329414, "learning_rate": 3e-06, "loss": 0.0078, "step": 2190 }, { "clip_ratio": 0.00046884678886272013, "epoch": 0.006082765590036591, "grad_norm": 0.07807113230228424, "kl": 0.11078613996505737, "learning_rate": 3e-06, "loss": 0.0078, "step": 2191 }, { "clip_ratio": 0.00047955120680853724, "epoch": 0.0060855418408764066, "grad_norm": 0.0798504427075386, "kl": 0.10783285275101662, "learning_rate": 3e-06, "loss": 0.0075, "step": 2192 }, { "clip_ratio": 0.00029404355882434174, "epoch": 0.006088318091716223, "grad_norm": 0.09978339821100235, "kl": 0.11092406511306763, "learning_rate": 3e-06, "loss": 0.0074, "step": 2193 }, { "clip_ratio": 0.000244140625, "epoch": 0.006091094342556038, "grad_norm": 0.08134466409683228, "kl": 0.11090698093175888, "learning_rate": 3e-06, "loss": 0.008, "step": 2194 }, { "clip_ratio": 0.0006887645286042243, "epoch": 0.0060938705933958545, "grad_norm": 0.07155325263738632, "kl": 0.10384676232933998, "learning_rate": 3e-06, "loss": 0.0072, "step": 2195 }, { "clip_ratio": 0.0011016842036042362, "epoch": 0.006096646844235671, "grad_norm": 0.08413667976856232, "kl": 0.11054657027125359, "learning_rate": 3e-06, "loss": 0.007, "step": 2196 }, { "clip_ratio": 0.00044989935122430325, "completion_length": 235.1875, "epoch": 0.006099423095075486, "grad_norm": 0.07087922096252441, "kl": 0.1160878986120224, "learning_rate": 3e-06, "loss": 0.0124, "reward": 0.2083333507180214, "reward_std": 0.2182515673339367, "rewards/countdown_reward_func": 0.2083333358168602, "step": 2197, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0061021993459153025, "grad_norm": 0.07426546514034271, "kl": 0.11279673129320145, "learning_rate": 3e-06, "loss": 0.0122, "step": 2198 }, { "clip_ratio": 8.185985643649474e-05, "epoch": 0.006104975596755118, "grad_norm": 0.07759413123130798, "kl": 0.116276104003191, "learning_rate": 3e-06, "loss": 0.012, "step": 2199 }, { "clip_ratio": 0.0, "epoch": 0.006107751847594934, "grad_norm": 0.0708848312497139, "kl": 0.10642623901367188, "learning_rate": 3e-06, "loss": 0.0116, "step": 2200 }, { "clip_ratio": 0.00017426943668397143, "epoch": 0.0061105280984347496, "grad_norm": 0.06520801782608032, "kl": 0.12019915506243706, "learning_rate": 3e-06, "loss": 0.0127, "step": 2201 }, { "clip_ratio": 0.00034561891516204923, "epoch": 0.006113304349274566, "grad_norm": 0.07822219282388687, "kl": 0.10957641154527664, "learning_rate": 3e-06, "loss": 0.0129, "step": 2202 }, { "clip_ratio": 0.0004435717419255525, "epoch": 0.006116080600114381, "grad_norm": 0.07648086547851562, "kl": 0.11310628056526184, "learning_rate": 3e-06, "loss": 0.0122, "step": 2203 }, { "clip_ratio": 0.0, "epoch": 0.0061188568509541975, "grad_norm": 0.08405080437660217, "kl": 0.11058183759450912, "learning_rate": 3e-06, "loss": 0.0112, "step": 2204 }, { "clip_ratio": 9.469696669839323e-05, "epoch": 0.006121633101794013, "grad_norm": 0.08572155982255936, "kl": 0.11485684290528297, "learning_rate": 3e-06, "loss": 0.0113, "step": 2205 }, { "clip_ratio": 0.0, "epoch": 0.006124409352633829, "grad_norm": 0.07442327588796616, "kl": 0.1044563390314579, "learning_rate": 3e-06, "loss": 0.0112, "step": 2206 }, { "clip_ratio": 0.0004602315020747483, "epoch": 0.0061271856034736455, "grad_norm": 0.18411950767040253, "kl": 0.11828203499317169, "learning_rate": 3e-06, "loss": 0.0119, "step": 2207 }, { "clip_ratio": 0.0005978553963359445, "epoch": 0.006129961854313461, "grad_norm": 0.0784916952252388, "kl": 0.10879363864660263, "learning_rate": 3e-06, "loss": 0.0119, "step": 2208 }, { "clip_ratio": 0.0002201182724093087, "completion_length": 233.75, "epoch": 0.006132738105153277, "grad_norm": 0.11505284905433655, "kl": 0.10442957654595375, "learning_rate": 3e-06, "loss": 0.024, "reward": 0.3812500089406967, "reward_std": 0.3858536630868912, "rewards/countdown_reward_func": 0.3812500089406967, "step": 2209, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0003907842037733644, "epoch": 0.0061355143559930925, "grad_norm": 0.10098423808813095, "kl": 0.10408291965723038, "learning_rate": 3e-06, "loss": 0.0239, "step": 2210 }, { "clip_ratio": 0.0001870947889983654, "epoch": 0.006138290606832909, "grad_norm": 0.10101636499166489, "kl": 0.10832390189170837, "learning_rate": 3e-06, "loss": 0.0225, "step": 2211 }, { "clip_ratio": 0.0004268408374628052, "epoch": 0.006141066857672724, "grad_norm": 0.11006015539169312, "kl": 0.10417984053492546, "learning_rate": 3e-06, "loss": 0.0225, "step": 2212 }, { "clip_ratio": 0.0, "epoch": 0.0061438431085125405, "grad_norm": 0.1787508875131607, "kl": 0.11693385615944862, "learning_rate": 3e-06, "loss": 0.0225, "step": 2213 }, { "clip_ratio": 0.0005292101996019483, "epoch": 0.006146619359352356, "grad_norm": 0.12185002118349075, "kl": 0.11256387084722519, "learning_rate": 3e-06, "loss": 0.0223, "step": 2214 }, { "clip_ratio": 0.00012230919674038887, "epoch": 0.006149395610192172, "grad_norm": 0.12709738314151764, "kl": 0.10908814519643784, "learning_rate": 3e-06, "loss": 0.023, "step": 2215 }, { "clip_ratio": 0.00012230919674038887, "epoch": 0.006152171861031988, "grad_norm": 0.10294611752033234, "kl": 0.11007463932037354, "learning_rate": 3e-06, "loss": 0.0217, "step": 2216 }, { "clip_ratio": 0.0002929751281044446, "epoch": 0.006154948111871804, "grad_norm": 0.09818847477436066, "kl": 0.11554765328764915, "learning_rate": 3e-06, "loss": 0.0212, "step": 2217 }, { "clip_ratio": 0.000292975120828487, "epoch": 0.00615772436271162, "grad_norm": 0.11660251021385193, "kl": 0.11222562938928604, "learning_rate": 3e-06, "loss": 0.0201, "step": 2218 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0061605006135514355, "grad_norm": 0.09623023122549057, "kl": 0.12738066911697388, "learning_rate": 3e-06, "loss": 0.0199, "step": 2219 }, { "clip_ratio": 0.0007260283455252647, "epoch": 0.006163276864391252, "grad_norm": 0.09498999267816544, "kl": 0.12235638499259949, "learning_rate": 3e-06, "loss": 0.0207, "step": 2220 }, { "clip_ratio": 0.00026175539096584544, "completion_length": 239.08333587646484, "epoch": 0.006166053115231067, "grad_norm": 0.08734557777643204, "kl": 0.10614108666777611, "learning_rate": 3e-06, "loss": 0.0156, "reward": 0.23124999552965164, "reward_std": 0.2115694098174572, "rewards/countdown_reward_func": 0.23124999552965164, "step": 2221, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00016405216592829674, "epoch": 0.0061688293660708835, "grad_norm": 0.11606533825397491, "kl": 0.10931029170751572, "learning_rate": 3e-06, "loss": 0.0164, "step": 2222 }, { "clip_ratio": 0.0002663365739863366, "epoch": 0.006171605616910699, "grad_norm": 0.07539436966180801, "kl": 0.10711556673049927, "learning_rate": 3e-06, "loss": 0.0157, "step": 2223 }, { "clip_ratio": 0.0002645519489306025, "epoch": 0.006174381867750515, "grad_norm": 0.06568428874015808, "kl": 0.12526912242174149, "learning_rate": 3e-06, "loss": 0.0164, "step": 2224 }, { "clip_ratio": 0.00026455195620656013, "epoch": 0.0061771581185903306, "grad_norm": 0.07675132900476456, "kl": 0.11198921501636505, "learning_rate": 3e-06, "loss": 0.0158, "step": 2225 }, { "clip_ratio": 0.0007440476329065859, "epoch": 0.006179934369430147, "grad_norm": 0.07890935987234116, "kl": 0.11408694460988045, "learning_rate": 3e-06, "loss": 0.0169, "step": 2226 }, { "clip_ratio": 0.0004973536997567862, "epoch": 0.006182710620269962, "grad_norm": 0.08032620698213577, "kl": 0.11937838792800903, "learning_rate": 3e-06, "loss": 0.0155, "step": 2227 }, { "clip_ratio": 0.00019656029326142743, "epoch": 0.0061854868711097785, "grad_norm": 0.09635122865438461, "kl": 0.11930480599403381, "learning_rate": 3e-06, "loss": 0.0145, "step": 2228 }, { "clip_ratio": 9.476876584812999e-05, "epoch": 0.006188263121949595, "grad_norm": 0.21204665303230286, "kl": 0.11878373473882675, "learning_rate": 3e-06, "loss": 0.0142, "step": 2229 }, { "clip_ratio": 0.0009819945989875123, "epoch": 0.00619103937278941, "grad_norm": 0.06745634227991104, "kl": 0.13793090730905533, "learning_rate": 3e-06, "loss": 0.0158, "step": 2230 }, { "clip_ratio": 0.0004885463276877999, "epoch": 0.0061938156236292265, "grad_norm": 0.07236343622207642, "kl": 0.12467692419886589, "learning_rate": 3e-06, "loss": 0.0147, "step": 2231 }, { "clip_ratio": 0.0003369117039255798, "epoch": 0.006196591874469042, "grad_norm": 0.07333452254533768, "kl": 0.12604239955544472, "learning_rate": 3e-06, "loss": 0.0151, "step": 2232 }, { "clip_ratio": 0.00044176532537676394, "completion_length": 227.5, "epoch": 0.006199368125308858, "grad_norm": 0.11169209331274033, "kl": 0.14527644217014313, "learning_rate": 3e-06, "loss": 0.0172, "reward": 0.4520833492279053, "reward_std": 0.3492494821548462, "rewards/countdown_reward_func": 0.4520833194255829, "step": 2233, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.722958591533825e-05, "epoch": 0.0062021443761486736, "grad_norm": 0.12365715205669403, "kl": 0.15404719859361649, "learning_rate": 3e-06, "loss": 0.0176, "step": 2234 }, { "clip_ratio": 0.0003403676091693342, "epoch": 0.00620492062698849, "grad_norm": 0.10898413509130478, "kl": 0.15741366147994995, "learning_rate": 3e-06, "loss": 0.0166, "step": 2235 }, { "clip_ratio": 8.722958591533825e-05, "epoch": 0.006207696877828305, "grad_norm": 0.11201989650726318, "kl": 0.15081220865249634, "learning_rate": 3e-06, "loss": 0.0165, "step": 2236 }, { "clip_ratio": 0.00020764119108207524, "epoch": 0.0062104731286681215, "grad_norm": 0.2154986560344696, "kl": 0.15322324633598328, "learning_rate": 3e-06, "loss": 0.0158, "step": 2237 }, { "clip_ratio": 0.0006642429507337511, "epoch": 0.006213249379507937, "grad_norm": 0.0987052395939827, "kl": 0.15503258258104324, "learning_rate": 3e-06, "loss": 0.0166, "step": 2238 }, { "clip_ratio": 8.722958591533825e-05, "epoch": 0.006216025630347753, "grad_norm": 0.10490848124027252, "kl": 0.154758021235466, "learning_rate": 3e-06, "loss": 0.0157, "step": 2239 }, { "clip_ratio": 0.0, "epoch": 0.0062188018811875695, "grad_norm": 0.1208440288901329, "kl": 0.16184867918491364, "learning_rate": 3e-06, "loss": 0.016, "step": 2240 }, { "clip_ratio": 0.0002552756923250854, "epoch": 0.006221578132027385, "grad_norm": 0.09863177686929703, "kl": 0.16634425520896912, "learning_rate": 3e-06, "loss": 0.0159, "step": 2241 }, { "clip_ratio": 0.00017934454808710143, "epoch": 0.006224354382867201, "grad_norm": 0.11939014494419098, "kl": 0.16019989550113678, "learning_rate": 3e-06, "loss": 0.0145, "step": 2242 }, { "clip_ratio": 0.0, "epoch": 0.0062271306337070165, "grad_norm": 0.16073253750801086, "kl": 0.16025684773921967, "learning_rate": 3e-06, "loss": 0.0131, "step": 2243 }, { "clip_ratio": 0.0008758519834373146, "epoch": 0.006229906884546833, "grad_norm": 0.11048475652933121, "kl": 0.1635778620839119, "learning_rate": 3e-06, "loss": 0.0148, "step": 2244 }, { "clip_ratio": 8.90313385752961e-05, "completion_length": 217.1041717529297, "epoch": 0.006232683135386648, "grad_norm": 0.1347557008266449, "kl": 0.16955012828111649, "learning_rate": 3e-06, "loss": 0.0281, "reward": 0.35625001788139343, "reward_std": 0.3819515109062195, "rewards/countdown_reward_func": 0.35625001043081284, "step": 2245, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.204712841892615e-05, "epoch": 0.0062354593862264645, "grad_norm": 0.1472625583410263, "kl": 0.16981971263885498, "learning_rate": 3e-06, "loss": 0.0262, "step": 2246 }, { "clip_ratio": 0.0001900863426271826, "epoch": 0.00623823563706628, "grad_norm": 0.11650460213422775, "kl": 0.16689752787351608, "learning_rate": 3e-06, "loss": 0.0278, "step": 2247 }, { "clip_ratio": 0.0001804056082619354, "epoch": 0.006241011887906096, "grad_norm": 0.13127779960632324, "kl": 0.1765117198228836, "learning_rate": 3e-06, "loss": 0.028, "step": 2248 }, { "clip_ratio": 0.0005522827850654721, "epoch": 0.006243788138745912, "grad_norm": 0.12236137688159943, "kl": 0.16286692768335342, "learning_rate": 3e-06, "loss": 0.0261, "step": 2249 }, { "clip_ratio": 0.00018342139810556546, "epoch": 0.006246564389585728, "grad_norm": 0.13048216700553894, "kl": 0.16761357337236404, "learning_rate": 3e-06, "loss": 0.0269, "step": 2250 }, { "clip_ratio": 0.000550264201592654, "epoch": 0.006249340640425544, "grad_norm": 0.13221754133701324, "kl": 0.18436182290315628, "learning_rate": 3e-06, "loss": 0.0251, "step": 2251 }, { "clip_ratio": 0.0, "epoch": 0.0062521168912653595, "grad_norm": 0.14144454896450043, "kl": 0.18143575638532639, "learning_rate": 3e-06, "loss": 0.0239, "step": 2252 }, { "clip_ratio": 0.00033115307451225817, "epoch": 0.006254893142105176, "grad_norm": 0.13937947154045105, "kl": 0.18208129703998566, "learning_rate": 3e-06, "loss": 0.0254, "step": 2253 }, { "clip_ratio": 0.00037381629226729274, "epoch": 0.006257669392944991, "grad_norm": 0.11771649122238159, "kl": 0.1911814734339714, "learning_rate": 3e-06, "loss": 0.0255, "step": 2254 }, { "clip_ratio": 0.000840340624563396, "epoch": 0.0062604456437848075, "grad_norm": 0.11015734076499939, "kl": 0.17935297638177872, "learning_rate": 3e-06, "loss": 0.0235, "step": 2255 }, { "clip_ratio": 0.0008332670404342934, "epoch": 0.006263221894624623, "grad_norm": 0.12103752791881561, "kl": 0.180360309779644, "learning_rate": 3e-06, "loss": 0.0231, "step": 2256 }, { "clip_ratio": 0.0001020408162730746, "completion_length": 203.4166717529297, "epoch": 0.006265998145464439, "grad_norm": 0.1085449829697609, "kl": 0.178614042699337, "learning_rate": 3e-06, "loss": 0.0057, "reward": 0.39791668951511383, "reward_std": 0.32604455202817917, "rewards/countdown_reward_func": 0.39791667461395264, "step": 2257, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.000700604279700201, "epoch": 0.0062687743963042546, "grad_norm": 0.12487374991178513, "kl": 0.18157772719860077, "learning_rate": 3e-06, "loss": 0.0065, "step": 2258 }, { "clip_ratio": 0.0005079238399048336, "epoch": 0.006271550647144071, "grad_norm": 0.16143116354942322, "kl": 0.19268468022346497, "learning_rate": 3e-06, "loss": 0.0063, "step": 2259 }, { "clip_ratio": 9.97605748125352e-05, "epoch": 0.006274326897983886, "grad_norm": 0.12245763093233109, "kl": 0.1898026168346405, "learning_rate": 3e-06, "loss": 0.0064, "step": 2260 }, { "clip_ratio": 0.0, "epoch": 0.0062771031488237025, "grad_norm": 0.11813352257013321, "kl": 0.18366330116987228, "learning_rate": 3e-06, "loss": 0.0053, "step": 2261 }, { "clip_ratio": 0.0006266141863306984, "epoch": 0.006279879399663519, "grad_norm": 0.2579954266548157, "kl": 0.18229518085718155, "learning_rate": 3e-06, "loss": 0.0059, "step": 2262 }, { "clip_ratio": 0.0, "epoch": 0.006282655650503334, "grad_norm": 0.11579995602369308, "kl": 0.1908181607723236, "learning_rate": 3e-06, "loss": 0.0042, "step": 2263 }, { "clip_ratio": 0.0006184200756251812, "epoch": 0.0062854319013431505, "grad_norm": 0.1960228681564331, "kl": 0.1934300884604454, "learning_rate": 3e-06, "loss": 0.0054, "step": 2264 }, { "clip_ratio": 0.0012736761127598584, "epoch": 0.006288208152182966, "grad_norm": 0.12140205502510071, "kl": 0.20321929454803467, "learning_rate": 3e-06, "loss": 0.0057, "step": 2265 }, { "clip_ratio": 0.000426823906309437, "epoch": 0.006290984403022782, "grad_norm": 0.1191827654838562, "kl": 0.1980365291237831, "learning_rate": 3e-06, "loss": 0.0048, "step": 2266 }, { "clip_ratio": 0.0, "epoch": 0.0062937606538625976, "grad_norm": 0.1335882693529129, "kl": 0.1887526884675026, "learning_rate": 3e-06, "loss": 0.004, "step": 2267 }, { "clip_ratio": 0.0004993882175767794, "epoch": 0.006296536904702414, "grad_norm": 0.11730194836854935, "kl": 0.18969019502401352, "learning_rate": 3e-06, "loss": 0.0042, "step": 2268 }, { "clip_ratio": 0.00018388705211691558, "completion_length": 227.70833587646484, "epoch": 0.006299313155542229, "grad_norm": 0.12374776601791382, "kl": 0.22037464380264282, "learning_rate": 3e-06, "loss": -0.0004, "reward": 0.2875000163912773, "reward_std": 0.2661401182413101, "rewards/countdown_reward_func": 0.2875000014901161, "step": 2269, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00017997733084484935, "epoch": 0.0063020894063820455, "grad_norm": 0.13824217021465302, "kl": 0.2126154974102974, "learning_rate": 3e-06, "loss": -0.001, "step": 2270 }, { "clip_ratio": 8.802816591924056e-05, "epoch": 0.006304865657221861, "grad_norm": 0.18445436656475067, "kl": 0.2134125828742981, "learning_rate": 3e-06, "loss": -0.0016, "step": 2271 }, { "clip_ratio": 0.0001707650226308033, "epoch": 0.006307641908061677, "grad_norm": 0.13947488367557526, "kl": 0.2060968428850174, "learning_rate": 3e-06, "loss": -0.0013, "step": 2272 }, { "clip_ratio": 0.00017788648256100714, "epoch": 0.0063104181589014935, "grad_norm": 0.13269934058189392, "kl": 0.20021595805883408, "learning_rate": 3e-06, "loss": -0.0005, "step": 2273 }, { "clip_ratio": 0.0003868369967676699, "epoch": 0.006313194409741309, "grad_norm": 0.13080525398254395, "kl": 0.20501340180635452, "learning_rate": 3e-06, "loss": -0.0021, "step": 2274 }, { "clip_ratio": 0.0005433336482383311, "epoch": 0.006315970660581125, "grad_norm": 0.11887697875499725, "kl": 0.19895461946725845, "learning_rate": 3e-06, "loss": -0.0025, "step": 2275 }, { "clip_ratio": 0.0019091092544840649, "epoch": 0.0063187469114209405, "grad_norm": 0.13246318697929382, "kl": 0.18607723712921143, "learning_rate": 3e-06, "loss": -0.003, "step": 2276 }, { "clip_ratio": 0.0005245065985945985, "epoch": 0.006321523162260757, "grad_norm": 0.13839560747146606, "kl": 0.18259260058403015, "learning_rate": 3e-06, "loss": -0.0052, "step": 2277 }, { "clip_ratio": 0.001069007470505312, "epoch": 0.006324299413100572, "grad_norm": 0.126378133893013, "kl": 0.17721717059612274, "learning_rate": 3e-06, "loss": -0.0051, "step": 2278 }, { "clip_ratio": 0.0020173885859549046, "epoch": 0.0063270756639403885, "grad_norm": 0.13504590094089508, "kl": 0.16881190985441208, "learning_rate": 3e-06, "loss": -0.0043, "step": 2279 }, { "clip_ratio": 0.000982351542916149, "epoch": 0.006329851914780204, "grad_norm": 0.13827833533287048, "kl": 0.16988541185855865, "learning_rate": 3e-06, "loss": -0.0052, "step": 2280 }, { "clip_ratio": 9.057971328729764e-05, "completion_length": 228.64583587646484, "epoch": 0.00633262816562002, "grad_norm": 0.12721112370491028, "kl": 0.1614338904619217, "learning_rate": 3e-06, "loss": 0.053, "reward": 0.36250002682209015, "reward_std": 0.33128294348716736, "rewards/countdown_reward_func": 0.36250001192092896, "step": 2281, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0003709889715537429, "epoch": 0.006335404416459836, "grad_norm": 0.20156148076057434, "kl": 0.16206463426351547, "learning_rate": 3e-06, "loss": 0.0541, "step": 2282 }, { "clip_ratio": 0.00019021370098926127, "epoch": 0.006338180667299652, "grad_norm": 0.16486519575119019, "kl": 0.15344135463237762, "learning_rate": 3e-06, "loss": 0.0543, "step": 2283 }, { "clip_ratio": 0.0008641848689876497, "epoch": 0.006340956918139468, "grad_norm": 0.16734211146831512, "kl": 0.15662110596895218, "learning_rate": 3e-06, "loss": 0.0541, "step": 2284 }, { "clip_ratio": 0.00026667342899600044, "epoch": 0.0063437331689792835, "grad_norm": 0.14003409445285797, "kl": 0.1625654399394989, "learning_rate": 3e-06, "loss": 0.0546, "step": 2285 }, { "clip_ratio": 8.239947055699304e-05, "epoch": 0.0063465094198191, "grad_norm": 0.14652083814144135, "kl": 0.15703174471855164, "learning_rate": 3e-06, "loss": 0.0539, "step": 2286 }, { "clip_ratio": 8.239947055699304e-05, "epoch": 0.006349285670658915, "grad_norm": 0.14092198014259338, "kl": 0.156203031539917, "learning_rate": 3e-06, "loss": 0.0527, "step": 2287 }, { "clip_ratio": 0.00041524558037053794, "epoch": 0.0063520619214987315, "grad_norm": 0.19670848548412323, "kl": 0.16062773764133453, "learning_rate": 3e-06, "loss": 0.0533, "step": 2288 }, { "clip_ratio": 0.0005093724466860294, "epoch": 0.006354838172338547, "grad_norm": 0.15894927084445953, "kl": 0.15653447806835175, "learning_rate": 3e-06, "loss": 0.0521, "step": 2289 }, { "clip_ratio": 0.0009049937652889639, "epoch": 0.006357614423178363, "grad_norm": 0.16536635160446167, "kl": 0.1637991964817047, "learning_rate": 3e-06, "loss": 0.051, "step": 2290 }, { "clip_ratio": 0.00036807394644711167, "epoch": 0.0063603906740181786, "grad_norm": 0.13223929703235626, "kl": 0.17576530575752258, "learning_rate": 3e-06, "loss": 0.052, "step": 2291 }, { "clip_ratio": 0.00010237510286970064, "epoch": 0.006363166924857995, "grad_norm": 0.13640287518501282, "kl": 0.16830559074878693, "learning_rate": 3e-06, "loss": 0.05, "step": 2292 }, { "clip_ratio": 0.0002590137009974569, "completion_length": 232.68750762939453, "epoch": 0.00636594317569781, "grad_norm": 0.0946778878569603, "kl": 0.18598677217960358, "learning_rate": 3e-06, "loss": 0.0195, "reward": 0.24583332985639572, "reward_std": 0.2609790861606598, "rewards/countdown_reward_func": 0.24583332985639572, "step": 2293, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0063687194265376265, "grad_norm": 0.0930834412574768, "kl": 0.18177608400583267, "learning_rate": 3e-06, "loss": 0.0184, "step": 2294 }, { "clip_ratio": 8.417508070124313e-05, "epoch": 0.006371495677377443, "grad_norm": 0.26509028673171997, "kl": 0.18570785969495773, "learning_rate": 3e-06, "loss": 0.0183, "step": 2295 }, { "clip_ratio": 0.0, "epoch": 0.006374271928217258, "grad_norm": 0.12472037225961685, "kl": 0.19241143763065338, "learning_rate": 3e-06, "loss": 0.0195, "step": 2296 }, { "clip_ratio": 0.0006900840235175565, "epoch": 0.0063770481790570745, "grad_norm": 0.1127580925822258, "kl": 0.19401460886001587, "learning_rate": 3e-06, "loss": 0.0185, "step": 2297 }, { "clip_ratio": 0.0001705010508885607, "epoch": 0.00637982442989689, "grad_norm": 0.1149328202009201, "kl": 0.20228491723537445, "learning_rate": 3e-06, "loss": 0.0181, "step": 2298 }, { "clip_ratio": 0.00026395946042612195, "epoch": 0.006382600680736706, "grad_norm": 0.09492156654596329, "kl": 0.21177734434604645, "learning_rate": 3e-06, "loss": 0.0185, "step": 2299 }, { "clip_ratio": 0.00018846639432013035, "epoch": 0.0063853769315765216, "grad_norm": 0.09327538311481476, "kl": 0.20418494194746017, "learning_rate": 3e-06, "loss": 0.0178, "step": 2300 }, { "clip_ratio": 0.00017548260802868754, "epoch": 0.006388153182416338, "grad_norm": 0.11833266168832779, "kl": 0.2049153670668602, "learning_rate": 3e-06, "loss": 0.0164, "step": 2301 }, { "clip_ratio": 0.00044657984835794196, "epoch": 0.006390929433256153, "grad_norm": 0.1208672747015953, "kl": 0.20857040584087372, "learning_rate": 3e-06, "loss": 0.018, "step": 2302 }, { "clip_ratio": 0.001321177463978529, "epoch": 0.0063937056840959695, "grad_norm": 0.09639300405979156, "kl": 0.20693976432085037, "learning_rate": 3e-06, "loss": 0.0173, "step": 2303 }, { "clip_ratio": 0.0, "epoch": 0.006396481934935785, "grad_norm": 0.1254264861345291, "kl": 0.21083715558052063, "learning_rate": 3e-06, "loss": 0.0165, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 219.6666717529297, "epoch": 0.006399258185775601, "grad_norm": 0.20140250027179718, "kl": 0.22675880044698715, "learning_rate": 3e-06, "loss": 0.0569, "reward": 0.4375000298023224, "reward_std": 0.4317815750837326, "rewards/countdown_reward_func": 0.4375000149011612, "step": 2305, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.0, "epoch": 0.0064020344366154175, "grad_norm": 0.164041668176651, "kl": 0.22609523683786392, "learning_rate": 3e-06, "loss": 0.057, "step": 2306 }, { "clip_ratio": 0.0010027538519352674, "epoch": 0.006404810687455233, "grad_norm": 0.2000647336244583, "kl": 0.20753808319568634, "learning_rate": 3e-06, "loss": 0.0568, "step": 2307 }, { "clip_ratio": 0.00016891522682271898, "epoch": 0.006407586938295049, "grad_norm": 0.16357851028442383, "kl": 0.23235772550106049, "learning_rate": 3e-06, "loss": 0.0562, "step": 2308 }, { "clip_ratio": 8.941345004132017e-05, "epoch": 0.0064103631891348645, "grad_norm": 0.18721310794353485, "kl": 0.23869449645280838, "learning_rate": 3e-06, "loss": 0.055, "step": 2309 }, { "clip_ratio": 0.0, "epoch": 0.006413139439974681, "grad_norm": 0.1559915840625763, "kl": 0.26671281456947327, "learning_rate": 3e-06, "loss": 0.0561, "step": 2310 }, { "clip_ratio": 0.00017694846610538661, "epoch": 0.006415915690814496, "grad_norm": 0.15243524312973022, "kl": 0.25702714920043945, "learning_rate": 3e-06, "loss": 0.0538, "step": 2311 }, { "clip_ratio": 8.999279816634953e-05, "epoch": 0.0064186919416543125, "grad_norm": 0.1509992778301239, "kl": 0.26283329725265503, "learning_rate": 3e-06, "loss": 0.0525, "step": 2312 }, { "clip_ratio": 9.968101949198171e-05, "epoch": 0.006421468192494128, "grad_norm": 0.15100322663784027, "kl": 0.24412598460912704, "learning_rate": 3e-06, "loss": 0.0516, "step": 2313 }, { "clip_ratio": 0.000363269035005942, "epoch": 0.006424244443333944, "grad_norm": 0.1439571976661682, "kl": 0.2789086848497391, "learning_rate": 3e-06, "loss": 0.0529, "step": 2314 }, { "clip_ratio": 0.00019936203898396343, "epoch": 0.00642702069417376, "grad_norm": 0.15630877017974854, "kl": 0.29722483456134796, "learning_rate": 3e-06, "loss": 0.0506, "step": 2315 }, { "clip_ratio": 0.00027720882644644007, "epoch": 0.006429796945013576, "grad_norm": 0.15677094459533691, "kl": 0.32308197021484375, "learning_rate": 3e-06, "loss": 0.0526, "step": 2316 }, { "clip_ratio": 0.0004198152746539563, "completion_length": 203.25, "epoch": 0.006432573195853392, "grad_norm": 0.12152662873268127, "kl": 0.27852681279182434, "learning_rate": 3e-06, "loss": 0.0191, "reward": 0.3812500238418579, "reward_std": 0.37016279995441437, "rewards/countdown_reward_func": 0.3812499940395355, "step": 2317, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00020843063975917175, "epoch": 0.0064353494466932075, "grad_norm": 0.139579176902771, "kl": 0.3006761074066162, "learning_rate": 3e-06, "loss": 0.019, "step": 2318 }, { "clip_ratio": 0.0, "epoch": 0.006438125697533024, "grad_norm": 0.14806227385997772, "kl": 0.3408869802951813, "learning_rate": 3e-06, "loss": 0.0213, "step": 2319 }, { "clip_ratio": 0.00039015276706777513, "epoch": 0.006440901948372839, "grad_norm": 0.17348290979862213, "kl": 0.34244829416275024, "learning_rate": 3e-06, "loss": 0.0215, "step": 2320 }, { "clip_ratio": 0.000316114688757807, "epoch": 0.0064436781992126555, "grad_norm": 0.15013259649276733, "kl": 0.34295883774757385, "learning_rate": 3e-06, "loss": 0.022, "step": 2321 }, { "clip_ratio": 0.0004764232726301998, "epoch": 0.006446454450052471, "grad_norm": 0.2142786681652069, "kl": 0.36009830236434937, "learning_rate": 3e-06, "loss": 0.023, "step": 2322 }, { "clip_ratio": 0.00029448137502186, "epoch": 0.006449230700892287, "grad_norm": 0.12087388336658478, "kl": 0.30946898460388184, "learning_rate": 3e-06, "loss": 0.0178, "step": 2323 }, { "clip_ratio": 0.00010347682109568268, "epoch": 0.0064520069517321026, "grad_norm": 0.15301726758480072, "kl": 0.3245188295841217, "learning_rate": 3e-06, "loss": 0.0184, "step": 2324 }, { "clip_ratio": 0.00010495381866348907, "epoch": 0.006454783202571919, "grad_norm": 0.15143437683582306, "kl": 0.3580729514360428, "learning_rate": 3e-06, "loss": 0.0202, "step": 2325 }, { "clip_ratio": 0.000572390272282064, "epoch": 0.006457559453411734, "grad_norm": 0.17135295271873474, "kl": 0.35064497590065, "learning_rate": 3e-06, "loss": 0.0181, "step": 2326 }, { "clip_ratio": 0.00043031001405324787, "epoch": 0.0064603357042515505, "grad_norm": 0.14325761795043945, "kl": 0.33868308365345, "learning_rate": 3e-06, "loss": 0.0201, "step": 2327 }, { "clip_ratio": 0.0015213553560897708, "epoch": 0.006463111955091367, "grad_norm": 0.158623605966568, "kl": 0.3448447734117508, "learning_rate": 3e-06, "loss": 0.022, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 203.6666717529297, "epoch": 0.006465888205931182, "grad_norm": 0.17202085256576538, "kl": 0.2865896373987198, "learning_rate": 3e-06, "loss": 0.0367, "reward": 0.4541666954755783, "reward_std": 0.3947114050388336, "rewards/countdown_reward_func": 0.4541666954755783, "step": 2329, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.904913167702034e-05, "epoch": 0.0064686644567709985, "grad_norm": 0.14967620372772217, "kl": 0.2809770703315735, "learning_rate": 3e-06, "loss": 0.0355, "step": 2330 }, { "clip_ratio": 0.0, "epoch": 0.006471440707610814, "grad_norm": 0.1649092584848404, "kl": 0.2993633449077606, "learning_rate": 3e-06, "loss": 0.0373, "step": 2331 }, { "clip_ratio": 0.00019977435295004398, "epoch": 0.00647421695845063, "grad_norm": 0.1611592024564743, "kl": 0.2826383709907532, "learning_rate": 3e-06, "loss": 0.0346, "step": 2332 }, { "clip_ratio": 0.0, "epoch": 0.0064769932092904456, "grad_norm": 0.13679081201553345, "kl": 0.28105440735816956, "learning_rate": 3e-06, "loss": 0.0359, "step": 2333 }, { "clip_ratio": 0.0002023549168370664, "epoch": 0.006479769460130262, "grad_norm": 0.15343226492404938, "kl": 0.2757682204246521, "learning_rate": 3e-06, "loss": 0.0357, "step": 2334 }, { "clip_ratio": 0.0, "epoch": 0.006482545710970077, "grad_norm": 0.17491856217384338, "kl": 0.2747037261724472, "learning_rate": 3e-06, "loss": 0.0344, "step": 2335 }, { "clip_ratio": 0.00014551804633811116, "epoch": 0.0064853219618098935, "grad_norm": 0.14823581278324127, "kl": 0.2763662040233612, "learning_rate": 3e-06, "loss": 0.0339, "step": 2336 }, { "clip_ratio": 9.391435014549643e-05, "epoch": 0.006488098212649709, "grad_norm": 0.15623505413532257, "kl": 0.30250681936740875, "learning_rate": 3e-06, "loss": 0.0355, "step": 2337 }, { "clip_ratio": 0.00010330578516004607, "epoch": 0.006490874463489525, "grad_norm": 0.15354587137699127, "kl": 0.2839023768901825, "learning_rate": 3e-06, "loss": 0.0342, "step": 2338 }, { "clip_ratio": 0.00020746888185385615, "epoch": 0.0064936507143293415, "grad_norm": 0.14572341740131378, "kl": 0.2831754833459854, "learning_rate": 3e-06, "loss": 0.0341, "step": 2339 }, { "clip_ratio": 9.391435014549643e-05, "epoch": 0.006496426965169157, "grad_norm": 0.1376647800207138, "kl": 0.28501126170158386, "learning_rate": 3e-06, "loss": 0.0341, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 215.5416717529297, "epoch": 0.006499203216008973, "grad_norm": 0.2073156237602234, "kl": 0.3356934189796448, "learning_rate": 3e-06, "loss": 0.0187, "reward": 0.40000003576278687, "reward_std": 0.2904737517237663, "rewards/countdown_reward_func": 0.4000000059604645, "step": 2341, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0065019794668487885, "grad_norm": 0.14323925971984863, "kl": 0.322573721408844, "learning_rate": 3e-06, "loss": 0.0175, "step": 2342 }, { "clip_ratio": 0.00025773196830414236, "epoch": 0.006504755717688605, "grad_norm": 0.3021996021270752, "kl": 0.3119508624076843, "learning_rate": 3e-06, "loss": 0.0173, "step": 2343 }, { "clip_ratio": 0.0001781615719664842, "epoch": 0.00650753196852842, "grad_norm": 0.15104366838932037, "kl": 0.31141962110996246, "learning_rate": 3e-06, "loss": 0.0174, "step": 2344 }, { "clip_ratio": 0.0002609418734209612, "epoch": 0.0065103082193682365, "grad_norm": 0.1799515038728714, "kl": 0.3139701336622238, "learning_rate": 3e-06, "loss": 0.0174, "step": 2345 }, { "clip_ratio": 0.0003696906060213223, "epoch": 0.006513084470208052, "grad_norm": 0.15009325742721558, "kl": 0.29629264771938324, "learning_rate": 3e-06, "loss": 0.0184, "step": 2346 }, { "clip_ratio": 0.00026359809999121353, "epoch": 0.006515860721047868, "grad_norm": 0.20159399509429932, "kl": 0.30615514516830444, "learning_rate": 3e-06, "loss": 0.016, "step": 2347 }, { "clip_ratio": 8.591065125074238e-05, "epoch": 0.006518636971887684, "grad_norm": 0.15064962208271027, "kl": 0.28726859390735626, "learning_rate": 3e-06, "loss": 0.0158, "step": 2348 }, { "clip_ratio": 0.00031818235584069043, "epoch": 0.0065214132227275, "grad_norm": 0.15885993838310242, "kl": 0.2687215358018875, "learning_rate": 3e-06, "loss": 0.0144, "step": 2349 }, { "clip_ratio": 0.0012777996016666293, "epoch": 0.006524189473567316, "grad_norm": 0.15064121782779694, "kl": 0.26052138954401016, "learning_rate": 3e-06, "loss": 0.0151, "step": 2350 }, { "clip_ratio": 0.000863487075548619, "epoch": 0.0065269657244071315, "grad_norm": 0.17315149307250977, "kl": 0.2593861371278763, "learning_rate": 3e-06, "loss": 0.0123, "step": 2351 }, { "clip_ratio": 0.001552396803162992, "epoch": 0.006529741975246948, "grad_norm": 0.13436666131019592, "kl": 0.24121838808059692, "learning_rate": 3e-06, "loss": 0.0142, "step": 2352 }, { "clip_ratio": 0.00013102724915370345, "completion_length": 208.56250762939453, "epoch": 0.006532518226086763, "grad_norm": 0.16845515370368958, "kl": 0.2573952376842499, "learning_rate": 3e-06, "loss": 0.0076, "reward": 0.2854166701436043, "reward_std": 0.24690960347652435, "rewards/countdown_reward_func": 0.2854166701436043, "step": 2353, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00021652656141668558, "epoch": 0.0065352944769265795, "grad_norm": 0.37916499376296997, "kl": 0.2523314207792282, "learning_rate": 3e-06, "loss": 0.0068, "step": 2354 }, { "clip_ratio": 9.293680341215804e-05, "epoch": 0.006538070727766395, "grad_norm": 0.16434593498706818, "kl": 0.24893923103809357, "learning_rate": 3e-06, "loss": 0.006, "step": 2355 }, { "clip_ratio": 0.0, "epoch": 0.006540846978606211, "grad_norm": 0.125566303730011, "kl": 0.23655416816473007, "learning_rate": 3e-06, "loss": 0.0069, "step": 2356 }, { "clip_ratio": 0.00024826216395013034, "epoch": 0.0065436232294460266, "grad_norm": 0.130938321352005, "kl": 0.220244362950325, "learning_rate": 3e-06, "loss": 0.0047, "step": 2357 }, { "clip_ratio": 0.00016687953029759228, "epoch": 0.006546399480285843, "grad_norm": 0.18719619512557983, "kl": 0.2183583378791809, "learning_rate": 3e-06, "loss": 0.0042, "step": 2358 }, { "clip_ratio": 0.0015653396840207279, "epoch": 0.006549175731125658, "grad_norm": 0.16384494304656982, "kl": 0.20466335862874985, "learning_rate": 3e-06, "loss": 0.0031, "step": 2359 }, { "clip_ratio": 0.0020808427361771464, "epoch": 0.0065519519819654745, "grad_norm": 0.1263015866279602, "kl": 0.1988421380519867, "learning_rate": 3e-06, "loss": 0.0029, "step": 2360 }, { "clip_ratio": 0.0030670628184452653, "epoch": 0.006554728232805291, "grad_norm": 0.13618893921375275, "kl": 0.20048832148313522, "learning_rate": 3e-06, "loss": 0.0016, "step": 2361 }, { "clip_ratio": 0.004884407157078385, "epoch": 0.006557504483645106, "grad_norm": 0.12469899654388428, "kl": 0.18725426495075226, "learning_rate": 3e-06, "loss": 0.0037, "step": 2362 }, { "clip_ratio": 0.006509467493742704, "epoch": 0.0065602807344849225, "grad_norm": 0.11395581811666489, "kl": 0.17675061523914337, "learning_rate": 3e-06, "loss": 0.0023, "step": 2363 }, { "clip_ratio": 0.010824932716786861, "epoch": 0.006563056985324738, "grad_norm": 0.14886018633842468, "kl": 0.17395079135894775, "learning_rate": 3e-06, "loss": -0.0004, "step": 2364 }, { "clip_ratio": 0.00041061273077502847, "completion_length": 225.1041717529297, "epoch": 0.006565833236164554, "grad_norm": 0.10056737810373306, "kl": 0.14540190249681473, "learning_rate": 3e-06, "loss": 0.0074, "reward": 0.3770833760499954, "reward_std": 0.3362167477607727, "rewards/countdown_reward_func": 0.37708336114883423, "step": 2365, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0065686094870043696, "grad_norm": 0.11986743658781052, "kl": 0.14874933660030365, "learning_rate": 3e-06, "loss": 0.0078, "step": 2366 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.006571385737844186, "grad_norm": 0.13911956548690796, "kl": 0.1324518918991089, "learning_rate": 3e-06, "loss": 0.0065, "step": 2367 }, { "clip_ratio": 0.00030378723749890924, "epoch": 0.006574161988684001, "grad_norm": 0.12894724309444427, "kl": 0.1367444545030594, "learning_rate": 3e-06, "loss": 0.0075, "step": 2368 }, { "clip_ratio": 0.0003762036649277434, "epoch": 0.0065769382395238175, "grad_norm": 0.10234810411930084, "kl": 0.1384653076529503, "learning_rate": 3e-06, "loss": 0.0064, "step": 2369 }, { "clip_ratio": 0.0003675257903523743, "epoch": 0.006579714490363633, "grad_norm": 0.11732344329357147, "kl": 0.12812578678131104, "learning_rate": 3e-06, "loss": 0.0075, "step": 2370 }, { "clip_ratio": 0.0004069010537932627, "epoch": 0.006582490741203449, "grad_norm": 0.09635314345359802, "kl": 0.1304508000612259, "learning_rate": 3e-06, "loss": 0.0069, "step": 2371 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0065852669920432655, "grad_norm": 0.11662594228982925, "kl": 0.13656722754240036, "learning_rate": 3e-06, "loss": 0.0065, "step": 2372 }, { "clip_ratio": 0.0008264775387942791, "epoch": 0.006588043242883081, "grad_norm": 0.11650729924440384, "kl": 0.12324058637022972, "learning_rate": 3e-06, "loss": 0.0059, "step": 2373 }, { "clip_ratio": 0.0005027086590416729, "epoch": 0.006590819493722897, "grad_norm": 0.12746286392211914, "kl": 0.12780633196234703, "learning_rate": 3e-06, "loss": 0.0055, "step": 2374 }, { "clip_ratio": 0.00035271809611003846, "epoch": 0.0065935957445627125, "grad_norm": 0.10422283411026001, "kl": 0.13124725595116615, "learning_rate": 3e-06, "loss": 0.0072, "step": 2375 }, { "clip_ratio": 0.0005158810163266025, "epoch": 0.006596371995402529, "grad_norm": 0.11545603722333908, "kl": 0.12114672735333443, "learning_rate": 3e-06, "loss": 0.0072, "step": 2376 }, { "clip_ratio": 0.00017926588043337688, "completion_length": 227.45834350585938, "epoch": 0.006599148246242344, "grad_norm": 0.09546420723199844, "kl": 0.1200266070663929, "learning_rate": 3e-06, "loss": 0.0108, "reward": 0.37708333134651184, "reward_std": 0.30922502279281616, "rewards/countdown_reward_func": 0.37708331644535065, "step": 2377, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0066019244970821605, "grad_norm": 0.11293301731348038, "kl": 0.11872411891818047, "learning_rate": 3e-06, "loss": 0.0105, "step": 2378 }, { "clip_ratio": 0.00032814600854180753, "epoch": 0.006604700747921976, "grad_norm": 0.10137040168046951, "kl": 0.1121567115187645, "learning_rate": 3e-06, "loss": 0.011, "step": 2379 }, { "clip_ratio": 0.0, "epoch": 0.006607476998761792, "grad_norm": 0.12760894000530243, "kl": 0.11916434392333031, "learning_rate": 3e-06, "loss": 0.0107, "step": 2380 }, { "clip_ratio": 0.0, "epoch": 0.006610253249601608, "grad_norm": 0.13065321743488312, "kl": 0.11660314351320267, "learning_rate": 3e-06, "loss": 0.0105, "step": 2381 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.006613029500441424, "grad_norm": 0.13641607761383057, "kl": 0.11466880142688751, "learning_rate": 3e-06, "loss": 0.0102, "step": 2382 }, { "clip_ratio": 0.0, "epoch": 0.00661580575128124, "grad_norm": 0.09623085707426071, "kl": 0.11718219518661499, "learning_rate": 3e-06, "loss": 0.0105, "step": 2383 }, { "clip_ratio": 0.0006848488337709568, "epoch": 0.0066185820021210555, "grad_norm": 0.10395172983407974, "kl": 0.11643769219517708, "learning_rate": 3e-06, "loss": 0.0096, "step": 2384 }, { "clip_ratio": 0.0002559744389145635, "epoch": 0.006621358252960872, "grad_norm": 0.08755200356245041, "kl": 0.10955745726823807, "learning_rate": 3e-06, "loss": 0.0094, "step": 2385 }, { "clip_ratio": 0.0, "epoch": 0.006624134503800687, "grad_norm": 0.09580729156732559, "kl": 0.11723517999053001, "learning_rate": 3e-06, "loss": 0.0107, "step": 2386 }, { "clip_ratio": 0.0, "epoch": 0.0066269107546405035, "grad_norm": 0.11413105577230453, "kl": 0.11416671797633171, "learning_rate": 3e-06, "loss": 0.0096, "step": 2387 }, { "clip_ratio": 0.00010495381866348907, "epoch": 0.006629687005480319, "grad_norm": 0.14028304815292358, "kl": 0.11132171005010605, "learning_rate": 3e-06, "loss": 0.0099, "step": 2388 }, { "clip_ratio": 0.0007585418788949028, "completion_length": 243.0, "epoch": 0.006632463256320135, "grad_norm": 0.12180110812187195, "kl": 0.11260923743247986, "learning_rate": 3e-06, "loss": 0.0024, "reward": 0.3229166716337204, "reward_std": 0.3841392993927002, "rewards/countdown_reward_func": 0.3229166716337204, "step": 2389, "zero_std_ratio": 0.125 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0066352395071599506, "grad_norm": 0.10579289495944977, "kl": 0.12057583779096603, "learning_rate": 3e-06, "loss": 0.0038, "step": 2390 }, { "clip_ratio": 0.00044376778532750905, "epoch": 0.006638015757999767, "grad_norm": 0.1263391077518463, "kl": 0.11325542628765106, "learning_rate": 3e-06, "loss": 0.0029, "step": 2391 }, { "clip_ratio": 0.00017295530415140092, "epoch": 0.006640792008839583, "grad_norm": 0.11693087965250015, "kl": 0.1094004213809967, "learning_rate": 3e-06, "loss": 0.0033, "step": 2392 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0066435682596793985, "grad_norm": 0.09953317046165466, "kl": 0.11280214041471481, "learning_rate": 3e-06, "loss": 0.0032, "step": 2393 }, { "clip_ratio": 0.00016556291666347533, "epoch": 0.006646344510519215, "grad_norm": 0.09708958119153976, "kl": 0.10897836834192276, "learning_rate": 3e-06, "loss": 0.0026, "step": 2394 }, { "clip_ratio": 0.00018115942657459527, "epoch": 0.00664912076135903, "grad_norm": 0.12472333759069443, "kl": 0.10994191840291023, "learning_rate": 3e-06, "loss": 0.0021, "step": 2395 }, { "clip_ratio": 0.0002494906075298786, "epoch": 0.0066518970121988465, "grad_norm": 0.10897238552570343, "kl": 0.1151459850370884, "learning_rate": 3e-06, "loss": 0.0034, "step": 2396 }, { "clip_ratio": 0.0005148796408320777, "epoch": 0.006654673263038662, "grad_norm": 0.12806881964206696, "kl": 0.10942655429244041, "learning_rate": 3e-06, "loss": 0.0017, "step": 2397 }, { "clip_ratio": 0.000273729907348752, "epoch": 0.006657449513878478, "grad_norm": 0.09647449851036072, "kl": 0.1066247895359993, "learning_rate": 3e-06, "loss": 0.003, "step": 2398 }, { "clip_ratio": 8.394895849050954e-05, "epoch": 0.0066602257647182935, "grad_norm": 0.11547134071588516, "kl": 0.10796680673956871, "learning_rate": 3e-06, "loss": 0.002, "step": 2399 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00666300201555811, "grad_norm": 0.10167829692363739, "kl": 0.10324173793196678, "learning_rate": 3e-06, "loss": 0.0014, "step": 2400 }, { "clip_ratio": 0.001098598149837926, "completion_length": 229.6041717529297, "epoch": 0.006665778266397925, "grad_norm": 0.12295004725456238, "kl": 0.10780546069145203, "learning_rate": 3e-06, "loss": 0.017, "reward": 0.28333336114883423, "reward_std": 0.2339424267411232, "rewards/countdown_reward_func": 0.28333333134651184, "step": 2401, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00017353141447529197, "epoch": 0.0066685545172377415, "grad_norm": 0.08983492106199265, "kl": 0.10426361858844757, "learning_rate": 3e-06, "loss": 0.0157, "step": 2402 }, { "clip_ratio": 8.196721319109201e-05, "epoch": 0.006671330768077558, "grad_norm": 0.06798221170902252, "kl": 0.09816591441631317, "learning_rate": 3e-06, "loss": 0.0156, "step": 2403 }, { "clip_ratio": 0.00025100400671362877, "epoch": 0.006674107018917373, "grad_norm": 0.08332613110542297, "kl": 0.10509132966399193, "learning_rate": 3e-06, "loss": 0.0153, "step": 2404 }, { "clip_ratio": 0.00019771909865085036, "epoch": 0.0066768832697571895, "grad_norm": 0.08753097057342529, "kl": 0.1027427539229393, "learning_rate": 3e-06, "loss": 0.016, "step": 2405 }, { "clip_ratio": 0.0006030112272128463, "epoch": 0.006679659520597005, "grad_norm": 0.06988681852817535, "kl": 0.10366249829530716, "learning_rate": 3e-06, "loss": 0.0161, "step": 2406 }, { "clip_ratio": 0.0005557472104555927, "epoch": 0.006682435771436821, "grad_norm": 0.09204717725515366, "kl": 0.10658174008131027, "learning_rate": 3e-06, "loss": 0.0154, "step": 2407 }, { "clip_ratio": 8.366800466319546e-05, "epoch": 0.0066852120222766365, "grad_norm": 0.08877309411764145, "kl": 0.10358146205544472, "learning_rate": 3e-06, "loss": 0.0147, "step": 2408 }, { "clip_ratio": 8.366800466319546e-05, "epoch": 0.006687988273116453, "grad_norm": 0.06935451179742813, "kl": 0.0986652821302414, "learning_rate": 3e-06, "loss": 0.0153, "step": 2409 }, { "clip_ratio": 0.0003415837127249688, "epoch": 0.006690764523956268, "grad_norm": 0.0854223370552063, "kl": 0.10557211935520172, "learning_rate": 3e-06, "loss": 0.0144, "step": 2410 }, { "clip_ratio": 0.0001140510939876549, "epoch": 0.0066935407747960845, "grad_norm": 0.08558721095323563, "kl": 0.10372918471693993, "learning_rate": 3e-06, "loss": 0.0151, "step": 2411 }, { "clip_ratio": 0.0006259864894673228, "epoch": 0.0066963170256359, "grad_norm": 0.06793802231550217, "kl": 0.10636503249406815, "learning_rate": 3e-06, "loss": 0.0152, "step": 2412 }, { "clip_ratio": 8.256275032181293e-05, "completion_length": 230.20834350585938, "epoch": 0.006699093276475716, "grad_norm": 0.07189203053712845, "kl": 0.11288557946681976, "learning_rate": 3e-06, "loss": 0.0073, "reward": 0.1875000074505806, "reward_std": 0.20407745242118835, "rewards/countdown_reward_func": 0.1875, "step": 2413, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00019181969400960952, "epoch": 0.0067018695273155324, "grad_norm": 0.07376082241535187, "kl": 0.10653312504291534, "learning_rate": 3e-06, "loss": 0.0075, "step": 2414 }, { "clip_ratio": 0.0, "epoch": 0.006704645778155348, "grad_norm": 0.08378919959068298, "kl": 0.11886245012283325, "learning_rate": 3e-06, "loss": 0.0077, "step": 2415 }, { "clip_ratio": 9.245562250725925e-05, "epoch": 0.006707422028995164, "grad_norm": 0.06871742010116577, "kl": 0.11657249182462692, "learning_rate": 3e-06, "loss": 0.0075, "step": 2416 }, { "clip_ratio": 0.0008867568030836992, "epoch": 0.0067101982798349795, "grad_norm": 0.06403800845146179, "kl": 0.10760099440813065, "learning_rate": 3e-06, "loss": 0.0086, "step": 2417 }, { "clip_ratio": 9.225092071574181e-05, "epoch": 0.006712974530674796, "grad_norm": 0.06900127232074738, "kl": 0.10545783117413521, "learning_rate": 3e-06, "loss": 0.0067, "step": 2418 }, { "clip_ratio": 0.0, "epoch": 0.006715750781514611, "grad_norm": 0.07691039890050888, "kl": 0.11479473859071732, "learning_rate": 3e-06, "loss": 0.0072, "step": 2419 }, { "clip_ratio": 0.00036602198088075966, "epoch": 0.0067185270323544275, "grad_norm": 0.07924985140562057, "kl": 0.1063217967748642, "learning_rate": 3e-06, "loss": 0.0075, "step": 2420 }, { "clip_ratio": 0.0003559212273103185, "epoch": 0.006721303283194243, "grad_norm": 0.08351549506187439, "kl": 0.11921686306595802, "learning_rate": 3e-06, "loss": 0.0078, "step": 2421 }, { "clip_ratio": 0.00027179565222468227, "epoch": 0.006724079534034059, "grad_norm": 0.06077051907777786, "kl": 0.1158263087272644, "learning_rate": 3e-06, "loss": 0.0068, "step": 2422 }, { "clip_ratio": 0.0011344332597218454, "epoch": 0.0067268557848738746, "grad_norm": 0.06291055679321289, "kl": 0.10745818167924881, "learning_rate": 3e-06, "loss": 0.0076, "step": 2423 }, { "clip_ratio": 0.0003693613543873653, "epoch": 0.006729632035713691, "grad_norm": 0.0665401816368103, "kl": 0.10476551577448845, "learning_rate": 3e-06, "loss": 0.0065, "step": 2424 }, { "clip_ratio": 8.928571332944557e-05, "completion_length": 215.6875, "epoch": 0.006732408286553507, "grad_norm": 0.08517291396856308, "kl": 0.1074470691382885, "learning_rate": 3e-06, "loss": 0.0123, "reward": 0.3604166805744171, "reward_std": 0.28518571704626083, "rewards/countdown_reward_func": 0.3604166656732559, "step": 2425, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0005277531163301319, "epoch": 0.0067351845373933225, "grad_norm": 0.12994913756847382, "kl": 0.11232437938451767, "learning_rate": 3e-06, "loss": 0.0127, "step": 2426 }, { "clip_ratio": 0.00017857142665889114, "epoch": 0.006737960788233139, "grad_norm": 0.20326417684555054, "kl": 0.11356884241104126, "learning_rate": 3e-06, "loss": 0.0123, "step": 2427 }, { "clip_ratio": 0.0001188212918350473, "epoch": 0.006740737039072954, "grad_norm": 0.078920379281044, "kl": 0.11158164963126183, "learning_rate": 3e-06, "loss": 0.0129, "step": 2428 }, { "clip_ratio": 0.0001050420178216882, "epoch": 0.0067435132899127705, "grad_norm": 0.07914809137582779, "kl": 0.11002899706363678, "learning_rate": 3e-06, "loss": 0.0122, "step": 2429 }, { "clip_ratio": 0.00022386331693269312, "epoch": 0.006746289540752586, "grad_norm": 0.10386265814304352, "kl": 0.1144975870847702, "learning_rate": 3e-06, "loss": 0.0123, "step": 2430 }, { "clip_ratio": 0.00030388979939743876, "epoch": 0.006749065791592402, "grad_norm": 0.09454360604286194, "kl": 0.10659191012382507, "learning_rate": 3e-06, "loss": 0.0127, "step": 2431 }, { "clip_ratio": 0.00039420771645382047, "epoch": 0.0067518420424322175, "grad_norm": 0.10722549259662628, "kl": 0.11116393283009529, "learning_rate": 3e-06, "loss": 0.0121, "step": 2432 }, { "clip_ratio": 0.0, "epoch": 0.006754618293272034, "grad_norm": 0.1120673343539238, "kl": 0.11296507716178894, "learning_rate": 3e-06, "loss": 0.0123, "step": 2433 }, { "clip_ratio": 0.0006201282667461783, "epoch": 0.006757394544111849, "grad_norm": 0.08440835773944855, "kl": 0.11203001067042351, "learning_rate": 3e-06, "loss": 0.012, "step": 2434 }, { "clip_ratio": 0.0, "epoch": 0.0067601707949516655, "grad_norm": 0.07518981397151947, "kl": 0.10907341167330742, "learning_rate": 3e-06, "loss": 0.012, "step": 2435 }, { "clip_ratio": 0.0005205198394833133, "epoch": 0.006762947045791482, "grad_norm": 0.09459592401981354, "kl": 0.11349227651953697, "learning_rate": 3e-06, "loss": 0.0114, "step": 2436 }, { "clip_ratio": 0.00026166778843617067, "completion_length": 230.75000762939453, "epoch": 0.006765723296631297, "grad_norm": 0.07740966230630875, "kl": 0.10574423149228096, "learning_rate": 3e-06, "loss": 0.0033, "reward": 0.2666666731238365, "reward_std": 0.2285463623702526, "rewards/countdown_reward_func": 0.2666666731238365, "step": 2437, "zero_std_ratio": 0.5 }, { "clip_ratio": 8.802816591924056e-05, "epoch": 0.0067684995474711135, "grad_norm": 0.10261306911706924, "kl": 0.10881078243255615, "learning_rate": 3e-06, "loss": 0.0033, "step": 2438 }, { "clip_ratio": 0.00017181201837956905, "epoch": 0.006771275798310929, "grad_norm": 0.09754025936126709, "kl": 0.11322496458888054, "learning_rate": 3e-06, "loss": 0.0025, "step": 2439 }, { "clip_ratio": 0.0, "epoch": 0.006774052049150745, "grad_norm": 0.07824526727199554, "kl": 0.10903006792068481, "learning_rate": 3e-06, "loss": 0.0033, "step": 2440 }, { "clip_ratio": 0.0003595175876398571, "epoch": 0.0067768282999905605, "grad_norm": 0.18730951845645905, "kl": 0.11356868967413902, "learning_rate": 3e-06, "loss": 0.0022, "step": 2441 }, { "clip_ratio": 8.85896515683271e-05, "epoch": 0.006779604550830377, "grad_norm": 0.09276639670133591, "kl": 0.10244229808449745, "learning_rate": 3e-06, "loss": 0.0028, "step": 2442 }, { "clip_ratio": 0.00026698306464822963, "epoch": 0.006782380801670192, "grad_norm": 0.07316610962152481, "kl": 0.10353904590010643, "learning_rate": 3e-06, "loss": 0.0031, "step": 2443 }, { "clip_ratio": 8.802816591924056e-05, "epoch": 0.0067851570525100085, "grad_norm": 0.09989767521619797, "kl": 0.10630606859922409, "learning_rate": 3e-06, "loss": 0.0023, "step": 2444 }, { "clip_ratio": 0.0004429482505656779, "epoch": 0.006787933303349824, "grad_norm": 0.08019357174634933, "kl": 0.11003177613019943, "learning_rate": 3e-06, "loss": 0.0026, "step": 2445 }, { "clip_ratio": 0.0, "epoch": 0.00679070955418964, "grad_norm": 0.07700924575328827, "kl": 0.10536712780594826, "learning_rate": 3e-06, "loss": 0.0024, "step": 2446 }, { "clip_ratio": 0.0005259623358142562, "epoch": 0.0067934858050294564, "grad_norm": 0.18418793380260468, "kl": 0.11070561036467552, "learning_rate": 3e-06, "loss": -0.0002, "step": 2447 }, { "clip_ratio": 0.0001726998161757365, "epoch": 0.006796262055869272, "grad_norm": 0.10063984245061874, "kl": 0.1003638207912445, "learning_rate": 3e-06, "loss": 0.0023, "step": 2448 }, { "clip_ratio": 0.00025201612152159214, "completion_length": 229.39583587646484, "epoch": 0.006799038306709088, "grad_norm": 0.11044564843177795, "kl": 0.10882113873958588, "learning_rate": 3e-06, "loss": 0.022, "reward": 0.4333333671092987, "reward_std": 0.4348773956298828, "rewards/countdown_reward_func": 0.4333333671092987, "step": 2449, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.0, "epoch": 0.0068018145575489035, "grad_norm": 0.10996294021606445, "kl": 0.10753875970840454, "learning_rate": 3e-06, "loss": 0.0224, "step": 2450 }, { "clip_ratio": 0.00017908310110215098, "epoch": 0.00680459080838872, "grad_norm": 0.13970771431922913, "kl": 0.10632171854376793, "learning_rate": 3e-06, "loss": 0.0232, "step": 2451 }, { "clip_ratio": 0.0005040322430431843, "epoch": 0.006807367059228535, "grad_norm": 0.1698501855134964, "kl": 0.10705317929387093, "learning_rate": 3e-06, "loss": 0.0207, "step": 2452 }, { "clip_ratio": 8.585165051044896e-05, "epoch": 0.0068101433100683515, "grad_norm": 0.12471974641084671, "kl": 0.09810657054185867, "learning_rate": 3e-06, "loss": 0.0227, "step": 2453 }, { "clip_ratio": 9.571210102876648e-05, "epoch": 0.006812919560908167, "grad_norm": 0.12794393301010132, "kl": 0.097657959908247, "learning_rate": 3e-06, "loss": 0.0206, "step": 2454 }, { "clip_ratio": 0.0003692590180435218, "epoch": 0.006815695811747983, "grad_norm": 0.11906078457832336, "kl": 0.10667447000741959, "learning_rate": 3e-06, "loss": 0.0221, "step": 2455 }, { "clip_ratio": 9.999999747378752e-05, "epoch": 0.0068184720625877986, "grad_norm": 0.24868349730968475, "kl": 0.10781405493617058, "learning_rate": 3e-06, "loss": 0.0215, "step": 2456 }, { "clip_ratio": 0.00026493475888855755, "epoch": 0.006821248313427615, "grad_norm": 0.10990463197231293, "kl": 0.10687519982457161, "learning_rate": 3e-06, "loss": 0.0217, "step": 2457 }, { "clip_ratio": 0.0, "epoch": 0.006824024564267431, "grad_norm": 0.16284221410751343, "kl": 0.10830774903297424, "learning_rate": 3e-06, "loss": 0.0189, "step": 2458 }, { "clip_ratio": 0.00017539320106152445, "epoch": 0.0068268008151072465, "grad_norm": 0.11818123608827591, "kl": 0.09956466034054756, "learning_rate": 3e-06, "loss": 0.0216, "step": 2459 }, { "clip_ratio": 0.0002764785604085773, "epoch": 0.006829577065947063, "grad_norm": 0.1193012222647667, "kl": 0.09965555369853973, "learning_rate": 3e-06, "loss": 0.0197, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 222.64583587646484, "epoch": 0.006832353316786878, "grad_norm": 0.09964413940906525, "kl": 0.11183221638202667, "learning_rate": 3e-06, "loss": 0.0273, "reward": 0.30625002086162567, "reward_std": 0.28535499423742294, "rewards/countdown_reward_func": 0.30625002086162567, "step": 2461, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.630200656829402e-05, "epoch": 0.0068351295676266945, "grad_norm": 0.09929607808589935, "kl": 0.112647145986557, "learning_rate": 3e-06, "loss": 0.0274, "step": 2462 }, { "clip_ratio": 0.0007768171490170062, "epoch": 0.00683790581846651, "grad_norm": 0.10724590718746185, "kl": 0.11417991295456886, "learning_rate": 3e-06, "loss": 0.0279, "step": 2463 }, { "clip_ratio": 0.000346161745255813, "epoch": 0.006840682069306326, "grad_norm": 0.0905621126294136, "kl": 0.11805011332035065, "learning_rate": 3e-06, "loss": 0.0279, "step": 2464 }, { "clip_ratio": 0.000299699509923812, "epoch": 0.0068434583201461415, "grad_norm": 0.09721238166093826, "kl": 0.11963087320327759, "learning_rate": 3e-06, "loss": 0.028, "step": 2465 }, { "clip_ratio": 8.614748367108405e-05, "epoch": 0.006846234570985958, "grad_norm": 0.09523501247167587, "kl": 0.12303963676095009, "learning_rate": 3e-06, "loss": 0.0261, "step": 2466 }, { "clip_ratio": 0.0002624175394885242, "epoch": 0.006849010821825773, "grad_norm": 0.09940183162689209, "kl": 0.12013134360313416, "learning_rate": 3e-06, "loss": 0.0264, "step": 2467 }, { "clip_ratio": 0.0002837783540599048, "epoch": 0.0068517870726655895, "grad_norm": 0.09632185846567154, "kl": 0.12204306945204735, "learning_rate": 3e-06, "loss": 0.0259, "step": 2468 }, { "clip_ratio": 0.0003724969647009857, "epoch": 0.006854563323505406, "grad_norm": 0.10749493539333344, "kl": 0.12405981495976448, "learning_rate": 3e-06, "loss": 0.0258, "step": 2469 }, { "clip_ratio": 8.771930151851848e-05, "epoch": 0.006857339574345221, "grad_norm": 0.084246926009655, "kl": 0.12915344536304474, "learning_rate": 3e-06, "loss": 0.0264, "step": 2470 }, { "clip_ratio": 0.0, "epoch": 0.0068601158251850375, "grad_norm": 0.0897546112537384, "kl": 0.131361685693264, "learning_rate": 3e-06, "loss": 0.0264, "step": 2471 }, { "clip_ratio": 0.0, "epoch": 0.006862892076024853, "grad_norm": 0.07698703557252884, "kl": 0.1356644704937935, "learning_rate": 3e-06, "loss": 0.0253, "step": 2472 }, { "clip_ratio": 0.0003964965872000903, "completion_length": 210.31250762939453, "epoch": 0.006865668326864669, "grad_norm": 0.089408740401268, "kl": 0.12919557839632034, "learning_rate": 3e-06, "loss": 0.0279, "reward": 0.3604166805744171, "reward_std": 0.3206951916217804, "rewards/countdown_reward_func": 0.3604166805744171, "step": 2473, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0005657835863530636, "epoch": 0.0068684445777044845, "grad_norm": 0.10280496627092361, "kl": 0.13355571776628494, "learning_rate": 3e-06, "loss": 0.0279, "step": 2474 }, { "clip_ratio": 0.00030004348809598014, "epoch": 0.006871220828544301, "grad_norm": 0.09122777730226517, "kl": 0.13717789202928543, "learning_rate": 3e-06, "loss": 0.0281, "step": 2475 }, { "clip_ratio": 9.578544268151745e-05, "epoch": 0.006873997079384116, "grad_norm": 0.08694249391555786, "kl": 0.1368713080883026, "learning_rate": 3e-06, "loss": 0.0283, "step": 2476 }, { "clip_ratio": 9.912767563946545e-05, "epoch": 0.0068767733302239325, "grad_norm": 0.10039347410202026, "kl": 0.1407921016216278, "learning_rate": 3e-06, "loss": 0.0278, "step": 2477 }, { "clip_ratio": 8.45165632199496e-05, "epoch": 0.006879549581063748, "grad_norm": 0.12389730662107468, "kl": 0.15291880071163177, "learning_rate": 3e-06, "loss": 0.028, "step": 2478 }, { "clip_ratio": 0.0001690331264398992, "epoch": 0.006882325831903564, "grad_norm": 0.09755761176347733, "kl": 0.14044442027807236, "learning_rate": 3e-06, "loss": 0.0272, "step": 2479 }, { "clip_ratio": 0.00030831367621431127, "epoch": 0.0068851020827433804, "grad_norm": 0.11109394580125809, "kl": 0.1446225941181183, "learning_rate": 3e-06, "loss": 0.0266, "step": 2480 }, { "clip_ratio": 0.0003083823903580196, "epoch": 0.006887878333583196, "grad_norm": 0.10463670641183853, "kl": 0.14909590035676956, "learning_rate": 3e-06, "loss": 0.0263, "step": 2481 }, { "clip_ratio": 0.0, "epoch": 0.006890654584423012, "grad_norm": 0.09704513847827911, "kl": 0.14730746299028397, "learning_rate": 3e-06, "loss": 0.0267, "step": 2482 }, { "clip_ratio": 0.0, "epoch": 0.0068934308352628275, "grad_norm": 0.09785734117031097, "kl": 0.15231196582317352, "learning_rate": 3e-06, "loss": 0.0252, "step": 2483 }, { "clip_ratio": 8.45165632199496e-05, "epoch": 0.006896207086102644, "grad_norm": 0.08950638771057129, "kl": 0.1649750992655754, "learning_rate": 3e-06, "loss": 0.0262, "step": 2484 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 230.9166717529297, "epoch": 0.006898983336942459, "grad_norm": 0.08120502531528473, "kl": 0.12933644652366638, "learning_rate": 3e-06, "loss": 0.0153, "reward": 0.395833358168602, "reward_std": 0.3388310372829437, "rewards/countdown_reward_func": 0.3958333507180214, "step": 2485, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0069017595877822755, "grad_norm": 0.12228546291589737, "kl": 0.13283831626176834, "learning_rate": 3e-06, "loss": 0.0166, "step": 2486 }, { "clip_ratio": 0.00025467218802077696, "epoch": 0.006904535838622091, "grad_norm": 0.11056458950042725, "kl": 0.1325739100575447, "learning_rate": 3e-06, "loss": 0.0158, "step": 2487 }, { "clip_ratio": 0.00018615041335579008, "epoch": 0.006907312089461907, "grad_norm": 0.09804553538560867, "kl": 0.13264501839876175, "learning_rate": 3e-06, "loss": 0.0154, "step": 2488 }, { "clip_ratio": 0.0, "epoch": 0.0069100883403017226, "grad_norm": 0.10097736865282059, "kl": 0.1376706250011921, "learning_rate": 3e-06, "loss": 0.0157, "step": 2489 }, { "clip_ratio": 0.0, "epoch": 0.006912864591141539, "grad_norm": 0.09442541003227234, "kl": 0.13518304377794266, "learning_rate": 3e-06, "loss": 0.0159, "step": 2490 }, { "clip_ratio": 0.0, "epoch": 0.006915640841981355, "grad_norm": 0.08362875878810883, "kl": 0.13955392688512802, "learning_rate": 3e-06, "loss": 0.0152, "step": 2491 }, { "clip_ratio": 0.0002792256127577275, "epoch": 0.0069184170928211705, "grad_norm": 0.09364964812994003, "kl": 0.14495066553354263, "learning_rate": 3e-06, "loss": 0.0149, "step": 2492 }, { "clip_ratio": 0.00016947041149251163, "epoch": 0.006921193343660987, "grad_norm": 0.0926862508058548, "kl": 0.144854336977005, "learning_rate": 3e-06, "loss": 0.0147, "step": 2493 }, { "clip_ratio": 0.0006245378099265508, "epoch": 0.006923969594500802, "grad_norm": 0.09652328491210938, "kl": 0.1455134153366089, "learning_rate": 3e-06, "loss": 0.0148, "step": 2494 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0069267458453406185, "grad_norm": 0.09675440937280655, "kl": 0.14825546741485596, "learning_rate": 3e-06, "loss": 0.0149, "step": 2495 }, { "clip_ratio": 0.000413611029216554, "epoch": 0.006929522096180434, "grad_norm": 0.11376820504665375, "kl": 0.14565087854862213, "learning_rate": 3e-06, "loss": 0.0144, "step": 2496 }, { "clip_ratio": 0.00010024057701230049, "completion_length": 216.45833587646484, "epoch": 0.00693229834702025, "grad_norm": 0.11657664179801941, "kl": 0.16808811575174332, "learning_rate": 3e-06, "loss": 0.0155, "reward": 0.28541669249534607, "reward_std": 0.21314848214387894, "rewards/countdown_reward_func": 0.2854166775941849, "step": 2497, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.321401739725843e-05, "epoch": 0.0069350745978600655, "grad_norm": 0.0914500430226326, "kl": 0.18082208931446075, "learning_rate": 3e-06, "loss": 0.0155, "step": 2498 }, { "clip_ratio": 0.00028280332480790094, "epoch": 0.006937850848699882, "grad_norm": 0.10090383887290955, "kl": 0.17869839072227478, "learning_rate": 3e-06, "loss": 0.0146, "step": 2499 }, { "clip_ratio": 0.00018671700672712177, "epoch": 0.006940627099539697, "grad_norm": 0.07849971204996109, "kl": 0.1728534698486328, "learning_rate": 3e-06, "loss": 0.0154, "step": 2500 }, { "clip_ratio": 9.321401739725843e-05, "epoch": 0.0069434033503795135, "grad_norm": 0.08036849647760391, "kl": 0.183627650141716, "learning_rate": 3e-06, "loss": 0.0157, "step": 2501 }, { "clip_ratio": 0.00030516918195644394, "epoch": 0.00694617960121933, "grad_norm": 0.07406234741210938, "kl": 0.17438813298940659, "learning_rate": 3e-06, "loss": 0.0151, "step": 2502 }, { "clip_ratio": 0.0009932260145433247, "epoch": 0.006948955852059145, "grad_norm": 0.0847598984837532, "kl": 0.17175012826919556, "learning_rate": 3e-06, "loss": 0.0153, "step": 2503 }, { "clip_ratio": 8.383634849451482e-05, "epoch": 0.0069517321028989615, "grad_norm": 0.09786758571863174, "kl": 0.1857234165072441, "learning_rate": 3e-06, "loss": 0.0148, "step": 2504 }, { "clip_ratio": 0.00016767269698902965, "epoch": 0.006954508353738777, "grad_norm": 0.10045964270830154, "kl": 0.18046917021274567, "learning_rate": 3e-06, "loss": 0.0145, "step": 2505 }, { "clip_ratio": 0.0003800771082751453, "epoch": 0.006957284604578593, "grad_norm": 0.07848291844129562, "kl": 0.1737968549132347, "learning_rate": 3e-06, "loss": 0.0147, "step": 2506 }, { "clip_ratio": 0.0006345177534967661, "epoch": 0.0069600608554184085, "grad_norm": 0.08301141113042831, "kl": 0.18425538390874863, "learning_rate": 3e-06, "loss": 0.015, "step": 2507 }, { "clip_ratio": 0.0005948093312326819, "epoch": 0.006962837106258225, "grad_norm": 0.07410308718681335, "kl": 0.17250776290893555, "learning_rate": 3e-06, "loss": 0.0143, "step": 2508 }, { "clip_ratio": 9.811617201194167e-05, "completion_length": 223.77083587646484, "epoch": 0.00696561335709804, "grad_norm": 0.10208810120820999, "kl": 0.1743207797408104, "learning_rate": 3e-06, "loss": 0.0328, "reward": 0.3812500238418579, "reward_std": 0.32775889337062836, "rewards/countdown_reward_func": 0.3812500238418579, "step": 2509, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00032837127218954265, "epoch": 0.0069683896079378565, "grad_norm": 0.12014499306678772, "kl": 0.178618386387825, "learning_rate": 3e-06, "loss": 0.0341, "step": 2510 }, { "clip_ratio": 0.00047734283725731075, "epoch": 0.006971165858777672, "grad_norm": 0.10467583686113358, "kl": 0.1751692220568657, "learning_rate": 3e-06, "loss": 0.0322, "step": 2511 }, { "clip_ratio": 9.307520667789504e-05, "epoch": 0.006973942109617488, "grad_norm": 0.10790978372097015, "kl": 0.173781156539917, "learning_rate": 3e-06, "loss": 0.0327, "step": 2512 }, { "clip_ratio": 9.307520667789504e-05, "epoch": 0.0069767183604573044, "grad_norm": 0.10750441998243332, "kl": 0.17114970088005066, "learning_rate": 3e-06, "loss": 0.0323, "step": 2513 }, { "clip_ratio": 0.00019036624871660024, "epoch": 0.00697949461129712, "grad_norm": 0.09914499521255493, "kl": 0.18860305845737457, "learning_rate": 3e-06, "loss": 0.0319, "step": 2514 }, { "clip_ratio": 8.532423089491203e-05, "epoch": 0.006982270862136936, "grad_norm": 0.10709180682897568, "kl": 0.18218130618333817, "learning_rate": 3e-06, "loss": 0.0313, "step": 2515 }, { "clip_ratio": 0.0001050420178216882, "epoch": 0.0069850471129767515, "grad_norm": 0.10699032247066498, "kl": 0.1875542849302292, "learning_rate": 3e-06, "loss": 0.0324, "step": 2516 }, { "clip_ratio": 0.00040064952918328345, "epoch": 0.006987823363816568, "grad_norm": 0.09571097046136856, "kl": 0.18864677846431732, "learning_rate": 3e-06, "loss": 0.0309, "step": 2517 }, { "clip_ratio": 0.00017064846178982407, "epoch": 0.006990599614656383, "grad_norm": 0.098480224609375, "kl": 0.1907021626830101, "learning_rate": 3e-06, "loss": 0.031, "step": 2518 }, { "clip_ratio": 0.0002962333965115249, "epoch": 0.0069933758654961995, "grad_norm": 0.10025587677955627, "kl": 0.18980274349451065, "learning_rate": 3e-06, "loss": 0.0309, "step": 2519 }, { "clip_ratio": 0.0006769659230485559, "epoch": 0.006996152116336015, "grad_norm": 0.09761010855436325, "kl": 0.20822478830814362, "learning_rate": 3e-06, "loss": 0.031, "step": 2520 }, { "clip_ratio": 0.00011803588131442666, "completion_length": 204.75000762939453, "epoch": 0.006998928367175831, "grad_norm": 0.10998831689357758, "kl": 0.19193871319293976, "learning_rate": 3e-06, "loss": 0.0238, "reward": 0.4479166865348816, "reward_std": 0.4021080732345581, "rewards/countdown_reward_func": 0.4479166567325592, "step": 2521, "zero_std_ratio": 0.0 }, { "clip_ratio": 8.532423089491203e-05, "epoch": 0.0070017046180156466, "grad_norm": 0.09823489934206009, "kl": 0.20638025552034378, "learning_rate": 3e-06, "loss": 0.0241, "step": 2522 }, { "clip_ratio": 0.0, "epoch": 0.007004480868855463, "grad_norm": 0.09485521167516708, "kl": 0.20615187287330627, "learning_rate": 3e-06, "loss": 0.0237, "step": 2523 }, { "clip_ratio": 0.0, "epoch": 0.007007257119695279, "grad_norm": 0.10079658776521683, "kl": 0.2072722464799881, "learning_rate": 3e-06, "loss": 0.0238, "step": 2524 }, { "clip_ratio": 0.0007758917636238039, "epoch": 0.0070100333705350945, "grad_norm": 0.09545547515153885, "kl": 0.22079689800739288, "learning_rate": 3e-06, "loss": 0.0247, "step": 2525 }, { "clip_ratio": 0.00010434056457597762, "epoch": 0.007012809621374911, "grad_norm": 0.10776659101247787, "kl": 0.21613473445177078, "learning_rate": 3e-06, "loss": 0.0249, "step": 2526 }, { "clip_ratio": 0.00047114884364418685, "epoch": 0.007015585872214726, "grad_norm": 0.11012641340494156, "kl": 0.20886804908514023, "learning_rate": 3e-06, "loss": 0.0233, "step": 2527 }, { "clip_ratio": 0.0003194066302967258, "epoch": 0.0070183621230545425, "grad_norm": 0.09369030594825745, "kl": 0.22248712182044983, "learning_rate": 3e-06, "loss": 0.0229, "step": 2528 }, { "clip_ratio": 0.00039203022606670856, "epoch": 0.007021138373894358, "grad_norm": 0.08990558981895447, "kl": 0.21968073397874832, "learning_rate": 3e-06, "loss": 0.0235, "step": 2529 }, { "clip_ratio": 0.0, "epoch": 0.007023914624734174, "grad_norm": 0.1038319319486618, "kl": 0.22010161727666855, "learning_rate": 3e-06, "loss": 0.0232, "step": 2530 }, { "clip_ratio": 0.0007747428026050329, "epoch": 0.0070266908755739895, "grad_norm": 0.10059669613838196, "kl": 0.23308537900447845, "learning_rate": 3e-06, "loss": 0.0234, "step": 2531 }, { "clip_ratio": 0.0002213817642768845, "epoch": 0.007029467126413806, "grad_norm": 0.10100699216127396, "kl": 0.22424881905317307, "learning_rate": 3e-06, "loss": 0.0229, "step": 2532 }, { "clip_ratio": 0.00026742640329757705, "completion_length": 217.31250762939453, "epoch": 0.007032243377253621, "grad_norm": 0.1034797728061676, "kl": 0.23187711834907532, "learning_rate": 3e-06, "loss": 0.0399, "reward": 0.32500001788139343, "reward_std": 0.28887904435396194, "rewards/countdown_reward_func": 0.32500001788139343, "step": 2533, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00027684104861691594, "epoch": 0.0070350196280934375, "grad_norm": 0.09598292410373688, "kl": 0.22680123895406723, "learning_rate": 3e-06, "loss": 0.0392, "step": 2534 }, { "clip_ratio": 0.0003663590468931943, "epoch": 0.007037795878933254, "grad_norm": 0.11063233017921448, "kl": 0.25065308809280396, "learning_rate": 3e-06, "loss": 0.0406, "step": 2535 }, { "clip_ratio": 0.0003028440332855098, "epoch": 0.007040572129773069, "grad_norm": 0.09503897279500961, "kl": 0.23381569236516953, "learning_rate": 3e-06, "loss": 0.0401, "step": 2536 }, { "clip_ratio": 0.0, "epoch": 0.0070433483806128854, "grad_norm": 0.09691870212554932, "kl": 0.238780677318573, "learning_rate": 3e-06, "loss": 0.0395, "step": 2537 }, { "clip_ratio": 0.0, "epoch": 0.007046124631452701, "grad_norm": 0.09744689613580704, "kl": 0.2430015429854393, "learning_rate": 3e-06, "loss": 0.0392, "step": 2538 }, { "clip_ratio": 0.000691764202201739, "epoch": 0.007048900882292517, "grad_norm": 0.09978599101305008, "kl": 0.2478952258825302, "learning_rate": 3e-06, "loss": 0.0389, "step": 2539 }, { "clip_ratio": 0.0004900440326309763, "epoch": 0.0070516771331323325, "grad_norm": 0.09151586890220642, "kl": 0.24548708647489548, "learning_rate": 3e-06, "loss": 0.0389, "step": 2540 }, { "clip_ratio": 0.0005319148767739534, "epoch": 0.007054453383972149, "grad_norm": 0.1024128794670105, "kl": 0.2721577286720276, "learning_rate": 3e-06, "loss": 0.0393, "step": 2541 }, { "clip_ratio": 0.0001915094835567288, "epoch": 0.007057229634811964, "grad_norm": 0.09976603090763092, "kl": 0.2565399706363678, "learning_rate": 3e-06, "loss": 0.0386, "step": 2542 }, { "clip_ratio": 0.0004081632650922984, "epoch": 0.0070600058856517805, "grad_norm": 0.09156995266675949, "kl": 0.26452548801898956, "learning_rate": 3e-06, "loss": 0.0378, "step": 2543 }, { "clip_ratio": 0.0, "epoch": 0.007062782136491596, "grad_norm": 0.10364950448274612, "kl": 0.2660391926765442, "learning_rate": 3e-06, "loss": 0.0379, "step": 2544 }, { "clip_ratio": 0.00011415524932090193, "completion_length": 210.2291717529297, "epoch": 0.007065558387331412, "grad_norm": 0.10578175634145737, "kl": 0.2782774120569229, "learning_rate": 3e-06, "loss": 0.0303, "reward": 0.2812500298023224, "reward_std": 0.3244262933731079, "rewards/countdown_reward_func": 0.2812500149011612, "step": 2545, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0070683346381712284, "grad_norm": 0.11301010102033615, "kl": 0.2679397016763687, "learning_rate": 3e-06, "loss": 0.0297, "step": 2546 }, { "clip_ratio": 0.00019877930753864348, "epoch": 0.007071110889011044, "grad_norm": 0.0882059633731842, "kl": 0.27911926805973053, "learning_rate": 3e-06, "loss": 0.0294, "step": 2547 }, { "clip_ratio": 0.0003017173148691654, "epoch": 0.00707388713985086, "grad_norm": 0.10633210092782974, "kl": 0.2784113436937332, "learning_rate": 3e-06, "loss": 0.0303, "step": 2548 }, { "clip_ratio": 0.00011415524932090193, "epoch": 0.0070766633906906755, "grad_norm": 0.12243712693452835, "kl": 0.3002118617296219, "learning_rate": 3e-06, "loss": 0.0287, "step": 2549 }, { "clip_ratio": 0.00011415524932090193, "epoch": 0.007079439641530492, "grad_norm": 0.11381380259990692, "kl": 0.30442118644714355, "learning_rate": 3e-06, "loss": 0.0308, "step": 2550 }, { "clip_ratio": 0.0, "epoch": 0.007082215892370307, "grad_norm": 0.11545722931623459, "kl": 0.2972116321325302, "learning_rate": 3e-06, "loss": 0.0291, "step": 2551 }, { "clip_ratio": 0.0, "epoch": 0.0070849921432101235, "grad_norm": 0.1022641658782959, "kl": 0.28697600960731506, "learning_rate": 3e-06, "loss": 0.0282, "step": 2552 }, { "clip_ratio": 0.00038981897523626685, "epoch": 0.007087768394049939, "grad_norm": 0.07893171161413193, "kl": 0.2962718904018402, "learning_rate": 3e-06, "loss": 0.0289, "step": 2553 }, { "clip_ratio": 0.0004877245664829388, "epoch": 0.007090544644889755, "grad_norm": 0.11324810981750488, "kl": 0.2932514548301697, "learning_rate": 3e-06, "loss": 0.0284, "step": 2554 }, { "clip_ratio": 0.00011415524932090193, "epoch": 0.0070933208957295706, "grad_norm": 0.14238354563713074, "kl": 0.310974583029747, "learning_rate": 3e-06, "loss": 0.0272, "step": 2555 }, { "clip_ratio": 0.0005770557618234307, "epoch": 0.007096097146569387, "grad_norm": 0.15894882380962372, "kl": 0.315914586186409, "learning_rate": 3e-06, "loss": 0.0295, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 207.1666717529297, "epoch": 0.007098873397409203, "grad_norm": 0.10073784738779068, "kl": 0.3152993768453598, "learning_rate": 3e-06, "loss": 0.0123, "reward": 0.17291668057441711, "reward_std": 0.1212926022708416, "rewards/countdown_reward_func": 0.17291668057441711, "step": 2557, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0, "epoch": 0.0071016496482490185, "grad_norm": 0.1141972467303276, "kl": 0.31645752489566803, "learning_rate": 3e-06, "loss": 0.0133, "step": 2558 }, { "clip_ratio": 0.0015715022818767466, "epoch": 0.007104425899088835, "grad_norm": 0.10188689827919006, "kl": 0.3191002756357193, "learning_rate": 3e-06, "loss": 0.0135, "step": 2559 }, { "clip_ratio": 0.000509025325300172, "epoch": 0.00710720214992865, "grad_norm": 0.09876614809036255, "kl": 0.2960846275091171, "learning_rate": 3e-06, "loss": 0.0118, "step": 2560 }, { "clip_ratio": 0.0003962415212299675, "epoch": 0.0071099784007684665, "grad_norm": 0.11342237889766693, "kl": 0.30057406425476074, "learning_rate": 3e-06, "loss": 0.0122, "step": 2561 }, { "clip_ratio": 9.682416566647589e-05, "epoch": 0.007112754651608282, "grad_norm": 0.10263796895742416, "kl": 0.2931150197982788, "learning_rate": 3e-06, "loss": 0.012, "step": 2562 }, { "clip_ratio": 0.0008436891948804259, "epoch": 0.007115530902448098, "grad_norm": 0.09433847665786743, "kl": 0.28412140905857086, "learning_rate": 3e-06, "loss": 0.0109, "step": 2563 }, { "clip_ratio": 0.0004102203529328108, "epoch": 0.0071183071532879135, "grad_norm": 0.10943610221147537, "kl": 0.2760322690010071, "learning_rate": 3e-06, "loss": 0.0107, "step": 2564 }, { "clip_ratio": 0.0016632629558444023, "epoch": 0.00712108340412773, "grad_norm": 0.08785755187273026, "kl": 0.27000415325164795, "learning_rate": 3e-06, "loss": 0.0114, "step": 2565 }, { "clip_ratio": 0.00046117225429043174, "epoch": 0.007123859654967545, "grad_norm": 0.09270691871643066, "kl": 0.2478715479373932, "learning_rate": 3e-06, "loss": 0.0096, "step": 2566 }, { "clip_ratio": 0.0024565017665736377, "epoch": 0.0071266359058073615, "grad_norm": 0.0978165790438652, "kl": 0.24879343807697296, "learning_rate": 3e-06, "loss": 0.0101, "step": 2567 }, { "clip_ratio": 0.0014994074881542474, "epoch": 0.007129412156647178, "grad_norm": 0.08607335388660431, "kl": 0.2371087223291397, "learning_rate": 3e-06, "loss": 0.0094, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 205.3541717529297, "epoch": 0.007132188407486993, "grad_norm": 0.12269733846187592, "kl": 0.23388530313968658, "learning_rate": 3e-06, "loss": 0.0517, "reward": 0.2291666716337204, "reward_std": 0.2552594095468521, "rewards/countdown_reward_func": 0.2291666716337204, "step": 2569, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00019997564959339797, "epoch": 0.0071349646583268094, "grad_norm": 0.12135521322488785, "kl": 0.23965732008218765, "learning_rate": 3e-06, "loss": 0.0522, "step": 2570 }, { "clip_ratio": 0.0002170178049709648, "epoch": 0.007137740909166625, "grad_norm": 0.13130511343479156, "kl": 0.23329252004623413, "learning_rate": 3e-06, "loss": 0.052, "step": 2571 }, { "clip_ratio": 0.0003988340322393924, "epoch": 0.007140517160006441, "grad_norm": 0.10693640261888504, "kl": 0.22538745403289795, "learning_rate": 3e-06, "loss": 0.052, "step": 2572 }, { "clip_ratio": 0.0, "epoch": 0.0071432934108462565, "grad_norm": 0.12298305332660675, "kl": 0.22345633059740067, "learning_rate": 3e-06, "loss": 0.052, "step": 2573 }, { "clip_ratio": 0.0, "epoch": 0.007146069661686073, "grad_norm": 0.1192060336470604, "kl": 0.21971221268177032, "learning_rate": 3e-06, "loss": 0.0522, "step": 2574 }, { "clip_ratio": 0.00019737653201445937, "epoch": 0.007148845912525888, "grad_norm": 0.12879115343093872, "kl": 0.2192608341574669, "learning_rate": 3e-06, "loss": 0.0522, "step": 2575 }, { "clip_ratio": 0.00010620220564305782, "epoch": 0.0071516221633657045, "grad_norm": 0.12754960358142853, "kl": 0.23555022478103638, "learning_rate": 3e-06, "loss": 0.0511, "step": 2576 }, { "clip_ratio": 0.0, "epoch": 0.00715439841420552, "grad_norm": 0.12313434481620789, "kl": 0.23972846567630768, "learning_rate": 3e-06, "loss": 0.051, "step": 2577 }, { "clip_ratio": 0.0, "epoch": 0.007157174665045336, "grad_norm": 0.10354815423488617, "kl": 0.2370493784546852, "learning_rate": 3e-06, "loss": 0.0509, "step": 2578 }, { "clip_ratio": 0.00011150757927680388, "epoch": 0.0071599509158851524, "grad_norm": 0.13989798724651337, "kl": 0.2418346405029297, "learning_rate": 3e-06, "loss": 0.0497, "step": 2579 }, { "clip_ratio": 0.00020620101713575423, "epoch": 0.007162727166724968, "grad_norm": 0.1139150857925415, "kl": 0.24306735396385193, "learning_rate": 3e-06, "loss": 0.0502, "step": 2580 }, { "clip_ratio": 9.124087227974087e-05, "completion_length": 191.02084350585938, "epoch": 0.007165503417564784, "grad_norm": 0.1612713485956192, "kl": 0.2639460563659668, "learning_rate": 3e-06, "loss": 0.0002, "reward": 0.3583333492279053, "reward_std": 0.385948970913887, "rewards/countdown_reward_func": 0.3583333194255829, "step": 2581, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0071682796684045995, "grad_norm": 0.16606862843036652, "kl": 0.26460373401641846, "learning_rate": 3e-06, "loss": -0.0003, "step": 2582 }, { "clip_ratio": 0.00020177561964374036, "epoch": 0.007171055919244416, "grad_norm": 0.18275311589241028, "kl": 0.2770133316516876, "learning_rate": 3e-06, "loss": 0.0014, "step": 2583 }, { "clip_ratio": 0.00020531554764602333, "epoch": 0.007173832170084231, "grad_norm": 0.14581622183322906, "kl": 0.2609102353453636, "learning_rate": 3e-06, "loss": 0.0001, "step": 2584 }, { "clip_ratio": 0.0003089355304837227, "epoch": 0.0071766084209240475, "grad_norm": 0.13155725598335266, "kl": 0.2674725353717804, "learning_rate": 3e-06, "loss": 0.0007, "step": 2585 }, { "clip_ratio": 0.00011693171109072864, "epoch": 0.007179384671763863, "grad_norm": 0.18358992040157318, "kl": 0.2571055293083191, "learning_rate": 3e-06, "loss": -0.0005, "step": 2586 }, { "clip_ratio": 0.00010442773782415316, "epoch": 0.007182160922603679, "grad_norm": 0.16681167483329773, "kl": 0.2628980576992035, "learning_rate": 3e-06, "loss": -0.0017, "step": 2587 }, { "clip_ratio": 0.00011693171109072864, "epoch": 0.0071849371734434946, "grad_norm": 0.16647757589817047, "kl": 0.2551124542951584, "learning_rate": 3e-06, "loss": -0.0029, "step": 2588 }, { "clip_ratio": 0.00021701178047806025, "epoch": 0.007187713424283311, "grad_norm": 0.17945325374603271, "kl": 0.26160192489624023, "learning_rate": 3e-06, "loss": -0.0024, "step": 2589 }, { "clip_ratio": 0.00020450779993552715, "epoch": 0.007190489675123127, "grad_norm": 0.1473187804222107, "kl": 0.23798230290412903, "learning_rate": 3e-06, "loss": -0.0041, "step": 2590 }, { "clip_ratio": 0.0011224752670386806, "epoch": 0.0071932659259629425, "grad_norm": 0.1294354945421219, "kl": 0.24091629683971405, "learning_rate": 3e-06, "loss": -0.003, "step": 2591 }, { "clip_ratio": 0.0009036125265993178, "epoch": 0.007196042176802759, "grad_norm": 0.1550150364637375, "kl": 0.22850533574819565, "learning_rate": 3e-06, "loss": -0.0047, "step": 2592 }, { "clip_ratio": 0.00032973509951261804, "completion_length": 183.1041717529297, "epoch": 0.007198818427642574, "grad_norm": 0.09365466237068176, "kl": 0.20892590284347534, "learning_rate": 3e-06, "loss": 0.0092, "reward": 0.2666667029261589, "reward_std": 0.212855514138937, "rewards/countdown_reward_func": 0.2666666880249977, "step": 2593, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0014551513231708668, "epoch": 0.0072015946784823905, "grad_norm": 0.09161505848169327, "kl": 0.2047012448310852, "learning_rate": 3e-06, "loss": 0.0089, "step": 2594 }, { "clip_ratio": 0.0, "epoch": 0.007204370929322206, "grad_norm": 0.10514482855796814, "kl": 0.2093024104833603, "learning_rate": 3e-06, "loss": 0.0091, "step": 2595 }, { "clip_ratio": 0.00024611613480374217, "epoch": 0.007207147180162022, "grad_norm": 0.0946226418018341, "kl": 0.19274628162384033, "learning_rate": 3e-06, "loss": 0.0092, "step": 2596 }, { "clip_ratio": 0.0006722241232637316, "epoch": 0.0072099234310018375, "grad_norm": 0.09460046887397766, "kl": 0.1927240490913391, "learning_rate": 3e-06, "loss": 0.0086, "step": 2597 }, { "clip_ratio": 0.0007297748234122992, "epoch": 0.007212699681841654, "grad_norm": 0.10224200040102005, "kl": 0.19376622140407562, "learning_rate": 3e-06, "loss": 0.01, "step": 2598 }, { "clip_ratio": 0.00011101243580924347, "epoch": 0.007215475932681469, "grad_norm": 0.09554404020309448, "kl": 0.18605971336364746, "learning_rate": 3e-06, "loss": 0.0076, "step": 2599 }, { "clip_ratio": 0.0007081393487169407, "epoch": 0.0072182521835212855, "grad_norm": 0.09325699508190155, "kl": 0.1811002716422081, "learning_rate": 3e-06, "loss": 0.0074, "step": 2600 }, { "clip_ratio": 0.0, "epoch": 0.007221028434361102, "grad_norm": 0.10484436899423599, "kl": 0.18742184340953827, "learning_rate": 3e-06, "loss": 0.0072, "step": 2601 }, { "clip_ratio": 0.00046079371531959623, "epoch": 0.007223804685200917, "grad_norm": 0.0922909826040268, "kl": 0.17233332991600037, "learning_rate": 3e-06, "loss": 0.0078, "step": 2602 }, { "clip_ratio": 0.0018095963750965893, "epoch": 0.0072265809360407334, "grad_norm": 0.09665399044752121, "kl": 0.17307832092046738, "learning_rate": 3e-06, "loss": 0.007, "step": 2603 }, { "clip_ratio": 0.000654152907372918, "epoch": 0.007229357186880549, "grad_norm": 0.11435188353061676, "kl": 0.17587772011756897, "learning_rate": 3e-06, "loss": 0.009, "step": 2604 }, { "clip_ratio": 0.00017694043344818056, "completion_length": 227.33333587646484, "epoch": 0.007232133437720365, "grad_norm": 0.06461416184902191, "kl": 0.1614966094493866, "learning_rate": 3e-06, "loss": 0.0168, "reward": 0.34166666120290756, "reward_std": 0.18291139230132103, "rewards/countdown_reward_func": 0.34166666120290756, "step": 2605, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0072349096885601805, "grad_norm": 0.07481678575277328, "kl": 0.15756113082170486, "learning_rate": 3e-06, "loss": 0.0162, "step": 2606 }, { "clip_ratio": 0.0009198969055432826, "epoch": 0.007237685939399997, "grad_norm": 0.08776765316724777, "kl": 0.15366152673959732, "learning_rate": 3e-06, "loss": 0.0171, "step": 2607 }, { "clip_ratio": 0.0, "epoch": 0.007240462190239812, "grad_norm": 0.08613624423742294, "kl": 0.15748640894889832, "learning_rate": 3e-06, "loss": 0.0164, "step": 2608 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0072432384410796285, "grad_norm": 0.08208346366882324, "kl": 0.14786392450332642, "learning_rate": 3e-06, "loss": 0.0159, "step": 2609 }, { "clip_ratio": 0.0008111550123430789, "epoch": 0.007246014691919444, "grad_norm": 0.11629568040370941, "kl": 0.1543085277080536, "learning_rate": 3e-06, "loss": 0.016, "step": 2610 }, { "clip_ratio": 8.934953802963719e-05, "epoch": 0.00724879094275926, "grad_norm": 0.06528989225625992, "kl": 0.1493528038263321, "learning_rate": 3e-06, "loss": 0.0168, "step": 2611 }, { "clip_ratio": 8.934953802963719e-05, "epoch": 0.0072515671935990764, "grad_norm": 0.07893183827400208, "kl": 0.1448308825492859, "learning_rate": 3e-06, "loss": 0.015, "step": 2612 }, { "clip_ratio": 0.0007538200006820261, "epoch": 0.007254343444438892, "grad_norm": 0.10118536651134491, "kl": 0.14246216416358948, "learning_rate": 3e-06, "loss": 0.0164, "step": 2613 }, { "clip_ratio": 0.00016356298874597996, "epoch": 0.007257119695278708, "grad_norm": 0.07493187487125397, "kl": 0.14743702113628387, "learning_rate": 3e-06, "loss": 0.0158, "step": 2614 }, { "clip_ratio": 0.00035080187080893666, "epoch": 0.0072598959461185235, "grad_norm": 0.11228254437446594, "kl": 0.1367715448141098, "learning_rate": 3e-06, "loss": 0.0154, "step": 2615 }, { "clip_ratio": 8.549931953893974e-05, "epoch": 0.00726267219695834, "grad_norm": 0.10421894490718842, "kl": 0.14218785613775253, "learning_rate": 3e-06, "loss": 0.0143, "step": 2616 }, { "clip_ratio": 0.0001706022012513131, "completion_length": 224.95834350585938, "epoch": 0.007265448447798155, "grad_norm": 0.11332646012306213, "kl": 0.14102116227149963, "learning_rate": 3e-06, "loss": 0.0075, "reward": 0.32500000298023224, "reward_std": 0.32423485815525055, "rewards/countdown_reward_func": 0.32499998807907104, "step": 2617, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0001959252404049039, "epoch": 0.0072682246986379715, "grad_norm": 0.11970315128564835, "kl": 0.13919668644666672, "learning_rate": 3e-06, "loss": 0.0087, "step": 2618 }, { "clip_ratio": 0.0002867654329747893, "epoch": 0.007271000949477787, "grad_norm": 0.10386362671852112, "kl": 0.14643407613039017, "learning_rate": 3e-06, "loss": 0.0081, "step": 2619 }, { "clip_ratio": 8.922198321670294e-05, "epoch": 0.007273777200317603, "grad_norm": 0.08748464286327362, "kl": 0.14277955889701843, "learning_rate": 3e-06, "loss": 0.008, "step": 2620 }, { "clip_ratio": 0.00018840076518245041, "epoch": 0.0072765534511574186, "grad_norm": 0.0853482037782669, "kl": 0.1375402882695198, "learning_rate": 3e-06, "loss": 0.0089, "step": 2621 }, { "clip_ratio": 0.0002733351939241402, "epoch": 0.007279329701997235, "grad_norm": 0.08296167850494385, "kl": 0.13316894322633743, "learning_rate": 3e-06, "loss": 0.0084, "step": 2622 }, { "clip_ratio": 0.0002673486596904695, "epoch": 0.007282105952837051, "grad_norm": 0.11611305177211761, "kl": 0.1321260631084442, "learning_rate": 3e-06, "loss": 0.0071, "step": 2623 }, { "clip_ratio": 0.0002667140797711909, "epoch": 0.0072848822036768665, "grad_norm": 0.13228337466716766, "kl": 0.13071859627962112, "learning_rate": 3e-06, "loss": 0.0075, "step": 2624 }, { "clip_ratio": 0.00019620567763922736, "epoch": 0.007287658454516683, "grad_norm": 0.09613224118947983, "kl": 0.1390385627746582, "learning_rate": 3e-06, "loss": 0.0063, "step": 2625 }, { "clip_ratio": 0.00019227227312512696, "epoch": 0.007290434705356498, "grad_norm": 0.09107892215251923, "kl": 0.13541456311941147, "learning_rate": 3e-06, "loss": 0.0072, "step": 2626 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0072932109561963145, "grad_norm": 0.08553344756364822, "kl": 0.13120842725038528, "learning_rate": 3e-06, "loss": 0.0078, "step": 2627 }, { "clip_ratio": 0.00018443050066707656, "epoch": 0.00729598720703613, "grad_norm": 0.08421790599822998, "kl": 0.12758233398199081, "learning_rate": 3e-06, "loss": 0.0071, "step": 2628 }, { "clip_ratio": 0.0006468229839811102, "completion_length": 211.89584350585938, "epoch": 0.007298763457875946, "grad_norm": 0.06689060479402542, "kl": 0.13051141053438187, "learning_rate": 3e-06, "loss": 0.0195, "reward": 0.3020833283662796, "reward_std": 0.21466520056128502, "rewards/countdown_reward_func": 0.302083320915699, "step": 2629, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00010154346091439947, "epoch": 0.0073015397087157615, "grad_norm": 0.09763406217098236, "kl": 0.13490428030490875, "learning_rate": 3e-06, "loss": 0.019, "step": 2630 }, { "clip_ratio": 0.00018103061302099377, "epoch": 0.007304315959555578, "grad_norm": 0.08729059994220734, "kl": 0.1274981126189232, "learning_rate": 3e-06, "loss": 0.0187, "step": 2631 }, { "clip_ratio": 0.00020658547873608768, "epoch": 0.007307092210395393, "grad_norm": 0.08206357806921005, "kl": 0.1304761990904808, "learning_rate": 3e-06, "loss": 0.0195, "step": 2632 }, { "clip_ratio": 0.00025294802617281675, "epoch": 0.0073098684612352095, "grad_norm": 0.10291749238967896, "kl": 0.1305009052157402, "learning_rate": 3e-06, "loss": 0.0191, "step": 2633 }, { "clip_ratio": 0.0003607503604143858, "epoch": 0.007312644712075026, "grad_norm": 0.08413831144571304, "kl": 0.1341697722673416, "learning_rate": 3e-06, "loss": 0.0194, "step": 2634 }, { "clip_ratio": 0.0002766098186839372, "epoch": 0.007315420962914841, "grad_norm": 0.06767857819795609, "kl": 0.13260185718536377, "learning_rate": 3e-06, "loss": 0.0189, "step": 2635 }, { "clip_ratio": 0.0, "epoch": 0.0073181972137546574, "grad_norm": 0.10423475503921509, "kl": 0.13295477628707886, "learning_rate": 3e-06, "loss": 0.0182, "step": 2636 }, { "clip_ratio": 9.018759010359645e-05, "epoch": 0.007320973464594473, "grad_norm": 0.08689744025468826, "kl": 0.12721313536167145, "learning_rate": 3e-06, "loss": 0.0183, "step": 2637 }, { "clip_ratio": 0.0, "epoch": 0.007323749715434289, "grad_norm": 0.09273549169301987, "kl": 0.1307196170091629, "learning_rate": 3e-06, "loss": 0.0188, "step": 2638 }, { "clip_ratio": 0.00030163175688358024, "epoch": 0.0073265259662741045, "grad_norm": 0.09733311831951141, "kl": 0.13079597055912018, "learning_rate": 3e-06, "loss": 0.0178, "step": 2639 }, { "clip_ratio": 9.084302291739732e-05, "epoch": 0.007329302217113921, "grad_norm": 0.0765179768204689, "kl": 0.13601050525903702, "learning_rate": 3e-06, "loss": 0.0186, "step": 2640 }, { "clip_ratio": 8.406186680076644e-05, "completion_length": 231.6666717529297, "epoch": 0.007332078467953736, "grad_norm": 0.10443374514579773, "kl": 0.11643990129232407, "learning_rate": 3e-06, "loss": 0.0078, "reward": 0.4166666716337204, "reward_std": 0.37129178643226624, "rewards/countdown_reward_func": 0.4166666716337204, "step": 2641, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00019968050764873624, "epoch": 0.0073348547187935525, "grad_norm": 0.11376018822193146, "kl": 0.1193588376045227, "learning_rate": 3e-06, "loss": 0.0087, "step": 2642 }, { "clip_ratio": 0.0005163489258848131, "epoch": 0.007337630969633368, "grad_norm": 0.11981891840696335, "kl": 0.12199830263853073, "learning_rate": 3e-06, "loss": 0.0081, "step": 2643 }, { "clip_ratio": 0.0, "epoch": 0.007340407220473184, "grad_norm": 0.10817337036132812, "kl": 0.123886588960886, "learning_rate": 3e-06, "loss": 0.0082, "step": 2644 }, { "clip_ratio": 0.0002677346928976476, "epoch": 0.0073431834713130004, "grad_norm": 0.11692439764738083, "kl": 0.12618417292833328, "learning_rate": 3e-06, "loss": 0.0075, "step": 2645 }, { "clip_ratio": 0.00033624746720306575, "epoch": 0.007345959722152816, "grad_norm": 0.12890896201133728, "kl": 0.12341205030679703, "learning_rate": 3e-06, "loss": 0.0085, "step": 2646 }, { "clip_ratio": 8.406186680076644e-05, "epoch": 0.007348735972992632, "grad_norm": 0.11652335524559021, "kl": 0.11960984021425247, "learning_rate": 3e-06, "loss": 0.0068, "step": 2647 }, { "clip_ratio": 0.0, "epoch": 0.0073515122238324475, "grad_norm": 0.13827817142009735, "kl": 0.12161833047866821, "learning_rate": 3e-06, "loss": 0.0074, "step": 2648 }, { "clip_ratio": 8.922198321670294e-05, "epoch": 0.007354288474672264, "grad_norm": 0.1181538924574852, "kl": 0.12784672155976295, "learning_rate": 3e-06, "loss": 0.0065, "step": 2649 }, { "clip_ratio": 0.0001782710460247472, "epoch": 0.007357064725512079, "grad_norm": 0.10128634423017502, "kl": 0.12955759465694427, "learning_rate": 3e-06, "loss": 0.0065, "step": 2650 }, { "clip_ratio": 0.00018164653738494962, "epoch": 0.0073598409763518955, "grad_norm": 0.11900784820318222, "kl": 0.13159554451704025, "learning_rate": 3e-06, "loss": 0.0071, "step": 2651 }, { "clip_ratio": 0.0005095313244964927, "epoch": 0.007362617227191711, "grad_norm": 0.1258239597082138, "kl": 0.13032276183366776, "learning_rate": 3e-06, "loss": 0.0062, "step": 2652 }, { "clip_ratio": 0.0001973169855773449, "completion_length": 213.5, "epoch": 0.007365393478031527, "grad_norm": 0.11249188333749771, "kl": 0.13574101775884628, "learning_rate": 3e-06, "loss": 0.0199, "reward": 0.30416668951511383, "reward_std": 0.324219211935997, "rewards/countdown_reward_func": 0.30416667461395264, "step": 2653, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0007071296859066933, "epoch": 0.0073681697288713426, "grad_norm": 0.1471174657344818, "kl": 0.13413872569799423, "learning_rate": 3e-06, "loss": 0.0199, "step": 2654 }, { "clip_ratio": 8.526603050995618e-05, "epoch": 0.007370945979711159, "grad_norm": 0.12275787442922592, "kl": 0.13440951704978943, "learning_rate": 3e-06, "loss": 0.0192, "step": 2655 }, { "clip_ratio": 0.00010729613859439269, "epoch": 0.007373722230550975, "grad_norm": 0.09506183862686157, "kl": 0.13459663838148117, "learning_rate": 3e-06, "loss": 0.0195, "step": 2656 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0073764984813907905, "grad_norm": 0.1005769893527031, "kl": 0.13220536708831787, "learning_rate": 3e-06, "loss": 0.0198, "step": 2657 }, { "clip_ratio": 0.0003345241057104431, "epoch": 0.007379274732230607, "grad_norm": 0.0869453102350235, "kl": 0.14396630227565765, "learning_rate": 3e-06, "loss": 0.0201, "step": 2658 }, { "clip_ratio": 0.0005324306257534772, "epoch": 0.007382050983070422, "grad_norm": 0.102933868765831, "kl": 0.14012698829174042, "learning_rate": 3e-06, "loss": 0.0198, "step": 2659 }, { "clip_ratio": 0.00019561815133783966, "epoch": 0.0073848272339102385, "grad_norm": 0.0869792029261589, "kl": 0.1389094591140747, "learning_rate": 3e-06, "loss": 0.0194, "step": 2660 }, { "clip_ratio": 8.526603050995618e-05, "epoch": 0.007387603484750054, "grad_norm": 0.09716138243675232, "kl": 0.13817085325717926, "learning_rate": 3e-06, "loss": 0.019, "step": 2661 }, { "clip_ratio": 0.00029389126575551927, "epoch": 0.00739037973558987, "grad_norm": 0.13167338073253632, "kl": 0.1382356584072113, "learning_rate": 3e-06, "loss": 0.0195, "step": 2662 }, { "clip_ratio": 0.00036784904659725726, "epoch": 0.0073931559864296855, "grad_norm": 0.09182232618331909, "kl": 0.13654747605323792, "learning_rate": 3e-06, "loss": 0.0191, "step": 2663 }, { "clip_ratio": 0.00018659512716112658, "epoch": 0.007395932237269502, "grad_norm": 0.0909401997923851, "kl": 0.14828573167324066, "learning_rate": 3e-06, "loss": 0.0196, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 229.0625, "epoch": 0.007398708488109317, "grad_norm": 0.10837903618812561, "kl": 0.1430712789297104, "learning_rate": 3e-06, "loss": 0.0003, "reward": 0.40000003576278687, "reward_std": 0.3556165546178818, "rewards/countdown_reward_func": 0.3999999910593033, "step": 2665, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.0074014847389491335, "grad_norm": 0.1095694825053215, "kl": 0.14318571984767914, "learning_rate": 3e-06, "loss": 0.0008, "step": 2666 }, { "clip_ratio": 0.0, "epoch": 0.00740426098978895, "grad_norm": 0.16832958161830902, "kl": 0.13511797040700912, "learning_rate": 3e-06, "loss": -0.0003, "step": 2667 }, { "clip_ratio": 0.0, "epoch": 0.007407037240628765, "grad_norm": 0.11594914644956589, "kl": 0.1365041732788086, "learning_rate": 3e-06, "loss": 0.0011, "step": 2668 }, { "clip_ratio": 0.0004090314032509923, "epoch": 0.0074098134914685814, "grad_norm": 0.12862901389598846, "kl": 0.14102691411972046, "learning_rate": 3e-06, "loss": -0.0006, "step": 2669 }, { "clip_ratio": 0.000163612567121163, "epoch": 0.007412589742308397, "grad_norm": 0.10756373405456543, "kl": 0.13622543588280678, "learning_rate": 3e-06, "loss": 0.0006, "step": 2670 }, { "clip_ratio": 0.00011606313637457788, "epoch": 0.007415365993148213, "grad_norm": 0.12956829369068146, "kl": 0.1438135728240013, "learning_rate": 3e-06, "loss": -0.0004, "step": 2671 }, { "clip_ratio": 0.0, "epoch": 0.0074181422439880285, "grad_norm": 0.11213965713977814, "kl": 0.14489904791116714, "learning_rate": 3e-06, "loss": -0.0002, "step": 2672 }, { "clip_ratio": 8.704735228093341e-05, "epoch": 0.007420918494827845, "grad_norm": 0.16151954233646393, "kl": 0.136183962225914, "learning_rate": 3e-06, "loss": -0.0009, "step": 2673 }, { "clip_ratio": 0.00016842756303958595, "epoch": 0.00742369474566766, "grad_norm": 0.12114012241363525, "kl": 0.13692208379507065, "learning_rate": 3e-06, "loss": 0.0001, "step": 2674 }, { "clip_ratio": 8.704735228093341e-05, "epoch": 0.0074264709965074765, "grad_norm": 0.10968311131000519, "kl": 0.14131344109773636, "learning_rate": 3e-06, "loss": -0.0013, "step": 2675 }, { "clip_ratio": 0.00018007117614615709, "epoch": 0.007429247247347293, "grad_norm": 0.10273008048534393, "kl": 0.1344214528799057, "learning_rate": 3e-06, "loss": -0.0008, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 209.625, "epoch": 0.007432023498187108, "grad_norm": 0.11194518208503723, "kl": 0.1327531933784485, "learning_rate": 3e-06, "loss": 0.0254, "reward": 0.39791668951511383, "reward_std": 0.22883932292461395, "rewards/countdown_reward_func": 0.39791668206453323, "step": 2677, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00017662087338976562, "epoch": 0.0074347997490269244, "grad_norm": 0.09298588335514069, "kl": 0.13464464992284775, "learning_rate": 3e-06, "loss": 0.0262, "step": 2678 }, { "clip_ratio": 0.0002646436041686684, "epoch": 0.00743757599986674, "grad_norm": 0.0982695147395134, "kl": 0.1280595250427723, "learning_rate": 3e-06, "loss": 0.0262, "step": 2679 }, { "clip_ratio": 0.0006846775067970157, "epoch": 0.007440352250706556, "grad_norm": 0.08643513172864914, "kl": 0.1383785605430603, "learning_rate": 3e-06, "loss": 0.0278, "step": 2680 }, { "clip_ratio": 0.0003382949798833579, "epoch": 0.0074431285015463715, "grad_norm": 0.09550473839044571, "kl": 0.1345135197043419, "learning_rate": 3e-06, "loss": 0.0266, "step": 2681 }, { "clip_ratio": 0.00018026166799245402, "epoch": 0.007445904752386188, "grad_norm": 0.08801712840795517, "kl": 0.13855824619531631, "learning_rate": 3e-06, "loss": 0.0268, "step": 2682 }, { "clip_ratio": 8.821453957352787e-05, "epoch": 0.007448681003226003, "grad_norm": 0.09090833365917206, "kl": 0.13326393067836761, "learning_rate": 3e-06, "loss": 0.0256, "step": 2683 }, { "clip_ratio": 0.00025372125674039125, "epoch": 0.0074514572540658195, "grad_norm": 0.08877915143966675, "kl": 0.1354650855064392, "learning_rate": 3e-06, "loss": 0.0246, "step": 2684 }, { "clip_ratio": 9.204712841892615e-05, "epoch": 0.007454233504905635, "grad_norm": 0.09235383570194244, "kl": 0.1299561709165573, "learning_rate": 3e-06, "loss": 0.0255, "step": 2685 }, { "clip_ratio": 0.00036580205778591335, "epoch": 0.007457009755745451, "grad_norm": 0.0795152336359024, "kl": 0.14115244150161743, "learning_rate": 3e-06, "loss": 0.0265, "step": 2686 }, { "clip_ratio": 0.0, "epoch": 0.007459786006585267, "grad_norm": 0.09334254264831543, "kl": 0.13625407218933105, "learning_rate": 3e-06, "loss": 0.0257, "step": 2687 }, { "clip_ratio": 8.821453957352787e-05, "epoch": 0.007462562257425083, "grad_norm": 0.09275975078344345, "kl": 0.14334241300821304, "learning_rate": 3e-06, "loss": 0.0256, "step": 2688 }, { "clip_ratio": 0.000366316111467313, "completion_length": 216.4166717529297, "epoch": 0.007465338508264899, "grad_norm": 0.1422148197889328, "kl": 0.15680401027202606, "learning_rate": 3e-06, "loss": 0.0488, "reward": 0.36250001192092896, "reward_std": 0.38232962787151337, "rewards/countdown_reward_func": 0.36250001192092896, "step": 2689, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0005639097653329372, "epoch": 0.0074681147591047145, "grad_norm": 0.14207009971141815, "kl": 0.14677118510007858, "learning_rate": 3e-06, "loss": 0.0485, "step": 2690 }, { "clip_ratio": 0.0003494874108582735, "epoch": 0.007470891009944531, "grad_norm": 0.13019144535064697, "kl": 0.15386850386857986, "learning_rate": 3e-06, "loss": 0.0472, "step": 2691 }, { "clip_ratio": 0.0, "epoch": 0.007473667260784346, "grad_norm": 0.13476598262786865, "kl": 0.1664591133594513, "learning_rate": 3e-06, "loss": 0.0484, "step": 2692 }, { "clip_ratio": 0.0, "epoch": 0.0074764435116241625, "grad_norm": 0.1329074651002884, "kl": 0.16190333664417267, "learning_rate": 3e-06, "loss": 0.0471, "step": 2693 }, { "clip_ratio": 0.00020629465871024877, "epoch": 0.007479219762463978, "grad_norm": 0.13387206196784973, "kl": 0.17932642251253128, "learning_rate": 3e-06, "loss": 0.0476, "step": 2694 }, { "clip_ratio": 9.077705180970952e-05, "epoch": 0.007481996013303794, "grad_norm": 0.13275492191314697, "kl": 0.17982840538024902, "learning_rate": 3e-06, "loss": 0.0463, "step": 2695 }, { "clip_ratio": 0.0011561524588614702, "epoch": 0.0074847722641436095, "grad_norm": 0.14639776945114136, "kl": 0.16803783923387527, "learning_rate": 3e-06, "loss": 0.0459, "step": 2696 }, { "clip_ratio": 0.00046598323388025165, "epoch": 0.007487548514983426, "grad_norm": 0.12556995451450348, "kl": 0.1819344237446785, "learning_rate": 3e-06, "loss": 0.0438, "step": 2697 }, { "clip_ratio": 9.07111752894707e-05, "epoch": 0.007490324765823242, "grad_norm": 0.11433063447475433, "kl": 0.199430912733078, "learning_rate": 3e-06, "loss": 0.0441, "step": 2698 }, { "clip_ratio": 0.0004036689642816782, "epoch": 0.0074931010166630575, "grad_norm": 0.12387590855360031, "kl": 0.19225472211837769, "learning_rate": 3e-06, "loss": 0.0445, "step": 2699 }, { "clip_ratio": 0.0013130483275745064, "epoch": 0.007495877267502874, "grad_norm": 0.12008412182331085, "kl": 0.21636594831943512, "learning_rate": 3e-06, "loss": 0.044, "step": 2700 }, { "clip_ratio": 0.00043265242129564285, "completion_length": 207.20833587646484, "epoch": 0.007498653518342689, "grad_norm": 0.08580924570560455, "kl": 0.23036500066518784, "learning_rate": 3e-06, "loss": 0.0178, "reward": 0.2645833492279053, "reward_std": 0.26475247740745544, "rewards/countdown_reward_func": 0.2645833343267441, "step": 2701, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.0075014297691825054, "grad_norm": 0.09006589651107788, "kl": 0.2420239821076393, "learning_rate": 3e-06, "loss": 0.018, "step": 2702 }, { "clip_ratio": 0.00018698109488468617, "epoch": 0.007504206020022321, "grad_norm": 0.09262547641992569, "kl": 0.23500695824623108, "learning_rate": 3e-06, "loss": 0.0174, "step": 2703 }, { "clip_ratio": 0.00010469011613167822, "epoch": 0.007506982270862137, "grad_norm": 0.093314528465271, "kl": 0.23572850972414017, "learning_rate": 3e-06, "loss": 0.0177, "step": 2704 }, { "clip_ratio": 0.00010955302423099056, "epoch": 0.0075097585217019525, "grad_norm": 0.09187373518943787, "kl": 0.24988384544849396, "learning_rate": 3e-06, "loss": 0.017, "step": 2705 }, { "clip_ratio": 0.00010469011613167822, "epoch": 0.007512534772541769, "grad_norm": 0.0862668827176094, "kl": 0.2375548779964447, "learning_rate": 3e-06, "loss": 0.0177, "step": 2706 }, { "clip_ratio": 0.0, "epoch": 0.007515311023381584, "grad_norm": 0.08571210503578186, "kl": 0.2631724327802658, "learning_rate": 3e-06, "loss": 0.0167, "step": 2707 }, { "clip_ratio": 0.0004019632178824395, "epoch": 0.0075180872742214005, "grad_norm": 0.09579290449619293, "kl": 0.2698695957660675, "learning_rate": 3e-06, "loss": 0.0173, "step": 2708 }, { "clip_ratio": 0.0004979280784027651, "epoch": 0.007520863525061217, "grad_norm": 0.08481708914041519, "kl": 0.2539065480232239, "learning_rate": 3e-06, "loss": 0.0169, "step": 2709 }, { "clip_ratio": 0.0003165066664223559, "epoch": 0.007523639775901032, "grad_norm": 0.0984940454363823, "kl": 0.25314465165138245, "learning_rate": 3e-06, "loss": 0.0172, "step": 2710 }, { "clip_ratio": 0.0, "epoch": 0.007526416026740848, "grad_norm": 0.09613295644521713, "kl": 0.2644559442996979, "learning_rate": 3e-06, "loss": 0.0167, "step": 2711 }, { "clip_ratio": 0.00030389922903850675, "epoch": 0.007529192277580664, "grad_norm": 0.08467640727758408, "kl": 0.25008492171764374, "learning_rate": 3e-06, "loss": 0.0174, "step": 2712 }, { "clip_ratio": 0.0002598752616904676, "completion_length": 196.5416717529297, "epoch": 0.00753196852842048, "grad_norm": 0.12258782982826233, "kl": 0.24638450890779495, "learning_rate": 3e-06, "loss": 0.041, "reward": 0.45625004172325134, "reward_std": 0.33640168607234955, "rewards/countdown_reward_func": 0.45624999701976776, "step": 2713, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0005197505233809352, "epoch": 0.0075347447792602955, "grad_norm": 0.1461792290210724, "kl": 0.2639908641576767, "learning_rate": 3e-06, "loss": 0.0433, "step": 2714 }, { "clip_ratio": 0.0, "epoch": 0.007537521030100112, "grad_norm": 0.11039087176322937, "kl": 0.253682978451252, "learning_rate": 3e-06, "loss": 0.041, "step": 2715 }, { "clip_ratio": 0.0004924401509924792, "epoch": 0.007540297280939927, "grad_norm": 0.10208743065595627, "kl": 0.28054000437259674, "learning_rate": 3e-06, "loss": 0.0432, "step": 2716 }, { "clip_ratio": 0.00010262725845677778, "epoch": 0.0075430735317797435, "grad_norm": 0.09507179260253906, "kl": 0.2802576571702957, "learning_rate": 3e-06, "loss": 0.0414, "step": 2717 }, { "clip_ratio": 0.0002557980769779533, "epoch": 0.007545849782619559, "grad_norm": 0.13740640878677368, "kl": 0.26479241251945496, "learning_rate": 3e-06, "loss": 0.041, "step": 2718 }, { "clip_ratio": 0.0002598752616904676, "epoch": 0.007548626033459375, "grad_norm": 0.10834936797618866, "kl": 0.2636203467845917, "learning_rate": 3e-06, "loss": 0.0394, "step": 2719 }, { "clip_ratio": 0.00037032226100564003, "epoch": 0.007551402284299191, "grad_norm": 0.14878171682357788, "kl": 0.2852846682071686, "learning_rate": 3e-06, "loss": 0.0407, "step": 2720 }, { "clip_ratio": 0.0, "epoch": 0.007554178535139007, "grad_norm": 0.09853127598762512, "kl": 0.2804602384567261, "learning_rate": 3e-06, "loss": 0.0394, "step": 2721 }, { "clip_ratio": 0.0003178671468049288, "epoch": 0.007556954785978823, "grad_norm": 0.10325701534748077, "kl": 0.312502384185791, "learning_rate": 3e-06, "loss": 0.0414, "step": 2722 }, { "clip_ratio": 0.0002598752616904676, "epoch": 0.0075597310368186385, "grad_norm": 0.0892425999045372, "kl": 0.3129449635744095, "learning_rate": 3e-06, "loss": 0.0399, "step": 2723 }, { "clip_ratio": 0.0006137051532277837, "epoch": 0.007562507287658455, "grad_norm": 0.1205538660287857, "kl": 0.2968228608369827, "learning_rate": 3e-06, "loss": 0.039, "step": 2724 }, { "clip_ratio": 9.607993706595153e-05, "completion_length": 200.14584350585938, "epoch": 0.00756528353849827, "grad_norm": 0.12288663536310196, "kl": 0.3106507509946823, "learning_rate": 3e-06, "loss": 0.0439, "reward": 0.34166669845581055, "reward_std": 0.32904504239559174, "rewards/countdown_reward_func": 0.34166669845581055, "step": 2725, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0002655807475093752, "epoch": 0.0075680597893380865, "grad_norm": 0.18839959800243378, "kl": 0.3162599802017212, "learning_rate": 3e-06, "loss": 0.0435, "step": 2726 }, { "clip_ratio": 8.852691098582e-05, "epoch": 0.007570836040177902, "grad_norm": 0.12424420565366745, "kl": 0.32132910192012787, "learning_rate": 3e-06, "loss": 0.0439, "step": 2727 }, { "clip_ratio": 0.00010513036249903962, "epoch": 0.007573612291017718, "grad_norm": 0.11105859279632568, "kl": 0.3221927136182785, "learning_rate": 3e-06, "loss": 0.0436, "step": 2728 }, { "clip_ratio": 0.0, "epoch": 0.0075763885418575335, "grad_norm": 0.11852031201124191, "kl": 0.3326174020767212, "learning_rate": 3e-06, "loss": 0.0419, "step": 2729 }, { "clip_ratio": 0.0, "epoch": 0.00757916479269735, "grad_norm": 0.09567821025848389, "kl": 0.32843931019306183, "learning_rate": 3e-06, "loss": 0.0422, "step": 2730 }, { "clip_ratio": 0.0, "epoch": 0.007581941043537166, "grad_norm": 0.10402382165193558, "kl": 0.3619465082883835, "learning_rate": 3e-06, "loss": 0.0427, "step": 2731 }, { "clip_ratio": 8.852691098582e-05, "epoch": 0.0075847172943769815, "grad_norm": 0.2112588882446289, "kl": 0.3617081940174103, "learning_rate": 3e-06, "loss": 0.0426, "step": 2732 }, { "clip_ratio": 0.0004136313946219161, "epoch": 0.007587493545216798, "grad_norm": 0.11296899616718292, "kl": 0.3730858713388443, "learning_rate": 3e-06, "loss": 0.0422, "step": 2733 }, { "clip_ratio": 0.0003087134100496769, "epoch": 0.007590269796056613, "grad_norm": 0.0867656022310257, "kl": 0.37249356508255005, "learning_rate": 3e-06, "loss": 0.0422, "step": 2734 }, { "clip_ratio": 0.00020121029956499115, "epoch": 0.0075930460468964294, "grad_norm": 0.1334255188703537, "kl": 0.3828137516975403, "learning_rate": 3e-06, "loss": 0.0408, "step": 2735 }, { "clip_ratio": 0.000984879763564095, "epoch": 0.007595822297736245, "grad_norm": 0.09838055819272995, "kl": 0.3796308636665344, "learning_rate": 3e-06, "loss": 0.0409, "step": 2736 }, { "clip_ratio": 0.00027322774985805154, "completion_length": 206.64583587646484, "epoch": 0.007598598548576061, "grad_norm": 0.13828794658184052, "kl": 0.39359983801841736, "learning_rate": 3e-06, "loss": 0.0338, "reward": 0.3812500238418579, "reward_std": 0.37016279995441437, "rewards/countdown_reward_func": 0.3812499940395355, "step": 2737, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0076013747994158765, "grad_norm": 0.11819808930158615, "kl": 0.3848104625940323, "learning_rate": 3e-06, "loss": 0.0333, "step": 2738 }, { "clip_ratio": 8.85896515683271e-05, "epoch": 0.007604151050255693, "grad_norm": 0.18826451897621155, "kl": 0.3598520904779434, "learning_rate": 3e-06, "loss": 0.0323, "step": 2739 }, { "clip_ratio": 0.0003242621314711869, "epoch": 0.007606927301095508, "grad_norm": 0.12896966934204102, "kl": 0.3710465133190155, "learning_rate": 3e-06, "loss": 0.0316, "step": 2740 }, { "clip_ratio": 0.0001923076924867928, "epoch": 0.0076097035519353245, "grad_norm": 0.1429174840450287, "kl": 0.40041057765483856, "learning_rate": 3e-06, "loss": 0.0329, "step": 2741 }, { "clip_ratio": 0.0001270325155928731, "epoch": 0.007612479802775141, "grad_norm": 0.14603330194950104, "kl": 0.3678257316350937, "learning_rate": 3e-06, "loss": 0.0317, "step": 2742 }, { "clip_ratio": 0.0003692762111313641, "epoch": 0.007615256053614956, "grad_norm": 0.13155430555343628, "kl": 0.4124736040830612, "learning_rate": 3e-06, "loss": 0.0308, "step": 2743 }, { "clip_ratio": 0.0, "epoch": 0.007618032304454772, "grad_norm": 0.11471111327409744, "kl": 0.3989041596651077, "learning_rate": 3e-06, "loss": 0.0317, "step": 2744 }, { "clip_ratio": 8.85896515683271e-05, "epoch": 0.007620808555294588, "grad_norm": 0.119503915309906, "kl": 0.3668845444917679, "learning_rate": 3e-06, "loss": 0.0298, "step": 2745 }, { "clip_ratio": 0.00010611205652821809, "epoch": 0.007623584806134404, "grad_norm": 0.12154264003038406, "kl": 0.3718787133693695, "learning_rate": 3e-06, "loss": 0.0303, "step": 2746 }, { "clip_ratio": 9.231905278284103e-05, "epoch": 0.0076263610569742195, "grad_norm": 0.12204006314277649, "kl": 0.396651953458786, "learning_rate": 3e-06, "loss": 0.0306, "step": 2747 }, { "clip_ratio": 0.0, "epoch": 0.007629137307814036, "grad_norm": 0.12996286153793335, "kl": 0.3581894338130951, "learning_rate": 3e-06, "loss": 0.03, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 190.43750762939453, "epoch": 0.007631913558653851, "grad_norm": 0.12005726993083954, "kl": 0.41084906458854675, "learning_rate": 3e-06, "loss": 0.0231, "reward": 0.18958335369825363, "reward_std": 0.17232362926006317, "rewards/countdown_reward_func": 0.18958334624767303, "step": 2749, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00020120220142416656, "epoch": 0.0076346898094936675, "grad_norm": 0.1055522933602333, "kl": 0.390746608376503, "learning_rate": 3e-06, "loss": 0.0227, "step": 2750 }, { "clip_ratio": 0.00010729613859439269, "epoch": 0.007637466060333483, "grad_norm": 0.10504616051912308, "kl": 0.38577188551425934, "learning_rate": 3e-06, "loss": 0.0219, "step": 2751 }, { "clip_ratio": 0.0002194907865487039, "epoch": 0.007640242311173299, "grad_norm": 0.09394167363643646, "kl": 0.3656618595123291, "learning_rate": 3e-06, "loss": 0.0208, "step": 2752 }, { "clip_ratio": 0.000326786917867139, "epoch": 0.007643018562013115, "grad_norm": 0.12337076663970947, "kl": 0.3939744532108307, "learning_rate": 3e-06, "loss": 0.0229, "step": 2753 }, { "clip_ratio": 0.0, "epoch": 0.007645794812852931, "grad_norm": 0.0932365134358406, "kl": 0.37053388357162476, "learning_rate": 3e-06, "loss": 0.0225, "step": 2754 }, { "clip_ratio": 0.00035310734529048204, "epoch": 0.007648571063692747, "grad_norm": 0.17183081805706024, "kl": 0.3615852892398834, "learning_rate": 3e-06, "loss": 0.0206, "step": 2755 }, { "clip_ratio": 0.00030849833274260163, "epoch": 0.0076513473145325625, "grad_norm": 0.09622940421104431, "kl": 0.3416426181793213, "learning_rate": 3e-06, "loss": 0.0207, "step": 2756 }, { "clip_ratio": 0.00010729613859439269, "epoch": 0.007654123565372379, "grad_norm": 0.09610886871814728, "kl": 0.3311006873846054, "learning_rate": 3e-06, "loss": 0.0198, "step": 2757 }, { "clip_ratio": 0.00031364121241495013, "epoch": 0.007656899816212194, "grad_norm": 0.08967255055904388, "kl": 0.31152254343032837, "learning_rate": 3e-06, "loss": 0.0197, "step": 2758 }, { "clip_ratio": 0.00034151694853790104, "epoch": 0.0076596760670520105, "grad_norm": 0.11018448323011398, "kl": 0.33154296875, "learning_rate": 3e-06, "loss": 0.0209, "step": 2759 }, { "clip_ratio": 0.0005427191645139828, "epoch": 0.007662452317891826, "grad_norm": 0.07953980565071106, "kl": 0.3077828139066696, "learning_rate": 3e-06, "loss": 0.0203, "step": 2760 }, { "clip_ratio": 0.0005332651344360784, "completion_length": 212.4166717529297, "epoch": 0.007665228568731642, "grad_norm": 0.11768138408660889, "kl": 0.28622299432754517, "learning_rate": 3e-06, "loss": 0.0403, "reward": 0.260416679084301, "reward_std": 0.28296615928411484, "rewards/countdown_reward_func": 0.2604166641831398, "step": 2761, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00063204098114511, "epoch": 0.0076680048195714575, "grad_norm": 0.10940464586019516, "kl": 0.284646600484848, "learning_rate": 3e-06, "loss": 0.0407, "step": 2762 }, { "clip_ratio": 0.0004931568837491795, "epoch": 0.007670781070411274, "grad_norm": 0.11068257689476013, "kl": 0.2776888310909271, "learning_rate": 3e-06, "loss": 0.0408, "step": 2763 }, { "clip_ratio": 0.00038821947964606807, "epoch": 0.00767355732125109, "grad_norm": 0.1107616275548935, "kl": 0.2598488852381706, "learning_rate": 3e-06, "loss": 0.0395, "step": 2764 }, { "clip_ratio": 0.0, "epoch": 0.0076763335720909055, "grad_norm": 0.11520666629076004, "kl": 0.26463089883327484, "learning_rate": 3e-06, "loss": 0.0408, "step": 2765 }, { "clip_ratio": 0.0, "epoch": 0.007679109822930722, "grad_norm": 0.12201401591300964, "kl": 0.2705033868551254, "learning_rate": 3e-06, "loss": 0.0399, "step": 2766 }, { "clip_ratio": 0.00010584250412648544, "epoch": 0.007681886073770537, "grad_norm": 0.12182534486055374, "kl": 0.26080577075481415, "learning_rate": 3e-06, "loss": 0.0402, "step": 2767 }, { "clip_ratio": 0.0004992638278054073, "epoch": 0.0076846623246103534, "grad_norm": 0.11066868156194687, "kl": 0.26648372411727905, "learning_rate": 3e-06, "loss": 0.0405, "step": 2768 }, { "clip_ratio": 0.0007948013953864574, "epoch": 0.007687438575450169, "grad_norm": 0.10705214738845825, "kl": 0.2649771720170975, "learning_rate": 3e-06, "loss": 0.0412, "step": 2769 }, { "clip_ratio": 9.177679748972878e-05, "epoch": 0.007690214826289985, "grad_norm": 0.10800807178020477, "kl": 0.25178319960832596, "learning_rate": 3e-06, "loss": 0.038, "step": 2770 }, { "clip_ratio": 0.0005495353834703565, "epoch": 0.0076929910771298005, "grad_norm": 0.11956821382045746, "kl": 0.2587262988090515, "learning_rate": 3e-06, "loss": 0.0392, "step": 2771 }, { "clip_ratio": 0.0002615746489027515, "epoch": 0.007695767327969617, "grad_norm": 0.12674206495285034, "kl": 0.2708806246519089, "learning_rate": 3e-06, "loss": 0.0383, "step": 2772 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 218.7916717529297, "epoch": 0.007698543578809432, "grad_norm": 0.11207450181245804, "kl": 0.24436921626329422, "learning_rate": 3e-06, "loss": 0.0246, "reward": 0.32083334028720856, "reward_std": 0.33371348679065704, "rewards/countdown_reward_func": 0.32083334028720856, "step": 2773, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0005635402631014585, "epoch": 0.0077013198296492485, "grad_norm": 0.1130228191614151, "kl": 0.2564231976866722, "learning_rate": 3e-06, "loss": 0.0241, "step": 2774 }, { "clip_ratio": 9.630200656829402e-05, "epoch": 0.007704096080489065, "grad_norm": 0.12682946026325226, "kl": 0.2607926279306412, "learning_rate": 3e-06, "loss": 0.024, "step": 2775 }, { "clip_ratio": 0.000179726819624193, "epoch": 0.00770687233132888, "grad_norm": 0.0957360714673996, "kl": 0.24642202258110046, "learning_rate": 3e-06, "loss": 0.0223, "step": 2776 }, { "clip_ratio": 9.630200656829402e-05, "epoch": 0.007709648582168696, "grad_norm": 0.15559712052345276, "kl": 0.26118090003728867, "learning_rate": 3e-06, "loss": 0.0233, "step": 2777 }, { "clip_ratio": 0.0, "epoch": 0.007712424833008512, "grad_norm": 0.11560562252998352, "kl": 0.25564949214458466, "learning_rate": 3e-06, "loss": 0.0226, "step": 2778 }, { "clip_ratio": 0.0, "epoch": 0.007715201083848328, "grad_norm": 0.08487410098314285, "kl": 0.25750092417001724, "learning_rate": 3e-06, "loss": 0.0226, "step": 2779 }, { "clip_ratio": 0.0006449205102398992, "epoch": 0.0077179773346881435, "grad_norm": 0.10305937379598618, "kl": 0.2673904597759247, "learning_rate": 3e-06, "loss": 0.0224, "step": 2780 }, { "clip_ratio": 9.630200656829402e-05, "epoch": 0.00772075358552796, "grad_norm": 0.11479093134403229, "kl": 0.2699562609195709, "learning_rate": 3e-06, "loss": 0.0229, "step": 2781 }, { "clip_ratio": 0.00026959023671224713, "epoch": 0.007723529836367775, "grad_norm": 0.09482819586992264, "kl": 0.25509975850582123, "learning_rate": 3e-06, "loss": 0.021, "step": 2782 }, { "clip_ratio": 0.0005249693640507758, "epoch": 0.0077263060872075915, "grad_norm": 0.10830868035554886, "kl": 0.2682991921901703, "learning_rate": 3e-06, "loss": 0.0225, "step": 2783 }, { "clip_ratio": 9.697439963929355e-05, "epoch": 0.007729082338047407, "grad_norm": 0.11760209500789642, "kl": 0.26228195428848267, "learning_rate": 3e-06, "loss": 0.0216, "step": 2784 }, { "clip_ratio": 0.00010451504931552336, "completion_length": 204.68750762939453, "epoch": 0.007731858588887223, "grad_norm": 0.09573986381292343, "kl": 0.27570192515850067, "learning_rate": 3e-06, "loss": 0.0201, "reward": 0.21041666716337204, "reward_std": 0.17074457183480263, "rewards/countdown_reward_func": 0.21041666716337204, "step": 2785, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00010442773782415316, "epoch": 0.007734634839727039, "grad_norm": 0.08090373873710632, "kl": 0.28676992654800415, "learning_rate": 3e-06, "loss": 0.0209, "step": 2786 }, { "clip_ratio": 0.0007634111607330851, "epoch": 0.007737411090566855, "grad_norm": 0.08370450139045715, "kl": 0.2715526968240738, "learning_rate": 3e-06, "loss": 0.0203, "step": 2787 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.007740187341406671, "grad_norm": 0.0766986683011055, "kl": 0.2739642560482025, "learning_rate": 3e-06, "loss": 0.0209, "step": 2788 }, { "clip_ratio": 0.0, "epoch": 0.0077429635922464865, "grad_norm": 0.10582198947668076, "kl": 0.2793308198451996, "learning_rate": 3e-06, "loss": 0.0205, "step": 2789 }, { "clip_ratio": 0.0001977809879463166, "epoch": 0.007745739843086303, "grad_norm": 0.08606047183275223, "kl": 0.2722570300102234, "learning_rate": 3e-06, "loss": 0.0205, "step": 2790 }, { "clip_ratio": 0.0, "epoch": 0.007748516093926118, "grad_norm": 0.09805324673652649, "kl": 0.2643267661333084, "learning_rate": 3e-06, "loss": 0.0195, "step": 2791 }, { "clip_ratio": 0.0002088554756483063, "epoch": 0.0077512923447659345, "grad_norm": 0.08259531855583191, "kl": 0.269534707069397, "learning_rate": 3e-06, "loss": 0.0205, "step": 2792 }, { "clip_ratio": 0.0008513410502928309, "epoch": 0.00775406859560575, "grad_norm": 0.08229310065507889, "kl": 0.2518397718667984, "learning_rate": 3e-06, "loss": 0.0197, "step": 2793 }, { "clip_ratio": 0.0004233962681610137, "epoch": 0.007756844846445566, "grad_norm": 0.08889353275299072, "kl": 0.2523081451654434, "learning_rate": 3e-06, "loss": 0.0198, "step": 2794 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.0077596210972853815, "grad_norm": 0.10341796278953552, "kl": 0.2532654255628586, "learning_rate": 3e-06, "loss": 0.0198, "step": 2795 }, { "clip_ratio": 0.0003930517486878671, "epoch": 0.007762397348125198, "grad_norm": 0.08445213735103607, "kl": 0.24360372871160507, "learning_rate": 3e-06, "loss": 0.0188, "step": 2796 }, { "clip_ratio": 0.000313379627186805, "completion_length": 210.4166717529297, "epoch": 0.007765173598965014, "grad_norm": 0.14519153535366058, "kl": 0.23711299896240234, "learning_rate": 3e-06, "loss": 0.0103, "reward": 0.41458335518836975, "reward_std": 0.3498389720916748, "rewards/countdown_reward_func": 0.41458334028720856, "step": 2797, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.0077679498498048295, "grad_norm": 0.13637079298496246, "kl": 0.24053488671779633, "learning_rate": 3e-06, "loss": 0.0081, "step": 2798 }, { "clip_ratio": 0.0, "epoch": 0.007770726100644646, "grad_norm": 0.12805470824241638, "kl": 0.23336678743362427, "learning_rate": 3e-06, "loss": 0.009, "step": 2799 }, { "clip_ratio": 0.0, "epoch": 0.007773502351484461, "grad_norm": 0.1558586210012436, "kl": 0.21293286234140396, "learning_rate": 3e-06, "loss": 0.0083, "step": 2800 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.0077762786023242774, "grad_norm": 0.13409586250782013, "kl": 0.21367400884628296, "learning_rate": 3e-06, "loss": 0.0077, "step": 2801 }, { "clip_ratio": 0.0, "epoch": 0.007779054853164093, "grad_norm": 0.15513594448566437, "kl": 0.2097683995962143, "learning_rate": 3e-06, "loss": 0.007, "step": 2802 }, { "clip_ratio": 0.00032011767325457186, "epoch": 0.007781831104003909, "grad_norm": 0.1499176025390625, "kl": 0.19966993480920792, "learning_rate": 3e-06, "loss": 0.0062, "step": 2803 }, { "clip_ratio": 0.000732421875, "epoch": 0.0077846073548437245, "grad_norm": 0.1529388129711151, "kl": 0.19898688793182373, "learning_rate": 3e-06, "loss": 0.006, "step": 2804 }, { "clip_ratio": 0.00021950175141682848, "epoch": 0.007787383605683541, "grad_norm": 0.1069430336356163, "kl": 0.19208506494760513, "learning_rate": 3e-06, "loss": 0.0067, "step": 2805 }, { "clip_ratio": 0.0008765487291384488, "epoch": 0.007790159856523356, "grad_norm": 0.15367946028709412, "kl": 0.1748751550912857, "learning_rate": 3e-06, "loss": 0.0037, "step": 2806 }, { "clip_ratio": 0.0007885206578066573, "epoch": 0.0077929361073631725, "grad_norm": 0.1313498616218567, "kl": 0.17698493599891663, "learning_rate": 3e-06, "loss": 0.0047, "step": 2807 }, { "clip_ratio": 0.0011222050379728898, "epoch": 0.007795712358202989, "grad_norm": 0.1558087021112442, "kl": 0.17559701949357986, "learning_rate": 3e-06, "loss": 0.0022, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 224.5416717529297, "epoch": 0.007798488609042804, "grad_norm": 0.10879174619913101, "kl": 0.16165052354335785, "learning_rate": 3e-06, "loss": 0.0063, "reward": 0.2291666865348816, "reward_std": 0.16722054407000542, "rewards/countdown_reward_func": 0.2291666716337204, "step": 2809, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00018194336735177785, "epoch": 0.00780126485988262, "grad_norm": 0.08522436022758484, "kl": 0.16320092976093292, "learning_rate": 3e-06, "loss": 0.0062, "step": 2810 }, { "clip_ratio": 0.0005358960625017062, "epoch": 0.007804041110722436, "grad_norm": 0.07876789569854736, "kl": 0.15845438838005066, "learning_rate": 3e-06, "loss": 0.0055, "step": 2811 }, { "clip_ratio": 0.0003692762111313641, "epoch": 0.007806817361562252, "grad_norm": 0.06741394847631454, "kl": 0.14483580738306046, "learning_rate": 3e-06, "loss": 0.0045, "step": 2812 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.0078095936124020675, "grad_norm": 0.08038198202848434, "kl": 0.13989199697971344, "learning_rate": 3e-06, "loss": 0.0045, "step": 2813 }, { "clip_ratio": 0.0004397176089696586, "epoch": 0.007812369863241884, "grad_norm": 0.08318090438842773, "kl": 0.14135022461414337, "learning_rate": 3e-06, "loss": 0.0044, "step": 2814 }, { "clip_ratio": 0.00027204701473237947, "epoch": 0.0078151461140817, "grad_norm": 0.10804148018360138, "kl": 0.1371602937579155, "learning_rate": 3e-06, "loss": 0.0033, "step": 2815 }, { "clip_ratio": 0.0015172622224781662, "epoch": 0.007817922364921515, "grad_norm": 0.08286291360855103, "kl": 0.13602018356323242, "learning_rate": 3e-06, "loss": 0.0043, "step": 2816 }, { "clip_ratio": 0.0007061177748255432, "epoch": 0.007820698615761332, "grad_norm": 0.0705978125333786, "kl": 0.13421594351530075, "learning_rate": 3e-06, "loss": 0.0029, "step": 2817 }, { "clip_ratio": 0.0016204343410208821, "epoch": 0.007823474866601147, "grad_norm": 0.06107841059565544, "kl": 0.1247733011841774, "learning_rate": 3e-06, "loss": 0.003, "step": 2818 }, { "clip_ratio": 0.0015763617120683193, "epoch": 0.007826251117440963, "grad_norm": 0.07388308644294739, "kl": 0.11976242437958717, "learning_rate": 3e-06, "loss": 0.0033, "step": 2819 }, { "clip_ratio": 0.0026265073101967573, "epoch": 0.00782902736828078, "grad_norm": 0.07933972030878067, "kl": 0.12262334674596786, "learning_rate": 3e-06, "loss": 0.0034, "step": 2820 }, { "clip_ratio": 0.00017668912187218666, "completion_length": 226.08333587646484, "epoch": 0.007831803619120595, "grad_norm": 0.07275235652923584, "kl": 0.1178230457007885, "learning_rate": 3e-06, "loss": 0.0063, "reward": 0.3604166805744171, "reward_std": 0.26794980466365814, "rewards/countdown_reward_func": 0.3604166805744171, "step": 2821, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00018867924518417567, "epoch": 0.00783457986996041, "grad_norm": 0.0907898098230362, "kl": 0.10899732261896133, "learning_rate": 3e-06, "loss": 0.0064, "step": 2822 }, { "clip_ratio": 8.765778329689056e-05, "epoch": 0.007837356120800226, "grad_norm": 0.07570625096559525, "kl": 0.11293921619653702, "learning_rate": 3e-06, "loss": 0.0063, "step": 2823 }, { "clip_ratio": 0.00017620009748497978, "epoch": 0.007840132371640043, "grad_norm": 0.0821792408823967, "kl": 0.1178252212703228, "learning_rate": 3e-06, "loss": 0.007, "step": 2824 }, { "clip_ratio": 0.0001815083815017715, "epoch": 0.007842908622479858, "grad_norm": 0.07792210578918457, "kl": 0.11202748119831085, "learning_rate": 3e-06, "loss": 0.0056, "step": 2825 }, { "clip_ratio": 0.0, "epoch": 0.007845684873319674, "grad_norm": 0.0826457291841507, "kl": 0.10975588858127594, "learning_rate": 3e-06, "loss": 0.0063, "step": 2826 }, { "clip_ratio": 0.00017668912187218666, "epoch": 0.00784846112415949, "grad_norm": 0.11345770210027695, "kl": 0.10947077348828316, "learning_rate": 3e-06, "loss": 0.0055, "step": 2827 }, { "clip_ratio": 0.00017054902855306864, "epoch": 0.007851237374999306, "grad_norm": 0.0927169919013977, "kl": 0.1009017825126648, "learning_rate": 3e-06, "loss": 0.0054, "step": 2828 }, { "clip_ratio": 0.0, "epoch": 0.007854013625839122, "grad_norm": 0.07348012179136276, "kl": 0.10423189401626587, "learning_rate": 3e-06, "loss": 0.0064, "step": 2829 }, { "clip_ratio": 0.00027807458536699414, "epoch": 0.007856789876678937, "grad_norm": 0.08235246688127518, "kl": 0.11006882041692734, "learning_rate": 3e-06, "loss": 0.0058, "step": 2830 }, { "clip_ratio": 0.00018904324679169804, "epoch": 0.007859566127518754, "grad_norm": 0.08280429244041443, "kl": 0.10450118780136108, "learning_rate": 3e-06, "loss": 0.0049, "step": 2831 }, { "clip_ratio": 0.00017054902855306864, "epoch": 0.00786234237835857, "grad_norm": 0.07959123700857162, "kl": 0.1026650108397007, "learning_rate": 3e-06, "loss": 0.0057, "step": 2832 }, { "clip_ratio": 0.00018656716565601528, "completion_length": 221.70833587646484, "epoch": 0.007865118629198385, "grad_norm": 0.19074809551239014, "kl": 0.09541347995400429, "learning_rate": 3e-06, "loss": 0.0058, "reward": 0.24583334475755692, "reward_std": 0.18119703978300095, "rewards/countdown_reward_func": 0.24583334475755692, "step": 2833, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0078678948800382, "grad_norm": 0.06446890532970428, "kl": 0.09570834413170815, "learning_rate": 3e-06, "loss": 0.0067, "step": 2834 }, { "clip_ratio": 0.0, "epoch": 0.007870671130878018, "grad_norm": 0.06792428344488144, "kl": 0.09835857525467873, "learning_rate": 3e-06, "loss": 0.0064, "step": 2835 }, { "clip_ratio": 0.0006586709641851485, "epoch": 0.007873447381717833, "grad_norm": 0.06752215325832367, "kl": 0.1026206836104393, "learning_rate": 3e-06, "loss": 0.0065, "step": 2836 }, { "clip_ratio": 0.00034531758865341544, "epoch": 0.007876223632557649, "grad_norm": 0.06314112991094589, "kl": 0.09647680073976517, "learning_rate": 3e-06, "loss": 0.0058, "step": 2837 }, { "clip_ratio": 0.00018601190822664648, "epoch": 0.007878999883397464, "grad_norm": 0.09703534841537476, "kl": 0.0933399461209774, "learning_rate": 3e-06, "loss": 0.0059, "step": 2838 }, { "clip_ratio": 0.0, "epoch": 0.007881776134237281, "grad_norm": 0.09708074480295181, "kl": 0.0925879254937172, "learning_rate": 3e-06, "loss": 0.0051, "step": 2839 }, { "clip_ratio": 0.0, "epoch": 0.007884552385077096, "grad_norm": 0.06116492301225662, "kl": 0.09267272800207138, "learning_rate": 3e-06, "loss": 0.0061, "step": 2840 }, { "clip_ratio": 0.000174820474057924, "epoch": 0.007887328635916912, "grad_norm": 0.0847509354352951, "kl": 0.09728147834539413, "learning_rate": 3e-06, "loss": 0.0054, "step": 2841 }, { "clip_ratio": 0.00045973988017067313, "epoch": 0.007890104886756729, "grad_norm": 0.05902128666639328, "kl": 0.10011890530586243, "learning_rate": 3e-06, "loss": 0.0057, "step": 2842 }, { "clip_ratio": 0.0006093604024499655, "epoch": 0.007892881137596544, "grad_norm": 0.06073309853672981, "kl": 0.09300164133310318, "learning_rate": 3e-06, "loss": 0.005, "step": 2843 }, { "clip_ratio": 0.0003962340415455401, "epoch": 0.00789565738843636, "grad_norm": 0.07167181372642517, "kl": 0.09146009758114815, "learning_rate": 3e-06, "loss": 0.0055, "step": 2844 }, { "clip_ratio": 9.177679748972878e-05, "completion_length": 233.87500762939453, "epoch": 0.007898433639276175, "grad_norm": 0.0647943839430809, "kl": 0.10076628625392914, "learning_rate": 3e-06, "loss": 0.0003, "reward": 0.3750000298023224, "reward_std": 0.23609445989131927, "rewards/countdown_reward_func": 0.3750000298023224, "step": 2845, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0005931506748311222, "epoch": 0.007901209890115992, "grad_norm": 0.07141556590795517, "kl": 0.0974540002644062, "learning_rate": 3e-06, "loss": -0.0003, "step": 2846 }, { "clip_ratio": 0.0, "epoch": 0.007903986140955808, "grad_norm": 0.06702892482280731, "kl": 0.10098153725266457, "learning_rate": 3e-06, "loss": 0.0001, "step": 2847 }, { "clip_ratio": 0.0, "epoch": 0.007906762391795623, "grad_norm": 0.07142177224159241, "kl": 0.09585936367511749, "learning_rate": 3e-06, "loss": -0.0008, "step": 2848 }, { "clip_ratio": 0.00010288065823260695, "epoch": 0.007909538642635439, "grad_norm": 0.08158602565526962, "kl": 0.10074026137590408, "learning_rate": 3e-06, "loss": -0.0003, "step": 2849 }, { "clip_ratio": 0.0, "epoch": 0.007912314893475256, "grad_norm": 0.061286475509405136, "kl": 0.0977989099919796, "learning_rate": 3e-06, "loss": -0.0006, "step": 2850 }, { "clip_ratio": 0.000481799004774075, "epoch": 0.007915091144315071, "grad_norm": 0.06675193458795547, "kl": 0.10005636513233185, "learning_rate": 3e-06, "loss": -0.0006, "step": 2851 }, { "clip_ratio": 0.0003470212686806917, "epoch": 0.007917867395154887, "grad_norm": 0.06907733529806137, "kl": 0.09792259708046913, "learning_rate": 3e-06, "loss": -0.001, "step": 2852 }, { "clip_ratio": 9.177679748972878e-05, "epoch": 0.007920643645994704, "grad_norm": 0.06476406753063202, "kl": 0.10099447146058083, "learning_rate": 3e-06, "loss": -0.0007, "step": 2853 }, { "clip_ratio": 0.00010288065823260695, "epoch": 0.007923419896834519, "grad_norm": 0.07475689798593521, "kl": 0.09664865583181381, "learning_rate": 3e-06, "loss": -0.0017, "step": 2854 }, { "clip_ratio": 0.0005236759025137872, "epoch": 0.007926196147674335, "grad_norm": 0.0729636549949646, "kl": 0.10056468844413757, "learning_rate": 3e-06, "loss": -0.0007, "step": 2855 }, { "clip_ratio": 0.0002057613164652139, "epoch": 0.00792897239851415, "grad_norm": 0.06851907819509506, "kl": 0.09853481873869896, "learning_rate": 3e-06, "loss": -0.0013, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 225.6875, "epoch": 0.007931748649353967, "grad_norm": 0.08417107909917831, "kl": 0.09658883139491081, "learning_rate": 3e-06, "loss": 0.0228, "reward": 0.37916669249534607, "reward_std": 0.345259428024292, "rewards/countdown_reward_func": 0.37916669249534607, "step": 2857, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.097525617107749e-05, "epoch": 0.007934524900193782, "grad_norm": 0.0906069204211235, "kl": 0.0999494418501854, "learning_rate": 3e-06, "loss": 0.0225, "step": 2858 }, { "clip_ratio": 0.00018565761274658144, "epoch": 0.007937301151033598, "grad_norm": 0.10299421846866608, "kl": 0.09859372675418854, "learning_rate": 3e-06, "loss": 0.0233, "step": 2859 }, { "clip_ratio": 9.527438669465482e-05, "epoch": 0.007940077401873413, "grad_norm": 0.10134247690439224, "kl": 0.10400687530636787, "learning_rate": 3e-06, "loss": 0.0238, "step": 2860 }, { "clip_ratio": 0.0, "epoch": 0.00794285365271323, "grad_norm": 0.0990184023976326, "kl": 0.09727415442466736, "learning_rate": 3e-06, "loss": 0.0222, "step": 2861 }, { "clip_ratio": 0.00025975992321036756, "epoch": 0.007945629903553046, "grad_norm": 0.11110273003578186, "kl": 0.09704288840293884, "learning_rate": 3e-06, "loss": 0.0231, "step": 2862 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.007948406154392861, "grad_norm": 0.09088550508022308, "kl": 0.09921449795365334, "learning_rate": 3e-06, "loss": 0.0216, "step": 2863 }, { "clip_ratio": 0.0001750715746311471, "epoch": 0.007951182405232678, "grad_norm": 0.09097853302955627, "kl": 0.10326225683093071, "learning_rate": 3e-06, "loss": 0.0224, "step": 2864 }, { "clip_ratio": 0.00031251084874384105, "epoch": 0.007953958656072494, "grad_norm": 0.10051525384187698, "kl": 0.1042257696390152, "learning_rate": 3e-06, "loss": 0.022, "step": 2865 }, { "clip_ratio": 0.00044858358160126954, "epoch": 0.00795673490691231, "grad_norm": 0.09236780554056168, "kl": 0.10993418842554092, "learning_rate": 3e-06, "loss": 0.0223, "step": 2866 }, { "clip_ratio": 0.0008895074424799532, "epoch": 0.007959511157752125, "grad_norm": 0.09969329088926315, "kl": 0.10290747508406639, "learning_rate": 3e-06, "loss": 0.0203, "step": 2867 }, { "clip_ratio": 0.00018142474436899647, "epoch": 0.007962287408591942, "grad_norm": 0.08537083119153976, "kl": 0.10400986671447754, "learning_rate": 3e-06, "loss": 0.0211, "step": 2868 }, { "clip_ratio": 0.00025035660655703396, "completion_length": 226.87500762939453, "epoch": 0.007965063659431757, "grad_norm": 0.06250222772359848, "kl": 0.1183239109814167, "learning_rate": 3e-06, "loss": 0.0096, "reward": 0.22708335518836975, "reward_std": 0.2217756062746048, "rewards/countdown_reward_func": 0.22708334028720856, "step": 2869, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0002849273369065486, "epoch": 0.007967839910271573, "grad_norm": 0.0647013857960701, "kl": 0.10483258217573166, "learning_rate": 3e-06, "loss": 0.0085, "step": 2870 }, { "clip_ratio": 0.00039553697570227087, "epoch": 0.007970616161111388, "grad_norm": 0.08133979141712189, "kl": 0.11905910074710846, "learning_rate": 3e-06, "loss": 0.0096, "step": 2871 }, { "clip_ratio": 0.000244140625, "epoch": 0.007973392411951205, "grad_norm": 0.06605112552642822, "kl": 0.11454110965132713, "learning_rate": 3e-06, "loss": 0.0093, "step": 2872 }, { "clip_ratio": 0.0, "epoch": 0.00797616866279102, "grad_norm": 0.0676497220993042, "kl": 0.11419738829135895, "learning_rate": 3e-06, "loss": 0.0089, "step": 2873 }, { "clip_ratio": 8.207485370803624e-05, "epoch": 0.007978944913630836, "grad_norm": 0.0761796236038208, "kl": 0.14306952059268951, "learning_rate": 3e-06, "loss": 0.0095, "step": 2874 }, { "clip_ratio": 0.0002586206828709692, "epoch": 0.007981721164470653, "grad_norm": 0.0685449168086052, "kl": 0.12428713589906693, "learning_rate": 3e-06, "loss": 0.009, "step": 2875 }, { "clip_ratio": 0.0, "epoch": 0.007984497415310468, "grad_norm": 0.06311395019292831, "kl": 0.11094345152378082, "learning_rate": 3e-06, "loss": 0.008, "step": 2876 }, { "clip_ratio": 0.0003959706373279914, "epoch": 0.007987273666150284, "grad_norm": 0.0760158896446228, "kl": 0.12179388105869293, "learning_rate": 3e-06, "loss": 0.0087, "step": 2877 }, { "clip_ratio": 0.00010434056457597762, "epoch": 0.0079900499169901, "grad_norm": 0.06978533416986465, "kl": 0.11936772614717484, "learning_rate": 3e-06, "loss": 0.0085, "step": 2878 }, { "clip_ratio": 8.98634098120965e-05, "epoch": 0.007992826167829916, "grad_norm": 0.07645125687122345, "kl": 0.11859909072518349, "learning_rate": 3e-06, "loss": 0.0089, "step": 2879 }, { "clip_ratio": 0.0, "epoch": 0.007995602418669732, "grad_norm": 0.07927416265010834, "kl": 0.14601439237594604, "learning_rate": 3e-06, "loss": 0.0087, "step": 2880 }, { "clip_ratio": 8.896797226043418e-05, "completion_length": 231.3541717529297, "epoch": 0.007998378669509547, "grad_norm": 0.08362464606761932, "kl": 0.1300552785396576, "learning_rate": 3e-06, "loss": 0.0122, "reward": 0.3437500298023224, "reward_std": 0.34344974160194397, "rewards/countdown_reward_func": 0.3437500298023224, "step": 2881, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.008001154920349363, "grad_norm": 0.0994076058268547, "kl": 0.12091857567429543, "learning_rate": 3e-06, "loss": 0.0123, "step": 2882 }, { "clip_ratio": 0.0, "epoch": 0.00800393117118918, "grad_norm": 0.08601740747690201, "kl": 0.1273326873779297, "learning_rate": 3e-06, "loss": 0.0125, "step": 2883 }, { "clip_ratio": 0.0, "epoch": 0.008006707422028995, "grad_norm": 0.10227788239717484, "kl": 0.11781539395451546, "learning_rate": 3e-06, "loss": 0.0119, "step": 2884 }, { "clip_ratio": 0.00017723909695632756, "epoch": 0.00800948367286881, "grad_norm": 0.09209822118282318, "kl": 0.11817209422588348, "learning_rate": 3e-06, "loss": 0.012, "step": 2885 }, { "clip_ratio": 0.0, "epoch": 0.008012259923708628, "grad_norm": 0.08977486938238144, "kl": 0.12534453719854355, "learning_rate": 3e-06, "loss": 0.0126, "step": 2886 }, { "clip_ratio": 0.00035437509359326214, "epoch": 0.008015036174548443, "grad_norm": 0.08289460092782974, "kl": 0.1300041750073433, "learning_rate": 3e-06, "loss": 0.0116, "step": 2887 }, { "clip_ratio": 0.0001893517910502851, "epoch": 0.008017812425388259, "grad_norm": 0.1072833463549614, "kl": 0.1224563904106617, "learning_rate": 3e-06, "loss": 0.0114, "step": 2888 }, { "clip_ratio": 0.00017723910423228517, "epoch": 0.008020588676228074, "grad_norm": 0.08867861330509186, "kl": 0.12750563025474548, "learning_rate": 3e-06, "loss": 0.0118, "step": 2889 }, { "clip_ratio": 9.585889347363263e-05, "epoch": 0.008023364927067891, "grad_norm": 0.09481937438249588, "kl": 0.11749697849154472, "learning_rate": 3e-06, "loss": 0.0113, "step": 2890 }, { "clip_ratio": 0.00036895690573146567, "epoch": 0.008026141177907706, "grad_norm": 0.07707834988832474, "kl": 0.11860840022563934, "learning_rate": 3e-06, "loss": 0.0104, "step": 2891 }, { "clip_ratio": 0.00039426982402801514, "epoch": 0.008028917428747522, "grad_norm": 0.08561480790376663, "kl": 0.12469931319355965, "learning_rate": 3e-06, "loss": 0.0109, "step": 2892 }, { "clip_ratio": 0.00011814745084848255, "completion_length": 226.625, "epoch": 0.008031693679587337, "grad_norm": 0.06807278096675873, "kl": 0.12008354067802429, "learning_rate": 3e-06, "loss": 0.017, "reward": 0.26875002682209015, "reward_std": 0.28773441165685654, "rewards/countdown_reward_func": 0.26875002682209015, "step": 2893, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0006051514646969736, "epoch": 0.008034469930427154, "grad_norm": 0.1275395005941391, "kl": 0.12513798847794533, "learning_rate": 3e-06, "loss": 0.0159, "step": 2894 }, { "clip_ratio": 8.394895849050954e-05, "epoch": 0.00803724618126697, "grad_norm": 0.10239741951227188, "kl": 0.12161910533905029, "learning_rate": 3e-06, "loss": 0.0162, "step": 2895 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.008040022432106785, "grad_norm": 0.0647832453250885, "kl": 0.12413807958364487, "learning_rate": 3e-06, "loss": 0.0161, "step": 2896 }, { "clip_ratio": 0.0, "epoch": 0.008042798682946602, "grad_norm": 0.08120051771402359, "kl": 0.12584573403000832, "learning_rate": 3e-06, "loss": 0.0166, "step": 2897 }, { "clip_ratio": 0.00026290465029887855, "epoch": 0.008045574933786418, "grad_norm": 0.07623409479856491, "kl": 0.12812711670994759, "learning_rate": 3e-06, "loss": 0.0159, "step": 2898 }, { "clip_ratio": 0.0, "epoch": 0.008048351184626233, "grad_norm": 0.11186090111732483, "kl": 0.12186853587627411, "learning_rate": 3e-06, "loss": 0.0162, "step": 2899 }, { "clip_ratio": 0.00027075811522081494, "epoch": 0.008051127435466049, "grad_norm": 0.10452325642108917, "kl": 0.12681635469198227, "learning_rate": 3e-06, "loss": 0.0151, "step": 2900 }, { "clip_ratio": 0.0003509107627905905, "epoch": 0.008053903686305866, "grad_norm": 0.09368684887886047, "kl": 0.12551752850413322, "learning_rate": 3e-06, "loss": 0.0155, "step": 2901 }, { "clip_ratio": 0.00038472846790682524, "epoch": 0.008056679937145681, "grad_norm": 0.06909853219985962, "kl": 0.12604644522070885, "learning_rate": 3e-06, "loss": 0.0159, "step": 2902 }, { "clip_ratio": 0.0, "epoch": 0.008059456187985497, "grad_norm": 0.07726036012172699, "kl": 0.128226388245821, "learning_rate": 3e-06, "loss": 0.0159, "step": 2903 }, { "clip_ratio": 0.000373729330021888, "epoch": 0.008062232438825312, "grad_norm": 0.0775313749909401, "kl": 0.13021286204457283, "learning_rate": 3e-06, "loss": 0.0154, "step": 2904 }, { "clip_ratio": 8.871540194377303e-05, "completion_length": 234.95833587646484, "epoch": 0.008065008689665129, "grad_norm": 0.050456538796424866, "kl": 0.12343299016356468, "learning_rate": 3e-06, "loss": 0.0198, "reward": 0.23125002533197403, "reward_std": 0.16211743280291557, "rewards/countdown_reward_func": 0.23125002533197403, "step": 2905, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0006441513833124191, "epoch": 0.008067784940504944, "grad_norm": 0.06088346615433693, "kl": 0.1258678138256073, "learning_rate": 3e-06, "loss": 0.02, "step": 2906 }, { "clip_ratio": 0.0, "epoch": 0.00807056119134476, "grad_norm": 0.05327542871236801, "kl": 0.11154912039637566, "learning_rate": 3e-06, "loss": 0.0191, "step": 2907 }, { "clip_ratio": 0.00044642857392318547, "epoch": 0.008073337442184577, "grad_norm": 0.052772704511880875, "kl": 0.11424395442008972, "learning_rate": 3e-06, "loss": 0.019, "step": 2908 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.008076113693024392, "grad_norm": 0.06258887052536011, "kl": 0.12520792335271835, "learning_rate": 3e-06, "loss": 0.0194, "step": 2909 }, { "clip_ratio": 0.0003544513165252283, "epoch": 0.008078889943864208, "grad_norm": 0.05277208611369133, "kl": 0.12279794365167618, "learning_rate": 3e-06, "loss": 0.0199, "step": 2910 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.008081666194704023, "grad_norm": 0.04740145057439804, "kl": 0.1260455995798111, "learning_rate": 3e-06, "loss": 0.0193, "step": 2911 }, { "clip_ratio": 0.0010958234197460115, "epoch": 0.00808444244554384, "grad_norm": 0.11674796789884567, "kl": 0.12843139842152596, "learning_rate": 3e-06, "loss": 0.019, "step": 2912 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.008087218696383656, "grad_norm": 0.051824696362018585, "kl": 0.11402711644768715, "learning_rate": 3e-06, "loss": 0.0181, "step": 2913 }, { "clip_ratio": 0.0003579796975827776, "epoch": 0.008089994947223471, "grad_norm": 0.05917217954993248, "kl": 0.11514993757009506, "learning_rate": 3e-06, "loss": 0.0186, "step": 2914 }, { "clip_ratio": 0.00027479883283376694, "epoch": 0.008092771198063287, "grad_norm": 0.0590234100818634, "kl": 0.1249060183763504, "learning_rate": 3e-06, "loss": 0.0184, "step": 2915 }, { "clip_ratio": 0.00036769236612599343, "epoch": 0.008095547448903104, "grad_norm": 0.06105006858706474, "kl": 0.12332272529602051, "learning_rate": 3e-06, "loss": 0.0191, "step": 2916 }, { "clip_ratio": 9.527438669465482e-05, "completion_length": 219.0416717529297, "epoch": 0.00809832369974292, "grad_norm": 0.1498878449201584, "kl": 0.12559155747294426, "learning_rate": 3e-06, "loss": 0.04, "reward": 0.37916669249534607, "reward_std": 0.3609502911567688, "rewards/countdown_reward_func": 0.3791666775941849, "step": 2917, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0006061493186280131, "epoch": 0.008101099950582735, "grad_norm": 0.12946128845214844, "kl": 0.13948071748018265, "learning_rate": 3e-06, "loss": 0.0402, "step": 2918 }, { "clip_ratio": 0.001031319028697908, "epoch": 0.008103876201422552, "grad_norm": 0.1394750028848648, "kl": 0.13896315544843674, "learning_rate": 3e-06, "loss": 0.0405, "step": 2919 }, { "clip_ratio": 0.00036930531496182084, "epoch": 0.008106652452262367, "grad_norm": 0.1233178898692131, "kl": 0.13294941559433937, "learning_rate": 3e-06, "loss": 0.0393, "step": 2920 }, { "clip_ratio": 0.0002002282053581439, "epoch": 0.008109428703102183, "grad_norm": 0.1306765377521515, "kl": 0.13720841705799103, "learning_rate": 3e-06, "loss": 0.0383, "step": 2921 }, { "clip_ratio": 0.00010229132749373093, "epoch": 0.008112204953941998, "grad_norm": 0.13917167484760284, "kl": 0.14394008368253708, "learning_rate": 3e-06, "loss": 0.0401, "step": 2922 }, { "clip_ratio": 0.0001230314956046641, "epoch": 0.008114981204781815, "grad_norm": 0.14495928585529327, "kl": 0.13566283136606216, "learning_rate": 3e-06, "loss": 0.037, "step": 2923 }, { "clip_ratio": 0.0009354346766485833, "epoch": 0.00811775745562163, "grad_norm": 0.12309663742780685, "kl": 0.15085419267416, "learning_rate": 3e-06, "loss": 0.0374, "step": 2924 }, { "clip_ratio": 0.001744092172884848, "epoch": 0.008120533706461446, "grad_norm": 0.2163287252187729, "kl": 0.15629679709672928, "learning_rate": 3e-06, "loss": 0.0366, "step": 2925 }, { "clip_ratio": 0.0012562716729007661, "epoch": 0.008123309957301261, "grad_norm": 0.1353260725736618, "kl": 0.15305960923433304, "learning_rate": 3e-06, "loss": 0.0365, "step": 2926 }, { "clip_ratio": 0.0012852277723141015, "epoch": 0.008126086208141078, "grad_norm": 0.11791636794805527, "kl": 0.15614692121744156, "learning_rate": 3e-06, "loss": 0.0339, "step": 2927 }, { "clip_ratio": 0.0014434565382543951, "epoch": 0.008128862458980894, "grad_norm": 0.11795477569103241, "kl": 0.16650176048278809, "learning_rate": 3e-06, "loss": 0.0359, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 208.3541717529297, "epoch": 0.00813163870982071, "grad_norm": 0.10702957212924957, "kl": 0.19728034734725952, "learning_rate": 3e-06, "loss": 0.0205, "reward": 0.3604166954755783, "reward_std": 0.350378192961216, "rewards/countdown_reward_func": 0.3604166880249977, "step": 2929, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.008134414960660526, "grad_norm": 0.12271902710199356, "kl": 0.1852600872516632, "learning_rate": 3e-06, "loss": 0.0196, "step": 2930 }, { "clip_ratio": 0.00010296540131093934, "epoch": 0.008137191211500342, "grad_norm": 0.2417660355567932, "kl": 0.19279366731643677, "learning_rate": 3e-06, "loss": 0.0198, "step": 2931 }, { "clip_ratio": 0.0001774700212990865, "epoch": 0.008139967462340157, "grad_norm": 0.08538821339607239, "kl": 0.1945384293794632, "learning_rate": 3e-06, "loss": 0.0199, "step": 2932 }, { "clip_ratio": 0.0, "epoch": 0.008142743713179973, "grad_norm": 0.09286801517009735, "kl": 0.20258444547653198, "learning_rate": 3e-06, "loss": 0.0197, "step": 2933 }, { "clip_ratio": 0.0, "epoch": 0.00814551996401979, "grad_norm": 0.1027495339512825, "kl": 0.21260683983564377, "learning_rate": 3e-06, "loss": 0.0198, "step": 2934 }, { "clip_ratio": 0.0001865774393081665, "epoch": 0.008148296214859605, "grad_norm": 0.09365507960319519, "kl": 0.22567472606897354, "learning_rate": 3e-06, "loss": 0.0191, "step": 2935 }, { "clip_ratio": 0.00010296540131093934, "epoch": 0.00815107246569942, "grad_norm": 0.13052593171596527, "kl": 0.21785826981067657, "learning_rate": 3e-06, "loss": 0.0174, "step": 2936 }, { "clip_ratio": 0.00020838537602685392, "epoch": 0.008153848716539236, "grad_norm": 0.11136723309755325, "kl": 0.2212338000535965, "learning_rate": 3e-06, "loss": 0.0179, "step": 2937 }, { "clip_ratio": 0.0003912675892934203, "epoch": 0.008156624967379053, "grad_norm": 0.12249906361103058, "kl": 0.2211797907948494, "learning_rate": 3e-06, "loss": 0.0185, "step": 2938 }, { "clip_ratio": 0.0005108644763822667, "epoch": 0.008159401218218868, "grad_norm": 0.08566109836101532, "kl": 0.22614888846874237, "learning_rate": 3e-06, "loss": 0.0184, "step": 2939 }, { "clip_ratio": 0.00010179152741329744, "epoch": 0.008162177469058684, "grad_norm": 0.08335477113723755, "kl": 0.2380538210272789, "learning_rate": 3e-06, "loss": 0.0188, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 204.7916717529297, "epoch": 0.008164953719898501, "grad_norm": 0.12303364276885986, "kl": 0.23863644152879715, "learning_rate": 3e-06, "loss": -0.0007, "reward": 0.322916679084301, "reward_std": 0.33256906270980835, "rewards/countdown_reward_func": 0.3229166641831398, "step": 2941, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00020441536616999656, "epoch": 0.008167729970738316, "grad_norm": 0.12560997903347015, "kl": 0.2275976613163948, "learning_rate": 3e-06, "loss": -0.0011, "step": 2942 }, { "clip_ratio": 0.0006918944054632448, "epoch": 0.008170506221578132, "grad_norm": 0.15725287795066833, "kl": 0.242391936480999, "learning_rate": 3e-06, "loss": -0.0003, "step": 2943 }, { "clip_ratio": 0.0, "epoch": 0.008173282472417947, "grad_norm": 0.13426519930362701, "kl": 0.22410206496715546, "learning_rate": 3e-06, "loss": -0.0015, "step": 2944 }, { "clip_ratio": 9.117432637140155e-05, "epoch": 0.008176058723257764, "grad_norm": 0.1572161614894867, "kl": 0.2253274992108345, "learning_rate": 3e-06, "loss": -0.0012, "step": 2945 }, { "clip_ratio": 0.00033357564825564623, "epoch": 0.00817883497409758, "grad_norm": 0.12129116803407669, "kl": 0.22914253175258636, "learning_rate": 3e-06, "loss": -0.002, "step": 2946 }, { "clip_ratio": 0.000341290608048439, "epoch": 0.008181611224937395, "grad_norm": 0.11379671096801758, "kl": 0.23221316188573837, "learning_rate": 3e-06, "loss": -0.0018, "step": 2947 }, { "clip_ratio": 0.0009241988882422447, "epoch": 0.00818438747577721, "grad_norm": 0.14020630717277527, "kl": 0.2144078090786934, "learning_rate": 3e-06, "loss": -0.0032, "step": 2948 }, { "clip_ratio": 0.00039001560071483254, "epoch": 0.008187163726617028, "grad_norm": 0.1497488021850586, "kl": 0.2228521853685379, "learning_rate": 3e-06, "loss": -0.003, "step": 2949 }, { "clip_ratio": 9.67492233030498e-05, "epoch": 0.008189939977456843, "grad_norm": 0.12886874377727509, "kl": 0.20249485969543457, "learning_rate": 3e-06, "loss": -0.0045, "step": 2950 }, { "clip_ratio": 0.00010220768308499828, "epoch": 0.008192716228296659, "grad_norm": 0.13839411735534668, "kl": 0.20022351294755936, "learning_rate": 3e-06, "loss": -0.0049, "step": 2951 }, { "clip_ratio": 0.0009093996486626565, "epoch": 0.008195492479136476, "grad_norm": 0.1604899913072586, "kl": 0.2021883800625801, "learning_rate": 3e-06, "loss": -0.0045, "step": 2952 }, { "clip_ratio": 0.00017998559633269906, "completion_length": 232.0625, "epoch": 0.008198268729976291, "grad_norm": 0.1349242776632309, "kl": 0.19092954695224762, "learning_rate": 3e-06, "loss": 0.0467, "reward": 0.4187500476837158, "reward_std": 0.3929017186164856, "rewards/countdown_reward_func": 0.41875001788139343, "step": 2953, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.008201044980816106, "grad_norm": 0.14125162363052368, "kl": 0.18935513496398926, "learning_rate": 3e-06, "loss": 0.0487, "step": 2954 }, { "clip_ratio": 0.0, "epoch": 0.008203821231655922, "grad_norm": 0.13364918529987335, "kl": 0.1809384673833847, "learning_rate": 3e-06, "loss": 0.0484, "step": 2955 }, { "clip_ratio": 0.0, "epoch": 0.008206597482495739, "grad_norm": 0.14803123474121094, "kl": 0.1891830489039421, "learning_rate": 3e-06, "loss": 0.0478, "step": 2956 }, { "clip_ratio": 8.338892803294584e-05, "epoch": 0.008209373733335554, "grad_norm": 0.13373561203479767, "kl": 0.19126961380243301, "learning_rate": 3e-06, "loss": 0.0482, "step": 2957 }, { "clip_ratio": 0.0, "epoch": 0.00821214998417537, "grad_norm": 0.1259787380695343, "kl": 0.19757063686847687, "learning_rate": 3e-06, "loss": 0.0486, "step": 2958 }, { "clip_ratio": 0.0002699783944990486, "epoch": 0.008214926235015185, "grad_norm": 0.14620471000671387, "kl": 0.19020027667284012, "learning_rate": 3e-06, "loss": 0.0464, "step": 2959 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.008217702485855002, "grad_norm": 0.1397581696510315, "kl": 0.1945667639374733, "learning_rate": 3e-06, "loss": 0.048, "step": 2960 }, { "clip_ratio": 8.338892803294584e-05, "epoch": 0.008220478736694818, "grad_norm": 0.15621036291122437, "kl": 0.19359785318374634, "learning_rate": 3e-06, "loss": 0.0475, "step": 2961 }, { "clip_ratio": 0.0002744326484389603, "epoch": 0.008223254987534633, "grad_norm": 0.15102514624595642, "kl": 0.20723125338554382, "learning_rate": 3e-06, "loss": 0.046, "step": 2962 }, { "clip_ratio": 0.0, "epoch": 0.00822603123837445, "grad_norm": 0.13393425941467285, "kl": 0.2166873812675476, "learning_rate": 3e-06, "loss": 0.0452, "step": 2963 }, { "clip_ratio": 0.0, "epoch": 0.008228807489214266, "grad_norm": 0.12785588204860687, "kl": 0.22511591017246246, "learning_rate": 3e-06, "loss": 0.0463, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 198.70834350585938, "epoch": 0.008231583740054081, "grad_norm": 0.1040295884013176, "kl": 0.20944831520318985, "learning_rate": 3e-06, "loss": 0.0281, "reward": 0.24791668355464935, "reward_std": 0.1760939434170723, "rewards/countdown_reward_func": 0.24791668355464935, "step": 2965, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00030983770557213575, "epoch": 0.008234359990893897, "grad_norm": 0.09268911182880402, "kl": 0.20641998201608658, "learning_rate": 3e-06, "loss": 0.0276, "step": 2966 }, { "clip_ratio": 0.0, "epoch": 0.008237136241733714, "grad_norm": 0.08695083111524582, "kl": 0.20673486590385437, "learning_rate": 3e-06, "loss": 0.0272, "step": 2967 }, { "clip_ratio": 0.0, "epoch": 0.008239912492573529, "grad_norm": 0.2699783146381378, "kl": 0.2257823646068573, "learning_rate": 3e-06, "loss": 0.0284, "step": 2968 }, { "clip_ratio": 0.0003003887250088155, "epoch": 0.008242688743413345, "grad_norm": 0.10769811272621155, "kl": 0.22646521776914597, "learning_rate": 3e-06, "loss": 0.0274, "step": 2969 }, { "clip_ratio": 0.0004038741026306525, "epoch": 0.00824546499425316, "grad_norm": 0.0701657086610794, "kl": 0.23094762861728668, "learning_rate": 3e-06, "loss": 0.027, "step": 2970 }, { "clip_ratio": 0.00019700179836945608, "epoch": 0.008248241245092977, "grad_norm": 0.06987138837575912, "kl": 0.2354016751050949, "learning_rate": 3e-06, "loss": 0.0278, "step": 2971 }, { "clip_ratio": 0.00020593080262187868, "epoch": 0.008251017495932792, "grad_norm": 0.07976119220256805, "kl": 0.22992369532585144, "learning_rate": 3e-06, "loss": 0.0272, "step": 2972 }, { "clip_ratio": 0.00011394712782930583, "epoch": 0.008253793746772608, "grad_norm": 0.0750546008348465, "kl": 0.2314920872449875, "learning_rate": 3e-06, "loss": 0.0265, "step": 2973 }, { "clip_ratio": 0.00011394712782930583, "epoch": 0.008256569997612425, "grad_norm": 0.09447232633829117, "kl": 0.24742433428764343, "learning_rate": 3e-06, "loss": 0.0272, "step": 2974 }, { "clip_ratio": 0.0008895580103853717, "epoch": 0.00825934624845224, "grad_norm": 0.09518829733133316, "kl": 0.24733898043632507, "learning_rate": 3e-06, "loss": 0.0266, "step": 2975 }, { "clip_ratio": 0.0003161244676448405, "epoch": 0.008262122499292056, "grad_norm": 0.07405757158994675, "kl": 0.25000642240047455, "learning_rate": 3e-06, "loss": 0.027, "step": 2976 }, { "clip_ratio": 0.000281903070572298, "completion_length": 209.31250762939453, "epoch": 0.008264898750131871, "grad_norm": 0.09525343030691147, "kl": 0.27141405642032623, "learning_rate": 3e-06, "loss": 0.0202, "reward": 0.2854166850447655, "reward_std": 0.26794979721307755, "rewards/countdown_reward_func": 0.2854166850447655, "step": 2977, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0003736920771189034, "epoch": 0.008267675000971688, "grad_norm": 0.0926135703921318, "kl": 0.2705317437648773, "learning_rate": 3e-06, "loss": 0.0203, "step": 2978 }, { "clip_ratio": 0.00012054001854266971, "epoch": 0.008270451251811504, "grad_norm": 0.16147008538246155, "kl": 0.26931579411029816, "learning_rate": 3e-06, "loss": 0.0194, "step": 2979 }, { "clip_ratio": 0.0, "epoch": 0.00827322750265132, "grad_norm": 0.13489627838134766, "kl": 0.2725987881422043, "learning_rate": 3e-06, "loss": 0.0204, "step": 2980 }, { "clip_ratio": 0.0001983628171728924, "epoch": 0.008276003753491135, "grad_norm": 0.11945942044258118, "kl": 0.2618674486875534, "learning_rate": 3e-06, "loss": 0.0203, "step": 2981 }, { "clip_ratio": 0.0003802281280513853, "epoch": 0.008278780004330952, "grad_norm": 0.1246536523103714, "kl": 0.28595584630966187, "learning_rate": 3e-06, "loss": 0.0195, "step": 2982 }, { "clip_ratio": 9.505703201284632e-05, "epoch": 0.008281556255170767, "grad_norm": 0.1368720978498459, "kl": 0.2733803018927574, "learning_rate": 3e-06, "loss": 0.0193, "step": 2983 }, { "clip_ratio": 0.0, "epoch": 0.008284332506010583, "grad_norm": 0.09481362998485565, "kl": 0.26490673422813416, "learning_rate": 3e-06, "loss": 0.02, "step": 2984 }, { "clip_ratio": 0.00021974636911181733, "epoch": 0.0082871087568504, "grad_norm": 0.09505084902048111, "kl": 0.26156656444072723, "learning_rate": 3e-06, "loss": 0.0178, "step": 2985 }, { "clip_ratio": 0.0, "epoch": 0.008289885007690215, "grad_norm": 0.1336756944656372, "kl": 0.25921260565519333, "learning_rate": 3e-06, "loss": 0.0189, "step": 2986 }, { "clip_ratio": 0.00020661157032009214, "epoch": 0.00829266125853003, "grad_norm": 0.14265599846839905, "kl": 0.24782179296016693, "learning_rate": 3e-06, "loss": 0.0187, "step": 2987 }, { "clip_ratio": 9.505703201284632e-05, "epoch": 0.008295437509369846, "grad_norm": 0.09963567554950714, "kl": 0.26754000782966614, "learning_rate": 3e-06, "loss": 0.018, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 210.77084350585938, "epoch": 0.008298213760209663, "grad_norm": 0.13728204369544983, "kl": 0.260358989238739, "learning_rate": 3e-06, "loss": 0.041, "reward": 0.3229166716337204, "reward_std": 0.29996681958436966, "rewards/countdown_reward_func": 0.3229166716337204, "step": 2989, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0015565603971481323, "epoch": 0.008300990011049478, "grad_norm": 0.1172989085316658, "kl": 0.24400679022073746, "learning_rate": 3e-06, "loss": 0.0407, "step": 2990 }, { "clip_ratio": 9.07111752894707e-05, "epoch": 0.008303766261889294, "grad_norm": 0.1250603049993515, "kl": 0.243415929377079, "learning_rate": 3e-06, "loss": 0.0403, "step": 2991 }, { "clip_ratio": 0.00020188145572319627, "epoch": 0.00830654251272911, "grad_norm": 0.12841704487800598, "kl": 0.2400432676076889, "learning_rate": 3e-06, "loss": 0.0419, "step": 2992 }, { "clip_ratio": 0.00018642803479451686, "epoch": 0.008309318763568926, "grad_norm": 0.11256518214941025, "kl": 0.2428617998957634, "learning_rate": 3e-06, "loss": 0.0411, "step": 2993 }, { "clip_ratio": 0.0004469587147468701, "epoch": 0.008312095014408742, "grad_norm": 0.08892057836055756, "kl": 0.25952766835689545, "learning_rate": 3e-06, "loss": 0.0419, "step": 2994 }, { "clip_ratio": 0.00024485797621309757, "epoch": 0.008314871265248557, "grad_norm": 0.12862318754196167, "kl": 0.26006074249744415, "learning_rate": 3e-06, "loss": 0.0395, "step": 2995 }, { "clip_ratio": 0.0009444155148230493, "epoch": 0.008317647516088374, "grad_norm": 0.11506360024213791, "kl": 0.24812127649784088, "learning_rate": 3e-06, "loss": 0.0392, "step": 2996 }, { "clip_ratio": 0.00031964890513336286, "epoch": 0.00832042376692819, "grad_norm": 0.11527801305055618, "kl": 0.25324472039937973, "learning_rate": 3e-06, "loss": 0.0392, "step": 2997 }, { "clip_ratio": 0.0, "epoch": 0.008323200017768005, "grad_norm": 0.116396963596344, "kl": 0.2548160031437874, "learning_rate": 3e-06, "loss": 0.039, "step": 2998 }, { "clip_ratio": 0.000394110924389679, "epoch": 0.00832597626860782, "grad_norm": 0.11536931991577148, "kl": 0.25775010883808136, "learning_rate": 3e-06, "loss": 0.039, "step": 2999 }, { "epoch": 0.008328752519447638, "grad_norm": 0.08782674372196198, "learning_rate": 3e-06, "loss": 0.0401, "step": 3000 }, { "clip_ratio": 0.00025013775302795693, "completion_length": 199.87500762939453, "epoch": 0.008331528770287453, "grad_norm": 0.12537510693073273, "kl": 0.2633812315762043, "learning_rate": 3e-06, "loss": -0.0005, "reward": 0.40000003576278687, "reward_std": 0.34935320913791656, "rewards/countdown_reward_func": 0.4000000059604645, "step": 3001, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00025826445198617876, "epoch": 0.008334305021127269, "grad_norm": 0.13606682419776917, "kl": 0.24667251855134964, "learning_rate": 3e-06, "loss": -0.0009, "step": 3002 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.008337081271967084, "grad_norm": 0.12609559297561646, "kl": 0.24887027591466904, "learning_rate": 3e-06, "loss": 0.0006, "step": 3003 }, { "clip_ratio": 0.0, "epoch": 0.008339857522806901, "grad_norm": 0.1376654952764511, "kl": 0.25823332369327545, "learning_rate": 3e-06, "loss": -0.0006, "step": 3004 }, { "clip_ratio": 0.0, "epoch": 0.008342633773646716, "grad_norm": 0.13998550176620483, "kl": 0.2443101778626442, "learning_rate": 3e-06, "loss": 0.0002, "step": 3005 }, { "clip_ratio": 0.0, "epoch": 0.008345410024486532, "grad_norm": 0.13827265799045563, "kl": 0.24013708531856537, "learning_rate": 3e-06, "loss": -0.0007, "step": 3006 }, { "clip_ratio": 0.00011425960110500455, "epoch": 0.008348186275326349, "grad_norm": 0.1322464942932129, "kl": 0.23806992173194885, "learning_rate": 3e-06, "loss": -0.0012, "step": 3007 }, { "clip_ratio": 0.0, "epoch": 0.008350962526166164, "grad_norm": 0.15478762984275818, "kl": 0.22945895791053772, "learning_rate": 3e-06, "loss": -0.0027, "step": 3008 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.00835373877700598, "grad_norm": 0.11452729254961014, "kl": 0.22601287066936493, "learning_rate": 3e-06, "loss": -0.002, "step": 3009 }, { "clip_ratio": 0.00018180024926550686, "epoch": 0.008356515027845795, "grad_norm": 0.12319207191467285, "kl": 0.23140033334493637, "learning_rate": 3e-06, "loss": -0.0026, "step": 3010 }, { "clip_ratio": 0.0, "epoch": 0.008359291278685612, "grad_norm": 0.1453821361064911, "kl": 0.21502837538719177, "learning_rate": 3e-06, "loss": -0.0033, "step": 3011 }, { "clip_ratio": 8.6088155512698e-05, "epoch": 0.008362067529525428, "grad_norm": 0.12773115932941437, "kl": 0.2083139270544052, "learning_rate": 3e-06, "loss": -0.0035, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 204.56250762939453, "epoch": 0.008364843780365243, "grad_norm": 0.1017741858959198, "kl": 0.20100193470716476, "learning_rate": 3e-06, "loss": 0.0043, "reward": 0.4166666865348816, "reward_std": 0.3485528528690338, "rewards/countdown_reward_func": 0.4166666865348816, "step": 3013, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0005079824477434158, "epoch": 0.008367620031205059, "grad_norm": 0.1420072764158249, "kl": 0.2037753239274025, "learning_rate": 3e-06, "loss": 0.0042, "step": 3014 }, { "clip_ratio": 0.0, "epoch": 0.008370396282044876, "grad_norm": 0.10676582157611847, "kl": 0.2035331055521965, "learning_rate": 3e-06, "loss": 0.0045, "step": 3015 }, { "clip_ratio": 8.922198321670294e-05, "epoch": 0.008373172532884691, "grad_norm": 0.14463919401168823, "kl": 0.19243095070123672, "learning_rate": 3e-06, "loss": 0.004, "step": 3016 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.008375948783724507, "grad_norm": 0.10084468871355057, "kl": 0.1860548034310341, "learning_rate": 3e-06, "loss": 0.0041, "step": 3017 }, { "clip_ratio": 0.00022831049864180386, "epoch": 0.008378725034564324, "grad_norm": 0.1313498318195343, "kl": 0.18554911762475967, "learning_rate": 3e-06, "loss": 0.0038, "step": 3018 }, { "clip_ratio": 0.0, "epoch": 0.008381501285404139, "grad_norm": 0.1002134457230568, "kl": 0.17101778835058212, "learning_rate": 3e-06, "loss": 0.0026, "step": 3019 }, { "clip_ratio": 0.0012469383655115962, "epoch": 0.008384277536243954, "grad_norm": 0.12964390218257904, "kl": 0.17422015964984894, "learning_rate": 3e-06, "loss": 0.001, "step": 3020 }, { "clip_ratio": 0.0, "epoch": 0.00838705378708377, "grad_norm": 0.10013823211193085, "kl": 0.1747244894504547, "learning_rate": 3e-06, "loss": 0.0029, "step": 3021 }, { "clip_ratio": 0.0, "epoch": 0.008389830037923587, "grad_norm": 0.1409793198108673, "kl": 0.16584274917840958, "learning_rate": 3e-06, "loss": 0.0013, "step": 3022 }, { "clip_ratio": 0.0003175324818585068, "epoch": 0.008392606288763402, "grad_norm": 0.12917909026145935, "kl": 0.16229597479104996, "learning_rate": 3e-06, "loss": 0.0029, "step": 3023 }, { "clip_ratio": 0.0005150834622327238, "epoch": 0.008395382539603218, "grad_norm": 0.12114616483449936, "kl": 0.16238369047641754, "learning_rate": 3e-06, "loss": 0.0017, "step": 3024 }, { "clip_ratio": 0.00020101070549571887, "completion_length": 215.00000762939453, "epoch": 0.008398158790443033, "grad_norm": 0.09409298002719879, "kl": 0.1476946622133255, "learning_rate": 3e-06, "loss": 0.0302, "reward": 0.34166668355464935, "reward_std": 0.31335416436195374, "rewards/countdown_reward_func": 0.34166668355464935, "step": 3025, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0003320180549053475, "epoch": 0.00840093504128285, "grad_norm": 0.09882736206054688, "kl": 0.15368925780057907, "learning_rate": 3e-06, "loss": 0.0301, "step": 3026 }, { "clip_ratio": 0.0001945525291375816, "epoch": 0.008403711292122666, "grad_norm": 0.10567183792591095, "kl": 0.14300309866666794, "learning_rate": 3e-06, "loss": 0.0302, "step": 3027 }, { "clip_ratio": 0.00046948358067311347, "epoch": 0.008406487542962481, "grad_norm": 0.1465691328048706, "kl": 0.14580754935741425, "learning_rate": 3e-06, "loss": 0.0308, "step": 3028 }, { "clip_ratio": 0.0, "epoch": 0.008409263793802298, "grad_norm": 0.12747783958911896, "kl": 0.1436186358332634, "learning_rate": 3e-06, "loss": 0.0301, "step": 3029 }, { "clip_ratio": 0.0, "epoch": 0.008412040044642114, "grad_norm": 0.09885192662477493, "kl": 0.13538425415754318, "learning_rate": 3e-06, "loss": 0.0292, "step": 3030 }, { "clip_ratio": 9.697439963929355e-05, "epoch": 0.00841481629548193, "grad_norm": 0.09384225308895111, "kl": 0.14452335238456726, "learning_rate": 3e-06, "loss": 0.0294, "step": 3031 }, { "clip_ratio": 9.72762645687908e-05, "epoch": 0.008417592546321745, "grad_norm": 0.09388621151447296, "kl": 0.1505364328622818, "learning_rate": 3e-06, "loss": 0.0301, "step": 3032 }, { "clip_ratio": 0.0007261410937644541, "epoch": 0.008420368797161562, "grad_norm": 0.10633539408445358, "kl": 0.14311276376247406, "learning_rate": 3e-06, "loss": 0.0295, "step": 3033 }, { "clip_ratio": 0.0001783546103979461, "epoch": 0.008423145048001377, "grad_norm": 0.13338486850261688, "kl": 0.14947623014450073, "learning_rate": 3e-06, "loss": 0.0291, "step": 3034 }, { "clip_ratio": 0.00010860121983569115, "epoch": 0.008425921298841193, "grad_norm": 0.11355477571487427, "kl": 0.1482955366373062, "learning_rate": 3e-06, "loss": 0.0294, "step": 3035 }, { "clip_ratio": 0.0, "epoch": 0.008428697549681008, "grad_norm": 0.09776423871517181, "kl": 0.14452050626277924, "learning_rate": 3e-06, "loss": 0.0278, "step": 3036 }, { "clip_ratio": 9.67492233030498e-05, "completion_length": 219.33333587646484, "epoch": 0.008431473800520825, "grad_norm": 0.11770713329315186, "kl": 0.1683894470334053, "learning_rate": 3e-06, "loss": 0.0253, "reward": 0.3062500059604645, "reward_std": 0.31206805258989334, "rewards/countdown_reward_func": 0.3062499910593033, "step": 3037, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.67492233030498e-05, "epoch": 0.00843425005136064, "grad_norm": 0.11796853691339493, "kl": 0.16781019419431686, "learning_rate": 3e-06, "loss": 0.026, "step": 3038 }, { "clip_ratio": 0.0, "epoch": 0.008437026302200456, "grad_norm": 0.09436098486185074, "kl": 0.16520189493894577, "learning_rate": 3e-06, "loss": 0.025, "step": 3039 }, { "clip_ratio": 0.00018779442325467244, "epoch": 0.008439802553040273, "grad_norm": 0.0881367176771164, "kl": 0.17334932833909988, "learning_rate": 3e-06, "loss": 0.0252, "step": 3040 }, { "clip_ratio": 0.00011071744665969163, "epoch": 0.008442578803880088, "grad_norm": 0.11913830786943436, "kl": 0.175415500998497, "learning_rate": 3e-06, "loss": 0.0252, "step": 3041 }, { "clip_ratio": 0.000304215878713876, "epoch": 0.008445355054719904, "grad_norm": 0.08973114937543869, "kl": 0.16943339258432388, "learning_rate": 3e-06, "loss": 0.0249, "step": 3042 }, { "clip_ratio": 8.620689914096147e-05, "epoch": 0.00844813130555972, "grad_norm": 0.11736555397510529, "kl": 0.18179106712341309, "learning_rate": 3e-06, "loss": 0.025, "step": 3043 }, { "clip_ratio": 0.00028669723542407155, "epoch": 0.008450907556399536, "grad_norm": 0.10757984966039658, "kl": 0.18236199021339417, "learning_rate": 3e-06, "loss": 0.0242, "step": 3044 }, { "clip_ratio": 0.0, "epoch": 0.008453683807239352, "grad_norm": 0.08872993290424347, "kl": 0.18164537847042084, "learning_rate": 3e-06, "loss": 0.0237, "step": 3045 }, { "clip_ratio": 0.0002829962540999986, "epoch": 0.008456460058079167, "grad_norm": 0.08171938359737396, "kl": 0.1926388442516327, "learning_rate": 3e-06, "loss": 0.0243, "step": 3046 }, { "clip_ratio": 9.279881487600505e-05, "epoch": 0.008459236308918983, "grad_norm": 0.11810438334941864, "kl": 0.19261698424816132, "learning_rate": 3e-06, "loss": 0.0242, "step": 3047 }, { "clip_ratio": 0.0006447518680943176, "epoch": 0.0084620125597588, "grad_norm": 0.08821414411067963, "kl": 0.18508438766002655, "learning_rate": 3e-06, "loss": 0.0237, "step": 3048 }, { "clip_ratio": 9.842519648373127e-05, "completion_length": 209.58333587646484, "epoch": 0.008464788810598615, "grad_norm": 0.139348566532135, "kl": 0.21522551029920578, "learning_rate": 3e-06, "loss": 0.056, "reward": 0.302083358168602, "reward_std": 0.3184572756290436, "rewards/countdown_reward_func": 0.302083358168602, "step": 3049, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00011394712782930583, "epoch": 0.00846756506143843, "grad_norm": 0.12385433167219162, "kl": 0.21202124655246735, "learning_rate": 3e-06, "loss": 0.0544, "step": 3050 }, { "clip_ratio": 0.0, "epoch": 0.008470341312278248, "grad_norm": 0.1403563916683197, "kl": 0.2493307888507843, "learning_rate": 3e-06, "loss": 0.0556, "step": 3051 }, { "clip_ratio": 0.0003250418812967837, "epoch": 0.008473117563118063, "grad_norm": 0.13950026035308838, "kl": 0.239150770008564, "learning_rate": 3e-06, "loss": 0.0557, "step": 3052 }, { "clip_ratio": 9.204712841892615e-05, "epoch": 0.008475893813957878, "grad_norm": 0.12193503230810165, "kl": 0.24832145869731903, "learning_rate": 3e-06, "loss": 0.0543, "step": 3053 }, { "clip_ratio": 0.0, "epoch": 0.008478670064797694, "grad_norm": 0.12193918228149414, "kl": 0.24702062457799911, "learning_rate": 3e-06, "loss": 0.0532, "step": 3054 }, { "clip_ratio": 9.842519648373127e-05, "epoch": 0.008481446315637511, "grad_norm": 0.12223386019468307, "kl": 0.25641151517629623, "learning_rate": 3e-06, "loss": 0.0528, "step": 3055 }, { "clip_ratio": 0.0, "epoch": 0.008484222566477326, "grad_norm": 0.1188080683350563, "kl": 0.2540518641471863, "learning_rate": 3e-06, "loss": 0.0512, "step": 3056 }, { "clip_ratio": 0.00019532733858795837, "epoch": 0.008486998817317142, "grad_norm": 0.11689227819442749, "kl": 0.3023042678833008, "learning_rate": 3e-06, "loss": 0.0524, "step": 3057 }, { "clip_ratio": 0.0005254697171039879, "epoch": 0.008489775068156957, "grad_norm": 0.24439960718154907, "kl": 0.2954285740852356, "learning_rate": 3e-06, "loss": 0.0524, "step": 3058 }, { "clip_ratio": 0.0004721923905890435, "epoch": 0.008492551318996774, "grad_norm": 0.10251002013683319, "kl": 0.30764856934547424, "learning_rate": 3e-06, "loss": 0.0506, "step": 3059 }, { "clip_ratio": 0.0014023402472957969, "epoch": 0.00849532756983659, "grad_norm": 0.10669434070587158, "kl": 0.3052254766225815, "learning_rate": 3e-06, "loss": 0.0497, "step": 3060 }, { "clip_ratio": 0.00018189493857789785, "completion_length": 202.9166717529297, "epoch": 0.008498103820676405, "grad_norm": 0.1046997606754303, "kl": 0.2894783765077591, "learning_rate": 3e-06, "loss": 0.0207, "reward": 0.2812499925494194, "reward_std": 0.3629014790058136, "rewards/countdown_reward_func": 0.2812499925494194, "step": 3061, "zero_std_ratio": 0.125 }, { "clip_ratio": 8.212877582991496e-05, "epoch": 0.008500880071516222, "grad_norm": 0.19258429110050201, "kl": 0.30791378021240234, "learning_rate": 3e-06, "loss": 0.0209, "step": 3062 }, { "clip_ratio": 0.0, "epoch": 0.008503656322356038, "grad_norm": 0.12044266611337662, "kl": 0.2983648478984833, "learning_rate": 3e-06, "loss": 0.0209, "step": 3063 }, { "clip_ratio": 9.164222865365446e-05, "epoch": 0.008506432573195853, "grad_norm": 0.09658069908618927, "kl": 0.30851851403713226, "learning_rate": 3e-06, "loss": 0.0207, "step": 3064 }, { "clip_ratio": 0.00021335596102289855, "epoch": 0.008509208824035669, "grad_norm": 0.3173162639141083, "kl": 0.3143046051263809, "learning_rate": 3e-06, "loss": 0.0218, "step": 3065 }, { "clip_ratio": 0.00022387076751329005, "epoch": 0.008511985074875486, "grad_norm": 0.12057913094758987, "kl": 0.3165849447250366, "learning_rate": 3e-06, "loss": 0.0214, "step": 3066 }, { "clip_ratio": 0.0003624003438744694, "epoch": 0.008514761325715301, "grad_norm": 0.10035211592912674, "kl": 0.3217386305332184, "learning_rate": 3e-06, "loss": 0.0204, "step": 3067 }, { "clip_ratio": 0.00018189493857789785, "epoch": 0.008517537576555117, "grad_norm": 0.18664516508579254, "kl": 0.3287786394357681, "learning_rate": 3e-06, "loss": 0.0214, "step": 3068 }, { "clip_ratio": 0.00022387076751329005, "epoch": 0.008520313827394932, "grad_norm": 0.12538620829582214, "kl": 0.3129211813211441, "learning_rate": 3e-06, "loss": 0.0201, "step": 3069 }, { "clip_ratio": 0.00011961722339037806, "epoch": 0.008523090078234749, "grad_norm": 0.12438543140888214, "kl": 0.3173307925462723, "learning_rate": 3e-06, "loss": 0.0198, "step": 3070 }, { "clip_ratio": 0.0005795367178507149, "epoch": 0.008525866329074564, "grad_norm": 0.13736887276172638, "kl": 0.3146470934152603, "learning_rate": 3e-06, "loss": 0.0203, "step": 3071 }, { "clip_ratio": 0.0011748351971618831, "epoch": 0.00852864257991438, "grad_norm": 0.12065845727920532, "kl": 0.31247714161872864, "learning_rate": 3e-06, "loss": 0.0204, "step": 3072 }, { "clip_ratio": 8.567512122681364e-05, "completion_length": 195.27083587646484, "epoch": 0.008531418830754197, "grad_norm": 0.18907572329044342, "kl": 0.32637839019298553, "learning_rate": 3e-06, "loss": 0.0733, "reward": 0.43541669845581055, "reward_std": 0.3927324414253235, "rewards/countdown_reward_func": 0.43541666865348816, "step": 3073, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.00041528159636072814, "epoch": 0.008534195081594012, "grad_norm": 0.19040289521217346, "kl": 0.3201305568218231, "learning_rate": 3e-06, "loss": 0.0735, "step": 3074 }, { "clip_ratio": 0.00010683760774554685, "epoch": 0.008536971332433828, "grad_norm": 0.2120184451341629, "kl": 0.31627263128757477, "learning_rate": 3e-06, "loss": 0.0726, "step": 3075 }, { "clip_ratio": 0.00011984659795416519, "epoch": 0.008539747583273643, "grad_norm": 0.1814897507429123, "kl": 0.31967295706272125, "learning_rate": 3e-06, "loss": 0.0719, "step": 3076 }, { "clip_ratio": 0.00021461536380229518, "epoch": 0.00854252383411346, "grad_norm": 0.18902859091758728, "kl": 0.3382069915533066, "learning_rate": 3e-06, "loss": 0.0723, "step": 3077 }, { "clip_ratio": 0.0, "epoch": 0.008545300084953276, "grad_norm": 0.21231567859649658, "kl": 0.33370304107666016, "learning_rate": 3e-06, "loss": 0.0708, "step": 3078 }, { "clip_ratio": 0.0002872814948204905, "epoch": 0.008548076335793091, "grad_norm": 0.16428625583648682, "kl": 0.36024677753448486, "learning_rate": 3e-06, "loss": 0.0702, "step": 3079 }, { "clip_ratio": 9.476876584812999e-05, "epoch": 0.008550852586632907, "grad_norm": 0.1921549141407013, "kl": 0.37211868166923523, "learning_rate": 3e-06, "loss": 0.0689, "step": 3080 }, { "clip_ratio": 0.0, "epoch": 0.008553628837472724, "grad_norm": 0.19261738657951355, "kl": 0.3795281648635864, "learning_rate": 3e-06, "loss": 0.0677, "step": 3081 }, { "clip_ratio": 9.476876584812999e-05, "epoch": 0.00855640508831254, "grad_norm": 0.1471017599105835, "kl": 0.3986634314060211, "learning_rate": 3e-06, "loss": 0.0668, "step": 3082 }, { "clip_ratio": 0.0, "epoch": 0.008559181339152355, "grad_norm": 0.14474532008171082, "kl": 0.42843401432037354, "learning_rate": 3e-06, "loss": 0.0669, "step": 3083 }, { "clip_ratio": 0.00011984659795416519, "epoch": 0.008561957589992172, "grad_norm": 0.1712830364704132, "kl": 0.4253462553024292, "learning_rate": 3e-06, "loss": 0.0652, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 176.58333587646484, "epoch": 0.008564733840831987, "grad_norm": 0.1854863464832306, "kl": 0.43677669763565063, "learning_rate": 3e-06, "loss": 0.0225, "reward": 0.30000002682209015, "reward_std": 0.29471276700496674, "rewards/countdown_reward_func": 0.30000001937150955, "step": 3085, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.667439735494554e-05, "epoch": 0.008567510091671802, "grad_norm": 0.18476706743240356, "kl": 0.448853075504303, "learning_rate": 3e-06, "loss": 0.0217, "step": 3086 }, { "clip_ratio": 0.0, "epoch": 0.008570286342511618, "grad_norm": 0.14640380442142487, "kl": 0.47662124037742615, "learning_rate": 3e-06, "loss": 0.0221, "step": 3087 }, { "clip_ratio": 0.0, "epoch": 0.008573062593351435, "grad_norm": 0.19531789422035217, "kl": 0.45479515194892883, "learning_rate": 3e-06, "loss": 0.0221, "step": 3088 }, { "clip_ratio": 0.00012562813935801387, "epoch": 0.00857583884419125, "grad_norm": 0.166063129901886, "kl": 0.45544037222862244, "learning_rate": 3e-06, "loss": 0.0215, "step": 3089 }, { "clip_ratio": 0.00012254902685526758, "epoch": 0.008578615095031066, "grad_norm": 0.1422431766986847, "kl": 0.46465128660202026, "learning_rate": 3e-06, "loss": 0.0219, "step": 3090 }, { "clip_ratio": 0.0, "epoch": 0.008581391345870881, "grad_norm": 0.18519708514213562, "kl": 0.4750974178314209, "learning_rate": 3e-06, "loss": 0.0213, "step": 3091 }, { "clip_ratio": 0.0001035625537042506, "epoch": 0.008584167596710698, "grad_norm": 0.16989552974700928, "kl": 0.46254733204841614, "learning_rate": 3e-06, "loss": 0.0218, "step": 3092 }, { "clip_ratio": 0.00011814745084848255, "epoch": 0.008586943847550514, "grad_norm": 0.14211523532867432, "kl": 0.4715369939804077, "learning_rate": 3e-06, "loss": 0.0214, "step": 3093 }, { "clip_ratio": 0.00012562813935801387, "epoch": 0.00858972009839033, "grad_norm": 0.18019472062587738, "kl": 0.4375690817832947, "learning_rate": 3e-06, "loss": 0.0207, "step": 3094 }, { "clip_ratio": 0.00012254902685526758, "epoch": 0.008592496349230146, "grad_norm": 0.1632636934518814, "kl": 0.42439958453178406, "learning_rate": 3e-06, "loss": 0.0188, "step": 3095 }, { "clip_ratio": 0.0002261115878354758, "epoch": 0.008595272600069962, "grad_norm": 0.18175108730793, "kl": 0.42348697781562805, "learning_rate": 3e-06, "loss": 0.0201, "step": 3096 }, { "clip_ratio": 0.0009116693399846554, "completion_length": 188.83333587646484, "epoch": 0.008598048850909777, "grad_norm": 0.16925153136253357, "kl": 0.418431356549263, "learning_rate": 3e-06, "loss": 0.0126, "reward": 0.22708334773778915, "reward_std": 0.1685066595673561, "rewards/countdown_reward_func": 0.22708334773778915, "step": 3097, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.008600825101749593, "grad_norm": 0.16533486545085907, "kl": 0.40907223522663116, "learning_rate": 3e-06, "loss": 0.0123, "step": 3098 }, { "clip_ratio": 0.0, "epoch": 0.00860360135258941, "grad_norm": 0.18030254542827606, "kl": 0.38974761962890625, "learning_rate": 3e-06, "loss": 0.0118, "step": 3099 }, { "clip_ratio": 0.00012939958833158016, "epoch": 0.008606377603429225, "grad_norm": 0.14352940022945404, "kl": 0.3707566559314728, "learning_rate": 3e-06, "loss": 0.0105, "step": 3100 }, { "clip_ratio": 0.0002140280994353816, "epoch": 0.00860915385426904, "grad_norm": 0.14302270114421844, "kl": 0.3659559190273285, "learning_rate": 3e-06, "loss": 0.0097, "step": 3101 }, { "clip_ratio": 0.0011671447136905044, "epoch": 0.008611930105108858, "grad_norm": 0.16045400500297546, "kl": 0.351381853222847, "learning_rate": 3e-06, "loss": 0.0088, "step": 3102 }, { "clip_ratio": 0.0019049870606977493, "epoch": 0.008614706355948673, "grad_norm": 0.14621001482009888, "kl": 0.33087173104286194, "learning_rate": 3e-06, "loss": 0.0077, "step": 3103 }, { "clip_ratio": 0.0007671903586015105, "epoch": 0.008617482606788488, "grad_norm": 0.12585081160068512, "kl": 0.31923481822013855, "learning_rate": 3e-06, "loss": 0.0074, "step": 3104 }, { "clip_ratio": 0.0014542836288455874, "epoch": 0.008620258857628304, "grad_norm": 0.1511574536561966, "kl": 0.3015774190425873, "learning_rate": 3e-06, "loss": 0.0057, "step": 3105 }, { "clip_ratio": 0.0043881842866539955, "epoch": 0.008623035108468121, "grad_norm": 0.1201237142086029, "kl": 0.2829260230064392, "learning_rate": 3e-06, "loss": 0.0059, "step": 3106 }, { "clip_ratio": 0.008837747853249311, "epoch": 0.008625811359307936, "grad_norm": 0.11277580261230469, "kl": 0.2772577852010727, "learning_rate": 3e-06, "loss": 0.0051, "step": 3107 }, { "clip_ratio": 0.018280779011547565, "epoch": 0.008628587610147752, "grad_norm": 0.11727593839168549, "kl": 0.2665339708328247, "learning_rate": 3e-06, "loss": 0.0041, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 203.70833587646484, "epoch": 0.008631363860987567, "grad_norm": 0.11837958544492722, "kl": 0.23622510582208633, "learning_rate": 3e-06, "loss": 0.0162, "reward": 0.35833336412906647, "reward_std": 0.36252938210964203, "rewards/countdown_reward_func": 0.3583333417773247, "step": 3109, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.008634140111827384, "grad_norm": 0.12453795224428177, "kl": 0.23774665594100952, "learning_rate": 3e-06, "loss": 0.0152, "step": 3110 }, { "clip_ratio": 0.0, "epoch": 0.0086369163626672, "grad_norm": 0.12831057608127594, "kl": 0.2375757023692131, "learning_rate": 3e-06, "loss": 0.0151, "step": 3111 }, { "clip_ratio": 0.00021458745322888717, "epoch": 0.008639692613507015, "grad_norm": 0.15377934277057648, "kl": 0.22204899042844772, "learning_rate": 3e-06, "loss": 0.0154, "step": 3112 }, { "clip_ratio": 0.00010945709072984755, "epoch": 0.008642468864346832, "grad_norm": 0.13043875992298126, "kl": 0.22094909846782684, "learning_rate": 3e-06, "loss": 0.0141, "step": 3113 }, { "clip_ratio": 0.0, "epoch": 0.008645245115186648, "grad_norm": 0.13309596478939056, "kl": 0.21534118801355362, "learning_rate": 3e-06, "loss": 0.0145, "step": 3114 }, { "clip_ratio": 0.00010373444092692807, "epoch": 0.008648021366026463, "grad_norm": 0.13112622499465942, "kl": 0.1934543028473854, "learning_rate": 3e-06, "loss": 0.0147, "step": 3115 }, { "clip_ratio": 0.0006119465688243508, "epoch": 0.008650797616866279, "grad_norm": 0.12741701304912567, "kl": 0.1981368288397789, "learning_rate": 3e-06, "loss": 0.0131, "step": 3116 }, { "clip_ratio": 9.85804435913451e-05, "epoch": 0.008653573867706096, "grad_norm": 0.12851454317569733, "kl": 0.20106661319732666, "learning_rate": 3e-06, "loss": 0.0134, "step": 3117 }, { "clip_ratio": 0.0003074552150792442, "epoch": 0.008656350118545911, "grad_norm": 0.11376245319843292, "kl": 0.19054380059242249, "learning_rate": 3e-06, "loss": 0.0144, "step": 3118 }, { "clip_ratio": 0.0, "epoch": 0.008659126369385726, "grad_norm": 0.15265706181526184, "kl": 0.192866250872612, "learning_rate": 3e-06, "loss": 0.0109, "step": 3119 }, { "clip_ratio": 0.0006101824692450464, "epoch": 0.008661902620225542, "grad_norm": 0.12200378626585007, "kl": 0.19264718890190125, "learning_rate": 3e-06, "loss": 0.0123, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 202.95833587646484, "epoch": 0.008664678871065359, "grad_norm": 0.0969107523560524, "kl": 0.19015229493379593, "learning_rate": 3e-06, "loss": -0.0006, "reward": 0.33750002086162567, "reward_std": 0.25560229271650314, "rewards/countdown_reward_func": 0.3375000134110451, "step": 3121, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00011200716835446656, "epoch": 0.008667455121905174, "grad_norm": 0.13161097466945648, "kl": 0.18232522904872894, "learning_rate": 3e-06, "loss": -0.0008, "step": 3122 }, { "clip_ratio": 0.00010729613859439269, "epoch": 0.00867023137274499, "grad_norm": 0.09631598740816116, "kl": 0.1801673322916031, "learning_rate": 3e-06, "loss": -0.002, "step": 3123 }, { "clip_ratio": 0.0, "epoch": 0.008673007623584807, "grad_norm": 0.10793627798557281, "kl": 0.16692427545785904, "learning_rate": 3e-06, "loss": -0.0016, "step": 3124 }, { "clip_ratio": 0.0, "epoch": 0.008675783874424622, "grad_norm": 0.09990864247083664, "kl": 0.17877893894910812, "learning_rate": 3e-06, "loss": -0.001, "step": 3125 }, { "clip_ratio": 0.0005498202008311637, "epoch": 0.008678560125264438, "grad_norm": 0.09849561750888824, "kl": 0.17538080364465714, "learning_rate": 3e-06, "loss": -0.0009, "step": 3126 }, { "clip_ratio": 0.0, "epoch": 0.008681336376104253, "grad_norm": 0.10231944173574448, "kl": 0.1710778921842575, "learning_rate": 3e-06, "loss": -0.0018, "step": 3127 }, { "clip_ratio": 0.0003024511752300896, "epoch": 0.00868411262694407, "grad_norm": 0.12745961546897888, "kl": 0.16763567924499512, "learning_rate": 3e-06, "loss": -0.0024, "step": 3128 }, { "clip_ratio": 0.00010179152741329744, "epoch": 0.008686888877783886, "grad_norm": 0.10914169996976852, "kl": 0.1661512404680252, "learning_rate": 3e-06, "loss": -0.003, "step": 3129 }, { "clip_ratio": 0.00019380133016966283, "epoch": 0.008689665128623701, "grad_norm": 0.1022045835852623, "kl": 0.15079744160175323, "learning_rate": 3e-06, "loss": -0.0032, "step": 3130 }, { "clip_ratio": 0.0, "epoch": 0.008692441379463517, "grad_norm": 0.0985385924577713, "kl": 0.16422124207019806, "learning_rate": 3e-06, "loss": -0.0031, "step": 3131 }, { "clip_ratio": 0.0005141025903867558, "epoch": 0.008695217630303334, "grad_norm": 0.09927012771368027, "kl": 0.16305622458457947, "learning_rate": 3e-06, "loss": -0.0027, "step": 3132 }, { "clip_ratio": 8.840169903123751e-05, "completion_length": 217.7916717529297, "epoch": 0.008697993881143149, "grad_norm": 0.07245054841041565, "kl": 0.14446517080068588, "learning_rate": 3e-06, "loss": 0.0142, "reward": 0.34166668355464935, "reward_std": 0.21667249500751495, "rewards/countdown_reward_func": 0.34166668355464935, "step": 3133, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.008700770131982965, "grad_norm": 0.07044894248247147, "kl": 0.14337440580129623, "learning_rate": 3e-06, "loss": 0.014, "step": 3134 }, { "clip_ratio": 0.0, "epoch": 0.008703546382822782, "grad_norm": 0.0757245197892189, "kl": 0.1318872570991516, "learning_rate": 3e-06, "loss": 0.0136, "step": 3135 }, { "clip_ratio": 0.0, "epoch": 0.008706322633662597, "grad_norm": 0.06143643707036972, "kl": 0.138749398291111, "learning_rate": 3e-06, "loss": 0.0135, "step": 3136 }, { "clip_ratio": 0.0006214517270564102, "epoch": 0.008709098884502412, "grad_norm": 0.07034595310688019, "kl": 0.13337260484695435, "learning_rate": 3e-06, "loss": 0.0129, "step": 3137 }, { "clip_ratio": 0.00027665217203320935, "epoch": 0.008711875135342228, "grad_norm": 0.0638313740491867, "kl": 0.13524968922138214, "learning_rate": 3e-06, "loss": 0.0137, "step": 3138 }, { "clip_ratio": 9.057971328729764e-05, "epoch": 0.008714651386182045, "grad_norm": 0.07481712847948074, "kl": 0.13550669699907303, "learning_rate": 3e-06, "loss": 0.0142, "step": 3139 }, { "clip_ratio": 8.809020073385909e-05, "epoch": 0.00871742763702186, "grad_norm": 0.07455291599035263, "kl": 0.13690495491027832, "learning_rate": 3e-06, "loss": 0.0131, "step": 3140 }, { "clip_ratio": 9.057971328729764e-05, "epoch": 0.008720203887861676, "grad_norm": 0.07917942106723785, "kl": 0.12809910625219345, "learning_rate": 3e-06, "loss": 0.0132, "step": 3141 }, { "clip_ratio": 0.00017944321734830737, "epoch": 0.008722980138701491, "grad_norm": 0.06103288382291794, "kl": 0.13416597247123718, "learning_rate": 3e-06, "loss": 0.0136, "step": 3142 }, { "clip_ratio": 0.0009079299270524643, "epoch": 0.008725756389541308, "grad_norm": 0.09830410033464432, "kl": 0.13004712015390396, "learning_rate": 3e-06, "loss": 0.0122, "step": 3143 }, { "clip_ratio": 0.00018115942657459527, "epoch": 0.008728532640381124, "grad_norm": 0.07918588072061539, "kl": 0.13146449625492096, "learning_rate": 3e-06, "loss": 0.0132, "step": 3144 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 223.0416717529297, "epoch": 0.00873130889122094, "grad_norm": 0.13423959910869598, "kl": 0.14421016722917557, "learning_rate": 3e-06, "loss": 0.0028, "reward": 0.2500000223517418, "reward_std": 0.24180647730827332, "rewards/countdown_reward_func": 0.2500000074505806, "step": 3145, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.008734085142060756, "grad_norm": 0.10806974768638611, "kl": 0.14668051898479462, "learning_rate": 3e-06, "loss": 0.0035, "step": 3146 }, { "clip_ratio": 0.0, "epoch": 0.008736861392900572, "grad_norm": 0.06827183067798615, "kl": 0.14053280651569366, "learning_rate": 3e-06, "loss": 0.0035, "step": 3147 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.008739637643740387, "grad_norm": 0.07476585358381271, "kl": 0.14173533022403717, "learning_rate": 3e-06, "loss": 0.0034, "step": 3148 }, { "clip_ratio": 0.0, "epoch": 0.008742413894580203, "grad_norm": 0.08526886999607086, "kl": 0.13891906291246414, "learning_rate": 3e-06, "loss": 0.0027, "step": 3149 }, { "clip_ratio": 0.0006623226727242582, "epoch": 0.00874519014542002, "grad_norm": 0.1079644113779068, "kl": 0.14368348568677902, "learning_rate": 3e-06, "loss": 0.0028, "step": 3150 }, { "clip_ratio": 0.0005405927586252801, "epoch": 0.008747966396259835, "grad_norm": 0.13065826892852783, "kl": 0.14139443635940552, "learning_rate": 3e-06, "loss": 0.0021, "step": 3151 }, { "clip_ratio": 0.0002578174462541938, "epoch": 0.00875074264709965, "grad_norm": 0.11560037732124329, "kl": 0.1433216780424118, "learning_rate": 3e-06, "loss": 0.0026, "step": 3152 }, { "clip_ratio": 0.00019186492136213928, "epoch": 0.008753518897939466, "grad_norm": 0.06760667264461517, "kl": 0.13707970827817917, "learning_rate": 3e-06, "loss": 0.0023, "step": 3153 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.008756295148779283, "grad_norm": 0.07629324495792389, "kl": 0.13755664974451065, "learning_rate": 3e-06, "loss": 0.0022, "step": 3154 }, { "clip_ratio": 0.0006100469909142703, "epoch": 0.008759071399619098, "grad_norm": 0.08185707032680511, "kl": 0.1355728581547737, "learning_rate": 3e-06, "loss": 0.0012, "step": 3155 }, { "clip_ratio": 0.0003368018660694361, "epoch": 0.008761847650458914, "grad_norm": 0.10096393525600433, "kl": 0.1373049020767212, "learning_rate": 3e-06, "loss": 0.0008, "step": 3156 }, { "clip_ratio": 0.00019086471729679033, "completion_length": 202.50000762939453, "epoch": 0.008764623901298731, "grad_norm": 0.16324223577976227, "kl": 0.13057880103588104, "learning_rate": 3e-06, "loss": 0.051, "reward": 0.4125000089406967, "reward_std": 0.3502862676978111, "rewards/countdown_reward_func": 0.4125000089406967, "step": 3157, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.008767400152138546, "grad_norm": 0.147431880235672, "kl": 0.13627608120441437, "learning_rate": 3e-06, "loss": 0.0516, "step": 3158 }, { "clip_ratio": 0.0, "epoch": 0.008770176402978362, "grad_norm": 0.16802668571472168, "kl": 0.13467568159103394, "learning_rate": 3e-06, "loss": 0.0509, "step": 3159 }, { "clip_ratio": 0.00011961722339037806, "epoch": 0.008772952653818177, "grad_norm": 0.14335252344608307, "kl": 0.13732612133026123, "learning_rate": 3e-06, "loss": 0.0517, "step": 3160 }, { "clip_ratio": 0.00033380321110598743, "epoch": 0.008775728904657994, "grad_norm": 0.14163748919963837, "kl": 0.1391049399971962, "learning_rate": 3e-06, "loss": 0.0515, "step": 3161 }, { "clip_ratio": 0.0, "epoch": 0.00877850515549781, "grad_norm": 0.14354771375656128, "kl": 0.14471176266670227, "learning_rate": 3e-06, "loss": 0.0514, "step": 3162 }, { "clip_ratio": 0.0, "epoch": 0.008781281406337625, "grad_norm": 0.16682764887809753, "kl": 0.13667885959148407, "learning_rate": 3e-06, "loss": 0.0495, "step": 3163 }, { "clip_ratio": 0.00022061960771679878, "epoch": 0.00878405765717744, "grad_norm": 0.14736826717853546, "kl": 0.14657501131296158, "learning_rate": 3e-06, "loss": 0.0501, "step": 3164 }, { "clip_ratio": 9.735202183946967e-05, "epoch": 0.008786833908017258, "grad_norm": 0.15287110209465027, "kl": 0.14519157260656357, "learning_rate": 3e-06, "loss": 0.0491, "step": 3165 }, { "clip_ratio": 0.0003215074248146266, "epoch": 0.008789610158857073, "grad_norm": 0.183084636926651, "kl": 0.15177743136882782, "learning_rate": 3e-06, "loss": 0.0496, "step": 3166 }, { "clip_ratio": 0.00031484465580433607, "epoch": 0.008792386409696889, "grad_norm": 0.12996704876422882, "kl": 0.15576454997062683, "learning_rate": 3e-06, "loss": 0.0486, "step": 3167 }, { "clip_ratio": 9.433962259208784e-05, "epoch": 0.008795162660536706, "grad_norm": 0.1341649740934372, "kl": 0.16323447972536087, "learning_rate": 3e-06, "loss": 0.0477, "step": 3168 }, { "clip_ratio": 9.005763422464952e-05, "completion_length": 230.4375, "epoch": 0.008797938911376521, "grad_norm": 0.08408898860216141, "kl": 0.1531185433268547, "learning_rate": 3e-06, "loss": 0.0029, "reward": 0.24791669100522995, "reward_std": 0.20933149755001068, "rewards/countdown_reward_func": 0.24791667610406876, "step": 3169, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.008800715162216336, "grad_norm": 0.0844191238284111, "kl": 0.16432736068964005, "learning_rate": 3e-06, "loss": 0.0036, "step": 3170 }, { "clip_ratio": 0.0, "epoch": 0.008803491413056152, "grad_norm": 0.08834011107683182, "kl": 0.16174228489398956, "learning_rate": 3e-06, "loss": 0.003, "step": 3171 }, { "clip_ratio": 0.0001686909527052194, "epoch": 0.008806267663895969, "grad_norm": 0.10913260281085968, "kl": 0.17009952664375305, "learning_rate": 3e-06, "loss": 0.0036, "step": 3172 }, { "clip_ratio": 9.314456110587344e-05, "epoch": 0.008809043914735784, "grad_norm": 0.09643187373876572, "kl": 0.1666809618473053, "learning_rate": 3e-06, "loss": 0.0042, "step": 3173 }, { "clip_ratio": 0.00010008006211137399, "epoch": 0.0088118201655756, "grad_norm": 0.09687443822622299, "kl": 0.16616057604551315, "learning_rate": 3e-06, "loss": 0.0035, "step": 3174 }, { "clip_ratio": 0.0, "epoch": 0.008814596416415415, "grad_norm": 0.09173749387264252, "kl": 0.17206548899412155, "learning_rate": 3e-06, "loss": 0.0033, "step": 3175 }, { "clip_ratio": 0.000187265919521451, "epoch": 0.008817372667255232, "grad_norm": 0.08999493718147278, "kl": 0.17810207605361938, "learning_rate": 3e-06, "loss": 0.0035, "step": 3176 }, { "clip_ratio": 0.00026706433709478006, "epoch": 0.008820148918095048, "grad_norm": 0.09909993410110474, "kl": 0.1738302931189537, "learning_rate": 3e-06, "loss": 0.0031, "step": 3177 }, { "clip_ratio": 0.0003430515280342661, "epoch": 0.008822925168934863, "grad_norm": 0.10833445936441422, "kl": 0.17874176800251007, "learning_rate": 3e-06, "loss": 0.0024, "step": 3178 }, { "clip_ratio": 9.314456110587344e-05, "epoch": 0.00882570141977468, "grad_norm": 0.09550219774246216, "kl": 0.1728803962469101, "learning_rate": 3e-06, "loss": 0.0036, "step": 3179 }, { "clip_ratio": 0.0003373819054104388, "epoch": 0.008828477670614496, "grad_norm": 0.10736780613660812, "kl": 0.1694168746471405, "learning_rate": 3e-06, "loss": 0.0028, "step": 3180 }, { "clip_ratio": 0.00027249832783127204, "completion_length": 215.81250762939453, "epoch": 0.008831253921454311, "grad_norm": 0.045050859451293945, "kl": 0.16988086700439453, "learning_rate": 3e-06, "loss": 0.0114, "reward": 0.1354166716337204, "reward_std": 0.09314198791980743, "rewards/countdown_reward_func": 0.1354166716337204, "step": 3181, "zero_std_ratio": 0.75 }, { "clip_ratio": 9.920635056914762e-05, "epoch": 0.008834030172294127, "grad_norm": 0.05934186279773712, "kl": 0.17056862264871597, "learning_rate": 3e-06, "loss": 0.0107, "step": 3182 }, { "clip_ratio": 9.300595411332324e-05, "epoch": 0.008836806423133944, "grad_norm": 0.06379352509975433, "kl": 0.17337292432785034, "learning_rate": 3e-06, "loss": 0.0111, "step": 3183 }, { "clip_ratio": 0.0005377328343456611, "epoch": 0.008839582673973759, "grad_norm": 0.048195429146289825, "kl": 0.1668781116604805, "learning_rate": 3e-06, "loss": 0.0109, "step": 3184 }, { "clip_ratio": 9.300595411332324e-05, "epoch": 0.008842358924813574, "grad_norm": 0.05198591575026512, "kl": 0.16725103557109833, "learning_rate": 3e-06, "loss": 0.0107, "step": 3185 }, { "clip_ratio": 0.0, "epoch": 0.00884513517565339, "grad_norm": 0.047836218029260635, "kl": 0.1580887734889984, "learning_rate": 3e-06, "loss": 0.0103, "step": 3186 }, { "clip_ratio": 0.0003760358376894146, "epoch": 0.008847911426493207, "grad_norm": 0.047014977782964706, "kl": 0.1621924787759781, "learning_rate": 3e-06, "loss": 0.0109, "step": 3187 }, { "clip_ratio": 0.00017329197726212442, "epoch": 0.008850687677333022, "grad_norm": 0.06185169890522957, "kl": 0.16211094707250595, "learning_rate": 3e-06, "loss": 0.0104, "step": 3188 }, { "clip_ratio": 9.300595411332324e-05, "epoch": 0.008853463928172838, "grad_norm": 0.06670218706130981, "kl": 0.16410937905311584, "learning_rate": 3e-06, "loss": 0.0104, "step": 3189 }, { "clip_ratio": 0.0005576964322244748, "epoch": 0.008856240179012655, "grad_norm": 0.046619921922683716, "kl": 0.1557774841785431, "learning_rate": 3e-06, "loss": 0.01, "step": 3190 }, { "clip_ratio": 0.00046365898015210405, "epoch": 0.00885901642985247, "grad_norm": 0.045677557587623596, "kl": 0.15717875957489014, "learning_rate": 3e-06, "loss": 0.0102, "step": 3191 }, { "clip_ratio": 0.000478205613035243, "epoch": 0.008861792680692286, "grad_norm": 0.046221353113651276, "kl": 0.14866939932107925, "learning_rate": 3e-06, "loss": 0.0097, "step": 3192 }, { "clip_ratio": 0.00030436589440796524, "completion_length": 208.2916717529297, "epoch": 0.008864568931532101, "grad_norm": 0.08959240466356277, "kl": 0.16229014843702316, "learning_rate": 3e-06, "loss": 0.0233, "reward": 0.3958333432674408, "reward_std": 0.377306193113327, "rewards/countdown_reward_func": 0.3958333283662796, "step": 3193, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.008867345182371918, "grad_norm": 0.09560725092887878, "kl": 0.165329247713089, "learning_rate": 3e-06, "loss": 0.0231, "step": 3194 }, { "clip_ratio": 0.00017775646119844168, "epoch": 0.008870121433211734, "grad_norm": 0.1095302626490593, "kl": 0.1652495339512825, "learning_rate": 3e-06, "loss": 0.0236, "step": 3195 }, { "clip_ratio": 9.505703201284632e-05, "epoch": 0.00887289768405155, "grad_norm": 0.1113349050283432, "kl": 0.16995997726917267, "learning_rate": 3e-06, "loss": 0.0239, "step": 3196 }, { "clip_ratio": 0.0, "epoch": 0.008875673934891365, "grad_norm": 0.13030965626239777, "kl": 0.15821540355682373, "learning_rate": 3e-06, "loss": 0.0233, "step": 3197 }, { "clip_ratio": 0.0, "epoch": 0.008878450185731182, "grad_norm": 0.11282457411289215, "kl": 0.15858536213636398, "learning_rate": 3e-06, "loss": 0.0231, "step": 3198 }, { "clip_ratio": 0.000332137118675746, "epoch": 0.008881226436570997, "grad_norm": 0.11879956722259521, "kl": 0.15846314281225204, "learning_rate": 3e-06, "loss": 0.0218, "step": 3199 }, { "clip_ratio": 0.0, "epoch": 0.008884002687410813, "grad_norm": 0.10702770203351974, "kl": 0.16206620633602142, "learning_rate": 3e-06, "loss": 0.023, "step": 3200 }, { "clip_ratio": 0.0001927525008795783, "epoch": 0.00888677893825063, "grad_norm": 0.14720606803894043, "kl": 0.16430184245109558, "learning_rate": 3e-06, "loss": 0.023, "step": 3201 }, { "clip_ratio": 0.00028517108876258135, "epoch": 0.008889555189090445, "grad_norm": 0.11928429454565048, "kl": 0.1687665656208992, "learning_rate": 3e-06, "loss": 0.023, "step": 3202 }, { "clip_ratio": 0.00010841283801710233, "epoch": 0.00889233143993026, "grad_norm": 0.18026551604270935, "kl": 0.1596744880080223, "learning_rate": 3e-06, "loss": 0.0224, "step": 3203 }, { "clip_ratio": 0.0, "epoch": 0.008895107690770076, "grad_norm": 0.1088532954454422, "kl": 0.15937218070030212, "learning_rate": 3e-06, "loss": 0.0217, "step": 3204 }, { "clip_ratio": 9.097525617107749e-05, "completion_length": 225.64583587646484, "epoch": 0.008897883941609893, "grad_norm": 0.13462470471858978, "kl": 0.1489795595407486, "learning_rate": 3e-06, "loss": -0.0006, "reward": 0.39375002682209015, "reward_std": 0.35414034128189087, "rewards/countdown_reward_func": 0.39375001192092896, "step": 3205, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0005514068761840463, "epoch": 0.008900660192449708, "grad_norm": 0.09613679349422455, "kl": 0.14799974113702774, "learning_rate": 3e-06, "loss": -0.0001, "step": 3206 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.008903436443289524, "grad_norm": 0.13747777044773102, "kl": 0.15633682161569595, "learning_rate": 3e-06, "loss": 0.0005, "step": 3207 }, { "clip_ratio": 0.0001823501952458173, "epoch": 0.00890621269412934, "grad_norm": 0.10208241641521454, "kl": 0.14966540038585663, "learning_rate": 3e-06, "loss": 0.0005, "step": 3208 }, { "clip_ratio": 0.00018953753169625998, "epoch": 0.008908988944969156, "grad_norm": 0.10091890394687653, "kl": 0.15433496236801147, "learning_rate": 3e-06, "loss": 0.0002, "step": 3209 }, { "clip_ratio": 0.0005576753173954785, "epoch": 0.008911765195808972, "grad_norm": 0.14313778281211853, "kl": 0.14915839582681656, "learning_rate": 3e-06, "loss": -0.0008, "step": 3210 }, { "clip_ratio": 0.0002683616185095161, "epoch": 0.008914541446648787, "grad_norm": 0.135110542178154, "kl": 0.14861468970775604, "learning_rate": 3e-06, "loss": -0.0001, "step": 3211 }, { "clip_ratio": 0.0005443222107714973, "epoch": 0.008917317697488604, "grad_norm": 0.09191743284463882, "kl": 0.14881686121225357, "learning_rate": 3e-06, "loss": -0.0001, "step": 3212 }, { "clip_ratio": 0.00017738635506248102, "epoch": 0.00892009394832842, "grad_norm": 0.1407952606678009, "kl": 0.15256252884864807, "learning_rate": 3e-06, "loss": -0.0013, "step": 3213 }, { "clip_ratio": 0.0002771189610939473, "epoch": 0.008922870199168235, "grad_norm": 0.10390833020210266, "kl": 0.14626572281122208, "learning_rate": 3e-06, "loss": -0.0016, "step": 3214 }, { "clip_ratio": 0.0004360197199275717, "epoch": 0.00892564645000805, "grad_norm": 0.09435190260410309, "kl": 0.14898258447647095, "learning_rate": 3e-06, "loss": -0.0008, "step": 3215 }, { "clip_ratio": 0.0009328907472081482, "epoch": 0.008928422700847868, "grad_norm": 0.13351312279701233, "kl": 0.14194120466709137, "learning_rate": 3e-06, "loss": -0.0031, "step": 3216 }, { "clip_ratio": 0.00010469011613167822, "completion_length": 202.2916717529297, "epoch": 0.008931198951687683, "grad_norm": 0.12071104347705841, "kl": 0.17528368532657623, "learning_rate": 3e-06, "loss": 0.0343, "reward": 0.39791667461395264, "reward_std": 0.3494187593460083, "rewards/countdown_reward_func": 0.39791667461395264, "step": 3217, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00010469011613167822, "epoch": 0.008933975202527498, "grad_norm": 0.12988895177841187, "kl": 0.15753353387117386, "learning_rate": 3e-06, "loss": 0.034, "step": 3218 }, { "clip_ratio": 0.0, "epoch": 0.008936751453367314, "grad_norm": 0.10915003716945648, "kl": 0.1584765389561653, "learning_rate": 3e-06, "loss": 0.0349, "step": 3219 }, { "clip_ratio": 0.0002881090476876125, "epoch": 0.008939527704207131, "grad_norm": 0.1256483644247055, "kl": 0.15565815567970276, "learning_rate": 3e-06, "loss": 0.0341, "step": 3220 }, { "clip_ratio": 0.0, "epoch": 0.008942303955046946, "grad_norm": 0.1146453246474266, "kl": 0.1595410257577896, "learning_rate": 3e-06, "loss": 0.0347, "step": 3221 }, { "clip_ratio": 0.0007747744311927818, "epoch": 0.008945080205886762, "grad_norm": 0.1403200626373291, "kl": 0.16868915408849716, "learning_rate": 3e-06, "loss": 0.0343, "step": 3222 }, { "clip_ratio": 0.00010469011613167822, "epoch": 0.008947856456726579, "grad_norm": 0.1134171113371849, "kl": 0.17640890926122665, "learning_rate": 3e-06, "loss": 0.0339, "step": 3223 }, { "clip_ratio": 0.00040489107777830213, "epoch": 0.008950632707566394, "grad_norm": 0.2046290785074234, "kl": 0.16263237595558167, "learning_rate": 3e-06, "loss": 0.0336, "step": 3224 }, { "clip_ratio": 0.0, "epoch": 0.00895340895840621, "grad_norm": 0.11580358445644379, "kl": 0.16375034302473068, "learning_rate": 3e-06, "loss": 0.0334, "step": 3225 }, { "clip_ratio": 0.0002881090476876125, "epoch": 0.008956185209246025, "grad_norm": 0.13587523996829987, "kl": 0.16372690349817276, "learning_rate": 3e-06, "loss": 0.0319, "step": 3226 }, { "clip_ratio": 0.0003235108670196496, "epoch": 0.008958961460085842, "grad_norm": 0.11053664237260818, "kl": 0.1698642075061798, "learning_rate": 3e-06, "loss": 0.0331, "step": 3227 }, { "clip_ratio": 0.0004447523824637756, "epoch": 0.008961737710925658, "grad_norm": 0.13200335204601288, "kl": 0.18022387474775314, "learning_rate": 3e-06, "loss": 0.0325, "step": 3228 }, { "clip_ratio": 9.110787505051121e-05, "completion_length": 199.3541717529297, "epoch": 0.008964513961765473, "grad_norm": 0.12072847038507462, "kl": 0.15663744509220123, "learning_rate": 3e-06, "loss": 0.0292, "reward": 0.43541669845581055, "reward_std": 0.4068782329559326, "rewards/countdown_reward_func": 0.43541666865348816, "step": 3229, "zero_std_ratio": 0.125 }, { "clip_ratio": 0.0, "epoch": 0.008967290212605289, "grad_norm": 0.13515719771385193, "kl": 0.1652759164571762, "learning_rate": 3e-06, "loss": 0.0294, "step": 3230 }, { "clip_ratio": 9.110787505051121e-05, "epoch": 0.008970066463445106, "grad_norm": 0.12341601401567459, "kl": 0.1741245537996292, "learning_rate": 3e-06, "loss": 0.0285, "step": 3231 }, { "clip_ratio": 9.110787505051121e-05, "epoch": 0.008972842714284921, "grad_norm": 0.11446622759103775, "kl": 0.17039810121059418, "learning_rate": 3e-06, "loss": 0.0282, "step": 3232 }, { "clip_ratio": 9.742790280142799e-05, "epoch": 0.008975618965124737, "grad_norm": 0.12283588945865631, "kl": 0.1725316047668457, "learning_rate": 3e-06, "loss": 0.0278, "step": 3233 }, { "clip_ratio": 0.0, "epoch": 0.008978395215964554, "grad_norm": 0.1350124478340149, "kl": 0.17889952659606934, "learning_rate": 3e-06, "loss": 0.0276, "step": 3234 }, { "clip_ratio": 0.00018376910884398967, "epoch": 0.008981171466804369, "grad_norm": 0.12874352931976318, "kl": 0.16920820623636246, "learning_rate": 3e-06, "loss": 0.0269, "step": 3235 }, { "clip_ratio": 0.00044373475247994065, "epoch": 0.008983947717644184, "grad_norm": 0.13294291496276855, "kl": 0.17868266254663467, "learning_rate": 3e-06, "loss": 0.0274, "step": 3236 }, { "clip_ratio": 0.0, "epoch": 0.008986723968484, "grad_norm": 0.11998309195041656, "kl": 0.18930691480636597, "learning_rate": 3e-06, "loss": 0.0263, "step": 3237 }, { "clip_ratio": 0.00021096106502227485, "epoch": 0.008989500219323817, "grad_norm": 0.10036835819482803, "kl": 0.18419749289751053, "learning_rate": 3e-06, "loss": 0.026, "step": 3238 }, { "clip_ratio": 9.110787505051121e-05, "epoch": 0.008992276470163632, "grad_norm": 0.10466752201318741, "kl": 0.18712753057479858, "learning_rate": 3e-06, "loss": 0.0259, "step": 3239 }, { "clip_ratio": 0.0, "epoch": 0.008995052721003448, "grad_norm": 0.1130727082490921, "kl": 0.19383612275123596, "learning_rate": 3e-06, "loss": 0.0262, "step": 3240 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 214.1666717529297, "epoch": 0.008997828971843263, "grad_norm": 0.1183847188949585, "kl": 0.20084407925605774, "learning_rate": 3e-06, "loss": 0.0054, "reward": 0.3229166865348816, "reward_std": 0.29996680468320847, "rewards/countdown_reward_func": 0.3229166567325592, "step": 3241, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.00900060522268308, "grad_norm": 0.1368788480758667, "kl": 0.20387987047433853, "learning_rate": 3e-06, "loss": 0.0052, "step": 3242 }, { "clip_ratio": 0.0002568188137956895, "epoch": 0.009003381473522896, "grad_norm": 0.10637704282999039, "kl": 0.19523774832487106, "learning_rate": 3e-06, "loss": 0.0056, "step": 3243 }, { "clip_ratio": 0.0003886965350829996, "epoch": 0.009006157724362711, "grad_norm": 0.1235475018620491, "kl": 0.21552522480487823, "learning_rate": 3e-06, "loss": 0.0068, "step": 3244 }, { "clip_ratio": 0.0, "epoch": 0.009008933975202528, "grad_norm": 0.1440919190645218, "kl": 0.20552723854780197, "learning_rate": 3e-06, "loss": 0.0055, "step": 3245 }, { "clip_ratio": 0.00010271158680552617, "epoch": 0.009011710226042344, "grad_norm": 0.10783284157514572, "kl": 0.2124602273106575, "learning_rate": 3e-06, "loss": 0.0056, "step": 3246 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.00901448647688216, "grad_norm": 0.11846953630447388, "kl": 0.20271217823028564, "learning_rate": 3e-06, "loss": 0.005, "step": 3247 }, { "clip_ratio": 0.0, "epoch": 0.009017262727721975, "grad_norm": 0.1430003046989441, "kl": 0.20405713468790054, "learning_rate": 3e-06, "loss": 0.0043, "step": 3248 }, { "clip_ratio": 0.0003630210048868321, "epoch": 0.009020038978561792, "grad_norm": 0.10486691445112228, "kl": 0.19211259484291077, "learning_rate": 3e-06, "loss": 0.0049, "step": 3249 }, { "clip_ratio": 0.0005089522528578527, "epoch": 0.009022815229401607, "grad_norm": 0.12506183981895447, "kl": 0.21054758876562119, "learning_rate": 3e-06, "loss": 0.005, "step": 3250 }, { "clip_ratio": 0.0007255076488945633, "epoch": 0.009025591480241422, "grad_norm": 0.10444033145904541, "kl": 0.19794610142707825, "learning_rate": 3e-06, "loss": 0.0027, "step": 3251 }, { "clip_ratio": 0.00040690103196538985, "epoch": 0.009028367731081238, "grad_norm": 0.10732074081897736, "kl": 0.20300372689962387, "learning_rate": 3e-06, "loss": 0.0041, "step": 3252 }, { "clip_ratio": 0.0002643864354467951, "completion_length": 218.95833587646484, "epoch": 0.009031143981921055, "grad_norm": 0.07791081815958023, "kl": 0.1878623440861702, "learning_rate": 3e-06, "loss": 0.0132, "reward": 0.2875000089406967, "reward_std": 0.223736222833395, "rewards/countdown_reward_func": 0.2875000089406967, "step": 3253, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.00903392023276087, "grad_norm": 0.07708391547203064, "kl": 0.17924807220697403, "learning_rate": 3e-06, "loss": 0.0131, "step": 3254 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.009036696483600686, "grad_norm": 0.0647411197423935, "kl": 0.18367429822683334, "learning_rate": 3e-06, "loss": 0.0127, "step": 3255 }, { "clip_ratio": 0.0, "epoch": 0.009039472734440503, "grad_norm": 0.07643242180347443, "kl": 0.17828871309757233, "learning_rate": 3e-06, "loss": 0.0124, "step": 3256 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.009042248985280318, "grad_norm": 0.07355230301618576, "kl": 0.18642263114452362, "learning_rate": 3e-06, "loss": 0.0136, "step": 3257 }, { "clip_ratio": 0.0005723322610720061, "epoch": 0.009045025236120134, "grad_norm": 0.07546177506446838, "kl": 0.17193427681922913, "learning_rate": 3e-06, "loss": 0.0134, "step": 3258 }, { "clip_ratio": 0.00028463223861763254, "epoch": 0.00904780148695995, "grad_norm": 0.07242903113365173, "kl": 0.17693011462688446, "learning_rate": 3e-06, "loss": 0.0127, "step": 3259 }, { "clip_ratio": 0.0006189604464452714, "epoch": 0.009050577737799766, "grad_norm": 0.07398860901594162, "kl": 0.16921094805002213, "learning_rate": 3e-06, "loss": 0.0126, "step": 3260 }, { "clip_ratio": 0.0008030024473555386, "epoch": 0.009053353988639582, "grad_norm": 0.06699801236391068, "kl": 0.1743146777153015, "learning_rate": 3e-06, "loss": 0.0123, "step": 3261 }, { "clip_ratio": 0.000600044964812696, "epoch": 0.009056130239479397, "grad_norm": 0.07642576098442078, "kl": 0.16925466060638428, "learning_rate": 3e-06, "loss": 0.0119, "step": 3262 }, { "clip_ratio": 0.0005287827589199878, "epoch": 0.009058906490319213, "grad_norm": 0.07423372566699982, "kl": 0.1782313585281372, "learning_rate": 3e-06, "loss": 0.0131, "step": 3263 }, { "clip_ratio": 0.0011484745191410184, "epoch": 0.00906168274115903, "grad_norm": 0.07169768214225769, "kl": 0.16514495015144348, "learning_rate": 3e-06, "loss": 0.0119, "step": 3264 }, { "clip_ratio": 0.0, "completion_length": 230.9375, "epoch": 0.009064458991998845, "grad_norm": 0.06758175045251846, "kl": 0.17153829336166382, "learning_rate": 3e-06, "loss": 0.0277, "reward": 0.21249999850988388, "reward_std": 0.24180647730827332, "rewards/countdown_reward_func": 0.21249999850988388, "step": 3265, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00018137661390937865, "epoch": 0.00906723524283866, "grad_norm": 0.07626805454492569, "kl": 0.16917479038238525, "learning_rate": 3e-06, "loss": 0.028, "step": 3266 }, { "clip_ratio": 0.0, "epoch": 0.009070011493678478, "grad_norm": 0.07564929127693176, "kl": 0.16555088758468628, "learning_rate": 3e-06, "loss": 0.0264, "step": 3267 }, { "clip_ratio": 0.0001798727607820183, "epoch": 0.009072787744518293, "grad_norm": 0.07530555129051208, "kl": 0.16905518621206284, "learning_rate": 3e-06, "loss": 0.0279, "step": 3268 }, { "clip_ratio": 9.197939652949572e-05, "epoch": 0.009075563995358108, "grad_norm": 0.0678284764289856, "kl": 0.15875771641731262, "learning_rate": 3e-06, "loss": 0.0277, "step": 3269 }, { "clip_ratio": 0.00045044990110909566, "epoch": 0.009078340246197924, "grad_norm": 0.06744388490915298, "kl": 0.16655797511339188, "learning_rate": 3e-06, "loss": 0.0282, "step": 3270 }, { "clip_ratio": 0.00017767113604350016, "epoch": 0.009081116497037741, "grad_norm": 0.07151822745800018, "kl": 0.16804775595664978, "learning_rate": 3e-06, "loss": 0.0272, "step": 3271 }, { "clip_ratio": 0.0007222834683489054, "epoch": 0.009083892747877556, "grad_norm": 0.07486232370138168, "kl": 0.1675962507724762, "learning_rate": 3e-06, "loss": 0.0275, "step": 3272 }, { "clip_ratio": 0.0002805163385346532, "epoch": 0.009086668998717372, "grad_norm": 0.06445096433162689, "kl": 0.16649511456489563, "learning_rate": 3e-06, "loss": 0.0263, "step": 3273 }, { "clip_ratio": 0.0006299729575403035, "epoch": 0.009089445249557187, "grad_norm": 0.07382160425186157, "kl": 0.17068657279014587, "learning_rate": 3e-06, "loss": 0.0266, "step": 3274 }, { "clip_ratio": 0.0005363317904993892, "epoch": 0.009092221500397004, "grad_norm": 0.05987700819969177, "kl": 0.16215486079454422, "learning_rate": 3e-06, "loss": 0.0268, "step": 3275 }, { "clip_ratio": 8.973438525572419e-05, "epoch": 0.00909499775123682, "grad_norm": 0.0706355944275856, "kl": 0.17011679708957672, "learning_rate": 3e-06, "loss": 0.0273, "step": 3276 }, { "clip_ratio": 0.00029504328267648816, "completion_length": 208.3541717529297, "epoch": 0.009097774002076635, "grad_norm": 0.08307468146085739, "kl": 0.18701482564210892, "learning_rate": 3e-06, "loss": 0.0213, "reward": 0.2854166775941849, "reward_std": 0.30118735134601593, "rewards/countdown_reward_func": 0.2854166552424431, "step": 3277, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.009100550252916452, "grad_norm": 0.08923904597759247, "kl": 0.18993941694498062, "learning_rate": 3e-06, "loss": 0.022, "step": 3278 }, { "clip_ratio": 0.0001840942568378523, "epoch": 0.009103326503756268, "grad_norm": 0.10027282685041428, "kl": 0.1806354895234108, "learning_rate": 3e-06, "loss": 0.0214, "step": 3279 }, { "clip_ratio": 0.0, "epoch": 0.009106102754596083, "grad_norm": 0.08859215676784515, "kl": 0.1832749918103218, "learning_rate": 3e-06, "loss": 0.0214, "step": 3280 }, { "clip_ratio": 0.0004463749937713146, "epoch": 0.009108879005435899, "grad_norm": 0.07286273688077927, "kl": 0.18666107207536697, "learning_rate": 3e-06, "loss": 0.0213, "step": 3281 }, { "clip_ratio": 0.0, "epoch": 0.009111655256275716, "grad_norm": 0.08396629244089127, "kl": 0.19082605838775635, "learning_rate": 3e-06, "loss": 0.0212, "step": 3282 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.009114431507115531, "grad_norm": 0.07911704480648041, "kl": 0.19739805907011032, "learning_rate": 3e-06, "loss": 0.0204, "step": 3283 }, { "clip_ratio": 0.00023651844821870327, "epoch": 0.009117207757955346, "grad_norm": 0.07769789546728134, "kl": 0.1994594931602478, "learning_rate": 3e-06, "loss": 0.0212, "step": 3284 }, { "clip_ratio": 0.0003681885136757046, "epoch": 0.009119984008795162, "grad_norm": 0.10710373520851135, "kl": 0.1893782541155815, "learning_rate": 3e-06, "loss": 0.0199, "step": 3285 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.009122760259634979, "grad_norm": 0.08658848702907562, "kl": 0.19371358305215836, "learning_rate": 3e-06, "loss": 0.0204, "step": 3286 }, { "clip_ratio": 0.0004337466671131551, "epoch": 0.009125536510474794, "grad_norm": 0.06982763111591339, "kl": 0.19672724604606628, "learning_rate": 3e-06, "loss": 0.0203, "step": 3287 }, { "clip_ratio": 0.00021419576660264283, "epoch": 0.00912831276131461, "grad_norm": 0.07497859001159668, "kl": 0.20117107778787613, "learning_rate": 3e-06, "loss": 0.0195, "step": 3288 }, { "clip_ratio": 0.00032626939355395734, "completion_length": 229.56250762939453, "epoch": 0.009131089012154427, "grad_norm": 0.08327294141054153, "kl": 0.18998228013515472, "learning_rate": 3e-06, "loss": 0.0189, "reward": 0.3229166865348816, "reward_std": 0.2587834671139717, "rewards/countdown_reward_func": 0.3229166865348816, "step": 3289, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.827683632262051e-05, "epoch": 0.009133865262994242, "grad_norm": 0.07260271906852722, "kl": 0.1936226710677147, "learning_rate": 3e-06, "loss": 0.0185, "step": 3290 }, { "clip_ratio": 0.0, "epoch": 0.009136641513834058, "grad_norm": 0.08770108222961426, "kl": 0.19695696979761124, "learning_rate": 3e-06, "loss": 0.0187, "step": 3291 }, { "clip_ratio": 8.827683632262051e-05, "epoch": 0.009139417764673873, "grad_norm": 0.09277809411287308, "kl": 0.1862516850233078, "learning_rate": 3e-06, "loss": 0.0188, "step": 3292 }, { "clip_ratio": 0.00026900929515250027, "epoch": 0.00914219401551369, "grad_norm": 0.07858826965093613, "kl": 0.1943134218454361, "learning_rate": 3e-06, "loss": 0.0186, "step": 3293 }, { "clip_ratio": 0.000244140625, "epoch": 0.009144970266353506, "grad_norm": 0.08508437871932983, "kl": 0.19300925731658936, "learning_rate": 3e-06, "loss": 0.0192, "step": 3294 }, { "clip_ratio": 0.00017040561215253547, "epoch": 0.009147746517193321, "grad_norm": 0.08291451632976532, "kl": 0.19522245973348618, "learning_rate": 3e-06, "loss": 0.0184, "step": 3295 }, { "clip_ratio": 0.0, "epoch": 0.009150522768033137, "grad_norm": 0.06906690448522568, "kl": 0.19837484508752823, "learning_rate": 3e-06, "loss": 0.0179, "step": 3296 }, { "clip_ratio": 0.0, "epoch": 0.009153299018872954, "grad_norm": 0.5661424398422241, "kl": 0.1991024613380432, "learning_rate": 3e-06, "loss": 0.018, "step": 3297 }, { "clip_ratio": 9.057971328729764e-05, "epoch": 0.009156075269712769, "grad_norm": 0.08353456854820251, "kl": 0.1896662339568138, "learning_rate": 3e-06, "loss": 0.018, "step": 3298 }, { "clip_ratio": 0.00016965704708127305, "epoch": 0.009158851520552585, "grad_norm": 0.09154334664344788, "kl": 0.19535797089338303, "learning_rate": 3e-06, "loss": 0.0185, "step": 3299 }, { "clip_ratio": 9.245562250725925e-05, "epoch": 0.009161627771392402, "grad_norm": 0.08360829204320908, "kl": 0.19443128257989883, "learning_rate": 3e-06, "loss": 0.0182, "step": 3300 }, { "clip_ratio": 9.498480358161032e-05, "completion_length": 209.2916717529297, "epoch": 0.009164404022232217, "grad_norm": 0.10126984119415283, "kl": 0.2014843225479126, "learning_rate": 3e-06, "loss": 0.0336, "reward": 0.2500000149011612, "reward_std": 0.223736222833395, "rewards/countdown_reward_func": 0.2500000149011612, "step": 3301, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00030827673617750406, "epoch": 0.009167180273072032, "grad_norm": 0.08888054639101028, "kl": 0.2116442322731018, "learning_rate": 3e-06, "loss": 0.034, "step": 3302 }, { "clip_ratio": 0.0003091730468440801, "epoch": 0.009169956523911848, "grad_norm": 0.06850343942642212, "kl": 0.20003743469715118, "learning_rate": 3e-06, "loss": 0.0339, "step": 3303 }, { "clip_ratio": 0.0004317789280321449, "epoch": 0.009172732774751665, "grad_norm": 0.09514410048723221, "kl": 0.20418274402618408, "learning_rate": 3e-06, "loss": 0.0329, "step": 3304 }, { "clip_ratio": 0.00038600289553869516, "epoch": 0.00917550902559148, "grad_norm": 0.06524792313575745, "kl": 0.2135285809636116, "learning_rate": 3e-06, "loss": 0.0341, "step": 3305 }, { "clip_ratio": 0.00019403288752073422, "epoch": 0.009178285276431296, "grad_norm": 0.07323011755943298, "kl": 0.21629154682159424, "learning_rate": 3e-06, "loss": 0.034, "step": 3306 }, { "clip_ratio": 8.6088155512698e-05, "epoch": 0.009181061527271111, "grad_norm": 0.0979154035449028, "kl": 0.20841020345687866, "learning_rate": 3e-06, "loss": 0.0337, "step": 3307 }, { "clip_ratio": 0.00019936203898396343, "epoch": 0.009183837778110928, "grad_norm": 0.08444679528474808, "kl": 0.22166118025779724, "learning_rate": 3e-06, "loss": 0.0335, "step": 3308 }, { "clip_ratio": 0.0008417013013968244, "epoch": 0.009186614028950744, "grad_norm": 0.066643126308918, "kl": 0.20939507335424423, "learning_rate": 3e-06, "loss": 0.0327, "step": 3309 }, { "clip_ratio": 0.0, "epoch": 0.00918939027979056, "grad_norm": 0.09063675254583359, "kl": 0.21372316777706146, "learning_rate": 3e-06, "loss": 0.032, "step": 3310 }, { "clip_ratio": 9.238728671334684e-05, "epoch": 0.009192166530630376, "grad_norm": 0.0678633600473404, "kl": 0.22817469388246536, "learning_rate": 3e-06, "loss": 0.0335, "step": 3311 }, { "clip_ratio": 8.6088155512698e-05, "epoch": 0.009194942781470192, "grad_norm": 0.07433688640594482, "kl": 0.2330213114619255, "learning_rate": 3e-06, "loss": 0.0326, "step": 3312 }, { "clip_ratio": 0.0, "completion_length": 193.875, "epoch": 0.009197719032310007, "grad_norm": 0.08883730322122574, "kl": 0.2304357886314392, "learning_rate": 3e-06, "loss": -0.0004, "reward": 0.229166679084301, "reward_std": 0.1639271229505539, "rewards/countdown_reward_func": 0.229166679084301, "step": 3313, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0002395426417933777, "epoch": 0.009200495283149823, "grad_norm": 0.08190058171749115, "kl": 0.23292165994644165, "learning_rate": 3e-06, "loss": 0.0005, "step": 3314 }, { "clip_ratio": 0.0, "epoch": 0.00920327153398964, "grad_norm": 0.09772184491157532, "kl": 0.22226303070783615, "learning_rate": 3e-06, "loss": -0.0, "step": 3315 }, { "clip_ratio": 0.00031361104629468173, "epoch": 0.009206047784829455, "grad_norm": 0.11392029374837875, "kl": 0.2292226105928421, "learning_rate": 3e-06, "loss": 0.0004, "step": 3316 }, { "clip_ratio": 0.00011446886492194608, "epoch": 0.00920882403566927, "grad_norm": 0.11159232258796692, "kl": 0.24332569539546967, "learning_rate": 3e-06, "loss": 0.0007, "step": 3317 }, { "clip_ratio": 0.00030144694028422236, "epoch": 0.009211600286509086, "grad_norm": 0.09568655490875244, "kl": 0.21723999828100204, "learning_rate": 3e-06, "loss": -0.0004, "step": 3318 }, { "clip_ratio": 0.00011478420492494479, "epoch": 0.009214376537348903, "grad_norm": 0.0883168876171112, "kl": 0.22488639503717422, "learning_rate": 3e-06, "loss": -0.0008, "step": 3319 }, { "clip_ratio": 0.00010382059554103762, "epoch": 0.009217152788188718, "grad_norm": 0.0834290161728859, "kl": 0.2242891639471054, "learning_rate": 3e-06, "loss": -0.0003, "step": 3320 }, { "clip_ratio": 0.0004038636543555185, "epoch": 0.009219929039028534, "grad_norm": 0.09938092529773712, "kl": 0.20953409373760223, "learning_rate": 3e-06, "loss": -0.0017, "step": 3321 }, { "clip_ratio": 0.0006932226096978411, "epoch": 0.009222705289868351, "grad_norm": 0.11258038878440857, "kl": 0.212013341486454, "learning_rate": 3e-06, "loss": -0.0014, "step": 3322 }, { "clip_ratio": 0.0, "epoch": 0.009225481540708166, "grad_norm": 0.11256524175405502, "kl": 0.22122438251972198, "learning_rate": 3e-06, "loss": -0.002, "step": 3323 }, { "clip_ratio": 0.0008105350425466895, "epoch": 0.009228257791547982, "grad_norm": 0.07479316741228104, "kl": 0.19533280283212662, "learning_rate": 3e-06, "loss": -0.0024, "step": 3324 }, { "clip_ratio": 0.000843369954964146, "completion_length": 200.56250762939453, "epoch": 0.009231034042387797, "grad_norm": 0.1079222559928894, "kl": 0.19438428431749344, "learning_rate": 3e-06, "loss": -0.0046, "reward": 0.24583334475755692, "reward_std": 0.19010094925761223, "rewards/countdown_reward_func": 0.24583334475755692, "step": 3325, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.000617850455455482, "epoch": 0.009233810293227614, "grad_norm": 0.11408974230289459, "kl": 0.18512540310621262, "learning_rate": 3e-06, "loss": -0.0045, "step": 3326 }, { "clip_ratio": 0.0, "epoch": 0.00923658654406743, "grad_norm": 0.11449351161718369, "kl": 0.18007448315620422, "learning_rate": 3e-06, "loss": -0.0056, "step": 3327 }, { "clip_ratio": 0.0, "epoch": 0.009239362794907245, "grad_norm": 0.1207418218255043, "kl": 0.17843185365200043, "learning_rate": 3e-06, "loss": -0.006, "step": 3328 }, { "clip_ratio": 0.00011120996350655332, "epoch": 0.00924213904574706, "grad_norm": 0.1031569242477417, "kl": 0.16534500569105148, "learning_rate": 3e-06, "loss": -0.0063, "step": 3329 }, { "clip_ratio": 0.00010024057701230049, "epoch": 0.009244915296586878, "grad_norm": 0.12397726625204086, "kl": 0.1632966548204422, "learning_rate": 3e-06, "loss": -0.0069, "step": 3330 }, { "clip_ratio": 0.0011015688069164753, "epoch": 0.009247691547426693, "grad_norm": 0.10596464574337006, "kl": 0.16765206307172775, "learning_rate": 3e-06, "loss": -0.007, "step": 3331 }, { "clip_ratio": 0.00021097053831908852, "epoch": 0.009250467798266509, "grad_norm": 0.0956178829073906, "kl": 0.15779069066047668, "learning_rate": 3e-06, "loss": -0.0075, "step": 3332 }, { "clip_ratio": 0.000741794501664117, "epoch": 0.009253244049106326, "grad_norm": 0.105475053191185, "kl": 0.1526746153831482, "learning_rate": 3e-06, "loss": -0.0083, "step": 3333 }, { "clip_ratio": 0.0012937507999595255, "epoch": 0.009256020299946141, "grad_norm": 0.1110682412981987, "kl": 0.15174949169158936, "learning_rate": 3e-06, "loss": -0.0096, "step": 3334 }, { "clip_ratio": 0.002811335667502135, "epoch": 0.009258796550785956, "grad_norm": 0.08937579393386841, "kl": 0.14079055190086365, "learning_rate": 3e-06, "loss": -0.009, "step": 3335 }, { "clip_ratio": 0.0013101234580972232, "epoch": 0.009261572801625772, "grad_norm": 0.1123751848936081, "kl": 0.1391913965344429, "learning_rate": 3e-06, "loss": -0.011, "step": 3336 }, { "clip_ratio": 0.00020833333837799728, "completion_length": 211.2916717529297, "epoch": 0.009264349052465589, "grad_norm": 0.06566524505615234, "kl": 0.12781281024217606, "learning_rate": 3e-06, "loss": 0.0286, "reward": 0.23125001043081284, "reward_std": 0.16211742535233498, "rewards/countdown_reward_func": 0.23125001043081284, "step": 3337, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.0003036437265109271, "epoch": 0.009267125303305404, "grad_norm": 0.07969752699136734, "kl": 0.13375972211360931, "learning_rate": 3e-06, "loss": 0.029, "step": 3338 }, { "clip_ratio": 0.00020109293836867437, "epoch": 0.00926990155414522, "grad_norm": 0.07866007089614868, "kl": 0.13459890335798264, "learning_rate": 3e-06, "loss": 0.0294, "step": 3339 }, { "clip_ratio": 0.0, "epoch": 0.009272677804985035, "grad_norm": 0.12970270216464996, "kl": 0.13485877215862274, "learning_rate": 3e-06, "loss": 0.0291, "step": 3340 }, { "clip_ratio": 0.0006087079527787864, "epoch": 0.009275454055824852, "grad_norm": 0.06883334368467331, "kl": 0.12373000755906105, "learning_rate": 3e-06, "loss": 0.0288, "step": 3341 }, { "clip_ratio": 0.00017999016563408077, "epoch": 0.009278230306664668, "grad_norm": 0.14366085827350616, "kl": 0.12040146440267563, "learning_rate": 3e-06, "loss": 0.0299, "step": 3342 }, { "clip_ratio": 0.0, "epoch": 0.009281006557504483, "grad_norm": 0.06404737383127213, "kl": 0.11984974518418312, "learning_rate": 3e-06, "loss": 0.0291, "step": 3343 }, { "clip_ratio": 0.00020947416487615556, "epoch": 0.0092837828083443, "grad_norm": 0.0853867456316948, "kl": 0.12684471160173416, "learning_rate": 3e-06, "loss": 0.029, "step": 3344 }, { "clip_ratio": 0.00011606313637457788, "epoch": 0.009286559059184116, "grad_norm": 0.07980918139219284, "kl": 0.13090308755636215, "learning_rate": 3e-06, "loss": 0.029, "step": 3345 }, { "clip_ratio": 0.000934203970246017, "epoch": 0.009289335310023931, "grad_norm": 0.15292905271053314, "kl": 0.13403774052858353, "learning_rate": 3e-06, "loss": 0.0294, "step": 3346 }, { "clip_ratio": 0.0010340308508602902, "epoch": 0.009292111560863747, "grad_norm": 0.06572780013084412, "kl": 0.12371078133583069, "learning_rate": 3e-06, "loss": 0.0286, "step": 3347 }, { "clip_ratio": 8.954155055107549e-05, "epoch": 0.009294887811703564, "grad_norm": 0.14957201480865479, "kl": 0.12287519499659538, "learning_rate": 3e-06, "loss": 0.0286, "step": 3348 }, { "clip_ratio": 0.0001152073746197857, "completion_length": 211.93750762939453, "epoch": 0.009297664062543379, "grad_norm": 0.10666563361883163, "kl": 0.14240913093090057, "learning_rate": 3e-06, "loss": 0.0208, "reward": 0.37708336114883423, "reward_std": 0.3498389720916748, "rewards/countdown_reward_func": 0.37708336114883423, "step": 3349, "zero_std_ratio": 0.125 }, { "clip_ratio": 9.95222944766283e-05, "epoch": 0.009300440313383194, "grad_norm": 0.1004534512758255, "kl": 0.14082762598991394, "learning_rate": 3e-06, "loss": 0.0199, "step": 3350 }, { "clip_ratio": 0.0001807442822610028, "epoch": 0.00930321656422301, "grad_norm": 0.10281887650489807, "kl": 0.1348535344004631, "learning_rate": 3e-06, "loss": 0.0198, "step": 3351 }, { "clip_ratio": 0.0001560549280839041, "epoch": 0.009305992815062827, "grad_norm": 0.0971880778670311, "kl": 0.14031432569026947, "learning_rate": 3e-06, "loss": 0.0198, "step": 3352 }, { "clip_ratio": 0.0001152073746197857, "epoch": 0.009308769065902642, "grad_norm": 0.10631027817726135, "kl": 0.14285346865653992, "learning_rate": 3e-06, "loss": 0.0198, "step": 3353 }, { "clip_ratio": 0.00024274009047076106, "epoch": 0.009311545316742458, "grad_norm": 0.12099858373403549, "kl": 0.13796381652355194, "learning_rate": 3e-06, "loss": 0.0199, "step": 3354 }, { "clip_ratio": 0.0, "epoch": 0.009314321567582275, "grad_norm": 0.10015618801116943, "kl": 0.14650218188762665, "learning_rate": 3e-06, "loss": 0.0197, "step": 3355 }, { "clip_ratio": 0.0006242197123356164, "epoch": 0.00931709781842209, "grad_norm": 0.12549257278442383, "kl": 0.14414872974157333, "learning_rate": 3e-06, "loss": 0.0193, "step": 3356 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009319874069261906, "grad_norm": 0.10612329840660095, "kl": 0.1380062699317932, "learning_rate": 3e-06, "loss": 0.018, "step": 3357 }, { "clip_ratio": 0.0001560549280839041, "epoch": 0.009322650320101721, "grad_norm": 0.0948638916015625, "kl": 0.14419499784708023, "learning_rate": 3e-06, "loss": 0.0187, "step": 3358 }, { "clip_ratio": 8.668516238685697e-05, "epoch": 0.009325426570941538, "grad_norm": 0.09685387462377548, "kl": 0.14572890847921371, "learning_rate": 3e-06, "loss": 0.0175, "step": 3359 }, { "clip_ratio": 0.0003864696773234755, "epoch": 0.009328202821781354, "grad_norm": 0.095697320997715, "kl": 0.1411815881729126, "learning_rate": 3e-06, "loss": 0.0185, "step": 3360 }, { "clip_ratio": 0.0, "completion_length": 222.95834350585938, "epoch": 0.00933097907262117, "grad_norm": 0.06107320636510849, "kl": 0.14548498392105103, "learning_rate": 3e-06, "loss": 0.008, "reward": 0.21250002086162567, "reward_std": 0.14995061606168747, "rewards/countdown_reward_func": 0.21250001341104507, "step": 3361, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00010229132749373093, "epoch": 0.009333755323460985, "grad_norm": 0.053627271205186844, "kl": 0.143670916557312, "learning_rate": 3e-06, "loss": 0.0074, "step": 3362 }, { "clip_ratio": 0.0, "epoch": 0.009336531574300802, "grad_norm": 0.05963435396552086, "kl": 0.13754700124263763, "learning_rate": 3e-06, "loss": 0.0074, "step": 3363 }, { "clip_ratio": 0.0, "epoch": 0.009339307825140617, "grad_norm": 0.09170074015855789, "kl": 0.141682930290699, "learning_rate": 3e-06, "loss": 0.0079, "step": 3364 }, { "clip_ratio": 0.00010629251482896507, "epoch": 0.009342084075980433, "grad_norm": 0.059942249208688736, "kl": 0.14293190836906433, "learning_rate": 3e-06, "loss": 0.0079, "step": 3365 }, { "clip_ratio": 8.361203799722716e-05, "epoch": 0.00934486032682025, "grad_norm": 0.046253371983766556, "kl": 0.15248948335647583, "learning_rate": 3e-06, "loss": 0.0081, "step": 3366 }, { "clip_ratio": 0.000205597112653777, "epoch": 0.009347636577660065, "grad_norm": 0.05853024125099182, "kl": 0.1463567167520523, "learning_rate": 3e-06, "loss": 0.0077, "step": 3367 }, { "clip_ratio": 0.00019427072402322665, "epoch": 0.00935041282849988, "grad_norm": 0.05928536877036095, "kl": 0.14271622896194458, "learning_rate": 3e-06, "loss": 0.0072, "step": 3368 }, { "clip_ratio": 0.0, "epoch": 0.009353189079339696, "grad_norm": 0.06247861310839653, "kl": 0.13663607835769653, "learning_rate": 3e-06, "loss": 0.0074, "step": 3369 }, { "clip_ratio": 9.09090886125341e-05, "epoch": 0.009355965330179513, "grad_norm": 0.09067023545503616, "kl": 0.13788402825593948, "learning_rate": 3e-06, "loss": 0.0068, "step": 3370 }, { "clip_ratio": 8.361203799722716e-05, "epoch": 0.009358741581019328, "grad_norm": 0.050808947533369064, "kl": 0.13979151099920273, "learning_rate": 3e-06, "loss": 0.008, "step": 3371 }, { "clip_ratio": 0.00020661157032009214, "epoch": 0.009361517831859144, "grad_norm": 0.04536749795079231, "kl": 0.1478760689496994, "learning_rate": 3e-06, "loss": 0.0074, "step": 3372 }, { "clip_ratio": 0.00017170330102089792, "completion_length": 213.37500762939453, "epoch": 0.00936429408269896, "grad_norm": 0.1138901486992836, "kl": 0.14412663877010345, "learning_rate": 3e-06, "loss": 0.0068, "reward": 0.37916669249534607, "reward_std": 0.43775059282779694, "rewards/countdown_reward_func": 0.3791666775941849, "step": 3373, "zero_std_ratio": 0.0 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009367070333538776, "grad_norm": 0.10576634109020233, "kl": 0.13324858248233795, "learning_rate": 3e-06, "loss": 0.0061, "step": 3374 }, { "clip_ratio": 0.00024938550632214174, "epoch": 0.009369846584378592, "grad_norm": 0.14855250716209412, "kl": 0.13176089525222778, "learning_rate": 3e-06, "loss": 0.0069, "step": 3375 }, { "clip_ratio": 0.0, "epoch": 0.009372622835218407, "grad_norm": 0.09728948771953583, "kl": 0.1340518742799759, "learning_rate": 3e-06, "loss": 0.0055, "step": 3376 }, { "clip_ratio": 0.0, "epoch": 0.009375399086058224, "grad_norm": 0.11313235014677048, "kl": 0.12622303143143654, "learning_rate": 3e-06, "loss": 0.0068, "step": 3377 }, { "clip_ratio": 0.00027053764642914757, "epoch": 0.00937817533689804, "grad_norm": 0.11547991633415222, "kl": 0.13444355130195618, "learning_rate": 3e-06, "loss": 0.0064, "step": 3378 }, { "clip_ratio": 0.0, "epoch": 0.009380951587737855, "grad_norm": 0.20696863532066345, "kl": 0.14222577214241028, "learning_rate": 3e-06, "loss": 0.005, "step": 3379 }, { "clip_ratio": 0.0003241463709855452, "epoch": 0.00938372783857767, "grad_norm": 0.12405950576066971, "kl": 0.1312875896692276, "learning_rate": 3e-06, "loss": 0.0057, "step": 3380 }, { "clip_ratio": 0.0001680052955634892, "epoch": 0.009386504089417488, "grad_norm": 0.1666613668203354, "kl": 0.1284816637635231, "learning_rate": 3e-06, "loss": 0.0042, "step": 3381 }, { "clip_ratio": 8.585165051044896e-05, "epoch": 0.009389280340257303, "grad_norm": 0.09616739302873611, "kl": 0.1330876499414444, "learning_rate": 3e-06, "loss": 0.004, "step": 3382 }, { "clip_ratio": 0.00020601737196557224, "epoch": 0.009392056591097118, "grad_norm": 0.1105840727686882, "kl": 0.12402277812361717, "learning_rate": 3e-06, "loss": 0.0053, "step": 3383 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.009394832841936934, "grad_norm": 0.10908439010381699, "kl": 0.13212332874536514, "learning_rate": 3e-06, "loss": 0.0038, "step": 3384 }, { "clip_ratio": 0.00016812373360153288, "completion_length": 219.2916717529297, "epoch": 0.009397609092776751, "grad_norm": 0.07888802886009216, "kl": 0.13888321816921234, "learning_rate": 3e-06, "loss": 0.0315, "reward": 0.23125001043081284, "reward_std": 0.2115693911910057, "rewards/countdown_reward_func": 0.23125001043081284, "step": 3385, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.009400385343616566, "grad_norm": 0.1012679934501648, "kl": 0.13986077904701233, "learning_rate": 3e-06, "loss": 0.032, "step": 3386 }, { "clip_ratio": 0.0, "epoch": 0.009403161594456382, "grad_norm": 0.07821791619062424, "kl": 0.14335256069898605, "learning_rate": 3e-06, "loss": 0.0319, "step": 3387 }, { "clip_ratio": 8.406186680076644e-05, "epoch": 0.009405937845296199, "grad_norm": 0.0744754746556282, "kl": 0.1401086449623108, "learning_rate": 3e-06, "loss": 0.0311, "step": 3388 }, { "clip_ratio": 8.698677993379533e-05, "epoch": 0.009408714096136014, "grad_norm": 0.09955067932605743, "kl": 0.13762974739074707, "learning_rate": 3e-06, "loss": 0.0314, "step": 3389 }, { "clip_ratio": 0.00017854410543804988, "epoch": 0.00941149034697583, "grad_norm": 0.08295691758394241, "kl": 0.14692870527505875, "learning_rate": 3e-06, "loss": 0.0314, "step": 3390 }, { "clip_ratio": 8.406186680076644e-05, "epoch": 0.009414266597815645, "grad_norm": 0.07514327019453049, "kl": 0.14133457839488983, "learning_rate": 3e-06, "loss": 0.0307, "step": 3391 }, { "clip_ratio": 0.0002468729508109391, "epoch": 0.009417042848655462, "grad_norm": 0.08834700286388397, "kl": 0.14441197365522385, "learning_rate": 3e-06, "loss": 0.0308, "step": 3392 }, { "clip_ratio": 9.448223863728344e-05, "epoch": 0.009419819099495278, "grad_norm": 0.07426346838474274, "kl": 0.14892823994159698, "learning_rate": 3e-06, "loss": 0.0306, "step": 3393 }, { "clip_ratio": 0.0, "epoch": 0.009422595350335093, "grad_norm": 0.07280530780553818, "kl": 0.14693843573331833, "learning_rate": 3e-06, "loss": 0.03, "step": 3394 }, { "clip_ratio": 0.00033521009027026594, "epoch": 0.009425371601174909, "grad_norm": 0.07536084949970245, "kl": 0.14514127373695374, "learning_rate": 3e-06, "loss": 0.0305, "step": 3395 }, { "clip_ratio": 0.00010917030886048451, "epoch": 0.009428147852014726, "grad_norm": 0.07550336420536041, "kl": 0.1551234871149063, "learning_rate": 3e-06, "loss": 0.0302, "step": 3396 }, { "clip_ratio": 0.00041145490831695497, "completion_length": 219.0, "epoch": 0.009430924102854541, "grad_norm": 0.13784070312976837, "kl": 0.13735155761241913, "learning_rate": 3e-06, "loss": -0.0111, "reward": 0.26875001937150955, "reward_std": 0.31206804513931274, "rewards/countdown_reward_func": 0.26875001937150955, "step": 3397, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00017100638069678098, "epoch": 0.009433700353694357, "grad_norm": 0.11191956698894501, "kl": 0.1370161920785904, "learning_rate": 3e-06, "loss": -0.0093, "step": 3398 }, { "clip_ratio": 0.00017743080388754606, "epoch": 0.009436476604534174, "grad_norm": 0.13812246918678284, "kl": 0.1405482068657875, "learning_rate": 3e-06, "loss": -0.0101, "step": 3399 }, { "clip_ratio": 8.229097875300795e-05, "epoch": 0.009439252855373989, "grad_norm": 0.12536658346652985, "kl": 0.14326896518468857, "learning_rate": 3e-06, "loss": -0.0105, "step": 3400 }, { "clip_ratio": 8.229097875300795e-05, "epoch": 0.009442029106213804, "grad_norm": 0.1049521267414093, "kl": 0.13436724245548248, "learning_rate": 3e-06, "loss": -0.0106, "step": 3401 }, { "clip_ratio": 0.0013107871054671705, "epoch": 0.00944480535705362, "grad_norm": 0.11767006665468216, "kl": 0.14459362626075745, "learning_rate": 3e-06, "loss": -0.0098, "step": 3402 }, { "clip_ratio": 0.0008412723254878074, "epoch": 0.009447581607893437, "grad_norm": 0.1364697366952896, "kl": 0.1339626908302307, "learning_rate": 3e-06, "loss": -0.0123, "step": 3403 }, { "clip_ratio": 0.00045482016867026687, "epoch": 0.009450357858733252, "grad_norm": 0.1195710152387619, "kl": 0.1335536167025566, "learning_rate": 3e-06, "loss": -0.0103, "step": 3404 }, { "clip_ratio": 0.00028680896502919495, "epoch": 0.009453134109573068, "grad_norm": 0.12026627361774445, "kl": 0.13475558906793594, "learning_rate": 3e-06, "loss": -0.0112, "step": 3405 }, { "clip_ratio": 8.229097875300795e-05, "epoch": 0.009455910360412883, "grad_norm": 0.11835595965385437, "kl": 0.13657694309949875, "learning_rate": 3e-06, "loss": -0.0113, "step": 3406 }, { "clip_ratio": 8.229097875300795e-05, "epoch": 0.0094586866112527, "grad_norm": 0.10208456218242645, "kl": 0.12595082446932793, "learning_rate": 3e-06, "loss": -0.0122, "step": 3407 }, { "clip_ratio": 0.0011968304752372205, "epoch": 0.009461462862092516, "grad_norm": 0.10951346904039383, "kl": 0.13327614590525627, "learning_rate": 3e-06, "loss": -0.0113, "step": 3408 }, { "clip_ratio": 0.0001811086549423635, "completion_length": 215.1041717529297, "epoch": 0.009464239112932331, "grad_norm": 0.06998094916343689, "kl": 0.13653837144374847, "learning_rate": 3e-06, "loss": 0.0145, "reward": 0.21041668206453323, "reward_std": 0.16692758351564407, "rewards/countdown_reward_func": 0.21041668206453323, "step": 3409, "zero_std_ratio": 0.625 }, { "clip_ratio": 0.00030119207804091275, "epoch": 0.009467015363772148, "grad_norm": 0.08315441757440567, "kl": 0.13463860750198364, "learning_rate": 3e-06, "loss": 0.0145, "step": 3410 }, { "clip_ratio": 0.0005593360838247463, "epoch": 0.009469791614611964, "grad_norm": 0.067221999168396, "kl": 0.12686270102858543, "learning_rate": 3e-06, "loss": 0.014, "step": 3411 }, { "clip_ratio": 0.0, "epoch": 0.009472567865451779, "grad_norm": 0.05755463242530823, "kl": 0.1303653046488762, "learning_rate": 3e-06, "loss": 0.0146, "step": 3412 }, { "clip_ratio": 0.00042026914888992906, "epoch": 0.009475344116291595, "grad_norm": 0.06362846493721008, "kl": 0.12534630298614502, "learning_rate": 3e-06, "loss": 0.0138, "step": 3413 }, { "clip_ratio": 0.00047192400961648673, "epoch": 0.009478120367131412, "grad_norm": 0.07573696225881577, "kl": 0.1270292066037655, "learning_rate": 3e-06, "loss": 0.014, "step": 3414 }, { "clip_ratio": 9.137426968663931e-05, "epoch": 0.009480896617971227, "grad_norm": 0.0652124211192131, "kl": 0.12759653478860855, "learning_rate": 3e-06, "loss": 0.0139, "step": 3415 }, { "clip_ratio": 0.00023854961909819394, "epoch": 0.009483672868811042, "grad_norm": 0.0743669867515564, "kl": 0.12613316997885704, "learning_rate": 3e-06, "loss": 0.0133, "step": 3416 }, { "clip_ratio": 8.18062835605815e-05, "epoch": 0.009486449119650858, "grad_norm": 0.0771777331829071, "kl": 0.119316715747118, "learning_rate": 3e-06, "loss": 0.0132, "step": 3417 }, { "clip_ratio": 0.00028762640431523323, "epoch": 0.009489225370490675, "grad_norm": 0.05563231185078621, "kl": 0.12309416756033897, "learning_rate": 3e-06, "loss": 0.0133, "step": 3418 }, { "clip_ratio": 0.0007487859111279249, "epoch": 0.00949200162133049, "grad_norm": 0.0619962252676487, "kl": 0.11999662220478058, "learning_rate": 3e-06, "loss": 0.0125, "step": 3419 }, { "clip_ratio": 0.0005035349895479158, "epoch": 0.009494777872170306, "grad_norm": 0.13191372156143188, "kl": 0.12294557318091393, "learning_rate": 3e-06, "loss": 0.0129, "step": 3420 }, { "clip_ratio": 0.00025964137603295967, "completion_length": 229.27083587646484, "epoch": 0.009497554123010123, "grad_norm": 0.1147640198469162, "kl": 0.12518445774912834, "learning_rate": 3e-06, "loss": 0.0386, "reward": 0.32500000298023224, "reward_std": 0.33128294348716736, "rewards/countdown_reward_func": 0.32499998807907104, "step": 3421, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.009500330373849938, "grad_norm": 0.10162476450204849, "kl": 0.12149158865213394, "learning_rate": 3e-06, "loss": 0.0385, "step": 3422 }, { "clip_ratio": 8.967001485871151e-05, "epoch": 0.009503106624689754, "grad_norm": 0.12649016082286835, "kl": 0.12441273033618927, "learning_rate": 3e-06, "loss": 0.0391, "step": 3423 }, { "clip_ratio": 0.0, "epoch": 0.00950588287552957, "grad_norm": 0.12122523039579391, "kl": 0.1215183287858963, "learning_rate": 3e-06, "loss": 0.0378, "step": 3424 }, { "clip_ratio": 0.0, "epoch": 0.009508659126369386, "grad_norm": 0.09479762613773346, "kl": 0.11806304007768631, "learning_rate": 3e-06, "loss": 0.0378, "step": 3425 }, { "clip_ratio": 0.0, "epoch": 0.009511435377209202, "grad_norm": 0.11704821139574051, "kl": 0.1288565918803215, "learning_rate": 3e-06, "loss": 0.0378, "step": 3426 }, { "clip_ratio": 0.0, "epoch": 0.009514211628049017, "grad_norm": 0.10025089234113693, "kl": 0.1304003670811653, "learning_rate": 3e-06, "loss": 0.0372, "step": 3427 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.009516987878888833, "grad_norm": 0.1017884612083435, "kl": 0.12766587361693382, "learning_rate": 3e-06, "loss": 0.0369, "step": 3428 }, { "clip_ratio": 0.0, "epoch": 0.00951976412972865, "grad_norm": 0.12916648387908936, "kl": 0.13421806693077087, "learning_rate": 3e-06, "loss": 0.0376, "step": 3429 }, { "clip_ratio": 8.78425853443332e-05, "epoch": 0.009522540380568465, "grad_norm": 0.11458772420883179, "kl": 0.1326707825064659, "learning_rate": 3e-06, "loss": 0.0354, "step": 3430 }, { "clip_ratio": 0.0, "epoch": 0.00952531663140828, "grad_norm": 0.09468293190002441, "kl": 0.13024404272437096, "learning_rate": 3e-06, "loss": 0.037, "step": 3431 }, { "clip_ratio": 0.00011927480954909697, "epoch": 0.009528092882248098, "grad_norm": 0.1033705398440361, "kl": 0.14313241094350815, "learning_rate": 3e-06, "loss": 0.0363, "step": 3432 }, { "clip_ratio": 8.941345004132017e-05, "completion_length": 240.18750762939453, "epoch": 0.009530869133087913, "grad_norm": 0.10980170220136642, "kl": 0.12325490266084671, "learning_rate": 3e-06, "loss": -0.0068, "reward": 0.3437500298023224, "reward_std": 0.32775889337062836, "rewards/countdown_reward_func": 0.3437500149011612, "step": 3433, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.001059981295838952, "epoch": 0.009533645383927728, "grad_norm": 0.15972431004047394, "kl": 0.12082348018884659, "learning_rate": 3e-06, "loss": -0.0066, "step": 3434 }, { "clip_ratio": 8.239947055699304e-05, "epoch": 0.009536421634767544, "grad_norm": 0.09864591062068939, "kl": 0.12724430859088898, "learning_rate": 3e-06, "loss": -0.0064, "step": 3435 }, { "clip_ratio": 0.0, "epoch": 0.009539197885607361, "grad_norm": 0.16317464411258698, "kl": 0.1318410336971283, "learning_rate": 3e-06, "loss": -0.0064, "step": 3436 }, { "clip_ratio": 0.0, "epoch": 0.009541974136447176, "grad_norm": 0.10252336412668228, "kl": 0.1263137310743332, "learning_rate": 3e-06, "loss": -0.0069, "step": 3437 }, { "clip_ratio": 8.43454763526097e-05, "epoch": 0.009544750387286992, "grad_norm": 0.09853146225214005, "kl": 0.13311899453401566, "learning_rate": 3e-06, "loss": -0.0065, "step": 3438 }, { "clip_ratio": 0.00016572567983530462, "epoch": 0.009547526638126807, "grad_norm": 0.11790407449007034, "kl": 0.1293671801686287, "learning_rate": 3e-06, "loss": -0.0072, "step": 3439 }, { "clip_ratio": 0.0006520609022118151, "epoch": 0.009550302888966624, "grad_norm": 0.11374343186616898, "kl": 0.12439935654401779, "learning_rate": 3e-06, "loss": -0.0072, "step": 3440 }, { "clip_ratio": 0.0, "epoch": 0.00955307913980644, "grad_norm": 0.10072995722293854, "kl": 0.12724566459655762, "learning_rate": 3e-06, "loss": -0.0067, "step": 3441 }, { "clip_ratio": 8.333333244081587e-05, "epoch": 0.009555855390646255, "grad_norm": 0.11592025309801102, "kl": 0.1293221265077591, "learning_rate": 3e-06, "loss": -0.0069, "step": 3442 }, { "clip_ratio": 0.0, "epoch": 0.009558631641486072, "grad_norm": 0.09814949333667755, "kl": 0.12223512679338455, "learning_rate": 3e-06, "loss": -0.0085, "step": 3443 }, { "clip_ratio": 0.0, "epoch": 0.009561407892325888, "grad_norm": 0.09910566359758377, "kl": 0.12733029201626778, "learning_rate": 3e-06, "loss": -0.009, "step": 3444 }, { "clip_ratio": 0.0, "completion_length": 213.20834350585938, "epoch": 0.009564184143165703, "grad_norm": 0.07369378954172134, "kl": 0.13885943591594696, "learning_rate": 3e-06, "loss": -0.0044, "reward": 0.24791666865348816, "reward_std": 0.2114001139998436, "rewards/countdown_reward_func": 0.24791666865348816, "step": 3445, "zero_std_ratio": 0.5 }, { "clip_ratio": 8.979885024018586e-05, "epoch": 0.009566960394005519, "grad_norm": 0.10168888419866562, "kl": 0.13936101645231247, "learning_rate": 3e-06, "loss": -0.0043, "step": 3446 }, { "clip_ratio": 0.00035141754779033363, "epoch": 0.009569736644845336, "grad_norm": 0.11556611210107803, "kl": 0.13464120775461197, "learning_rate": 3e-06, "loss": -0.0036, "step": 3447 }, { "clip_ratio": 0.0001842097262851894, "epoch": 0.009572512895685151, "grad_norm": 0.07607579976320267, "kl": 0.1323195919394493, "learning_rate": 3e-06, "loss": -0.0044, "step": 3448 }, { "clip_ratio": 0.0, "epoch": 0.009575289146524966, "grad_norm": 0.1008942723274231, "kl": 0.12717263028025627, "learning_rate": 3e-06, "loss": -0.0055, "step": 3449 }, { "clip_ratio": 0.0002090300986310467, "epoch": 0.009578065397364782, "grad_norm": 0.08813519030809402, "kl": 0.1329830437898636, "learning_rate": 3e-06, "loss": -0.004, "step": 3450 }, { "clip_ratio": 0.0003098365559708327, "epoch": 0.009580841648204599, "grad_norm": 0.07570565491914749, "kl": 0.12968800216913223, "learning_rate": 3e-06, "loss": -0.0048, "step": 3451 }, { "clip_ratio": 0.00017959770048037171, "epoch": 0.009583617899044414, "grad_norm": 0.1254562884569168, "kl": 0.1306021586060524, "learning_rate": 3e-06, "loss": -0.0049, "step": 3452 }, { "clip_ratio": 0.00035919540096074343, "epoch": 0.00958639414988423, "grad_norm": 0.12214601039886475, "kl": 0.12539087235927582, "learning_rate": 3e-06, "loss": -0.0058, "step": 3453 }, { "clip_ratio": 0.00038747224607504904, "epoch": 0.009589170400724047, "grad_norm": 0.07395763695240021, "kl": 0.12522559985518456, "learning_rate": 3e-06, "loss": -0.0056, "step": 3454 }, { "clip_ratio": 0.0001764318731147796, "epoch": 0.009591946651563862, "grad_norm": 0.10475248098373413, "kl": 0.11799245327711105, "learning_rate": 3e-06, "loss": -0.0068, "step": 3455 }, { "clip_ratio": 0.0006975842115934938, "epoch": 0.009594722902403678, "grad_norm": 0.10387440025806427, "kl": 0.12441253289580345, "learning_rate": 3e-06, "loss": -0.0053, "step": 3456 }, { "clip_ratio": 8.445946150459349e-05, "completion_length": 231.87500762939453, "epoch": 0.009597499153243493, "grad_norm": 0.10248544812202454, "kl": 0.1054985448718071, "learning_rate": 3e-06, "loss": 0.0006, "reward": 0.1562500149011612, "reward_std": 0.10402268171310425, "rewards/countdown_reward_func": 0.1562500111758709, "step": 3457, "zero_std_ratio": 0.75 }, { "clip_ratio": 0.0001839817559812218, "epoch": 0.00960027540408331, "grad_norm": 0.06195636838674545, "kl": 0.11453674733638763, "learning_rate": 3e-06, "loss": 0.0017, "step": 3458 }, { "clip_ratio": 8.979885024018586e-05, "epoch": 0.009603051654923126, "grad_norm": 0.05872516334056854, "kl": 0.11472771316766739, "learning_rate": 3e-06, "loss": 0.0019, "step": 3459 }, { "clip_ratio": 0.00036309804272605106, "epoch": 0.009605827905762941, "grad_norm": 0.07733883708715439, "kl": 0.11076173931360245, "learning_rate": 3e-06, "loss": 0.0011, "step": 3460 }, { "clip_ratio": 0.00035994016798213124, "epoch": 0.009608604156602757, "grad_norm": 0.04881132021546364, "kl": 0.11782843619585037, "learning_rate": 3e-06, "loss": 0.0023, "step": 3461 }, { "clip_ratio": 0.0009220041974913329, "epoch": 0.009611380407442574, "grad_norm": 0.0602128803730011, "kl": 0.10646460205316544, "learning_rate": 3e-06, "loss": 0.0012, "step": 3462 }, { "clip_ratio": 0.00018246008403366432, "epoch": 0.009614156658282389, "grad_norm": 0.08947863429784775, "kl": 0.09977680444717407, "learning_rate": 3e-06, "loss": 0.0002, "step": 3463 }, { "clip_ratio": 0.0001990445889532566, "epoch": 0.009616932909122205, "grad_norm": 0.054588522762060165, "kl": 0.10717753693461418, "learning_rate": 3e-06, "loss": 0.0012, "step": 3464 }, { "clip_ratio": 0.0005232279800111428, "epoch": 0.009619709159962022, "grad_norm": 0.05915176868438721, "kl": 0.10687706246972084, "learning_rate": 3e-06, "loss": 0.0012, "step": 3465 }, { "clip_ratio": 0.00037736643571406603, "epoch": 0.009622485410801837, "grad_norm": 0.1070033609867096, "kl": 0.1022193692624569, "learning_rate": 3e-06, "loss": 0.0004, "step": 3466 }, { "clip_ratio": 0.0007383654592558742, "epoch": 0.009625261661641652, "grad_norm": 0.04885929450392723, "kl": 0.11033875867724419, "learning_rate": 3e-06, "loss": 0.001, "step": 3467 }, { "clip_ratio": 0.0012745531857945025, "epoch": 0.009628037912481468, "grad_norm": 0.058803994208574295, "kl": 0.09932730719447136, "learning_rate": 3e-06, "loss": 0.0001, "step": 3468 }, { "clip_ratio": 9.252405288862064e-05, "completion_length": 226.14583587646484, "epoch": 0.009630814163321285, "grad_norm": 0.0778389498591423, "kl": 0.10511306673288345, "learning_rate": 3e-06, "loss": 0.0292, "reward": 0.21250002086162567, "reward_std": 0.2080453634262085, "rewards/countdown_reward_func": 0.21250002086162567, "step": 3469, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.0096335904141611, "grad_norm": 0.08130453526973724, "kl": 0.10237300395965576, "learning_rate": 3e-06, "loss": 0.03, "step": 3470 }, { "clip_ratio": 0.0005314425579854287, "epoch": 0.009636366665000916, "grad_norm": 0.08156713843345642, "kl": 0.11665638536214828, "learning_rate": 3e-06, "loss": 0.0311, "step": 3471 }, { "clip_ratio": 9.578544268151745e-05, "epoch": 0.009639142915840731, "grad_norm": 0.1228790283203125, "kl": 0.10724557191133499, "learning_rate": 3e-06, "loss": 0.0306, "step": 3472 }, { "clip_ratio": 8.692628762219101e-05, "epoch": 0.009641919166680548, "grad_norm": 0.07690811902284622, "kl": 0.10716042667627335, "learning_rate": 3e-06, "loss": 0.0303, "step": 3473 }, { "clip_ratio": 0.0, "epoch": 0.009644695417520364, "grad_norm": 0.08034221082925797, "kl": 0.11300872638821602, "learning_rate": 3e-06, "loss": 0.0305, "step": 3474 }, { "clip_ratio": 9.984025382436812e-05, "epoch": 0.00964747166836018, "grad_norm": 0.08005085587501526, "kl": 0.10297010093927383, "learning_rate": 3e-06, "loss": 0.029, "step": 3475 }, { "clip_ratio": 9.04486223589629e-05, "epoch": 0.009650247919199996, "grad_norm": 0.08139485865831375, "kl": 0.10121942311525345, "learning_rate": 3e-06, "loss": 0.0293, "step": 3476 }, { "clip_ratio": 0.00020037565263919532, "epoch": 0.009653024170039812, "grad_norm": 0.07541320472955704, "kl": 0.11635610833764076, "learning_rate": 3e-06, "loss": 0.0305, "step": 3477 }, { "clip_ratio": 0.00017716565344017, "epoch": 0.009655800420879627, "grad_norm": 0.08489951491355896, "kl": 0.10750338807702065, "learning_rate": 3e-06, "loss": 0.0296, "step": 3478 }, { "clip_ratio": 0.0, "epoch": 0.009658576671719443, "grad_norm": 0.07268678396940231, "kl": 0.1074596494436264, "learning_rate": 3e-06, "loss": 0.0295, "step": 3479 }, { "clip_ratio": 0.0, "epoch": 0.00966135292255926, "grad_norm": 0.0808081403374672, "kl": 0.11395622044801712, "learning_rate": 3e-06, "loss": 0.0291, "step": 3480 }, { "clip_ratio": 9.259259240934625e-05, "completion_length": 228.06250762939453, "epoch": 0.009664129173399075, "grad_norm": 0.08449249714612961, "kl": 0.10799524933099747, "learning_rate": 3e-06, "loss": 0.0166, "reward": 0.34166666120290756, "reward_std": 0.22970525547862053, "rewards/countdown_reward_func": 0.34166666120290756, "step": 3481, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.00966690542423889, "grad_norm": 0.11058665812015533, "kl": 0.1098659448325634, "learning_rate": 3e-06, "loss": 0.0166, "step": 3482 }, { "clip_ratio": 9.259259240934625e-05, "epoch": 0.009669681675078706, "grad_norm": 0.07632815092802048, "kl": 0.10919361189007759, "learning_rate": 3e-06, "loss": 0.0168, "step": 3483 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009672457925918523, "grad_norm": 0.07413569837808609, "kl": 0.10725849866867065, "learning_rate": 3e-06, "loss": 0.0165, "step": 3484 }, { "clip_ratio": 0.0003454372053965926, "epoch": 0.009675234176758338, "grad_norm": 0.07222134619951248, "kl": 0.1083565354347229, "learning_rate": 3e-06, "loss": 0.0159, "step": 3485 }, { "clip_ratio": 0.0001739728031679988, "epoch": 0.009678010427598154, "grad_norm": 0.06880340725183487, "kl": 0.11005958169698715, "learning_rate": 3e-06, "loss": 0.0167, "step": 3486 }, { "clip_ratio": 0.0, "epoch": 0.009680786678437971, "grad_norm": 0.06830192357301712, "kl": 0.11470349133014679, "learning_rate": 3e-06, "loss": 0.0155, "step": 3487 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009683562929277786, "grad_norm": 0.10306932032108307, "kl": 0.11505845189094543, "learning_rate": 3e-06, "loss": 0.0153, "step": 3488 }, { "clip_ratio": 0.00029518577503040433, "epoch": 0.009686339180117602, "grad_norm": 0.08118003606796265, "kl": 0.11645545065402985, "learning_rate": 3e-06, "loss": 0.0153, "step": 3489 }, { "clip_ratio": 9.742790280142799e-05, "epoch": 0.009689115430957417, "grad_norm": 0.08631235361099243, "kl": 0.11517899483442307, "learning_rate": 3e-06, "loss": 0.0154, "step": 3490 }, { "clip_ratio": 0.0, "epoch": 0.009691891681797234, "grad_norm": 0.06677432358264923, "kl": 0.1159871518611908, "learning_rate": 3e-06, "loss": 0.0152, "step": 3491 }, { "clip_ratio": 0.00017438616487197578, "epoch": 0.00969466793263705, "grad_norm": 0.06831302493810654, "kl": 0.11832474172115326, "learning_rate": 3e-06, "loss": 0.0152, "step": 3492 }, { "clip_ratio": 0.0001453488366678357, "completion_length": 211.18750762939453, "epoch": 0.009697444183476865, "grad_norm": 0.08057886362075806, "kl": 0.12852280586957932, "learning_rate": 3e-06, "loss": 0.0015, "reward": 0.3229166716337204, "reward_std": 0.2939821407198906, "rewards/countdown_reward_func": 0.3229166716337204, "step": 3493, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.204712841892615e-05, "epoch": 0.00970022043431668, "grad_norm": 0.08371911197900772, "kl": 0.13107379153370857, "learning_rate": 3e-06, "loss": 0.0017, "step": 3494 }, { "clip_ratio": 0.0006627755356021225, "epoch": 0.009702996685156498, "grad_norm": 0.07055287808179855, "kl": 0.13106022402644157, "learning_rate": 3e-06, "loss": 0.0021, "step": 3495 }, { "clip_ratio": 0.00011531365453265607, "epoch": 0.009705772935996313, "grad_norm": 0.0839470624923706, "kl": 0.13058749213814735, "learning_rate": 3e-06, "loss": 0.0009, "step": 3496 }, { "clip_ratio": 8.509190229233354e-05, "epoch": 0.009708549186836129, "grad_norm": 0.08248282223939896, "kl": 0.1337585747241974, "learning_rate": 3e-06, "loss": 0.0008, "step": 3497 }, { "clip_ratio": 0.0001453488366678357, "epoch": 0.009711325437675946, "grad_norm": 0.09479349106550217, "kl": 0.1362338587641716, "learning_rate": 3e-06, "loss": 0.0014, "step": 3498 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009714101688515761, "grad_norm": 0.0892326831817627, "kl": 0.1331903636455536, "learning_rate": 3e-06, "loss": 0.0012, "step": 3499 }, { "clip_ratio": 0.0, "epoch": 0.009716877939355576, "grad_norm": 0.08561542630195618, "kl": 0.13591472804546356, "learning_rate": 3e-06, "loss": 0.0006, "step": 3500 }, { "clip_ratio": 0.0001453488366678357, "epoch": 0.009719654190195392, "grad_norm": 0.07098863273859024, "kl": 0.13396965712308884, "learning_rate": 3e-06, "loss": 0.0013, "step": 3501 }, { "clip_ratio": 0.0, "epoch": 0.009722430441035209, "grad_norm": 0.08743258565664291, "kl": 0.1321343556046486, "learning_rate": 3e-06, "loss": 0.0006, "step": 3502 }, { "clip_ratio": 0.0001453488366678357, "epoch": 0.009725206691875024, "grad_norm": 0.09284117072820663, "kl": 0.13706067204475403, "learning_rate": 3e-06, "loss": 0.0004, "step": 3503 }, { "clip_ratio": 0.0, "epoch": 0.00972798294271484, "grad_norm": 0.0912276953458786, "kl": 0.1364956498146057, "learning_rate": 3e-06, "loss": 0.0003, "step": 3504 }, { "clip_ratio": 0.00017507003212813288, "completion_length": 220.8541717529297, "epoch": 0.009730759193554655, "grad_norm": 0.09663709998130798, "kl": 0.1330753043293953, "learning_rate": 3e-06, "loss": 0.002, "reward": 0.21250002086162567, "reward_std": 0.2080453559756279, "rewards/countdown_reward_func": 0.21250002086162567, "step": 3505, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.009733535444394472, "grad_norm": 0.0878622978925705, "kl": 0.12130120024085045, "learning_rate": 3e-06, "loss": 0.0016, "step": 3506 }, { "clip_ratio": 0.0001069290010491386, "epoch": 0.009736311695234288, "grad_norm": 0.09262003004550934, "kl": 0.11884085834026337, "learning_rate": 3e-06, "loss": 0.0018, "step": 3507 }, { "clip_ratio": 0.0, "epoch": 0.009739087946074103, "grad_norm": 0.0833728015422821, "kl": 0.12756216898560524, "learning_rate": 3e-06, "loss": 0.0013, "step": 3508 }, { "clip_ratio": 0.000919048507057596, "epoch": 0.00974186419691392, "grad_norm": 0.0776054710149765, "kl": 0.11975589767098427, "learning_rate": 3e-06, "loss": 0.0011, "step": 3509 }, { "clip_ratio": 0.00013631406181957573, "epoch": 0.009744640447753736, "grad_norm": 0.07807232439517975, "kl": 0.12038680911064148, "learning_rate": 3e-06, "loss": 0.0015, "step": 3510 }, { "clip_ratio": 0.0002770246137515642, "epoch": 0.009747416698593551, "grad_norm": 0.08615856617689133, "kl": 0.12864018976688385, "learning_rate": 3e-06, "loss": 0.0008, "step": 3511 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.009750192949433367, "grad_norm": 0.07630624622106552, "kl": 0.11770515888929367, "learning_rate": 3e-06, "loss": 0.0007, "step": 3512 }, { "clip_ratio": 0.0002225209609605372, "epoch": 0.009752969200273184, "grad_norm": 0.09379961341619492, "kl": 0.11424989998340607, "learning_rate": 3e-06, "loss": 0.0007, "step": 3513 }, { "clip_ratio": 0.0, "epoch": 0.009755745451112999, "grad_norm": 0.08609069883823395, "kl": 0.1216898001730442, "learning_rate": 3e-06, "loss": 0.0006, "step": 3514 }, { "clip_ratio": 0.00045617805153597146, "epoch": 0.009758521701952814, "grad_norm": 0.06617363542318344, "kl": 0.11376858875155449, "learning_rate": 3e-06, "loss": 0.0002, "step": 3515 }, { "clip_ratio": 0.0015657315670978278, "epoch": 0.00976129795279263, "grad_norm": 0.07689303904771805, "kl": 0.1162155233323574, "learning_rate": 3e-06, "loss": 0.0005, "step": 3516 }, { "clip_ratio": 0.0, "completion_length": 226.64583587646484, "epoch": 0.009764074203632447, "grad_norm": 0.0697975903749466, "kl": 0.11187063902616501, "learning_rate": 3e-06, "loss": 0.0217, "reward": 0.3229166716337204, "reward_std": 0.2782912999391556, "rewards/countdown_reward_func": 0.3229166716337204, "step": 3517, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.00026900615193881094, "epoch": 0.009766850454472262, "grad_norm": 0.11269184947013855, "kl": 0.10883195325732231, "learning_rate": 3e-06, "loss": 0.0219, "step": 3518 }, { "clip_ratio": 0.0, "epoch": 0.009769626705312078, "grad_norm": 0.09275030344724655, "kl": 0.11466595157980919, "learning_rate": 3e-06, "loss": 0.0224, "step": 3519 }, { "clip_ratio": 8.202099706977606e-05, "epoch": 0.009772402956151895, "grad_norm": 0.10259377956390381, "kl": 0.11304089426994324, "learning_rate": 3e-06, "loss": 0.0219, "step": 3520 }, { "clip_ratio": 0.0, "epoch": 0.00977517920699171, "grad_norm": 0.0874171033501625, "kl": 0.11192367970943451, "learning_rate": 3e-06, "loss": 0.0217, "step": 3521 }, { "clip_ratio": 0.00045184348709881306, "epoch": 0.009777955457831526, "grad_norm": 0.08275522291660309, "kl": 0.10903790220618248, "learning_rate": 3e-06, "loss": 0.0215, "step": 3522 }, { "clip_ratio": 0.0, "epoch": 0.009780731708671341, "grad_norm": 0.06796948611736298, "kl": 0.11260716989636421, "learning_rate": 3e-06, "loss": 0.0209, "step": 3523 }, { "clip_ratio": 0.0006830600905232131, "epoch": 0.009783507959511158, "grad_norm": 0.10275343805551529, "kl": 0.11056050658226013, "learning_rate": 3e-06, "loss": 0.0211, "step": 3524 }, { "clip_ratio": 0.0, "epoch": 0.009786284210350974, "grad_norm": 0.10141252726316452, "kl": 0.1181531697511673, "learning_rate": 3e-06, "loss": 0.0209, "step": 3525 }, { "clip_ratio": 8.202099706977606e-05, "epoch": 0.00978906046119079, "grad_norm": 0.08189576119184494, "kl": 0.11841671168804169, "learning_rate": 3e-06, "loss": 0.0203, "step": 3526 }, { "clip_ratio": 0.00018214026931673288, "epoch": 0.009791836712030605, "grad_norm": 0.09442564100027084, "kl": 0.11579911410808563, "learning_rate": 3e-06, "loss": 0.0203, "step": 3527 }, { "clip_ratio": 0.00027029376360587776, "epoch": 0.009794612962870422, "grad_norm": 0.09017150849103928, "kl": 0.11289473995566368, "learning_rate": 3e-06, "loss": 0.02, "step": 3528 }, { "clip_ratio": 0.0002884615387301892, "completion_length": 215.1041717529297, "epoch": 0.009797389213710237, "grad_norm": 0.10957752168178558, "kl": 0.1238323375582695, "learning_rate": 3e-06, "loss": 0.0312, "reward": 0.397916704416275, "reward_std": 0.39448077976703644, "rewards/countdown_reward_func": 0.39791665971279144, "step": 3529, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.00019212366169085726, "epoch": 0.009800165464550053, "grad_norm": 0.11883625388145447, "kl": 0.1277984194457531, "learning_rate": 3e-06, "loss": 0.0308, "step": 3530 }, { "clip_ratio": 0.0001923076924867928, "epoch": 0.00980294171538987, "grad_norm": 0.11993909627199173, "kl": 0.1306721642613411, "learning_rate": 3e-06, "loss": 0.0299, "step": 3531 }, { "clip_ratio": 0.00010121457307832316, "epoch": 0.009805717966229685, "grad_norm": 0.1546984314918518, "kl": 0.12464490905404091, "learning_rate": 3e-06, "loss": 0.0291, "step": 3532 }, { "clip_ratio": 0.0, "epoch": 0.0098084942170695, "grad_norm": 0.1257956326007843, "kl": 0.1325385645031929, "learning_rate": 3e-06, "loss": 0.0301, "step": 3533 }, { "clip_ratio": 0.0002024291461566463, "epoch": 0.009811270467909316, "grad_norm": 0.11736729741096497, "kl": 0.12754249945282936, "learning_rate": 3e-06, "loss": 0.0291, "step": 3534 }, { "clip_ratio": 0.0, "epoch": 0.009814046718749133, "grad_norm": 0.10580999404191971, "kl": 0.13433215022087097, "learning_rate": 3e-06, "loss": 0.0292, "step": 3535 }, { "clip_ratio": 0.00018691523291636258, "epoch": 0.009816822969588948, "grad_norm": 0.11789095401763916, "kl": 0.14152230322360992, "learning_rate": 3e-06, "loss": 0.0289, "step": 3536 }, { "clip_ratio": 0.00020826146646868438, "epoch": 0.009819599220428764, "grad_norm": 0.11437322944402695, "kl": 0.14290494471788406, "learning_rate": 3e-06, "loss": 0.0273, "step": 3537 }, { "clip_ratio": 0.0, "epoch": 0.00982237547126858, "grad_norm": 0.15080593526363373, "kl": 0.13862910866737366, "learning_rate": 3e-06, "loss": 0.0254, "step": 3538 }, { "clip_ratio": 0.0, "epoch": 0.009825151722108396, "grad_norm": 0.1022270917892456, "kl": 0.14875853806734085, "learning_rate": 3e-06, "loss": 0.0266, "step": 3539 }, { "clip_ratio": 0.00018839487165678293, "epoch": 0.009827927972948212, "grad_norm": 0.10433381050825119, "kl": 0.14243001490831375, "learning_rate": 3e-06, "loss": 0.0255, "step": 3540 }, { "clip_ratio": 0.0008953584401751868, "completion_length": 234.70834350585938, "epoch": 0.009830704223788027, "grad_norm": 0.08352696895599365, "kl": 0.15294237434864044, "learning_rate": 3e-06, "loss": 0.006, "reward": 0.21041667461395264, "reward_std": 0.24690958857536316, "rewards/countdown_reward_func": 0.21041667461395264, "step": 3541, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.009833480474627844, "grad_norm": 0.09349098801612854, "kl": 0.16389970481395721, "learning_rate": 3e-06, "loss": 0.0068, "step": 3542 }, { "clip_ratio": 0.0, "epoch": 0.00983625672546766, "grad_norm": 0.07954943180084229, "kl": 0.16744234412908554, "learning_rate": 3e-06, "loss": 0.0065, "step": 3543 }, { "clip_ratio": 0.0, "epoch": 0.009839032976307475, "grad_norm": 0.08301316201686859, "kl": 0.16650399565696716, "learning_rate": 3e-06, "loss": 0.0066, "step": 3544 }, { "clip_ratio": 0.0, "epoch": 0.00984180922714729, "grad_norm": 0.1138041764497757, "kl": 0.1684616059064865, "learning_rate": 3e-06, "loss": 0.0067, "step": 3545 }, { "clip_ratio": 0.00017959770048037171, "epoch": 0.009844585477987108, "grad_norm": 0.062481727451086044, "kl": 0.1681341528892517, "learning_rate": 3e-06, "loss": 0.0068, "step": 3546 }, { "clip_ratio": 0.00027221686468692496, "epoch": 0.009847361728826923, "grad_norm": 0.08000363409519196, "kl": 0.165481299161911, "learning_rate": 3e-06, "loss": 0.0057, "step": 3547 }, { "clip_ratio": 0.0, "epoch": 0.009850137979666738, "grad_norm": 0.09288675338029861, "kl": 0.171707421541214, "learning_rate": 3e-06, "loss": 0.0063, "step": 3548 }, { "clip_ratio": 0.00036075260140933096, "epoch": 0.009852914230506554, "grad_norm": 0.08349016308784485, "kl": 0.17262651026248932, "learning_rate": 3e-06, "loss": 0.0064, "step": 3549 }, { "clip_ratio": 0.0008906753064366058, "epoch": 0.009855690481346371, "grad_norm": 0.08174631744623184, "kl": 0.1684475690126419, "learning_rate": 3e-06, "loss": 0.0065, "step": 3550 }, { "clip_ratio": 0.0006402290600817651, "epoch": 0.009858466732186186, "grad_norm": 0.11256370693445206, "kl": 0.1669769287109375, "learning_rate": 3e-06, "loss": 0.0062, "step": 3551 }, { "clip_ratio": 0.0005131080470164306, "epoch": 0.009861242983026002, "grad_norm": 0.06646931171417236, "kl": 0.1654352843761444, "learning_rate": 3e-06, "loss": 0.0061, "step": 3552 }, { "clip_ratio": 0.0, "completion_length": 211.06250762939453, "epoch": 0.009864019233865819, "grad_norm": 0.13966208696365356, "kl": 0.1667003333568573, "learning_rate": 3e-06, "loss": -0.0041, "reward": 0.2666666805744171, "reward_std": 0.2149241641163826, "rewards/countdown_reward_func": 0.2666666805744171, "step": 3553, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00035937850043410435, "epoch": 0.009866795484705634, "grad_norm": 0.09932344406843185, "kl": 0.15992556512355804, "learning_rate": 3e-06, "loss": -0.0039, "step": 3554 }, { "clip_ratio": 0.00043766023009084165, "epoch": 0.00986957173554545, "grad_norm": 0.10521374642848969, "kl": 0.15829944610595703, "learning_rate": 3e-06, "loss": -0.0041, "step": 3555 }, { "clip_ratio": 0.0006664529355475679, "epoch": 0.009872347986385265, "grad_norm": 0.09699181467294693, "kl": 0.1541534811258316, "learning_rate": 3e-06, "loss": -0.0034, "step": 3556 }, { "clip_ratio": 0.0004751791711896658, "epoch": 0.009875124237225082, "grad_norm": 0.09058420360088348, "kl": 0.16084708273410797, "learning_rate": 3e-06, "loss": -0.0039, "step": 3557 }, { "clip_ratio": 0.0007214536308310926, "epoch": 0.009877900488064898, "grad_norm": 0.12025748938322067, "kl": 0.15611423552036285, "learning_rate": 3e-06, "loss": -0.0044, "step": 3558 }, { "clip_ratio": 0.00010879024921450764, "epoch": 0.009880676738904713, "grad_norm": 0.14152219891548157, "kl": 0.14850230515003204, "learning_rate": 3e-06, "loss": -0.0066, "step": 3559 }, { "clip_ratio": 0.0002903443528339267, "epoch": 0.009883452989744529, "grad_norm": 0.14620235562324524, "kl": 0.13928164541721344, "learning_rate": 3e-06, "loss": -0.0062, "step": 3560 }, { "clip_ratio": 0.0006284190239966847, "epoch": 0.009886229240584346, "grad_norm": 0.0934399738907814, "kl": 0.13627835363149643, "learning_rate": 3e-06, "loss": -0.0056, "step": 3561 }, { "clip_ratio": 0.0010975940676871687, "epoch": 0.009889005491424161, "grad_norm": 0.08142386376857758, "kl": 0.13167067617177963, "learning_rate": 3e-06, "loss": -0.0067, "step": 3562 }, { "clip_ratio": 0.0023669447982683778, "epoch": 0.009891781742263977, "grad_norm": 0.09869975596666336, "kl": 0.13874101638793945, "learning_rate": 3e-06, "loss": -0.0061, "step": 3563 }, { "clip_ratio": 0.0018368879100307822, "epoch": 0.009894557993103794, "grad_norm": 0.11632751673460007, "kl": 0.13457733392715454, "learning_rate": 3e-06, "loss": -0.0065, "step": 3564 }, { "clip_ratio": 9.342301927972585e-05, "completion_length": 232.02083587646484, "epoch": 0.009897334243943609, "grad_norm": 0.08241791278123856, "kl": 0.10620899125933647, "learning_rate": 3e-06, "loss": -0.0079, "reward": 0.18958334624767303, "reward_std": 0.2322119027376175, "rewards/countdown_reward_func": 0.18958333879709244, "step": 3565, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009900110494783424, "grad_norm": 0.0915219709277153, "kl": 0.10775751993060112, "learning_rate": 3e-06, "loss": -0.0069, "step": 3566 }, { "clip_ratio": 0.0002562636582297273, "epoch": 0.00990288674562324, "grad_norm": 0.08054718375205994, "kl": 0.10378298535943031, "learning_rate": 3e-06, "loss": -0.0068, "step": 3567 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.009905662996463057, "grad_norm": 0.09839877486228943, "kl": 0.10335510596632957, "learning_rate": 3e-06, "loss": -0.0082, "step": 3568 }, { "clip_ratio": 0.00048828125, "epoch": 0.009908439247302872, "grad_norm": 0.08594221621751785, "kl": 0.0978197231888771, "learning_rate": 3e-06, "loss": -0.0077, "step": 3569 }, { "clip_ratio": 0.00026547446759650484, "epoch": 0.009911215498142688, "grad_norm": 0.0899190679192543, "kl": 0.09784703329205513, "learning_rate": 3e-06, "loss": -0.0078, "step": 3570 }, { "clip_ratio": 0.00040981327038025483, "epoch": 0.009913991748982503, "grad_norm": 0.08349990844726562, "kl": 0.09218048676848412, "learning_rate": 3e-06, "loss": -0.0101, "step": 3571 }, { "clip_ratio": 0.0006032017408870161, "epoch": 0.00991676799982232, "grad_norm": 0.08477947860956192, "kl": 0.0942063108086586, "learning_rate": 3e-06, "loss": -0.0093, "step": 3572 }, { "clip_ratio": 0.0009300952660851181, "epoch": 0.009919544250662136, "grad_norm": 0.08447878062725067, "kl": 0.09112598747015, "learning_rate": 3e-06, "loss": -0.0087, "step": 3573 }, { "clip_ratio": 0.0012770333560183644, "epoch": 0.009922320501501951, "grad_norm": 0.0808240994811058, "kl": 0.09319330751895905, "learning_rate": 3e-06, "loss": -0.0107, "step": 3574 }, { "clip_ratio": 0.002172630396671593, "epoch": 0.009925096752341768, "grad_norm": 0.08379380404949188, "kl": 0.0882505401968956, "learning_rate": 3e-06, "loss": -0.0105, "step": 3575 }, { "clip_ratio": 0.002160667674615979, "epoch": 0.009927873003181584, "grad_norm": 0.08182033151388168, "kl": 0.08952687680721283, "learning_rate": 3e-06, "loss": -0.011, "step": 3576 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 232.1041717529297, "epoch": 0.009930649254021399, "grad_norm": 0.08643139153718948, "kl": 0.09609649330377579, "learning_rate": 3e-06, "loss": 0.014, "reward": 0.3062499985098839, "reward_std": 0.253973301500082, "rewards/countdown_reward_func": 0.3062499985098839, "step": 3577, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.009933425504861215, "grad_norm": 0.0905647799372673, "kl": 0.08487126603722572, "learning_rate": 3e-06, "loss": 0.0139, "step": 3578 }, { "clip_ratio": 0.0, "epoch": 0.009936201755701032, "grad_norm": 0.1130918338894844, "kl": 0.08920860663056374, "learning_rate": 3e-06, "loss": 0.0136, "step": 3579 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009938978006540847, "grad_norm": 0.10128752142190933, "kl": 0.08758973702788353, "learning_rate": 3e-06, "loss": 0.0135, "step": 3580 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.009941754257380662, "grad_norm": 0.07439632713794708, "kl": 0.08840979263186455, "learning_rate": 3e-06, "loss": 0.014, "step": 3581 }, { "clip_ratio": 8.973438525572419e-05, "epoch": 0.009944530508220478, "grad_norm": 0.08572861552238464, "kl": 0.0832834467291832, "learning_rate": 3e-06, "loss": 0.014, "step": 3582 }, { "clip_ratio": 0.00017934002971742302, "epoch": 0.009947306759060295, "grad_norm": 0.08256344497203827, "kl": 0.09234043955802917, "learning_rate": 3e-06, "loss": 0.0135, "step": 3583 }, { "clip_ratio": 8.967001485871151e-05, "epoch": 0.00995008300990011, "grad_norm": 0.0929991826415062, "kl": 0.08213445544242859, "learning_rate": 3e-06, "loss": 0.0138, "step": 3584 }, { "clip_ratio": 0.0004405128347571008, "epoch": 0.009952859260739926, "grad_norm": 0.09133868664503098, "kl": 0.08775995671749115, "learning_rate": 3e-06, "loss": 0.014, "step": 3585 }, { "clip_ratio": 0.0002692031557671726, "epoch": 0.009955635511579743, "grad_norm": 0.10985512286424637, "kl": 0.08732177317142487, "learning_rate": 3e-06, "loss": 0.0122, "step": 3586 }, { "clip_ratio": 0.001576516471686773, "epoch": 0.009958411762419558, "grad_norm": 0.08233989030122757, "kl": 0.08825709670782089, "learning_rate": 3e-06, "loss": 0.014, "step": 3587 }, { "clip_ratio": 0.0003517478980938904, "epoch": 0.009961188013259374, "grad_norm": 0.08376505225896835, "kl": 0.08349743485450745, "learning_rate": 3e-06, "loss": 0.0133, "step": 3588 }, { "clip_ratio": 0.0015560845713480376, "completion_length": 242.45833587646484, "epoch": 0.00996396426409919, "grad_norm": 0.10015111416578293, "kl": 0.09221503511071205, "learning_rate": 3e-06, "loss": 0.0042, "reward": 0.3020833432674408, "reward_std": 0.3433144688606262, "rewards/countdown_reward_func": 0.3020833432674408, "step": 3589, "zero_std_ratio": 0.125 }, { "clip_ratio": 8.585165051044896e-05, "epoch": 0.009966740514939006, "grad_norm": 0.08869469910860062, "kl": 0.09250494837760925, "learning_rate": 3e-06, "loss": 0.0038, "step": 3590 }, { "clip_ratio": 8.585165051044896e-05, "epoch": 0.009969516765778822, "grad_norm": 0.1028810515999794, "kl": 0.09852945432066917, "learning_rate": 3e-06, "loss": 0.0033, "step": 3591 }, { "clip_ratio": 0.0, "epoch": 0.009972293016618637, "grad_norm": 0.10230275988578796, "kl": 0.09581045806407928, "learning_rate": 3e-06, "loss": 0.0034, "step": 3592 }, { "clip_ratio": 0.0002545824972912669, "epoch": 0.009975069267458453, "grad_norm": 0.07933814823627472, "kl": 0.09733888879418373, "learning_rate": 3e-06, "loss": 0.0038, "step": 3593 }, { "clip_ratio": 0.0009748198208399117, "epoch": 0.00997784551829827, "grad_norm": 0.08943408727645874, "kl": 0.09612659737467766, "learning_rate": 3e-06, "loss": 0.0036, "step": 3594 }, { "clip_ratio": 0.0, "epoch": 0.009980621769138085, "grad_norm": 0.08962644636631012, "kl": 0.0953371524810791, "learning_rate": 3e-06, "loss": 0.0029, "step": 3595 }, { "clip_ratio": 0.0, "epoch": 0.0099833980199779, "grad_norm": 0.08942896872758865, "kl": 0.09632308036088943, "learning_rate": 3e-06, "loss": 0.0024, "step": 3596 }, { "clip_ratio": 0.0005977830442134291, "epoch": 0.009986174270817718, "grad_norm": 0.09577378630638123, "kl": 0.10286097973585129, "learning_rate": 3e-06, "loss": 0.0024, "step": 3597 }, { "clip_ratio": 0.00017767296958481893, "epoch": 0.009988950521657533, "grad_norm": 0.14195489883422852, "kl": 0.10008842870593071, "learning_rate": 3e-06, "loss": 0.0016, "step": 3598 }, { "clip_ratio": 0.002063878404442221, "epoch": 0.009991726772497348, "grad_norm": 0.10049892216920853, "kl": 0.10134289786219597, "learning_rate": 3e-06, "loss": 0.0027, "step": 3599 }, { "clip_ratio": 0.0005936498855589889, "epoch": 0.009994503023337164, "grad_norm": 0.08835538476705551, "kl": 0.0999576486647129, "learning_rate": 3e-06, "loss": 0.0016, "step": 3600 }, { "clip_ratio": 9.53470662352629e-05, "completion_length": 227.8541717529297, "epoch": 0.009997279274176981, "grad_norm": 0.05146416649222374, "kl": 0.09291590750217438, "learning_rate": 3e-06, "loss": 0.0018, "reward": 0.19166668504476547, "reward_std": 0.12152323126792908, "rewards/countdown_reward_func": 0.19166666641831398, "step": 3601, "zero_std_ratio": 0.75 }, { "clip_ratio": 0.000244140625, "epoch": 0.010000055525016796, "grad_norm": 0.05218259245157242, "kl": 0.10260960459709167, "learning_rate": 3e-06, "loss": 0.0021, "step": 3602 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010002831775856612, "grad_norm": 0.058103084564208984, "kl": 0.09497980400919914, "learning_rate": 3e-06, "loss": 0.0023, "step": 3603 }, { "clip_ratio": 0.00016965705435723066, "epoch": 0.010005608026696427, "grad_norm": 0.05355222523212433, "kl": 0.09517773985862732, "learning_rate": 3e-06, "loss": 0.0021, "step": 3604 }, { "clip_ratio": 0.0, "epoch": 0.010008384277536244, "grad_norm": 0.0592651441693306, "kl": 0.09849020838737488, "learning_rate": 3e-06, "loss": 0.0015, "step": 3605 }, { "clip_ratio": 0.00035310734529048204, "epoch": 0.01001116052837606, "grad_norm": 0.06761127710342407, "kl": 0.09091968089342117, "learning_rate": 3e-06, "loss": 0.0019, "step": 3606 }, { "clip_ratio": 0.00019379844889044762, "epoch": 0.010013936779215875, "grad_norm": 0.051605161279439926, "kl": 0.09430093690752983, "learning_rate": 3e-06, "loss": 0.0018, "step": 3607 }, { "clip_ratio": 0.00035695651604328305, "epoch": 0.010016713030055692, "grad_norm": 0.04927424341440201, "kl": 0.10303110629320145, "learning_rate": 3e-06, "loss": 0.002, "step": 3608 }, { "clip_ratio": 0.0005246637738309801, "epoch": 0.010019489280895508, "grad_norm": 0.05644126608967781, "kl": 0.09355449676513672, "learning_rate": 3e-06, "loss": 0.0022, "step": 3609 }, { "clip_ratio": 0.00016965705435723066, "epoch": 0.010022265531735323, "grad_norm": 0.05118807405233383, "kl": 0.09361306577920914, "learning_rate": 3e-06, "loss": 0.0019, "step": 3610 }, { "clip_ratio": 0.00017672727699391544, "epoch": 0.010025041782575139, "grad_norm": 0.055156417191028595, "kl": 0.09621712565422058, "learning_rate": 3e-06, "loss": 0.0015, "step": 3611 }, { "clip_ratio": 0.00016965704708127305, "epoch": 0.010027818033414956, "grad_norm": 0.06433351337909698, "kl": 0.08880849182605743, "learning_rate": 3e-06, "loss": 0.0017, "step": 3612 }, { "clip_ratio": 0.0, "completion_length": 226.7291717529297, "epoch": 0.010030594284254771, "grad_norm": 0.0678560733795166, "kl": 0.10531853139400482, "learning_rate": 3e-06, "loss": 0.0071, "reward": 0.24791669100522995, "reward_std": 0.245161235332489, "rewards/countdown_reward_func": 0.24791667610406876, "step": 3613, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.001397733693011105, "epoch": 0.010033370535094586, "grad_norm": 0.07233700156211853, "kl": 0.10957350209355354, "learning_rate": 3e-06, "loss": 0.0074, "step": 3614 }, { "clip_ratio": 0.0, "epoch": 0.010036146785934402, "grad_norm": 0.06673402339220047, "kl": 0.09965291991829872, "learning_rate": 3e-06, "loss": 0.007, "step": 3615 }, { "clip_ratio": 0.0006085396962589584, "epoch": 0.010038923036774219, "grad_norm": 0.077565036714077, "kl": 0.09470411390066147, "learning_rate": 3e-06, "loss": 0.0067, "step": 3616 }, { "clip_ratio": 0.0003545760118868202, "epoch": 0.010041699287614034, "grad_norm": 0.06049685925245285, "kl": 0.09756863489747047, "learning_rate": 3e-06, "loss": 0.0074, "step": 3617 }, { "clip_ratio": 0.00035556586226448417, "epoch": 0.01004447553845385, "grad_norm": 0.07105754315853119, "kl": 0.11034474149346352, "learning_rate": 3e-06, "loss": 0.0074, "step": 3618 }, { "clip_ratio": 0.00027919336571358144, "epoch": 0.010047251789293667, "grad_norm": 0.06838853657245636, "kl": 0.10413213819265366, "learning_rate": 3e-06, "loss": 0.0065, "step": 3619 }, { "clip_ratio": 0.0020479720551520586, "epoch": 0.010050028040133482, "grad_norm": 0.0666194036602974, "kl": 0.106504175812006, "learning_rate": 3e-06, "loss": 0.0068, "step": 3620 }, { "clip_ratio": 0.0002624084590934217, "epoch": 0.010052804290973298, "grad_norm": 0.06731873750686646, "kl": 0.09811598435044289, "learning_rate": 3e-06, "loss": 0.0064, "step": 3621 }, { "clip_ratio": 0.00037735849036835134, "epoch": 0.010055580541813113, "grad_norm": 0.06610267609357834, "kl": 0.09473022818565369, "learning_rate": 3e-06, "loss": 0.0055, "step": 3622 }, { "clip_ratio": 0.0002700594486668706, "epoch": 0.01005835679265293, "grad_norm": 0.05519413575530052, "kl": 0.09653511270880699, "learning_rate": 3e-06, "loss": 0.0065, "step": 3623 }, { "clip_ratio": 0.0007142680988181382, "epoch": 0.010061133043492746, "grad_norm": 0.06636186689138412, "kl": 0.10993264243006706, "learning_rate": 3e-06, "loss": 0.0067, "step": 3624 }, { "clip_ratio": 9.07111752894707e-05, "completion_length": 239.9375, "epoch": 0.010063909294332561, "grad_norm": 0.048276085406541824, "kl": 0.08672815561294556, "learning_rate": 3e-06, "loss": -0.0035, "reward": 0.1562500149011612, "reward_std": 0.10402268171310425, "rewards/countdown_reward_func": 0.1562500111758709, "step": 3625, "zero_std_ratio": 0.75 }, { "clip_ratio": 0.0002502954375813715, "epoch": 0.010066685545172377, "grad_norm": 0.06022067740559578, "kl": 0.09006969258189201, "learning_rate": 3e-06, "loss": -0.0042, "step": 3626 }, { "clip_ratio": 0.0002623061664053239, "epoch": 0.010069461796012194, "grad_norm": 0.05575935170054436, "kl": 0.08650654926896095, "learning_rate": 3e-06, "loss": -0.0036, "step": 3627 }, { "clip_ratio": 0.0002623061664053239, "epoch": 0.010072238046852009, "grad_norm": 0.052357081323862076, "kl": 0.08896467089653015, "learning_rate": 3e-06, "loss": -0.0042, "step": 3628 }, { "clip_ratio": 0.0003544566934579052, "epoch": 0.010075014297691825, "grad_norm": 0.050558920949697495, "kl": 0.09738479554653168, "learning_rate": 3e-06, "loss": -0.003, "step": 3629 }, { "clip_ratio": 9.07111752894707e-05, "epoch": 0.010077790548531642, "grad_norm": 0.05718159303069115, "kl": 0.08443065360188484, "learning_rate": 3e-06, "loss": -0.004, "step": 3630 }, { "clip_ratio": 0.00042901538108708337, "epoch": 0.010080566799371457, "grad_norm": 0.05324540287256241, "kl": 0.08448105677962303, "learning_rate": 3e-06, "loss": -0.0038, "step": 3631 }, { "clip_ratio": 0.0012064556358382106, "epoch": 0.010083343050211272, "grad_norm": 0.056545451283454895, "kl": 0.08605921640992165, "learning_rate": 3e-06, "loss": -0.0044, "step": 3632 }, { "clip_ratio": 0.0002637455108924769, "epoch": 0.010086119301051088, "grad_norm": 0.054126057773828506, "kl": 0.08238720148801804, "learning_rate": 3e-06, "loss": -0.0047, "step": 3633 }, { "clip_ratio": 0.0006083748303353786, "epoch": 0.010088895551890905, "grad_norm": 0.0858687236905098, "kl": 0.08479169383645058, "learning_rate": 3e-06, "loss": -0.0046, "step": 3634 }, { "clip_ratio": 0.000421203272708226, "epoch": 0.01009167180273072, "grad_norm": 0.07297965884208679, "kl": 0.09065381065011024, "learning_rate": 3e-06, "loss": -0.004, "step": 3635 }, { "clip_ratio": 0.002256663632579148, "epoch": 0.010094448053570536, "grad_norm": 0.052479516714811325, "kl": 0.07902554795145988, "learning_rate": 3e-06, "loss": -0.005, "step": 3636 }, { "clip_ratio": 0.00017130826745415106, "completion_length": 233.0416717529297, "epoch": 0.010097224304410351, "grad_norm": 0.08496987074613571, "kl": 0.0859854519367218, "learning_rate": 3e-06, "loss": -0.0053, "reward": 0.2645833343267441, "reward_std": 0.28140274435281754, "rewards/countdown_reward_func": 0.2645833343267441, "step": 3637, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.211496217176318e-05, "epoch": 0.010100000555250168, "grad_norm": 0.11726462095975876, "kl": 0.08350119739770889, "learning_rate": 3e-06, "loss": -0.0061, "step": 3638 }, { "clip_ratio": 0.00018208303663413972, "epoch": 0.010102776806089984, "grad_norm": 0.08928564935922623, "kl": 0.08794764429330826, "learning_rate": 3e-06, "loss": -0.0048, "step": 3639 }, { "clip_ratio": 0.0001850481057772413, "epoch": 0.0101055530569298, "grad_norm": 0.10599423199892044, "kl": 0.08475306630134583, "learning_rate": 3e-06, "loss": -0.0058, "step": 3640 }, { "clip_ratio": 9.211496217176318e-05, "epoch": 0.010108329307769616, "grad_norm": 0.0907074436545372, "kl": 0.08218500018119812, "learning_rate": 3e-06, "loss": -0.0052, "step": 3641 }, { "clip_ratio": 0.0003533913040882908, "epoch": 0.010111105558609432, "grad_norm": 0.09170997887849808, "kl": 0.08138689771294594, "learning_rate": 3e-06, "loss": -0.0063, "step": 3642 }, { "clip_ratio": 0.0006963768973946571, "epoch": 0.010113881809449247, "grad_norm": 0.08457538485527039, "kl": 0.08474935963749886, "learning_rate": 3e-06, "loss": -0.0058, "step": 3643 }, { "clip_ratio": 0.0003829580673482269, "epoch": 0.010116658060289063, "grad_norm": 0.10203880071640015, "kl": 0.08263363689184189, "learning_rate": 3e-06, "loss": -0.0073, "step": 3644 }, { "clip_ratio": 0.0007045355960144661, "epoch": 0.01011943431112888, "grad_norm": 0.07963455468416214, "kl": 0.08823851123452187, "learning_rate": 3e-06, "loss": -0.0063, "step": 3645 }, { "clip_ratio": 0.0008081280611804686, "epoch": 0.010122210561968695, "grad_norm": 0.10088809579610825, "kl": 0.08502522110939026, "learning_rate": 3e-06, "loss": -0.0077, "step": 3646 }, { "clip_ratio": 0.0008116622047964483, "epoch": 0.01012498681280851, "grad_norm": 0.09863507002592087, "kl": 0.08328916132450104, "learning_rate": 3e-06, "loss": -0.0066, "step": 3647 }, { "clip_ratio": 0.0006202931690495461, "epoch": 0.010127763063648326, "grad_norm": 0.09454872459173203, "kl": 0.08354802429676056, "learning_rate": 3e-06, "loss": -0.0081, "step": 3648 }, { "clip_ratio": 0.000267253810307011, "completion_length": 237.62500762939453, "epoch": 0.010130539314488143, "grad_norm": 0.11663991212844849, "kl": 0.08883662521839142, "learning_rate": 3e-06, "loss": -0.0059, "reward": 0.4000000059604645, "reward_std": 0.33992571383714676, "rewards/countdown_reward_func": 0.4000000059604645, "step": 3649, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010133315565327958, "grad_norm": 0.09802624583244324, "kl": 0.08094433695077896, "learning_rate": 3e-06, "loss": -0.0063, "step": 3650 }, { "clip_ratio": 0.00017494309577159584, "epoch": 0.010136091816167774, "grad_norm": 0.11795218288898468, "kl": 0.0826914869248867, "learning_rate": 3e-06, "loss": -0.0061, "step": 3651 }, { "clip_ratio": 0.000801381072960794, "epoch": 0.010138868067007591, "grad_norm": 0.13071514666080475, "kl": 0.09427114948630333, "learning_rate": 3e-06, "loss": -0.0059, "step": 3652 }, { "clip_ratio": 0.0006510416860692203, "epoch": 0.010141644317847406, "grad_norm": 0.10175346583127975, "kl": 0.08908730000257492, "learning_rate": 3e-06, "loss": -0.007, "step": 3653 }, { "clip_ratio": 0.0005245499778538942, "epoch": 0.010144420568687222, "grad_norm": 0.09861656278371811, "kl": 0.08767584338784218, "learning_rate": 3e-06, "loss": -0.0072, "step": 3654 }, { "clip_ratio": 0.0003439117790549062, "epoch": 0.010147196819527037, "grad_norm": 0.1005256325006485, "kl": 0.09155403077602386, "learning_rate": 3e-06, "loss": -0.0076, "step": 3655 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.010149973070366854, "grad_norm": 0.11633270233869553, "kl": 0.08335649594664574, "learning_rate": 3e-06, "loss": -0.0085, "step": 3656 }, { "clip_ratio": 0.0010179505916312337, "epoch": 0.01015274932120667, "grad_norm": 0.12330052256584167, "kl": 0.08682303503155708, "learning_rate": 3e-06, "loss": -0.0084, "step": 3657 }, { "clip_ratio": 0.0008518361137248576, "epoch": 0.010155525572046485, "grad_norm": 0.08478277176618576, "kl": 0.09848198294639587, "learning_rate": 3e-06, "loss": -0.0086, "step": 3658 }, { "clip_ratio": 0.0013834635319653898, "epoch": 0.010158301822886302, "grad_norm": 0.10296488553285599, "kl": 0.09211910516023636, "learning_rate": 3e-06, "loss": -0.0099, "step": 3659 }, { "clip_ratio": 0.0010252931679133326, "epoch": 0.010161078073726118, "grad_norm": 0.12680672109127045, "kl": 0.09135585278272629, "learning_rate": 3e-06, "loss": -0.0098, "step": 3660 }, { "clip_ratio": 0.00020868112915195525, "completion_length": 231.08334350585938, "epoch": 0.010163854324565933, "grad_norm": 0.0681372731924057, "kl": 0.10179301351308823, "learning_rate": 3e-06, "loss": -0.0065, "reward": 0.3437500298023224, "reward_std": 0.2696641534566879, "rewards/countdown_reward_func": 0.3437500298023224, "step": 3661, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.010166630575405749, "grad_norm": 0.06358912587165833, "kl": 0.1072254478931427, "learning_rate": 3e-06, "loss": -0.007, "step": 3662 }, { "clip_ratio": 0.00044686285400530323, "epoch": 0.010169406826245566, "grad_norm": 0.06818949431180954, "kl": 0.10475390776991844, "learning_rate": 3e-06, "loss": -0.0072, "step": 3663 }, { "clip_ratio": 0.0005052476772107184, "epoch": 0.010172183077085381, "grad_norm": 0.07285914570093155, "kl": 0.10073699057102203, "learning_rate": 3e-06, "loss": -0.0067, "step": 3664 }, { "clip_ratio": 0.00028952574939467013, "epoch": 0.010174959327925196, "grad_norm": 0.0680256187915802, "kl": 0.09941226243972778, "learning_rate": 3e-06, "loss": -0.0071, "step": 3665 }, { "clip_ratio": 0.00018572077533463016, "epoch": 0.010177735578765012, "grad_norm": 0.07317747920751572, "kl": 0.10381283238530159, "learning_rate": 3e-06, "loss": -0.007, "step": 3666 }, { "clip_ratio": 0.00020868112915195525, "epoch": 0.010180511829604829, "grad_norm": 0.0646790936589241, "kl": 0.10353933274745941, "learning_rate": 3e-06, "loss": -0.0065, "step": 3667 }, { "clip_ratio": 0.00042463721183594316, "epoch": 0.010183288080444644, "grad_norm": 0.06955524533987045, "kl": 0.10776736587285995, "learning_rate": 3e-06, "loss": -0.007, "step": 3668 }, { "clip_ratio": 0.0003782120911637321, "epoch": 0.01018606433128446, "grad_norm": 0.06638143956661224, "kl": 0.10359567031264305, "learning_rate": 3e-06, "loss": -0.0081, "step": 3669 }, { "clip_ratio": 0.0007550554073532112, "epoch": 0.010188840582124277, "grad_norm": 0.06565908342599869, "kl": 0.10221174359321594, "learning_rate": 3e-06, "loss": -0.0079, "step": 3670 }, { "clip_ratio": 0.00027558418514672667, "epoch": 0.010191616832964092, "grad_norm": 0.07413748651742935, "kl": 0.09760120511054993, "learning_rate": 3e-06, "loss": -0.008, "step": 3671 }, { "clip_ratio": 0.0004323856846895069, "epoch": 0.010194393083803908, "grad_norm": 0.0687633752822876, "kl": 0.10222437605261803, "learning_rate": 3e-06, "loss": -0.0081, "step": 3672 }, { "clip_ratio": 0.00017390099674230441, "completion_length": 241.06250762939453, "epoch": 0.010197169334643723, "grad_norm": 0.05424873158335686, "kl": 0.08744099736213684, "learning_rate": 3e-06, "loss": -0.0072, "reward": 0.19166668504476547, "reward_std": 0.12152322381734848, "rewards/countdown_reward_func": 0.19166666641831398, "step": 3673, "zero_std_ratio": 0.75 }, { "clip_ratio": 0.0, "epoch": 0.01019994558548354, "grad_norm": 0.05150388181209564, "kl": 0.09159904345870018, "learning_rate": 3e-06, "loss": -0.0071, "step": 3674 }, { "clip_ratio": 0.00016527282423339784, "epoch": 0.010202721836323356, "grad_norm": 0.055373065173625946, "kl": 0.09357555583119392, "learning_rate": 3e-06, "loss": -0.0071, "step": 3675 }, { "clip_ratio": 0.00044742397585650906, "epoch": 0.010205498087163171, "grad_norm": 0.06895780563354492, "kl": 0.09898631647229195, "learning_rate": 3e-06, "loss": -0.0071, "step": 3676 }, { "clip_ratio": 0.0001823486527428031, "epoch": 0.010208274338002987, "grad_norm": 0.0632469430565834, "kl": 0.086412712931633, "learning_rate": 3e-06, "loss": -0.0079, "step": 3677 }, { "clip_ratio": 0.001015738962450996, "epoch": 0.010211050588842804, "grad_norm": 0.05175432562828064, "kl": 0.08875154331326485, "learning_rate": 3e-06, "loss": -0.0072, "step": 3678 }, { "clip_ratio": 0.0001823486527428031, "epoch": 0.010213826839682619, "grad_norm": 0.08085650950670242, "kl": 0.08807466551661491, "learning_rate": 3e-06, "loss": -0.0075, "step": 3679 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010216603090522434, "grad_norm": 0.055448535829782486, "kl": 0.09215493127703667, "learning_rate": 3e-06, "loss": -0.0071, "step": 3680 }, { "clip_ratio": 0.000244140625, "epoch": 0.010219379341362252, "grad_norm": 0.057238444685935974, "kl": 0.0937417522072792, "learning_rate": 3e-06, "loss": -0.0078, "step": 3681 }, { "clip_ratio": 0.0004166951694060117, "epoch": 0.010222155592202067, "grad_norm": 0.07063135504722595, "kl": 0.09952731058001518, "learning_rate": 3e-06, "loss": -0.0078, "step": 3682 }, { "clip_ratio": 0.00024683355150045827, "epoch": 0.010224931843041882, "grad_norm": 0.04441355541348457, "kl": 0.08707210421562195, "learning_rate": 3e-06, "loss": -0.008, "step": 3683 }, { "clip_ratio": 0.0010984656400978565, "epoch": 0.010227708093881698, "grad_norm": 0.04816470667719841, "kl": 0.08925540745258331, "learning_rate": 3e-06, "loss": -0.0076, "step": 3684 }, { "clip_ratio": 0.0002445131976855919, "completion_length": 240.37500762939453, "epoch": 0.010230484344721515, "grad_norm": 0.20789018273353577, "kl": 0.09080018103122711, "learning_rate": 3e-06, "loss": 0.0131, "reward": 0.34166668355464935, "reward_std": 0.3215610906481743, "rewards/countdown_reward_func": 0.34166667610406876, "step": 3685, "zero_std_ratio": 0.25 }, { "clip_ratio": 8.656509453430772e-05, "epoch": 0.01023326059556133, "grad_norm": 0.07286840677261353, "kl": 0.08963349461555481, "learning_rate": 3e-06, "loss": 0.0143, "step": 3686 }, { "clip_ratio": 0.0, "epoch": 0.010236036846401146, "grad_norm": 0.2193601429462433, "kl": 0.0885586068034172, "learning_rate": 3e-06, "loss": 0.013, "step": 3687 }, { "clip_ratio": 0.0, "epoch": 0.010238813097240961, "grad_norm": 0.06472805887460709, "kl": 0.0873815193772316, "learning_rate": 3e-06, "loss": 0.0138, "step": 3688 }, { "clip_ratio": 9.077705180970952e-05, "epoch": 0.010241589348080778, "grad_norm": 0.07682758569717407, "kl": 0.09266604483127594, "learning_rate": 3e-06, "loss": 0.0134, "step": 3689 }, { "clip_ratio": 0.0, "epoch": 0.010244365598920594, "grad_norm": 0.06587347388267517, "kl": 0.0944773405790329, "learning_rate": 3e-06, "loss": 0.0136, "step": 3690 }, { "clip_ratio": 0.0004069010610692203, "epoch": 0.01024714184976041, "grad_norm": 0.0914352759718895, "kl": 0.09185301885008812, "learning_rate": 3e-06, "loss": 0.0126, "step": 3691 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010249918100600226, "grad_norm": 0.09088455140590668, "kl": 0.08979807794094086, "learning_rate": 3e-06, "loss": 0.0131, "step": 3692 }, { "clip_ratio": 0.0, "epoch": 0.010252694351440042, "grad_norm": 0.08266724646091461, "kl": 0.08837097510695457, "learning_rate": 3e-06, "loss": 0.0126, "step": 3693 }, { "clip_ratio": 0.0, "epoch": 0.010255470602279857, "grad_norm": 0.06148362159729004, "kl": 0.08767904341220856, "learning_rate": 3e-06, "loss": 0.0131, "step": 3694 }, { "clip_ratio": 0.00017734215361997485, "epoch": 0.010258246853119673, "grad_norm": 0.11369356513023376, "kl": 0.0918169654905796, "learning_rate": 3e-06, "loss": 0.0125, "step": 3695 }, { "clip_ratio": 0.00043130459380336106, "epoch": 0.01026102310395949, "grad_norm": 0.06331091374158859, "kl": 0.09372703358530998, "learning_rate": 3e-06, "loss": 0.013, "step": 3696 }, { "clip_ratio": 0.00024429988843621686, "completion_length": 251.75, "epoch": 0.010263799354799305, "grad_norm": 0.06566727161407471, "kl": 0.11013549938797951, "learning_rate": 3e-06, "loss": 0.0006, "reward": 0.3062500059604645, "reward_std": 0.2359030358493328, "rewards/countdown_reward_func": 0.3062500059604645, "step": 3697, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0005702985217794776, "epoch": 0.01026657560563912, "grad_norm": 0.08787801116704941, "kl": 0.10066089406609535, "learning_rate": 3e-06, "loss": -0.0, "step": 3698 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.010269351856478936, "grad_norm": 0.07703584432601929, "kl": 0.10739707946777344, "learning_rate": 3e-06, "loss": 0.0006, "step": 3699 }, { "clip_ratio": 0.00016432092525064945, "epoch": 0.010272128107318753, "grad_norm": 0.06794824451208115, "kl": 0.11057337746024132, "learning_rate": 3e-06, "loss": 0.0003, "step": 3700 }, { "clip_ratio": 0.00017322444182354957, "epoch": 0.010274904358158568, "grad_norm": 0.0614597462117672, "kl": 0.100674107670784, "learning_rate": 3e-06, "loss": -0.0006, "step": 3701 }, { "clip_ratio": 0.0, "epoch": 0.010277680608998384, "grad_norm": 0.08092259615659714, "kl": 0.11043494194746017, "learning_rate": 3e-06, "loss": 0.0, "step": 3702 }, { "clip_ratio": 8.153946691891178e-05, "epoch": 0.010280456859838201, "grad_norm": 0.06434497237205505, "kl": 0.11076181009411812, "learning_rate": 3e-06, "loss": -0.0004, "step": 3703 }, { "clip_ratio": 0.00032552083575865254, "epoch": 0.010283233110678016, "grad_norm": 0.06610029190778732, "kl": 0.10248962044715881, "learning_rate": 3e-06, "loss": -0.0011, "step": 3704 }, { "clip_ratio": 0.00032552084303461015, "epoch": 0.010286009361517832, "grad_norm": 0.0726429671049118, "kl": 0.10930807888507843, "learning_rate": 3e-06, "loss": -0.0007, "step": 3705 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010288785612357647, "grad_norm": 0.05448497086763382, "kl": 0.1132940948009491, "learning_rate": 3e-06, "loss": -0.001, "step": 3706 }, { "clip_ratio": 0.00032552083575865254, "epoch": 0.010291561863197464, "grad_norm": 0.05748455598950386, "kl": 0.10172730311751366, "learning_rate": 3e-06, "loss": -0.0013, "step": 3707 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.01029433811403728, "grad_norm": 0.07410575449466705, "kl": 0.11324379593133926, "learning_rate": 3e-06, "loss": -0.0018, "step": 3708 }, { "clip_ratio": 0.0005191129457671195, "completion_length": 241.08334350585938, "epoch": 0.010297114364877095, "grad_norm": 0.06524646282196045, "kl": 0.09655367583036423, "learning_rate": 3e-06, "loss": -0.0117, "reward": 0.23125001788139343, "reward_std": 0.2115694060921669, "rewards/countdown_reward_func": 0.23125001043081284, "step": 3709, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00017369926354149356, "epoch": 0.01029989061571691, "grad_norm": 0.08108525723218918, "kl": 0.08781386911869049, "learning_rate": 3e-06, "loss": -0.0124, "step": 3710 }, { "clip_ratio": 0.0, "epoch": 0.010302666866556728, "grad_norm": 0.05896029621362686, "kl": 0.09934370219707489, "learning_rate": 3e-06, "loss": -0.0127, "step": 3711 }, { "clip_ratio": 8.256275032181293e-05, "epoch": 0.010305443117396543, "grad_norm": 0.05678814277052879, "kl": 0.1032622829079628, "learning_rate": 3e-06, "loss": -0.0118, "step": 3712 }, { "clip_ratio": 0.0006534067360917106, "epoch": 0.010308219368236358, "grad_norm": 0.058886971324682236, "kl": 0.10369609296321869, "learning_rate": 3e-06, "loss": -0.0121, "step": 3713 }, { "clip_ratio": 0.0005004404665669426, "epoch": 0.010310995619076176, "grad_norm": 0.053742337971925735, "kl": 0.098076730966568, "learning_rate": 3e-06, "loss": -0.0133, "step": 3714 }, { "clip_ratio": 0.0004276718245819211, "epoch": 0.010313771869915991, "grad_norm": 0.06250131875276566, "kl": 0.09994548559188843, "learning_rate": 3e-06, "loss": -0.0128, "step": 3715 }, { "clip_ratio": 0.0002533401348046027, "epoch": 0.010316548120755806, "grad_norm": 0.07723374664783478, "kl": 0.0916074737906456, "learning_rate": 3e-06, "loss": -0.0133, "step": 3716 }, { "clip_ratio": 0.000750820865505375, "epoch": 0.010319324371595622, "grad_norm": 0.05880725756287575, "kl": 0.10326195880770683, "learning_rate": 3e-06, "loss": -0.0136, "step": 3717 }, { "clip_ratio": 0.0013508264673873782, "epoch": 0.010322100622435439, "grad_norm": 0.06201096996665001, "kl": 0.10899652913212776, "learning_rate": 3e-06, "loss": -0.0125, "step": 3718 }, { "clip_ratio": 0.0009010950161609799, "epoch": 0.010324876873275254, "grad_norm": 0.809346079826355, "kl": 0.10828637331724167, "learning_rate": 3e-06, "loss": -0.0133, "step": 3719 }, { "clip_ratio": 0.001102704060031101, "epoch": 0.01032765312411507, "grad_norm": 0.056093886494636536, "kl": 0.10217641666531563, "learning_rate": 3e-06, "loss": -0.0142, "step": 3720 }, { "clip_ratio": 0.0, "completion_length": 248.4791717529297, "epoch": 0.010330429374954885, "grad_norm": 0.06059785187244415, "kl": 0.10218418762087822, "learning_rate": 3e-06, "loss": 0.0154, "reward": 0.36250001192092896, "reward_std": 0.33992572128772736, "rewards/countdown_reward_func": 0.36249998211860657, "step": 3721, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.010333205625794702, "grad_norm": 0.07382272183895111, "kl": 0.10443625971674919, "learning_rate": 3e-06, "loss": 0.0148, "step": 3722 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.010335981876634518, "grad_norm": 0.08616597205400467, "kl": 0.1118583232164383, "learning_rate": 3e-06, "loss": 0.0155, "step": 3723 }, { "clip_ratio": 9.191176650347188e-05, "epoch": 0.010338758127474333, "grad_norm": 0.08347748965024948, "kl": 0.11194498836994171, "learning_rate": 3e-06, "loss": 0.0157, "step": 3724 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.01034153437831415, "grad_norm": 0.06321947276592255, "kl": 0.11334555968642235, "learning_rate": 3e-06, "loss": 0.0153, "step": 3725 }, { "clip_ratio": 0.0002441406322759576, "epoch": 0.010344310629153966, "grad_norm": 0.07499963045120239, "kl": 0.11455012112855911, "learning_rate": 3e-06, "loss": 0.0153, "step": 3726 }, { "clip_ratio": 0.0004177449009148404, "epoch": 0.010347086879993781, "grad_norm": 0.06254333257675171, "kl": 0.10725605487823486, "learning_rate": 3e-06, "loss": 0.0149, "step": 3727 }, { "clip_ratio": 0.0004080835933564231, "epoch": 0.010349863130833597, "grad_norm": 0.06843312829732895, "kl": 0.10832975059747696, "learning_rate": 3e-06, "loss": 0.014, "step": 3728 }, { "clip_ratio": 0.0006510416860692203, "epoch": 0.010352639381673414, "grad_norm": 0.06936750560998917, "kl": 0.11793709173798561, "learning_rate": 3e-06, "loss": 0.0146, "step": 3729 }, { "clip_ratio": 0.00016394296108046547, "epoch": 0.010355415632513229, "grad_norm": 0.07849463820457458, "kl": 0.11797652021050453, "learning_rate": 3e-06, "loss": 0.0146, "step": 3730 }, { "clip_ratio": 0.0005003076512366533, "epoch": 0.010358191883353044, "grad_norm": 0.06481056660413742, "kl": 0.11977511644363403, "learning_rate": 3e-06, "loss": 0.0143, "step": 3731 }, { "clip_ratio": 0.0004991251189494506, "epoch": 0.01036096813419286, "grad_norm": 0.06328581273555756, "kl": 0.11961868405342102, "learning_rate": 3e-06, "loss": 0.0139, "step": 3732 }, { "clip_ratio": 0.0005440004752017558, "completion_length": 234.31250762939453, "epoch": 0.010363744385032677, "grad_norm": 0.06403510272502899, "kl": 0.11348060145974159, "learning_rate": 3e-06, "loss": 0.0015, "reward": 0.24583333730697632, "reward_std": 0.26388657093048096, "rewards/countdown_reward_func": 0.24583333730697632, "step": 3733, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.010366520635872492, "grad_norm": 0.07515983283519745, "kl": 0.11439793929457664, "learning_rate": 3e-06, "loss": 0.0014, "step": 3734 }, { "clip_ratio": 0.0010691550560295582, "epoch": 0.010369296886712308, "grad_norm": 0.09306424856185913, "kl": 0.11976934224367142, "learning_rate": 3e-06, "loss": 0.0014, "step": 3735 }, { "clip_ratio": 0.00017390426364727318, "epoch": 0.010372073137552125, "grad_norm": 0.05594691261649132, "kl": 0.11266357079148293, "learning_rate": 3e-06, "loss": 0.0013, "step": 3736 }, { "clip_ratio": 0.0, "epoch": 0.01037484938839194, "grad_norm": 0.06227236986160278, "kl": 0.12277952954173088, "learning_rate": 3e-06, "loss": 0.0015, "step": 3737 }, { "clip_ratio": 0.00027129166846862063, "epoch": 0.010377625639231756, "grad_norm": 0.06138479709625244, "kl": 0.12303722649812698, "learning_rate": 3e-06, "loss": 0.001, "step": 3738 }, { "clip_ratio": 9.259259240934625e-05, "epoch": 0.010380401890071571, "grad_norm": 0.06332753598690033, "kl": 0.12067343667149544, "learning_rate": 3e-06, "loss": 0.001, "step": 3739 }, { "clip_ratio": 0.00027550501545192674, "epoch": 0.010383178140911388, "grad_norm": 0.07497508078813553, "kl": 0.12001671269536018, "learning_rate": 3e-06, "loss": 0.0, "step": 3740 }, { "clip_ratio": 0.0, "epoch": 0.010385954391751204, "grad_norm": 0.06443694978952408, "kl": 0.1244162917137146, "learning_rate": 3e-06, "loss": 0.0009, "step": 3741 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010388730642591019, "grad_norm": 0.05547724664211273, "kl": 0.11742773652076721, "learning_rate": 3e-06, "loss": 0.0002, "step": 3742 }, { "clip_ratio": 0.00028068863321095705, "epoch": 0.010391506893430835, "grad_norm": 0.062318652868270874, "kl": 0.12811940163373947, "learning_rate": 3e-06, "loss": 0.0009, "step": 3743 }, { "clip_ratio": 0.0002675356809049845, "epoch": 0.010394283144270652, "grad_norm": 0.06962389498949051, "kl": 0.12719620391726494, "learning_rate": 3e-06, "loss": 0.0002, "step": 3744 }, { "clip_ratio": 0.00027312454767525196, "completion_length": 233.27084350585938, "epoch": 0.010397059395110467, "grad_norm": 0.06811058521270752, "kl": 0.13646503165364265, "learning_rate": 3e-06, "loss": 0.0192, "reward": 0.2875000163912773, "reward_std": 0.29990123212337494, "rewards/countdown_reward_func": 0.2875000014901161, "step": 3745, "zero_std_ratio": 0.25 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.010399835645950282, "grad_norm": 0.06379447877407074, "kl": 0.13615497574210167, "learning_rate": 3e-06, "loss": 0.0192, "step": 3746 }, { "clip_ratio": 0.0, "epoch": 0.0104026118967901, "grad_norm": 0.05889496952295303, "kl": 0.13460170477628708, "learning_rate": 3e-06, "loss": 0.0191, "step": 3747 }, { "clip_ratio": 0.0009842519648373127, "epoch": 0.010405388147629915, "grad_norm": 0.06529540568590164, "kl": 0.14297862350940704, "learning_rate": 3e-06, "loss": 0.0199, "step": 3748 }, { "clip_ratio": 0.00010056315659312531, "epoch": 0.01040816439846973, "grad_norm": 0.06160197779536247, "kl": 0.1299450732767582, "learning_rate": 3e-06, "loss": 0.018, "step": 3749 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.010410940649309546, "grad_norm": 0.06234075129032135, "kl": 0.13947857916355133, "learning_rate": 3e-06, "loss": 0.019, "step": 3750 }, { "clip_ratio": 0.001098891720175743, "epoch": 0.010413716900149363, "grad_norm": 0.05815495550632477, "kl": 0.13869785889983177, "learning_rate": 3e-06, "loss": 0.0183, "step": 3751 }, { "clip_ratio": 9.104151831706986e-05, "epoch": 0.010416493150989178, "grad_norm": 0.06667822599411011, "kl": 0.1387176439166069, "learning_rate": 3e-06, "loss": 0.0185, "step": 3752 }, { "clip_ratio": 0.00028044619830325246, "epoch": 0.010419269401828994, "grad_norm": 0.06165160983800888, "kl": 0.13564497232437134, "learning_rate": 3e-06, "loss": 0.0182, "step": 3753 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.01042204565266881, "grad_norm": 0.06771142035722733, "kl": 0.14472221583127975, "learning_rate": 3e-06, "loss": 0.018, "step": 3754 }, { "clip_ratio": 0.00045520756975747645, "epoch": 0.010424821903508626, "grad_norm": 0.06904330104589462, "kl": 0.13105230778455734, "learning_rate": 3e-06, "loss": 0.0173, "step": 3755 }, { "clip_ratio": 0.00019623234402388334, "epoch": 0.010427598154348442, "grad_norm": 0.07135909050703049, "kl": 0.13973189890384674, "learning_rate": 3e-06, "loss": 0.0173, "step": 3756 }, { "clip_ratio": 0.00036735781031893566, "completion_length": 213.89583587646484, "epoch": 0.010430374405188257, "grad_norm": 0.07671444118022919, "kl": 0.12635323405265808, "learning_rate": 3e-06, "loss": 0.0423, "reward": 0.3437500223517418, "reward_std": 0.2202121838927269, "rewards/countdown_reward_func": 0.3437500223517418, "step": 3757, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0001179245300590992, "epoch": 0.010433150656028074, "grad_norm": 0.07642974704504013, "kl": 0.13858602941036224, "learning_rate": 3e-06, "loss": 0.0423, "step": 3758 }, { "clip_ratio": 0.0, "epoch": 0.01043592690686789, "grad_norm": 0.09286346286535263, "kl": 0.13408120721578598, "learning_rate": 3e-06, "loss": 0.0419, "step": 3759 }, { "clip_ratio": 9.834775846684352e-05, "epoch": 0.010438703157707705, "grad_norm": 0.0770239308476448, "kl": 0.13056348264217377, "learning_rate": 3e-06, "loss": 0.0414, "step": 3760 }, { "clip_ratio": 0.0, "epoch": 0.01044147940854752, "grad_norm": 0.08542706072330475, "kl": 0.12958365678787231, "learning_rate": 3e-06, "loss": 0.0406, "step": 3761 }, { "clip_ratio": 0.0008124129381030798, "epoch": 0.010444255659387338, "grad_norm": 0.08395932614803314, "kl": 0.1308015137910843, "learning_rate": 3e-06, "loss": 0.0408, "step": 3762 }, { "clip_ratio": 0.0017508447199361399, "epoch": 0.010447031910227153, "grad_norm": 0.07770942896604538, "kl": 0.13082632422447205, "learning_rate": 3e-06, "loss": 0.0402, "step": 3763 }, { "clip_ratio": 0.0009491710516158491, "epoch": 0.010449808161066968, "grad_norm": 0.08154240250587463, "kl": 0.13726342469453812, "learning_rate": 3e-06, "loss": 0.0399, "step": 3764 }, { "clip_ratio": 0.002894071745686233, "epoch": 0.010452584411906784, "grad_norm": 0.08331954479217529, "kl": 0.13326022773981094, "learning_rate": 3e-06, "loss": 0.0386, "step": 3765 }, { "clip_ratio": 0.003942181181628257, "epoch": 0.010455360662746601, "grad_norm": 0.0719565749168396, "kl": 0.1324576511979103, "learning_rate": 3e-06, "loss": 0.0385, "step": 3766 }, { "clip_ratio": 0.003481343446765095, "epoch": 0.010458136913586416, "grad_norm": 0.07970364391803741, "kl": 0.1310102716088295, "learning_rate": 3e-06, "loss": 0.0373, "step": 3767 }, { "clip_ratio": 0.005239815160166472, "epoch": 0.010460913164426232, "grad_norm": 0.07124697417020798, "kl": 0.13389409333467484, "learning_rate": 3e-06, "loss": 0.0377, "step": 3768 }, { "clip_ratio": 0.0, "completion_length": 217.6666717529297, "epoch": 0.010463689415266049, "grad_norm": 0.06575433909893036, "kl": 0.13873089104890823, "learning_rate": 3e-06, "loss": 0.0261, "reward": 0.2645833492279053, "reward_std": 0.22712497413158417, "rewards/countdown_reward_func": 0.2645833492279053, "step": 3769, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.010466465666105864, "grad_norm": 0.09294385462999344, "kl": 0.13328709453344345, "learning_rate": 3e-06, "loss": 0.0254, "step": 3770 }, { "clip_ratio": 0.00044407311361283064, "epoch": 0.01046924191694568, "grad_norm": 0.06368909031152725, "kl": 0.13753554224967957, "learning_rate": 3e-06, "loss": 0.0252, "step": 3771 }, { "clip_ratio": 0.0007261410937644541, "epoch": 0.010472018167785495, "grad_norm": 0.1784268021583557, "kl": 0.14365457743406296, "learning_rate": 3e-06, "loss": 0.0259, "step": 3772 }, { "clip_ratio": 0.00010373444092692807, "epoch": 0.010474794418625312, "grad_norm": 0.07578235864639282, "kl": 0.14355264976620674, "learning_rate": 3e-06, "loss": 0.0246, "step": 3773 }, { "clip_ratio": 0.0002771618601400405, "epoch": 0.010477570669465128, "grad_norm": 0.06540807336568832, "kl": 0.1357973851263523, "learning_rate": 3e-06, "loss": 0.024, "step": 3774 }, { "clip_ratio": 0.0013077643525321037, "epoch": 0.010480346920304943, "grad_norm": 0.06500686705112457, "kl": 0.14589446783065796, "learning_rate": 3e-06, "loss": 0.0245, "step": 3775 }, { "clip_ratio": 0.0021420002449303865, "epoch": 0.010483123171144759, "grad_norm": 0.0795934870839119, "kl": 0.1385023593902588, "learning_rate": 3e-06, "loss": 0.0228, "step": 3776 }, { "clip_ratio": 0.002450719242915511, "epoch": 0.010485899421984576, "grad_norm": 0.05969502031803131, "kl": 0.14415999501943588, "learning_rate": 3e-06, "loss": 0.0234, "step": 3777 }, { "clip_ratio": 0.001866095990408212, "epoch": 0.010488675672824391, "grad_norm": 0.06225022301077843, "kl": 0.1512022614479065, "learning_rate": 3e-06, "loss": 0.0231, "step": 3778 }, { "clip_ratio": 0.0027937215054407716, "epoch": 0.010491451923664206, "grad_norm": 0.06146795302629471, "kl": 0.15099599957466125, "learning_rate": 3e-06, "loss": 0.0226, "step": 3779 }, { "clip_ratio": 0.0028990537975914776, "epoch": 0.010494228174504024, "grad_norm": 0.05856690555810928, "kl": 0.14623450487852097, "learning_rate": 3e-06, "loss": 0.0225, "step": 3780 }, { "clip_ratio": 0.0004230118356645107, "completion_length": 225.83333587646484, "epoch": 0.010497004425343839, "grad_norm": 0.09402552247047424, "kl": 0.14892438799142838, "learning_rate": 3e-06, "loss": 0.0412, "reward": 0.30000001192092896, "reward_std": 0.2920546680688858, "rewards/countdown_reward_func": 0.30000001192092896, "step": 3781, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00018355359497945756, "epoch": 0.010499780676183654, "grad_norm": 0.09416547417640686, "kl": 0.15460453182458878, "learning_rate": 3e-06, "loss": 0.0408, "step": 3782 }, { "clip_ratio": 0.0, "epoch": 0.01050255692702347, "grad_norm": 0.11407826095819473, "kl": 0.14816897362470627, "learning_rate": 3e-06, "loss": 0.0402, "step": 3783 }, { "clip_ratio": 0.00010575295891612768, "epoch": 0.010505333177863287, "grad_norm": 0.08844450116157532, "kl": 0.16021054983139038, "learning_rate": 3e-06, "loss": 0.0408, "step": 3784 }, { "clip_ratio": 0.0, "epoch": 0.010508109428703102, "grad_norm": 0.08466558158397675, "kl": 0.1666407510638237, "learning_rate": 3e-06, "loss": 0.04, "step": 3785 }, { "clip_ratio": 0.0005428834410849959, "epoch": 0.010510885679542918, "grad_norm": 0.11543014645576477, "kl": 0.15691334009170532, "learning_rate": 3e-06, "loss": 0.0389, "step": 3786 }, { "clip_ratio": 0.001172050426248461, "epoch": 0.010513661930382733, "grad_norm": 0.0883648470044136, "kl": 0.16389583051204681, "learning_rate": 3e-06, "loss": 0.0384, "step": 3787 }, { "clip_ratio": 0.0017211000085808337, "epoch": 0.01051643818122255, "grad_norm": 0.08666740357875824, "kl": 0.16847123950719833, "learning_rate": 3e-06, "loss": 0.0388, "step": 3788 }, { "clip_ratio": 0.0017418515053577721, "epoch": 0.010519214432062366, "grad_norm": 0.10064001381397247, "kl": 0.16612808406352997, "learning_rate": 3e-06, "loss": 0.0357, "step": 3789 }, { "clip_ratio": 0.0025869987439364195, "epoch": 0.010521990682902181, "grad_norm": 0.07467371970415115, "kl": 0.17785871028900146, "learning_rate": 3e-06, "loss": 0.0382, "step": 3790 }, { "clip_ratio": 0.0024195719743147492, "epoch": 0.010524766933741998, "grad_norm": 0.07374972105026245, "kl": 0.18853643536567688, "learning_rate": 3e-06, "loss": 0.0367, "step": 3791 }, { "clip_ratio": 0.004484247765503824, "epoch": 0.010527543184581814, "grad_norm": 0.10098188370466232, "kl": 0.17481163889169693, "learning_rate": 3e-06, "loss": 0.0351, "step": 3792 }, { "clip_ratio": 0.0, "completion_length": 231.4166717529297, "epoch": 0.010530319435421629, "grad_norm": 0.06987127661705017, "kl": 0.18361740559339523, "learning_rate": 3e-06, "loss": 0.0175, "reward": 0.21250002086162567, "reward_std": 0.2080453634262085, "rewards/countdown_reward_func": 0.21250002086162567, "step": 3793, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.000554323720280081, "epoch": 0.010533095686261445, "grad_norm": 0.05969535931944847, "kl": 0.19971121102571487, "learning_rate": 3e-06, "loss": 0.0185, "step": 3794 }, { "clip_ratio": 0.00017829793796408921, "epoch": 0.010535871937101262, "grad_norm": 0.06509511917829514, "kl": 0.1951780468225479, "learning_rate": 3e-06, "loss": 0.018, "step": 3795 }, { "clip_ratio": 0.00018382353300694376, "epoch": 0.010538648187941077, "grad_norm": 0.059156037867069244, "kl": 0.20140810310840607, "learning_rate": 3e-06, "loss": 0.0181, "step": 3796 }, { "clip_ratio": 0.0, "epoch": 0.010541424438780892, "grad_norm": 0.05628114566206932, "kl": 0.19984527677297592, "learning_rate": 3e-06, "loss": 0.0176, "step": 3797 }, { "clip_ratio": 0.0008135426614899188, "epoch": 0.010544200689620708, "grad_norm": 0.05967726930975914, "kl": 0.19956175982952118, "learning_rate": 3e-06, "loss": 0.0181, "step": 3798 }, { "clip_ratio": 9.191176650347188e-05, "epoch": 0.010546976940460525, "grad_norm": 0.06553437560796738, "kl": 0.20413440465927124, "learning_rate": 3e-06, "loss": 0.0167, "step": 3799 }, { "clip_ratio": 0.0007381472532870248, "epoch": 0.01054975319130034, "grad_norm": 0.059987738728523254, "kl": 0.21643632650375366, "learning_rate": 3e-06, "loss": 0.0182, "step": 3800 }, { "clip_ratio": 0.0002762108197202906, "epoch": 0.010552529442140156, "grad_norm": 0.060753267258405685, "kl": 0.2092352956533432, "learning_rate": 3e-06, "loss": 0.0174, "step": 3801 }, { "clip_ratio": 0.0006434532988350838, "epoch": 0.010555305692979973, "grad_norm": 0.0576791949570179, "kl": 0.2144591212272644, "learning_rate": 3e-06, "loss": 0.0173, "step": 3802 }, { "clip_ratio": 0.00027573530678637326, "epoch": 0.010558081943819788, "grad_norm": 0.056576959788799286, "kl": 0.20837552845478058, "learning_rate": 3e-06, "loss": 0.0172, "step": 3803 }, { "clip_ratio": 0.0017675926210358739, "epoch": 0.010560858194659604, "grad_norm": 0.06516698002815247, "kl": 0.20664474368095398, "learning_rate": 3e-06, "loss": 0.0178, "step": 3804 }, { "clip_ratio": 0.0011363636003807187, "completion_length": 207.06250762939453, "epoch": 0.01056363444549942, "grad_norm": 0.10948531329631805, "kl": 0.21073997020721436, "learning_rate": 3e-06, "loss": 0.0313, "reward": 0.2875000163912773, "reward_std": 0.28183096647262573, "rewards/countdown_reward_func": 0.2875000014901161, "step": 3805, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00017605633183848113, "epoch": 0.010566410696339236, "grad_norm": 0.08565101027488708, "kl": 0.20766940712928772, "learning_rate": 3e-06, "loss": 0.0328, "step": 3806 }, { "clip_ratio": 0.00010296540131093934, "epoch": 0.010569186947179052, "grad_norm": 0.0737881287932396, "kl": 0.21920731663703918, "learning_rate": 3e-06, "loss": 0.033, "step": 3807 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.010571963198018867, "grad_norm": 0.08128449320793152, "kl": 0.21976015716791153, "learning_rate": 3e-06, "loss": 0.0325, "step": 3808 }, { "clip_ratio": 0.000697836687322706, "epoch": 0.010574739448858683, "grad_norm": 0.07308835536241531, "kl": 0.20381668210029602, "learning_rate": 3e-06, "loss": 0.0313, "step": 3809 }, { "clip_ratio": 0.0008079874023678713, "epoch": 0.0105775156996985, "grad_norm": 0.15342998504638672, "kl": 0.2025071382522583, "learning_rate": 3e-06, "loss": 0.0326, "step": 3810 }, { "clip_ratio": 0.0009868037959677167, "epoch": 0.010580291950538315, "grad_norm": 0.10091787576675415, "kl": 0.2172391563653946, "learning_rate": 3e-06, "loss": 0.0312, "step": 3811 }, { "clip_ratio": 9.342301927972585e-05, "epoch": 0.01058306820137813, "grad_norm": 0.10047537833452225, "kl": 0.21760693192481995, "learning_rate": 3e-06, "loss": 0.0321, "step": 3812 }, { "clip_ratio": 0.00010296540131093934, "epoch": 0.010585844452217948, "grad_norm": 0.07560648769140244, "kl": 0.23006898909807205, "learning_rate": 3e-06, "loss": 0.0319, "step": 3813 }, { "clip_ratio": 9.15080527192913e-05, "epoch": 0.010588620703057763, "grad_norm": 0.08520617336034775, "kl": 0.23508140444755554, "learning_rate": 3e-06, "loss": 0.0311, "step": 3814 }, { "clip_ratio": 0.00043614793685264885, "epoch": 0.010591396953897578, "grad_norm": 0.07470186054706573, "kl": 0.21669793128967285, "learning_rate": 3e-06, "loss": 0.0307, "step": 3815 }, { "clip_ratio": 0.0004575402708724141, "epoch": 0.010594173204737394, "grad_norm": 0.07896378636360168, "kl": 0.21831078827381134, "learning_rate": 3e-06, "loss": 0.031, "step": 3816 }, { "clip_ratio": 8.741259080125019e-05, "completion_length": 218.0416717529297, "epoch": 0.010596949455577211, "grad_norm": 0.0713590756058693, "kl": 0.23243780434131622, "learning_rate": 3e-06, "loss": 0.0273, "reward": 0.26875001192092896, "reward_std": 0.2359030358493328, "rewards/countdown_reward_func": 0.26874999701976776, "step": 3817, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00018787833687383682, "epoch": 0.010599725706417026, "grad_norm": 0.08793027698993683, "kl": 0.23612533509731293, "learning_rate": 3e-06, "loss": 0.0274, "step": 3818 }, { "clip_ratio": 0.00020999583648517728, "epoch": 0.010602501957256842, "grad_norm": 0.06450904160737991, "kl": 0.24309448897838593, "learning_rate": 3e-06, "loss": 0.0279, "step": 3819 }, { "clip_ratio": 0.0003825862513622269, "epoch": 0.010605278208096657, "grad_norm": 0.06632856279611588, "kl": 0.23876655101776123, "learning_rate": 3e-06, "loss": 0.0274, "step": 3820 }, { "clip_ratio": 8.283631905214861e-05, "epoch": 0.010608054458936474, "grad_norm": 0.08156977593898773, "kl": 0.25810038298368454, "learning_rate": 3e-06, "loss": 0.0276, "step": 3821 }, { "clip_ratio": 0.00017482518160250038, "epoch": 0.01061083070977629, "grad_norm": 0.06482226401567459, "kl": 0.24448567628860474, "learning_rate": 3e-06, "loss": 0.0267, "step": 3822 }, { "clip_ratio": 8.741259080125019e-05, "epoch": 0.010613606960616105, "grad_norm": 0.06589393317699432, "kl": 0.25178758800029755, "learning_rate": 3e-06, "loss": 0.0269, "step": 3823 }, { "clip_ratio": 0.0001050420178216882, "epoch": 0.010616383211455922, "grad_norm": 0.08210726827383041, "kl": 0.25378893315792084, "learning_rate": 3e-06, "loss": 0.0264, "step": 3824 }, { "clip_ratio": 0.0001050420178216882, "epoch": 0.010619159462295738, "grad_norm": 0.06439068168401718, "kl": 0.2575261890888214, "learning_rate": 3e-06, "loss": 0.0275, "step": 3825 }, { "clip_ratio": 0.0, "epoch": 0.010621935713135553, "grad_norm": 0.06325279921293259, "kl": 0.25284677743911743, "learning_rate": 3e-06, "loss": 0.0266, "step": 3826 }, { "clip_ratio": 0.0, "epoch": 0.010624711963975369, "grad_norm": 0.08113355189561844, "kl": 0.2695206254720688, "learning_rate": 3e-06, "loss": 0.0267, "step": 3827 }, { "clip_ratio": 0.0, "epoch": 0.010627488214815186, "grad_norm": 0.06280352920293808, "kl": 0.25728270411491394, "learning_rate": 3e-06, "loss": 0.0265, "step": 3828 }, { "clip_ratio": 8.24538292363286e-05, "completion_length": 200.1666717529297, "epoch": 0.010630264465655001, "grad_norm": 0.15790845453739166, "kl": 0.239765927195549, "learning_rate": 3e-06, "loss": -0.0109, "reward": 0.26250001043081284, "reward_std": 0.23991143703460693, "rewards/countdown_reward_func": 0.26250001043081284, "step": 3829, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0002173315588152036, "epoch": 0.010633040716494816, "grad_norm": 0.142349511384964, "kl": 0.2577604800462723, "learning_rate": 3e-06, "loss": -0.0107, "step": 3830 }, { "clip_ratio": 9.238728671334684e-05, "epoch": 0.010635816967334632, "grad_norm": 0.1683511584997177, "kl": 0.24353055655956268, "learning_rate": 3e-06, "loss": -0.0111, "step": 3831 }, { "clip_ratio": 0.0002202493924414739, "epoch": 0.010638593218174449, "grad_norm": 0.1554105579853058, "kl": 0.23449350148439407, "learning_rate": 3e-06, "loss": -0.0125, "step": 3832 }, { "clip_ratio": 0.0, "epoch": 0.010641369469014264, "grad_norm": 0.12939006090164185, "kl": 0.24842225015163422, "learning_rate": 3e-06, "loss": -0.0119, "step": 3833 }, { "clip_ratio": 0.0001152073746197857, "epoch": 0.01064414571985408, "grad_norm": 0.1488330066204071, "kl": 0.22002588212490082, "learning_rate": 3e-06, "loss": -0.0132, "step": 3834 }, { "clip_ratio": 0.0, "epoch": 0.010646921970693897, "grad_norm": 0.14143894612789154, "kl": 0.20986420661211014, "learning_rate": 3e-06, "loss": -0.014, "step": 3835 }, { "clip_ratio": 0.0, "epoch": 0.010649698221533712, "grad_norm": 0.133919358253479, "kl": 0.2178514301776886, "learning_rate": 3e-06, "loss": -0.015, "step": 3836 }, { "clip_ratio": 0.0004173327179159969, "epoch": 0.010652474472373528, "grad_norm": 0.1501268744468689, "kl": 0.20108965039253235, "learning_rate": 3e-06, "loss": -0.0163, "step": 3837 }, { "clip_ratio": 0.000532540085259825, "epoch": 0.010655250723213343, "grad_norm": 0.14473530650138855, "kl": 0.1916651576757431, "learning_rate": 3e-06, "loss": -0.0182, "step": 3838 }, { "clip_ratio": 0.0001152073746197857, "epoch": 0.01065802697405316, "grad_norm": 0.11157862842082977, "kl": 0.20019195973873138, "learning_rate": 3e-06, "loss": -0.0165, "step": 3839 }, { "clip_ratio": 0.0009308508597314358, "epoch": 0.010660803224892976, "grad_norm": 0.13282249867916107, "kl": 0.17687267065048218, "learning_rate": 3e-06, "loss": -0.0194, "step": 3840 }, { "clip_ratio": 0.0, "completion_length": 214.95833587646484, "epoch": 0.010663579475732791, "grad_norm": 0.07404172420501709, "kl": 0.1851140707731247, "learning_rate": 3e-06, "loss": 0.0085, "reward": 0.18958334624767303, "reward_std": 0.17232364416122437, "rewards/countdown_reward_func": 0.18958333879709244, "step": 3841, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.010666355726572607, "grad_norm": 0.07915058732032776, "kl": 0.17473771423101425, "learning_rate": 3e-06, "loss": 0.0081, "step": 3842 }, { "clip_ratio": 8.827683632262051e-05, "epoch": 0.010669131977412424, "grad_norm": 0.05724462866783142, "kl": 0.17317424714565277, "learning_rate": 3e-06, "loss": 0.0085, "step": 3843 }, { "clip_ratio": 0.0, "epoch": 0.010671908228252239, "grad_norm": 0.061244748532772064, "kl": 0.17009971290826797, "learning_rate": 3e-06, "loss": 0.0079, "step": 3844 }, { "clip_ratio": 8.185985643649474e-05, "epoch": 0.010674684479092054, "grad_norm": 0.0693727508187294, "kl": 0.1650964468717575, "learning_rate": 3e-06, "loss": 0.009, "step": 3845 }, { "clip_ratio": 0.0, "epoch": 0.010677460729931872, "grad_norm": 0.05789566785097122, "kl": 0.15960584580898285, "learning_rate": 3e-06, "loss": 0.0085, "step": 3846 }, { "clip_ratio": 8.185985643649474e-05, "epoch": 0.010680236980771687, "grad_norm": 0.07884936034679413, "kl": 0.1593479886651039, "learning_rate": 3e-06, "loss": 0.0085, "step": 3847 }, { "clip_ratio": 0.0, "epoch": 0.010683013231611502, "grad_norm": 0.08162257075309753, "kl": 0.15295331925153732, "learning_rate": 3e-06, "loss": 0.0065, "step": 3848 }, { "clip_ratio": 0.0002643078987603076, "epoch": 0.010685789482451318, "grad_norm": 0.05219534784555435, "kl": 0.15442772954702377, "learning_rate": 3e-06, "loss": 0.0077, "step": 3849 }, { "clip_ratio": 0.0002758012851700187, "epoch": 0.010688565733291135, "grad_norm": 0.05918796733021736, "kl": 0.15235479176044464, "learning_rate": 3e-06, "loss": 0.0077, "step": 3850 }, { "clip_ratio": 0.0006115381693234667, "epoch": 0.01069134198413095, "grad_norm": 0.06151420250535011, "kl": 0.14982881397008896, "learning_rate": 3e-06, "loss": 0.0082, "step": 3851 }, { "clip_ratio": 0.0006608140538446605, "epoch": 0.010694118234970766, "grad_norm": 0.0600561797618866, "kl": 0.1477828323841095, "learning_rate": 3e-06, "loss": 0.0074, "step": 3852 }, { "clip_ratio": 0.0001724217290757224, "completion_length": 224.39583587646484, "epoch": 0.010696894485810581, "grad_norm": 0.061056043952703476, "kl": 0.13181962817907333, "learning_rate": 3e-06, "loss": 0.007, "reward": 0.30625002086162567, "reward_std": 0.2202121838927269, "rewards/countdown_reward_func": 0.30625002086162567, "step": 3853, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.00010945709072984755, "epoch": 0.010699670736650398, "grad_norm": 0.06505566835403442, "kl": 0.13258632272481918, "learning_rate": 3e-06, "loss": 0.0065, "step": 3854 }, { "clip_ratio": 0.0003394876912352629, "epoch": 0.010702446987490214, "grad_norm": 0.05664518475532532, "kl": 0.13887183368206024, "learning_rate": 3e-06, "loss": 0.007, "step": 3855 }, { "clip_ratio": 0.00010513036249903962, "epoch": 0.01070522323833003, "grad_norm": 0.06246483698487282, "kl": 0.14198800176382065, "learning_rate": 3e-06, "loss": 0.0075, "step": 3856 }, { "clip_ratio": 0.0004974807379767299, "epoch": 0.010707999489169846, "grad_norm": 0.06604951620101929, "kl": 0.13549138605594635, "learning_rate": 3e-06, "loss": 0.0067, "step": 3857 }, { "clip_ratio": 9.057971328729764e-05, "epoch": 0.010710775740009662, "grad_norm": 0.08635063469409943, "kl": 0.12836401909589767, "learning_rate": 3e-06, "loss": 0.0064, "step": 3858 }, { "clip_ratio": 0.0002533401420805603, "epoch": 0.010713551990849477, "grad_norm": 0.06786448508501053, "kl": 0.12401057034730911, "learning_rate": 3e-06, "loss": 0.0057, "step": 3859 }, { "clip_ratio": 0.0002906165173044428, "epoch": 0.010716328241689293, "grad_norm": 0.06174307316541672, "kl": 0.12428383529186249, "learning_rate": 3e-06, "loss": 0.006, "step": 3860 }, { "clip_ratio": 0.0004450079068192281, "epoch": 0.01071910449252911, "grad_norm": 0.06032763049006462, "kl": 0.13097497820854187, "learning_rate": 3e-06, "loss": 0.0059, "step": 3861 }, { "clip_ratio": 0.00026828062254935503, "epoch": 0.010721880743368925, "grad_norm": 0.07023709267377853, "kl": 0.13595640659332275, "learning_rate": 3e-06, "loss": 0.0063, "step": 3862 }, { "clip_ratio": 0.0005076038796687499, "epoch": 0.01072465699420874, "grad_norm": 0.06951382011175156, "kl": 0.13012224435806274, "learning_rate": 3e-06, "loss": 0.0059, "step": 3863 }, { "clip_ratio": 0.00019571007578633726, "epoch": 0.010727433245048556, "grad_norm": 0.062258508056402206, "kl": 0.1215830147266388, "learning_rate": 3e-06, "loss": 0.0059, "step": 3864 }, { "clip_ratio": 0.000163612567121163, "completion_length": 228.0416717529297, "epoch": 0.010730209495888373, "grad_norm": 0.14154118299484253, "kl": 0.12261790782213211, "learning_rate": 3e-06, "loss": 0.035, "reward": 0.4541667103767395, "reward_std": 0.4214244484901428, "rewards/countdown_reward_func": 0.4541666954755783, "step": 3865, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.0003824733867077157, "epoch": 0.010732985746728188, "grad_norm": 0.09322069585323334, "kl": 0.1367412805557251, "learning_rate": 3e-06, "loss": 0.0366, "step": 3866 }, { "clip_ratio": 0.0, "epoch": 0.010735761997568004, "grad_norm": 0.170647531747818, "kl": 0.12093224376440048, "learning_rate": 3e-06, "loss": 0.0359, "step": 3867 }, { "clip_ratio": 0.0002454188361298293, "epoch": 0.010738538248407821, "grad_norm": 0.1056622564792633, "kl": 0.12119468674063683, "learning_rate": 3e-06, "loss": 0.0357, "step": 3868 }, { "clip_ratio": 8.272667037090287e-05, "epoch": 0.010741314499247636, "grad_norm": 0.10576245933771133, "kl": 0.1259239763021469, "learning_rate": 3e-06, "loss": 0.0358, "step": 3869 }, { "clip_ratio": 0.00010129659494850785, "epoch": 0.010744090750087452, "grad_norm": 0.10117069631814957, "kl": 0.12670771777629852, "learning_rate": 3e-06, "loss": 0.0352, "step": 3870 }, { "clip_ratio": 0.0002964426821563393, "epoch": 0.010746867000927267, "grad_norm": 0.1384722888469696, "kl": 0.12339812517166138, "learning_rate": 3e-06, "loss": 0.0338, "step": 3871 }, { "clip_ratio": 0.0001882276774267666, "epoch": 0.010749643251767084, "grad_norm": 0.08846452087163925, "kl": 0.13836079835891724, "learning_rate": 3e-06, "loss": 0.0349, "step": 3872 }, { "clip_ratio": 0.0, "epoch": 0.0107524195026069, "grad_norm": 0.14437542855739594, "kl": 0.12357539683580399, "learning_rate": 3e-06, "loss": 0.0333, "step": 3873 }, { "clip_ratio": 0.00018790040485328063, "epoch": 0.010755195753446715, "grad_norm": 0.11370283365249634, "kl": 0.12542566657066345, "learning_rate": 3e-06, "loss": 0.0327, "step": 3874 }, { "clip_ratio": 0.00018484493193682283, "epoch": 0.01075797200428653, "grad_norm": 0.10147980600595474, "kl": 0.1327906772494316, "learning_rate": 3e-06, "loss": 0.0333, "step": 3875 }, { "clip_ratio": 9.881422738544643e-05, "epoch": 0.010760748255126348, "grad_norm": 0.09310440719127655, "kl": 0.13527791947126389, "learning_rate": 3e-06, "loss": 0.0327, "step": 3876 }, { "clip_ratio": 0.0, "completion_length": 227.87500762939453, "epoch": 0.010763524505966163, "grad_norm": 0.0789758563041687, "kl": 0.1478492096066475, "learning_rate": 3e-06, "loss": 0.0124, "reward": 0.2666666731238365, "reward_std": 0.23236336559057236, "rewards/countdown_reward_func": 0.2666666731238365, "step": 3877, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.010766300756805978, "grad_norm": 0.06837975233793259, "kl": 0.15891195833683014, "learning_rate": 3e-06, "loss": 0.0126, "step": 3878 }, { "clip_ratio": 0.0, "epoch": 0.010769077007645796, "grad_norm": 0.06978116184473038, "kl": 0.15026108175516129, "learning_rate": 3e-06, "loss": 0.0132, "step": 3879 }, { "clip_ratio": 9.349289757665247e-05, "epoch": 0.010771853258485611, "grad_norm": 0.058400485664606094, "kl": 0.15256555378437042, "learning_rate": 3e-06, "loss": 0.0125, "step": 3880 }, { "clip_ratio": 0.0001818181772250682, "epoch": 0.010774629509325426, "grad_norm": 0.05994013324379921, "kl": 0.15840592980384827, "learning_rate": 3e-06, "loss": 0.0128, "step": 3881 }, { "clip_ratio": 9.238728671334684e-05, "epoch": 0.010777405760165242, "grad_norm": 0.07151427865028381, "kl": 0.16682183742523193, "learning_rate": 3e-06, "loss": 0.0129, "step": 3882 }, { "clip_ratio": 0.0001645819575060159, "epoch": 0.010780182011005059, "grad_norm": 0.06882087141275406, "kl": 0.15999305993318558, "learning_rate": 3e-06, "loss": 0.0125, "step": 3883 }, { "clip_ratio": 0.00018477457342669368, "epoch": 0.010782958261844874, "grad_norm": 0.062174707651138306, "kl": 0.16916058212518692, "learning_rate": 3e-06, "loss": 0.0115, "step": 3884 }, { "clip_ratio": 0.0, "epoch": 0.01078573451268469, "grad_norm": 0.0812239795923233, "kl": 0.16087709367275238, "learning_rate": 3e-06, "loss": 0.012, "step": 3885 }, { "clip_ratio": 0.0, "epoch": 0.010788510763524505, "grad_norm": 0.051038846373558044, "kl": 0.1612291932106018, "learning_rate": 3e-06, "loss": 0.0113, "step": 3886 }, { "clip_ratio": 0.0, "epoch": 0.010791287014364322, "grad_norm": 0.058411333709955215, "kl": 0.1676073521375656, "learning_rate": 3e-06, "loss": 0.0117, "step": 3887 }, { "clip_ratio": 0.0001786363500286825, "epoch": 0.010794063265204138, "grad_norm": 0.06501974165439606, "kl": 0.1733514815568924, "learning_rate": 3e-06, "loss": 0.0125, "step": 3888 }, { "clip_ratio": 8.138021075865254e-05, "completion_length": 240.20833587646484, "epoch": 0.010796839516043953, "grad_norm": 0.0743788480758667, "kl": 0.14464589208364487, "learning_rate": 3e-06, "loss": 0.0031, "reward": 0.3604166805744171, "reward_std": 0.243092592805624, "rewards/countdown_reward_func": 0.3604166507720947, "step": 3889, "zero_std_ratio": 0.375 }, { "clip_ratio": 9.286775457439944e-05, "epoch": 0.01079961576688377, "grad_norm": 0.06514990329742432, "kl": 0.15557913482189178, "learning_rate": 3e-06, "loss": 0.0033, "step": 3890 }, { "clip_ratio": 0.00010390690295025706, "epoch": 0.010802392017723586, "grad_norm": 0.07818073034286499, "kl": 0.13804951310157776, "learning_rate": 3e-06, "loss": 0.0026, "step": 3891 }, { "clip_ratio": 0.00024983345065265894, "epoch": 0.010805168268563401, "grad_norm": 0.09265992045402527, "kl": 0.14229778945446014, "learning_rate": 3e-06, "loss": 0.0025, "step": 3892 }, { "clip_ratio": 0.00018573550914879888, "epoch": 0.010807944519403217, "grad_norm": 0.08459020406007767, "kl": 0.13693518191576004, "learning_rate": 3e-06, "loss": 0.0021, "step": 3893 }, { "clip_ratio": 0.0003340085386298597, "epoch": 0.010810720770243034, "grad_norm": 0.081746406853199, "kl": 0.144972525537014, "learning_rate": 3e-06, "loss": 0.0024, "step": 3894 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010813497021082849, "grad_norm": 0.0960216075181961, "kl": 0.1413847878575325, "learning_rate": 3e-06, "loss": 0.0021, "step": 3895 }, { "clip_ratio": 0.0, "epoch": 0.010816273271922664, "grad_norm": 0.06882328540086746, "kl": 0.1520405262708664, "learning_rate": 3e-06, "loss": 0.0025, "step": 3896 }, { "clip_ratio": 0.0001665556337684393, "epoch": 0.01081904952276248, "grad_norm": 0.07881037890911102, "kl": 0.13445325195789337, "learning_rate": 3e-06, "loss": 0.0011, "step": 3897 }, { "clip_ratio": 0.0001665556337684393, "epoch": 0.010821825773602297, "grad_norm": 0.08665142208337784, "kl": 0.137092687189579, "learning_rate": 3e-06, "loss": 0.001, "step": 3898 }, { "clip_ratio": 0.00018573550914879888, "epoch": 0.010824602024442112, "grad_norm": 0.08987218141555786, "kl": 0.13117504864931107, "learning_rate": 3e-06, "loss": 0.0007, "step": 3899 }, { "clip_ratio": 0.0003340085386298597, "epoch": 0.010827378275281928, "grad_norm": 0.08218669891357422, "kl": 0.13707150518894196, "learning_rate": 3e-06, "loss": 0.001, "step": 3900 }, { "clip_ratio": 0.0, "completion_length": 216.2916717529297, "epoch": 0.010830154526121745, "grad_norm": 0.07107607275247574, "kl": 0.1364014446735382, "learning_rate": 3e-06, "loss": 0.007, "reward": 0.24791669100522995, "reward_std": 0.25994232296943665, "rewards/countdown_reward_func": 0.24791667610406876, "step": 3901, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.01083293077696156, "grad_norm": 0.06697147339582443, "kl": 0.13636207580566406, "learning_rate": 3e-06, "loss": 0.0069, "step": 3902 }, { "clip_ratio": 0.0, "epoch": 0.010835707027801376, "grad_norm": 0.07514923810958862, "kl": 0.13531578332185745, "learning_rate": 3e-06, "loss": 0.0072, "step": 3903 }, { "clip_ratio": 0.00026642831653589383, "epoch": 0.010838483278641191, "grad_norm": 0.09162987023591995, "kl": 0.13396459072828293, "learning_rate": 3e-06, "loss": 0.0062, "step": 3904 }, { "clip_ratio": 9.13075273274444e-05, "epoch": 0.010841259529481008, "grad_norm": 0.06672176718711853, "kl": 0.13935266435146332, "learning_rate": 3e-06, "loss": 0.007, "step": 3905 }, { "clip_ratio": 0.0, "epoch": 0.010844035780320824, "grad_norm": 0.08831381797790527, "kl": 0.12978626042604446, "learning_rate": 3e-06, "loss": 0.0064, "step": 3906 }, { "clip_ratio": 0.00019484231597743928, "epoch": 0.010846812031160639, "grad_norm": 0.07163853943347931, "kl": 0.1308143399655819, "learning_rate": 3e-06, "loss": 0.0058, "step": 3907 }, { "clip_ratio": 0.0006286058633122593, "epoch": 0.010849588282000455, "grad_norm": 0.0692080482840538, "kl": 0.13222164288163185, "learning_rate": 3e-06, "loss": 0.0062, "step": 3908 }, { "clip_ratio": 0.0, "epoch": 0.010852364532840272, "grad_norm": 0.07337203621864319, "kl": 0.12936417013406754, "learning_rate": 3e-06, "loss": 0.0066, "step": 3909 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010855140783680087, "grad_norm": 0.10020852833986282, "kl": 0.1290852054953575, "learning_rate": 3e-06, "loss": 0.0056, "step": 3910 }, { "clip_ratio": 0.00017867152928374708, "epoch": 0.010857917034519902, "grad_norm": 0.0748172178864479, "kl": 0.13483864814043045, "learning_rate": 3e-06, "loss": 0.0062, "step": 3911 }, { "clip_ratio": 0.0, "epoch": 0.01086069328535972, "grad_norm": 0.08579611778259277, "kl": 0.12586190551519394, "learning_rate": 3e-06, "loss": 0.0056, "step": 3912 }, { "clip_ratio": 9.689922444522381e-05, "completion_length": 235.0625, "epoch": 0.010863469536199535, "grad_norm": 0.08423938602209091, "kl": 0.13688649237155914, "learning_rate": 3e-06, "loss": 0.0305, "reward": 0.22708334028720856, "reward_std": 0.2304183915257454, "rewards/countdown_reward_func": 0.22708333283662796, "step": 3913, "zero_std_ratio": 0.25 }, { "clip_ratio": 0.0, "epoch": 0.01086624578703935, "grad_norm": 0.06947766244411469, "kl": 0.13631176203489304, "learning_rate": 3e-06, "loss": 0.0306, "step": 3914 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010869022037879166, "grad_norm": 0.07213205099105835, "kl": 0.13915787637233734, "learning_rate": 3e-06, "loss": 0.0304, "step": 3915 }, { "clip_ratio": 0.0, "epoch": 0.010871798288718983, "grad_norm": 0.0842897817492485, "kl": 0.13428127020597458, "learning_rate": 3e-06, "loss": 0.0299, "step": 3916 }, { "clip_ratio": 0.0, "epoch": 0.010874574539558798, "grad_norm": 0.08177607506513596, "kl": 0.1366531103849411, "learning_rate": 3e-06, "loss": 0.0297, "step": 3917 }, { "clip_ratio": 0.00016276042151730508, "epoch": 0.010877350790398614, "grad_norm": 0.07037891447544098, "kl": 0.13778027892112732, "learning_rate": 3e-06, "loss": 0.0297, "step": 3918 }, { "clip_ratio": 0.0, "epoch": 0.01088012704123843, "grad_norm": 0.0744611993432045, "kl": 0.13939858973026276, "learning_rate": 3e-06, "loss": 0.0294, "step": 3919 }, { "clip_ratio": 0.0, "epoch": 0.010882903292078246, "grad_norm": 0.07017367333173752, "kl": 0.14066072553396225, "learning_rate": 3e-06, "loss": 0.0294, "step": 3920 }, { "clip_ratio": 0.0, "epoch": 0.010885679542918062, "grad_norm": 0.09415730088949203, "kl": 0.14238177984952927, "learning_rate": 3e-06, "loss": 0.0293, "step": 3921 }, { "clip_ratio": 0.00018418973195366561, "epoch": 0.010888455793757877, "grad_norm": 0.0937318280339241, "kl": 0.13934161514043808, "learning_rate": 3e-06, "loss": 0.0285, "step": 3922 }, { "clip_ratio": 0.0, "epoch": 0.010891232044597694, "grad_norm": 0.07316996157169342, "kl": 0.14452099800109863, "learning_rate": 3e-06, "loss": 0.0283, "step": 3923 }, { "clip_ratio": 0.0002717403694987297, "epoch": 0.01089400829543751, "grad_norm": 0.07191247493028641, "kl": 0.14651203155517578, "learning_rate": 3e-06, "loss": 0.0295, "step": 3924 }, { "clip_ratio": 0.00016276042151730508, "completion_length": 226.87500762939453, "epoch": 0.010896784546277325, "grad_norm": 0.10965219885110855, "kl": 0.15437418967485428, "learning_rate": 3e-06, "loss": 0.0008, "reward": 0.5291666984558105, "reward_std": 0.46913227438926697, "rewards/countdown_reward_func": 0.5291666686534882, "step": 3925, "zero_std_ratio": 0.0 }, { "clip_ratio": 0.0001809492241591215, "epoch": 0.01089956079711714, "grad_norm": 0.11571619659662247, "kl": 0.15728124976158142, "learning_rate": 3e-06, "loss": 0.0009, "step": 3926 }, { "clip_ratio": 9.897070412989706e-05, "epoch": 0.010902337047956958, "grad_norm": 0.11320040374994278, "kl": 0.15982800722122192, "learning_rate": 3e-06, "loss": 0.0022, "step": 3927 }, { "clip_ratio": 0.00016657958622090518, "epoch": 0.010905113298796773, "grad_norm": 0.1336049884557724, "kl": 0.162838414311409, "learning_rate": 3e-06, "loss": 0.0015, "step": 3928 }, { "clip_ratio": 0.0, "epoch": 0.010907889549636588, "grad_norm": 0.11056411266326904, "kl": 0.16444478929042816, "learning_rate": 3e-06, "loss": 0.001, "step": 3929 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010910665800476404, "grad_norm": 0.11216159909963608, "kl": 0.16198892146348953, "learning_rate": 3e-06, "loss": 0.0011, "step": 3930 }, { "clip_ratio": 9.897070412989706e-05, "epoch": 0.010913442051316221, "grad_norm": 0.10605109483003616, "kl": 0.16194820404052734, "learning_rate": 3e-06, "loss": 0.0003, "step": 3931 }, { "clip_ratio": 9.897070412989706e-05, "epoch": 0.010916218302156036, "grad_norm": 0.12103704363107681, "kl": 0.16279541701078415, "learning_rate": 3e-06, "loss": -0.0003, "step": 3932 }, { "clip_ratio": 0.0001901450305012986, "epoch": 0.010918994552995852, "grad_norm": 0.12349989265203476, "kl": 0.165398471057415, "learning_rate": 3e-06, "loss": -0.0001, "step": 3933 }, { "clip_ratio": 0.0003327630620333366, "epoch": 0.010921770803835669, "grad_norm": 0.14619022607803345, "kl": 0.16759123653173447, "learning_rate": 3e-06, "loss": -0.0007, "step": 3934 }, { "clip_ratio": 0.00016960651555564255, "epoch": 0.010924547054675484, "grad_norm": 0.11500994861125946, "kl": 0.17032650113105774, "learning_rate": 3e-06, "loss": -0.0013, "step": 3935 }, { "clip_ratio": 0.00018035090761259198, "epoch": 0.0109273233055153, "grad_norm": 0.10531459748744965, "kl": 0.16395649313926697, "learning_rate": 3e-06, "loss": -0.0008, "step": 3936 }, { "clip_ratio": 0.0, "completion_length": 225.9375, "epoch": 0.010930099556355115, "grad_norm": 0.07268326729536057, "kl": 0.16464074701070786, "learning_rate": 3e-06, "loss": 0.023, "reward": 0.2666666731238365, "reward_std": 0.22531529515981674, "rewards/countdown_reward_func": 0.2666666731238365, "step": 3937, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.00018623018695507199, "epoch": 0.010932875807194932, "grad_norm": 0.06680519878864288, "kl": 0.1632915586233139, "learning_rate": 3e-06, "loss": 0.0237, "step": 3938 }, { "clip_ratio": 0.0003694338010973297, "epoch": 0.010935652058034748, "grad_norm": 0.07507689297199249, "kl": 0.1571510210633278, "learning_rate": 3e-06, "loss": 0.0228, "step": 3939 }, { "clip_ratio": 0.0002519474510336295, "epoch": 0.010938428308874563, "grad_norm": 0.07229724526405334, "kl": 0.15701914578676224, "learning_rate": 3e-06, "loss": 0.023, "step": 3940 }, { "clip_ratio": 0.0001801349935703911, "epoch": 0.010941204559714379, "grad_norm": 0.14276354014873505, "kl": 0.15372420847415924, "learning_rate": 3e-06, "loss": 0.0222, "step": 3941 }, { "clip_ratio": 0.0003685771589516662, "epoch": 0.010943980810554196, "grad_norm": 0.07825231552124023, "kl": 0.14864331483840942, "learning_rate": 3e-06, "loss": 0.0219, "step": 3942 }, { "clip_ratio": 0.0003582022254704498, "epoch": 0.010946757061394011, "grad_norm": 0.07013280689716339, "kl": 0.16708237677812576, "learning_rate": 3e-06, "loss": 0.0219, "step": 3943 }, { "clip_ratio": 0.0004395824362291023, "epoch": 0.010949533312233826, "grad_norm": 0.06309743225574493, "kl": 0.16635598987340927, "learning_rate": 3e-06, "loss": 0.0228, "step": 3944 }, { "clip_ratio": 0.0006040066582500003, "epoch": 0.010952309563073644, "grad_norm": 0.07390635460615158, "kl": 0.1595904305577278, "learning_rate": 3e-06, "loss": 0.0218, "step": 3945 }, { "clip_ratio": 0.0007201871194411069, "epoch": 0.010955085813913459, "grad_norm": 0.0704895630478859, "kl": 0.1592741757631302, "learning_rate": 3e-06, "loss": 0.0219, "step": 3946 }, { "clip_ratio": 0.00027591593243414536, "epoch": 0.010957862064753274, "grad_norm": 0.11689643561840057, "kl": 0.1568462774157524, "learning_rate": 3e-06, "loss": 0.0214, "step": 3947 }, { "clip_ratio": 0.0008281506015919149, "epoch": 0.01096063831559309, "grad_norm": 0.08274061977863312, "kl": 0.1527530774474144, "learning_rate": 3e-06, "loss": 0.0222, "step": 3948 }, { "clip_ratio": 0.0, "completion_length": 215.20833587646484, "epoch": 0.010963414566432907, "grad_norm": 0.07381125539541245, "kl": 0.1581275835633278, "learning_rate": 3e-06, "loss": 0.0164, "reward": 0.397916704416275, "reward_std": 0.30087657272815704, "rewards/countdown_reward_func": 0.397916704416275, "step": 3949, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010966190817272722, "grad_norm": 0.07644526660442352, "kl": 0.1597534418106079, "learning_rate": 3e-06, "loss": 0.016, "step": 3950 }, { "clip_ratio": 0.0005517470926861279, "epoch": 0.010968967068112538, "grad_norm": 0.10783946514129639, "kl": 0.15697012096643448, "learning_rate": 3e-06, "loss": 0.0166, "step": 3951 }, { "clip_ratio": 0.0, "epoch": 0.010971743318952353, "grad_norm": 0.07541754841804504, "kl": 0.16189389675855637, "learning_rate": 3e-06, "loss": 0.0163, "step": 3952 }, { "clip_ratio": 0.0002608489812701009, "epoch": 0.01097451956979217, "grad_norm": 0.08128286898136139, "kl": 0.16703475266695023, "learning_rate": 3e-06, "loss": 0.0162, "step": 3953 }, { "clip_ratio": 0.00024254612799268216, "epoch": 0.010977295820631986, "grad_norm": 0.07985997200012207, "kl": 0.15401344746351242, "learning_rate": 3e-06, "loss": 0.0149, "step": 3954 }, { "clip_ratio": 0.00030562348547391593, "epoch": 0.010980072071471801, "grad_norm": 0.07999222725629807, "kl": 0.15990176796913147, "learning_rate": 3e-06, "loss": 0.0153, "step": 3955 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010982848322311618, "grad_norm": 0.08383301645517349, "kl": 0.16068574041128159, "learning_rate": 3e-06, "loss": 0.0151, "step": 3956 }, { "clip_ratio": 0.0004341553649283014, "epoch": 0.010985624573151434, "grad_norm": 0.10838162153959274, "kl": 0.15964771807193756, "learning_rate": 3e-06, "loss": 0.0157, "step": 3957 }, { "clip_ratio": 0.00017985611339099705, "epoch": 0.010988400823991249, "grad_norm": 0.08186893910169601, "kl": 0.16316110640764236, "learning_rate": 3e-06, "loss": 0.0151, "step": 3958 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.010991177074831065, "grad_norm": 0.09029525518417358, "kl": 0.16957978159189224, "learning_rate": 3e-06, "loss": 0.0145, "step": 3959 }, { "clip_ratio": 0.00017111460329033434, "epoch": 0.010993953325670882, "grad_norm": 0.11054430902004242, "kl": 0.15555671602487564, "learning_rate": 3e-06, "loss": 0.0148, "step": 3960 }, { "clip_ratio": 0.00018704377725953236, "completion_length": 234.9791717529297, "epoch": 0.010996729576510697, "grad_norm": 0.07871285080909729, "kl": 0.14937568455934525, "learning_rate": 3e-06, "loss": 0.0035, "reward": 0.2875000312924385, "reward_std": 0.27318818494677544, "rewards/countdown_reward_func": 0.2875000312924385, "step": 3961, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0, "epoch": 0.010999505827350512, "grad_norm": 0.07785376906394958, "kl": 0.15198642760515213, "learning_rate": 3e-06, "loss": 0.0042, "step": 3962 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.011002282078190328, "grad_norm": 0.08732501417398453, "kl": 0.15406041592359543, "learning_rate": 3e-06, "loss": 0.0023, "step": 3963 }, { "clip_ratio": 0.00010566356650087982, "epoch": 0.011005058329030145, "grad_norm": 0.09631951153278351, "kl": 0.14392270147800446, "learning_rate": 3e-06, "loss": 0.0029, "step": 3964 }, { "clip_ratio": 0.0003390650817891583, "epoch": 0.01100783457986996, "grad_norm": 0.1050824522972107, "kl": 0.1541462168097496, "learning_rate": 3e-06, "loss": 0.0034, "step": 3965 }, { "clip_ratio": 0.00010566356650087982, "epoch": 0.011010610830709776, "grad_norm": 0.08569294959306717, "kl": 0.14983107894659042, "learning_rate": 3e-06, "loss": 0.0023, "step": 3966 }, { "clip_ratio": 0.000691078239469789, "epoch": 0.011013387081549593, "grad_norm": 0.07939525693655014, "kl": 0.14374879747629166, "learning_rate": 3e-06, "loss": 0.002, "step": 3967 }, { "clip_ratio": 0.000244140625, "epoch": 0.011016163332389408, "grad_norm": 0.07721323519945145, "kl": 0.14837448298931122, "learning_rate": 3e-06, "loss": 0.0022, "step": 3968 }, { "clip_ratio": 0.0002509127516532317, "epoch": 0.011018939583229224, "grad_norm": 0.1131884902715683, "kl": 0.14939847588539124, "learning_rate": 3e-06, "loss": 0.0008, "step": 3969 }, { "clip_ratio": 0.00018704377725953236, "epoch": 0.01102171583406904, "grad_norm": 0.09274110943078995, "kl": 0.13957054913043976, "learning_rate": 3e-06, "loss": 0.0018, "step": 3970 }, { "clip_ratio": 0.0006851391517557204, "epoch": 0.011024492084908856, "grad_norm": 0.10548820346593857, "kl": 0.14772838354110718, "learning_rate": 3e-06, "loss": 0.0023, "step": 3971 }, { "clip_ratio": 0.00035237295378465205, "epoch": 0.011027268335748672, "grad_norm": 0.08493470400571823, "kl": 0.1449984833598137, "learning_rate": 3e-06, "loss": 0.0009, "step": 3972 }, { "clip_ratio": 0.00019574453472159803, "completion_length": 214.18750762939453, "epoch": 0.011030044586588487, "grad_norm": 0.07708398252725601, "kl": 0.14360886812210083, "learning_rate": 3e-06, "loss": 0.0159, "reward": 0.4187500476837158, "reward_std": 0.2696641534566879, "rewards/countdown_reward_func": 0.41875001788139343, "step": 3973, "zero_std_ratio": 0.375 }, { "clip_ratio": 8.272667037090287e-05, "epoch": 0.011032820837428303, "grad_norm": 0.08697324991226196, "kl": 0.14602020382881165, "learning_rate": 3e-06, "loss": 0.0162, "step": 3974 }, { "clip_ratio": 0.00010322048183297738, "epoch": 0.01103559708826812, "grad_norm": 0.08354435116052628, "kl": 0.14531078934669495, "learning_rate": 3e-06, "loss": 0.017, "step": 3975 }, { "clip_ratio": 0.0, "epoch": 0.011038373339107935, "grad_norm": 0.08271390199661255, "kl": 0.1384219452738762, "learning_rate": 3e-06, "loss": 0.0159, "step": 3976 }, { "clip_ratio": 0.00038407426472986117, "epoch": 0.01104114958994775, "grad_norm": 0.07351138442754745, "kl": 0.14436528831720352, "learning_rate": 3e-06, "loss": 0.0161, "step": 3977 }, { "clip_ratio": 0.00029020178044447675, "epoch": 0.011043925840787568, "grad_norm": 0.0882362499833107, "kl": 0.13983634114265442, "learning_rate": 3e-06, "loss": 0.0158, "step": 3978 }, { "clip_ratio": 0.00030172908009262756, "epoch": 0.011046702091627383, "grad_norm": 0.07803701609373093, "kl": 0.14247574657201767, "learning_rate": 3e-06, "loss": 0.0154, "step": 3979 }, { "clip_ratio": 0.00018732918397290632, "epoch": 0.011049478342467198, "grad_norm": 0.08534128218889236, "kl": 0.14510250836610794, "learning_rate": 3e-06, "loss": 0.0153, "step": 3980 }, { "clip_ratio": 0.0, "epoch": 0.011052254593307014, "grad_norm": 0.07797124981880188, "kl": 0.1448824480175972, "learning_rate": 3e-06, "loss": 0.0157, "step": 3981 }, { "clip_ratio": 0.00031380754080601037, "epoch": 0.011055030844146831, "grad_norm": 0.08957452327013016, "kl": 0.13767676800489426, "learning_rate": 3e-06, "loss": 0.0147, "step": 3982 }, { "clip_ratio": 0.00017486924480181187, "epoch": 0.011057807094986646, "grad_norm": 0.08202137053012848, "kl": 0.1451864391565323, "learning_rate": 3e-06, "loss": 0.0149, "step": 3983 }, { "clip_ratio": 0.00019774290558416396, "epoch": 0.011060583345826462, "grad_norm": 0.07562907040119171, "kl": 0.13992220908403397, "learning_rate": 3e-06, "loss": 0.0138, "step": 3984 }, { "clip_ratio": 0.00011160714348079637, "completion_length": 227.77083587646484, "epoch": 0.011063359596666277, "grad_norm": 0.08965593576431274, "kl": 0.14933496713638306, "learning_rate": 3e-06, "loss": -0.0006, "reward": 0.32500001788139343, "reward_std": 0.29752181470394135, "rewards/countdown_reward_func": 0.32499998807907104, "step": 3985, "zero_std_ratio": 0.375 }, { "clip_ratio": 0.0002469354949425906, "epoch": 0.011066135847506094, "grad_norm": 0.09366835653781891, "kl": 0.1415039673447609, "learning_rate": 3e-06, "loss": -0.0001, "step": 3986 }, { "clip_ratio": 0.0, "epoch": 0.01106891209834591, "grad_norm": 0.07634873688220978, "kl": 0.14010533690452576, "learning_rate": 3e-06, "loss": 0.0003, "step": 3987 }, { "clip_ratio": 9.865824540611356e-05, "epoch": 0.011071688349185725, "grad_norm": 0.07587511837482452, "kl": 0.14075452834367752, "learning_rate": 3e-06, "loss": -0.0002, "step": 3988 }, { "clip_ratio": 0.00020524010324152187, "epoch": 0.011074464600025542, "grad_norm": 0.09486761689186096, "kl": 0.13673031330108643, "learning_rate": 3e-06, "loss": 0.0001, "step": 3989 }, { "clip_ratio": 0.0, "epoch": 0.011077240850865358, "grad_norm": 0.11473195999860764, "kl": 0.14757098257541656, "learning_rate": 3e-06, "loss": 0.0003, "step": 3990 }, { "clip_ratio": 8.138021075865254e-05, "epoch": 0.011080017101705173, "grad_norm": 0.09273962676525116, "kl": 0.14637595415115356, "learning_rate": 3e-06, "loss": -0.0014, "step": 3991 }, { "clip_ratio": 0.00035389703407417983, "epoch": 0.011082793352544988, "grad_norm": 0.1004997044801712, "kl": 0.13867132365703583, "learning_rate": 3e-06, "loss": -0.0007, "step": 3992 }, { "clip_ratio": 9.865824540611356e-05, "epoch": 0.011085569603384806, "grad_norm": 0.07709572464227676, "kl": 0.13640530407428741, "learning_rate": 3e-06, "loss": -0.0008, "step": 3993 }, { "clip_ratio": 0.0003348214377183467, "epoch": 0.011088345854224621, "grad_norm": 0.08142408728599548, "kl": 0.13745896518230438, "learning_rate": 3e-06, "loss": -0.001, "step": 3994 }, { "clip_ratio": 0.0005026364233344793, "epoch": 0.011091122105064436, "grad_norm": 0.08481746912002563, "kl": 0.13326407223939896, "learning_rate": 3e-06, "loss": -0.0011, "step": 3995 }, { "clip_ratio": 0.000609725946560502, "epoch": 0.011093898355904252, "grad_norm": 0.09647399187088013, "kl": 0.14342287182807922, "learning_rate": 3e-06, "loss": -0.0012, "step": 3996 }, { "clip_ratio": 0.0002661047619767487, "completion_length": 218.39583587646484, "epoch": 0.011096674606744069, "grad_norm": 0.08994139730930328, "kl": 0.14984092116355896, "learning_rate": 3e-06, "loss": 0.0248, "reward": 0.3229166865348816, "reward_std": 0.24278182908892632, "rewards/countdown_reward_func": 0.3229166716337204, "step": 3997, "zero_std_ratio": 0.5 }, { "clip_ratio": 0.0, "epoch": 0.011099450857583884, "grad_norm": 0.08023153990507126, "kl": 0.15013036876916885, "learning_rate": 3e-06, "loss": 0.0246, "step": 3998 }, { "clip_ratio": 0.0005482456181198359, "epoch": 0.0111022271084237, "grad_norm": 0.0821889266371727, "kl": 0.15376033633947372, "learning_rate": 3e-06, "loss": 0.0248, "step": 3999 }, { "epoch": 0.011105003359263517, "grad_norm": 0.11552915722131729, "learning_rate": 3e-06, "loss": 0.025, "step": 4000 } ], "logging_steps": 1, "max_steps": 3601980, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }