| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.992, | |
| "eval_steps": 500, | |
| "global_step": 125, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 140.9583396911621, | |
| "epoch": 0.016, | |
| "grad_norm": 1.4258953228320013, | |
| "kl": 0.0, | |
| "learning_rate": 1.25e-07, | |
| "loss": 0.0, | |
| "reward": 0.5152640044689178, | |
| "reward_std": 0.5508254170417786, | |
| "rewards/correct_code_reward_func": 0.2291666716337204, | |
| "rewards/len_reward_func": 0.28609737753868103, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 131.50000762939453, | |
| "epoch": 0.032, | |
| "grad_norm": 1.1519258687122351, | |
| "kl": 0.0, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0, | |
| "reward": 0.541226252913475, | |
| "reward_std": 0.5189632624387741, | |
| "rewards/correct_code_reward_func": 0.2500000111758709, | |
| "rewards/len_reward_func": 0.29122625291347504, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 108.83333587646484, | |
| "epoch": 0.048, | |
| "grad_norm": 1.467116667329371, | |
| "kl": 0.00013637542724609375, | |
| "learning_rate": 3.75e-07, | |
| "loss": 0.0, | |
| "reward": 0.7587994039058685, | |
| "reward_std": 0.5140225142240524, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.21713273972272873, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 159.81250762939453, | |
| "epoch": 0.064, | |
| "grad_norm": 1.2682485669137435, | |
| "kl": 0.00018215179443359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.5178248882293701, | |
| "reward_std": 0.4526914358139038, | |
| "rewards/correct_code_reward_func": 0.1666666716337204, | |
| "rewards/len_reward_func": 0.3511582016944885, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 176.56250762939453, | |
| "epoch": 0.08, | |
| "grad_norm": 1.2033440109589688, | |
| "kl": 0.00014066696166992188, | |
| "learning_rate": 4.999157413258781e-07, | |
| "loss": 0.0, | |
| "reward": 0.32241350412368774, | |
| "reward_std": 0.32281263172626495, | |
| "rewards/correct_code_reward_func": 0.02083333395421505, | |
| "rewards/len_reward_func": 0.30158019065856934, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 124.87500762939453, | |
| "epoch": 0.096, | |
| "grad_norm": 1.5120707071506325, | |
| "kl": 0.00016808509826660156, | |
| "learning_rate": 4.996630220997057e-07, | |
| "loss": 0.0, | |
| "reward": 0.746085911989212, | |
| "reward_std": 0.5452268123626709, | |
| "rewards/correct_code_reward_func": 0.4583333432674408, | |
| "rewards/len_reward_func": 0.28775252401828766, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 169.9166717529297, | |
| "epoch": 0.112, | |
| "grad_norm": 0.9079518632617903, | |
| "kl": 0.00011348724365234375, | |
| "learning_rate": 4.992420126717784e-07, | |
| "loss": 0.0, | |
| "reward": 0.36989694088697433, | |
| "reward_std": 0.45903605222702026, | |
| "rewards/correct_code_reward_func": 0.125, | |
| "rewards/len_reward_func": 0.24489693343639374, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 219.43750762939453, | |
| "epoch": 0.128, | |
| "grad_norm": 1.2633142753352289, | |
| "kl": 0.0002155303955078125, | |
| "learning_rate": 4.986529968316653e-07, | |
| "loss": 0.0, | |
| "reward": 0.44794920086860657, | |
| "reward_std": 0.385338693857193, | |
| "rewards/correct_code_reward_func": 0.1250000037252903, | |
| "rewards/len_reward_func": 0.3229491859674454, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 227.91667938232422, | |
| "epoch": 0.144, | |
| "grad_norm": 1.0211344567101885, | |
| "kl": 0.00011777877807617188, | |
| "learning_rate": 4.978963716169165e-07, | |
| "loss": 0.0, | |
| "reward": 0.6235890090465546, | |
| "reward_std": 0.5187947303056717, | |
| "rewards/correct_code_reward_func": 0.3125, | |
| "rewards/len_reward_func": 0.31108900904655457, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 188.25000762939453, | |
| "epoch": 0.16, | |
| "grad_norm": 1.0353822839723037, | |
| "kl": 0.00011730194091796875, | |
| "learning_rate": 4.969726470454313e-07, | |
| "loss": 0.0, | |
| "reward": 0.6911160051822662, | |
| "reward_std": 0.5456923246383667, | |
| "rewards/correct_code_reward_func": 0.4166666865348816, | |
| "rewards/len_reward_func": 0.27444930374622345, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 168.27083587646484, | |
| "epoch": 0.176, | |
| "grad_norm": 1.7856755608823207, | |
| "kl": 0.00018310546875, | |
| "learning_rate": 4.958824457716706e-07, | |
| "loss": 0.0, | |
| "reward": 0.4588584154844284, | |
| "reward_std": 0.40716809034347534, | |
| "rewards/correct_code_reward_func": 0.1875, | |
| "rewards/len_reward_func": 0.271358385682106, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 203.08333587646484, | |
| "epoch": 0.192, | |
| "grad_norm": 0.9296992149271633, | |
| "kl": 0.00016641616821289062, | |
| "learning_rate": 4.946265026669454e-07, | |
| "loss": 0.0, | |
| "reward": 0.3501324951648712, | |
| "reward_std": 0.49003708362579346, | |
| "rewards/correct_code_reward_func": 0.1041666679084301, | |
| "rewards/len_reward_func": 0.245965838432312, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 115.66666793823242, | |
| "epoch": 0.208, | |
| "grad_norm": 1.4335533212366607, | |
| "kl": 0.00016570091247558594, | |
| "learning_rate": 4.932056643240618e-07, | |
| "loss": 0.0, | |
| "reward": 0.7853705883026123, | |
| "reward_std": 0.46111349761486053, | |
| "rewards/correct_code_reward_func": 0.5000000149011612, | |
| "rewards/len_reward_func": 0.2853705883026123, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 169.95833587646484, | |
| "epoch": 0.224, | |
| "grad_norm": 1.2723280538596287, | |
| "kl": 0.00021076202392578125, | |
| "learning_rate": 4.916208884866592e-07, | |
| "loss": 0.0, | |
| "reward": 0.5324039310216904, | |
| "reward_std": 0.5338821411132812, | |
| "rewards/correct_code_reward_func": 0.2708333432674408, | |
| "rewards/len_reward_func": 0.26157061755657196, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 154.58333587646484, | |
| "epoch": 0.24, | |
| "grad_norm": 1.2578666329332273, | |
| "kl": 0.00019168853759765625, | |
| "learning_rate": 4.898732434036243e-07, | |
| "loss": 0.0, | |
| "reward": 0.5949100255966187, | |
| "reward_std": 0.5048613250255585, | |
| "rewards/correct_code_reward_func": 0.3125000149011612, | |
| "rewards/len_reward_func": 0.28241002559661865, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 173.1875114440918, | |
| "epoch": 0.256, | |
| "grad_norm": 1.1230347862341579, | |
| "kl": 0.00029277801513671875, | |
| "learning_rate": 4.879639071090173e-07, | |
| "loss": 0.0, | |
| "reward": 0.4564344882965088, | |
| "reward_std": 0.4671656936407089, | |
| "rewards/correct_code_reward_func": 0.1666666679084301, | |
| "rewards/len_reward_func": 0.2897678166627884, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 169.375, | |
| "epoch": 0.272, | |
| "grad_norm": 1.3041956300758726, | |
| "kl": 0.0002574920654296875, | |
| "learning_rate": 4.858941666279955e-07, | |
| "loss": 0.0, | |
| "reward": 0.6347246468067169, | |
| "reward_std": 0.5289804339408875, | |
| "rewards/correct_code_reward_func": 0.3541666716337204, | |
| "rewards/len_reward_func": 0.2805579602718353, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 133.25000762939453, | |
| "epoch": 0.288, | |
| "grad_norm": 1.354822217310785, | |
| "kl": 0.0002689361572265625, | |
| "learning_rate": 4.836654171092682e-07, | |
| "loss": 0.0, | |
| "reward": 0.5779364109039307, | |
| "reward_std": 0.4782462567090988, | |
| "rewards/correct_code_reward_func": 0.2916666716337204, | |
| "rewards/len_reward_func": 0.2862697243690491, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 99.41667175292969, | |
| "epoch": 0.304, | |
| "grad_norm": 1.4087777232916079, | |
| "kl": 0.00031757354736328125, | |
| "learning_rate": 4.812791608846709e-07, | |
| "loss": 0.0, | |
| "reward": 0.5035808980464935, | |
| "reward_std": 0.46289560198783875, | |
| "rewards/correct_code_reward_func": 0.229166679084301, | |
| "rewards/len_reward_func": 0.27441420406103134, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 170.7291717529297, | |
| "epoch": 0.32, | |
| "grad_norm": 0.9923230664440412, | |
| "kl": 0.00028705596923828125, | |
| "learning_rate": 4.787370064564882e-07, | |
| "loss": 0.0, | |
| "reward": 0.5567075908184052, | |
| "reward_std": 0.44439028203487396, | |
| "rewards/correct_code_reward_func": 0.2083333432674408, | |
| "rewards/len_reward_func": 0.34837424755096436, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 124.72917175292969, | |
| "epoch": 0.336, | |
| "grad_norm": 1.2245791922735345, | |
| "kl": 0.00035572052001953125, | |
| "learning_rate": 4.7604066741321253e-07, | |
| "loss": 0.0, | |
| "reward": 0.8560027182102203, | |
| "reward_std": 0.6356588900089264, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.31433598697185516, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 123.64583969116211, | |
| "epoch": 0.352, | |
| "grad_norm": 1.2080469812565267, | |
| "kl": 0.00035858154296875, | |
| "learning_rate": 4.731919612744659e-07, | |
| "loss": 0.0, | |
| "reward": 0.7242447733879089, | |
| "reward_std": 0.4742405414581299, | |
| "rewards/correct_code_reward_func": 0.3958333432674408, | |
| "rewards/len_reward_func": 0.32841143012046814, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 146.2916717529297, | |
| "epoch": 0.368, | |
| "grad_norm": 1.2440640880474592, | |
| "kl": 0.00040721893310546875, | |
| "learning_rate": 4.7019280826586604e-07, | |
| "loss": 0.0, | |
| "reward": 0.5270938575267792, | |
| "reward_std": 0.4260385036468506, | |
| "rewards/correct_code_reward_func": 0.2291666679084301, | |
| "rewards/len_reward_func": 0.2979271858930588, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 141.9166717529297, | |
| "epoch": 0.384, | |
| "grad_norm": 1.455943571941334, | |
| "kl": 0.0006427764892578125, | |
| "learning_rate": 4.6704523002466094e-07, | |
| "loss": 0.0, | |
| "reward": 0.5917265266180038, | |
| "reward_std": 0.47722122073173523, | |
| "rewards/correct_code_reward_func": 0.3333333358168602, | |
| "rewards/len_reward_func": 0.25839313119649887, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 240.85417938232422, | |
| "epoch": 0.4, | |
| "grad_norm": 0.8411889507435418, | |
| "kl": 0.0003604888916015625, | |
| "learning_rate": 4.6375134823700503e-07, | |
| "loss": 0.0, | |
| "reward": 0.3353981524705887, | |
| "reward_std": 0.351834774017334, | |
| "rewards/correct_code_reward_func": 0.0833333358168602, | |
| "rewards/len_reward_func": 0.2520648390054703, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 97.31250381469727, | |
| "epoch": 0.416, | |
| "grad_norm": 1.374585753278975, | |
| "kl": 0.0008258819580078125, | |
| "learning_rate": 4.603133832077953e-07, | |
| "loss": 0.0, | |
| "reward": 0.6881800889968872, | |
| "reward_std": 0.5626422464847565, | |
| "rewards/correct_code_reward_func": 0.4375, | |
| "rewards/len_reward_func": 0.2506800442934036, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 131.08333587646484, | |
| "epoch": 0.432, | |
| "grad_norm": 1.5040369557196518, | |
| "kl": 0.0006847381591796875, | |
| "learning_rate": 4.5673365236403216e-07, | |
| "loss": 0.0, | |
| "reward": 0.6470239758491516, | |
| "reward_std": 0.39606642723083496, | |
| "rewards/correct_code_reward_func": 0.4375, | |
| "rewards/len_reward_func": 0.20952393114566803, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 198.06250762939453, | |
| "epoch": 0.448, | |
| "grad_norm": 1.1110007536297855, | |
| "kl": 0.00054168701171875, | |
| "learning_rate": 4.530145686927125e-07, | |
| "loss": 0.0, | |
| "reward": 0.5166794955730438, | |
| "reward_std": 0.504486620426178, | |
| "rewards/correct_code_reward_func": 0.2500000149011612, | |
| "rewards/len_reward_func": 0.2666794955730438, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 152.52083587646484, | |
| "epoch": 0.464, | |
| "grad_norm": 1.134262039216797, | |
| "kl": 0.00078582763671875, | |
| "learning_rate": 4.4915863911430897e-07, | |
| "loss": 0.0, | |
| "reward": 0.5144253522157669, | |
| "reward_std": 0.4733017832040787, | |
| "rewards/correct_code_reward_func": 0.1875000111758709, | |
| "rewards/len_reward_func": 0.3269253224134445, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 139.7916717529297, | |
| "epoch": 0.48, | |
| "grad_norm": 1.010573889887009, | |
| "kl": 0.0007152557373046875, | |
| "learning_rate": 4.45168462792932e-07, | |
| "loss": 0.0, | |
| "reward": 0.5882390439510345, | |
| "reward_std": 0.43310636281967163, | |
| "rewards/correct_code_reward_func": 0.2500000074505806, | |
| "rewards/len_reward_func": 0.33823904395103455, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 87.41666793823242, | |
| "epoch": 0.496, | |
| "grad_norm": 1.540244950569226, | |
| "kl": 0.0012340545654296875, | |
| "learning_rate": 4.4104672938431223e-07, | |
| "loss": 0.0, | |
| "reward": 0.7711681425571442, | |
| "reward_std": 0.4805651605129242, | |
| "rewards/correct_code_reward_func": 0.5833333432674408, | |
| "rewards/len_reward_func": 0.18783476203680038, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 101.43750381469727, | |
| "epoch": 0.512, | |
| "grad_norm": 2.3673085026520297, | |
| "kl": 0.0012607574462890625, | |
| "learning_rate": 4.367962172227866e-07, | |
| "loss": 0.0, | |
| "reward": 0.7279457449913025, | |
| "reward_std": 0.4627054035663605, | |
| "rewards/correct_code_reward_func": 0.4583333432674408, | |
| "rewards/len_reward_func": 0.2696124166250229, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 155.2291717529297, | |
| "epoch": 0.528, | |
| "grad_norm": 1.2624598609488873, | |
| "kl": 0.00139617919921875, | |
| "learning_rate": 4.324197914485075e-07, | |
| "loss": 0.0, | |
| "reward": 0.6401492655277252, | |
| "reward_std": 0.515736848115921, | |
| "rewards/correct_code_reward_func": 0.375, | |
| "rewards/len_reward_func": 0.26514923572540283, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 252.91667938232422, | |
| "epoch": 0.544, | |
| "grad_norm": 1.043728438493038, | |
| "kl": 0.0008392333984375, | |
| "learning_rate": 4.2792040207614e-07, | |
| "loss": 0.0, | |
| "reward": 0.6339870393276215, | |
| "reward_std": 0.5688490867614746, | |
| "rewards/correct_code_reward_func": 0.3333333432674408, | |
| "rewards/len_reward_func": 0.30065372586250305, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 178.25, | |
| "epoch": 0.56, | |
| "grad_norm": 1.2442169258805433, | |
| "kl": 0.00205230712890625, | |
| "learning_rate": 4.2330108200634723e-07, | |
| "loss": 0.0, | |
| "reward": 0.43357332795858383, | |
| "reward_std": 0.3690243661403656, | |
| "rewards/correct_code_reward_func": 0.16666667722165585, | |
| "rewards/len_reward_func": 0.26690666377544403, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 150.1666717529297, | |
| "epoch": 0.576, | |
| "grad_norm": 1.0937981889230137, | |
| "kl": 0.0016021728515625, | |
| "learning_rate": 4.185649449814045e-07, | |
| "loss": 0.0, | |
| "reward": 0.8725252151489258, | |
| "reward_std": 0.5368492603302002, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.3308584541082382, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 74.41666793823242, | |
| "epoch": 0.592, | |
| "grad_norm": 1.4560552034278569, | |
| "kl": 0.0020904541015625, | |
| "learning_rate": 4.137151834863213e-07, | |
| "loss": 0.0, | |
| "reward": 0.7634576857089996, | |
| "reward_std": 0.5292592346668243, | |
| "rewards/correct_code_reward_func": 0.5416666716337204, | |
| "rewards/len_reward_func": 0.22179099917411804, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 111.77083587646484, | |
| "epoch": 0.608, | |
| "grad_norm": 1.6125607277054597, | |
| "kl": 0.002716064453125, | |
| "learning_rate": 4.087550665968846e-07, | |
| "loss": 0.0, | |
| "reward": 0.6047167330980301, | |
| "reward_std": 0.4415762424468994, | |
| "rewards/correct_code_reward_func": 0.2916666865348816, | |
| "rewards/len_reward_func": 0.3130500763654709, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 87.0625, | |
| "epoch": 0.624, | |
| "grad_norm": 2.0747921723056026, | |
| "kl": 0.0023193359375, | |
| "learning_rate": 4.036879377760752e-07, | |
| "loss": 0.0, | |
| "reward": 0.7261738479137421, | |
| "reward_std": 0.6433705389499664, | |
| "rewards/correct_code_reward_func": 0.520833358168602, | |
| "rewards/len_reward_func": 0.20534051209688187, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 128.0833396911621, | |
| "epoch": 0.64, | |
| "grad_norm": 1.352520841789316, | |
| "kl": 0.00229644775390625, | |
| "learning_rate": 3.9851721262034157e-07, | |
| "loss": 0.0, | |
| "reward": 0.49166351556777954, | |
| "reward_std": 0.4290030002593994, | |
| "rewards/correct_code_reward_func": 0.18750000558793545, | |
| "rewards/len_reward_func": 0.30416350066661835, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 117.33333587646484, | |
| "epoch": 0.656, | |
| "grad_norm": 1.5281074207353524, | |
| "kl": 0.003509521484375, | |
| "learning_rate": 3.932463765572505e-07, | |
| "loss": 0.0, | |
| "reward": 0.5800679922103882, | |
| "reward_std": 0.5416670143604279, | |
| "rewards/correct_code_reward_func": 0.3125000149011612, | |
| "rewards/len_reward_func": 0.2675679475069046, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 112.43750381469727, | |
| "epoch": 0.672, | |
| "grad_norm": 1.2084984435618142, | |
| "kl": 0.00252532958984375, | |
| "learning_rate": 3.8787898249606767e-07, | |
| "loss": 0.0, | |
| "reward": 0.42490366101264954, | |
| "reward_std": 0.46323399245738983, | |
| "rewards/correct_code_reward_func": 0.14583333395421505, | |
| "rewards/len_reward_func": 0.27907034754753113, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 56.85416793823242, | |
| "epoch": 0.688, | |
| "grad_norm": 1.8756323954488632, | |
| "kl": 0.00452423095703125, | |
| "learning_rate": 3.8241864843284964e-07, | |
| "loss": 0.0, | |
| "reward": 0.7274035811424255, | |
| "reward_std": 0.5209662765264511, | |
| "rewards/correct_code_reward_func": 0.5000000149011612, | |
| "rewards/len_reward_func": 0.22740358859300613, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 153.68750762939453, | |
| "epoch": 0.704, | |
| "grad_norm": 1.785627080388602, | |
| "kl": 0.0055084228515625, | |
| "learning_rate": 3.768690550116639e-07, | |
| "loss": 0.0, | |
| "reward": 0.49254634976387024, | |
| "reward_std": 0.4052678644657135, | |
| "rewards/correct_code_reward_func": 0.1666666716337204, | |
| "rewards/len_reward_func": 0.32587967813014984, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 170.1041717529297, | |
| "epoch": 0.72, | |
| "grad_norm": 1.2057879792669277, | |
| "kl": 0.0038299560546875, | |
| "learning_rate": 3.712339430435792e-07, | |
| "loss": 0.0, | |
| "reward": 0.5373264253139496, | |
| "reward_std": 0.4612013250589371, | |
| "rewards/correct_code_reward_func": 0.2708333432674408, | |
| "rewards/len_reward_func": 0.2664930745959282, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 122.79167175292969, | |
| "epoch": 0.736, | |
| "grad_norm": 1.23844328247912, | |
| "kl": 0.00384521484375, | |
| "learning_rate": 3.65517110985099e-07, | |
| "loss": 0.0, | |
| "reward": 0.6534424722194672, | |
| "reward_std": 0.5896010398864746, | |
| "rewards/correct_code_reward_func": 0.354166679084301, | |
| "rewards/len_reward_func": 0.29927581548690796, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 73.39583396911621, | |
| "epoch": 0.752, | |
| "grad_norm": 2.222315006145743, | |
| "kl": 0.0058135986328125, | |
| "learning_rate": 3.597224123777389e-07, | |
| "loss": 0.0, | |
| "reward": 0.7357015609741211, | |
| "reward_std": 0.5119403451681137, | |
| "rewards/correct_code_reward_func": 0.4583333432674408, | |
| "rewards/len_reward_func": 0.2773682177066803, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 75.54166793823242, | |
| "epoch": 0.768, | |
| "grad_norm": 1.9981519435567456, | |
| "kl": 0.0053863525390625, | |
| "learning_rate": 3.5385375325047163e-07, | |
| "loss": 0.0, | |
| "reward": 0.6428782939910889, | |
| "reward_std": 0.6202229559421539, | |
| "rewards/correct_code_reward_func": 0.3958333432674408, | |
| "rewards/len_reward_func": 0.24704494327306747, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 73.27083587646484, | |
| "epoch": 0.784, | |
| "grad_norm": 2.073070842958071, | |
| "kl": 0.00554656982421875, | |
| "learning_rate": 3.479150894867926e-07, | |
| "loss": 0.0, | |
| "reward": 0.8005061745643616, | |
| "reward_std": 0.5489170849323273, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.25883948802948, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 93.62500381469727, | |
| "epoch": 0.8, | |
| "grad_norm": 1.7280406240103203, | |
| "kl": 0.0070953369140625, | |
| "learning_rate": 3.4191042415818e-07, | |
| "loss": 0.0, | |
| "reward": 0.6382943987846375, | |
| "reward_std": 0.4014574736356735, | |
| "rewards/correct_code_reward_func": 0.3750000149011612, | |
| "rewards/len_reward_func": 0.26329439133405685, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 110.31250381469727, | |
| "epoch": 0.816, | |
| "grad_norm": 1.5732703630042588, | |
| "kl": 0.008453369140625, | |
| "learning_rate": 3.3584380482574717e-07, | |
| "loss": 0.0, | |
| "reward": 0.8389279842376709, | |
| "reward_std": 0.6495693922042847, | |
| "rewards/correct_code_reward_func": 0.5208333432674408, | |
| "rewards/len_reward_func": 0.31809471547603607, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 81.4375, | |
| "epoch": 0.832, | |
| "grad_norm": 1.3555162901411408, | |
| "kl": 0.0072479248046875, | |
| "learning_rate": 3.297193208119047e-07, | |
| "loss": 0.0, | |
| "reward": 0.7050519585609436, | |
| "reward_std": 0.522288054227829, | |
| "rewards/correct_code_reward_func": 0.4375000298023224, | |
| "rewards/len_reward_func": 0.2675519585609436, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 145.2291717529297, | |
| "epoch": 0.848, | |
| "grad_norm": 1.2256688073258564, | |
| "kl": 0.00726318359375, | |
| "learning_rate": 3.235411004438741e-07, | |
| "loss": 0.0, | |
| "reward": 0.6400169730186462, | |
| "reward_std": 0.5816708207130432, | |
| "rewards/correct_code_reward_func": 0.3541666716337204, | |
| "rewards/len_reward_func": 0.28585030883550644, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 120.20833587646484, | |
| "epoch": 0.864, | |
| "grad_norm": 1.8462631631415796, | |
| "kl": 0.0084991455078125, | |
| "learning_rate": 3.173133082709086e-07, | |
| "loss": 0.0, | |
| "reward": 0.643402487039566, | |
| "reward_std": 0.3417808264493942, | |
| "rewards/correct_code_reward_func": 0.3333333432674408, | |
| "rewards/len_reward_func": 0.31006917357444763, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 55.56250190734863, | |
| "epoch": 0.88, | |
| "grad_norm": 1.7370166581779802, | |
| "kl": 0.01177978515625, | |
| "learning_rate": 3.1104014225709784e-07, | |
| "loss": 0.0, | |
| "reward": 0.9137917459011078, | |
| "reward_std": 0.5003669559955597, | |
| "rewards/correct_code_reward_func": 0.583333358168602, | |
| "rewards/len_reward_func": 0.3304583728313446, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 189.25000762939453, | |
| "epoch": 0.896, | |
| "grad_norm": 1.2196760565152192, | |
| "kl": 0.0058441162109375, | |
| "learning_rate": 3.0472583095164873e-07, | |
| "loss": 0.0, | |
| "reward": 0.4673280417919159, | |
| "reward_std": 0.4577627182006836, | |
| "rewards/correct_code_reward_func": 0.1666666716337204, | |
| "rewards/len_reward_func": 0.3006613999605179, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 57.37500190734863, | |
| "epoch": 0.912, | |
| "grad_norm": 2.0919947468048976, | |
| "kl": 0.010162353515625, | |
| "learning_rate": 2.983746306385499e-07, | |
| "loss": 0.0, | |
| "reward": 0.6931174695491791, | |
| "reward_std": 0.5172313153743744, | |
| "rewards/correct_code_reward_func": 0.4791666865348816, | |
| "rewards/len_reward_func": 0.21395081281661987, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 86.00000190734863, | |
| "epoch": 0.928, | |
| "grad_norm": 1.5907089477428527, | |
| "kl": 0.0113677978515625, | |
| "learning_rate": 2.919908224675412e-07, | |
| "loss": 0.0, | |
| "reward": 0.5865814685821533, | |
| "reward_std": 0.5177368223667145, | |
| "rewards/correct_code_reward_func": 0.3125000149011612, | |
| "rewards/len_reward_func": 0.27408143877983093, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 90.72916793823242, | |
| "epoch": 0.944, | |
| "grad_norm": 1.1269292807249032, | |
| "kl": 0.00830078125, | |
| "learning_rate": 2.8557870956832133e-07, | |
| "loss": 0.0, | |
| "reward": 0.4935041069984436, | |
| "reward_std": 0.41843119263648987, | |
| "rewards/correct_code_reward_func": 0.2083333432674408, | |
| "rewards/len_reward_func": 0.285170778632164, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 85.60416793823242, | |
| "epoch": 0.96, | |
| "grad_norm": 2.320388470663489, | |
| "kl": 0.014678955078125, | |
| "learning_rate": 2.7914261414993976e-07, | |
| "loss": 0.0, | |
| "reward": 0.7554058134555817, | |
| "reward_std": 0.5069911777973175, | |
| "rewards/correct_code_reward_func": 0.4166666716337204, | |
| "rewards/len_reward_func": 0.3387391269207001, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 63.375, | |
| "epoch": 0.976, | |
| "grad_norm": 1.7319214973496064, | |
| "kl": 0.02532958984375, | |
| "learning_rate": 2.726868745873286e-07, | |
| "loss": 0.0, | |
| "reward": 0.7839343547821045, | |
| "reward_std": 0.6209487617015839, | |
| "rewards/correct_code_reward_func": 0.4791666716337204, | |
| "rewards/len_reward_func": 0.3047676384449005, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 87.14583587646484, | |
| "epoch": 0.992, | |
| "grad_norm": 1.8272498546531741, | |
| "kl": 0.0134735107421875, | |
| "learning_rate": 2.662158424969357e-07, | |
| "loss": 0.0, | |
| "reward": 0.8219521045684814, | |
| "reward_std": 0.6945097148418427, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.28028544783592224, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 55.66666793823242, | |
| "epoch": 1.0, | |
| "grad_norm": 1.8272498546531741, | |
| "kl": 0.02587890625, | |
| "learning_rate": 2.597338798034344e-07, | |
| "loss": 0.0, | |
| "reward": 0.713922381401062, | |
| "reward_std": 0.519837498664856, | |
| "rewards/correct_code_reward_func": 0.4166666865348816, | |
| "rewards/len_reward_func": 0.29725566506385803, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 88.75000381469727, | |
| "epoch": 1.016, | |
| "grad_norm": 1.6950346991160663, | |
| "kl": 0.0108642578125, | |
| "learning_rate": 2.532453557994827e-07, | |
| "loss": 0.0, | |
| "reward": 0.5927524715662003, | |
| "reward_std": 0.39128445088863373, | |
| "rewards/correct_code_reward_func": 0.3750000149011612, | |
| "rewards/len_reward_func": 0.21775247156620026, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 151.7291717529297, | |
| "epoch": 1.032, | |
| "grad_norm": 1.6408461481438466, | |
| "kl": 0.011138916015625, | |
| "learning_rate": 2.467546442005173e-07, | |
| "loss": 0.0, | |
| "reward": 0.6122622489929199, | |
| "reward_std": 0.5165137350559235, | |
| "rewards/correct_code_reward_func": 0.3125000149011612, | |
| "rewards/len_reward_func": 0.2997622489929199, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 104.85417175292969, | |
| "epoch": 1.048, | |
| "grad_norm": 1.1573620161491798, | |
| "kl": 0.01092529296875, | |
| "learning_rate": 2.4026612019656556e-07, | |
| "loss": 0.0, | |
| "reward": 0.8486100733280182, | |
| "reward_std": 0.3942585438489914, | |
| "rewards/correct_code_reward_func": 0.5, | |
| "rewards/len_reward_func": 0.348610058426857, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 62.47916793823242, | |
| "epoch": 1.064, | |
| "grad_norm": 2.1966023559129266, | |
| "kl": 0.018798828125, | |
| "learning_rate": 2.337841575030642e-07, | |
| "loss": 0.0, | |
| "reward": 0.8105108737945557, | |
| "reward_std": 0.4338831454515457, | |
| "rewards/correct_code_reward_func": 0.4583333432674408, | |
| "rewards/len_reward_func": 0.35217756032943726, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 74.95833587646484, | |
| "epoch": 1.08, | |
| "grad_norm": 1.796160832910341, | |
| "kl": 0.02294921875, | |
| "learning_rate": 2.2731312541267143e-07, | |
| "loss": 0.0, | |
| "reward": 0.549996554851532, | |
| "reward_std": 0.3687018007040024, | |
| "rewards/correct_code_reward_func": 0.2083333358168602, | |
| "rewards/len_reward_func": 0.3416632413864136, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 80.14583587646484, | |
| "epoch": 1.096, | |
| "grad_norm": 2.1344146728324653, | |
| "kl": 0.02447509765625, | |
| "learning_rate": 2.2085738585006021e-07, | |
| "loss": 0.0, | |
| "reward": 0.8650955259799957, | |
| "reward_std": 0.4139704555273056, | |
| "rewards/correct_code_reward_func": 0.5208333432674408, | |
| "rewards/len_reward_func": 0.34426216781139374, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 60.958335876464844, | |
| "epoch": 1.112, | |
| "grad_norm": 1.6686676921157912, | |
| "kl": 0.025634765625, | |
| "learning_rate": 2.1442129043167873e-07, | |
| "loss": 0.0, | |
| "reward": 0.6947443187236786, | |
| "reward_std": 0.5725615322589874, | |
| "rewards/correct_code_reward_func": 0.375, | |
| "rewards/len_reward_func": 0.319744348526001, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 108.1875, | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 1.7272596794076989, | |
| "kl": 0.0130615234375, | |
| "learning_rate": 2.0800917753245875e-07, | |
| "loss": 0.0, | |
| "reward": 0.7587291896343231, | |
| "reward_std": 0.5232284665107727, | |
| "rewards/correct_code_reward_func": 0.4166666865348816, | |
| "rewards/len_reward_func": 0.3420625329017639, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 108.04167175292969, | |
| "epoch": 1.144, | |
| "grad_norm": 1.6272563745253346, | |
| "kl": 0.01654052734375, | |
| "learning_rate": 2.0162536936145008e-07, | |
| "loss": 0.0, | |
| "reward": 0.5046872794628143, | |
| "reward_std": 0.3378771096467972, | |
| "rewards/correct_code_reward_func": 0.1666666679084301, | |
| "rewards/len_reward_func": 0.33802059292793274, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 54.02083396911621, | |
| "epoch": 1.16, | |
| "grad_norm": 1.9418689539056528, | |
| "kl": 0.0308837890625, | |
| "learning_rate": 1.9527416904835132e-07, | |
| "loss": 0.0, | |
| "reward": 0.9055829644203186, | |
| "reward_std": 0.3730238378047943, | |
| "rewards/correct_code_reward_func": 0.5, | |
| "rewards/len_reward_func": 0.405582919716835, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 94.31250381469727, | |
| "epoch": 1.176, | |
| "grad_norm": 1.5576616620611914, | |
| "kl": 0.02215576171875, | |
| "learning_rate": 1.889598577429022e-07, | |
| "loss": 0.0, | |
| "reward": 0.9071804285049438, | |
| "reward_std": 0.44920457899570465, | |
| "rewards/correct_code_reward_func": 0.5000000298023224, | |
| "rewards/len_reward_func": 0.40718045830726624, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 53.79166793823242, | |
| "epoch": 1.192, | |
| "grad_norm": 2.3725141345867544, | |
| "kl": 0.03057861328125, | |
| "learning_rate": 1.8268669172909136e-07, | |
| "loss": 0.0, | |
| "reward": 0.9221459329128265, | |
| "reward_std": 0.4697086811065674, | |
| "rewards/correct_code_reward_func": 0.5000000298023224, | |
| "rewards/len_reward_func": 0.42214588820934296, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 89.79167175292969, | |
| "epoch": 1.208, | |
| "grad_norm": 2.003223060045919, | |
| "kl": 0.03094482421875, | |
| "learning_rate": 1.7645889955612592e-07, | |
| "loss": 0.0, | |
| "reward": 1.0163878798484802, | |
| "reward_std": 0.43504565954208374, | |
| "rewards/correct_code_reward_func": 0.6250000298023224, | |
| "rewards/len_reward_func": 0.3913878947496414, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 68.79166984558105, | |
| "epoch": 1.224, | |
| "grad_norm": 2.361523245499291, | |
| "kl": 0.0457763671875, | |
| "learning_rate": 1.7028067918809535e-07, | |
| "loss": 0.0, | |
| "reward": 0.7535229325294495, | |
| "reward_std": 0.47849828004837036, | |
| "rewards/correct_code_reward_func": 0.375, | |
| "rewards/len_reward_func": 0.3785228729248047, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 54.14583396911621, | |
| "epoch": 1.24, | |
| "grad_norm": 2.120116927446423, | |
| "kl": 0.0394287109375, | |
| "learning_rate": 1.6415619517425294e-07, | |
| "loss": 0.0, | |
| "reward": 0.8538325130939484, | |
| "reward_std": 0.44848716259002686, | |
| "rewards/correct_code_reward_func": 0.4791666865348816, | |
| "rewards/len_reward_func": 0.3746658265590668, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 89.0, | |
| "epoch": 1.256, | |
| "grad_norm": 1.2055136830985975, | |
| "kl": 0.0272216796875, | |
| "learning_rate": 1.5808957584181994e-07, | |
| "loss": 0.0, | |
| "reward": 0.755169004201889, | |
| "reward_std": 0.4014817923307419, | |
| "rewards/correct_code_reward_func": 0.3541666716337204, | |
| "rewards/len_reward_func": 0.40100236237049103, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 99.39583969116211, | |
| "epoch": 1.272, | |
| "grad_norm": 1.84690544945913, | |
| "kl": 0.024322509765625, | |
| "learning_rate": 1.5208491051320744e-07, | |
| "loss": 0.0, | |
| "reward": 0.7356246709823608, | |
| "reward_std": 0.47616493701934814, | |
| "rewards/correct_code_reward_func": 0.3958333432674408, | |
| "rewards/len_reward_func": 0.33979131281375885, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 73.04166793823242, | |
| "epoch": 1.288, | |
| "grad_norm": 1.7278725529442787, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1.461462467495284e-07, | |
| "loss": 0.0, | |
| "reward": 0.7051982879638672, | |
| "reward_std": 0.48877203464508057, | |
| "rewards/correct_code_reward_func": 0.3125, | |
| "rewards/len_reward_func": 0.3926983177661896, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 59.354169845581055, | |
| "epoch": 1.304, | |
| "grad_norm": 2.077567652472909, | |
| "kl": 0.0345458984375, | |
| "learning_rate": 1.4027758762226107e-07, | |
| "loss": 0.0, | |
| "reward": 0.816185712814331, | |
| "reward_std": 0.4705541431903839, | |
| "rewards/correct_code_reward_func": 0.4791666865348816, | |
| "rewards/len_reward_func": 0.3370189964771271, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 81.58333587646484, | |
| "epoch": 1.32, | |
| "grad_norm": 1.609719907980881, | |
| "kl": 0.0234375, | |
| "learning_rate": 1.3448288901490092e-07, | |
| "loss": 0.0, | |
| "reward": 0.7908000648021698, | |
| "reward_std": 0.45585089921951294, | |
| "rewards/correct_code_reward_func": 0.4166666716337204, | |
| "rewards/len_reward_func": 0.374133437871933, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 87.33333587646484, | |
| "epoch": 1.336, | |
| "grad_norm": 1.6587537084233746, | |
| "kl": 0.02667236328125, | |
| "learning_rate": 1.2876605695642084e-07, | |
| "loss": 0.0, | |
| "reward": 0.6749401688575745, | |
| "reward_std": 0.42905712127685547, | |
| "rewards/correct_code_reward_func": 0.3541666716337204, | |
| "rewards/len_reward_func": 0.3207734525203705, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 95.20833587646484, | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 2.538472018686139, | |
| "kl": 0.02581787109375, | |
| "learning_rate": 1.231309449883361e-07, | |
| "loss": 0.0, | |
| "reward": 0.7594759464263916, | |
| "reward_std": 0.5746750831604004, | |
| "rewards/correct_code_reward_func": 0.3750000149011612, | |
| "rewards/len_reward_func": 0.3844759315252304, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 55.43750190734863, | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 1.797373425635401, | |
| "kl": 0.03289794921875, | |
| "learning_rate": 1.1758135156715041e-07, | |
| "loss": 0.0, | |
| "reward": 0.9961144328117371, | |
| "reward_std": 0.5648430436849594, | |
| "rewards/correct_code_reward_func": 0.6250000298023224, | |
| "rewards/len_reward_func": 0.37111443281173706, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 121.25000762939453, | |
| "epoch": 1.384, | |
| "grad_norm": 1.7119982491506713, | |
| "kl": 0.0286865234375, | |
| "learning_rate": 1.1212101750393235e-07, | |
| "loss": 0.0, | |
| "reward": 0.7243427634239197, | |
| "reward_std": 0.3805614560842514, | |
| "rewards/correct_code_reward_func": 0.3333333358168602, | |
| "rewards/len_reward_func": 0.39100944995880127, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 57.35416793823242, | |
| "epoch": 1.4, | |
| "grad_norm": 1.7713124187158098, | |
| "kl": 0.034912109375, | |
| "learning_rate": 1.0675362344274952e-07, | |
| "loss": 0.0, | |
| "reward": 0.7016758322715759, | |
| "reward_std": 0.5317542552947998, | |
| "rewards/correct_code_reward_func": 0.3541666865348816, | |
| "rewards/len_reward_func": 0.34750914573669434, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 59.0625, | |
| "epoch": 1.416, | |
| "grad_norm": 1.6492634665708499, | |
| "kl": 0.034423828125, | |
| "learning_rate": 1.0148278737965844e-07, | |
| "loss": 0.0, | |
| "reward": 0.7394144237041473, | |
| "reward_std": 0.4491709917783737, | |
| "rewards/correct_code_reward_func": 0.3541666716337204, | |
| "rewards/len_reward_func": 0.38524775207042694, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 48.6875, | |
| "epoch": 1.432, | |
| "grad_norm": 1.9432473699712165, | |
| "kl": 0.06494140625, | |
| "learning_rate": 9.631206222392479e-08, | |
| "loss": 0.0001, | |
| "reward": 0.8676341474056244, | |
| "reward_std": 0.3966159522533417, | |
| "rewards/correct_code_reward_func": 0.4791666865348816, | |
| "rewards/len_reward_func": 0.388467475771904, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 91.62500381469727, | |
| "epoch": 1.448, | |
| "grad_norm": 1.9189293687085252, | |
| "kl": 0.13482666015625, | |
| "learning_rate": 9.124493340311537e-08, | |
| "loss": 0.0001, | |
| "reward": 0.7231810688972473, | |
| "reward_std": 0.4981995224952698, | |
| "rewards/correct_code_reward_func": 0.3333333432674408, | |
| "rewards/len_reward_func": 0.3898477256298065, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 60.729169845581055, | |
| "epoch": 1.464, | |
| "grad_norm": 1.9825880271843388, | |
| "kl": 0.03424072265625, | |
| "learning_rate": 8.628481651367875e-08, | |
| "loss": 0.0, | |
| "reward": 0.8303024768829346, | |
| "reward_std": 0.40181903541088104, | |
| "rewards/correct_code_reward_func": 0.4375000149011612, | |
| "rewards/len_reward_func": 0.39280249178409576, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 58.22916793823242, | |
| "epoch": 1.48, | |
| "grad_norm": 1.8747344082688029, | |
| "kl": 0.0426025390625, | |
| "learning_rate": 8.143505501859551e-08, | |
| "loss": 0.0, | |
| "reward": 0.7909549474716187, | |
| "reward_std": 0.4536728262901306, | |
| "rewards/correct_code_reward_func": 0.458333358168602, | |
| "rewards/len_reward_func": 0.33262157440185547, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 125.10417175292969, | |
| "epoch": 1.496, | |
| "grad_norm": 1.5754029745287528, | |
| "kl": 0.02886962890625, | |
| "learning_rate": 7.669891799365282e-08, | |
| "loss": 0.0, | |
| "reward": 0.6297820806503296, | |
| "reward_std": 0.5051470398902893, | |
| "rewards/correct_code_reward_func": 0.2708333432674408, | |
| "rewards/len_reward_func": 0.3589487075805664, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 89.27083587646484, | |
| "epoch": 1.512, | |
| "grad_norm": 1.698829198816419, | |
| "kl": 0.02362060546875, | |
| "learning_rate": 7.207959792385998e-08, | |
| "loss": 0.0, | |
| "reward": 0.7924558222293854, | |
| "reward_std": 0.42506614327430725, | |
| "rewards/correct_code_reward_func": 0.3541666865348816, | |
| "rewards/len_reward_func": 0.4382891356945038, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 82.18750381469727, | |
| "epoch": 1.528, | |
| "grad_norm": 1.4031599496951968, | |
| "kl": 0.03643798828125, | |
| "learning_rate": 6.758020855149249e-08, | |
| "loss": 0.0, | |
| "reward": 0.6851500123739243, | |
| "reward_std": 0.2974398583173752, | |
| "rewards/correct_code_reward_func": 0.25000000558793545, | |
| "rewards/len_reward_func": 0.43515002727508545, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 54.6875, | |
| "epoch": 1.544, | |
| "grad_norm": 1.4467008481635895, | |
| "kl": 0.039306640625, | |
| "learning_rate": 6.320378277721342e-08, | |
| "loss": 0.0, | |
| "reward": 0.7509966492652893, | |
| "reward_std": 0.3042096644639969, | |
| "rewards/correct_code_reward_func": 0.3125, | |
| "rewards/len_reward_func": 0.4384966343641281, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 68.08333587646484, | |
| "epoch": 1.56, | |
| "grad_norm": 2.082709482850275, | |
| "kl": 0.03460693359375, | |
| "learning_rate": 5.895327061568775e-08, | |
| "loss": 0.0, | |
| "reward": 0.7968247532844543, | |
| "reward_std": 0.36605267226696014, | |
| "rewards/correct_code_reward_func": 0.3750000149011612, | |
| "rewards/len_reward_func": 0.42182472348213196, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 56.020835876464844, | |
| "epoch": 1.576, | |
| "grad_norm": 2.726579074776626, | |
| "kl": 0.0662841796875, | |
| "learning_rate": 5.483153720706798e-08, | |
| "loss": 0.0001, | |
| "reward": 0.8111520707607269, | |
| "reward_std": 0.548240602016449, | |
| "rewards/correct_code_reward_func": 0.4166666716337204, | |
| "rewards/len_reward_func": 0.3944854289293289, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 54.25000190734863, | |
| "epoch": 1.592, | |
| "grad_norm": 2.079061824739654, | |
| "kl": 0.0452880859375, | |
| "learning_rate": 5.0841360885690996e-08, | |
| "loss": 0.0, | |
| "reward": 0.9174363613128662, | |
| "reward_std": 0.46667972207069397, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.375769704580307, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 65.72916793823242, | |
| "epoch": 1.608, | |
| "grad_norm": 1.5292386933354263, | |
| "kl": 0.04522705078125, | |
| "learning_rate": 4.698543130728755e-08, | |
| "loss": 0.0, | |
| "reward": 0.8213175535202026, | |
| "reward_std": 0.38392098248004913, | |
| "rewards/correct_code_reward_func": 0.458333358168602, | |
| "rewards/len_reward_func": 0.3629842549562454, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 67.77083587646484, | |
| "epoch": 1.624, | |
| "grad_norm": 1.352325105446135, | |
| "kl": 0.0390625, | |
| "learning_rate": 4.326634763596784e-08, | |
| "loss": 0.0, | |
| "reward": 0.7263242900371552, | |
| "reward_std": 0.37168650329113007, | |
| "rewards/correct_code_reward_func": 0.31250002048909664, | |
| "rewards/len_reward_func": 0.41382429003715515, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 64.10416793823242, | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 1.9987254276022863, | |
| "kl": 0.02880859375, | |
| "learning_rate": 3.968661679220467e-08, | |
| "loss": 0.0, | |
| "reward": 1.174392580986023, | |
| "reward_std": 0.4813085198402405, | |
| "rewards/correct_code_reward_func": 0.7500000298023224, | |
| "rewards/len_reward_func": 0.42439255118370056, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 57.437503814697266, | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 1.5506203528349733, | |
| "kl": 0.041015625, | |
| "learning_rate": 3.624865176299499e-08, | |
| "loss": 0.0, | |
| "reward": 0.9918626546859741, | |
| "reward_std": 0.5309067815542221, | |
| "rewards/correct_code_reward_func": 0.6666666865348816, | |
| "rewards/len_reward_func": 0.3251959830522537, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 114.50000762939453, | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 1.538301941895194, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 3.295476997533905e-08, | |
| "loss": 0.0, | |
| "reward": 0.9100688099861145, | |
| "reward_std": 0.29824198782444, | |
| "rewards/correct_code_reward_func": 0.4583333432674408, | |
| "rewards/len_reward_func": 0.4517354816198349, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 129.81250381469727, | |
| "epoch": 1.688, | |
| "grad_norm": 1.3867754807731443, | |
| "kl": 0.0283203125, | |
| "learning_rate": 2.980719173413396e-08, | |
| "loss": 0.0, | |
| "reward": 0.818383663892746, | |
| "reward_std": 0.5115247815847397, | |
| "rewards/correct_code_reward_func": 0.4166666716337204, | |
| "rewards/len_reward_func": 0.4017169624567032, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 73.33333587646484, | |
| "epoch": 1.704, | |
| "grad_norm": 2.2267187145460765, | |
| "kl": 0.04461669921875, | |
| "learning_rate": 2.680803872553408e-08, | |
| "loss": 0.0, | |
| "reward": 0.8567679226398468, | |
| "reward_std": 0.51302769780159, | |
| "rewards/correct_code_reward_func": 0.4375, | |
| "rewards/len_reward_func": 0.4192679077386856, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 53.54166793823242, | |
| "epoch": 1.72, | |
| "grad_norm": 3.1940102299602953, | |
| "kl": 0.0521240234375, | |
| "learning_rate": 2.395933258678745e-08, | |
| "loss": 0.0001, | |
| "reward": 0.9940223693847656, | |
| "reward_std": 0.46572498977184296, | |
| "rewards/correct_code_reward_func": 0.6041666865348816, | |
| "rewards/len_reward_func": 0.3898557126522064, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 41.52083396911621, | |
| "epoch": 1.736, | |
| "grad_norm": 2.0727978566546295, | |
| "kl": 0.0655517578125, | |
| "learning_rate": 2.1262993543511715e-08, | |
| "loss": 0.0001, | |
| "reward": 0.9489125609397888, | |
| "reward_std": 0.5604254603385925, | |
| "rewards/correct_code_reward_func": 0.6250000298023224, | |
| "rewards/len_reward_func": 0.32391248643398285, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 106.08333587646484, | |
| "epoch": 1.752, | |
| "grad_norm": 2.3414859603625806, | |
| "kl": 0.03424072265625, | |
| "learning_rate": 1.872083911532907e-08, | |
| "loss": 0.0, | |
| "reward": 0.5710697174072266, | |
| "reward_std": 0.4303289204835892, | |
| "rewards/correct_code_reward_func": 0.1666666679084301, | |
| "rewards/len_reward_func": 0.4044030159711838, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 60.437503814697266, | |
| "epoch": 1.768, | |
| "grad_norm": 1.5494191116212308, | |
| "kl": 0.046875, | |
| "learning_rate": 1.6334582890731697e-08, | |
| "loss": 0.0, | |
| "reward": 1.0543819665908813, | |
| "reward_std": 0.4688963294029236, | |
| "rewards/correct_code_reward_func": 0.6666666865348816, | |
| "rewards/len_reward_func": 0.38771532475948334, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 139.43750381469727, | |
| "epoch": 1.784, | |
| "grad_norm": 1.8975149766982131, | |
| "kl": 0.0323486328125, | |
| "learning_rate": 1.4105833372004523e-08, | |
| "loss": 0.0, | |
| "reward": 0.7198583781719208, | |
| "reward_std": 0.2770904451608658, | |
| "rewards/correct_code_reward_func": 0.2708333395421505, | |
| "rewards/len_reward_func": 0.4490250498056412, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 71.87500190734863, | |
| "epoch": 1.8, | |
| "grad_norm": 1.8779975481307012, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 1.2036092890982619e-08, | |
| "loss": 0.0, | |
| "reward": 0.6213224828243256, | |
| "reward_std": 0.39381173253059387, | |
| "rewards/correct_code_reward_func": 0.25, | |
| "rewards/len_reward_func": 0.3713224530220032, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 73.16666793823242, | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 1.625916920606493, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.0126756596375685e-08, | |
| "loss": 0.0, | |
| "reward": 0.8906111121177673, | |
| "reward_std": 0.5251133739948273, | |
| "rewards/correct_code_reward_func": 0.4791666865348816, | |
| "rewards/len_reward_func": 0.41144441068172455, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 39.85416793823242, | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 1.8155165345051183, | |
| "kl": 0.0440673828125, | |
| "learning_rate": 8.379111513340753e-09, | |
| "loss": 0.0, | |
| "reward": 0.8687795996665955, | |
| "reward_std": 0.4838385283946991, | |
| "rewards/correct_code_reward_func": 0.4583333358168602, | |
| "rewards/len_reward_func": 0.41044625639915466, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 75.58333396911621, | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 1.8222797961879316, | |
| "kl": 0.03985595703125, | |
| "learning_rate": 6.7943356759381785e-09, | |
| "loss": 0.0, | |
| "reward": 0.9320607483386993, | |
| "reward_std": 0.5384509861469269, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.39039406180381775, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 68.54166984558105, | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 2.0020075086567775, | |
| "kl": 0.031982421875, | |
| "learning_rate": 5.373497333054616e-09, | |
| "loss": 0.0, | |
| "reward": 0.9275134801864624, | |
| "reward_std": 0.4482097327709198, | |
| "rewards/correct_code_reward_func": 0.5000000298023224, | |
| "rewards/len_reward_func": 0.4275134950876236, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 73.91666793823242, | |
| "epoch": 1.88, | |
| "grad_norm": 1.7788611304062052, | |
| "kl": 0.03240966796875, | |
| "learning_rate": 4.117554228329406e-09, | |
| "loss": 0.0, | |
| "reward": 0.9304822385311127, | |
| "reward_std": 0.5174555033445358, | |
| "rewards/correct_code_reward_func": 0.5416666865348816, | |
| "rewards/len_reward_func": 0.38881558179855347, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 56.20833396911621, | |
| "epoch": 1.896, | |
| "grad_norm": 2.1126141119280257, | |
| "kl": 0.0341796875, | |
| "learning_rate": 3.0273529545687125e-09, | |
| "loss": 0.0, | |
| "reward": 0.7594221532344818, | |
| "reward_std": 0.480338990688324, | |
| "rewards/correct_code_reward_func": 0.3958333432674408, | |
| "rewards/len_reward_func": 0.3635888248682022, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 72.47916793823242, | |
| "epoch": 1.912, | |
| "grad_norm": 1.4598566413193612, | |
| "kl": 0.03466796875, | |
| "learning_rate": 2.1036283830834224e-09, | |
| "loss": 0.0, | |
| "reward": 0.7889427244663239, | |
| "reward_std": 0.48503294587135315, | |
| "rewards/correct_code_reward_func": 0.3958333432674408, | |
| "rewards/len_reward_func": 0.39310936629772186, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 40.85416793823242, | |
| "epoch": 1.928, | |
| "grad_norm": 2.335195303935002, | |
| "kl": 0.056640625, | |
| "learning_rate": 1.347003168334665e-09, | |
| "loss": 0.0001, | |
| "reward": 1.0662382543087006, | |
| "reward_std": 0.2768351137638092, | |
| "rewards/correct_code_reward_func": 0.6250000149011612, | |
| "rewards/len_reward_func": 0.44123825430870056, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 50.62500190734863, | |
| "epoch": 1.944, | |
| "grad_norm": 1.8386331097859265, | |
| "kl": 0.03173828125, | |
| "learning_rate": 7.579873282216598e-10, | |
| "loss": 0.0, | |
| "reward": 0.8906074166297913, | |
| "reward_std": 0.5252098143100739, | |
| "rewards/correct_code_reward_func": 0.5833333730697632, | |
| "rewards/len_reward_func": 0.30727406591176987, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 99.4375057220459, | |
| "epoch": 1.96, | |
| "grad_norm": 1.621045537411182, | |
| "kl": 0.0238037109375, | |
| "learning_rate": 3.3697790029424413e-10, | |
| "loss": 0.0, | |
| "reward": 0.9505272507667542, | |
| "reward_std": 0.5842320024967194, | |
| "rewards/correct_code_reward_func": 0.5833333432674408, | |
| "rewards/len_reward_func": 0.36719387769699097, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 63.000003814697266, | |
| "epoch": 1.976, | |
| "grad_norm": 2.157350197672568, | |
| "kl": 0.0465087890625, | |
| "learning_rate": 8.425867412190091e-11, | |
| "loss": 0.0, | |
| "reward": 0.9762873649597168, | |
| "reward_std": 0.5066816210746765, | |
| "rewards/correct_code_reward_func": 0.5833333432674408, | |
| "rewards/len_reward_func": 0.3929540067911148, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 126.97917175292969, | |
| "epoch": 1.992, | |
| "grad_norm": 1.7642641467833304, | |
| "kl": 0.02130126953125, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 0.7899810075759888, | |
| "reward_std": 0.38732415437698364, | |
| "rewards/correct_code_reward_func": 0.3750000149011612, | |
| "rewards/len_reward_func": 0.41498102247714996, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "step": 125, | |
| "total_flos": 0.0, | |
| "train_loss": 1.9367338650191358e-05, | |
| "train_runtime": 3648.0047, | |
| "train_samples_per_second": 0.206, | |
| "train_steps_per_second": 0.034 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |