epoch    0  train_reward=+0.01271  val_advantage=-0.02739
epoch   10  train_reward=-0.01056  val_advantage=-0.02739
epoch   20  train_reward=-0.07051  val_advantage=-0.02739
epoch   30  train_reward=-0.07403  val_advantage=-0.02739
epoch   40  train_reward=-0.12299  val_advantage=-0.02739
epoch   50  train_reward=-0.08713  val_advantage=-0.02739
epoch   60  train_reward=-0.00111  val_advantage=-0.02739
epoch   70  train_reward=-0.09614  val_advantage=-0.02739
epoch   80  train_reward=-0.07110  val_advantage=-0.02739
epoch   90  train_reward=-0.17650  val_advantage=-0.02739
epoch  100  train_reward=+0.20533  val_advantage=-0.02739
epoch  110  train_reward=+0.05896  val_advantage=-0.02739
epoch  120  train_reward=+0.01636  val_advantage=-0.02739
epoch  130  train_reward=-0.09593  val_advantage=-0.02739
epoch  140  train_reward=-0.14889  val_advantage=-0.02739
epoch  150  train_reward=-0.37466  val_advantage=-0.02739
epoch  160  train_reward=+0.01004  val_advantage=-0.02739
epoch  170  train_reward=-0.19538  val_advantage=-0.02739
epoch  180  train_reward=-0.12952  val_advantage=-0.02739
epoch  190  train_reward=+0.14019  val_advantage=-0.02739
epoch  199  train_reward=+0.11508  val_advantage=-0.02739
