【发布时间】:2021-08-11 18:44:38
【问题描述】:
您好,我在 TF 2 中实现的 DDPG 模型在 openai-gym 上的每个环境中都得到了可怕的结果,这些环境具有连续的动作,我需要帮助来找出问题所在。我在我的 GPU 上运行它。在 env Pendulum 上,我每集都会获得 -1200/-1000 奖励。这段代码来自我在 udemy 上的一门课程,但它是用 TF1.x 编写的,我在 TF2 中重写了它,但他的 TF1.x 实现效果更好。代码如下:
import tensorflow as tf
import numpy as np
import os
import gym
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model
class ReplayBuffer():
def __init__(self, obs_dim, act_dim, size):
self.obs1_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
self.obs2_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
self.act_buf = np.zeros([size, act_dim], dtype=np.float32)
self.reward_buf = np.zeros(size, dtype=np.float32)
self.done_buf = np.zeros(size, dtype=np.float32)
self.current = 0
self.count = 0
self.size = size
def add_experience(self, state, action, reward, next_state, done):
self.obs1_buf[self.current] = state
self.act_buf[self.current] = action
self.reward_buf[self.current] = reward
self.obs2_buf[self.current] = next_state
self.done_buf[self.current] = done
self.current = (self.current + 1) % self.size
self.count = min(self.count+1, self.size)
def sample_batch(self, batch_size=32):
idx = np.random.randint(0, self.count, size=batch_size)
return dict(s=self.obs1_buf[idx],
s2=self.obs2_buf[idx],
a=self.act_buf[idx],
r=self.reward_buf[idx],
d=self.done_buf[idx])
class DDPG():
def __init__(self, env, num_states, num_actions, action_max):
self.env = env
self.num_states = num_states
self.num_actions = num_actions
self.action_max = action_max
self.gamma = 0.99
self.decay = 0.995
self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
def mu_model(hidden_layers):
inp = Input(shape=(self.num_states, ))
x = inp
for layers in hidden_layers[:-1]:
x = Dense(layers, activation='relu')(x)
x = Dense(hidden_layers[-1], activation='tanh')(x)
mu_model = Model(inp, x)
return mu_model
self.mu_model = mu_model([300, self.num_actions])
def q_model(inp_state, inp_act, hidden_layers):
inp_state = Input(shape=(inp_state, ))
inp_mu = Input(shape=(inp_act, ))
inp = concatenate([inp_state, inp_mu])
x = inp
for layers in hidden_layers[:-1]:
x = Dense(layers, activation='relu')(x)
x = Dense(hidden_layers[-1], activation='linear')(x)
q_model = Model([inp_state, inp_mu], x)
return q_model
self.q_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
self.q_target_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
#Eself.mu_do_minimize = tf.function(self.mu_minimize, input_signature=[
#tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state')])
self.q_do_minimize = tf.function(self.q_minimize, input_signature=[
tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state'),
tf.TensorSpec(shape=(None, self.num_actions), dtype=tf.float32, name='action'),
tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='next_state'),
tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='reward'),
tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='done_flags')])
@tf.function
def train_mu(self, state):
with tf.GradientTape() as tape:
actions = self.mu_model(state, training=True)
critic_value = self.q_model([state, actions], training=True)
# Used `-value` as we want to maximize the value given
# by the critic for our actions
actor_loss = -tf.math.reduce_mean(critic_value)
actor_grad = tape.gradient(actor_loss, self.mu_model.trainable_variables)
self.mu_optimizer.apply_gradients(
zip(actor_grad, self.mu_model.trainable_variables)
)
def q_minimize(self, state, action, next_state, reward, done):
def calc_loss():
q_targ = reward + self.gamma * (1 - done) * self.q_target_model([next_state, action])
q = self.q_model([state, action])
cost = tf.reduce_mean((q - q_targ)**2)
return cost
self.q_optimizer.minimize(calc_loss, self.q_model.trainable_variables)
def train(self, state, action, reward, done, next_state):
state = np.atleast_2d(state)
next_state = np.atleast_2d(next_state)
action = np.atleast_2d(action)
reward = np.atleast_1d(reward)
done = np.atleast_1d(done)
self.update_target_net()
self.train_mu(state)
self.q_do_minimize(state, action, next_state, reward, done)
def update_target_net(self):
mu_weights = np.array(self.mu_model.get_weights())
q_weights = np.array(self.q_model.get_weights())
#print(mu_weights.shape)
#print(q_weights.shape)
mu_target_weights = np.array(self.mu_target_model.get_weights())
q_target_weights = np.array(self.q_target_model.get_weights())
self.q_target_model.set_weights(self.decay * q_weights + (1 - self.decay) * q_target_weights)
def get_action(self, states, noise=None):
if noise is None: noise = self.ACT_NOISE_SCALE
if len(states.shape) == 1: states = states.reshape(1,-1)
action = self.mu_model.predict_on_batch(states)[0]
if noise != 0:
action += noise * np.random.randn(self.num_actions)
action = np.clip(action, -self.action_max, self.action_max)
return action
def play_one(env, agent, replay_buffer, gamma=0.99, noise=0.1, max_episode_len=1000, start_steps=10000, num_train_ep=100, batch_size=100, test_ep_agent=25):
returns = []
num_steps = 0
for ep in range(num_train_ep):
s, ep_return, ep_len, d = env.reset(), 0, 0, False
while not (d or ep_len == max_episode_len):
env.render()
if num_steps > start_steps:
a = agent.get_action(s, noise)
else:
a = env.action_space.sample()
num_steps+=1
if num_steps == start_steps:
print("USING AGENT ACTIONS NOW")
s2, r, d, _ = env.step(a)
ep_return+=r
ep_len+=1
#print(s.shape)
d = False if ep_len == max_episode_len else d
replay_buffer.add_experience(s, a, r, s2, d)
s = s2
for _ in range(ep_len):
batch = replay_buffer.sample_batch()
state, next_state, action, reward, done = batch['s'], batch['s2'], batch['a'], batch['r'], batch['d']
loss = agent.train(state, action, reward, done, next_state)
returns.append(ep_return)
print('Iter:', ep, 'Rewards:', ep_return)
return returns
if __name__ == '__main__':
env = gym.make('Pendulum-v0')
obs_dim1 = env.observation_space.shape[0]
act_dim1 = env.action_space.shape[0]
action_max1 = env.action_space.high[0]
actor = DDPG(env, obs_dim1, act_dim1, action_max1)
replay_buffer = ReplayBuffer(obs_dim1, act_dim1, size=100000)
returns = play_one(env, actor, replay_buffer)
提前谢谢你!
【问题讨论】:
标签: python tensorflow keras reinforcement-learning openai-gym