我的 DDPG 模型 TF2 得到了可怕的结果答案

【问题标题】：I get horrible results with my DDPG model TF2我的 DDPG 模型 TF2 得到了可怕的结果
【发布时间】：2021-08-11 18:44:38
【问题描述】：

您好，我在 TF 2 中实现的 DDPG 模型在 openai-gym 上的每个环境中都得到了可怕的结果，这些环境具有连续的动作，我需要帮助来找出问题所在。我在我的 GPU 上运行它。在 env Pendulum 上，我每集都会获得 -1200/-1000 奖励。这段代码来自我在 udemy 上的一门课程，但它是用 TF1.x 编写的，我在 TF2 中重写了它，但他的 TF1.x 实现效果更好。代码如下：

import tensorflow as tf
import numpy as np
import os
import gym
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model

class ReplayBuffer():
    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
        self.act_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.reward_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.current = 0
        self.count = 0 
        self.size = size

    def add_experience(self, state, action, reward, next_state, done):
        self.obs1_buf[self.current] = state
        self.act_buf[self.current] = action
        self.reward_buf[self.current] = reward
        self.obs2_buf[self.current] = next_state
        self.done_buf[self.current] = done
        self.current = (self.current + 1) % self.size
        self.count = min(self.count+1, self.size)

    def sample_batch(self, batch_size=32):
        idx = np.random.randint(0, self.count, size=batch_size)
        return dict(s=self.obs1_buf[idx],
                    s2=self.obs2_buf[idx],
                    a=self.act_buf[idx],
                    r=self.reward_buf[idx],
                    d=self.done_buf[idx])


class DDPG():
    def __init__(self, env, num_states, num_actions, action_max):

        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.action_max = action_max
        self.gamma = 0.99
        self.decay = 0.995
    
        self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
        self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

        def mu_model(hidden_layers):
            inp = Input(shape=(self.num_states, ))
            x = inp

            for layers in hidden_layers[:-1]:
                x = Dense(layers, activation='relu')(x)
            x = Dense(hidden_layers[-1], activation='tanh')(x)

            mu_model = Model(inp, x)
            
            return mu_model
        
        self.mu_model = mu_model([300, self.num_actions])

        def q_model(inp_state, inp_act, hidden_layers):
            inp_state = Input(shape=(inp_state, ))
            inp_mu = Input(shape=(inp_act, ))
            inp = concatenate([inp_state, inp_mu])
            x = inp

            for layers in hidden_layers[:-1]:
                x = Dense(layers, activation='relu')(x)
            x = Dense(hidden_layers[-1], activation='linear')(x)

            q_model = Model([inp_state, inp_mu], x)
            return q_model

        self.q_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
        
        self.q_target_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])

        #Eself.mu_do_minimize = tf.function(self.mu_minimize, input_signature=[
                                                            #tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state')])
        
        self.q_do_minimize = tf.function(self.q_minimize, input_signature=[
                                                            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state'),
                                                            tf.TensorSpec(shape=(None, self.num_actions), dtype=tf.float32, name='action'),
                                                            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='next_state'),
                                                            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='reward'),
                                                            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='done_flags')])

    @tf.function
    def train_mu(self, state):
        with tf.GradientTape() as tape:
            actions = self.mu_model(state, training=True)
            critic_value = self.q_model([state, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.mu_model.trainable_variables)
        self.mu_optimizer.apply_gradients(
            zip(actor_grad, self.mu_model.trainable_variables)
        )
        
    

    def q_minimize(self, state, action, next_state, reward, done):
        def calc_loss():
            q_targ = reward + self.gamma * (1 - done) * self.q_target_model([next_state, action])
            q = self.q_model([state, action])
            cost = tf.reduce_mean((q - q_targ)**2)
            return cost
        self.q_optimizer.minimize(calc_loss, self.q_model.trainable_variables)


    def train(self, state, action, reward, done, next_state):
        state = np.atleast_2d(state)
        next_state = np.atleast_2d(next_state)
        action = np.atleast_2d(action)
        reward = np.atleast_1d(reward)
        done = np.atleast_1d(done)

        self.update_target_net()
        self.train_mu(state)
        self.q_do_minimize(state, action, next_state, reward, done)


    def update_target_net(self):

        mu_weights = np.array(self.mu_model.get_weights())
        q_weights = np.array(self.q_model.get_weights())
        #print(mu_weights.shape)
        #print(q_weights.shape)
        
        mu_target_weights = np.array(self.mu_target_model.get_weights())
        q_target_weights = np.array(self.q_target_model.get_weights())
        
        
        
        self.q_target_model.set_weights(self.decay * q_weights + (1 - self.decay) * q_target_weights)


    def get_action(self, states, noise=None):
        if noise is None: noise = self.ACT_NOISE_SCALE
        if len(states.shape) == 1: states = states.reshape(1,-1)
        action = self.mu_model.predict_on_batch(states)[0]
        if noise != 0:
            action += noise * np.random.randn(self.num_actions)
            action = np.clip(action, -self.action_max, self.action_max)
        return action


def play_one(env, agent, replay_buffer, gamma=0.99, noise=0.1, max_episode_len=1000, start_steps=10000, num_train_ep=100, batch_size=100, test_ep_agent=25):
    returns = []
    num_steps = 0

    for ep in range(num_train_ep):
        s, ep_return, ep_len, d = env.reset(), 0, 0, False

        while not (d or ep_len == max_episode_len):
            env.render()
            if num_steps > start_steps:
                a = agent.get_action(s, noise)
            else:
                a = env.action_space.sample()

            num_steps+=1
            if num_steps == start_steps:
                print("USING AGENT ACTIONS NOW")

            s2, r, d, _ = env.step(a)
            ep_return+=r
            ep_len+=1
            #print(s.shape)
            d = False if ep_len == max_episode_len else d

            replay_buffer.add_experience(s, a, r, s2, d)

            s = s2
        for _ in range(ep_len):
            batch = replay_buffer.sample_batch()
            state, next_state, action, reward, done = batch['s'], batch['s2'], batch['a'], batch['r'], batch['d']

            loss = agent.train(state, action, reward, done, next_state)

        returns.append(ep_return)
        print('Iter:', ep, 'Rewards:', ep_return)


    
    return returns
    
if __name__ == '__main__':
    
    env = gym.make('Pendulum-v0')
    obs_dim1 = env.observation_space.shape[0]
    
    act_dim1 = env.action_space.shape[0]
    
    action_max1 = env.action_space.high[0]
    actor = DDPG(env, obs_dim1, act_dim1, action_max1)
    replay_buffer = ReplayBuffer(obs_dim1, act_dim1, size=100000)

    returns = play_one(env, actor, replay_buffer)

提前谢谢你！

【问题讨论】：

标签： python tensorflow keras reinforcement-learning openai-gym

【解决方案1】：

首先想到的是学习率：0.01 太高了，即使对于钟摆也是如此。尝试较低的学习率（例如，演员为 1e-3，评论者为 5e-3）。

您的代码中还有几件事：

参与者没有目标网络。这是为什么？ IIRC ddpg 为演员和评论家提供目标网络。
通常最好使用相同的参数初始化主网络和目标网络。您可以使用target_model.set_weights(model.get_weights()) 做到这一点
在函数play_one 中，训练步骤在播放一整集后完成。这可能没问题，但没有必要：因为 pendulum 不是实时的，所以你不需要你的代码很快，所以你可以边玩边训练。

如果你想看看我不久前在 tensorflow 2 中实现了 ddpg。它解决了 80 集的钟摆问题。 GitHub

【讨论】：