【发布时间】:2020-02-23 08:53:25
【问题描述】:
我正在尝试从其原始资源Andrej Karpathy blog 重新创建非常简单的策略梯度示例。在那篇文章中,您将找到带有权重列表和 Softmax 激活列表的 CartPole 和 Policy Gradient 示例。这是我重新创建的非常简单的 CartPole 策略梯度示例,效果很好。
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import copy
NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.rand(5, 2)
def policy(self, state):
z = state.dot(self.w)
exp = np.exp(z)
return exp/np.sum(exp)
def __softmax_grad(self, softmax):
s = softmax.reshape(-1,1)
return np.diagflat(s) - np.dot(s, s.T)
def grad(self, probs, action, state):
dsoftmax = self.__softmax_grad(probs)[action,:]
dlog = dsoftmax / probs[0,action]
grad = state.T.dot(dlog[None,:])
return grad
def update_with(self, grads, rewards):
for i in range(len(grads)):
# Loop through everything that happend in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
print("Grads update: " + str(np.sum(grads[i])))
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=probs[0])
next_state, reward, done,_ = env.step(action)
next_state = next_state[None,:]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
.
.
问题
我正在尝试做几乎相同的示例,但使用 Sigmoid 激活(只是为了简单起见)。 这就是我需要做的。将模型中的激活从 softmax 切换到 sigmoid。 这肯定会起作用(根据下面的解释)。但是我的 Policy Gradient 模型什么也没学到,而且保持随机。有什么建议吗?
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.rand(5, 1) - 0.5
# Our policy that maps state to action parameterized by w
# noinspection PyShadowingNames
def policy(self, state):
z = np.sum(state.dot(self.w))
return self.sigmoid(z)
def sigmoid(self, x):
s = 1 / (1 + np.exp(-x))
return s
def sigmoid_grad(self, sig_x):
return sig_x * (1 - sig_x)
def grad(self, probs, action, state):
dsoftmax = self.sigmoid_grad(probs)
dlog = dsoftmax / probs
grad = state.T.dot(dlog)
grad = grad.reshape(5, 1)
return grad
def update_with(self, grads, rewards):
if len(grads) < 50:
return
for i in range(len(grads)):
# Loop through everything that happened in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=[1 - probs, probs])
next_state, reward, done, _ = env.step(action)
next_state = next_state[None, :]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
绘制所有学习保持随机。调整超参数没有任何帮助。在示例图片下方。
参考文献:
更新
似乎下面的答案可以从图形中做一些工作。但这不是对数概率,甚至不是策略的梯度。并改变了 RL 梯度策略的整个目的。请检查上面的参考资料。我们下图后声明。
我需要我的策略的日志函数的梯度(这只是权重和sigmoid 激活)。如有任何问题,请告诉我。
【问题讨论】:
-
我建议你在Data Science Stack Exchange 上发布这个问题,因为它主要是一个理论问题(堆栈溢出主要用于编码问题)。您还将接触到更多在该领域有知识的人。
-
@Gilles-PhilippePaillé 我添加了代表问题的代码。我需要做的,只是修复一些激活的部分。请检查更新的答案。
-
To Derive Policy Gradients,这里是参考文章,其中包含相同类型安排的工作示例,希望您能详细了解:medium.com/@thechrisyoon/…。
-
@Jason Chia 你是对的,但我已经尝试过它并且在这种特殊情况下减少它通常不会产生明显的效果。
-
@JasonChia sigmoid 输出
[0, 1]范围内的实数,可以解释为积极行动的概率(例如,在 CartPole 中右转)。那么消极行动的概率(左转)是1 - sigmoid。这个概率的总和是 1。是的,这是一个标准的杆卡环境。
标签: python machine-learning math deep-learning reinforcement-learning