Deep Deterministic Policy Gradient (DDPG)
- off-policy
- Actor-Critic structure
- sequential decision making
Principle
In a nutshell, DDPG = DQN + Actor-Critic.
DDPG borrows the structure that made DQN successful and uses it to improve the stability and convergence of Actor-Critic. To carry over the DQN idea, each of the two networks is further split in two. The Actor has an online (evaluation) network and a target network: the online network outputs the action that is actually executed in the environment, while the target network is only used when updating the value side. The Critic likewise has an online network and a target network. Both estimate the value of a state, but their inputs differ: the Critic target network evaluates the next state together with the action produced by the Actor target network, whereas the Critic online network takes the action the Actor actually applied as its input.
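For reference, this setup corresponds to the standard DDPG updates (the symbols below are introduced here for clarity, not taken from the original code): with online actor $\mu_\phi$, online critic $Q_\theta$ and target networks $\mu_{\phi'}$, $Q_{\theta'}$, the critic regresses toward a TD target built from the two target networks, and the actor follows the deterministic policy gradient (the terminal-state mask is omitted, matching the implementation below):

$$
y = r + \gamma\, Q_{\theta'}\!\big(s',\, \mu_{\phi'}(s')\big), \qquad
L_{\text{critic}} = \big(Q_\theta(s, a) - y\big)^2, \qquad
L_{\text{actor}} = -\,Q_\theta\!\big(s,\, \mu_\phi(s)\big).
$$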
Pseudocode
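A condensed textual outline of one DDPG iteration (standard algorithm, summarized here rather than reproduced from the paper):
- select an action with the online actor (plus exploration noise) and step the environment
- store the transition (s, a, r, s') in the replay buffer
- sample a random minibatch and compute the TD target y = r + γ Q'(s', μ'(s'))
- update the critic by minimizing the MSE between Q(s, a) and y
- update the actor by gradient ascent on Q(s, μ(s))
- soft-update both target networks by Polyak averaging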
Advantage
off-policy -> sample efficient: transitions in the replay buffer can be reused many times.
Disadvantage
If the Q-function approximator develops an incorrect sharp peak for some actions, the policy will quickly exploit that peak and then have brittle or incorrect behavior.
Implementation
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action in [-1, 1]."""
    def __init__(self, input_size, hidden_size, output_size):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, s):
        x = F.relu(self.linear1(s))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))  # squash the action to [-1, 1]
        return x
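Because of the final tanh, the actor's output lives in [-1, 1]; for environments with a different action range (Pendulum-v0 uses [-2, 2], for example) the action is usually rescaled before being passed to env.step. A minimal sketch, assuming a gym-style action_space with symmetric bounds (scale_action is a hypothetical helper, not part of the original code):

def scale_action(a, action_space):
    # Rescale a tanh-squashed action to the environment's bounds;
    # valid when low == -high, as in Pendulum-v0.
    return a * action_space.high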
class Critic(nn.Module):
    """Q-network: maps a (state, action) pair to a scalar value estimate."""
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, s, a):
        x = torch.cat([s, a], 1)  # concatenate state and action along the feature dimension
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x
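A quick shape sanity check for the critic (assuming Pendulum-like dimensions, s_dim = 3 and a_dim = 1, and the single-output critic used below; this snippet is illustrative, not part of the original code):

critic = Critic(3 + 1, 256, 1)
s = torch.randn(32, 3)   # batch of 32 states
a = torch.randn(32, 1)   # batch of 32 actions
q = critic(s, a)
print(q.shape)           # torch.Size([32, 1]) -- one Q-value per (state, action) pair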
class Skylark_DDPG():
    def __init__(self, env):
        self.env = env
        self.gamma = 0.99        # discount factor
        self.actor_lr = 0.001    # actor learning rate
        self.critic_lr = 0.001   # critic learning rate
        self.tau = 0.02          # soft-update coefficient for the target networks
        self.capacity = 10000    # replay buffer capacity
        self.batch_size = 32

        s_dim = self.env.observation_space.shape[0]
        a_dim = self.env.action_space.shape[0]

        # Online and target networks for both actor and critic;
        # the critic outputs a single Q-value per (state, action) pair.
        self.actor = Actor(s_dim, 256, a_dim)
        self.actor_target = Actor(s_dim, 256, a_dim)
        self.critic = Critic(s_dim + a_dim, 256, 1)
        self.critic_target = Critic(s_dim + a_dim, 256, 1)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.buffer = []

        # Start the target networks from the same weights as the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
    def act(self, s0):
        # Deterministic action from the online actor
        # (a full DDPG implementation usually adds exploration noise here).
        s0 = torch.tensor(s0, dtype=torch.float).unsqueeze(0)
        a0 = self.actor(s0).squeeze(0).detach().numpy()
        return a0

    def put(self, *transition):
        # Store (s0, a0, r1, s1) in a FIFO replay buffer of fixed capacity.
        if len(self.buffer) == self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)
    def learn(self):
        if len(self.buffer) < self.batch_size:
            return

        # Sample a random minibatch of transitions from the replay buffer.
        samples = random.sample(self.buffer, self.batch_size)
        s0, a0, r1, s1 = zip(*samples)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float)
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        def critic_learn():
            # TD target: r + gamma * Q'(s', mu'(s')), with both target networks detached.
            a1 = self.actor_target(s1).detach()
            y_true = r1 + self.gamma * self.critic_target(s1, a1).detach()
            y_pred = self.critic(s0, a0)
            loss_fn = nn.MSELoss()
            loss = loss_fn(y_pred, y_true)
            self.critic_optim.zero_grad()
            loss.backward()
            self.critic_optim.step()

        def actor_learn():
            # Deterministic policy gradient: maximize Q(s, mu(s)), i.e. minimize its negative.
            loss = -torch.mean(self.critic(s0, self.actor(s0)))
            self.actor_optim.zero_grad()
            loss.backward()
            self.actor_optim.step()

        def soft_update(net_target, net, tau):
            # Polyak averaging: target <- (1 - tau) * target + tau * online
            for target_param, param in zip(net_target.parameters(), net.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

        critic_learn()
        actor_learn()
        soft_update(self.critic_target, self.critic, self.tau)
        soft_update(self.actor_target, self.actor, self.tau)
    def train(self, num_episodes):
        for i in range(1, num_episodes + 1):
            s0 = self.env.reset()
            episode_reward = 0

            for t in range(1, 1000):
                # self.env.render()
                a0 = self.act(s0)
                s1, r1, done, _ = self.env.step(a0)
                self.put(s0, a0, r1, s1)

                episode_reward += r1
                s0 = s1
                self.learn()

                if done:
                    break

            print('Episode {} : {}'.format(i, episode_reward))
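A minimal usage sketch, assuming an old-API gym environment with a continuous action space (env.step returning a 4-tuple, as the train loop above expects), e.g. Pendulum-v0:

import gym

env = gym.make('Pendulum-v0')
agent = Skylark_DDPG(env)
agent.train(num_episodes=100)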