Classic Reinforcement Learning Algorithm Notes (8): Solving POMDP Problems with an LSTM-Augmented A2C Algorithm
I recently needed to build an agent with an LSTM and came across a very concise, readable example implementation:
https://github.com/HaiyinPiao/pytorch-a2clstm-DRQN
The environment is CartPole-v1. The original state is a 4-dimensional vector; here the second dimension, the cart velocity, is removed, keeping only the cart position, the pole angle, and the pole angular velocity. This turns the task from an MDP into a POMDP (Partially Observable Markov Decision Process).
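Dropping the velocity component can also be packaged as a small observation wrapper. The sketch below is only illustrative (the MaskCartVelocity class is my own name, not part of the original code); it does the same thing as the np.delete calls scattered through the script that follows.

import numpy as np
import gym

class MaskCartVelocity(gym.ObservationWrapper):
    """Remove the cart-velocity entry (index 1) from CartPole's 4-D observation."""
    def __init__(self, env):
        super().__init__(env)
        low = np.delete(env.observation_space.low, 1)
        high = np.delete(env.observation_space.high, 1)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def observation(self, obs):
        return np.delete(obs, 1)

# usage: env = MaskCartVelocity(gym.make("CartPole-v1"))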
The code is as follows:
Import the necessary packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import math
import random
import os
import gym
Parameter settings
STATE_DIM = 4 - 1    # drop the cart-velocity dimension so the task becomes a POMDP
ACTION_DIM = 2       # size of the action space
NUM_EPISODE = 5000   # number of training episodes
EPISODE_LEN = 1000   # maximum length of a rollout
A_HIDDEN = 40        # number of hidden units in the actor network
C_HIDDEN = 40        # number of hidden units in the critic network
Actor and critic networks
# The ActorNetwork uses an LSTM + MLP so the recurrent state can stand in for the unobserved part of the state
class ActorNetwork(nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        super(ActorNetwork, self).__init__()
        self.lstm = nn.LSTM(in_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, out_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        x = F.log_softmax(x, 2)  # log(softmax(x)) over the action dimension
        return x, hidden
# The ValueNetwork (critic) has the same LSTM + MLP structure and outputs a scalar state value
class ValueNetwork(nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        super(ValueNetwork, self).__init__()
        self.lstm = nn.LSTM(in_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, out_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        return x, hidden
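With batch_first=True, both networks expect input of shape (batch, seq, features) and hidden states of shape (num_layers, batch, hidden). A minimal sanity check of the actor's shapes (the tensors below are illustrative, not part of the training script) might look like this:

obs_seq = torch.randn(1, 5, STATE_DIM)   # batch of 1, sequence of 5 partial observations
h0 = torch.zeros(1, 1, A_HIDDEN)         # (num_layers, batch, hidden)
c0 = torch.zeros(1, 1, A_HIDDEN)
actor = ActorNetwork(STATE_DIM, A_HIDDEN, ACTION_DIM)
log_probs, (hn, cn) = actor(obs_seq, (h0, c0))
print(log_probs.shape)                   # torch.Size([1, 5, 2]), one log-probability vector per step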
Run a single rollout and record the trajectory
def roll_out(actor_network, env, episode_len, value_network, init_state):
    '''
    Roll out for at most episode_len (1000) frames.
    Returns:
        the state sequence (excluding the terminal state),
        the action sequence (one-hot encoded),
        the reward sequence (excluding the terminal reward),
        the bootstrap value final_r,
        state: the state from which the next rollout should start
    '''
    states = []
    actions = []
    rewards = []
    is_done = False
    final_r = 0
    state = init_state  # initial state of this rollout
    # initialise the LSTM hidden and cell states, shape (num_layers, batch, hidden)
    a_hx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
    a_cx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
    c_hx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
    c_cx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
    for j in range(episode_len):
        states.append(state)
        log_softmax_action, (a_hx, a_cx) = actor_network(Variable(torch.Tensor([state]).unsqueeze(0)), (a_hx, a_cx))
        # this part could also be written with torch.distributions.Categorical
        # (see the sketch after this function)
        softmax_action = torch.exp(log_softmax_action)  # exponentiate the log-softmax to recover positive probabilities
        action = np.random.choice(ACTION_DIM, p=softmax_action.cpu().data.numpy()[0][0])
        # one-hot encode the action
        one_hot_action = [int(k == action) for k in range(ACTION_DIM)]
        next_state, reward, done, _ = env.step(action)
        next_state = np.delete(next_state, 1)
        # fix_reward = -10 if done else 1
        actions.append(one_hot_action)
        rewards.append(reward)
        final_state = next_state  # final_state simply tracks the most recent next_state
        state = next_state
        if done:
            is_done = True
            state = env.reset()
            state = np.delete(state, 1)
            a_hx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
            a_cx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
            c_hx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
            c_cx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
            # print the episode length, i.e. this episode's total score
            print(j + 1)
            break
    if not is_done:  # if the episode has not ended after episode_len frames, bootstrap the terminal value with the critic
        c_out, (c_hx, c_cx) = value_network(Variable(torch.Tensor([final_state]).unsqueeze(0)), (c_hx, c_cx))
        final_r = c_out.cpu().data.numpy()[0][0][0]  # scalar bootstrap value; if the episode ended normally, final_r stays 0 (the cart lost control, so the terminal state is worth nothing)
    return states, actions, rewards, final_r, state
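As the comment inside roll_out mentions, the manual exp + np.random.choice sampling could instead use torch.distributions.Categorical. A minimal sketch of that alternative (not part of the original script; variable names match those inside the loop above):

from torch.distributions import Categorical

# log_softmax_action has shape (1, 1, ACTION_DIM); Categorical accepts log-probabilities via logits
dist = Categorical(logits=log_softmax_action[0, 0])
action = dist.sample()              # tensor holding 0 or 1
log_prob = dist.log_prob(action)    # log pi(a|s), differentiable, could feed the policy loss directly
action = action.item()              # plain int for env.step(action)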
Function for computing the cumulative discounted return
def discount_reward(r, gamma, final_r):
    '''
    r: list of per-step rewards
    final_r: scalar bootstrap value of the state reached after the last step
    '''
    discounted_r = np.zeros_like(r)
    running_add = final_r
    for t in reversed(range(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
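A quick worked example of the backward recursion: with r = [1, 1, 1], gamma = 0.99 and final_r = 0, the returns are accumulated from the last step backwards.

# G_2 = 0 * 0.99 + 1 = 1.0
# G_1 = 1.0 * 0.99 + 1 = 1.99
# G_0 = 1.99 * 0.99 + 1 = 2.9701
print(discount_reward([1.0, 1.0, 1.0], 0.99, 0))  # [2.9701 1.99   1.    ]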
Main function for training and testing
def main():
    # initialise the environment
    env = gym.make("CartPole-v1")
    init_state = env.reset()
    init_state = np.delete(init_state, 1)  # drop the cart-velocity dimension
    # initialise the value (critic) network
    value_network = ValueNetwork(in_size=STATE_DIM, hidden_size=C_HIDDEN, out_size=1)
    value_network_optim = torch.optim.Adam(value_network.parameters(), lr=0.005)
    # initialise the actor network
    actor_network = ActorNetwork(in_size=STATE_DIM, hidden_size=A_HIDDEN, out_size=ACTION_DIM)
    actor_network_optim = torch.optim.Adam(actor_network.parameters(), lr=0.001)
    steps = []
    task_episodes = []
    test_results = []
    for episode in range(NUM_EPISODE):
        # run one rollout
        states, actions, rewards, final_r, current_state = roll_out(actor_network, env, EPISODE_LEN, value_network, init_state)
        # states is a list of shape [epi_len, 3]
        # the state to start from after this rollout
        init_state = current_state
        actions_var = Variable(torch.Tensor(actions).view(-1, ACTION_DIM)).unsqueeze(0)
        states_var = Variable(torch.Tensor(states).view(-1, STATE_DIM)).unsqueeze(0)
        # train the actor network
        a_hx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
        a_cx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
        c_hx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
        c_cx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
        actor_network_optim.zero_grad()
        # print(states_var.unsqueeze(0).size())
        log_softmax_actions, (a_hx, a_cx) = actor_network(states_var, (a_hx, a_cx))
        vs, (c_hx, c_cx) = value_network(states_var, (c_hx, c_cx))  # state-value estimates
        vs = vs.detach()  # the critic's output must not receive gradients from the actor loss
        # compute Q(s,a) and the advantage
        qs = Variable(torch.Tensor(discount_reward(rewards, 0.99, final_r)))
        qs = qs.view(1, -1, 1)
        advantages = qs - vs
        # print('adv,', advantages.shape)
        # log_softmax_actions * actions_var uses the one-hot encoding to pick out the log-probability of the chosen action;
        # summing over the action dimension (dim 2) keeps one log-probability per time step
        actor_network_loss = - torch.mean(torch.sum(log_softmax_actions * actions_var, 2, keepdim=True) * advantages)
        actor_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor_network.parameters(), 0.5)
        actor_network_optim.step()
        # train the value network
        value_network_optim.zero_grad()
        target_values = qs
        a_hx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
        a_cx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
        c_hx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
        c_cx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
        values, (c_hx, c_cx) = value_network(states_var, (c_hx, c_cx))
        criterion = nn.MSELoss()
        value_network_loss = criterion(values, target_values)
        value_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(value_network.parameters(), 0.5)
        value_network_optim.step()
        # Testing
        if (episode + 1) % 50 == 0:
            result = 0
            test_task = gym.make("CartPole-v1")
            for test_epi in range(10):  # test on 10 episodes
                state = test_task.reset()
                state = np.delete(state, 1)
                a_hx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
                a_cx = torch.zeros(A_HIDDEN).unsqueeze(0).unsqueeze(0)
                c_hx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
                c_cx = torch.zeros(C_HIDDEN).unsqueeze(0).unsqueeze(0)
                for test_step in range(500):  # each test episode runs at most 500 frames
                    log_softmax_actions, (a_hx, a_cx) = actor_network(Variable(torch.Tensor([state]).view(1, 1, STATE_DIM)), (a_hx, a_cx))
                    softmax_action = torch.exp(log_softmax_actions)
                    # print(softmax_action.data)
                    action = np.argmax(softmax_action.data.numpy()[0])  # act greedily at test time
                    next_state, reward, done, _ = test_task.step(action)
                    next_state = np.delete(next_state, 1)
                    result += reward
                    state = next_state
                    if done:
                        break
            print("episode:", episode + 1, "test result:", result / 10.0)
            steps.append(episode + 1)
            test_results.append(result / 10)
            plt.plot(steps, test_results)
            plt.savefig('training_score.png')

if __name__ == '__main__':
    main()
The experimental results are as follows: