Breaking MADDPG down into its component algorithms
- Multi-Agent: multiple agents learn at the same time.
- Deep: like DQN, it uses a target network plus experience replay.
- Deterministic: the policy directly outputs a deterministic action.
- Policy Gradient: the model is optimized by gradient descent on the policy.
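To make the "Deterministic" part concrete, here is a tiny illustrative sketch (plain NumPy, not PARL code; the weight matrix w is made up) contrasting DQN, which picks an action by argmax over Q values, with a DDPG/MADDPG actor, which maps the observation straight to an action:

import numpy as np

# DQN: one Q value per discrete action, then take the argmax
q_values = np.array([0.1, 0.7, 0.2])
dqn_action = int(np.argmax(q_values))

# DDPG / MADDPG: the actor network maps the observation directly to an action
def deterministic_policy(obs, w):
    return np.tanh(obs @ w)          # continuous action vector, no argmax

obs = np.random.randn(4)             # a made-up 4-dimensional observation
w = np.random.randn(4, 2)            # hypothetical actor weights
ddpg_action = deterministic_policy(obs, w)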
Introduction to the PG algorithm
What is a multi-agent system, and what environments are there?
Understanding MADDPG from the PARL code
My original plan was to compare PARL's DDPG code with its MADDPG code. However, the two implementations were written by different authors, so the superficial differences are large and it is hard to isolate the actual algorithmic differences from a side-by-side reading. I therefore decided to walk through the MADDPG code on its own and skip the comparison.
Algorithm:
self.model = model
self.target_model = deepcopy(model)
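The deepcopy gives the Algorithm a separate target network whose weights lag behind the online model. How exactly sync_target() updates it is not shown in this post; a minimal sketch of the usual soft update, assuming parameters stored as a dict of NumPy arrays and a hypothetical rate TAU, looks like this:

import numpy as np

TAU = 0.01  # hypothetical soft-update rate, not taken from the PARL code

def soft_update(model_params, target_params, tau=TAU):
    """target <- tau * online + (1 - tau) * target, parameter by parameter."""
    for name, p in model_params.items():
        target_params[name] = tau * p + (1.0 - tau) * target_params[name]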
The Actor-Critic structure
The Actor takes the environment observation obs as input and outputs an action; the action output by the Actor, together with the corresponding observation obs, is then fed to the Critic, which outputs the Q value.
# Actor
def predict(self, obs):
    """ input:
            obs: observation, shape([B] + shape of obs_n[agent_index])
        output:
            act: action, shape([B] + shape of act_n[agent_index])
    """
    this_policy = self.model.policy(obs)
    this_action = SoftPDistribution(
        logits=this_policy,
        act_space=self.act_space[self.agent_index]).sample()
    return this_action

def predict_next(self, obs):
    """ input:
            obs: observation, shape([B] + shape of obs_n[agent_index])
        output:
            act: action, shape([B] + shape of act_n[agent_index])
    """
    next_policy = self.target_model.policy(obs)
    next_action = SoftPDistribution(
        logits=next_policy,
        act_space=self.act_space[self.agent_index]).sample()
    return next_action

# Critic
def Q(self, obs_n, act_n):
    """ input:
            obs_n: all agents' observations, shape([B] + shape of obs_n)
            act_n: all agents' actions, shape([B] + shape of act_n)
        output:
            Q: Q value of this agent, shape([B])
    """
    return self.model.value(obs_n, act_n)

def Q_next(self, obs_n, act_n):
    """ input:
            obs_n: all agents' observations, shape([B] + shape of obs_n)
            act_n: all agents' actions, shape([B] + shape of act_n)
        output:
            Q: target Q value of this agent, shape([B])
    """
    return self.target_model.value(obs_n, act_n)
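The Algorithm only ever calls self.model.policy(obs), self.model.value(obs_n, act_n), and later get_actor_params() / get_critic_params(). The actual PARL model class is not reproduced in this post; a rough sketch of the interface it is assumed to provide:

class MAModelSketch:
    """Assumed interface only; not the real PARL model implementation."""

    def policy(self, obs):
        # Actor network: this agent's observation -> action logits
        raise NotImplementedError

    def value(self, obs_n, act_n):
        # Critic network: all agents' observations + actions -> Q value
        raise NotImplementedError

    def get_actor_params(self):
        # parameters updated by _actor_learn
        raise NotImplementedError

    def get_critic_params(self):
        # parameters updated by _critic_learn
        raise NotImplementedError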
Updating the Actor network's parameters
def _actor_learn(self, obs_n, act_n):
    i = self.agent_index
    this_policy = self.model.policy(obs_n[i])
    sample_this_action = SoftPDistribution(
        logits=this_policy,
        act_space=self.act_space[self.agent_index]).sample()

    # copy the batch actions, replacing this agent's action with the freshly sampled one
    action_input_n = act_n + []
    action_input_n[i] = sample_this_action
    eval_q = self.Q(obs_n, action_input_n)
    act_cost = layers.reduce_mean(-1.0 * eval_q)

    act_reg = layers.reduce_mean(layers.square(this_policy))

    cost = act_cost + act_reg * 1e-3

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
        param_list=self.model.get_actor_params())

    optimizer = fluid.optimizer.AdamOptimizer(self.lr)
    optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
    return cost
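For continuous deterministic policies, the MADDPG paper writes the actor update as the deterministic policy gradient taken through the centralized critic:

\nabla_{\theta_i} J(\mu_i) = \mathbb{E}_{x, a \sim \mathcal{D}}\left[ \nabla_{\theta_i} \mu_i(o_i)\, \nabla_{a_i} Q_i^{\mu}(x, a_1, \dots, a_N)\big|_{a_i = \mu_i(o_i)} \right]

In this discrete-action implementation the action is instead sampled from a SoftPDistribution over the policy logits, and the cost being minimized is simply the negative mean Q plus a small regularizer on the logits (act_reg * 1e-3), with gradients clipped to norm 0.5.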
Updating the Critic network's parameters
def _critic_learn(self, obs_n, act_n, target_q):
    pred_q = self.Q(obs_n, act_n)
    cost = layers.reduce_mean(layers.square_error_cost(pred_q, target_q))

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
        param_list=self.model.get_critic_params())

    optimizer = fluid.optimizer.AdamOptimizer(self.lr)
    optimizer.minimize(cost, parameter_list=self.model.get_critic_params())
    return cost
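The Agent's build_program below calls self.alg.learn(obs_n, act_n, target_q). That method is not reproduced in this post; presumably it just chains the two updates above, roughly like this sketch:

def learn(self, obs_n, act_n, target_q):
    # assumption: update the actor first, then the critic, and expose the critic cost
    actor_cost = self._actor_learn(obs_n, act_n)
    critic_cost = self._critic_learn(obs_n, act_n, target_q)
    return critic_cost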
Setting up the Agent: build_program
def build_program(self):
    self.pred_program = fluid.Program()    # Actor
    self.learn_program = fluid.Program()   # Critic
    self.next_q_program = fluid.Program()  # target Critic
    self.next_a_program = fluid.Program()  # target Actor

    with fluid.program_guard(self.pred_program):
        # Actor: takes the environment observation, outputs an action
        obs = layers.data(
            name='obs',
            shape=[self.obs_dim_n[self.agent_index]],
            dtype='float32')
        self.pred_act = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        # Critic: takes the observations and the corresponding Actor actions, outputs the score Q
        # environment observations of all agents
        obs_n = [
            layers.data(
                name='obs' + str(i),
                shape=[self.obs_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        # actions the Actors produced for those observations
        act_n = [
            layers.data(
                name='act' + str(i),
                shape=[self.act_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        target_q = layers.data(name='target_q', shape=[], dtype='float32')
        self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

    with fluid.program_guard(self.next_q_program):
        # target Critic: same inputs as the Critic; its Q output is used to stabilize the Q target
        # environment observations of all agents
        obs_n = [
            layers.data(
                name='obs' + str(i),
                shape=[self.obs_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        # actions the Actors produced for those observations
        act_n = [
            layers.data(
                name='act' + str(i),
                shape=[self.act_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        self.next_Q = self.alg.Q_next(obs_n, act_n)

    with fluid.program_guard(self.next_a_program):
        # target Actor: takes the environment observation, outputs an action
        obs = layers.data(
            name='obs',
            shape=[self.obs_dim_n[self.agent_index]],
            dtype='float32')
        self.next_action = self.alg.predict_next(obs)

    if self.speedup:
        self.pred_program = parl.compile(self.pred_program)
        self.learn_program = parl.compile(self.learn_program, self.critic_cost)
        self.next_q_program = parl.compile(self.next_q_program)
        self.next_a_program = parl.compile(self.next_a_program)
A program whose only input is obs belongs to the Actor, because the Actor only needs the environment observation to output an action. A program that takes both obs and act belongs to the Critic: the Critic scores the Actor's action act under the observation obs, and that score is the Q value.
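At rollout time only pred_program is needed. A hedged sketch of how the Agent might wrap it (the method name predict and the exact shape handling are my assumptions, not code shown in this post):

import numpy as np

def predict(self, obs):
    obs = np.expand_dims(obs, axis=0).astype('float32')   # add the batch dimension
    act = self.fluid_executor.run(
        self.pred_program,
        feed={'obs': obs},
        fetch_list=[self.pred_act])[0]
    return act[0]                                          # drop the batch dimension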
The core of the MADDPG algorithm
def learn(self, agents):
    self.global_train_step += 1

    # only update parameters every 100 steps
    if self.global_train_step % 100 != 0:
        return 0.0

    # only learn once the replay memory holds enough experience
    if self.rpm.size() <= self.min_memory_size:
        return 0.0

    # sample from the replay memories: the observations, the actions taken
    # under those observations, and the observations after the actions
    batch_obs_n = []
    batch_act_n = []
    batch_obs_new_n = []

    rpm_sample_index = self.rpm.make_index(self.batch_size)
    for i in range(self.n):
        batch_obs, batch_act, _, batch_obs_new, _ \
            = agents[i].rpm.sample_batch_by_index(rpm_sample_index)
        batch_obs_n.append(batch_obs)
        batch_act_n.append(batch_act)
        batch_obs_new_n.append(batch_obs_new)
    _, _, batch_rew, _, batch_isOver \
        = self.rpm.sample_batch_by_index(rpm_sample_index)

    # compute target q
    target_q = 0.0
    target_act_next_n = []
    for i in range(self.n):
        feed = {'obs': batch_obs_new_n[i]}
        target_act_next = agents[i].fluid_executor.run(
            agents[i].next_a_program,  # each agent samples its own next action
            feed=feed,
            fetch_list=[agents[i].next_action])[0]
        target_act_next_n.append(target_act_next)
    feed_obs = {'obs' + str(i): batch_obs_new_n[i] for i in range(self.n)}
    feed_act = {'act' + str(i): target_act_next_n[i] for i in range(self.n)}
    feed = feed_obs.copy()
    feed.update(feed_act)  # merge the two dicts
    target_q_next = self.fluid_executor.run(
        self.next_q_program,  # target Critic with global information, used to stabilize Q_target
        feed=feed,
        fetch_list=[self.next_Q])[0]
    target_q += (
        batch_rew + self.alg.gamma * (1.0 - batch_isOver) * target_q_next)

    feed_obs = {'obs' + str(i): batch_obs_n[i] for i in range(self.n)}
    feed_act = {'act' + str(i): batch_act_n[i] for i in range(self.n)}
    target_q = target_q.astype('float32')
    feed = feed_obs.copy()
    feed.update(feed_act)
    feed['target_q'] = target_q
    critic_cost = self.fluid_executor.run(
        self.learn_program,  # train the Critic that observes the global state
        feed=feed,
        fetch_list=[self.critic_cost])[0]

    self.alg.sync_target()
    return critic_cost
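In the paper's notation, the target the loop above builds (with the extra (1 - isOver) termination mask used in the code) and the loss passed to the Critic are:

y = r_i + \gamma\,(1 - d)\, Q_i^{\mu'}\!\left(x', a_1', \dots, a_N'\right)\Big|_{a_j' = \mu_j'(o_j')},
\qquad
\mathcal{L}(\theta_i) = \mathbb{E}\left[\left(Q_i^{\mu}(x, a_1, \dots, a_N) - y\right)^2\right]

Each agent's target Actor produces its own next action from its own next observation, while the target Critic sees all of them at once; that is exactly the "centralized training, decentralized execution" idea.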
Reproducing the "eagle catches chicks" game environment
This game environment can be found in OpenAI's code repository. It provides six scenarios in total, ranging from simple to complex. Since it is a chase game and the official scenario names do not translate well, I call this environment "eagle catches chicks" (a Chinese children's chase game). First, set up the dependencies:
# AIStudio's pre-installed parl is too old and tends to conflict with other libraries, so uninstall it first
!pip uninstall -y parl
# optional: uninstalling these two before importing parl avoids warnings on AIStudio; keeping them does not affect parl
!pip uninstall -y pandas scikit-learn
!pip install paddlepaddle-gpu==1.6.3.post97 -i https://mirror.baidu.com/pypi/simple
!pip install parl==1.3.1
# gym must be version 0.10.5, otherwise errors are raised
!pip install gym==0.10.5 -i https://mirror.baidu.com/pypi/simple
!git clone https://github.com/openai/multiagent-particle-envs  # if this fails in the notebook, run it from a terminal instead
!cd multiagent-particle-envs && pip install -e .
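A quick smoke test of the installation, assuming the repository's make_env.py helper and the 'simple_tag' (predator-prey) scenario, which is the chase game described above:

from make_env import make_env

env = make_env('simple_tag')               # the chase ("eagle catches chicks") scenario
obs_n = env.reset()                        # one observation per agent
print(env.n)                               # number of agents
print([o.shape for o in obs_n])            # per-agent observation shapes
print(env.action_space)                    # per-agent action spaces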
Back to the paper
Finally, let us return to the paper itself.
Summary and outlook
MADDPG is an improvement built on top of DDPG. Its core ideas are twofold: on the one hand, it inherits DDPG's Actor-Critic (actor-critic) structure; on the other hand, on top of that structure, each agent's Actor samples its actions independently, while each agent's Critic has access to global information and uses it to guide the Actor's actions.