import numpy as np


def join(str1, str2):
    """Join the two input strings with '-', which makes it easy to index the P and R dictionaries defined earlier."""
    return str1 + '-' + str2


def sample(MDP, Pi, timestep_max, number):
    ''' Randomly sample state sequences from the MDP under policy Pi.
    Each sequence consists of several (s, a, r, s_next) tuples; an episode is cut off
    after at most timestep_max time steps, and number sequences are sampled in total. '''
    S, A, P, R, gamma = MDP
    episodes = []  # all sampled sequences
    for _ in range(number):
        episode = []  # one sampled sequence
        timestep = 0
        s = S[np.random.randint(4)]  # randomly pick a state other than s5 as the starting point
        # a sampling run ends when the current state is terminal or the time step limit is reached
        while s != "s5" and timestep <= timestep_max:
            timestep += 1
            # choose an action in state s according to the policy
            rand, temp = np.random.rand(), 0
            for a_opt in A:
                temp += Pi.get(join(s, a_opt), 0)
                if temp > rand:
                    a = a_opt
                    r = R.get(join(s, a), 0)
                    break
            # draw the next state s_next from the state transition probabilities
            rand, temp = np.random.rand(), 0
            for s_opt in S:
                temp += P.get(join(join(s, a), s_opt), 0)
                if temp > rand:
                    s_next = s_opt
                    break
            episode.append((s, a, r, s_next))  # put the (s, a, r, s_next) tuple into the sequence
            s = s_next  # s_next becomes the current state for the next loop iteration
        episodes.append(episode)
    return episodes
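To make the expected dictionary layout concrete, here is a minimal sketch built around a hypothetical five-state chain MDP (not the MDP defined earlier in the text): every non-terminal state can either "stay" in place for reward 0 or "move" one step toward the terminal state for reward -1. The state names s1 to s5 are kept because sample hard-codes s5 as the terminal state and draws the starting state from the first four entries of S. Note the key format: Pi and R are indexed by "state-action" strings produced by join, while P is indexed by "state-action-next_state" strings.

# A hypothetical chain MDP used only to illustrate how sample() is called.
S = ["s1", "s2", "s3", "s4", "s5"]   # s5 is the terminal state
A = ["stay", "move"]
P, R, Pi = {}, {}, {}
for i, s in enumerate(S[:-1]):
    P[join(join(s, "stay"), s)] = 1.0         # "stay" keeps the agent in the same state
    P[join(join(s, "move"), S[i + 1])] = 1.0  # "move" advances one state toward s5
    R[join(s, "stay")] = 0.0                  # staying costs nothing
    R[join(s, "move")] = -1.0                 # each move costs -1
    Pi[join(s, "stay")] = 0.5                 # uniformly random policy
    Pi[join(s, "move")] = 0.5
gamma = 0.9
MDP = (S, A, P, R, gamma)

episodes = sample(MDP, Pi, timestep_max=20, number=1000)
print(episodes[0])  # e.g. [('s3', 'stay', 0.0, 's3'), ('s3', 'move', -1.0, 's4'), ...]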
3. Going backwards through each sequence, count the number of occurrences of each state and its total return
def MC(episodes, V, N, gamma):
    """For all sampled sequences episodes, compute the value V of every state and the number of
    times N each state appears; the discount factor gamma balances short-term and long-term gains."""
    for episode in episodes:
        G = 0
        # traverse a sequence from back to front, which makes it easy to accumulate the
        # reward from the current state onwards, i.e. the return G
        for i in range(len(episode) - 1, -1, -1):
            (s, a, r, s_next) = episode[i]
            G = r + gamma * G
            N[s] = N[s] + 1
            V[s] = V[s] + (G - V[s]) / N[s]  # incremental update of the running mean
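Continuing the hypothetical chain MDP and the episodes sampled in the sketch above, the state values can be estimated by initialising V and N to zero for every state and handing the sampled sequences to MC:

V = {s: 0 for s in S}  # value estimate of each state, initialised to 0
N = {s: 0 for s in S}  # number of times each state has been counted
MC(episodes, V, N, gamma)
print(V)  # states further from s5 need more "move" steps (each worth -1),
          # so their estimated values should come out more negative; s5 stays 0

Because MC traverses each sequence from the back, the return G from a state onwards is accumulated in a single pass, and the running mean V[s] is updated incrementally without having to store all returns for each state.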