Chapter06 Maximization Bias and Double Learning

The code is adapted from ShangtongZhang's chapter06/maximization_bias.py.

Because building the target policy in TD algorithms frequently involves a maximization operation, a maximum over estimated values is used implicitly as an estimate of the maximum value. This can introduce a significant positive bias, and this example discusses the problem with a simple MDP.

Problem description

This example illustrates, and then fixes, the positive bias that TD methods introduce when they treat the maximum of the estimated action values as an estimate of the true maximum value. It uses a simple MDP:

(figure: the simple MDP described below, with start state A, state B to its left, and terminal states at both ends)

A is the start state, and the small gray squares on the left and right are terminal states. State A has two actions, left and right: taking right ends the episode immediately with reward 0, while taking left moves to state B. State B has several actions, all of which lead to the left terminal state with a reward drawn from N(-0.1, 1), so the expected return of any trajectory that starts with (A, left) is -0.1.
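
The bias is easy to reproduce without any learning at all. The short sketch below is not part of the original notebook (the sample counts are arbitrary); it draws one noisy sample per action of B, whose true values are all -0.1, and shows that the maximum over the estimates is well above -0.1 on average even though the true maximum is -0.1.

import numpy as np

np.random.seed(0)
n_actions, n_repeats = 10, 10000

# one noisy "estimate" per action and repeat; every true action value is -0.1
samples = np.random.normal(loc=-0.1, scale=1.0, size=(n_repeats, n_actions))

true_max = -0.1                         # the best expected reward available from B
estimated_max = samples.max(axis=1)     # maximum over estimated values, per repeat

print(true_max, estimated_max.mean())   # the mean estimated maximum is far above -0.1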

Imports and constant definitions

# 6.7 Maximization Bias and Double Learning

import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
# the copy module provides shallow and deep copy operations
import copy

# state A
STATE_A = 0

# state B
STATE_B = 1

# use one terminal state
STATE_TERMINAL = 2

# starts from state A
STATE_START = STATE_A

# possible actions in A
ACTION_A_RIGHT = 0
ACTION_A_LEFT = 1

# probability for exploration
EPSILON = 0.1

# step size
ALPHA = 0.1

# discount for max value
GAMMA = 1.0

# possible actions in B (10 actions here)
ACTIONS_B = range(0, 10)

# all possible actions
STATE_ACTIONS = [[ACTION_A_RIGHT, ACTION_A_LEFT], ACTIONS_B]

# state action pair values, if a state is a terminal state, then the value is always 0
# in order: the action values of A [(A,right), (A,left)], of B [(B,0) ... (B,9)], and of the terminal state
INITIAL_Q = [np.zeros(2), np.zeros(len(ACTIONS_B)), np.zeros(1)]

# set up destination for each state and each action
# this list gives the next state by index; in earlier examples the next state was returned as a by-product of the take_action-style step function
# TRANSITION[state][action] = next-state
TRANSITION = [[STATE_TERMINAL, STATE_B], [STATE_TERMINAL] * len(ACTIONS_B)]
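
As a quick sanity check (not in the original file), the tables defined above can be exercised directly; the expected outputs are noted in the comments.

# sanity check of the constants above (illustration only)
print(len(INITIAL_Q[STATE_A]))              # 2  -> action values for (A,right), (A,left)
print(len(INITIAL_Q[STATE_B]))              # 10 -> one value per action in B
print(TRANSITION[STATE_A][ACTION_A_LEFT])   # 1  -> STATE_B
print(TRANSITION[STATE_A][ACTION_A_RIGHT])  # 2  -> STATE_TERMINAL
print(TRANSITION[STATE_B][3])               # 2  -> every action in B terminates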

The choose_action and take_action functions

# choose an action based on epsilon greedy algorithm
def choose_action(state, q_value):
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(STATE_ACTIONS[state])
    else:
        values_ = q_value[state]
        return np.random.choice([action_ for action_, value_ in enumerate(values_) if value_ == np.max(values_)])

# take @action in @state, return the reward
def take_action(state, action):
    if state == STATE_A:
        return 0
    return np.random.normal(-0.1, 1)
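
For example, a single epsilon-greedy step from A with the all-zero initial table looks like this (a small illustration, not part of the source file):

# one step from A, using the functions and constants defined above (illustration only)
q = copy.deepcopy(INITIAL_Q)
a = choose_action(STATE_A, q)      # all values are 0, so the greedy tie is broken at random
r = take_action(STATE_A, a)        # reward is always 0 when leaving A
s_next = TRANSITION[STATE_A][a]    # STATE_B if left was taken, STATE_TERMINAL if right
print(a, r, s_next)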

Train with Q-Learning and Double Q-Learning, and measure how often the policy chooses left in state A

# if there are two state-action value arrays, use Double Q-Learning,
# otherwise use normal Q-Learning
def q_learning(q1, q2=None):
    state = STATE_START
    # track the number of times action left is taken in state A
    left_count = 0
    while state != STATE_TERMINAL:
        if q2 is None:
            action = choose_action(state, q1)
        else:
            # derive an action from Q1 and Q2:
            # actions are chosen epsilon-greedily with respect to the sum Q1 + Q2
            action = choose_action(state, [item1 + item2 for item1, item2 in zip(q1, q2)])
        if state == STATE_A and action == ACTION_A_LEFT:
            left_count += 1
        reward = take_action(state, action)
        next_state = TRANSITION[state][action]

        if q2 is None:
            active_q = q1
            target = np.max(active_q[next_state])
        else:
            # with probability 0.5 decide whether to update q1 or q2
            if np.random.binomial(1, 0.5) == 1:
                active_q = q1
                target_q = q2
            else:
                active_q = q2
                target_q = q1
            # select the greedy action according to active_q (ties broken randomly)
            best_action = np.random.choice([action_ for action_, value_ in enumerate(active_q[next_state]) if value_ == np.max(active_q[next_state])])
            # evaluate that action with target_q to form the update target
            target = target_q[next_state][best_action]

        # Q-Learning update
        active_q[state][action] += ALPHA * (
            reward + GAMMA * target - active_q[state][action])
        state = next_state
    return left_count
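
The only difference between the two branches above is how the TD target for the next state is built: Q-Learning selects the greedy action and evaluates it with the same table, while Double Q-Learning selects it with one table and evaluates it with the other, which removes the positive bias. The toy numbers below are made up purely to illustrate the difference between the two targets.

import numpy as np

# hypothetical action values of state B under two independent estimates (made-up numbers)
q1_b = np.array([0.3, -0.2, 0.1, -0.4, 0.05, -0.1, 0.2, -0.3, 0.0, -0.05])
q2_b = np.array([-0.1, 0.25, -0.3, 0.1, -0.2, 0.0, -0.15, 0.05, -0.25, 0.1])

# Q-Learning target: select and evaluate with the same table
target_q = q1_b.max()                # 0.3, the same noisy estimate drives selection and evaluation

# Double Q-Learning target: select with q1, evaluate with q2
best_action = q1_b.argmax()          # action 0
target_double_q = q2_b[best_action]  # -0.1, an independent evaluation of the chosen action

print(target_q, target_double_q)

The experiment below runs 1,000 independent runs of 300 episodes each and plots how often left is taken from A. The 'Optimal' line sits at 5% because, with the epsilon-greedy behavior policy and EPSILON = 0.1, even an agent that already knows right is better still takes left on about 5% of episodes through exploration.
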
# Figure 6.7, 1,000 runs may be enough, the number of actions in state B will also affect the curves
def figure_6_7():
    # each independent run has 300 episodes
    episodes = 300
    runs = 1000
    left_counts_q = np.zeros((runs, episodes))
    left_counts_double_q = np.zeros((runs, episodes))
    for run in tqdm(range(runs)):
        q = copy.deepcopy(INITIAL_Q)
        q1 = copy.deepcopy(INITIAL_Q)
        q2 = copy.deepcopy(INITIAL_Q)
        for ep in range(0, episodes):
            left_counts_q[run, ep] = q_learning(q)
            left_counts_double_q[run, ep] = q_learning(q1, q2)
    left_counts_q = left_counts_q.mean(axis=0)
    left_counts_double_q = left_counts_double_q.mean(axis=0)

    plt.plot(left_counts_q, label='Q-Learning')
    plt.plot(left_counts_double_q, label='Double Q-Learning')
    plt.plot(np.ones(episodes) * 0.05, label='Optimal')
    plt.xlabel('episodes')
    plt.ylabel('% left actions from A')
    plt.legend()

    plt.savefig('./figure_6_7.png')
    plt.show()

figure_6_7()

(figure_6_7.png: fraction of left actions from A per episode for Q-Learning, Double Q-Learning, and the optimal 5% line)