Chapter 11 counterexample

The code below is taken from ShangtongZhang's chapter11/counterexample.py.

We use Baird's counterexample to analyze the performance of off-policy approximation methods.

Problem description

This is the only example used in Chapter 11. It is built on the following MDP:

(Figure: Baird's counterexample MDP)

The dashed action moves the agent from any state to one of states 1-6 (chosen uniformly at random), while the solid action moves it from any state to the 7th state; every action's reward is 0. The off-policy behavior policy b selects the dashed and solid actions with probabilities 6/7 and 1/7, so the next-state distribution it produces is uniform. The target policy π always selects the solid action, so the on-policy distribution concentrates on the 7th state; here the on-policy distribution means the distribution of the states that generate weight updates.
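
For reference, the per-step importance sampling ratio under these two policies, which the code below computes as rho, works out to:

$$
\rho_t = \frac{\pi(A_t \mid S_t)}{b(A_t \mid S_t)} =
\begin{cases}
\dfrac{1}{1/7} = 7, & A_t = \text{solid} \\[4pt]
\dfrac{0}{6/7} = 0, & A_t = \text{dashed}
\end{cases}
$$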

The value of each state is approximated by the linear function written inside its circle. If the weight vector is the zero vector, the true values (which are all 0, since every reward is 0) are represented exactly.
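
Written out with the feature vectors defined in the code below, the approximate values are:

$$
\hat v(s, \mathbf{w}) = \mathbf{w}^\top \mathbf{x}(s) =
\begin{cases}
2 w_s + w_8, & s = 1, \dots, 6 \\
w_7 + 2 w_8, & s = 7
\end{cases}
$$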

By applying different off-policy approximation methods, we examine the two main challenges that off-policy learning faces relative to on-policy learning: the change of the update target and the change of the update distribution. In this example we will see that off-policy methods obtained by directly generalizing their on-policy counterparts, without addressing these two problems, mostly perform poorly.

Import modules and define constants

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from mpl_toolkits.mplot3d.axes3d import Axes3D

# all states: state 0-5 are upper states
STATES = np.arange(0, 7)
# state 6 is lower state
LOWER_STATE = 6
# discount factor
DISCOUNT = 0.99

# each state is represented by a vector of length 8
FEATURE_SIZE = 8
FEATURES = np.zeros((len(STATES), FEATURE_SIZE))
for i in range(LOWER_STATE):
    FEATURES[i, i] = 2
    FEATURES[i, 7] = 1
FEATURES[LOWER_STATE, 6] = 1
FEATURES[LOWER_STATE, 7] = 2

# all possible actions
DASHED = 0
SOLID = 1
ACTIONS = [DASHED, SOLID]

# reward is always zero
REWARD = 0

Define the target policy and the behavior policy, and use the step function to implement the interaction between the agent and the environment

# take @action at @state, return the new state
def step(state, action):
    if action == SOLID:
        return LOWER_STATE
    return np.random.choice(STATES[: LOWER_STATE])

# target policy
def target_policy(state):
    return SOLID

# behavior policy
BEHAVIOR_SOLID_PROBABILITY = 1.0 / 7

def behavior_policy(state):
    if np.random.binomial(1, BEHAVIOR_SOLID_PROBABILITY) == 1:
        return SOLID
    return DASHED
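
As a quick sanity check (my addition, not part of the original code), the snippet below estimates the next-state distribution produced by the behavior policy; it should come out roughly uniform:

# empirical check: the next-state distribution under the behavior policy is roughly uniform
counts = np.zeros(len(STATES))
samples = 70000
for _ in range(samples):
    s = np.random.choice(STATES)
    a = behavior_policy(s)
    counts[step(s, a)] += 1
print(counts / samples)  # each entry should be close to 1/7 ~ 0.143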

Perform one weight update with semi-gradient off-policy TD
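
The function below implements the standard semi-gradient off-policy TD(0) update with importance sampling (Chapter 11.1 of the book), restated here for reference:

$$
\delta_t = R_{t+1} + \gamma \hat v(S_{t+1}, \mathbf{w}_t) - \hat v(S_t, \mathbf{w}_t), \qquad
\mathbf{w}_{t+1} = \mathbf{w}_t + \alpha \rho_t \delta_t \nabla \hat v(S_t, \mathbf{w}_t)
$$

With the linear features used here, $\nabla \hat v(S_t, \mathbf{w}_t) = \mathbf{x}(S_t)$, which is why the gradient in the code is simply FEATURES[state, :].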

# Semi-gradient off-policy temporal difference
# @state: current state
# @theta: weight for each component of the feature vector
# @alpha: step size
# @return: next state
def semi_gradient_off_policy_TD(state, theta, alpha):
    action = behavior_policy(state)
    next_state = step(state, action)
    # get the importance ratio
    if action == DASHED:
        rho = 0.0
    else:
        rho = 1.0 / BEHAVIOR_SOLID_PROBABILITY
    # theta holds w1-w8, i.e. the weights to be learned
    delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - \
            np.dot(FEATURES[state, :], theta)
    # scale the TD error: ρ × α × δ_t
    delta *= rho * alpha
    # derivatives happen to be the same matrix due to the linearity
    theta += FEATURES[state, :] * delta
    return next_state

Perform one weight update (sweep) with semi-gradient DP
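
The DP variant replaces the sampled transition with the exact expectation under the target policy and sweeps over all states synchronously; the expected update (restated from the book's discussion of Baird's counterexample) is:

$$
\mathbf{w}_{k+1} = \mathbf{w}_k + \frac{\alpha}{|\mathcal{S}|} \sum_{s} \Big( \mathbb{E}_\pi\big[ R_{t+1} + \gamma \hat v(S_{t+1}, \mathbf{w}_k) \,\big|\, S_t = s \big] - \hat v(s, \mathbf{w}_k) \Big) \nabla \hat v(s, \mathbf{w}_k)
$$

Because the target policy always takes the solid action, the expectation reduces to the single transition into the 7th (lower) state.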

# Semi-gradient DP
# @theta: weight for each component of the feature vector
# @alpha: step size
def semi_gradient_DP(theta, alpha):
    delta = 0.0
    # go through all the states
    for state in STATES:
        expected_return = 0.0
        # compute bellman error for each state
        for next_state in STATES:
            # the DP update follows the target policy, which always takes the solid action,
            # so only the 7th (lower) state contributes to the Bellman error
            if next_state == LOWER_STATE:
                expected_return += REWARD + DISCOUNT * np.dot(theta, FEATURES[next_state, :])
        bellmanError = expected_return - np.dot(theta, FEATURES[state, :])
        # accumulate gradients
        delta += bellmanError * FEATURES[state, :]
    # derivatives happen to be the same matrix due to the linearity
    theta += alpha / len(STATES) * delta

Plot the weight trajectories of semi-gradient off-policy TD and semi-gradient DP

# Figure 11.2(left), semi-gradient off-policy TD
def figure_11_2_left():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10

    alpha = 0.01

    steps = 1000
    thetas = np.zeros((FEATURE_SIZE, steps))
    state = np.random.choice(STATES)
    for step in tqdm(range(steps)):
        state = semi_gradient_off_policy_TD(state, theta, alpha)
        thetas[:, step] = theta

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.xlabel('Steps')
    plt.ylabel('Theta value')
    plt.title('semi-gradient off-policy TD')
    plt.legend()

# Figure 11.2(right), semi-gradient DP
def figure_11_2_right():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10

    alpha = 0.01

    sweeps = 1000
    thetas = np.zeros((FEATURE_SIZE, sweeps))
    for sweep in tqdm(range(sweeps)):
        semi_gradient_DP(theta, alpha)
        thetas[:, sweep] = theta

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.xlabel('Sweeps')
    plt.ylabel('Theta value')
    plt.title('semi-gradient DP')
    plt.legend()

def figure_11_2():
    plt.figure(figsize=(10, 20))
    plt.subplot(2, 1, 1)
    figure_11_2_left()
    plt.subplot(2, 1, 2)
    figure_11_2_right()

    plt.savefig('./figure_11_2.png')
    plt.show()

# test
figure_11_2()
100%|██████████| 1000/1000 [00:00<00:00, 38275.13it/s]
100%|██████████| 1000/1000 [00:00<00:00, 13016.09it/s]

(Figure 11.2: theta components over steps/sweeps for semi-gradient off-policy TD and semi-gradient DP)

As the plots show, the weights fail to converge under both semi-gradient off-policy TD and semi-gradient DP.

Training with TDC
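
TDC (TD(0) with gradient correction, Chapter 11.7 of the book) maintains a second weight vector; with theta as the main weights $\mathbf{w}$ and weight as the auxiliary vector $\mathbf{v}$, the updates implemented below are:

$$
\mathbf{w}_{t+1} = \mathbf{w}_t + \alpha \rho_t \big( \delta_t \mathbf{x}_t - \gamma \mathbf{x}_{t+1} \mathbf{x}_t^\top \mathbf{v}_t \big), \qquad
\mathbf{v}_{t+1} = \mathbf{v}_t + \beta \rho_t \big( \delta_t - \mathbf{x}_t^\top \mathbf{v}_t \big) \mathbf{x}_t
$$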

# state distribution for the behavior policy
STATE_DISTRIBUTION = np.ones(len(STATES)) / 7

# build the diagonal matrix D with np.diag
STATE_DISTRIBUTION_MAT = np.matrix(np.diag(STATE_DISTRIBUTION))

# projection matrix for minimize MSVE
# X * inv(X.T * D * X) * X.T * D, see p. 218 of the book
PROJECTION_MAT = np.matrix(FEATURES) * \
                 np.linalg.pinv(np.matrix(FEATURES.T) * STATE_DISTRIBUTION_MAT * np.matrix(FEATURES)) * \
                 np.matrix(FEATURES.T) * \
                 STATE_DISTRIBUTION_MAT

# temporal difference with gradient correction, see p. 228 of the book
# @state: current state
# @theta: weight of each component of the feature vector
# @weight: auxiliary trace for gradient correction
# @alpha: step size of @theta
# @beta: step size of @weight
def TDC(state, theta, weight, alpha, beta):
    action = behavior_policy(state)
    next_state = step(state, action)
    # get the importance ratio
    if action == DASHED:
        rho = 0.0
    else:
        rho = 1.0 / BEHAVIOR_SOLID_PROBABILITY
    delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - \
            np.dot(FEATURES[state, :], theta)
    theta += alpha * rho * (delta * FEATURES[state, :] - DISCOUNT * FEATURES[next_state, :] * np.dot(FEATURES[state, :], weight))
    weight += beta * rho * (delta - np.dot(FEATURES[state, :], weight)) * FEATURES[state, :]
    return next_state

Training with expected TDC
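
The factor 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * rho in the code below simply writes out the expectation of the sampled TDC update under the behavior policy; worked out for the theta part:

$$
\mathbb{E}_b\big[\rho_t \delta_t \mathbf{x}_t\big] = \sum_{s} \mu_b(s) \sum_{a} b(a \mid s)\, \rho(s, a)\, \delta(s, a)\, \mathbf{x}(s) = \sum_{s} \frac{1}{7} \cdot \frac{1}{7} \cdot 7 \cdot \delta(s)\, \mathbf{x}(s)
$$

since $\mu_b$ is uniform over the 7 states and only the solid action (probability 1/7, ratio 7) has a nonzero importance ratio. The same factor multiplies the gradient-correction term and the auxiliary-weight update.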

# expected temporal difference with gradient correction
# @theta: weight of each component of the feature vector
# @weight: auxiliary trace for gradient correction
# @alpha: step size of @theta
# @beta: step size of @weight
def expected_TDC(theta, weight, alpha, beta):
    # as in semi-gradient DP above, a uniform state distribution is used and the current state is
    # never actually advanced, i.e. there is no agent-environment interaction
    for state in STATES:
        # When computing expected update target, if next state is not lower state, importance ratio will be 0,
        # so we can safely ignore this case and assume next state is always lower state
        delta = REWARD + DISCOUNT * np.dot(FEATURES[LOWER_STATE, :], theta) - np.dot(FEATURES[state, :], theta)
        rho = 1 / BEHAVIOR_SOLID_PROBABILITY
        # Under behavior policy, state distribution is uniform, so the probability for each state is 1.0 / len(STATES)
        # Compared with the sampled TDC, the update is scaled by 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY
        # to form the expectation: the probabilities of both x_t and x_{t+1} must be accounted for,
        # hence the product of the two probabilities
        expected_update_theta = 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * rho * (
            delta * FEATURES[state, :] - DISCOUNT * FEATURES[LOWER_STATE, :] * np.dot(weight, FEATURES[state, :]))
        theta += alpha * expected_update_theta
        expected_update_weight = 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * rho * (
            delta - np.dot(weight, FEATURES[state, :])) * FEATURES[state, :]
        weight += beta * expected_update_weight

Training with ETD
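
The expected update below averages the one-step emphatic-TD update (Chapter 11.8 of the book) over all states:

$$
M_t = \gamma \rho_{t-1} M_{t-1} + I_t, \qquad
\mathbf{w}_{t+1} = \mathbf{w}_t + \alpha M_t \rho_t \delta_t \nabla \hat v(S_t, \mathbf{w}_t)
$$

where $I_t$ is the interest (1 for every state here) and $M_t$ is the emphasis. In the code, rho plays the role of $\rho_{t-1}$: it is 7 when the current state is the lower state (which can only be reached by the solid action) and 0 otherwise, while the explicit 1 / BEHAVIOR_SOLID_PROBABILITY factor is $\rho_t$.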

# interest is 1 for every state
INTEREST = 1
# the ETD method here is also trained with expected updates
# expected update of ETD
# @theta: weight of each component of the feature vector
# @emphasis: current emphasis
# @alpha: step size of @theta
# @return: expected next emphasis
def expected_emphatic_TD(theta, emphasis, alpha):
    # we perform synchronous update for both theta and emphasis
    expected_update = 0
    expected_next_emphasis = 0.0
    # go through all the states
    for state in STATES:
        # compute rho(t-1)
        if state == LOWER_STATE:
            rho = 1.0 / BEHAVIOR_SOLID_PROBABILITY
        else:
            rho = 0
        # update emphasis
        next_emphasis = DISCOUNT * rho * emphasis + INTEREST
        expected_next_emphasis += next_emphasis
        # When computing expected update target, if next state is not lower state, importance ratio will be 0,
        # so we can safely ignore this case and assume next state is always lower state
        next_state = LOWER_STATE
        delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - np.dot(FEATURES[state, :], theta)
        expected_update += 1.0 / len(STATES) * BEHAVIOR_SOLID_PROBABILITY * next_emphasis * 1 / BEHAVIOR_SOLID_PROBABILITY * delta * FEATURES[state, :]
    theta += alpha * expected_update
    return expected_next_emphasis / len(STATES)

Compute RMSVE and RMSPBE
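
For reference, the two error measures computed below are (with $\mu$ the uniform state distribution of the behavior policy and $\Pi$ the projection matrix computed above):

$$
\overline{\mathrm{VE}}(\mathbf{w}) = \sum_{s} \mu(s) \big[ \hat v(s, \mathbf{w}) - v_\pi(s) \big]^2, \qquad
\overline{\mathrm{PBE}}(\mathbf{w}) = \big\| \Pi\, \bar\delta_{\mathbf{w}} \big\|_\mu^2
$$

where $\bar\delta_{\mathbf{w}}$ is the vector of Bellman errors; the RMS versions are their square roots, and since $v_\pi(s) = 0$ here, RMSVE is just the $\mu$-weighted root mean square of the approximate values.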

# compute RMSVE for a value function parameterized by @theta
# true value function is always 0 in this example
def compute_RMSVE(theta):
    return np.sqrt(np.dot(np.power(np.dot(FEATURES, theta), 2), STATE_DISTRIBUTION))

# compute RMSPBE for a value function parameterized by @theta
# true value function is always 0 in this example
def compute_RMSPBE(theta):
    bellman_error = np.zeros(len(STATES))
    for state in STATES:
        for next_state in STATES:
            if next_state == LOWER_STATE:
                bellman_error[state] += REWARD + DISCOUNT * np.dot(theta, FEATURES[next_state, :]) - np.dot(theta, FEATURES[state, :])
    bellman_error = np.dot(np.asarray(PROJECTION_MAT), bellman_error)
    return np.sqrt(np.dot(np.power(bellman_error, 2), STATE_DISTRIBUTION))

Plot and compare the performance of TDC and expected TDC

# Figure 11.6(left), temporal difference with gradient correction
def figure_11_6_left():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10
    weight = np.zeros(FEATURE_SIZE)

    alpha = 0.005
    beta = 0.05

    steps = 1000
    thetas = np.zeros((FEATURE_SIZE, steps))
    RMSVE = np.zeros(steps)
    RMSPBE = np.zeros(steps)
    state = np.random.choice(STATES)
    for step in tqdm(range(steps)):
        state = TDC(state, theta, weight, alpha, beta)
        thetas[:, step] = theta
        RMSVE[step] = compute_RMSVE(theta)
        RMSPBE[step] = compute_RMSPBE(theta)

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.plot(RMSVE, label='RMSVE')
    plt.plot(RMSPBE, label='RMSPBE')
    plt.xlabel('Steps')
    plt.title('TDC')
    plt.legend()

# Figure 11.6(right), expected temporal difference with gradient correction
def figure_11_6_right():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10
    weight = np.zeros(FEATURE_SIZE)

    alpha = 0.005
    beta = 0.05

    sweeps = 1000
    thetas = np.zeros((FEATURE_SIZE, sweeps))
    RMSVE = np.zeros(sweeps)
    RMSPBE = np.zeros(sweeps)
    for sweep in tqdm(range(sweeps)):
        expected_TDC(theta, weight, alpha, beta)
        thetas[:, sweep] = theta
        RMSVE[sweep] = compute_RMSVE(theta)
        RMSPBE[sweep] = compute_RMSPBE(theta)

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.plot(RMSVE, label='RMSVE')
    plt.plot(RMSPBE, label='RMSPBE')
    plt.xlabel('Sweeps')
    plt.title('Expected TDC')
    plt.legend()

def figure_11_6():
    plt.figure(figsize=(10, 20))
    plt.subplot(2, 1, 1)
    figure_11_6_left()
    plt.subplot(2, 1, 2)
    figure_11_6_right()

    plt.savefig('./figure_11_6.png')
    plt.show()

figure_11_6()
100%|██████████| 1000/1000 [00:00<00:00, 10149.88it/s]
100%|██████████| 1000/1000 [00:00<00:00, 4183.01it/s]

(Figure 11.6: theta components, RMSVE and RMSPBE for TDC and expected TDC)

We can see that although the result converges to a PBE of 0, the VE remains around 2, so there is still a gap from the true weights w = (0, 0, 0, 0, 0, 0, 0, 0): TDC converges to the TD fixed point, which zeroes the projected Bellman error but does not necessarily minimize the value error.

Plot the performance of the ETD algorithm

# Figure 11.7, expected ETD
def figure_11_7():
    # Initialize the theta
    theta = np.ones(FEATURE_SIZE)
    theta[6] = 10

    alpha = 0.03

    sweeps = 1000
    thetas = np.zeros((FEATURE_SIZE, sweeps))
    RMSVE = np.zeros(sweeps)
    emphasis = 0.0
    for sweep in tqdm(range(sweeps)):
        emphasis = expected_emphatic_TD(theta, emphasis, alpha)
        thetas[:, sweep] = theta
        RMSVE[sweep] = compute_RMSVE(theta)

    for i in range(FEATURE_SIZE):
        plt.plot(thetas[i, :], label='theta' + str(i + 1))
    plt.plot(RMSVE, label='RMSVE')
    plt.xlabel('Sweeps')
    plt.title('emphatic TD')
    plt.legend()

    plt.savefig('./figure_11_7.png')
    plt.show()

figure_11_7()
100%|██████████| 1000/1000 [00:00<00:00, 12319.59it/s]

(Figure 11.7: theta components and RMSVE for expected emphatic TD)

Because the ETD method here is trained with expected updates, the VE eventually converges to 0. In practice, however, the actual samples produce very high variance, so although the method converges in theory (in expectation), sampled training generally does not converge.
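
For contrast, here is a minimal sketch (my addition, not part of the original code) of the sampled, per-transition emphatic-TD update; the high variance comes from the emphasis accumulating products of importance ratios, which in this problem are either 0 or 7:

# sampled one-step emphatic TD (sketch); reuses the definitions above
def sampled_emphatic_TD(state, theta, emphasis, rho_prev, alpha):
    action = behavior_policy(state)
    next_state = step(state, action)
    # importance ratio of the current transition
    rho = 0.0 if action == DASHED else 1.0 / BEHAVIOR_SOLID_PROBABILITY
    # the emphasis update uses the ratio of the previous transition
    emphasis = DISCOUNT * rho_prev * emphasis + INTEREST
    delta = REWARD + DISCOUNT * np.dot(FEATURES[next_state, :], theta) - \
            np.dot(FEATURES[state, :], theta)
    theta += alpha * emphasis * rho * delta * FEATURES[state, :]
    return next_state, emphasis, rho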