Chapter 12: Mountain Car

The code is adapted from ShangtongZhang's chapter12/mountain_car.py.

We use the ever-popular Mountain Car task to test the performance of the Sarsa(λ) algorithm.

I have been struggling with true online Sarsa(λ), and its results here turned out to be dreadful. I am not sure whether the problem is in my implementation or whether the method is simply a poor fit for this task; you can judge from the plots below. Because training is slow (a full run takes roughly an afternoon, and I was still tweaking the code along the way), I also borrow the original author's results for comparison.

Problem description

We use the classic reinforcement learning task, Mountain Car, to test the performance of Sarsa(λ) and true online Sarsa(λ), and also to examine how different eligibility traces affect Sarsa(λ).

For a detailed description of the task itself, see the earlier mountain car example.
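For convenience, the dynamics (this just restates what the step function below implements, the same as the book's Example 10.1) are:

$$v_{t+1} = \mathrm{bound}\big(v_t + 0.001\,A_t - 0.0025\cos(3x_t)\big),\qquad x_{t+1} = \mathrm{bound}\big(x_t + v_{t+1}\big)$$

with $x_t \in [-1.2, 0.5]$, $v_t \in [-0.07, 0.07]$, actions $A_t \in \{-1, 0, 1\}$, a reward of $-1$ on every step, the velocity reset to 0 when the car hits the left bound, and the episode ending when $x_t$ reaches 0.5.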

Imports

import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from math import floor
from tqdm import tqdm

The code in the next block is identical to the mountain car example; it may help to review how Tile Coding works before reading on.

#######################################################################
# Following are some utilities for tile coding from Rich.
# To make each file self-contained, I copied them from
# http://incompleteideas.net/tiles/tiles3.py-remove
# with some naming convention changes
#
# Tile coding starts
class IHT:
    "Structure to handle collisions"
    def __init__(self, size_val):
        self.size = size_val
        self.overfull_count = 0
        self.dictionary = {}

    def count(self):
        return len(self.dictionary)

    def full(self):
        return len(self.dictionary) >= self.size

    def get_index(self, obj, read_only=False):
        d = self.dictionary
        if obj in d:
            return d[obj]
        elif read_only:
            return None
        size = self.size
        count = self.count()
        if count >= size:
            if self.overfull_count == 0: print('IHT full, starting to allow collisions')
            self.overfull_count += 1
            return hash(obj) % self.size
        else:
            d[obj] = count
            return count

def hash_coords(coordinates, m, read_only=False):
    if isinstance(m, IHT): return m.get_index(tuple(coordinates), read_only)
    if isinstance(m, int): return hash(tuple(coordinates)) % m
    if m is None: return coordinates

def tiles(iht_or_size, num_tilings, floats, ints=None, read_only=False):
    """returns num-tilings tile indices corresponding to the floats and ints"""
    if ints is None:
        ints = []
    qfloats = [floor(f * num_tilings) for f in floats]
    tiles = []
    for tiling in range(num_tilings):
        tilingX2 = tiling * 2
        coords = [tiling]
        b = tiling
        for q in qfloats:
            coords.append((q + b) // num_tilings)
            b += tilingX2
        coords.extend(ints)
        tiles.append(hash_coords(coords, iht_or_size, read_only))
    return tiles
# Tile coding ends
#######################################################################

# all possible actions
ACTION_REVERSE = -1
ACTION_ZERO = 0
ACTION_FORWARD = 1
# order is important
ACTIONS = [ACTION_REVERSE, ACTION_ZERO, ACTION_FORWARD]

# bound for position and velocity
POSITION_MIN = -1.2
POSITION_MAX = 0.5
VELOCITY_MIN = -0.07
VELOCITY_MAX = 0.07

# discount is always 1.0 in these experiments
DISCOUNT = 1.0

# use optimistic initial value, so it's ok to set epsilon to 0
EPSILON = 0

# maximum steps per episode
STEP_LIMIT = 5000

# take an @action at @position and @velocity
# @return: new position, new velocity, reward (always -1)
def step(position, velocity, action):
    new_velocity = velocity + 0.001 * action - 0.0025 * np.cos(3 * position)
    new_velocity = min(max(VELOCITY_MIN, new_velocity), VELOCITY_MAX)
    new_position = position + new_velocity
    new_position = min(max(POSITION_MIN, new_position), POSITION_MAX)
    reward = -1.0
    if new_position == POSITION_MIN:
        new_velocity = 0.0
    return new_position, new_velocity, reward


# get action at @position and @velocity based on epsilon greedy policy and @valueFunction
def get_action(position, velocity, valueFunction):
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(ACTIONS)
    values = []
    for action in ACTIONS:
        values.append(valueFunction.value(position, velocity, action))
    return np.argmax(values) - 1
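As a quick sanity check (my own snippet, not part of the original code), here is how the tile coder maps a scaled (position, velocity, action) triple to 8 active tile indices; for a fresh IHT the indices are simply handed out in order:

# quick sanity check of the tile coder (assumes the code above has been run)
iht = IHT(2048)
num_of_tilings = 8
position_scale = num_of_tilings / (POSITION_MAX - POSITION_MIN)
velocity_scale = num_of_tilings / (VELOCITY_MAX - VELOCITY_MIN)
# a state near the bottom of the valley, drifting right, taking ACTION_FORWARD
active_tiles = tiles(iht, num_of_tilings,
                     [position_scale * (-0.5), velocity_scale * 0.01],
                     [ACTION_FORWARD])
print(active_tiles)   # 8 indices, one per tiling; a fresh IHT yields [0, 1, 2, 3, 4, 5, 6, 7]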

Next we define the common eligibility-trace update rules — accumulating, replacing (with and without clearing), and dutch — that is, the vector z that appears in the update formulas.
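For reference (this just restates the textbook Sarsa(λ) update from Sutton & Barto, Chapter 12), with linear function approximation the updates are:

$$\delta_t = R_{t+1} + \gamma\,\hat q(S_{t+1},A_{t+1},\mathbf w_t) - \hat q(S_t,A_t,\mathbf w_t)$$

$$\mathbf z_t = \gamma\lambda\,\mathbf z_{t-1} + \nabla\hat q(S_t,A_t,\mathbf w_t)\qquad\text{(accumulating trace)}$$

$$\mathbf w_{t+1} = \mathbf w_t + \alpha\,\delta_t\,\mathbf z_t$$

With tile coding the gradient is just the binary feature vector, so "adding the gradient" reduces to trace[active_tiles] += 1; the replacing and dutch traces below only change how z is updated.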

# accumulating trace update rule
# @trace: old trace (will be modified)
# @activeTiles: current active tile indices
# @lam: lambda
# @return: new trace for convenience
def accumulating_trace(trace, active_tiles, lam):
    trace *= lam * DISCOUNT
    trace[active_tiles] += 1
    return trace

# replacing trace update rule
# this trace only makes sense for binary feature vectors such as those produced by tile coding
# @trace: old trace (will be modified)
# @activeTiles: current active tile indices
# @lam: lambda
# @return: new trace for convenience
def replacing_trace(trace, activeTiles, lam):
    active = np.in1d(np.arange(len(trace)), activeTiles)
    trace[active] = 1
    trace[~active] *= lam * DISCOUNT
    return trace

# replacing trace update rule, 'clearing' means set all tiles corresponding to non-selected actions to 0
# @trace: old trace (will be modified)
# @activeTiles: current active tile indices
# @lam: lambda
# @clearingTiles: tiles to be cleared
# @return: new trace for convenience
def replacing_trace_with_clearing(trace, active_tiles, lam, clearing_tiles):
    active = np.in1d(np.arange(len(trace)), active_tiles)
    trace[~active] *= lam * DISCOUNT
    trace[clearing_tiles] = 0
    trace[active] = 1
    return trace

# dutch trace update rule
# @trace: old trace (will be modified)
# @activeTiles: current active tile indices
# @lam: lambda
# @alpha: step size for all tiles
# @return: new trace for convenience
def dutch_trace(trace, active_tiles, lam, alpha):
    coef = 1 - alpha * DISCOUNT * lam * np.sum(trace[active_tiles])
    trace *= DISCOUNT * lam
    trace[active_tiles] += coef
    return trace
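A tiny numerical illustration (my own snippet) of the difference between the accumulating and replacing traces: if the same tile stays active for three consecutive steps with λ = 0.9 and γ = 1, the accumulating trace grows past 1 while the replacing trace is capped at 1.

# compare accumulating vs. replacing trace when tile 1 is active 3 steps in a row
lam = 0.9
z_acc = np.zeros(4)
z_rep = np.zeros(4)
for _ in range(3):
    accumulating_trace(z_acc, [1], lam)
    replacing_trace(z_rep, [1], lam)
print(z_acc)   # [0.   2.71 0.   0.  ]  (1 + 0.9 + 0.81)
print(z_rep)   # [0. 1. 0. 0.]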

Define the Sarsa(λ) agent class

# wrapper class for Sarsa(lambda)
class Sarsa:
    # In this example I use the tiling software instead of implementing standard tiling by myself
    # One important thing is that tiling is only a map from (state, action) to a series of indices
    # It doesn't matter whether the indices have meaning, only if this map satisfy some property
    # View the following webpage for more information
    # http://incompleteideas.net/sutton/tiles/tiles3.html
    # @maxSize: the maximum # of indices
    def __init__(self, step_size, lam, trace_update=accumulating_trace, num_of_tilings=8, max_size=2048):
        self.max_size = max_size
        self.num_of_tilings = num_of_tilings
        self.trace_update = trace_update
        self.lam = lam
        self.last_value = 0.0

        # divide step size equally to each tiling
        self.step_size = step_size / num_of_tilings

        self.hash_table = IHT(max_size)

        # weight for each tile
        self.weights = np.zeros(max_size)

        # trace for each tile
        self.trace = np.zeros(max_size)

        # position and velocity needs scaling to satisfy the tile software
        self.position_scale = self.num_of_tilings / (POSITION_MAX - POSITION_MIN)
        self.velocity_scale = self.num_of_tilings / (VELOCITY_MAX - VELOCITY_MIN)

    # get indices of active tiles for given state and action
    def get_active_tiles(self, position, velocity, action):
        # I think positionScale * (position - position_min) would be a good normalization.
        # However positionScale * position_min is a constant, so it's ok to ignore it.
        active_tiles = tiles(self.hash_table, self.num_of_tilings,
                             [self.position_scale * position, self.velocity_scale * velocity],
                             [action])
        return active_tiles

    # estimate the value of given state and action
    def value(self, position, velocity, action):
        if position == POSITION_MAX:
            return 0.0
        active_tiles = self.get_active_tiles(position, velocity, action)
        return np.sum(self.weights[active_tiles])

    # learn with given state, action and target
    def learn(self, position, velocity, action, target, next_value):
        active_tiles = self.get_active_tiles(position, velocity, action)
        estimation = np.sum(self.weights[active_tiles])
        delta = target - estimation
        if self.trace_update == accumulating_trace or self.trace_update == replacing_trace:
            self.trace_update(self.trace, active_tiles, self.lam)
        elif self.trace_update == dutch_trace:
            self.trace_update(self.trace, active_tiles, self.lam, self.step_size)
            # use true on-line update
            self.weights += self.step_size * (estimation - self.last_value) * self.trace
            self.weights[active_tiles] -= self.step_size * (estimation - self.last_value)
            self.last_value = next_value
        elif self.trace_update == replacing_trace_with_clearing:
            clearing_tiles = []
            for act in ACTIONS:
                if act != action:
                    clearing_tiles.extend(self.get_active_tiles(position, velocity, act))
            self.trace_update(self.trace, active_tiles, self.lam, clearing_tiles)
        else:
            raise Exception('Unexpected Trace Type')
        self.weights += self.step_size * delta * self.trace

    # get # of steps to reach the goal under current state value function
    def cost_to_go(self, position, velocity):
        costs = []
        for action in ACTIONS:
            costs.append(self.value(position, velocity, action))
        return -np.max(costs)
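For reference, the dutch_trace branch in learn is meant to implement the true online Sarsa(λ) update from Sutton & Barto (Section 12.8); the two extra weight updates correspond to the (Q − Q_old) terms, with Q_old tracked in self.last_value:

$$\mathbf z_t = \gamma\lambda\,\mathbf z_{t-1} + \left(1 - \alpha\gamma\lambda\,\mathbf z_{t-1}^{\top}\mathbf x_t\right)\mathbf x_t$$

$$\mathbf w_{t+1} = \mathbf w_t + \alpha\,\delta_t\,\mathbf z_t + \alpha\,(Q_t - Q_{\text{old}})(\mathbf z_t - \mathbf x_t),\qquad Q_{\text{old}} \leftarrow Q_{t+1}$$

where $\mathbf x_t$ is the binary feature vector of the current (state, action) pair and $Q_t = \mathbf w_t^{\top}\mathbf x_t$.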

Finally, the agent plays the episodes and we plot the performance of Sarsa(λ) under different traces, λ values, and step sizes (Figures 12.10 and 12.11).


# play Mountain Car for one episode based on given method @evaluator
# @return: total steps in this episode
def play(evaluator):
    position = np.random.uniform(-0.6, -0.4)
    velocity = 0.0
    action = get_action(position, velocity, evaluator)
    steps = 0
    while True:
        next_position, next_velocity, reward = step(position, velocity, action)
        next_action = get_action(next_position, next_velocity, evaluator)
        steps += 1
        target = reward + DISCOUNT * evaluator.value(next_position, next_velocity, next_action)
        next_value = evaluator.value(next_position, next_velocity, next_action)
        evaluator.learn(position, velocity, action, target, next_value)
        position = next_position
        velocity = next_velocity
        action = next_action
        if next_position == POSITION_MAX:
            break
        if steps >= STEP_LIMIT:
            print('Step Limit Exceeded!')
            break
    return steps

# figure 12.10, effect of the lambda and alpha on early performance of Sarsa(lambda)
def figure_12_10():
    runs = 30
    episodes = 50
    alphas = np.arange(1, 8) / 4.0
    lams = [0.99, 0.95, 0.5, 0]

    steps = np.zeros((len(lams), len(alphas), runs, episodes))
    for lamInd, lam in enumerate(lams):
        for alphaInd, alpha in enumerate(alphas):
            for run in tqdm(range(runs)):
                evaluator = Sarsa(alpha, lam, replacing_trace)
                for ep in range(episodes):
                    step = play(evaluator)
                    steps[lamInd, alphaInd, run, ep] = step

    # average over episodes
    steps = np.mean(steps, axis=3)

    # average over runs
    steps = np.mean(steps, axis=2)

    for lamInd, lam in enumerate(lams):
        plt.plot(alphas, steps[lamInd, :], label='lambda = %s' % (str(lam)))
    plt.xlabel('alpha * # of tilings (8)')
    plt.ylabel('averaged steps per episode')
    plt.ylim([180, 300])
    plt.legend()

    plt.savefig('./figure_12_10.png')
    plt.show()

# figure 12.11, summary comparison of Sarsa(lambda) algorithms
# I use 8 tilings rather than 10 tilings
def figure_12_11():
    # traceTypes = [dutch_trace, replacing_trace, replacing_trace_with_clearing, accumulating_trace]
    traceTypes = [dutch_trace, replacing_trace]
    alphas = np.arange(0.2, 1.8, 0.2)
    episodes = 20
    runs = 30
    lam = 0.9
    rewards = np.zeros((len(traceTypes), len(alphas), runs, episodes))

    for traceInd, trace in enumerate(traceTypes):
        for alphaInd, alpha in enumerate(alphas):
            for run in tqdm(range(runs)):
                evaluator = Sarsa(alpha, lam, trace)
                for ep in range(episodes):
                    if trace == accumulating_trace and alpha > 0.6:
                        steps = STEP_LIMIT
                    else:
                        steps = play(evaluator)
                    rewards[traceInd, alphaInd, run, ep] = -steps

    # average over episodes
    rewards = np.mean(rewards, axis=3)

    # average over runs
    rewards = np.mean(rewards, axis=2)

    for traceInd, trace in enumerate(traceTypes):
        plt.plot(alphas, rewards[traceInd, :], label=trace.__name__)
    plt.xlabel('alpha * # of tilings (8)')
    plt.ylabel('averaged rewards per episode')
    # plt.ylim([-550, -150])
    plt.legend()

    plt.savefig('./figure_12_11.png')
    plt.show()

figure_12_10()
figure_12_11()
100%|██████████| 30/30 [02:07<00:00,  4.19s/it]
100%|██████████| 30/30 [01:49<00:00,  3.72s/it]
100%|██████████| 30/30 [01:42<00:00,  3.45s/it]
100%|██████████| 30/30 [01:37<00:00,  3.13s/it]
100%|██████████| 30/30 [01:44<00:00,  3.31s/it]
100%|██████████| 30/30 [01:42<00:00,  3.17s/it]
100%|██████████| 30/30 [01:40<00:00,  3.37s/it]
100%|██████████| 30/30 [01:36<00:00,  3.14s/it]
100%|██████████| 30/30 [01:36<00:00,  3.36s/it]
100%|██████████| 30/30 [01:07<00:00,  2.24s/it]
100%|██████████| 30/30 [00:57<00:00,  1.90s/it]
100%|██████████| 30/30 [00:53<00:00,  1.77s/it]
100%|██████████| 30/30 [00:49<00:00,  1.63s/it]
100%|██████████| 30/30 [00:46<00:00,  1.56s/it]
100%|██████████| 30/30 [00:45<00:00,  1.45s/it]
100%|██████████| 30/30 [00:45<00:00,  1.54s/it]

figure_12_10:

[figure_12_10.png: averaged steps per episode vs. alpha, one curve per λ]

figure_12_11:

[figure_12_11.png: averaged rewards per episode vs. alpha for dutch_trace and replacing_trace]

Why does true online Sarsa(λ) perform so badly here? QAQ:

[png: my true online Sarsa(λ) result]
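One thing I would double-check (a guess on my part, not something I have verified): the book's true online Sarsa(λ) pseudocode resets both Q_old and the trace z to zero at the start of every episode, while the Sarsa class above initializes last_value and trace only once in __init__ and then reuses the same evaluator across episodes. A minimal, hypothetical wrapper to test this (the name is my own, not from the original code) could look like:

# hypothetical wrapper (not in the original code): reset the true-online
# bookkeeping at the start of each episode, as the book's pseudocode does
def play_with_reset(evaluator):
    evaluator.last_value = 0.0   # Q_old <- 0
    evaluator.trace[:] = 0.0     # z <- 0
    return play(evaluator)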