Chapter04 car_rental

The code below is adapted from ShangtongZhang's chapter04/car_rental.py.

The car rental problem

Problem description

This problem is more involved. Jack runs the two locations of a car rental business. His income comes from renting cars out: each rental earns $10, and a car returned one day becomes available to rent the next day. Each night he may move cars between the two locations, at a cost of $2 per car moved. Jack has also noticed a nice regularity in the business: at each location, the numbers of cars requested and returned per day follow Poisson distributions:

P(X = n) = (λ^n / n!) · e^{-λ}

Call the two locations first and second.

At first, rental requests per day have λ = 3 and returns have λ = 3;

at second, rental requests per day have λ = 4 and returns have λ = 2;

each location can hold at most 20 cars (any surplus is taken back by the nationwide company and leaves the problem);

at most 5 cars can be moved from one location to the other per night.

We formulate this as an MDP with discount factor γ = 0.9. One time step is one day; the action is the net number of cars moved overnight, taken as positive when moving from first to second and negative in the other direction; the state is the number of rentable cars at first and at second. As a simplification, the Poisson probabilities for n > 10 are truncated to 0.
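Since truncating the Poisson tail is an approximation, it is worth checking how much probability mass is thrown away. A minimal standalone sketch (not part of the original code):

from math import exp, factorial

# total probability mass beyond n = 10 for each rate used in the problem
for lam in (2, 3, 4):
    tail = 1.0 - sum(exp(-lam) * lam ** n / factorial(n) for n in range(11))
    print('lambda = %d, truncated mass = %g' % (lam, tail))

Even for the largest rate, λ = 4, the truncated mass is only about 0.003, so the approximation barely affects the expected returns.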

Imports and constants

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from math import exp, factorial
import seaborn as sns
%matplotlib inline

# maximum # of cars in each location
MAX_CARS = 20

# maximum # of cars to move during night
MAX_MOVE_OF_CARS = 5

# expectation for rental requests in first location
RENTAL_REQUEST_FIRST_LOC = 3

# expectation for rental requests in second location
RENTAL_REQUEST_SECOND_LOC = 4

# expectation for # of cars returned in first location
RETURNS_FIRST_LOC = 3

# expectation for # of cars returned in second location
RETURNS_SECOND_LOC = 2

DISCOUNT = 0.9

# credit earned by a car
RENTAL_CREDIT = 10

# cost of moving a car
MOVE_CAR_COST = 2

# all possible actions
actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1)

# An upper bound for the poisson distribution
# If n is greater than or equal to this value, the probability of getting n is truncated to 0
POISSON_UPPER_BOUND = 11

Policy evaluation
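Each sweep below applies the expected Bellman update for the current deterministic policy π to every state s = (i, j) (this is the standard policy-evaluation update from the book):

v(s) ← Σ_{s',r} p(s', r | s, π(s)) [ r + γ v(s') ]

where γ = DISCOUNT = 0.9, and the sum on the right is exactly what expected_return computes. The sweep is in-place: new_value is read and written within the same pass.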

# Probability for poisson distribution
# @lam: lambda should be less than 10 for this function
#       (the cache key n * 10 + lam only decodes uniquely when lam is a single digit)
poisson_cache = dict()

def poisson(n, lam):
    global poisson_cache
    key = n * 10 + lam
    if key not in poisson_cache:
        poisson_cache[key] = exp(-lam) * pow(lam, n) / factorial(n)
    return poisson_cache[key]

# @state: [# of cars in first location, # of cars in second location]
# @action: positive if moving cars from first location to second location,
#          negative if moving cars from second location to first location
# @state_value: state value matrix
# @constant_returned_cars: if set True, the model is simplified so that
#          the # of cars returned during the day becomes a constant
#          rather than a random draw from a poisson distribution; this reduces
#          computation time and leaves the optimal policy/state-value matrix almost unchanged
def expected_return(state, action, state_value, constant_returned_cars):
    # initialize total return
    returns = 0.0

    # cost for moving cars
    returns -= MOVE_CAR_COST * abs(action)

    # go through all possible rental requests
    for rental_request_first_loc in range(0, POISSON_UPPER_BOUND):
        for rental_request_second_loc in range(0, POISSON_UPPER_BOUND):
            # moving cars
            num_of_cars_first_loc = int(min(state[0] - action, MAX_CARS))
            num_of_cars_second_loc = int(min(state[1] + action, MAX_CARS))

            # valid rental requests cannot exceed the actual # of cars
            real_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
            real_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)

            # get credits for renting
            reward = (real_rental_first_loc + real_rental_second_loc) * RENTAL_CREDIT
            num_of_cars_first_loc -= real_rental_first_loc
            num_of_cars_second_loc -= real_rental_second_loc

            # probability for the current combination of rental requests;
            # requests at the two locations are independent: P(AB) = P(A) * P(B)
            prob = poisson(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
                   poisson(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)

            if constant_returned_cars:
                # get returned cars; those cars can be rented out tomorrow
                returned_cars_first_loc = RETURNS_FIRST_LOC
                returned_cars_second_loc = RETURNS_SECOND_LOC
                num_of_cars_first_loc = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
                num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
                returns += prob * (reward + DISCOUNT * state_value[num_of_cars_first_loc, num_of_cars_second_loc])
            else:
                for returned_cars_first_loc in range(0, POISSON_UPPER_BOUND):
                    for returned_cars_second_loc in range(0, POISSON_UPPER_BOUND):
                        num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
                        num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
                        prob_ = poisson(returned_cars_first_loc, RETURNS_FIRST_LOC) * \
                                poisson(returned_cars_second_loc, RETURNS_SECOND_LOC) * prob
                        returns += prob_ * (reward + DISCOUNT * state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
    return returns
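As a quick sanity check (a hypothetical example, not in the original notebook, assuming the cells above have been run): with an all-zero value function and no overnight moves, the return reduces to the expected one-day rental income for that state.

value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
# 10 cars at each location, move none overnight; with v ≡ 0 this is just the
# expected immediate reward, roughly 10 * (3 + 4) = 70 dollars
print(expected_return([10, 10], 0, value, constant_returned_cars=True))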

Policy iteration

def figure_4_2(constant_returned_cars=True):
    value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
    policy = np.zeros(value.shape, dtype=int)

    iterations = 0
    _, axes = plt.subplots(2, 3, figsize=(40, 20))
    plt.subplots_adjust(wspace=0.1, hspace=0.2)
    axes = axes.flatten()
    while True:
        fig = sns.heatmap(np.flipud(policy), cmap="YlGnBu", ax=axes[iterations])
        fig.set_ylabel('# cars at first location', fontsize=30)
        fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
        fig.set_xlabel('# cars at second location', fontsize=30)
        fig.set_title('policy %d' % (iterations), fontsize=30)

        # policy evaluation (in-place)
        while True:
            new_value = np.copy(value)
            for i in range(MAX_CARS + 1):
                for j in range(MAX_CARS + 1):
                    new_value[i, j] = expected_return([i, j], policy[i, j], new_value,
                                                      constant_returned_cars)
            value_change = np.abs(new_value - value).sum()
            print('value change %f' % (value_change))
            value = new_value
            if value_change < 1e-4:
                break

        # policy improvement
        new_policy = np.copy(policy)
        for i in range(MAX_CARS + 1):
            for j in range(MAX_CARS + 1):
                action_returns = []
                for action in actions:
                    # an action is only valid if the source location has enough cars
                    if (action >= 0 and i >= action) or (action < 0 and j >= abs(action)):
                        action_returns.append(expected_return([i, j], action, value, constant_returned_cars))
                    else:
                        action_returns.append(-float('inf'))
                new_policy[i, j] = actions[np.argmax(action_returns)]

        policy_change = (new_policy != policy).sum()
        print('policy changed in %d states' % (policy_change))
        policy = new_policy
        if policy_change == 0:
            fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1])
            fig.set_ylabel('# cars at first location', fontsize=30)
            fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
            fig.set_xlabel('# cars at second location', fontsize=30)
            fig.set_title('optimal value', fontsize=30)
            break

        iterations += 1

    plt.savefig('./figure_4_2.png')
    plt.show()

# figure_4_2()
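The call is left commented out above; running it reproduces the figure below. Passing constant_returned_cars=False instead evaluates the full Poisson model for returned cars, which (per the comment on expected_return) is much slower but yields nearly the same policy and values:

figure_4_2()                                # simplified constant-returns model (fast)
# figure_4_2(constant_returned_cars=False)  # full Poisson returns (much slower)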

(Figure: figure_4_2.png, heatmaps of policy 0 through policy 4 and the optimal state values)

Note the figure: the algorithm converges after four iterations in total. The first five panels are heatmaps of the successive policies, and the last one shows the optimal value function. One more point worth noting: this task has no terminal state, so it is a continuing task and needs a discount factor < 1, whereas the first and third examples in this chapter have terminal states and are therefore undiscounted.