Practice
Epsilon-Greedy Algorithm
With probability epsilon, pick an arm at random to explore; with probability 1 - epsilon, exploit by pulling the lever with the best average reward observed so far.
import numpy as np
from scipy import stats
import random
import matplotlib.pyplot as plt

n = 10
probs = np.random.rand(n)   # hidden payout probability of each of the n arms
eps = 0.2                   # exploration probability (used in the loop below)

# 10 actions x 2 columns
# Columns: Count #, Avg Reward
record = np.zeros((n, 2))

def get_reward(prob, n=10):
    # simulate n plays; each pays 1 with the given probability
    reward = 0
    for i in range(n):
        if random.random() < prob:
            reward += 1
    return reward

def get_best_arm(record):
    # arm with the highest average reward observed so far
    arm_index = np.argmax(record[:, 1], axis=0)
    return arm_index

def update_record(record, action, r):
    # incremental update of the chosen arm's play count and average reward
    new_r = (record[action, 0] * record[action, 1] + r) / (record[action, 0] + 1)
    record[action, 0] += 1
    record[action, 1] = new_r
    return record

fig, ax = plt.subplots(1, 1)
ax.set_xlabel("Plays")
ax.set_ylabel("Avg Reward")
fig.set_size_inches(9, 5)

rewards = [0]
for i in range(500):
    if random.random() > eps:           # exploit with probability 1 - eps
        choice = get_best_arm(record)
    else:                               # explore with probability eps
        choice = np.random.randint(10)
    r = get_reward(probs[choice])
    record = update_record(record, choice, r)
    mean_reward = ((i + 1) * rewards[-1] + r) / (i + 2)
    rewards.append(mean_reward)
ax.scatter(np.arange(len(rewards)), rewards)
plt.show()   # this line is needed to actually display the plot
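Both update_record and the mean_reward line above rely on the same incremental-mean identity: new_mean = (k * old_mean + r) / (k + 1), which avoids storing the full reward history. A minimal standalone check (the rewards_seen values are made up for illustration):

import numpy as np

rewards_seen = [7, 4, 9, 6]              # made-up sequence of observed rewards
mean = 0.0
for k, r in enumerate(rewards_seen):
    mean = (k * mean + r) / (k + 1)      # incremental update, no history stored
print(mean, np.mean(rewards_seen))       # both print 6.5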
Softmax Selection Policy
First, softmax converts a vector of scores into a probability distribution. More concretely, for a sample whose score vector is [2, 3, 4], softmax maps it to roughly [0.09, 0.24, 0.67]: every value lies in [0, 1], they sum to 1, and larger scores receive larger probabilities.
The softmax function, with a temperature parameter tau, has the form:
Pr(a_k) = exp(Q(a_k) / tau) / Σ_j exp(Q(a_j) / tau)
where Q(a_k) is the current estimated (average) reward of arm a_k.
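As a quick sanity check of the numbers above (a standalone sketch; the softmax helper here mirrors the one defined in the listing below):

import numpy as np

def softmax(av, tau=1.0):
    # exponentiate the temperature-scaled scores and normalize to sum to 1
    return np.exp(av / tau) / np.sum(np.exp(av / tau))

print(softmax(np.array([2.0, 3.0, 4.0])))   # ~[0.09 0.24 0.67]

The full bandit implementation below uses the same function, with tau=0.7 in the selection loop.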
import numpy as np
from scipy import stats
import random
import matplotlib.pyplot as plt

n = 10
probs = np.random.rand(n)
record = np.zeros((n, 2))

def softmax(av, tau=1.12):
    softm = np.exp(av / tau) / np.sum(np.exp(av / tau))
    return softm

def get_reward(prob, n=10):
    reward = 0
    for i in range(n):
        if random.random() < prob:
            reward += 1
    return reward

def get_best_arm(record):
    arm_index = np.argmax(record[:, 1], axis=0)
    return arm_index

def update_record(record, action, r):
    new_r = (record[action, 0] * record[action, 1] + r) / (record[action, 0] + 1)
    record[action, 0] += 1
    record[action, 1] = new_r
    return record

fig, ax = plt.subplots(1, 1)
ax.set_xlabel("Plays")
ax.set_ylabel("Avg Reward")
fig.set_size_inches(9, 5)

rewards = [0]
for i in range(500):
    p = softmax(record[:, 1], tau=0.7)
    choice = np.random.choice(np.arange(n), p=p)
    r = get_reward(probs[choice])
    record = update_record(record, choice, r)
    mean_reward = ((i + 1) * rewards[-1] + r) / (i + 2)
    rewards.append(mean_reward)
ax.scatter(np.arange(len(rewards)), rewards)
plt.show()
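The temperature tau (0.7 in the loop above, 1.12 as the function default) controls how greedy the policy is: a small tau concentrates probability on the arm with the best average reward, a large tau flattens the distribution toward uniform random selection. A small standalone sketch, where avg_rewards is a made-up stand-in for record[:, 1]:

import numpy as np

def softmax(av, tau=1.12):
    return np.exp(av / tau) / np.sum(np.exp(av / tau))

avg_rewards = np.array([4.0, 5.0, 6.0, 3.0])   # hypothetical average rewards
print(softmax(avg_rewards, tau=0.7))           # sharply favors the 6.0 arm
print(softmax(avg_rewards, tau=5.0))           # much flatter -> more exploration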
An Algorithm for Ad Placement
Suppose a user is browsing one website and we want them to see an ad for another site. With softmax or epsilon-greedy alone, ads are placed with some fixed probabilities, but in practice a user browsing a particular site is more willing to look at ads related to that site. The particular site currently being viewed is a "state" (contextual information) that helps us place better-targeted ads.
In reinforcement learning the goal is to maximize reward over the whole process, and once states are involved the number of [state, action, reward] combinations becomes far too large for the simple [action, reward] arrays that softmax and epsilon-greedy handle with a lookup table. A neural network copes with this well, keeping only the valuable abstract information (the combinational patterns and regularities in the data).
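The state still has to reach the network as a vector; the listing below does this with a one-hot encoding, roughly like this sketch (n_states and state are illustrative values):

import numpy as np

n_states = 10
state = 3                    # e.g. which site the user is currently browsing
x = np.zeros(n_states)
x[state] = 1.0               # one-hot input vector for the network
print(x)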
Solving this problem with PyTorch:
import numpy as np
import torch
import random
import matplotlib.pyplot as plt

class ContextBandit:
    def __init__(self, arms=10):
        self.arms = arms
        self.init_distribution(arms)
        self.update_state()

    def init_distribution(self, arms):
        # number of states = number of arms, to keep things simple;
        # each row is a state, each column an arm's payout probability
        self.bandit_matrix = np.random.rand(arms, arms)

    def reward(self, prob):
        reward = 0
        for i in range(self.arms):
            if random.random() < prob:
                reward += 1
        return reward

    def get_state(self):
        return self.state

    def update_state(self):
        self.state = np.random.randint(0, self.arms)

    def get_reward(self, arm):
        return self.reward(self.bandit_matrix[self.get_state()][arm])

    def choose_arm(self, arm):
        reward = self.get_reward(arm)
        self.update_state()
        return reward

arms = 10
N, D_in, H, D_out = 1, arms, 100, arms   # batch size, input dim, hidden dim, output dim

env = ContextBandit(arms=10)
state = env.get_state()
reward = env.choose_arm(1)
print(state)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
    torch.nn.ReLU(),      # rewards are non-negative, so a rectified output is acceptable
)
loss_fn = torch.nn.MSELoss()
env = ContextBandit(arms)

def softmax(av, tau=1.12):
    softm = np.exp(av / tau) / np.sum(np.exp(av / tau))
    return softm

def one_hot(N, pos, val=1):
    one_hot_vec = np.zeros(N)
    one_hot_vec[pos] = val
    return one_hot_vec

def running_mean(x, N=50):
    # smooth the reward curve with a length-N moving average
    c = x.shape[0] - N
    y = np.zeros(c)
    conv = np.ones(N)
    for i in range(c):
        y[i] = (x[i:i+N] @ conv) / N
    return y

def train(env, epochs=5000, learning_rate=1e-2):
    cur_state = torch.Tensor(one_hot(arms, env.get_state()))     # one-hot encode the current state
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    rewards = []
    for i in range(epochs):
        y_pred = model(cur_state)                                # predicted reward for each arm
        av_softmax = softmax(y_pred.data.numpy(), tau=2.0)       # turn predictions into selection probabilities
        av_softmax /= av_softmax.sum()                           # renormalize against rounding error
        choice = np.random.choice(arms, p=av_softmax)            # sample an arm
        cur_reward = env.choose_arm(choice)                      # play it and observe the reward
        one_hot_reward = y_pred.data.numpy().copy()              # training target: copy the predictions...
        one_hot_reward[choice] = cur_reward                      # ...and overwrite the chosen arm with the observed reward
        reward = torch.Tensor(one_hot_reward)
        rewards.append(cur_reward)
        loss = loss_fn(y_pred, reward)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        cur_state = torch.Tensor(one_hot(arms, env.get_state())) # encode the new state for the next step
    return np.array(rewards)

rewards = train(env)
plt.plot(running_mean(rewards, N=500))
plt.show()
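As a follow-up sanity check (not part of the original listing; it reuses model, env, one_hot, and arms from above, and evaluate is a name introduced here), one can run the trained model greedily and compare its average reward with the roughly 5 per play expected from choosing arms at random:

def evaluate(env, model, trials=500):
    total = 0
    for _ in range(trials):
        state = torch.Tensor(one_hot(arms, env.get_state()))
        with torch.no_grad():
            choice = int(torch.argmax(model(state)))   # greedy: highest predicted reward
        total += env.choose_arm(choice)
    return total / trials

print(evaluate(env, model))   # should land well above the ~5.0 random baseline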
Definitions
grad: gradient
The direction of gradient descent (the negative gradient) is the direction in which the function decreases fastest, which makes it easier to find a minimum.
Gradient descent follows that descent direction to search for a local minimum; following the gradient upward finds a maximum instead, and that method is called gradient ascent.
Algorithm: compute the derivative of the function at the current position, then descending is just looping: new position = current position - a chosen step size * the derivative.
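A minimal numeric sketch of that loop, using the made-up example f(x) = (x - 3)^2, whose derivative is 2 * (x - 3):

x = 0.0           # starting position
step = 0.1        # step size (learning rate)
for _ in range(100):
    grad = 2 * (x - 3)        # derivative of f at the current position
    x = x - step * grad       # current position minus step size * derivative
print(x)          # approaches the minimum at x = 3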
Multi-armed bandit
A slot machine with multiple levers (arms), where each lever pays out with a different probability.
Video walkthrough