Commit 55325fca authored by Timo P. Gros

policy learning

parent 67b2d557
from r4l import agent
## Change these settings before running the script.
## More convenient than passing them on the command line.
checkpoint_name = 'barto-big_unnoisy_randomstart'
map_file = '../maps/barto-big.track'
noisy = False
random_start = True
num_episodes = 10000
length_episodes = 1000
eps_start = 1
eps_end = 0.0001 # minimum value for epsilon; set to zero to disable the lower bound
eps_decay = 0.999
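# With eps_start = 1 and eps_decay = 0.999 applied once per episode, epsilon drops
# below eps_end = 0.0001 after roughly ln(0.0001)/ln(0.999) ≈ 9200 episodes,
# i.e. close to the end of the 10000-episode run.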
seed = 0
# best score seen so far; only needed when resuming training from a checkpoint via load()
best_score = 0
a = agent.Agent(seed, noisy, random_start, num_episodes, length_episodes, checkpoint_name, map_file, eps_start, eps_end, eps_decay, 0)
a = a.train()
from r4l import agent
## Change these settings before running the script.
## More convenient than passing them on the command line.
checkpoint_name = 'barto-big_unnoisy_normalstart'
map_file = '../maps/barto-big.track'
noisy = False
random_start = False
num_episodes = 120000
length_episodes = 1000
eps_start = 0.3
eps_end = 0.0001 # minimum value for epsilon; set to zero to disable the lower bound
eps_decay = 0.999
seed = 0
# best score of the run being resumed; passed to load() so that only better networks get saved
best_score = -0.5
a = agent.Agent(seed, noisy, random_start, num_episodes, length_episodes, checkpoint_name, map_file, eps_start, eps_end, eps_decay, 1)
a.load(checkpoint_name + '.pth', best_score)
a = a.train()
from r4l import model
from torch import nn
import torch
from torch import optim
import torch.nn.functional as F
import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output
# Constants
BUFFER_SIZE = int(1e4) # replay buffer size
BATCH_SIZE = 64 # minibatch size
GAMMA = 0.99 # discount factor
TAU = 1e-3 # soft-update rate for the target network parameters
LR = 5e-4 # learning rate
UPDATE_EVERY = 4 # how often to update the network
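# Input/output sizes of the network below (derived from the r4l.model environment):
# the state is position (2) + velocity (2) + 11 distance features = 15 values,
# and there are 9 actions (all combinations of accelerating by -1, 0 or +1 per axis).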
# define the Q-network
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        # Two hidden layers with 64 units each; the input is the 15-dimensional state
        self.fc1 = nn.Linear(15, 64)
        self.fc2 = nn.Linear(64, 64)
        # Output layer, 9 units - one Q-value per action
        self.fc3 = nn.Linear(64, 9)

    def forward(self, x):
        ''' Forward pass through the network, returns the Q-values for all actions '''
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x
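# Hypothetical shape check (not part of the training pipeline):
#   net = Network()
#   q = net(torch.zeros(1, 15))   # -> tensor of shape (1, 9), one Q-value per action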
# Replay buffer used for experience replay
class ReplayBuffer:
    # Initialize the replay buffer.
    # The seed fixes the random sampling from the buffer,
    # batch_size is the number of samples drawn at each learning step,
    # and buffer is the actual storage.
    def __init__(self, buffer_size, batch_size, seed):
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_size)
        self.seed = random.seed(seed)

    # Add a sample to the buffer; the done flag is stored as an integer.
    def add(self, state, action, reward, next_state, done):
        if done:
            done_value = 1
        else:
            done_value = 0
        self.buffer.append([state, action, reward, next_state, done_value])

    # Sample a minibatch from the buffer.
    # The samples are split into one list per component (states, actions, ...),
    # and these lists are then converted into tensors and returned.
    def sample(self):
        samples = random.sample(self.buffer, self.batch_size)
        states = []
        actions = []
        rewards = []
        next_states = []
        dones = []
        for sample in samples:
            state, action, reward, next_state, done = sample
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
        states = torch.tensor(states).float()
        actions = torch.LongTensor(actions)
        rewards = torch.tensor(rewards).float()
        next_states = torch.tensor(next_states).float()
        dones = torch.tensor(dones).float()
        return [states, actions, rewards, next_states, dones]

    # Expose the length of the underlying deque as the length of the replay buffer.
    def __len__(self):
        return len(self.buffer)
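# Hypothetical usage of the buffer (not executed here):
#   buf = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed=0)
#   buf.add(state, action, reward, next_state, done)
#   states, actions, rewards, next_states, dones = buf.sample()  # requires len(buf) >= BATCH_SIZE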
class Agent():
    # Defines both the local and the target network,
    # defines the optimizer (Adam),
    # initializes the replay buffer and the update counter,
    # and stores the hyper-parameters of the learning process.
    def __init__(self, seed, noisy, random_start, n_episodes=8000, l_episodes=1000, checkpoint_name='Unnamed', mapname='../maps/training.txt', eps_start=1.0, eps_end=0.0001, eps_decay=0.999, learning_count=0):
        self.qnetwork_target = Network()
        self.qnetwork_local = Network()
        self.seed = random.seed(seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
        self.update_counter = 0
        # hyperparameters of learning:
        self.n_episodes = n_episodes
        self.l_episodes = l_episodes
        self.checkpoint_name = checkpoint_name
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.mapname = mapname
        self.env = model.environment(self.mapname, random_start=random_start, noisy=noisy)
        # variables for learning
        self.learning_count = learning_count
        self.best_score = -float("inf")
    # Used after __init__ to resume training from an existing checkpoint.
    # The weights are loaded into the local and the target network separately,
    # so that the soft update keeps operating on two distinct parameter sets.
    def load(self, file, best_score):
        state_dict = torch.load(file)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)
        self.best_score = best_score
    # Carries out one step of the agent.
    def step(self, state, action, reward, next_state, done):
        # add the sample to the buffer
        self.buffer.add(state, action, reward, next_state, done)
        # increment the update counter
        self.update_counter = (self.update_counter + 1) % UPDATE_EVERY
        # every UPDATE_EVERY steps, draw a minibatch and start the learning process
        if self.update_counter == 0:
            if len(self.buffer) > BATCH_SIZE:
                samples = self.buffer.sample()
                self.learn(samples, GAMMA)

    # Act epsilon-greedily according to the local network.
    def act(self, state, eps=0):
        state = torch.tensor(state).float()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        if random.random() > eps:
            return np.argmax(action_values.numpy())
        else:
            return random.choice(range(len(action_values.numpy())))
    # Learning step: DQN update followed by a soft update of the target network.
    def learn(self, samples, gamma):
        states, actions, rewards, next_states, dones = samples
        # TD target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal transitions
        q_values_next_states = self.qnetwork_target.forward(next_states).max(dim=1)[0]
        targets = rewards + (gamma * q_values_next_states * (1 - dones))
        # Q-values of the actions that were actually taken
        q_values = self.qnetwork_local.forward(states)
        actions = actions.view(actions.size()[0], 1)
        predictions = torch.gather(q_values, 1, actions).view(actions.size()[0])
        # calculate the loss between targets and predictions
        loss = F.mse_loss(predictions, targets)
        # backward step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # soft update of the target network: theta_target <- TAU*theta_local + (1-TAU)*theta_target
        for target_weight, local_weight in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_weight.data.copy_(TAU*local_weight.data + (1.0-TAU)*target_weight.data)
    # Heart of the agent: the training loop.
    def train(self):
        self.learning_count += 1
        f = open(self.checkpoint_name + ".scores", "a")
        f.write('Start Training Run: eps_start = ' + str(self.eps_start) + ' eps_end = ' + str(self.eps_end) + ' eps_decay: ' + str(self.eps_decay) + ' ReplayBuffer size: ' + str(BUFFER_SIZE) + '\n')
        f.close()
        # initialize arrays and values
        means = []
        scores_window = deque(maxlen=100)
        eps = self.eps_start
        # initialize the plot
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # iterate over the configured number of episodes
        for i_episode in range(1, self.n_episodes+1):
            # reset state and score
            state = self.env.reset()
            score = 0
            # run at most l_episodes steps per episode
            for t in range(self.l_episodes):
                action = self.act(state, eps)
                reward, next_state, done = self.env.step(action)  # send the action to the environment and observe
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward*np.power(GAMMA, t)
                if done:
                    break
            scores_window.append(score)
            eps = max(self.eps_end, self.eps_decay*eps)
            if i_episode % 100 == 0:
                score = np.mean(scores_window)
                means.append(score)
                # if the current mean score is better, save the network weights and update the best score seen
                if score > self.best_score:
                    self.best_score = score
                    torch.save(self.qnetwork_local.state_dict(), self.checkpoint_name + '.pth')
                print('\rEpisode {}\tAverage Score: {:.2f}\tBest Score: {:.2f}'.format(i_episode, score, self.best_score))
                f = open(self.checkpoint_name + ".scores", "a")
                for score in scores_window:
                    f.write(str(score) + '\n')
                f.close()
        plt.plot(np.arange(len(means)), means, label='Mean', color='r')
        plt.ylabel('Mean Score')
        plt.xlabel('Episode #')
        plt.savefig(self.checkpoint_name + '_run_' + str(self.learning_count) + '.png')
        return self
from r4l import parser
import random
import copy
import numpy as np
NEGATIVE_REWARD = -50    # reward for crashing into a wall or leaving the map
POSITIVE_REWARD = 100    # reward for reaching a goal
STEP_REWARD = 0          # reward for every other step
GOAL = 99
WALL = 98
NOISE_PROBABILITY = 0.1  # probability that the chosen acceleration is replaced by (0, 0) in a noisy environment
class environment:
    def __init__(self, map_name, random_start=False, noisy=False):
        self.height, self.width, self.map = parser.parse_file(map_name)
        self.noisy = noisy
        self.random_start = random_start
        # collect start ('s') and goal ('g') cells from the map
        self.starters = []
        self.goals = []
        for i, row in enumerate(self.map):
            for j, sign in enumerate(row):
                if sign == 's':
                    self.starters.append((i, j))
                if sign == 'g':
                    self.goals.append((i, j))
        # the 9 actions: every combination of accelerating by -1, 0 or +1 on each axis
        self.dict = {0: (-1, -1), 1: (0, -1), 2: (1, -1), 3: (-1, 0), 4: (0, 0), 5: (1, 0), 6: (-1, 1), 7: (0, 1), 8: (1, 1)}
        self.position = np.array(random.choice(self.starters))
        if random_start:
            # pick a random non-terminal cell as the starting position
            x = random.randrange(0, self.height)
            y = random.randrange(0, self.width)
            while self.terminal(x, y):
                x = random.randrange(0, self.height)
                y = random.randrange(0, self.width)
            self.position = np.array([x, y])
        self.velocity = np.array((0, 0))
        self.done = False
        self.path = [self.position]
        # pre-compute the distance and goal-distance features for every cell
        distances = np.zeros((self.height, self.width)).tolist()
        for x in range(self.height):
            for y in range(self.width):
                if self.terminal(x, y):
                    all_d = np.zeros(11).tolist()  # 11 is the hard-coded number of distance features TODO: generate this
                else:
                    d = self.calculate_distances(x, y)
                    dg = self.calculate_goal_distances(x, y)
                    all_d = d + dg
                distances[x][y] = all_d
        self.distances = distances
    # A cell is terminal if it is outside the map, a wall, or a goal.
    def terminal(self, x, y):
        if x < 0 or y < 0 or x >= self.height or y >= self.width or ((x, y) in self.goals):
            return True
        return self.map[x][y] == "x"

    # A cell counts as a wall if it is outside the map or marked 'x'.
    def wall(self, x, y):
        if x < 0 or y < 0 or x >= self.height or y >= self.width:
            return True
        return self.map[x][y] == "x"

    # Compute the cells visited when moving from (x, y) by (dx, dy) in a straight line.
    def calculate_intermediates(self, x, y, dx, dy):
        # trivial case:
        if dx == 0 and dy == 0:
            return [(x, y)]
        res = []
        # case 1, dx == 0
        if dx == 0:
            # each possible y value
            m = np.sign(dy)  # evaluates to 1 or -1, depending on whether dy > 0 or dy < 0
            for i in range(np.abs(dy)+1):
                res.append((x, y + i*m))
            return res
        # case 2, dy == 0
        if dy == 0:
            m = np.sign(dx)  # evaluates to 1 or -1, depending on whether dx > 0 or dx < 0
            for i in range(np.abs(dx)+1):
                res.append((x + i*m, y))
            return res
        # case 3, dx and dy != 0, |dx| >= |dy|
        if np.abs(dx) >= np.abs(dy):
            m_y = dy / np.abs(dx)
            m_x = np.sign(dx)
            for i in range(np.abs(dx) + 1):
                act_x = int(x + i*m_x)
                act_y = int(round(y + i*m_y))
                res.append((act_x, act_y))
            return res
        # case 4, dx and dy != 0, |dx| < |dy|
        if np.abs(dx) < np.abs(dy):
            m_x = dx / np.abs(dy)
            m_y = np.sign(dy)
            for i in range(np.abs(dy) + 1):
                act_y = int(y + i*m_y)
                act_x = int(round(x + i*m_x))
                res.append((act_x, act_y))
            return res
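    # Example: calculate_intermediates(0, 0, 3, 1) returns [(0, 0), (1, 0), (2, 1), (3, 1)],
    # a discretised straight line from the old position to the new one; step() uses it
    # to detect goals and walls that were crossed during a move.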
    # Offset (dx, dy) and Manhattan distance to the closest goal cell.
    def calculate_goal_distances(self, x, y):
        pos = np.array((x, y))
        dx = self.height + 1
        dy = self.width + 1
        d_m = dx + dy
        for goal in self.goals:
            g = np.array(goal)
            d = g - pos
            m = np.abs(d[0]) + np.abs(d[1])
            if m < d_m:
                dx = d[0]
                dy = d[1]
                d_m = m
        return [dx, dy, d_m]

    # Distance to the nearest wall in each of the 8 compass directions.
    def calculate_distances(self, x, y):
        pos = np.array((x, y))
        res = np.zeros(8)
        x_directions = [-1, 0, 1, -1, 1, -1, 0, 1]
        y_directions = [-1, -1, -1, 0, 0, 1, 1, 1]
        for i, (dx, dy) in enumerate(zip(x_directions, y_directions)):
            direction = np.array((dx, dy))
            distance = 1
            while True:
                checking_coordinate = pos + distance*direction
                if self.wall(checking_coordinate[0], checking_coordinate[1]):
                    res[i] = distance
                    break
                distance += 1
        return res.tolist()
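    # Together, calculate_distances (8 values: distance to the nearest wall in each of the
    # 8 compass directions) and calculate_goal_distances (3 values: dx, dy and Manhattan
    # distance to the closest goal) make up the 11 distance features cached in self.distances.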
    # Print the map; unless hide_positions is set, the visited path is overlaid
    # with the step index modulo 10.
    def show(self, hide_positions=False):
        show = []
        for line in self.map:
            show.append(list(line))
        if not hide_positions:
            for i, position in enumerate(self.path):
                x, y = position
                if not (x < 0 or y < 0 or x >= self.height or y >= self.width):
                    show[x][y] = str(i % 10)
        for line in show:
            print("".join(line))

    # Apply one action and return (reward, state, done).
    def step(self, action):
        if self.done:
            print("Already done, step has no further effect")
            return STEP_REWARD, (self.position, self.velocity), self.done
        # noise: with probability NOISE_PROBABILITY, the chosen acceleration is dropped
        if self.noisy:
            if np.random.rand() < NOISE_PROBABILITY:
                # action 4 is the action that does nothing (acceleration (0, 0))
                action = 4
        action = np.array(self.dict[action])
        self.velocity = self.velocity + action
        old_position = self.position
        self.position = self.position + self.velocity
        self.path.append(self.position)
        x, y = self.position
        reward = STEP_REWARD
        # check every cell crossed on the way for goals and walls
        intermediates = self.calculate_intermediates(old_position[0], old_position[1], self.velocity[0], self.velocity[1])
        for intermediate in intermediates:
            if intermediate in self.goals:
                self.done = True
                reward = POSITIVE_REWARD
                break
            if self.wall(intermediate[0], intermediate[1]):
                self.done = True
                reward = NEGATIVE_REWARD
                break
        # Strictly speaking, the state consists of position and velocity; the distances are only features.
        # For simplicity of implementation, the features are returned together with the state.
        if not self.done:
            return reward, (list(self.position) + list(self.velocity) + self.distances[x][y]), self.done
        else:
            return reward, (list(self.position) + list(self.velocity) + np.zeros(11).tolist()), self.done
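    # Hypothetical interaction with the environment (reset() is defined below):
    #   env = environment('../maps/barto-big.track')
    #   state = env.reset()
    #   reward, next_state, done = env.step(4)   # action 4 = (0, 0): keep the current velocity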
    # Reset position, velocity, done flag and path, and return the initial state.
    def reset(self):
        if self.random_start:
            x = random.randrange(0, self.height)
            y = random.randrange(0, self.width)
            while self.terminal(x, y):
                x = random.randrange(0, self.height)
                y = random.randrange(0, self.width)
            self.position = np.array([x, y])
        else:
            self.position = np.array(random.choice(self.starters))
        self.velocity = np.array((0, 0))
        self.done = False
        self.path = [self.position]
        x, y = self.position
        return (list(self.position) + list(self.velocity) + self.distances[x][y])
#!/usr/bin/env python
# Parse a track file: the header line contains the height and width, followed by the map rows.
def parse_file(filename, replace_starts=False):
    with open(filename, 'r') as f:
        first = f.readline().split()
        height = int(first[1])
        width = int(first[2])
        map = []
        for i in range(height):
            line = f.readline().rstrip()
            if replace_starts:
                line = line.replace("s", ".")
            map.append(line)
    return height, width, map
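# A minimal sketch of the expected track file layout, inferred from this parser and the
# environment code (the first header token is not interpreted here):
#   <name> <height> <width>
#   xxxxx
#   xs.gx        's' = start, 'g' = goal, 'x' = wall, '.' = free track
#   xxxxx
# e.g. a header line "track 3 5" for the 3x5 map above.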