Commit 54a973ab authored by Eduard Pizur

changed Rainbow and C51

parent e4171e33
# import os
# import random
# import time
# import numpy as np
# import torch
#
# from collections import deque
# from utils.experience_replay.replay_memory import ReplayMemory as NormReplayMemory
#
# from utils.constant import *
# from utils.base_agent import AtariAgent
# from utils.nets.conv_nets.categorical_q_network import Qnetwork
#
#
# class Agent(AtariAgent):
# def __init__(self, per):
# super(Agent, self).__init__()
# self.per = per
# self.gamma = DISCOUNT_FACTOR
#
# self.num_of_actions = NUM_OF_ACTIONS
# self.num_of_atoms = NUM_OF_ATOMS
# self.batch_size = BATCH_SIZE
#
# self.v_max = V_MAX
# self.v_min = V_MIN
# self.delta_z = (self.v_max - self.v_min) / (self.num_of_atoms - 1)
# self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).to(DEVICE)
#
# self.memory = NormReplayMemory()
#
# self.path_to_folder = os.path.join(
# os.getcwd(), "models/C51_deep_q_network")
# self.path_to_folder = os.path.abspath(self.path_to_folder)
#
# # init network
# self.network = Qnetwork()
# # init target network
# self.target_network = Qnetwork()
#
# # initialize the target network with the same parameters as the online network
# self.target_network.load_state_dict(self.network.state_dict())
# self.target_network.eval()
#
# def choose_action(self, state):
# '''
# choose an action using the epsilon-greedy strategy
# '''
# with torch.no_grad():
# if random.random() > self.epsilon:
# state = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
# action = self.network.forward(state)
# action = torch.argmax(action).item()
# else:
# action = random.randrange(self.num_of_actions)
# return action
#
# # def target_distribution(self, next_states, rewards, dones):
# # with torch.no_grad():
# # support = self.support
# # temp_dist = self.network.forward(next_states) * support
# # optim_actions = temp_dist.sum(2).max(1)[1]
# # optim_actions = optim_actions.view(self.batch_size,-1).unsqueeze(1).expand(self.batch_size, 1, self.num_of_atoms)
# #
# # ###
# # target_dist = self.target_network.forward(next_states)
# # target_dist = target_dist.gather(1, optim_actions).squeeze()
# #
# # rewards = rewards.unsqueeze(1).expand(self.batch_size, self.num_of_atoms)
# # dones = dones.unsqueeze(1).expand(self.batch_size, self.num_of_atoms)
# # dones = dones.type(torch.uint8)
# # support = self.support.unsqueeze(0).expand(self.batch_size, self.num_of_atoms)
# #
# # T_z = rewards + self.gamma * (1 - dones) * support
# # T_z = T_z.clamp(min = self.v_min, max = self.v_max)
# #
# # # relative position of possible values
# # b = (T_z - self.v_min) / self.delta_z
# #
# # # lower and upper bound of relative positions
# # l = b.floor().to(torch.int64)
# # u = b.ceil().to(torch.int64)
# #
# # # offset
# # offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int16)
# # offset = offset.unsqueeze(1)
# # offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
# #
# # distribute the projected probability mass over the atom bins
# # m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
# # m.view(-1).index_add_(0, (l + offset).view(-1),\
# # (target_dist * (u.float() - b)).view(-1))
# # m.view(-1).index_add_(0, (u + offset).view(-1), \
# # (target_dist * (b - l.float())).view(-1))
# # return m
# #
# # def train(self):
# # '''
# # train our agent
# # '''
# # states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
# # actions = actions.view(self.batch_size, 1, 1).expand(-1, -1, self.num_of_atoms)
# #
# # # Target distribution
# # target_dist = self.target_distribution(next_states, rewards, dones)
# # # Current distribution
# # curr_dist = self.network.forward(states)
# # curr_dist = curr_dist.gather(1, actions).squeeze()
# #
# # print(curr_dist)
# # print(target_dist)
# #
# # time.sleep(20)
# #
# # compute the KL-divergence (cross-entropy) loss
# # loss = -(target_dist * curr_dist.log()).sum(-1)
# # loss = loss.mean()
# #
# # log the loss to TensorBoard
# # self.append_loss(loss)
# #
# # # OPTIMIZE
# # self.network.optimizer.zero_grad()
# # loss.backward()
# # for param in self.network.parameters():
# # param.grad.data.clamp_(-1, 1)
# # self.network.optimizer.step()
#
# # def train(self):
# # latest version
# # states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
# # temp_support = self.support.view(1,1,-1)
# #
# # # current distribution
# # curr_dist = self.network(states)
# # curr_dist = torch.stack([curr_dist[i].index_select(0, actions[i]) for i in range(self.batch_size)]).squeeze(1)
# #
# #
# # # find max action
# # with torch.no_grad():
# # q_next = self.target_network.forward(next_states)
# # q_next_mean = torch.sum(q_next * temp_support, dim=2)
# # max_actions = q_next_mean.argmax(dim=1)
# #
# # q_next = torch.stack([q_next[i].index_select(0, max_actions[i]) for i in range(self.batch_size)]).squeeze(1).to(DEVICE)
# # q_next = q_next.data
# #
# # rewards = rewards.view(-1,1)
# # dones = dones.to(torch.int8).view(-1,1)
# # dones = 1 - dones
# #
# # print(self.support.shape)
# # print(dones.shape)
# # print(rewards.shape)
# #
# # time.sleep(20)
# #
# #
# # t_support = self.support.unsqueeze(0).expand(self.batch_size, self.num_of_atoms)
# #
# # T_z = rewards + self.gamma * dones * t_support
# #
# # T_z = T_z.clamp(min = self.v_min, max = self.v_max)
# # b = (T_z - self.v_min) / self.delta_z
# #
# # l = b.floor().to(torch.int64).to(DEVICE)
# # u = b.ceil().to(torch.int16).to(DEVICE)
# #
# # # target distribution
# # target_dist = torch.zeros((self.batch_size, self.num_of_atoms)).to(DEVICE)
# #
# # # for i in range(self.batch_size):
# # # for j in range(self.num_of_atoms):
# # # target_dist[i, l[i,j]] += (q_next * (u - b))[i,j]
# # # target_dist[i, u[i,j]] += (q_next * (b - l))[i,j]
# #
# #
# # offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int64).to(DEVICE)
# # offset = offset.unsqueeze(1)
# # offset = offset.expand(self.batch_size, self.num_of_atoms)
# #
# # target_dist.view(-1).index_add_(0, (l+offset).view(-1), (q_next * (u-b)).view(-1))
# # target_dist.view(-1).index_add_(0, (u+offset).view(-1), (q_next * (b-l)).view(-1))
# #
# # loss
# # loss = target_dist * (- torch.log(curr_dist + 1e-8))
# # loss = torch.mean(loss)
# #
# # log the loss to TensorBoard
# # self.append_loss(loss)
# #
# # # OPTIMIZE
# # self.network.optimizer.zero_grad()
# # loss.backward()
# # for param in self.network.parameters():
# # param.grad.data.clamp_(-1, 1)
# # self.network.optimizer.step()
#
# def train(self):
# '''
# train the agent (C51 categorical update)
# '''
# states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
#
# dones = dones.to(torch.int8)
# rewards = rewards.view(-1, 1)
# dones = dones.view(-1,1)
#
# with torch.no_grad():
# next_action = self.target_network.forward(next_states).argmax(1)
# next_dist = self.target_network.dist(next_states)
# next_dist = next_dist[range(self.batch_size), next_action]
#
# T_z = rewards + self.gamma * (1 - dones) * self.support
# T_z = T_z.clamp(min = self.v_min, max = self.v_max)
# b = (T_z - self.v_min) / self.delta_z
# l = b.floor().to(torch.int64).to(DEVICE)
# u = b.ceil().to(torch.int16).to(DEVICE)
#
# offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size,
# dtype=torch.int64).to(DEVICE)
# offset = offset.unsqueeze(1)
# offset = offset.expand(self.batch_size, self.num_of_atoms)
#
# proj_dist = torch.zeros(next_dist.size()).to(DEVICE)
# proj_dist.view(-1).index_add_(
# 0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)
# )
# proj_dist.view(-1).index_add_(
# 0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)
# )
#
# curr_dist = self.network.dist(states)
# log_p = torch.log(curr_dist[range(self.batch_size), actions])
#
# loss = -(proj_dist * log_p).sum(1)
# loss = torch.mean(loss)
#
# # log the loss to TensorBoard
# self.append_loss(loss)
#
# # OPTIMIZE
# self.network.optimizer.zero_grad()
# loss.backward()
# # for param in self.network.parameters():
# # param.grad.data.clamp_(-1, 1)
# self.network.optimizer.step()
import os
import random
import time
@@ -25,8 +273,7 @@ class Agent(AtariAgent):
self.v_max = V_MAX
self.v_min = V_MIN
self.delta_z = (self.v_max - self.v_min) / (self.num_of_atoms - 1)
self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).to(DEVICE)
# self.support = torch.tensor(self.support, dtype=torch.float16).to(DEVICE)
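# viewing the support as (1, 1, num_of_atoms) lets it broadcast over the (batch, action, atom) output of the network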
self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).view(1,1,self.num_of_atoms).to(DEVICE)
self.memory = NormReplayMemory()
@@ -41,142 +288,90 @@ class Agent(AtariAgent):
# initialize the target network with the same parameters as the online network
self.target_network.load_state_dict(self.network.state_dict())
self.network.train()
self.target_network.eval()
def choose_action(self, state):
'''
choose an action using the epsilon-greedy strategy
'''
with torch.no_grad():
if random.random() > self.epsilon:
state_ = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
action = self.network.forward(state_) * self.support
action = torch.sum(action, dim=2)
action = torch.argmax(action).item()
else:
action = random.randrange(self.num_of_actions)
if random.random() > self.epsilon:
with torch.no_grad():
state = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
action = self.network.forward(state)
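# expected return of each action: Q(s, a) = sum_i z_i * p_i(s, a); the greedy action is its argmax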
action = action * self.support
action = action.sum(dim=2).max(1)
action = action[1].view(1,1)
else:
action = random.randrange(self.num_of_actions)
action = torch.tensor([[action]], dtype=torch.int64).to(DEVICE)  # long dtype so the action can be used as an index later
return action
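# note: both branches now return a [1, 1] tensor; callers presumably convert it (e.g. action.item()) before stepping the environment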
# def target_distribution(self, next_states, rewards, dones):
# with torch.no_grad():
# support = self.support
# temp_dist = self.network.forward(next_states) * support
# optim_actions = temp_dist.sum(2).max(1)[1]
# optim_actions = optim_actions.view(self.batch_size,-1).unsqueeze(1).expand(self.batch_size, 1, self.num_of_atoms)
#
# ###
# target_dist = self.target_network.forward(next_states)
# target_dist = target_dist.gather(1, optim_actions).squeeze()
#
# rewards = rewards.unsqueeze(1).expand(self.batch_size, self.num_of_atoms)
# dones = dones.unsqueeze(1).expand(self.batch_size, self.num_of_atoms)
# dones = dones.type(torch.uint8)
# support = self.support.unsqueeze(0).expand(self.batch_size, self.num_of_atoms)
#
# T_z = rewards + self.gamma * (1 - dones) * support
# T_z = T_z.clamp(min = self.v_min, max = self.v_max)
#
# # relative position of possible values
# b = (T_z - self.v_min) / self.delta_z
#
# # lower and upper bound of relative positions
# l = b.floor().to(torch.int64)
# u = b.ceil().to(torch.int64)
#
# # offset
# offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int16)
# offset = offset.unsqueeze(1)
# offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
#
# distribute the projected probability mass over the atom bins
# m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
# m.view(-1).index_add_(0, (l + offset).view(-1),\
# (target_dist * (u.float() - b)).view(-1))
# m.view(-1).index_add_(0, (u + offset).view(-1), \
# (target_dist * (b - l.float())).view(-1))
# return m
#
# def train(self):
# '''
# train our agent
# '''
# states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
# actions = actions.view(self.batch_size, 1, 1).expand(-1, -1, self.num_of_atoms)
#
# # Target distribution
# target_dist = self.target_distribution(next_states, rewards, dones)
# # Current distribution
# curr_dist = self.network.forward(states)
# curr_dist = curr_dist.gather(1, actions).squeeze()
#
# print(curr_dist)
# print(target_dist)
#
# time.sleep(20)
#
# compute the KL-divergence (cross-entropy) loss
# loss = -(target_dist * curr_dist.log()).sum(-1)
# loss = loss.mean()
#
# log the loss to TensorBoard
# self.append_loss(loss)
#
# # OPTIMIZE
# self.network.optimizer.zero_grad()
# loss.backward()
# for param in self.network.parameters():
# param.grad.data.clamp_(-1, 1)
# self.network.optimizer.step()
def project_distribution(self, states, actions, next_states, rewards, dones):
def get_action_argmax_next_Q_sa(next_states):
next_dist = self.target_network.forward(next_states) * self.support
next_Q_sa = next_dist.sum(dim=2).max(1)[1]
next_Q_sa = next_Q_sa.view(next_states.size(0),1,1)
next_Q_sa = next_Q_sa.expand(-1,-1, self.num_of_atoms)
def train(self):
states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
temp_support = self.support.view(1,1,-1)
return next_Q_sa
# current distribution
curr_dist = self.network(states)
curr_dist = torch.stack([curr_dist[i].index_select(0, actions[i]) for i in range(self.batch_size)]).squeeze(1)
with torch.no_grad():
dones = dones.to(torch.bool)
max_next_action = get_action_argmax_next_Q_sa(next_states)
max_next_dist = self.target_network.forward(next_states)
max_next_dist = max_next_dist.gather(1, max_next_action)
max_next_dist = max_next_dist.squeeze()
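# terminal transitions: once (1 - dones) zeroes the bootstrap term, T_z collapses to the reward, so the next-state distribution is replaced with a uniform placeholder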
max_next_dist[dones] = 1.0 / self.num_of_atoms
# find max action
with torch.no_grad():
q_next = self.target_network.forward(next_states)
q_next_mean = torch.sum(q_next * temp_support, dim=2)
max_actions = q_next_mean.argmax(dim=1)
dones = dones.type(torch.uint8)
q_next = torch.stack([q_next[i].index_select(0, max_actions[i]) for i in range(self.batch_size)]).squeeze(1).to(DEVICE)
q_next = q_next.data
T_z = rewards.view(-1, 1) + self.gamma * self.support.view(1,-1) * (1-dones).view(-1,1)
T_z = T_z.clamp(min = self.v_min, max = self.v_max)
rewards = rewards.view(-1,1)
dones = dones.to(torch.int8).view(-1,1)
dones = 1 - dones
t_support = self.support.unsqueeze(0).expand(self.batch_size, self.num_of_atoms)
# relative position of possible values
b = (T_z - self.v_min) / self.delta_z
T_z = rewards + self.gamma * dones * t_support
# lower and upper bound of relative positions
l = b.floor().to(torch.int64)
u = b.ceil().to(torch.int64)
T_z = T_z.clamp(min = self.v_min, max = self.v_max)
b = (T_z - self.v_min) / self.delta_z
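# when b lands exactly on an atom (l == u), both weights (u - b) and (b - l) are zero and the mass would be dropped, so one bound is shifted by a single bin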
l[(u>0) * (l==u)] -= 1
u[(l<(self.num_of_atoms-1)) * (l==u)] += 1
l = b.floor().to(torch.int64).to(DEVICE)
u = b.ceil().to(torch.int16).to(DEVICE)
# offset
offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int64)  # int64 so the combined index tensor matches what index_add_ expects
offset = offset.unsqueeze(1)
offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
# target distribution
target_dist = torch.zeros((self.batch_size, self.num_of_atoms)).to(DEVICE)
# distribute the projected probability mass over the atom bins
m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
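# split each atom's probability mass between the neighboring bins l and u, in proportion to how close b is to each of them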
m.view(-1).index_add_(0, (l + offset).view(-1),\
(max_next_dist * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), \
(max_next_dist * (b - l.float())).view(-1))
return m
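# Worked example of the projection above (illustrative; assumes the common C51 settings
# V_MIN = -10, V_MAX = 10, NUM_OF_ATOMS = 51 -- the actual values come from utils.constant):
#   delta_z = (10 - (-10)) / (51 - 1) = 0.4
#   a target value T_z = 3.1 gives b = (3.1 - (-10)) / 0.4 = 32.75, so l = 32 and u = 33,
#   and its mass is split as (u - b) = 0.25 onto atom 32 and (b - l) = 0.75 onto atom 33.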
def train(self):
'''
train the agent (C51 categorical update)
'''
states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
# for i in range(self.batch_size):
# for j in range(self.num_of_atoms):
# target_dist[i, l[i,j]] += (q_next * (u - b))[i,j]
# target_dist[i, u[i,j]] += (q_next * (b - l))[i,j]
actions = actions.unsqueeze(1).unsqueeze(1).expand(-1,-1,self.num_of_atoms)
rewards = rewards.view(-1,1,1)
dones = dones.to(torch.int8)
offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int64).to(DEVICE)
offset = offset.unsqueeze(1)
offset = offset.expand(self.batch_size, self.num_of_atoms)
y = self.network.forward(states)
target_dist.view(-1).index_add_(0, (l+offset).view(-1), (q_next * (u-b)).view(-1))
target_dist.view(-1).index_add_(0, (u+offset).view(-1), (q_next * (b-l)).view(-1))
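# distribution predicted by the online network for the action actually taken in each sampled state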
curr_dist = y.gather(1, actions).squeeze()
target_prob = self.project_distribution(states, actions, next_states, rewards, dones)
# loss
loss = target_dist * (- torch.log(curr_dist + 1e-8))
loss = -(target_prob * curr_dist.log()).sum(-1)
loss = torch.mean(loss)
# log the loss to TensorBoard
@@ -84,7 +84,6 @@ class Agent(AtariAgent):
loss = torch.mean(loss)
self.memory.update_memory(prior_idxs, diff.detach().cpu().numpy())
else:
# MSE
loss = self.network.loss(q_vals_net, q_vals_target)
@@ -2,12 +2,14 @@ import os
import random
import time
import torch
from torch.nn.utils import clip_grad_norm_
from collections import deque
from utils.constant import *
from utils.base_agent import AtariAgent
from utils.nets.conv_nets.rainbow_q_network import Qnetwork
from utils.experience_replay.prioritized_replay_memory import ReplayMemory
from utils.experience_replay.prioritized_replay_memory import ReplayMemory as Prior
from utils.experience_replay.replay_memory import ReplayMemory as Replay
class Agent(AtariAgent):
@@ -30,7 +32,10 @@ class Agent(AtariAgent):
self.buffer = deque(maxlen=self.n_step)
# replay memory: prioritized (PER) or uniform, selected by self.per
self.memory = ReplayMemory()
if self.per:
self.memory = Prior()
else:
self.memory = Replay()
self.path_to_folder = os.path.join(
os.getcwd(), "models/rainbow_deep_q_network")
@@ -62,19 +67,207 @@ class Agent(AtariAgent):
self.buffer.clear()
self.memory.add(exp)
def choose_action(self, state):
'''
choose the greedy action; exploration comes from the NoisyNet layers rather than epsilon-greedy
'''
with torch.no_grad():
self.network.generate_noise()
state_ = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
action = self.network.forward(state_) * self.support
action = action.sum(2)
action = torch.argmax(action).item()
state = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
action = self.network.forward(state).argmax()
action = action.detach().cpu().numpy()
return action
def proj_dist(self, next_state, reward, done):
with torch.no_grad():
self.network.generate_noise()
self.target_network.generate_noise()
next_action = self.network.forward(next_state).argmax(1)
next_dist = self.target_network.dist(next_state)
next_dist = next_dist[range(self.batch_size), next_action]
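# n-step distributional Bellman target: T_z = r + gamma^n * (1 - done) * z, clamped to [v_min, v_max] and projected back onto the fixed support below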
T_z = reward + (1 - done) * (self.gamma ** self.n_step) * self.support
T_z = T_z.clamp(min=self.v_min, max=self.v_max)
b = (T_z - self.v_min) / self.delta_z
l = b.floor().long()
u = b.ceil().long()
# offset
offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int64)  # int64 so the combined index tensor matches what index_add_ expects
offset = offset.unsqueeze(1)
offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
proj_dist = torch.zeros(next_dist.size()).to(DEVICE)
proj_dist.view(-1).index_add_(0, (l + offset).view(-1), \
(next_dist * (u.float() - b)).view(-1))
proj_dist.view(-1).index_add_(0, (u + offset).view(-1), \
(next_dist * (b - l.float())).view(-1))
return proj_dist
def train(self):
states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
rewards = rewards.reshape(-1,1)
dones = dones.reshape(-1,1)
dones = dones.to(torch.int8)
proj_dist = self.proj_dist(next_states, rewards, dones)
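# resample the NoisyNet noise, then compute the current return distribution with the online network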
self.network.generate_noise()
curr_dist = self.network.dist(states)
log_p = torch.log(curr_dist[range(self.batch_size), actions])
loss = -(proj_dist * log_p).sum(1)
loss = torch.mean(loss)
# log the loss to TensorBoard
self.append_loss(loss)
# OPTIMIZE
self.network.optimizer.zero_grad()
loss.backward()
# clip_grad_norm_(self.network.parameters(), 10.0)
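# element-wise gradient clamp to [-1, 1]; clip_grad_norm_ above is the norm-based alternative, currently disabled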
for param in self.network.parameters():
param.grad.data.clamp_(-1, 1)
self.network.optimizer.step()
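# Illustrative sketch of how this agent might be driven (assumes a Gym-style `env`;
# `store_transition` and TRAINING_STEPS are hypothetical names -- only choose_action()
# and train() come from this file):
#
#   state = env.reset()
#   for step in range(TRAINING_STEPS):
#       action = agent.choose_action(state)
#       next_state, reward, done, _ = env.step(int(action))
#       agent.store_transition(state, action, next_state, reward, done)  # hypothetical helper
#       agent.train()
#       state = env.reset() if done else next_state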
################# FROM HERE: previous version (kept commented out) #################
# def choose_action(self, state):
# '''
# choose an action (greedy with respect to the noisy network)
# '''
# with torch.no_grad():
# self.network.generate_noise()
#
# state_ = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
# action = self.network.forward(state_) * self.support
# action = action.sum(2)