Commit 0837f084 authored by Eduard Pizur

minor changes

parent 54a973ab
@@ -298,80 +298,52 @@ class Agent(AtariAgent):
if random.random() > self.epsilon:
with torch.no_grad():
state = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
action = self.network.forward(state)
action = action * self.support
action = action.sum(dim=2).max(1)
action = action[1].view(1,1)
action = self.network.forward(state).argmax()
action = action.detach().cpu().numpy()
else:
action = random.randrange(self.num_of_actions)
action = torch.tensor([[action]], dtype=torch.int16).to(DEVICE)
return action
def project_distribution(self, states, actions, next_states, rewards, dones):
def get_action_argmax_next_Q_sa(next_states):
next_dist = self.target_network.forward(next_states) * self.support
next_Q_sa = next_dist.sum(dim=2).max(1)[1]
next_Q_sa = next_Q_sa.view(next_states.size(0),1,1)
next_Q_sa = next_Q_sa.expand(-1,-1, self.num_of_atoms)
return next_Q_sa
def proj_dist(self, next_state, reward, done):
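# categorical (C51) projection: map the Bellman-updated support onto the fixed atoms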
with torch.no_grad():
dones = dones.to(torch.bool)
max_next_action = get_action_argmax_next_Q_sa(next_states)
max_next_dist = self.target_network.forward(next_states)
max_next_dist = max_next_dist.gather(1, max_next_action)
max_next_dist = max_next_dist.squeeze()
max_next_dist[dones] = 1.0 / self.num_of_atoms
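# Double-DQN style target: the online network picks the greedy next action, the target network supplies its distribution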
next_action = self.network.forward(next_state).argmax(1)
next_dist = self.target_network.dist(next_state)
dones = dones.type(torch.uint8)
next_dist = next_dist[range(self.batch_size), next_action]
T_z = rewards.view(-1, 1) + self.gamma * self.support.view(1,-1) * (1-dones).view(-1,1)
T_z = T_z.clamp(min = self.v_min, max = self.v_max)
# relative position of possible values
T_z = reward + (1 - done) * self.gamma * self.support
T_z = T_z.clamp(min=self.v_min, max=self.v_max)
b = (T_z - self.v_min) / self.delta_z
# lower and upper bound of relative positions
l = b.floor().to(torch.int64)
u = b.ceil().to(torch.int64)
l[(u>0) * (l==u)] -= 1
u[(l<(self.num_of_atoms-1)) * (l==u)] += 1
l = b.floor().long()
u = b.ceil().long()
# offset
offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int16)
offset = offset.unsqueeze(1)
offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
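# offsets shift each sample's atom indices so index_add_ can scatter into the flattened (batch * atoms) buffer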
# distribution and adjustment of the atom values
m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
m.view(-1).index_add_(0, (l + offset).view(-1),\
(max_next_dist * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), \
(max_next_dist * (b - l.float())).view(-1))
return m
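# split each projected atom's probability mass between its lower and upper neighbouring support indices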
proj_dist = torch.zeros(next_dist.size()).to(DEVICE)
proj_dist.view(-1).index_add_(0, (l + offset).view(-1), \
(next_dist * (u.float() - b)).view(-1))
proj_dist.view(-1).index_add_(0, (u + offset).view(-1), \
(next_dist * (b - l.float())).view(-1))
return proj_dist
def train(self):
'''
train the network on a batch sampled from replay memory
'''
states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
actions = actions.unsqueeze(1).unsqueeze(1).expand(-1,-1,self.num_of_atoms)
rewards = rewards.view(-1,1,1)
rewards = rewards.reshape(-1,1)
dones = dones.reshape(-1,1)
dones = dones.to(torch.int8)
proj_dist = self.proj_dist(next_states, rewards, dones)
y = self.network.forward(states)
curr_dist = self.network.dist(states)
log_p = torch.log(curr_dist[range(self.batch_size), actions])
curr_dist = y.gather(1, actions).squeeze()
target_prob = self.project_distribution(states, actions, next_states, rewards, dones)
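# loss: cross-entropy between the projected target distribution and the predicted log-probabilities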
loss = -(target_prob * curr_dist.log()).sum(-1)
loss = -(proj_dist * log_p).sum(1)
loss = torch.mean(loss)
# log the loss to TensorBoard
@@ -380,6 +352,84 @@ class Agent(AtariAgent):
# OPTIMIZE
self.network.optimizer.zero_grad()
loss.backward()
# clip_grad_norm_(self.network.parameters(), 10.0)
for param in self.network.parameters():
param.grad.data.clamp_(-1, 1)
self.network.optimizer.step()
# def project_distribution(self, states, actions, next_states, rewards, dones):
# def get_action_argmax_next_Q_sa(next_states):
# next_dist = self.target_network.forward(next_states) * self.support
# next_Q_sa = next_dist.sum(dim=2).max(1)[1]
# next_Q_sa = next_Q_sa.view(next_states.size(0),1,1)
# next_Q_sa = next_Q_sa.expand(-1,-1, self.num_of_atoms)
#
# return next_Q_sa
#
# with torch.no_grad():
# dones = dones.to(torch.bool)
#
# max_next_action = get_action_argmax_next_Q_sa(next_states)
# max_next_dist = self.target_network.forward(next_states)
# max_next_dist = max_next_dist.gather(1, max_next_action)
# max_next_dist = max_next_dist.squeeze()
# max_next_dist[dones] = 1.0 / self.num_of_atoms
#
# dones = dones.type(torch.uint8)
#
# T_z = rewards.view(-1, 1) + self.gamma * self.support.view(1,-1) * (1-dones).view(-1,1)
# T_z = T_z.clamp(min = self.v_min, max = self.v_max)
#
# # relative position of possible values
# b = (T_z - self.v_min) / self.delta_z
#
# # lower and upper bound of relative positions
# l = b.floor().to(torch.int64)
# u = b.ceil().to(torch.int64)
#
# l[(u>0) * (l==u)] -= 1
# u[(l<(self.num_of_atoms-1)) * (l==u)] += 1
#
# # offset
# offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int16)
# offset = offset.unsqueeze(1)
# offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
#
# # distribution and adjustment of the atom values
# m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
# m.view(-1).index_add_(0, (l + offset).view(-1),\
# (max_next_dist * (u.float() - b)).view(-1))
# m.view(-1).index_add_(0, (u + offset).view(-1), \
# (max_next_dist * (b - l.float())).view(-1))
#
# return m
#
# def train(self):
# '''
# train the network on a sampled batch
# '''
# states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
#
# actions = actions.unsqueeze(1).unsqueeze(1).expand(-1,-1,self.num_of_atoms)
# rewards = rewards.view(-1,1,1)
# dones = dones.to(torch.int8)
#
#
# y = self.network.forward(states)
#
# curr_dist = y.gather(1, actions).squeeze()
# target_prob = self.project_distribution(states, actions, next_states, rewards, dones)
#
# loss = -(target_prob * curr_dist.log()).sum(-1)
# loss = torch.mean(loss)
#
# # log the loss to TensorBoard
# self.append_loss(loss)
#
# # OPTIMIZE
# self.network.optimizer.zero_grad()
# loss.backward()
# for param in self.network.parameters():
# param.grad.data.clamp_(-1, 1)
# self.network.optimizer.step()
@@ -59,7 +59,7 @@ def main(short_name, full_name, agent, per):
using_per = "using_RM"
run_name = "runs/{}/{}/{}".format(full_name,using_per,
run_name = "runs_f/{}/{}/{}".format(full_name,using_per,
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
writer = SummaryWriter(run_name)
@@ -155,26 +155,25 @@ def main(short_name, full_name, agent, per):
if __name__ == "__main__":
# method and PER
combinations = [
# (1, False), #DQN RM
(1, False), #DQN RM
# (2, True), #DDQN Priority
# (2, False), # DDQN RM
# (4, False), #Dueling Double DQN RM
# (5, False), #Noisy DQN RM
# (9, False), # C51 DQN RM
# (10, True), #rainbow
(10, False) # rainbow
# (10, False) # rainbow
]
network = {
1: {
"full_name": "_deep_q_network",
"short_name": "_DQN",
"full_name": "_ddeep_q_network",
"short_name": "__DQN",
"agent": DQNAgent
},
2: {
"full_name": "_double_deep_q_network",
"short_name": "_DDQN",
"full_name": "_ddouble_deep_q_network",
"short_name": "__DDQN",
"agent": DDQNAgent
},
4: {
@@ -183,8 +182,8 @@ if __name__ == "__main__":
"agent": D3QNAgent
},
5: {
"full_name": "_noisy_deep_q_network",
"short_name": "_Noisy_DQN",
"full_name": "_nnoisy_deep_q_network",
"short_name": "_NNoisy_DQN",
"agent": Noisy_DQNAgent
},
8: {
@@ -193,8 +192,8 @@ if __name__ == "__main__":
"agent": N_Step_DQNAgent
},
9: {
"full_name": "__C51_deep_q_network",
"short_name": "_C51_DQN",
"full_name": "__CC51_deep_q_network",
"short_name": "_CC51_DQN",
"agent": C51_DQNAgent
},
10: {
......
@@ -28,7 +28,9 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# prioritized replay experience
PRIORITIZED_EPSILON = 0.001
ALPHA = 0.6  # how strongly to prioritize
# ALPHA = 0.2
BETA = 0.4  # importance-sampling exponent
# BETA = 0.6
# BETA_INCREMENT = 0.001
BETA_INC_STEPS = 1_000_000
# Noisy Linear
......
# # import torch
# # import torch.nn as nn
# # import torch.nn.functional as F
# # import torch.optim as optim
# # import numpy as np
# #
# # from utils.constant import *
# #
# # class Qnetwork(nn.Module):
# #
# # def __init__(self):
# # super(Qnetwork, self).__init__()
# # '''
# # CNN model for predicting Q values
# # '''
# # # self.learning_rate = LEARNING_RATE
# # self.learning_rate = 0.00005
# #
# # self.num_of_actions = NUM_OF_ACTIONS
# # self.num_of_atoms = NUM_OF_ATOMS
# # self.batch_size = BATCH_SIZE
# #
# # self.v_min = V_MIN
# # self.v_max = V_MAX
# # self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).to(DEVICE)
# #
# # self.cnn_layers = nn.Sequential(
# # nn.Conv2d(4, 32, kernel_size=8, stride=4),
# # nn.ReLU(),
# # nn.Conv2d(32, 64, kernel_size=4, stride=2),
# # nn.ReLU(),
# # nn.Conv2d(64, 64, kernel_size=3, stride=1),
# # nn.ReLU()
# # )
# #
# # self.fc_input = self.calculate_linear_input()
# #
# # self.linear_layers = nn.Sequential(
# # nn.Linear(in_features=self.fc_input, out_features=512),
# # nn.ReLU(),
# # nn.Linear(in_features=512, out_features=self.num_of_actions * self.num_of_atoms)
# # )
# #
# # # self.optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
# # self.optimizer = optim.RMSprop(self.parameters(), lr=self.learning_rate)
# # self.to(DEVICE)
# #
# # def calculate_linear_input(self, *conv_layers):
# # '''
# # returns size of input for fc layers
# # '''
# # fc_input = self.cnn_layers(torch.zeros(1, *INPUT_SHAPE))
# # return int(np.prod(fc_input.size()))
# #
# # def forward(self, state):
# # '''
# # returns distribution
# # '''
# # state = self.cnn_layers(state)
# # state = state.view(state.size()[0], -1)
# # dist = self.linear_layers(state)
# # dist = dist.view(-1, self.num_of_actions, self.num_of_atoms)
# # dist = F.softmax(dist, dim=2)
# # # dist = dist.clamp(min=0.0001)
# #
# # return dist
# #
# # def dist(self, state):
# # state = self.cnn_layers(state)
# # state = state.view(state.size()[0], -1)
# # q_atoms = self.linear_layers(state).view(-1, self.num_of_actions, self.num_of_atoms)
# # dist = F.softmax(q_atoms, dim=-1)
# # dist = dist.clamp(min=1e-3)
# #
# # return dist
# #
# # def forward(self, state):
# # dist = self.dist(state)
# # q = torch.sum(dist * self.support, dim=2)
# # return q
# #
#
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# import numpy as np
#
# from utils.constant import *
#
# # def init(module, weight_init, bias_init, gain=1, mode=None, nonlinearity='relu'):
# # if mode is not None:
# # weight_init(module.weight.data, mode=mode, nonlinearity=nonlinearity)
# # else:
# # weight_init(module.weight.data, gain=gain)
# # bias_init(module.bias.data)
# # return module
#
# class Qnetwork(nn.Module):
#
# def __init__(self):
# super(Qnetwork, self).__init__()
#
# self.learning_rate = LEARNING_RATE
# # self.learning_rate = 0.00005
#
# self.num_of_actions = NUM_OF_ACTIONS
# self.num_of_atoms = NUM_OF_ATOMS
# self.batch_size = BATCH_SIZE
#
# # init_ = lambda m: init(m,
# # nn.init.kaiming_uniform_,
# # lambda x: nn.init.constant_(x, 0),
# # nonlinearity='relu',
# # mode='fan_in')
# # init2_ = lambda m: init(m,
# # nn.init.kaiming_uniform_,
# # lambda x: nn.init.constant_(x, 0),
# # nonlinearity='relu',
# # mode='fan_in')
#
# self.cnn_layers = nn.Sequential(
# # init_(nn.Conv2d(4, 32, kernel_size=8, stride=4)),
# nn.Conv2d(4, 32, kernel_size=8, stride=4),
# nn.ReLU(),
# # init_(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
# nn.Conv2d(32, 64, kernel_size=4, stride=2),
# nn.ReLU(),
# # init_(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
# nn.Conv2d(64, 64, kernel_size=3, stride=1),
# nn.ReLU()
# )
#
# self.fc_input = self.calculate_linear_input()
#
# self.linear_layers = nn.Sequential(
# # init_(nn.Linear(in_features=self.fc_input, out_features=512)),
# nn.Linear(in_features=self.fc_input, out_features=512),
# nn.ReLU(),
# # init2_(nn.Linear(in_features=512, out_features=self.num_of_actions * self.num_of_atoms))
# nn.Linear(in_features=512, out_features=self.num_of_actions * self.num_of_atoms)
# )
#
# # self.optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
# self.optimizer = optim.RMSprop(self.parameters(), lr=self.learning_rate)
# self.to(DEVICE)
#
# def calculate_linear_input(self, *conv_layers):
# '''
# returns size of input for fc layers
# '''
# fc_input = self.cnn_layers(torch.zeros(1, *INPUT_SHAPE))
# return int(np.prod(fc_input.size()))
#
# def forward(self, state):
# '''
# returns distribution
# '''
# state = self.cnn_layers(state)
# state = state.view(state.size(0), -1)
# state = self.linear_layers(state)
# y = state.view(-1, self.num_of_actions, self.num_of_atoms)
# y = F.log_softmax(y, dim=2).exp()
# y = y.clamp(min=0.0001)
#
# return y
#
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
@@ -88,6 +255,7 @@ import numpy as np
from utils.constant import *
# def init(module, weight_init, bias_init, gain=1, mode=None, nonlinearity='relu'):
# if mode is not None:
# weight_init(module.weight.data, mode=mode, nonlinearity=nonlinearity)
@@ -108,6 +276,11 @@ class Qnetwork(nn.Module):
self.num_of_atoms = NUM_OF_ATOMS
self.batch_size = BATCH_SIZE
self.v_max = V_MAX
self.v_min = V_MIN
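# fixed support of atom values z_i, evenly spaced over [V_MIN, V_MAX]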
self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).to(DEVICE)
# init_ = lambda m: init(m,
# nn.init.kaiming_uniform_,
# lambda x: nn.init.constant_(x, 0),
@@ -129,7 +302,7 @@ class Qnetwork(nn.Module):
# init_(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
)
self.fc_input = self.calculate_linear_input()
@@ -152,10 +325,7 @@ class Qnetwork(nn.Module):
fc_input = self.cnn_layers(torch.zeros(1, *INPUT_SHAPE))
return int(np.prod(fc_input.size()))
def forward(self, state):
'''
returns distribution
'''
def dist(self, state):
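# returns the per-action categorical distribution over the support atoms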
state = self.cnn_layers(state)
state = state.view(state.size(0), -1)
state = self.linear_layers(state)
@@ -164,3 +334,12 @@ class Qnetwork(nn.Module):
y = y.clamp(min=0.0001)
return y
def forward(self, state):
'''
returns Q-values: the expected value of the categorical distribution over the support
'''
dist = self.dist(state)
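# Q(s, a) = sum_i p_i(s, a) * z_i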
q = torch.sum(dist * self.support, dim=2)
return q