Eduard Pizur / dp 2020 / Commits

Commit 0837f084 authored Apr 21, 2021 by Eduard Pizur

minor changes

parent 54a973ab

Changes 5
agents/distributional_deep_q_network/agent.py
...
...
@@ -298,80 +298,52 @@ class Agent(AtariAgent):
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
                action = self.network.forward(state)
                action = action * self.support
                action = action.sum(dim=2).max(1)
                action = action[1].view(1, 1)
                action = self.network.forward(state).argmax()
                action = action.detach().cpu().numpy()
        else:
            action = random.randrange(self.num_of_actions)
            action = torch.tensor([[action]], dtype=torch.int16).to(DEVICE)

        return action
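Both variants above pick the greedy action from the categorical output: the older lines weight each atom by its support value and reduce over atoms by hand, while the newer line relies on forward() already returning Q-values. A minimal sketch of that reduction, with hypothetical names rather than this repository's exact API:

import torch

def greedy_action(dist: torch.Tensor, support: torch.Tensor) -> torch.Tensor:
    # dist: [batch, n_actions, n_atoms] probabilities, support: [n_atoms] atom values
    q_values = (dist * support).sum(dim=2)   # expected return per action
    return q_values.argmax(dim=1)            # greedy action per batch element

# usage sketch: dist = network.dist(state); action = greedy_action(dist, network.support)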
    def project_distribution(self, states, actions, next_states, rewards, dones):
        def get_action_argmax_next_Q_sa(next_states):
            next_dist = self.target_network.forward(next_states) * self.support
            next_Q_sa = next_dist.sum(dim=2).max(1)[1]
            next_Q_sa = next_Q_sa.view(next_states.size(0), 1, 1)
            next_Q_sa = next_Q_sa.expand(-1, -1, self.num_of_atoms)
            return next_Q_sa
    def proj_dist(self, next_state, reward, done):
        with torch.no_grad():
            dones = dones.to(torch.bool)
            max_next_action = get_action_argmax_next_Q_sa(next_states)
            max_next_dist = self.target_network.forward(next_states)
            max_next_dist = max_next_dist.gather(1, max_next_action)
            max_next_dist = max_next_dist.squeeze()
            max_next_dist[dones] = 1.0 / self.num_of_atoms

            next_action = self.network.forward(next_state).argmax(1)
            next_dist = self.target_network.dist(next_state)

            dones = dones.type(torch.uint8)
            next_dist = next_dist[range(self.batch_size), next_action]

            T_z = rewards.view(-1, 1) + self.gamma * self.support.view(1, -1) * (1 - dones).view(-1, 1)
            T_z = T_z.clamp(min=self.v_min, max=self.v_max)
            # relative position of possible values
            T_z = reward + (1 - done) * self.gamma * self.support
            T_z = T_z.clamp(min=self.v_min, max=self.v_max)
            b = (T_z - self.v_min) / self.delta_z

            # lower and upper bound of relative positions
            l = b.floor().to(torch.int64)
            u = b.ceil().to(torch.int64)

            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.num_of_atoms - 1)) * (l == u)] += 1

            l = b.floor().long()
            u = b.ceil().long()
            # offset
            offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int16)
            offset = offset.unsqueeze(1)
            offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)

            # distribute and adjust the atom values
            m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
            m.view(-1).index_add_(0, (l + offset).view(-1), \
                                  (max_next_dist * (u.float() - b)).view(-1))
            m.view(-1).index_add_(0, (u + offset).view(-1), \
                                  (max_next_dist * (b - l.float())).view(-1))

            return m
        proj_dist = torch.zeros(next_dist.size()).to(DEVICE)
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1), \
                                      (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1), \
                                      (next_dist * (b - l.float())).view(-1))

        return proj_dist
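Both implementations above perform the C51 categorical projection: every support atom is pushed through the Bellman update r + gamma * (1 - done) * z, clamped to [v_min, v_max], and its probability mass is split between the two nearest atoms of the fixed support. A minimal self-contained sketch of that step, with hypothetical argument names rather than the repository's exact signature:

import torch

def categorical_projection(next_dist, rewards, dones, support, gamma, v_min, v_max):
    # next_dist: [batch, n_atoms] distribution of the greedy next action
    # rewards, dones: [batch]; support: [n_atoms] atom values
    batch_size, n_atoms = next_dist.shape
    delta_z = (v_max - v_min) / (n_atoms - 1)

    # Bellman-update every atom and clip it back into [v_min, v_max]
    T_z = rewards.unsqueeze(1) + gamma * support.unsqueeze(0) * (1 - dones.float().unsqueeze(1))
    T_z = T_z.clamp(min=v_min, max=v_max)

    # fractional position of each updated atom on the fixed support
    b = (T_z - v_min) / delta_z
    l, u = b.floor().long(), b.ceil().long()
    # (when b lands exactly on an atom, l == u and the code above shifts one of them
    #  so the probability mass is not dropped; omitted here for brevity)

    # flatten batch rows so a single index_add_ call scatters the whole batch
    offset = (torch.arange(batch_size) * n_atoms).unsqueeze(1).expand(batch_size, n_atoms)

    proj = torch.zeros_like(next_dist)
    proj.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
    proj.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))
    return proj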
    def train(self):
        '''
        pome
        '''
        states, actions, next_states, rewards, dones = self.extract_batch_of_memory()

        actions = actions.unsqueeze(1).unsqueeze(1).expand(-1, -1, self.num_of_atoms)
        rewards = rewards.view(-1, 1, 1)
        rewards = rewards.reshape(-1, 1)
        dones = dones.reshape(-1, 1)
        dones = dones.to(torch.int8)

        proj_dist = self.proj_dist(next_states, rewards, dones)

        y = self.network.forward(states)
        curr_dist = self.network.dist(states)
        log_p = torch.log(curr_dist[range(self.batch_size), actions])

        curr_dist = y.gather(1, actions).squeeze()
        target_prob = self.project_distribution(states, actions, next_states, rewards, dones)

        loss = -(target_prob * curr_dist.log()).sum(-1)
        loss = -(proj_dist * log_p).sum(1)
        loss = torch.mean(loss)
        # save the loss to tensorboard
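The final loss is the cross-entropy between the projected target distribution (treated as a constant) and the log-probabilities the online network assigns to the taken actions. A compact sketch of the same computation, with assumed tensor shapes and hypothetical names:

import torch

def c51_loss(pred_dist, target_dist):
    # pred_dist, target_dist: [batch, n_atoms]; the target is not backpropagated through
    log_p = torch.log(pred_dist.clamp(min=1e-6))          # avoid log(0)
    return -(target_dist.detach() * log_p).sum(dim=1).mean()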
...
...
@@ -380,6 +352,84 @@ class Agent(AtariAgent):
        # OPTIMIZE
        self.network.optimizer.zero_grad()
        loss.backward()

        # clip_grad_norm_(self.network.parameters(), 10.0)
        for param in self.network.parameters():
            param.grad.data.clamp_(-1, 1)

        self.network.optimizer.step()
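The loop above clamps every gradient element to [-1, 1]; the commented-out line suggests clipping the global gradient norm instead. A sketch of that alternative using torch.nn.utils.clip_grad_norm_, with the max-norm value taken from the commented line and a network.optimizer attribute assumed as in this repository:

from torch.nn.utils import clip_grad_norm_

def optimize_with_norm_clipping(network, loss, max_norm=10.0):
    # same optimization step, but rescaling the whole gradient vector
    # when its L2 norm exceeds max_norm instead of clamping each element
    network.optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(network.parameters(), max_norm)
    network.optimizer.step()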
# def project_distribution(self, states, actions, next_states, rewards, dones):
# def get_action_argmax_next_Q_sa(next_states):
# next_dist = self.target_network.forward(next_states) * self.support
# next_Q_sa = next_dist.sum(dim=2).max(1)[1]
# next_Q_sa = next_Q_sa.view(next_states.size(0),1,1)
# next_Q_sa = next_Q_sa.expand(-1,-1, self.num_of_atoms)
#
# return next_Q_sa
#
# with torch.no_grad():
# dones = dones.to(torch.bool)
#
# max_next_action = get_action_argmax_next_Q_sa(next_states)
# max_next_dist = self.target_network.forward(next_states)
# max_next_dist = max_next_dist.gather(1, max_next_action)
# max_next_dist = max_next_dist.squeeze()
# max_next_dist[dones] = 1.0 / self.num_of_atoms
#
# dones = dones.type(torch.uint8)
#
# T_z = rewards.view(-1, 1) + self.gamma * self.support.view(1,-1) * (1-dones).view(-1,1)
# T_z = T_z.clamp(min = self.v_min, max = self.v_max)
#
# # relative position of possible values
# b = (T_z - self.v_min) / self.delta_z
#
# # lower and upper bound of relative positions
# l = b.floor().to(torch.int64)
# u = b.ceil().to(torch.int64)
#
# l[(u>0) * (l==u)] -= 1
# u[(l<(self.num_of_atoms-1)) * (l==u)] += 1
#
# # offset
# offset = torch.linspace(0, (self.batch_size - 1) * self.num_of_atoms, self.batch_size, dtype=torch.int16)
# offset = offset.unsqueeze(1)
# offset = offset.expand(self.batch_size, self.num_of_atoms).to(DEVICE)
#
# # distribute and adjust the atom values
# m = torch.zeros((self.batch_size, self.num_of_atoms), dtype=torch.float).to(DEVICE)
# m.view(-1).index_add_(0, (l + offset).view(-1),\
# (max_next_dist * (u.float() - b)).view(-1))
# m.view(-1).index_add_(0, (u + offset).view(-1), \
# (max_next_dist * (b - l.float())).view(-1))
#
# return m
#
# def train(self):
# '''
# pome
# '''
# states, actions, next_states, rewards, dones = self.extract_batch_of_memory()
#
# actions = actions.unsqueeze(1).unsqueeze(1).expand(-1,-1,self.num_of_atoms)
# rewards = rewards.view(-1,1,1)
# dones = dones.to(torch.int8)
#
#
# y = self.network.forward(states)
#
# curr_dist = y.gather(1, actions).squeeze()
# target_prob = self.project_distribution(states, actions, next_states, rewards, dones)
#
# loss = -(target_prob * curr_dist.log()).sum(-1)
# loss = torch.mean(loss)
#
# # save the loss to tensorboard
# self.append_loss(loss)
#
# # OPTIMIZE
# self.network.optimizer.zero_grad()
# loss.backward()
# for param in self.network.parameters():
# param.grad.data.clamp_(-1, 1)
# self.network.optimizer.step()
main.py
...
...
@@ -59,7 +59,7 @@ def main(short_name, full_name, agent, per):
    using_per = "using_RM"

    run_name = "runs/{}/{}/{}".format(full_name, using_per,
    run_name = "runs_f/{}/{}/{}".format(full_name, using_per,
                                        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))

    writer = SummaryWriter(run_name)
...
...
@@ -155,26 +155,25 @@ def main(short_name, full_name, agent, per):
if __name__ == "__main__":
    # method and PER
    combinations = [
        # (1, False), #DQN RM
        (1, False), #DQN RM
        # (2, True), #DDQN Priority
        # (2, False), # DDQN RM
        # (4, False), #Dueling Double DQN RM
        # (5, False), #Noisy DQN RM
        # (9, False), # C51 DQN RM
        # (10, True), #rainbow
        (10, False) # rainbow
        # (10, False) # rainbow
    ]
    network = {
        1 : {
            "full_name": "_deep_q_network", "short_name": "_DQN",
            "full_name": "_ddeep_q_network", "short_name": "__DQN",
            "agent": DQNAgent},
        2 : {
            "full_name": "_double_deep_q_network", "short_name": "_DDQN",
            "full_name": "_ddouble_deep_q_network", "short_name": "__DDQN",
            "agent": DDQNAgent},
        4 : {
...
...
@@ -183,8 +182,8 @@ if __name__ == "__main__":
"agent"
:
D3QNAgent
},
5
:
{
"full_name"
:
"_noisy_deep_q_network"
,
"short_name"
:
"_Noisy_DQN"
,
"full_name"
:
"_
n
noisy_deep_q_network"
,
"short_name"
:
"_
N
Noisy_DQN"
,
"agent"
:
Noisy_DQNAgent
},
8
:
{
...
...
@@ -193,8 +192,8 @@ if __name__ == "__main__":
"agent"
:
N_Step_DQNAgent
},
9
:
{
"full_name"
:
"__C51_deep_q_network"
,
"short_name"
:
"_C51_DQN"
,
"full_name"
:
"__
C
C51_deep_q_network"
,
"short_name"
:
"_
C
C51_DQN"
,
"agent"
:
C51_DQNAgent
},
10
:
{
...
...
runs_f/_rainbow_deep_q_network/using_PER/2021-04-20_17-42/2021-04-18_15-45/events.out.tfevents.1618753542.Hermes.82084.0
0 → 100644
File added
utils/constant.py
...
...
@@ -28,7 +28,9 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# prioritized replay experience
PRIORITIZED_EPSILON = 0.001

ALPHA = 0.6 # how strongly to prioritize
# ALPHA = 0.2
BETA = 0.4 #
# BETA = 0.6
# BETA_INCREMENT = 0.001
BETA_INC_STEPS = 1_000_000
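BETA is the importance-sampling exponent of prioritized replay, and BETA_INC_STEPS suggests it is annealed towards 1.0 over training. The schedule itself is not shown in this diff; a common implementation looks like the following sketch, with hypothetical names:

def annealed_beta(step, beta_start=0.4, beta_inc_steps=1_000_000):
    # linearly anneal beta from beta_start to 1.0 over beta_inc_steps steps
    return min(1.0, beta_start + step * (1.0 - beta_start) / beta_inc_steps)

# e.g. importance weights: w = (N * sampling_probabilities) ** (-annealed_beta(step))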
# Noisy Linear
...
...
utils/nets/conv_nets/categorical_q_network.py
# # import torch
# # import torch.nn as nn
# # import torch.nn.functional as F
# # import torch.optim as optim
# # import numpy as np
# #
# # from utils.constant import *
# #
# # class Qnetwork(nn.Module):
# #
# # def __init__(self):
# # super(Qnetwork, self).__init__()
# # '''
# # CNN model for predicting Q values
# # '''
# # # self.learning_rate = LEARNING_RATE
# # self.learning_rate = 0.00005
# #
# # self.num_of_actions = NUM_OF_ACTIONS
# # self.num_of_atoms = NUM_OF_ATOMS
# # self.batch_size = BATCH_SIZE
# #
# # self.v_min = V_MIN
# # self.v_max = V_MAX
# # self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).to(DEVICE)
# #
# # self.cnn_layers = nn.Sequential(
# # nn.Conv2d(4, 32, kernel_size=8, stride=4),
# # nn.ReLU(),
# # nn.Conv2d(32, 64, kernel_size=4, stride=2),
# # nn.ReLU(),
# # nn.Conv2d(64, 64, kernel_size=3, stride=1),
# # nn.ReLU()
# # )
# #
# # self.fc_input = self.calculate_linear_input()
# #
# # self.linear_layers = nn.Sequential(
# # nn.Linear(in_features=self.fc_input, out_features=512),
# # nn.ReLU(),
# # nn.Linear(in_features=512, out_features=self.num_of_actions * self.num_of_atoms)
# # )
# #
# # # self.optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
# # self.optimizer = optim.RMSprop(self.parameters(), lr=self.learning_rate)
# # self.to(DEVICE)
# #
# # def calculate_linear_input(self, *conv_layers):
# # '''
# # returns size of input for fc layers
# # '''
# # fc_input = self.cnn_layers(torch.zeros(1, *INPUT_SHAPE))
# # return int(np.prod(fc_input.size()))
# #
# # def forward(self, state):
# # '''
# # returns distribution
# # '''
# # state = self.cnn_layers(state)
# # state = state.view(state.size()[0], -1)
# # dist = self.linear_layers(state)
# # dist = dist.view(-1, self.num_of_actions, self.num_of_atoms)
# # dist = F.softmax(dist, dim=2)
# # # dist = dist.clamp(min=0.0001)
# #
# # return dist
# #
# # def dist(self, state):
# # state = self.cnn_layers(state)
# # state = state.view(state.size()[0], -1)
# # q_atoms = self.linear_layers(state).view(-1, self.num_of_actions, self.num_of_atoms)
# # dist = F.softmax(q_atoms, dim=-1)
# # dist = dist.clamp(min=1e-3)
# #
# # return dist
# #
# # def forward(self, state):
# # dist = self.dist(state)
# # q = torch.sum(dist * self.support, dim=2)
# # return q
# #
#
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# import numpy as np
#
# from utils.constant import *
#
# # def init(module, weight_init, bias_init, gain=1, mode=None, nonlinearity='relu'):
# # if mode is not None:
# # weight_init(module.weight.data, mode=mode, nonlinearity=nonlinearity)
# # else:
# # weight_init(module.weight.data, gain=gain)
# # bias_init(module.bias.data)
# # return module
#
# class Qnetwork(nn.Module):
#
# def __init__(self):
# super(Qnetwork, self).__init__()
#
# self.learning_rate = LEARNING_RATE
# # self.learning_rate = 0.00005
#
# self.num_of_actions = NUM_OF_ACTIONS
# self.num_of_atoms = NUM_OF_ATOMS
# self.batch_size = BATCH_SIZE
#
# # init_ = lambda m: init(m,
# # nn.init.kaiming_uniform_,
# # lambda x: nn.init.constant_(x, 0),
# # nonlinearity='relu',
# # mode='fan_in')
# # init2_ = lambda m: init(m,
# # nn.init.kaiming_uniform_,
# # lambda x: nn.init.constant_(x, 0),
# # nonlinearity='relu',
# # mode='fan_in')
#
# self.cnn_layers = nn.Sequential(
# # init_(nn.Conv2d(4, 32, kernel_size=8, stride=4)),
# nn.Conv2d(4, 32, kernel_size=8, stride=4),
# nn.ReLU(),
# # init_(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
# nn.Conv2d(32, 64, kernel_size=4, stride=2),
# nn.ReLU(),
# # init_(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
# nn.Conv2d(64, 64, kernel_size=3, stride=1),
# nn.ReLU()
# )
#
# self.fc_input = self.calculate_linear_input()
#
# self.linear_layers = nn.Sequential(
# # init_(nn.Linear(in_features=self.fc_input, out_features=512)),
# nn.Linear(in_features=self.fc_input, out_features=512),
# nn.ReLU(),
# # init2_(nn.Linear(in_features=512, out_features=self.num_of_actions * self.num_of_atoms))
# nn.Linear(in_features=512, out_features=self.num_of_actions * self.num_of_atoms)
# )
#
# # self.optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
# self.optimizer = optim.RMSprop(self.parameters(), lr=self.learning_rate)
# self.to(DEVICE)
#
# def calculate_linear_input(self, *conv_layers):
# '''
# returns size of input for fc layers
# '''
# fc_input = self.cnn_layers(torch.zeros(1, *INPUT_SHAPE))
# return int(np.prod(fc_input.size()))
#
# def forward(self, state):
# '''
# returns distribution
# '''
# state = self.cnn_layers(state)
# state = state.view(state.size(0), -1)
# state = self.linear_layers(state)
# y = state.view(-1, self.num_of_actions, self.num_of_atoms)
# y = F.log_softmax(y, dim=2).exp()
# y = y.clamp(min=0.0001)
#
# return y
#
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
...
...
@@ -88,6 +255,7 @@ import numpy as np
from utils.constant import *
# def init(module, weight_init, bias_init, gain=1, mode=None, nonlinearity='relu'):
# if mode is not None:
# weight_init(module.weight.data, mode=mode, nonlinearity=nonlinearity)
...
...
@@ -108,6 +276,11 @@ class Qnetwork(nn.Module):
        self.num_of_atoms = NUM_OF_ATOMS
        self.batch_size = BATCH_SIZE

        self.v_max = V_MAX
        self.v_min = V_MIN
        self.support = torch.linspace(self.v_min, self.v_max, self.num_of_atoms).to(DEVICE)
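The support tensor holds NUM_OF_ATOMS evenly spaced return values between V_MIN and V_MAX. The agent's projection also relies on a delta_z attribute whose definition is outside this diff; it is presumably the atom spacing, as in the usual C51 setup sketched below (an assumption, using the constants from utils.constant):

import torch

# presumed definitions matching the usual C51 setup
DELTA_Z = (V_MAX - V_MIN) / (NUM_OF_ATOMS - 1)         # spacing between neighbouring atoms
SUPPORT = torch.linspace(V_MIN, V_MAX, NUM_OF_ATOMS)   # atom values z_0 ... z_{N-1}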
# init_ = lambda m: init(m,
# nn.init.kaiming_uniform_,
# lambda x: nn.init.constant_(x, 0),
...
...
@@ -129,7 +302,7 @@ class Qnetwork(nn.Module):
# init_(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        )

        self.fc_input = self.calculate_linear_input()
...
...
@@ -152,10 +325,7 @@ class Qnetwork(nn.Module):
        fc_input = self.cnn_layers(torch.zeros(1, *INPUT_SHAPE))
        return int(np.prod(fc_input.size()))
    def forward(self, state):
        '''
        returns distribution
        '''
    def dist(self, state):
        state = self.cnn_layers(state)
        state = state.view(state.size(0), -1)
        state = self.linear_layers(state)
...
...
@@ -164,3 +334,12 @@ class Qnetwork(nn.Module):
        y = y.clamp(min=0.0001)

        return y
    def forward(self, state):
        '''
        returns Q-values computed from the distribution
        '''
        dist = self.dist(state)
        q = torch.sum(dist * self.support, dim=2)

        return q
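With this split, dist() returns a probability distribution over the support for every action and forward() collapses it into Q-values. A hedged usage sketch, assuming the constants INPUT_SHAPE, DEVICE, NUM_OF_ACTIONS and NUM_OF_ATOMS from utils.constant and the import path of the file shown above:

import torch
from utils.constant import *
from utils.nets.conv_nets.categorical_q_network import Qnetwork

net = Qnetwork()
state = torch.zeros(1, *INPUT_SHAPE).to(DEVICE)   # dummy stacked-frame observation
dist = net.dist(state)                            # [1, NUM_OF_ACTIONS, NUM_OF_ATOMS], rows sum to ~1
q = net.forward(state)                            # [1, NUM_OF_ACTIONS] expected returns
action = q.argmax(1)                              # greedy action index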