Commit b5ea4935 authored by Eduard Pizur

Update dqn_agent.py

parent 9866f13a
import random
import numpy as np
import torch as T
from deep_q_network import DQN
from replay_memory import ReplayMemory
class Agent():
    '''
    DQN agent for Atari games
    '''
    def __init__(self, epsilon_start, num_of_actions, learning_rate, device, input_shape,
                 epsilon_dec, epsilon_min, memory_size, discount_factor, batch_size):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.device = device
        self.input_shape = input_shape
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.epsilon = epsilon_start
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_min

        self.num_of_actions = num_of_actions
        self.actions = [i for i in range(self.num_of_actions)]

        self.memory = ReplayMemory(max_len=self.memory_size, batch_size=self.batch_size)

        # online network used for action selection and optimization
        self.network = DQN(lr=self.learning_rate,
                           num_of_actions=self.num_of_actions,
                           device=self.device,
                           input_shape=input_shape)

        # target network used to compute the bootstrapped Q targets
        self.target_network = DQN(lr=self.learning_rate,
                                  num_of_actions=self.num_of_actions,
                                  device=self.device,
                                  input_shape=input_shape)
    def choose_action(self, observation):
        '''
        choose an action with the epsilon-greedy strategy
        '''
        if random.random() > self.epsilon:
            # exploit: pick the action with the highest predicted Q value
            observation = T.tensor(np.array([observation]), dtype=T.float32).to(self.device)
            with T.no_grad():
                q_values = self.network.forward(observation)
            action = T.argmax(q_values).item()
        else:
            # explore: pick a random action
            action = random.choice(self.actions)

        return action
    def decaying_epsilon(self):
        '''
        multiply epsilon by the decay factor, but never let it fall below the minimum
        '''
        self.epsilon = max(self.epsilon * self.eps_dec, self.eps_min)
    def replace_weights(self):
        '''
        copy the weights of the online network into the target network
        '''
        self.target_network.load_state_dict(self.network.state_dict())
    def is_train_process_possible(self):
        '''
        return True once the replay memory holds at least one full batch of experiences
        '''
        return len(self.memory) >= self.batch_size
    def sample_memory(self):
        '''
        return a batch of experiences converted to tensors on the training device
        '''
        experiences = self.memory.sample()
        observations, actions, rewards, new_observations, dones = self.extract_experiences(experiences)

        observations = T.tensor(np.array(observations), dtype=T.float32).to(self.device)
        actions = T.tensor(actions, dtype=T.int64).to(self.device)
        rewards = T.tensor(rewards, dtype=T.float32).to(self.device)
        next_observations = T.tensor(np.array(new_observations), dtype=T.float32).to(self.device)
        dones = T.tensor(dones, dtype=T.bool).to(self.device)

        return observations, actions, rewards, next_observations, dones
    def extract_experiences(self, experiences):
        '''
        extract observation, action, new_observation, reward and done flag from each experience
        '''
        observations, actions, new_observations, rewards, dones = [], [], [], [], []

        for experience in experiences:
            observations.append(experience.state)
            actions.append(experience.action)
            new_observations.append(experience.next_state)
            rewards.append(experience.reward)
            dones.append(experience.dones)

        return observations, actions, rewards, new_observations, dones
    def train(self):
        '''
        train the agent's online network on one batch sampled from the replay memory
        '''
        self.network.optimizer.zero_grad()

        observations, actions, rewards, new_observations, dones = self.sample_memory()

        # Q(s, a) of the actions that were actually taken, kept on the autograd graph
        batch_index = T.arange(self.batch_size, device=self.device)
        q_values = self.network.forward(observations)[batch_index, actions]

        # bootstrapped target: r + gamma * max_a' Q_target(s', a'), zero for terminal states
        with T.no_grad():
            q_next = self.target_network.forward(new_observations).max(dim=1)[0]
            q_next[dones] = 0.0
            q_target = rewards + self.discount_factor * q_next

        # optimize the online network
        loss = self.network.loss(q_values, q_target)
        loss.backward()
        self.network.optimizer.step()

        # decay epsilon
        self.decaying_epsilon()
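
For context, here is a minimal sketch of how this Agent might be driven from a training script. It assumes a Gym-style Atari wrapper; the make_env helper, the memory.append signature, and all hyperparameter values below are illustrative assumptions, not code from this repository.

# Illustrative training-loop sketch; make_env, memory.append and the hyperparameters
# below are assumptions, not part of this repository.
import torch as T
# from dqn_agent import Agent  # when run as a separate script

device = T.device('cuda' if T.cuda.is_available() else 'cpu')
env = make_env('PongNoFrameskip-v4')  # hypothetical wrapper returning stacked 84x84 frames

agent = Agent(epsilon_start=1.0, num_of_actions=env.action_space.n,
              learning_rate=0.0001, device=device, input_shape=(4, 84, 84),
              epsilon_dec=0.999995, epsilon_min=0.1, memory_size=50000,
              discount_factor=0.99, batch_size=32)

step = 0
for episode in range(500):
    observation = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)
        new_observation, reward, done, info = env.step(action)
        # assumed ReplayMemory signature; adjust to the actual implementation
        agent.memory.append(observation, action, new_observation, reward, done)
        if agent.is_train_process_possible():
            agent.train()
        if step % 1000 == 0:
            agent.replace_weights()  # periodically sync the target network
        observation = new_observation
        step += 1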