Commit 6aad48b2 authored by udoedo

frozen lake Qlearning and SARSA

parent 1b26a909
# dp 2020
diploma project
\ No newline at end of file
{
"python.pythonPath": "C:\\Users\\Eduard Pizur\\AppData\\Local\\Programs\\Python\\Python36\\python.exe"
}
\ No newline at end of file
import gym
import random
import numpy as np
import time
import pprint as pp
import matplotlib.pyplot as plt
from frozenagent import Agent
# HYPERPARAMETERS
LEARNING_RATE = .01
DISCOUNT_FACTOR = .9
EPSILON_START = 1
EPSILON_END = .1
EPSILON_DECAY_RATE = .999995
NUMBER_OF_GAMES = 400_000
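# EPSILON_DECAY_RATE is applied multiplicatively once per learning step (see
# Agent.decaying_epsilon), so epsilon drops from 1.0 to the 0.1 floor after
# roughly ln(0.1)/ln(0.999995) ~= 460,000 steps.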
def print_table(table):
    possible_actions = {
        0: 'LEFT',
        1: 'DOWN',
        2: 'RIGHT',
        3: 'UP'
    }

    k = 0
    for i in range(4):
        for j in range(4):
            action = np.argmax(table[k])
            if k not in [5, 7, 11, 12, 15]:
                print("{} ".format(possible_actions[action]), end='')
            else:
                print("---- ", end='')
            k += 1
        print()
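# The hole/goal indices skipped in print_table come from the default 4x4 FrozenLake
# map (states numbered row by row, 0-15):
#   SFFF    S = start (0)
#   FHFH    H = hole (5, 7, 11, 12)
#   FFFH    F = frozen surface
#   HFFG    G = goal (15)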
if __name__ == "__main__":
    env = gym.make("FrozenLake-v0")
    states = env.observation_space.n

    agent = Agent(lr=LEARNING_RATE,
                  gamma=DISCOUNT_FACTOR,
                  eps_start=EPSILON_START,
                  eps_end=EPSILON_END,
                  eps_dec=EPSILON_DECAY_RATE,
                  n_states=states)
    # Q-learning
    q_scores = []
    q_avg_score_list = []
    for i in range(NUMBER_OF_GAMES):
        done = False
        obs = env.reset()
        score = 0

        while not done:
            action = agent.choose_action(obs)
            next_obs, reward, done, info = env.step(action)
            agent.q_learning(obs, action, reward, next_obs)
            score += reward
            obs = next_obs

        q_scores.append(score)
        if i % 100 == 0:
            avg_score = np.mean(q_scores[-100:])
            q_avg_score_list.append(avg_score)
        if i % 10_000 == 0:
            print(i)

    plt.plot(q_avg_score_list, "r")
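    # A fresh agent for SARSA so it does not reuse the Q-learning table or the
    # already-decayed epsilon.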
    agent_sarsa = Agent(lr=LEARNING_RATE,
                        gamma=DISCOUNT_FACTOR,
                        eps_start=EPSILON_START,
                        eps_end=EPSILON_END,
                        eps_dec=EPSILON_DECAY_RATE,
                        n_states=states)
    # SARSA learning
    sarsa_scores = []
    sarsa_avg_score_list = []
    for i in range(NUMBER_OF_GAMES):
        done = False
        obs = env.reset()
        action = agent_sarsa.choose_action(obs)
        score = 0

        while not done:
            next_obs, reward, done, info = env.step(action)
            next_action = agent_sarsa.choose_action(next_obs)
            agent_sarsa.sarsa_learning(obs, action, reward, next_obs, next_action)
            score += reward
            obs = next_obs
            action = next_action

        sarsa_scores.append(score)
        if i % 100 == 0:
            avg_score = np.mean(sarsa_scores[-100:])
            sarsa_avg_score_list.append(avg_score)
        if i % 10_000 == 0:
            print(i)

    plt.plot(sarsa_avg_score_list, "g")
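    # Red curve: Q-learning, green curve: SARSA (mean score over the last 100 episodes).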
    plt.xlabel('episodes (x100)')
    plt.ylabel('average score over last 100 episodes')

    print("Q LEARNING", "-" * 30)
    print_table(agent.q_table)
    print("SARSA", "-" * 30)
    print_table(agent_sarsa.q_table)

    plt.show()
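    # A minimal sketch of evaluating the learned greedy policy, assuming the trained
    # `agent` and `env` above: take argmax actions only and report the mean return
    # over 1,000 evaluation episodes.
    eval_scores = []
    for _ in range(1_000):
        done = False
        obs = env.reset()
        score = 0
        while not done:
            obs, reward, done, info = env.step(int(np.argmax(agent.q_table[obs])))
            score += reward
        eval_scores.append(score)
    print("Q-learning greedy policy mean return:", np.mean(eval_scores))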
import numpy as np
class Agent:
    def __init__(self, lr, gamma, n_states, eps_start, eps_end, eps_dec):
        self.lr = lr
        self.gamma = gamma
        self.n_states = n_states
        self.epsilon = eps_start
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.q_table = None
        self.initialize_q_table()
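    # The Q-table is a plain dict: state index -> [Q(s, LEFT), Q(s, DOWN), Q(s, RIGHT), Q(s, UP)].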
    def initialize_q_table(self):
        self.q_table = {state: [0, 0, 0, 0] for state in range(self.n_states)}
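    # Epsilon-greedy selection: exploit the greedy action with probability (1 - epsilon),
    # otherwise pick one of the four actions uniformly at random.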
    def choose_action(self, state):
        if np.random.uniform(0, 1) < (1 - self.epsilon):
            action = np.argmax(self.q_table[state])
        else:
            action = np.random.choice([0, 1, 2, 3])
        return action
    def decaying_epsilon(self):
        # Multiplicative decay of the exploration rate, clamped at eps_min.
        self.epsilon = max(self.epsilon * self.eps_dec, self.eps_min)
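    # Off-policy TD update: Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))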
    def q_learning(self, state, action, reward, next_state):
        next_max_action = np.argmax(self.q_table[next_state])
        self.q_table[state][action] += self.lr * (reward + self.gamma * self.q_table[next_state][next_max_action]
                                                  - self.q_table[state][action])
        self.decaying_epsilon()
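    # On-policy TD update: the bootstrap target uses the action actually chosen in the
    # next state, Q(s, a) += lr * (r + gamma * Q(s', a') - Q(s, a)).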
    def sarsa_learning(self, state, action, reward, next_state, next_action):
        self.q_table[state][action] += self.lr * (reward + self.gamma * self.q_table[next_state][next_action]
                                                  - self.q_table[state][action])
        self.decaying_epsilon()
# class Agent:
#     def __init__(self, env):
#         self.env = env
#         self.state = None
#         self.q_table = None
#         self.epsilon = EPSILON
#         self.lr_rate = LR_RATE
#     def initialize_q_table(self):
#         self.q_table = {state: [0.1, 0.1, 0.1, 0.1] for state in range(self.env.observation_space.n)}
#     def choose_action(self, state):
#         if np.random.uniform(0,1) < (1-self.epsilon):
#             action = np.argmax(self.q_table[state])
#         else:
#             action = np.random.choice([0,1,2,3])
#         return action
#     def decaying_epsilon(self):
#         # if self.epsilon > 0.005:
#         self.epsilon *= DECAY_RATE_EPSILON
#     def resetAgent(self):
#         self.state = self.env.reset()
#     def learn_Qlearning(self):
#         count = 0
#         self.initialize_q_table()
#         rewards = []
#         avg_rewards = []
#         for episode in range(1, NUM_OF_EPISODES):
#             self.resetAgent()
#             done = False
#             while not done:
#                 action = self.choose_action(self.state)
#                 next_state, reward, done, info = self.env.step(action)
#                 if done == True:
#                     if reward == 0:
#                         reward = -1
#                 else:
#                     reward = STEP_PENALTY
#                 current_q_value = self.q_table[self.state][action]
#                 max_next_q_value = np.max(self.q_table[next_state])
#                 self.q_table[self.state][action] = current_q_value + self.lr_rate*(reward + DISCOUNT * max_next_q_value - current_q_value)
#                 self.state = next_state
#             if reward == 1:
#                 count += 1
#             self.decaying_epsilon()
#             rewards.append(reward)
#             avg_rewards.append(sum(rewards)/episode)
#         print(count)
#         print(self.epsilon)
#         return avg_rewards, self.q_table
#     def learn_SARSA(self):
#         self.initialize_q_table()
#         done = False
#         count = 0
#         rewards = []
#         avg_reward = []
#         for episode in range(1, NUM_OF_EPISODES):
#             self.resetAgent()
#             action = self.choose_action(self.state)
#             done = False
#             while not done:
#                 next_state, reward, done, info = self.env.step(action)
#                 next_action = self.choose_action(next_state)
#                 if done == True:
#                     if reward == 0:
#                         reward = -1
#                 else:
#                     reward = STEP_PENALTY
#                 current_q_value = self.q_table[self.state][action]
#                 next_q_value = self.q_table[next_state][next_action]
#                 self.q_table[self.state][action] = current_q_value + LR_RATE*(reward + DISCOUNT * next_q_value - current_q_value)
#                 self.state = next_state
#                 action = next_action
#             if reward == 1:
#                 count += 1
#             self.decaying_epsilon()
#             rewards.append(reward)
#             avg_reward.append(sum(rewards)/episode)
#         print(count)
#         return avg_reward, self.q_table
\ No newline at end of file