dqn.py
"""Defines the architecture of a deep Q-network.
Heavily influenced by DeepMind's seminal paper 'Playing Atari with Deep Reinforcement Learning'
(Mnih et al., 2013) and 'Human-level control through deep reinforcement learning' (Mnih et al.,
2015).
"""
import math
import tensorflow as tf
def _fully_connected_layer(x, shape, bias_shape, activation_fn):
if len(shape) != 2:
raise ValueError('Shape "{}" is invalid. Must have length 2.'.format(shape))
maxval = 1 / math.sqrt(shape[0] + shape[1])
W = tf.Variable(tf.random_uniform(shape, -maxval, maxval), name='Weights')
b = tf.Variable(tf.constant(0.1, tf.float32, bias_shape), name='Bias')
return activation_fn(tf.matmul(x, W) + b)
class DeepQNetwork():
    """A neural network that learns the Q (action value) function."""

    def __init__(self, num_features, num_hidden_units, num_actions):
        """Creates a deep Q-network that is implemented using a single hidden layer.

        Args:
            num_features: Number of features in the input vector.
            num_hidden_units: Number of units in the hidden layer.
            num_actions: Number of possible actions. Represents the size of the output layer.
        """
        self.x = tf.placeholder(tf.float32, [None, num_features], name='Input_States')

        with tf.name_scope('Fully_Connected_Layer_1'):
            h_fc = _fully_connected_layer(
                self.x, [num_features, num_hidden_units], [num_hidden_units], tf.nn.relu)

        with tf.name_scope('Fully_Connected_Layer_2'):
            # Use a single bias shared across all actions.
            self.Q = _fully_connected_layer(
                h_fc, [num_hidden_units, num_actions], [1], tf.identity)

        # Estimate the optimal action and its expected value.
        self.optimal_action = tf.squeeze(tf.argmax(self.Q, 1, name='Optimal_Action'))
        self.optimal_action_value = tf.squeeze(tf.reduce_max(self.Q, 1))

        # Estimate the value of the specified action.
        self.action = tf.placeholder(tf.uint8, name='Action')
        one_hot_action = tf.one_hot(self.action, num_actions)
        self.estimated_action_value = tf.reduce_sum(self.Q * one_hot_action, 1)
    def get_action_value(self, state, action):
        """Estimates the value of the specified action for the specified state.

        Args:
            state: State of the environment. Can be batched into multiple states.
            action: A valid action. Can be batched into multiple actions.
        """
        sess = tf.get_default_session()
        return sess.run(self.estimated_action_value, {self.x: state, self.action: action})

    def get_optimal_action_value(self, state):
        """Estimates the optimal action value for the specified state.

        Args:
            state: State of the environment. Can be batched into multiple states.
        """
        sess = tf.get_default_session()
        return sess.run(self.optimal_action_value, {self.x: state})

    def get_optimal_action(self, state):
        """Estimates the optimal action for the specified state.

        Args:
            state: State of the environment. Can be batched into multiple states.
        """
        sess = tf.get_default_session()
        return sess.run(self.optimal_action, {self.x: state})
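

# Minimal usage sketch (not part of the original module): builds the network and queries it
# inside a TF1 session. The state size, hidden size, and action count below are illustrative
# assumptions, not values taken from the repository.
if __name__ == '__main__':
    import numpy as np

    network = DeepQNetwork(num_features=4, num_hidden_units=16, num_actions=2)

    # `with tf.Session()` installs the session as the default session, which the get_* methods
    # rely on via tf.get_default_session().
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # A batch of two hypothetical 4-dimensional states.
        states = np.random.rand(2, 4).astype(np.float32)

        print('Optimal actions:', network.get_optimal_action(states))
        print('Optimal action values:', network.get_optimal_action_value(states))
        print('Values of actions [0, 1]:', network.get_action_value(states, [0, 1]))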