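# main_double.py
# Trains a PPO agent (the numpy implementation) on the two-snake Snake_Game
# environment: one shared policy drives both snakes and learns from both
# trajectories.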
import pickle

from agent.agent_numpy.policy_gradient_PPO import Agent_PPO
from agent.agent_torch.torch_ppo_agent import Agent_PPO_torch
from neural_v3.network import network
from neural_v3.activations import softmax, linear
from snake_game.double_snake import Snake_Game

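# --- Run configuration ---
# action_space: 4 discrete moves (presumably up/down/left/right)
# obs_space:    flat 42-feature observation (the commented-out (1, 5, 5)
#               shape suggests a grid observation was also tried)
# BATCH_SIZE:   environment steps between policy updates
# TARGET_SCORE: rolling-average reward at which training stops
# BUFFER:       buffer size handed to Agent_PPO
# EPI_STEPS:    step cap per episode
# INTERVAL:     number of recent episodes in the rolling reward average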
action_space = 4
obs_space = (42, )
# obs_space = (1, 5, 5)
perso = True
BATCH_SIZE = 250
TARGET_SCORE = 350
BUFFER = 10000
EPI_STEPS = 2000
INTERVAL = 10
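
# Environment and agent. auto_report is turned off; updates are instead
# triggered explicitly every BATCH_SIZE steps in the main loop below.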
env = Snake_Game()
agent = Agent_PPO(action_space, obs_space, BUFFER, use_next_state=True)
# agent = Agent_PPO_torch(action_space, obs_space, BUFFER, use_next_state=True)
agent.auto_report = False
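
# Optional hand-built networks: when perso is set, the agent's default
# actor/critic are replaced with two-hidden-layer MLPs (256 and 128 units).
# Only the actor is pickled to disk, so on a resumed run the critic is
# rebuilt from scratch.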
if perso:
    structure = [256, 128]
    LR = 0.00025
    LR_CRITIC = 0.00025
    activation = 'tanh'
    optimizer = "adam"
    try:
        with open("ppo_double_snake.pickle", 'rb') as f:
            brain = pickle.load(f)
        print("checkpoint loaded")
        baseline = network([*structure, 1], activation, learning_rate=LR_CRITIC, optimizer=optimizer)
        baseline.activ_layers[-1] = linear()
    except Exception:
        # No usable checkpoint: build both networks from scratch.
        brain = network([*structure, action_space], activation, learning_rate=LR, optimizer=optimizer)
        brain.activ_layers[-1] = softmax()
        baseline = network([*structure, 1], activation, learning_rate=LR_CRITIC, optimizer=optimizer)
        baseline.activ_layers[-1] = linear()
        # Note: relu-style weight init is used even though the hidden
        # activation is tanh.
        for a_layer, c_layer in zip(brain.layers, baseline.layers):
            a_layer.initialize_relu_weights()
            c_layer.initialize_relu_weights()
    agent.policy.model = brain
    agent.policy.baseline = baseline
    agent.policy.advantage_func.baseline = baseline
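
# One environment step for both snakes. The agent picks snake 1's action via
# observe() (which presumably also records the state/action pair), while
# custom_policy_action() queries the same policy for snake 2 without
# recording. After the step, snake 2's transition is stored by re-observing
# state_2 and overwriting the recorded action, so the single agent learns
# from both snakes' experience.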
def train_loop():
    global state, sum_reward
    state_1, state_2 = state
    action_1 = agent.observe(state_1)
    action_2 = agent.custom_policy_action(state_2)
    next_state, reward, done = env.step([action_1, action_2])
    nx_state_1, nx_state_2 = next_state
    sum_reward += reward
    if done:
        # Split the terminal reward between the two stored transitions.
        agent.observe(next_state=nx_state_1, reward=reward / 2, is_done=True, store=True)
        agent.observe(state_2)
        agent.actual_sequence[1] = action_2  # overwrite with the action snake 2 actually took
        agent.observe(next_state=nx_state_2, reward=reward / 2, is_done=True, store=True)
        return True
    # Non-terminal step: snake 1 keeps the full reward, snake 2 gets half.
    agent.observe(next_state=nx_state_1, reward=reward, is_done=False, store=True)
    agent.observe(state_2)
    agent.actual_sequence[1] = action_2
    agent.observe(next_state=nx_state_2, reward=reward / 2, is_done=False, store=True)
    state = next_state
    return False
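
# Training bookkeeping: index counts finished episodes, global_step drives
# the BATCH_SIZE update schedule, and avg_reward_record gates checkpointing
# on a new best rolling-average reward.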
going = True
index = 0
global_step = 1
reward_list = []
record = 0
avg_reward_record = 0
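
# Main loop: episodes of at most EPI_STEPS steps, a policy update via
# agent.report() every BATCH_SIZE steps, and a stop once the rolling
# average reaches TARGET_SCORE.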
while going:
    state = env.reset()
    sum_reward = 0
    for _ in range(EPI_STEPS):
        terminated = train_loop()
        global_step += 1
        if global_step % BATCH_SIZE == 0:
            agent.report()
            agent.clear_memory()
        if terminated:
            index += 1
            sum_reward = round(sum_reward, 2)
            reward_list.append(sum_reward)
            avg_reward = sum(reward_list[-INTERVAL:]) / len(reward_list[-INTERVAL:])
            avg_reward = round(avg_reward, 2)
            game_score = env.score
            if game_score > record:
                record = game_score
            if avg_reward > avg_reward_record:
                avg_reward_record = avg_reward
                agent.save_model("ppo_double_snake")
            print(f"episode: {index}\t score: {game_score}\t record: {record}\t avg reward: {avg_reward}")
            if avg_reward >= TARGET_SCORE:
                print(f"reached satisfactory results at episode {index}")
                going = False
            break
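
# A minimal evaluation sketch (an assumption, not part of the original file):
# it reuses only calls that appear above -- pickle-loading the actor,
# attaching it to the agent, and stepping the env with observe() /
# custom_policy_action(). Whether observe() can be used for pure action
# selection without polluting the buffer depends on the Agent_PPO internals,
# so treat this as illustrative and kept commented out.
#
# with open("ppo_double_snake.pickle", 'rb') as f:
#     agent.policy.model = pickle.load(f)
# state = env.reset()
# done = False
# while not done:
#     s1, s2 = state
#     a1 = agent.observe(s1)
#     a2 = agent.custom_policy_action(s2)
#     state, reward, done = env.step([a1, a2])
# print("final score:", env.score)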