#!/usr/bin/env python3
# -*- coding: utf-8 -*-
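"""Cognitive agent example for ns3-gym: a one-hidden-layer Keras network
trained with epsilon-greedy one-step Q-learning on the 'ns3-v0' environment,
followed by a plot of steps and reward per episode.

NOTE: written against TensorFlow 1.x (tf.train.AdamOptimizer); it will not
run unmodified on TensorFlow 2.x.
"""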
import gym
import tensorflow as tf
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from tensorflow import keras
from ns3gym import ns3env
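
# Importing ns3env registers 'ns3-v0' with Gym; gym.make() then attaches to a
# running ns-3 simulation (the ns-3 side must be started separately).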
env = gym.make('ns3-v0')
ob_space = env.observation_space
ac_space = env.action_space
print("Observation space: ", ob_space, ob_space.dtype)
print("Action space: ", ac_space, ac_space.n)
s_size = ob_space.shape[0]
a_size = ac_space.n
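
# One hidden layer of s_size ReLU units, softmax output over the a_size actions.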
model = keras.Sequential()
model.add(keras.layers.Dense(s_size, input_shape=(s_size,), activation='relu'))
model.add(keras.layers.Dense(a_size, activation='softmax'))
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
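
# Training hyperparameters: episode budget, per-episode step cap, and the
# epsilon-greedy exploration schedule (epsilon decays per step toward epsilon_min).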
total_episodes = 200
max_env_steps = 100
env._max_episode_steps = max_env_steps
epsilon = 1.0 # exploration rate
epsilon_min = 0.01
epsilon_decay = 0.999
time_history = []
rew_history = []
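
# Main loop: act epsilon-greedily, step the simulation, and fit the network
# toward the one-step Q-learning target after every transition.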
for e in range(total_episodes):
    state = env.reset()
    state = np.reshape(state, [1, s_size])
    rewardsum = 0
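    # Step the environment until it reports done or the step cap is hit.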
    for time in range(max_env_steps):

        # Choose action
        if np.random.rand(1) < epsilon:
            action = np.random.randint(a_size)
        else:
            action = np.argmax(model.predict(state)[0])

        # Step
        next_state, reward, done, _ = env.step(action)

        if done:
            print("episode: {}/{}, time: {}, rew: {}, eps: {:.2}"
                  .format(e, total_episodes, time, rewardsum, epsilon))
            break

        next_state = np.reshape(next_state, [1, s_size])
        # Train: regress Q(state, action) toward the one-step target
        # reward + gamma * max_a' Q(next_state, a'), with discount gamma = 0.95.
        target = reward
        if not done:
            target = (reward + 0.95 * np.amax(model.predict(next_state)[0]))

        target_f = model.predict(state)
        target_f[0][action] = target
        model.fit(state, target_f, epochs=1, verbose=0)

        state = next_state
        rewardsum += reward
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

    time_history.append(time)
    rew_history.append(rewardsum)
# Debug helpers: enumerate all binary states and print the model's predictions,
# or dump the model configuration and weights.
#for n in range(2 ** s_size):
#    state = [n >> i & 1 for i in range(0, 2)]
#    state = np.reshape(state, [1, s_size])
#    print("state " + str(state)
#          + " -> prediction " + str(model.predict(state)[0]))
#print(model.get_config())
#print(model.to_json())
#print(model.get_weights())
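
# Release the environment before plotting; close() is the standard Gym
# cleanup call (assumed supported by ns3env).
env.close()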
print("Plot Learning Performance")
mpl.rcdefaults()
mpl.rcParams.update({'font.size': 16})
fig, ax = plt.subplots(figsize=(10,4))
plt.grid(True, linestyle='--')
plt.title('Learning Performance')
plt.plot(range(len(time_history)), time_history, label='Steps', marker="^", linestyle=":")#, color='red')
plt.plot(range(len(rew_history)), rew_history, label='Reward', marker="", linestyle="-")#, color='k')
plt.xlabel('Episode')
plt.ylabel('Steps / Reward')
plt.legend(prop={'size': 12})
plt.savefig('learning.pdf', bbox_inches='tight')
plt.show()