-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathplatformer.py
More file actions
215 lines (171 loc) · 6.15 KB
/
platformer.py
File metadata and controls
215 lines (171 loc) · 6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.registration import register
import numpy as np
import matplotlib.pyplot as plt
class PlatformerEnv(gym.Env):
    """
    Custom 1D platformer environment.

    The agent starts at position 0 and has to reach the goal at the
    rightmost position (``length - 1``) while avoiding pits. Landing on a
    pit or on the goal terminates the episode.

    Actions:
        0: Left        (one cell to the left, clamped at 0)
        1: Right       (one cell to the right, clamped at ``length - 1``)
        2: Jump Right  (two cells to the right, clamped at ``length - 1``)

    Args:
        length: Number of cells in the corridor.
        walk_cost: Reward for a Left/Right move that lands on a normal cell.
        jump_cost: Reward for a Jump Right that lands on a normal cell.
        pit_penalty: Reward for landing on a pit.
        goal_reward: Reward for landing on the goal.
        pits: Iterable of pit positions. Defaults to ``[4, 8, 12]``.
    """

    # Bar colour and legend label for each action index, used by ``render``.
    _ACTION_COLORS = ["blue", "purple", "gold"]
    _ACTION_LABELS = ["← Left", "→ Right", "⇑ Jump"]

    @staticmethod
    def get_actions():
        """Return a mapping from action index to a human-readable name."""
        return {
            0: "Left",
            1: "Right",
            2: "Jump Right",
        }

    def __init__(self,
                 length=15,
                 walk_cost=-1.0,
                 jump_cost=-3,
                 pit_penalty=-100.0,
                 goal_reward=50.0,
                 pits=None):
        super().__init__()
        self.length = length
        self.walk_cost = walk_cost
        self.jump_cost = jump_cost
        self.pit_penalty = pit_penalty
        self.goal_reward = goal_reward

        # Action space -> 0: Left, 1: Right, 2: Jump Right (moves 2 spaces)
        self.action_space = spaces.Discrete(3)
        # Observation space -> agent's position, an integer from 0 to length-1
        self.observation_space = spaces.Discrete(self.length)

        # Pit locations. A fresh list is built per instance so that the
        # caller's iterable is never shared or mutated.
        self.pits = [4, 8, 12] if pits is None else list(pits)

        # reset() initialises agent_pos, goal_pos and steps.
        self.reset()

    def get_empty_policy(self):
        """Return an all-zero policy array of shape ``(nS, nA)``."""
        return np.zeros((self.observation_space.n, self.action_space.n))

    @property
    def nS(self):
        """Number of states (size of the observation space)."""
        return self.observation_space.n

    @property
    def nA(self):
        """Number of actions (size of the action space)."""
        return self.action_space.n

    def _transition(self, state, action):
        """Return ``(next_state, reward, terminated)`` for ``action`` taken in ``state``.

        Shared by ``P`` and ``step`` so the tabular model and the simulated
        dynamics cannot drift apart.

        Raises:
            ValueError: If ``action`` is not 0, 1 or 2.
        """
        last = self.length - 1
        if action == 0:  # Left
            next_state = max(0, state - 1)
            reward = self.walk_cost
        elif action == 1:  # Right
            next_state = min(last, state + 1)
            reward = self.walk_cost
        elif action == 2:  # Jump Right
            next_state = min(last, state + 2)
            reward = self.jump_cost
        else:
            raise ValueError(
                f"Invalid action {action!r}; expected one of "
                f"{sorted(self.get_actions())}"
            )

        # Pits are checked before the goal, so a pit placed on the goal
        # cell counts as a pit.
        if next_state in self.pits:
            return next_state, self.pit_penalty, True
        if next_state == self.goal_pos:
            return next_state, self.goal_reward, True
        return next_state, reward, False

    @property
    def P(self):
        """Transition table of the environment.

        ``P[s][a]`` is a list holding a single tuple
        ``(probability, next_state, reward, terminated, info)``; every
        transition has probability 1.0 and ``info`` is an empty dict.
        Pit and goal states are absorbing: every action keeps the agent in
        place with reward 0.0 and ``terminated=True``.

        The table is rebuilt on every access, so bind it to a local name
        (``P = env.P``) before using it inside a loop.
        """
        table = {}
        for s in range(self.nS):
            absorbing = s in self.pits or s == self.goal_pos
            table[s] = {}
            for a in range(self.nA):
                if absorbing:
                    table[s][a] = [(1.0, s, 0.0, True, {})]
                else:
                    next_state, reward, terminated = self._transition(s, a)
                    table[s][a] = [(1.0, next_state, reward, terminated, {})]
        return table

    def reset(self, seed=None, options=None):
        """Put the agent back at position 0 and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.agent_pos = 0
        self.goal_pos = self.length - 1
        self.steps = 0
        return self.agent_pos, {}

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, terminated, truncated, info)``.

        ``truncated`` is always ``False`` and ``info`` is an empty dict.

        Raises:
            ValueError: If ``action`` is not 0, 1 or 2. The environment
                state is left untouched in that case.
        """
        # _transition raises before anything is assigned, so an invalid
        # action cannot leave the environment half-updated.
        self.agent_pos, reward, terminated = self._transition(self.agent_pos, action)
        self.steps += 1  # number of steps taken since the last reset()
        truncated = False
        return self.agent_pos, reward, terminated, truncated, {}

    def render(self, V=None, policy=None):
        """Plot the corridor with optional state values and action probabilities.

        Args:
            V: State-value function [nS]. Drawn as one bar per state;
                all zeros when omitted.
            policy: Policy [nS, nA], deterministic or not. Each row is
                normalised and drawn as a stacked bar on top of the state's
                value bar. Rows that sum to zero are not drawn.
        """
        n_states = self.length

        # Tick labels: "_" normal cell, "U" pit, "G" goal, "A" agent.
        corridor = np.array(["_"] * n_states, dtype=object)
        for pit in self.pits:
            # Skip pits outside the corridor instead of raising IndexError.
            if 0 <= pit < n_states:
                corridor[pit] = "U"
        corridor[self.goal_pos] = "G"
        corridor[self.agent_pos] = "A"

        fig, ax = plt.subplots(figsize=(n_states * 0.7, 4))
        x = np.arange(n_states)
        values = np.zeros(n_states) if V is None else np.asarray(V)

        bars = ax.bar(
            x,
            values,
            color="lightgray",
            edgecolor="black",
            width=0.8,
            linewidth=2,
        )
        for i, bar in enumerate(bars):
            if i in self.pits:
                bar.set_color("red")
            elif i == self.goal_pos:
                bar.set_color("green")
            elif i == self.agent_pos:
                bar.set_color("orange")

        if policy is not None:
            for i in range(n_states):
                if i in self.pits or i == self.goal_pos:
                    continue
                # asarray lets plain nested lists be used as a policy too.
                probs = np.asarray(policy[i], dtype=float)
                total = probs.sum()
                if total == 0:
                    # An all-zero row (e.g. from get_empty_policy) cannot be
                    # normalised; dividing would only produce NaN heights.
                    continue
                probs = probs / total

                bottom = values[i]
                for a in range(self.nA):
                    height = probs[a] * 2.0
                    ax.bar(
                        i,
                        height,
                        bottom=bottom,
                        width=0.5,
                        color=self._ACTION_COLORS[a],
                        alpha=0.8,
                    )
                    bottom += height

        ax.set_xticks(x)
        ax.set_xticklabels(corridor, fontsize=12)
        ax.set_ylabel("State Value")
        ax.set_title("Platformer: Value Function + Action Probabilities")
        ax.axhline(0, linewidth=2)
        ax.set_ylim(values.min() - 5, values.max() + 6)

        if policy is not None:
            from matplotlib.patches import Patch
            legend_elements = [
                Patch(facecolor=color, label=label)
                for color, label in zip(self._ACTION_COLORS, self._ACTION_LABELS)
            ]
            ax.legend(handles=legend_elements, loc="upper left")

        plt.tight_layout()
        plt.show()
# Register the environment under the id "Platformer-v0"; the entry point
# names the `platformer` module and its `PlatformerEnv` class.
register(id="Platformer-v0", entry_point="platformer:PlatformerEnv")