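"""Tabular Q-learning on Gymnasium's MountainCar-v0.

The continuous (position, velocity) observation is discretized into a
20 x 20 grid of buckets, and a Q-table over (position bucket, velocity
bucket, action) is updated with the standard Q-learning rule while a
single rendered episode runs.
"""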
import gymnasium as gym  # use gymnasium, the maintained successor to gym
import numpy as np

# Create the MountainCar environment
env = gym.make("MountainCar-v0", render_mode="human")

# Hyperparameters
learning_rate = 0.1     # step size for Q-updates; must lie in (0, 1] or the update diverges
discount_factor = 0.95  # weight on future rewards (gamma)
epsilon = 0.01          # exploration rate for epsilon-greedy action selection
# Discretization settings: bucket each continuous observation dimension into 20 bins
DISCRETE_OS_SIZE = [20, 20]
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Q-table of shape (20, 20, n_actions), initialized to random values in [-2, 0)
# (every step yields a reward of -1, so values near 0 are optimistic and encourage exploration)
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))
# Discretize the continuous state into a bucket index
def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # Clip so an observation at the upper bound cannot index past the table edge
    discrete_state = np.clip(discrete_state.astype(np.int32), 0, np.array(DISCRETE_OS_SIZE) - 1)
    return tuple(discrete_state)

# Reset the environment; reset() returns (observation, info), so take the first element
discrete_state = get_discrete_state(env.reset()[0])
goal_position = env.unwrapped.goal_position  # x-position that counts as reaching the flag
done = False
while not done:
    # Print the full Q-table each step for inspection (very verbose; remove for speed)
    print(q_table)

    # Exploration vs. exploitation (epsilon-greedy)
    if np.random.random() > epsilon:
        # Exploit: choose the action with the highest Q-value
        action = np.argmax(q_table[discrete_state])
    else:
        # Explore: choose a random action
        action = np.random.randint(0, env.action_space.n)

    # Step the environment; the Gymnasium API returns separate terminated/truncated flags
    new_state, reward, terminated, truncated, info = env.step(action)
    new_discrete_state = get_discrete_state(new_state)
    done = terminated or truncated
    if not terminated:
        # Q-learning update:
        # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
        max_future_q = np.max(q_table[new_discrete_state])
        current_q = q_table[discrete_state + (action,)]
        new_q = (1 - learning_rate) * current_q + learning_rate * (reward + discount_factor * max_future_q)
        q_table[discrete_state + (action,)] = new_q
    elif new_state[0] >= goal_position:
        # Reached the goal: the terminal value is 0, so pin the Q-value to 0
        q_table[discrete_state + (action,)] = 0

    # Move to the next state
    discrete_state = new_discrete_state

    # Redundant with render_mode="human" (which draws every step), but harmless
    env.render()
env.close()
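
# The loop above runs a single episode, which is not enough for the Q-table to
# converge. Below is a minimal sketch (an assumed extension, not part of the
# original script) of how the same update could be repeated over many episodes,
# reusing the hyperparameters and helpers defined above and training headlessly
# for speed. Call train(3000) to run it.
def train(episodes):
    train_env = gym.make("MountainCar-v0")  # no render_mode: headless training
    for _ in range(episodes):
        state = get_discrete_state(train_env.reset()[0])
        finished = False
        while not finished:
            # Epsilon-greedy action selection, as in the main loop above
            if np.random.random() > epsilon:
                act = np.argmax(q_table[state])
            else:
                act = np.random.randint(0, train_env.action_space.n)
            obs, rew, terminated, truncated, _ = train_env.step(act)
            next_state = get_discrete_state(obs)
            if not terminated:
                max_future_q = np.max(q_table[next_state])
                current_q = q_table[state + (act,)]
                q_table[state + (act,)] = (1 - learning_rate) * current_q + \
                    learning_rate * (rew + discount_factor * max_future_q)
            elif obs[0] >= goal_position:
                q_table[state + (act,)] = 0
            finished = terminated or truncated
            state = next_state
    train_env.close()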