import math
import random
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from scripts.Chris.DQN.Environment import Maze_Environment

# A single environment transition: (state, action) -> (next_state, reward)
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
  """Fixed-size cyclic buffer holding the most recently observed transitions."""

  def __init__(self, capacity):
    self.memory = deque([], maxlen=capacity)

  def push(self, *args):
    """Save a transition"""
    self.memory.append(Transition(*args))

  def sample(self, batch_size):
    """Return a random batch of transitions for training."""
    return random.sample(self.memory, batch_size)

  def __len__(self):
    return len(self.memory)
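
# Illustrative usage of ReplayMemory (a sketch only, not executed by this script):
#   memory = ReplayMemory(1000)
#   memory.push(state, action, next_state, reward)   # Transition fields, in order
#   if len(memory) >= 32:
#     batch = memory.sample(32)                      # list of 32 Transition tuples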

class DQN(nn.Module):
  """Fully connected Q-network mapping an observation to one Q-value per action."""

  def __init__(self, n_observations, n_actions):
    super(DQN, self).__init__()
    self.layer1 = nn.Linear(n_observations, 128)
    self.layer2 = nn.Linear(128, 128)
    self.layer3 = nn.Linear(128, n_actions)

  # Called with either a single state to pick the next action, or a batch of
  # states during optimization. Returns one row of Q-values per input state,
  # with one column per action.
  def forward(self, x):
    x = F.relu(self.layer1(x))
    x = F.relu(self.layer2(x))
    return self.layer3(x)
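
# Shape sanity check (illustrative sketch only, not executed by this script):
# with n_observations=2 and n_actions=4 as used below, a batch of B states of
# shape (B, 2) maps to Q-values of shape (B, 4):
#   q = DQN(2, 4)(torch.zeros(1, 2))   # q.shape == torch.Size([1, 4])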


# Select action using epsilon-greedy policy
def select_action(state, eps, policy_net, env):
  # Exploit: with probability (1 - eps), take the action the policy net
  # currently rates highest.
  if random.random() > eps:
    with torch.no_grad():
      # policy_net(state).max(1) returns (values, indices) over the action
      # dimension; .indices is the argmax, i.e. the greedy action.
      return policy_net(state).max(1).indices.view(1, 1)

  # Explore: with probability eps, take a uniformly random action.
  else:
    return torch.tensor(np.random.choice(env.num_actions),
                        dtype=torch.long, device=state.device).view(1, 1)
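
# Note: both branches above return a long tensor of shape (1, 1) so that the
# actions from many steps can be torch.cat'ed into an action batch and used
# directly with gather() in optimize_model below.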


# Optimize DQN
def optimize_model(memory, batch_size, policy_net, target_net, optimizer, gamma, device):
  if len(memory) < batch_size:
    return
  transitions = memory.sample(batch_size)
  # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for a
  # detailed explanation). This converts a batch-array of Transitions
  # to a Transition of batch-arrays.
  batch = Transition(*zip(*transitions))

  # Compute a mask of non-final states and concatenate the batch elements
  # (a final state would've been the one after which simulation ended)
  non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
  non_final_next_states = torch.cat([s for s in batch.next_state
                                     if s is not None])
  state_batch = torch.cat(batch.state)
  action_batch = torch.cat(batch.action)
  reward_batch = torch.cat(batch.reward)

  # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
  # columns of actions taken. These are the actions which would've been taken
  # for each batch state according to policy_net
  state_action_values = policy_net(state_batch).gather(1, action_batch)

  # Compute V(s_{t+1}) for all next states.
  # Expected values of actions for non_final_next_states are computed based
  # on the "older" target_net; selecting their best reward with max(1).values
  # This is merged based on the mask, such that we'll have either the expected
  # state value or 0 in case the state was final.
  next_state_values = torch.zeros(batch_size, device=device)
  with torch.no_grad():
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
  # Compute the expected Q values
  expected_state_action_values = (next_state_values * gamma) + reward_batch

  # Compute Huber loss
  criterion = nn.SmoothL1Loss()
  loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

  # Optimize the model
  optimizer.zero_grad()
  loss.backward()
  # In-place gradient clipping
  torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
  optimizer.step()
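
# Worked example of the target above (illustrative numbers only): for a
# transition with reward r = -0.1, gamma = 0.99, and a non-final next state
# whose best target-net Q-value is 0.5, the regression target for Q(s_t, a_t)
# is -0.1 + 0.99 * 0.5 = 0.395; for a final next state the target is just r = -0.1.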


if __name__ == '__main__':
  device = 'cpu'
  n_actions = 4        # number of discrete maze actions
  n_observations = 2   # observation: the agent's (x, y) coordinates (cf. state.coordinates below)
  LR = 0.01
  EPS_START = 0.9
  EPS_END = 0.05
  EPS_DECAY = 1000
  TAU = 0.005
  GAMMA = 0.99
  MAX_STEPS_PER_EP = 1000
  TOTAL_STEPS = 10000  # step budget across all episodes
  MAX_EPS = 300        # maximum number of episodes (not epsilon)
  BATCH_SIZE = 128
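
  # Epsilon schedule sketch (values rounded): with EPS_START=0.9, EPS_END=0.05,
  # EPS_DECAY=1000, eps = 0.05 + 0.85 * exp(-total_steps / 1000), i.e. roughly
  # 0.9 at step 0, ~0.36 after 1000 steps, and ~0.17 after 2000 steps.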

  policy_net_ = DQN(n_observations, n_actions).to(device)
  target_net_ = DQN(n_observations, n_actions).to(device)
  target_net_.load_state_dict(policy_net_.state_dict())  # start the target net as a copy of the policy net
  optimizer_ = optim.AdamW(policy_net_.parameters(), lr=LR, amsgrad=True)
  memory_ = ReplayMemory(10000)
  env_ = Maze_Environment(width=5, height=5)

  episode_durations = []
  episodes = 0
  total_steps = 0
  print(env_.maze)
  while total_steps < TOTAL_STEPS and episodes < MAX_EPS:
    # Initialize the environment and get its state
    state, info = env_.reset()
    state = torch.tensor(state.coordinates, dtype=torch.float32, device=device).unsqueeze(0)
    for t in count():
      # Exponentially decay epsilon from EPS_START towards EPS_END over ~EPS_DECAY steps
      eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * total_steps / EPS_DECAY)
      action = select_action(state, eps, policy_net_, env_)
      observation, reward, terminated, _ = env_.step(action.item())
      reward = torch.tensor([reward], device=device)

      if terminated:
        next_state = None
      else:
        next_state = torch.tensor(observation.coordinates, dtype=torch.float32, device=device).unsqueeze(0)

      # Store the transition in memory
      memory_.push(state, action, next_state, reward)

      # Move to the next state
      state = next_state

      # Perform one step of the optimization (on the policy network)
      optimize_model(memory_, BATCH_SIZE, policy_net_, target_net_, optimizer_, gamma=GAMMA, device=device)

      # Soft update of the target network's weights
      # θ′ ← τ·θ + (1 − τ)·θ′
      target_net_state_dict = target_net_.state_dict()
      policy_net_state_dict = policy_net_.state_dict()
      for key in policy_net_state_dict:
        target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1 - TAU)
      target_net_.load_state_dict(target_net_state_dict)

      total_steps += 1
      if terminated or t > MAX_STEPS_PER_EP:
        episode_durations.append(t + 1)
        break
    print(f"Episode {episodes} lasted {t+1} steps, eps = {round(eps, 2)}, total steps = {total_steps}")
    episodes += 1

  plt.plot(episode_durations)
  plt.xlabel('Episode')
  plt.ylabel('Duration (steps)')
  plt.show()