Environment animation

C-Earl · C-Earl · commit 701a18e71e02 · 2024-09-11T15:47:09.000-04:00
diff --git a/scripts/Chris/DQN/ANN.py b/scripts/Chris/DQN/ANN.py
@@ -43,70 +43,6 @@ def forward(self, x):
     x = x.to(torch.float32)
     return self.sequence(x)
 
-
-class DQN:
-  def __init__(self, input_dim, output_dim, gamma=0.99, batch_size=128, device='cpu'):
-    self.policy_net = ANN(input_dim, output_dim)
-    self.target_net = ANN(input_dim, output_dim)
-    self.optimizer = Adam(self.policy_net.parameters())
-    self.memory = ReplayMemory(10000)
-    self.gamma = gamma
-    self.batch_size = batch_size
-    self.device = device
-
-  def select_action(self, state, epsilon):
-    # Random action
-    if random.random() < epsilon:
-      return torch.tensor([[random.randrange(2)]], dtype=torch.float32)
-
-    # ANN action
-    else:
-      with torch.no_grad():
-        return self.policy_net(state).argmax()
-
-  def optimize_model(self):
-    if len(self.memory) < self.batch_size:
-      return
-    transitions = self.memory.sample(self.batch_size)
-    batch = Transition(*zip(*transitions))
-
-    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
-                                            batch.next_state)), device=self.device, dtype=torch.bool)
-    non_final_next_states = torch.cat([s for s in batch.next_state
-                                       if s is not None]).reshape(-1, 2)
-    state_batch = torch.cat(batch.state).reshape(-1, 2)
-    action_batch = torch.tensor(batch.action).to(torch.int64)
-    reward_batch = torch.tensor(batch.reward)
-
-    # Compute Q(s_t, a)
-    state_action_values = self.policy_net(state_batch)[action_batch]
-
-    # Compute V(s_{t+1}) for all next states.
-    next_state_values = torch.zeros(self.batch_size, device=self.device)
-    with torch.no_grad():
-      next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1).values
-
-    # Compute the expected Q values
-    expected_state_action_values = (next_state_values * self.gamma) + reward_batch
-
-    # Compute Loss
-    criterion = torch.nn.SmoothL1Loss()
-    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
-
-    # Optimize the model
-    self.optimizer.zero_grad()
-    loss.backward()
-    # In-place gradient clipping
-    torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
-    self.optimizer.step()
-
-  def update_target(self, tau=0.005):
-    target_net_state_dict = self.target_net.state_dict()
-    policy_net_state_dict = self.policy_net.state_dict()
-    for key in policy_net_state_dict:
-      target_net_state_dict[key] = policy_net_state_dict[key]*tau + target_net_state_dict[key] * (1 - tau)
-
-
 class Mem_Dataset(torch.utils.data.Dataset):
   def __init__(self, samples, labels):
     self.samples = samples
diff --git a/scripts/Chris/DQN/Environment.py b/scripts/Chris/DQN/Environment.py
@@ -1,9 +1,15 @@
+import random
+import numpy as np
 from labyrinth.generate import DepthFirstSearchGenerator
 from labyrinth.grid import Cell, Direction
 from labyrinth.maze import Maze
 from labyrinth.solve import MazeSolver
+from matplotlib.pyplot import plot as plt
+from matplotlib.animation import FuncAnimation
+
 import pickle as pkl
 import matplotlib.pyplot as plt
+from torch import optim
 
 class Maze_Environment():
   def __init__(self, width, height):
@@ -17,6 +23,8 @@ def __init__(self, width, height):
     self.maze.path = self.path    # No idea why this is necessary
     self.agent_cell = self.maze.start_cell
     self.num_actions = 4
+    self.history = [(self.agent_cell.coordinates, 0, False, {})]  # (state, reward, done, info)
+    self.pos_history = []
 
   def plot(self):
     # Box around maze
@@ -29,12 +37,12 @@ def plot(self):
     for row in range(self.height):
       for column in range(self.width):
         # Path
-        cell = self[column, row]  # Tranpose maze coordinates (just how the maze is stored)
-        if cell == self.start_cell:
+        cell = self.maze[column, row]  # Tranpose maze coordinates (just how the maze is stored)
+        if cell == self.maze.start_cell:
           plt.plot(row, column, 'go')
-        elif cell == self.end_cell:
+        elif cell == self.maze.end_cell:
           plt.plot(row, column,'bo')
-        elif cell in self.path:
+        elif cell in self.maze.path:
           plt.plot(row, column, 'ro')
 
         # Walls
@@ -44,16 +52,17 @@ def plot(self):
           plt.plot([row+0.5, row+0.5], [column-0.5, column+0.5], color='black')
 
+  # Starts a new episode on the same maze
+  # Moves the agent back to the start cell and clears the recorded histories
+  # Returns start cell, info
   def reset(self):
-    # self.maze = Maze(width=self.width, height=self.height, generator=DepthFirstSearchGenerator())
-    # self.solver = MazeSolver()
-    # self.path = self.solver.solve(self.maze)
-    # self.maze.path = self.path    # No idea why this is necessary
-    # self.agent_cell = self.maze.start_cell
-    # return self.agent_cell, {}
     self.agent_cell = self.maze.start_cell
+    self.step_history = []
+    self.pos_history = []
+    # step() logs into self.history and animate_history() replays it, so it is
+    # re-seeded here the same way __init__ seeds it
+    self.history = [(self.agent_cell.coordinates, 0, False, {})]  # (state, reward, done, info)
     return self.agent_cell, {}
 
-
   # Takes action
   # Returns next state, reward, done, info
   def step(self, action):
@@ -69,29 +72,76 @@ def step(self, action):
 
     # Check if action runs into wall
     if action not in self.agent_cell.open_walls:
+      self.history.append((self.agent_cell.coordinates, -0.5, False, {}))
       return self.agent_cell, -.5, False, {}
 
     # Move agent
     else:
       self.agent_cell = self.maze.neighbor(self.agent_cell, action)
       if self.agent_cell == self.maze.end_cell:    # Check if agent has reached the end
+        self.history.append((self.agent_cell.coordinates, 1, True, {}))
         return self.agent_cell, 1, True, {}
       else:
+        self.history.append((self.agent_cell.coordinates, 0, False, {}))
         return self.agent_cell, 0, False, {}
 
   def save(self, filename):
     with open(filename, 'wb') as f:
       pkl.dump(self, f)
 
+  # Replays self.history as an animation and saves it to filename
+  # One frame per history entry: the maze is redrawn and the agent drawn in yellow
+  # Note: writer='ffmpeg' needs an ffmpeg executable that matplotlib can find
+  def animate_history(self, filename='maze.gif', fps=10):
+    def update(i):
+      coords, reward = self.history[i][0], self.history[i][1]
+      plt.clf()
+      self.plot()
+      plt.plot(coords[1], coords[0], 'yo')  # x/y swapped, same as in plot()
+      plt.title(f'Step {i}, Reward: {reward}')
+    ani = FuncAnimation(plt.gcf(), update, frames=len(self.history), repeat=False)
+    ani.save(filename, writer='ffmpeg', fps=fps)
+
+# Maze environment that returns stored spike train samples as observations
+# instead of the maze cell itself
+class Grid_Cell_Maze_Environment(Maze_Environment):
+  def __init__(self, width, height):
+    super().__init__(width, height)
 
+    # Load spike train samples
+    # {position: [spike_trains]}
+    with open('Data/preprocessed_recalls_sorted.pkl', 'rb') as f:
+      self.samples = pkl.load(f)
+
+  # Same as Maze_Environment.reset, but the returned state is a spike train sample
+  def reset(self):
+    cell, info = super().reset()
+    return self.state_to_grid_cell_spikes(cell), info
+
+  # Same as Maze_Environment.step, but the returned state is a spike train sample
+  def step(self, action):
+    obs, reward, done, info = super().step(action)
+    obs = self.state_to_grid_cell_spikes(obs)
+    return obs, reward, done, info
+
+  # Picks one stored sample at random for the cell's coordinates
+  def state_to_grid_cell_spikes(self, cell):
+    return random.choice(self.samples[cell.coordinates])
 
 
 if __name__ == '__main__':
-  maze_env = Maze_Environment(width=25, height=25)
-  print(maze_env.maze)
-  print(f'start: {maze_env.maze.start_cell}')
-  print(f'end: {maze_env.maze.end_cell}')
-  maze_env.reset()
-  print(maze_env.maze)
-  print(f'start: {maze_env.maze.start_cell}')
-  print(f'end: {maze_env.maze.end_cell}')
+  from train_DQN import DQN, ReplayMemory, run_episode
+
+  device = 'cpu'
+  n_actions = 4
+  input_size = 300
+  lr = 0.01
+  policy_net = DQN(input_size, n_actions).to(device)
+  target_net = DQN(input_size, n_actions).to(device)
+  target_net.load_state_dict(policy_net.state_dict())
+  optimizer = optim.AdamW(policy_net.parameters(), lr=lr, amsgrad=True)
+  memory = ReplayMemory(10000)
+  env = Grid_Cell_Maze_Environment(width=5, height=5)
+
+  run_episode(env, policy_net, device, 100, eps=0.9)
+  env.animate_history()
diff --git a/scripts/Chris/DQN/pipeline_executor.py b/scripts/Chris/DQN/pipeline_executor.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pickle as pkl
+from train_DQN import train_DQN
 from sample_generator import sample_generator
 from spike_train_generator import spike_train_generator
 from store_reservoir import store_reservoir
@@ -74,5 +75,19 @@
   # # Preprocess Recalls ##
   # recalled_mem_preprocessing(WINDOW_FREQ, WINDOW_SIZE, PLOT)
 
+  ## Train DQN ##
+  LR = 0.01
+  EPS_START = 0.9
+  EPS_END = 0.05
+  EPS_DECAY = 1000
+  TAU = 0.005
+  GAMMA = 0.99
+  MAX_STEPS_PER_EP = 10
+  MAX_TOTAL_STEPS = 10
+  MAX_EPS = 3000
+  BATCH_SIZE = 128
+  INPUT_SIZE = EXC_SIZE + INH_SIZE
+  train_DQN(INPUT_SIZE, LR, BATCH_SIZE, EPS_START, EPS_END, EPS_DECAY, TAU, GAMMA, MAX_STEPS_PER_EP, MAX_TOTAL_STEPS, MAX_EPS)
+
   ## Train ANN ##
-  classify_recalls(OUT_DIM, TRAIN_RATIO, BATCH_SIZE, TRAIN_EPOCHS)
+  # classify_recalls(OUT_DIM, TRAIN_RATIO, BATCH_SIZE, TRAIN_EPOCHS)
diff --git a/scripts/Chris/DQN/recalled_mem_preprocessing.py b/scripts/Chris/DQN/recalled_mem_preprocessing.py
@@ -42,6 +42,8 @@ def recalled_mem_preprocessing(window_freq, window_size, plot):
   ## Save transformed samples ##
   with open('Data/preprocessed_recalls.pkl', 'wb') as f:
     pkl.dump((new_samples, labels), f)
+  with open('Data/preprocessed_recalls_sorted.pkl', 'wb') as f:
+    pkl.dump(new_samples_sorted, f)
 
   if plot:
     # positions = np.array([key for key in new_samples_sorted.keys()])
diff --git a/scripts/Chris/DQN/sample_generator.py b/scripts/Chris/DQN/sample_generator.py
@@ -26,7 +26,8 @@ def inter_positional_spread(env_to_gc):
   return spread
 
 # Generate grid cell activity for all integer coordinate positions in environment
-def sample_generator(scales, offsets, vars, x_range, y_range, samples_per_pos, noise=0.1, padding=2, plot=False):
+def sample_generator(scales, offsets, vars, x_range, y_range,
+                     samples_per_pos, noise=0.1, padding=2, plot=False):
   print('Generating samples...')
   sorted_samples = {}
   samples = np.zeros((x_range[1] * y_range[1] * samples_per_pos, len(scales)))
@@ -62,24 +63,3 @@ def sample_generator(scales, offsets, vars, x_range, y_range, samples_per_pos, n
     plt.show()
 
   return samples, labels, sorted_samples
-
-if __name__ == '__main__':
-  ## Constants ##
-  WIDTH = 5
-  HEIGHT = 5
-  SAMPLES_PER_POS = 1000
-  WINDOW_FREQ = 10
-  WINDOW_SIZE = 10
-  # Grid Cells
-  num_cells_ = 20
-  x_range_ = (0, 5)
-  y_range_ = (0, 5)
-  x_offsets_ = np.random.uniform(-1, 1, num_cells_)
-  y_offsets_ = np.random.uniform(-1, 1, num_cells_)
-  offsets_ = list(zip(x_offsets_, y_offsets_))
-  scales_ = [1 + 0.01 * i for i in range(num_cells_)]
-  vars_ = [0.85]*num_cells_
-
-  # Test spread for set of parameters
-  # Shape = (num_samples, num_cells)
-  samples_, labels_, sorted_samples_ = sample_generator(scales_, offsets_, vars_, x_range_, y_range_, SAMPLES_PER_POS)
diff --git a/scripts/Chris/DQN/train_DQN.py b/scripts/Chris/DQN/train_DQN.py