diff --git a/.gitignore b/.gitignore index 319e4c5..1e7bb73 100644 --- a/.gitignore +++ b/.gitignore @@ -204,3 +204,12 @@ example_usage/* *.pth .gitignore CITATION.cff +CLAUDE.md +.claude/* +.reports/* +.mcp.json +research/* +GEMINI.md + +#Obsidian notes vault +gridmind-notes-vault/* diff --git a/example_usage/control/mountain_car/one_step_actor_critic_example.py b/example_usage/control/mountain_car/one_step_actor_critic_example.py index 93c7f08..91406ea 100644 --- a/example_usage/control/mountain_car/one_step_actor_critic_example.py +++ b/example_usage/control/mountain_car/one_step_actor_critic_example.py @@ -4,6 +4,7 @@ from gridmind.feature_construction.multi_hot import MultiHotEncoder +from gridmind.feature_construction.normalizer import MinMaxNormalizer from gridmind.feature_construction.tile_coding import TileCoding from gridmind.policies.parameterized.discrete_action_mlp_policy import ( DiscreteActionMLPPolicy, @@ -13,13 +14,20 @@ ) import gymnasium as gym import torch +import numpy as np env = gym.make("MountainCar-v0") + +# Feature construction pipeline: normalize -> tile coding -> multi-hot encoding num_tilings = 7 +normalizer = MinMaxNormalizer( + low=np.array([-1.2, -0.07]), # Mountain Car observation bounds + high=np.array([0.6, 0.07]), +) multi_hot_encoder = MultiHotEncoder(num_categories=num_tilings**4) tile_encoder = TileCoding(ihtORsize=num_tilings**4, numtilings=num_tilings) -feature_constructor = lambda x: multi_hot_encoder(tile_encoder(x)) +feature_constructor = lambda x: multi_hot_encoder(tile_encoder(normalizer(x))) observation, _ = env.reset() diff --git a/src/gridmind/algorithms/base_learning_algorithm.py b/src/gridmind/algorithms/base_learning_algorithm.py index 00f18fc..645812f 100644 --- a/src/gridmind/algorithms/base_learning_algorithm.py +++ b/src/gridmind/algorithms/base_learning_algorithm.py @@ -6,8 +6,9 @@ import dill from gridmind.policies.base_policy import BasePolicy import logging +from gridmind.config import get_save_dir from gridmind.utils.divergence.base_divergence_detector import BaseDivergenceDetector -from gridmind.utils.logtools.async_tensorboard_logger import AsyncTensorboardLogger +from gridmind.utils.logtools.null_logger import NullWriter from gridmind.utils.performance_evaluation.base_performance_evaluator import ( BasePerformanceEvaluator, ) @@ -16,12 +17,6 @@ ) from gymnasium import Env from tqdm import trange -from torch.utils.tensorboard import SummaryWriter - -try: - from data import SAVE_DATA_DIR -except ImportError: - SAVE_DATA_DIR = None class BaseLearningAlgorithm(ABC): @@ -30,7 +25,7 @@ def __init__( name: str, env: Optional[Env] = None, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: self.name = name self.logger = logging.getLogger(self.__class__.__name__) @@ -48,12 +43,14 @@ def __init__( self.monitor_divergence = False self.stop_on_divergence = False + self.summary_writer = NullWriter() self.write_summary = write_summary if self.write_summary: - assert ( - summary_dir is not None or SAVE_DATA_DIR is not None - ), "Please specify summary_dir" - + if summary_dir is None and get_save_dir() is None: + raise ValueError( + "write_summary=True requires either summary_dir or " + "gridmind.config.set_save_dir() to be set." + ) self._initialize_summary_writer(summary_dir, env_name) def _initialize_summary_writer( @@ -63,7 +60,10 @@ def _initialize_summary_writer( extra_info: str = "", use_async_writer: bool = False, ): - summary_dir = summary_dir if summary_dir is not None else SAVE_DATA_DIR + from torch.utils.tensorboard import SummaryWriter + from gridmind.utils.logtools.async_tensorboard_logger import AsyncTensorboardLogger + + summary_dir = summary_dir if summary_dir is not None else get_save_dir() log_dir = os.path.join( summary_dir, @@ -291,11 +291,8 @@ def _training_wrapper( if save_policy: env_name = self.env.spec.id if self.env.spec is not None else "unknown" - if save_policy: - env_name = self.env.spec.id if self.env.spec is not None else "unknown" - - if SAVE_DATA_DIR is not None: - saved_policy_dir = os.path.join(SAVE_DATA_DIR, env_name) + if get_save_dir() is not None: + saved_policy_dir = os.path.join(get_save_dir(), env_name) self.save_policy(saved_policy_dir) def _report_all_metrics(self): @@ -314,8 +311,8 @@ def _report_all_metrics(self): env_name = self.env.spec.id if self.env.spec is not None else "unknown" - if SAVE_DATA_DIR is not None: - saved_policy_dir = os.path.join(SAVE_DATA_DIR, env_name) + if get_save_dir() is not None: + saved_policy_dir = os.path.join(get_save_dir(), env_name) self.save_policy(saved_policy_dir) def evaluate_policy(self, num_episodes: int): diff --git a/src/gridmind/algorithms/evolutionary_rl/base_evo_rl_algorithm.py b/src/gridmind/algorithms/evolutionary_rl/base_evo_rl_algorithm.py index b0c34b3..258f1b1 100644 --- a/src/gridmind/algorithms/evolutionary_rl/base_evo_rl_algorithm.py +++ b/src/gridmind/algorithms/evolutionary_rl/base_evo_rl_algorithm.py @@ -1,3 +1,4 @@ +from abc import abstractmethod from typing import Optional from gridmind.algorithms.base_learning_algorithm import BaseLearningAlgorithm from gridmind.policies.base_policy import BasePolicy @@ -10,7 +11,7 @@ def __init__( name: str, env: Optional[Env] = None, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name, env, summary_dir=summary_dir, write_summary=write_summary @@ -34,10 +35,13 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool): def _train_steps(self, num_steps: int, prediction_only: bool, *args, **kwargs): raise NotImplementedError + @abstractmethod + def _train(self, num_generations: int, *args, **kwargs): ... + def train(self, num_generations: int, save_policy: bool = True): self._training_wrapper( num_iter=num_generations, prediction_only=False, save_policy=save_policy, - training_fn=self.train, + training_fn=self._train, ) diff --git a/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuro_agent.py b/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuro_agent.py index f8248e8..946ecb0 100644 --- a/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuro_agent.py +++ b/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuro_agent.py @@ -28,6 +28,14 @@ def __init__( def __repr__(self): return f"NeuroAgent(id={self.id}, fitness={self.fitness}, starting_generation={self.starting_generation})" + @property + def policy(self): + return self.network + + @policy.setter + def policy(self, value): + self.network = value + @property def id(self): return str(self._id) diff --git a/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution.py b/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution.py index ee480d9..2122ba9 100644 --- a/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution.py +++ b/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution.py @@ -34,7 +34,7 @@ def __init__( num_processes: Optional[int] = None, stopping_fitness: Optional[float] = None, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ): super().__init__( name="NeuroEvolution", @@ -43,8 +43,6 @@ def __init__( write_summary=write_summary, ) - self.env = env - self.name = "NeuroEvolution" self.mu = mu self._lambda = _lambda self.mutation_mean = mutation_mean @@ -81,7 +79,7 @@ def get_best(self, unwrapped: bool = True): ), "No best agent found. Train the algorithm first." if unwrapped: - return self.best_agent.network + return self.best_agent.policy return self.best_agent @@ -163,7 +161,7 @@ def evaluate_fitness( return sum_episode_return / average_over_episodes - def train(self, num_generations: int, *args, **kwargs): + def _train(self, num_generations: int, *args, **kwargs): for num_gen in trange(num_generations): agent_to_assess_fitness = [] @@ -172,8 +170,7 @@ def train(self, num_generations: int, *args, **kwargs): agent_to_assess_fitness.append(agent) fitness_scores = [ - self.evaluate_fitness(agent.network) - for agent in agent_to_assess_fitness + self.evaluate_fitness(agent.policy) for agent in agent_to_assess_fitness ] for agent, fitness in zip(agent_to_assess_fitness, fitness_scores): @@ -227,55 +224,15 @@ def train(self, num_generations: int, *args, **kwargs): for parent in parents: for _ in range(self._lambda // self.mu): mutated_param_vector = self.mutate( - network=parent.network, + network=parent.policy, mean=self.mutation_mean, std=self.mutation_std, ) child = self.spawn_individual() NeuroEvolutionUtil.set_parameters_vector( - child.network, mutated_param_vector + child.policy, mutated_param_vector ) self.population.append(child) self._generation += 1 return self.best_agent - - -if __name__ == "__main__": - from itertools import product - - env = gym.make("CartPole-v1") - - mutation_means = [0, 0.1, 0.2] - mutation_stds = [0.1, 0.2, 0.3] - - mutation_rate_combinations = list(product(mutation_means, mutation_stds)) - - trained_agents = [] - - for mutation_mean, mutation_std in mutation_rate_combinations: - algorithm = NeuroEvolution( - env=env, - mu=5, - _lambda=20, - stopping_fitness=500, - mutation_mean=mutation_mean, - mutation_std=mutation_std, - ) - trained_agents.append(algorithm.train(num_generations=1000)) - - eval_env = gym.make("CartPole-v1", render_mode="human") - - policy = random.choice(trained_agents).network - - obs, info = eval_env.reset() - done = False - - episode_return = 0.0 - - while not done: - obs = algorithm._preprocess(obs) - action = policy.get_action(obs) - obs, reward, terminated, truncated, info = eval_env.step(action) - episode_return += reward - done = terminated or truncated diff --git a/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution_util.py b/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution_util.py index 74a4dd7..c8c6b95 100644 --- a/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution_util.py +++ b/src/gridmind/algorithms/evolutionary_rl/neuroevolution/neuroevolution_util.py @@ -48,43 +48,3 @@ def evaluate_fitness( done = terminated or truncated return sum_episode_return / average_over_episodes - - -if __name__ == "__main__": - import torch - import torch.nn as nn - import numpy as np - - # Define a simple MLP - class SimpleNN(nn.Module): - def __init__(self, input_size=4, hidden_size=10, output_size=2): - super(SimpleNN, self).__init__() - self.fc1 = nn.Linear(input_size, hidden_size) - self.fc2 = nn.Linear(hidden_size, output_size) - - def forward(self, x): - return self.fc2(torch.relu(self.fc1(x))) - - # Create an instance of the model - model = SimpleNN() - vector = NeuroEvolutionUtil.get_parameters_vector( - model - ) # Extract weights as a flat vector - print(vector) - print(vector.shape) - NeuroEvolutionUtil.set_parameters_vector(model, vector) - - def mutate(model, mean, std): - chromosome = NeuroEvolutionUtil.get_parameters_vector(model) - noise = np.random.normal(loc=mean, scale=std, size=chromosome.shape) - - mutated_chromosome = chromosome + noise - - NeuroEvolutionUtil.set_parameters_vector( - model, mutated_chromosome - ) # Set weights from a flat vector - - return mutated_chromosome - - mutated_vector = mutate(model, 0, 0.01) - print(mutated_vector) diff --git a/src/gridmind/algorithms/function_approximation/actor_critic/one_step_actor_critic.py b/src/gridmind/algorithms/function_approximation/actor_critic/one_step_actor_critic.py index ccb07fd..b1d64fe 100644 --- a/src/gridmind/algorithms/function_approximation/actor_critic/one_step_actor_critic.py +++ b/src/gridmind/algorithms/function_approximation/actor_critic/one_step_actor_critic.py @@ -28,7 +28,7 @@ def __init__( clip_grads: bool = True, grad_clip_value: float = 1.0, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ): super().__init__( "OneStepActorCritic", @@ -129,9 +129,12 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): next_observation = self._preprocess(next_observation) - next_state_value = ( - self.value_estimator(next_observation) if not terminated else 0 - ) + with torch.no_grad(): + next_state_value = ( + self.value_estimator(next_observation) + if not terminated + else torch.tensor(0.0) + ) cur_state_value = self.value_estimator(observation) @@ -146,7 +149,7 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): self.logger.debug(f"Value grads: {value_grads}") policy_grads = torch.autograd.grad( - torch.log(self.policy.get_action_prob(observation, action)), + self.policy.get_log_action_prob(observation, action), self.policy.parameters(), ) self.logger.debug(f"Policy grads: {policy_grads}") diff --git a/src/gridmind/algorithms/function_approximation/base_function_approximation_based_learning_algorithm.py b/src/gridmind/algorithms/function_approximation/base_function_approximation_based_learning_algorithm.py index 1d640a3..272d61a 100644 --- a/src/gridmind/algorithms/function_approximation/base_function_approximation_based_learning_algorithm.py +++ b/src/gridmind/algorithms/function_approximation/base_function_approximation_based_learning_algorithm.py @@ -13,7 +13,7 @@ def __init__( env: Optional[Env] = None, feature_constructor=None, summary_dir=None, - write_summary=True, + write_summary=False, ): super().__init__(name, env, summary_dir, write_summary) self.feature_constructor = feature_constructor diff --git a/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce.py b/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce.py index ea62e4b..5361de0 100644 --- a/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce.py +++ b/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce.py @@ -24,7 +24,7 @@ def __init__( feature_constructor=None, grad_clip_value: float = 1.0, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ): super().__init__( "Reinforce", env, summary_dir=summary_dir, write_summary=write_summary @@ -104,11 +104,14 @@ def _train_episodes(self, num_episodes, prediction_only: bool = False): discounted_return = 0.0 + # NOTE: The γ^t weight (discount_factor**timestep) underflows to ~0 in float32 + # for episodes longer than ~3000 steps with γ=0.99. For long-horizon tasks, + # consider using discount_factor=1.0 or removing the γ^t term. for timestep in reversed(range(trajectory.get_trajectory_length())): obs, action, reward = trajectory.get_step(timestep) discounted_return = self.discount_factor * discounted_return + reward - log_prob = torch.log(self.policy.get_action_prob(obs, action)) + log_prob = self.policy.get_log_action_prob(obs, action) policy_grads = torch.autograd.grad( log_prob, @@ -117,11 +120,11 @@ def _train_episodes(self, num_episodes, prediction_only: bool = False): if i % 100 == 0 and i != 0: self.logger.debug(f"Policy grads: {policy_grads}") - # # Clipping for policy gradients - # policy_norm = torch.sqrt(sum(grad.norm()**2 for grad in policy_grads if grad is not None)) - # if policy_norm > self.grad_clip_value: - # scaling_factor = self.grad_clip_value / policy_norm - # policy_grads = [grad * scaling_factor if grad is not None else None for grad in policy_grads] + # Clipping for policy gradients + policy_norm = torch.sqrt(sum(grad.norm()**2 for grad in policy_grads if grad is not None)) + if policy_norm > self.grad_clip_value: + scaling_factor = self.grad_clip_value / policy_norm + policy_grads = [grad * scaling_factor if grad is not None else None for grad in policy_grads] # self.logger.debug(f"Clipped policy grads: {policy_grads}") @@ -134,20 +137,3 @@ def _train_episodes(self, num_episodes, prediction_only: bool = False): * discounted_return * grad ) - - -if __name__ == "__main__": - import gymnasium as gym - - env = gym.make("CartPole-v1") - - eval_env = gym.make("CartPole-v1", render_mode="rgb_array") - - performance_evaluator = BasicPerformanceEvaluator( - env=eval_env, epoch_eval_interval=500 - ) - # policy = ActorCriticPolicy(env) - algorithm = Reinforce(env=env, step_size=0.0001) - algorithm.register_performance_evaluator(performance_evaluator) - - algorithm.train_episodes(num_episodes=10000, prediction_only=False) diff --git a/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce_with_baseline.py b/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce_with_baseline.py index e9a9aeb..d216f7a 100644 --- a/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce_with_baseline.py +++ b/src/gridmind/algorithms/function_approximation/monte_carlo/control/reinforce_with_baseline.py @@ -29,7 +29,7 @@ def __init__( feature_constructor=None, grad_clip_value: float = 1.0, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ): super().__init__( "ReinforceWithBaseline", @@ -120,11 +120,14 @@ def _train_episodes(self, num_episodes, prediction_only: bool = False): discounted_return = 0.0 + # NOTE: The γ^t weight (discount_factor**timestep) underflows to ~0 in float32 + # for episodes longer than ~3000 steps with γ=0.99. For long-horizon tasks, + # consider using discount_factor=1.0 or removing the γ^t term. for timestep in reversed(range(trajectory.get_trajectory_length())): obs, action, reward = trajectory.get_step(timestep) discounted_return = self.discount_factor * discounted_return + reward obs = self._preprocess(obs) - log_prob = torch.log(self.policy.get_action_prob(obs, action)) + log_prob = self.policy.get_log_action_prob(obs, action) value_pred = self.value_estimator(obs) delta = discounted_return - value_pred @@ -163,40 +166,3 @@ def _train_episodes(self, num_episodes, prediction_only: bool = False): * delta * grad ) - - -if __name__ == "__main__": - import gymnasium as gym - from gridmind.feature_construction.one_hot import OneHotEncoder - - env = gym.make( - "FrozenLake-v1", - desc=None, - map_name="4x4", - is_slippery=False, - ) - feature_encoder = OneHotEncoder(num_classes=env.observation_space.n) - # env = gym.make("CartPole-v1") - - # eval_env = gym.make("CartPole-v1", render_mode="rgb_array") - eval_env = gym.make( - "FrozenLake-v1", - desc=None, - map_name="4x4", - is_slippery=False, - render_mode="rgb_array", - ) - - performance_evaluator = BasicPerformanceEvaluator( - env=eval_env, epoch_eval_interval=500 - ) - # policy = ActorCriticPolicy(env) - algorithm = ReinforceWithBaseline( - env=env, - policy_step_size=0.1, - value_step_size=0.1, - feature_constructor=feature_encoder, - ) - algorithm.register_performance_evaluator(performance_evaluator) - - algorithm.train_episodes(num_episodes=10000, prediction_only=False) diff --git a/src/gridmind/algorithms/function_approximation/monte_carlo/prediction/gradient_monte_carlo_prediction.py b/src/gridmind/algorithms/function_approximation/monte_carlo/prediction/gradient_monte_carlo_prediction.py index d1dc42f..4025fca 100644 --- a/src/gridmind/algorithms/function_approximation/monte_carlo/prediction/gradient_monte_carlo_prediction.py +++ b/src/gridmind/algorithms/function_approximation/monte_carlo/prediction/gradient_monte_carlo_prediction.py @@ -25,11 +25,15 @@ def __init__( step_size: float = 0.001, discount_factor: float = 0.9, feature_constructor: Callable = None, + summary_dir: Optional[str] = None, + write_summary: bool = False, ) -> None: super().__init__( name="GradientMCPrediction", env=env, feature_constructor=feature_constructor, + summary_dir=summary_dir, + write_summary=write_summary, ) self.policy = policy self.feature_constructor = feature_constructor @@ -91,10 +95,13 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool): if isinstance(state, numbers.Number): state = torch.tensor(state).unsqueeze(0) - state = torch.tensor(state, dtype=torch.float32) + if not isinstance(state, torch.Tensor): + state = torch.tensor(state, dtype=torch.float32) + else: + state = state.to(torch.float32) value_pred = self.V(state) grads = torch.autograd.grad(value_pred, self.V.parameters()) - update = self.step_size * (discounted_return - value_pred) + update = self.step_size * (discounted_return - value_pred.item()) with torch.no_grad(): for param, grad in zip(self.V.parameters(), grads): diff --git a/src/gridmind/algorithms/function_approximation/ppo/one_step_ppo.py b/src/gridmind/algorithms/function_approximation/ppo/one_step_ppo.py new file mode 100644 index 0000000..51b06b0 --- /dev/null +++ b/src/gridmind/algorithms/function_approximation/ppo/one_step_ppo.py @@ -0,0 +1,214 @@ +import numbers +import random +from typing import Callable, Optional +from gridmind.algorithms.base_learning_algorithm import BaseLearningAlgorithm +from gridmind.utils.performance_evaluation.basic_performance_evaluator import ( + BasicPerformanceEvaluator, +) +from gymnasium import Env +import torch +from tqdm import trange +from gridmind.policies.parameterized.actor_critic_policy import ActorCriticPolicy +import torch.nn as nn + + +class OneStepPPO(BaseLearningAlgorithm): + def __init__( + self, + env: Env, + policy: Optional[ActorCriticPolicy] = None, + step_size: float = 0.001, + discount_factor: float = 0.99, + feature_constructor: Callable = None, + clip_grads: bool = True, + max_grad_norm: float = 0.5, + entropy_coefficient: float = 0.02, + num_epochs: int = 10, + minibatch_size: int = 64, + clip_epsilon: float = 0.2, + summary_dir: Optional[str] = None, + write_summary: bool = False, + ): + super().__init__( + "ProximalPolicyOptimization_with_One_Step_TD-Error", + env, + summary_dir=summary_dir, + write_summary=write_summary, + ) + self.step_size = step_size + self.discount_factor = discount_factor + self.clip_grads = clip_grads + self.max_grad_norm = max_grad_norm + self.num_epochs = num_epochs + self.minibatch_size = minibatch_size + self.clip_epsilon = clip_epsilon + + self.feature_constructor = feature_constructor + observation_shape = ( + self.env.observation_space.shape + if feature_constructor is None + else self._determine_observation_shape() + ) + num_actions = env.action_space.n + self.policy = ( + policy + if policy is not None + else ActorCriticPolicy( + observation_shape=observation_shape, num_actions=num_actions + ) + ) + self.optimizer = torch.optim.Adam( + self.policy.parameters(), lr=self.step_size + ) + self.entropy_coefficient = entropy_coefficient + + def _determine_observation_shape(self): + observation, _ = self.env.reset() + features = self.feature_constructor(observation) + shape = features.shape + + return shape + + def _preprocess(self, obs): + if self.feature_constructor is not None: + obs = self.feature_constructor(obs) + + if isinstance(obs, numbers.Number): + obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) + else: + obs = torch.tensor(obs, dtype=torch.float32) + + return obs + + def _get_state_value_fn(self, force_functional_interface=True): + raise NotImplementedError + + def _get_state_action_value_fn(self, force_functional_interface=True): + raise NotImplementedError + + def _get_policy(self): + return self.policy + + def set_policy(self, policy, **kwargs): + raise NotImplementedError + + @staticmethod + def _create_minibatches_generator(data, batch_size): + for i in range(0, len(data), batch_size): + yield data[i : i + batch_size] + + def _train_steps(self, num_steps: int, prediction_only: bool, *args, **kwargs): + raise NotImplementedError() + + def _train_episodes(self, num_episodes, prediction_only): + assert not prediction_only, "Prediction only is not supported for PPO" + + num_collect_episodes = 5 + + for episode in trange(0, num_episodes, num_collect_episodes): + observations = [] + actions = [] + deltas = [] + log_probs = [] + v_targs = [] + + with torch.no_grad(): + for i in range(num_collect_episodes): + observation, _ = self.env.reset() + observation = self._preprocess(observation) + + done = False + + while not done: + action, log_prob, _, cur_state_value = ( + self.policy.get_action_and_value(observation) + ) + next_observation, reward, terminated, truncated, _ = ( + self.env.step(action.detach().cpu().item()) + ) + + next_observation = self._preprocess(next_observation) + next_state_value = ( + self.policy.get_value(next_observation) + if not terminated + else torch.tensor([0.0]) + ) + + v_targ = reward + self.discount_factor * next_state_value + + delta = v_targ - cur_state_value + + observations.append(observation) + actions.append(action) + deltas.append(delta) + v_targs.append(v_targ) + log_probs.append(log_prob) + + done = terminated or truncated + observation = next_observation + + # Normalize advantages (1-step TD errors) for training stability. + deltas_tensor = torch.stack(deltas) + deltas_tensor = (deltas_tensor - deltas_tensor.mean()) / (deltas_tensor.std() + 1e-8) + deltas = list(deltas_tensor) + + for epoch in range(self.num_epochs): + indices = list(range(len(observations))) + random.shuffle(indices) + + for minibatch_indices in self._create_minibatches_generator( + indices, self.minibatch_size + ): + with torch.no_grad(): + minibatch_observations = torch.stack( + [observations[i] for i in minibatch_indices] + ) + minibatch_actions = torch.stack( + [actions[i] for i in minibatch_indices] + ) + minibatch_deltas = torch.stack( + [deltas[i] for i in minibatch_indices] + ).reshape(-1, 1) + minibatch_v_targs = torch.stack( + [v_targs[i] for i in minibatch_indices] + ).reshape(-1, 1) + minibatch_log_probs = torch.stack( + [log_probs[i] for i in minibatch_indices] + ).reshape(-1, 1) + + _, cur_logprob, dist_entropy, cur_values = ( + self.policy.get_action_and_value( + minibatch_observations, minibatch_actions + ) + ) + cur_logprob = cur_logprob.reshape(-1, 1) + dist_entropy = dist_entropy.reshape(-1, 1) + + log_ratio = cur_logprob - minibatch_log_probs + ratio = log_ratio.exp().reshape(-1, 1) + + clipped_ratio = torch.clamp( + ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon + ) + + clipped_surrogate_objective = torch.min( + ratio * minibatch_deltas, clipped_ratio * minibatch_deltas + ) + value_loss = 0.5 * (minibatch_v_targs - cur_values) ** 2 + + entropy_bonus = dist_entropy.reshape(-1, 1) + + total_objective = torch.mean( + clipped_surrogate_objective + - value_loss + + self.entropy_coefficient * entropy_bonus + ) + total_loss = -total_objective + + self.optimizer.zero_grad() + total_loss.backward() + if self.clip_grads: + nn.utils.clip_grad_norm_( + self.policy.parameters(), self.max_grad_norm + ) + self.optimizer.step() diff --git a/src/gridmind/algorithms/function_approximation/ppo/ppo.py b/src/gridmind/algorithms/function_approximation/ppo/ppo.py index 30b34a9..95b07ec 100644 --- a/src/gridmind/algorithms/function_approximation/ppo/ppo.py +++ b/src/gridmind/algorithms/function_approximation/ppo/ppo.py @@ -2,6 +2,7 @@ import numbers import random from typing import Callable, Optional + from gridmind.algorithms.base_learning_algorithm import BaseLearningAlgorithm from gridmind.utils.performance_evaluation.basic_performance_evaluator import ( BasicPerformanceEvaluator, @@ -12,8 +13,6 @@ from gridmind.policies.parameterized.actor_critic_policy import ActorCriticPolicy import torch.nn as nn -torch.autograd.set_detect_anomaly(True) - logging.basicConfig(level=logging.DEBUG) @@ -22,15 +21,15 @@ def __init__( self, env: Env, policy: Optional[ActorCriticPolicy] = None, - policy_step_size: float = 0.00001, - value_step_size: float = 0.001, + step_size: float = 0.0001, discount_factor: float = 0.99, + gae_lambda: float = 0.95, feature_constructor: Callable = None, clip_grads: bool = True, - grad_clip_value: float = 1.0, + max_grad_norm: float = 0.5, entropy_coefficient: float = 0.02, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ): super().__init__( "ProximalPolicyOptimization", @@ -38,11 +37,11 @@ def __init__( summary_dir=summary_dir, write_summary=write_summary, ) - self.policy_step_size = policy_step_size - self.value_step_size = value_step_size + self.policy_step_size = step_size self.discount_factor = discount_factor + self.gae_lambda = gae_lambda self.clip_grads = clip_grads - self.grad_clip_value = grad_clip_value + self.max_grad_norm = max_grad_norm self.feature_constructor = feature_constructor observation_shape = ( @@ -58,7 +57,6 @@ def __init__( observation_shape=observation_shape, num_actions=num_actions ) ) - self.T = 500 self.num_epochs = 10 self.minibatch_size = 64 self.optimizer = torch.optim.Adam( @@ -106,6 +104,9 @@ def _train_steps(self, num_steps: int, prediction_only: bool, *args, **kwargs): raise NotImplementedError() def _train_episodes(self, num_episodes, prediction_only): + # num_episodes counts update iterations, not environment episodes. + # Each iteration collects num_collect_episodes=5 env episodes, so + # total environment episodes = num_episodes * 5. assert not prediction_only, "Prediction only is not supported for PPO" num_collect_episodes = 5 @@ -113,9 +114,12 @@ def _train_episodes(self, num_episodes, prediction_only): for episode in trange(num_episodes): observations = [] actions = [] - deltas = [] log_probs = [] - v_targs = [] + values = [] + rewards = [] + next_values = [] + terminateds = [] + truncateds = [] with torch.no_grad(): for i in range(num_collect_episodes): @@ -136,25 +140,56 @@ def _train_episodes(self, num_episodes, prediction_only): next_state_value = ( self.policy.get_value(next_observation) if not terminated - else torch.tensor([0.0]) + else torch.tensor( + [0.0], device=next_observation.device + ) ) - # cur_state_value = self.policy.get_value(observation) - - v_targ = reward + self.discount_factor * next_state_value - - delta = v_targ - cur_state_value observations.append(observation) actions.append(action) - deltas.append(delta) - v_targs.append(v_targ) log_probs.append(log_prob) + values.append(cur_state_value.item()) + rewards.append(reward) + next_values.append(next_state_value.item()) + terminateds.append(terminated) + truncateds.append(truncated) done = terminated or truncated observation = next_observation + num_steps = len(rewards) + advantages = [0.0] * num_steps + last_advantage = 0.0 + + for t in reversed(range(num_steps)): + delta = ( + rewards[t] + + self.discount_factor * next_values[t] * (1.0 - terminateds[t]) + - values[t] + ) + if terminateds[t] or truncateds[t]: + last_advantage = 0.0 + last_advantage = ( + delta + + self.discount_factor + * self.gae_lambda + * (1.0 - terminateds[t]) + * last_advantage + ) + advantages[t] = last_advantage + + device = next(self.policy.parameters()).device + advantages = torch.tensor(advantages, dtype=torch.float32, device=device) + returns = torch.tensor( + [a + v for a, v in zip(advantages, values)], + dtype=torch.float32, + device=device, + ) + + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + for epoch in range(self.num_epochs): - indices = list(range(len(observations))) + indices = list(range(num_steps)) random.shuffle(indices) for minibatch_indices in self._create_minibatches_generator( @@ -167,12 +202,10 @@ def _train_episodes(self, num_episodes, prediction_only): minibatch_actions = torch.stack( [actions[i] for i in minibatch_indices] ) - minibatch_deltas = torch.stack( - [deltas[i] for i in minibatch_indices] - ).reshape(-1, 1) - minibatch_v_targs = torch.stack( - [v_targs[i] for i in minibatch_indices] - ).reshape(-1, 1) + minibatch_advantages = advantages[minibatch_indices].reshape( + -1, 1 + ) + minibatch_returns = returns[minibatch_indices].reshape(-1, 1) minibatch_log_probs = torch.stack( [log_probs[i] for i in minibatch_indices] ).reshape(-1, 1) @@ -193,36 +226,24 @@ def _train_episodes(self, num_episodes, prediction_only): ) clipped_surrogate_objective = torch.min( - ratio * minibatch_deltas, clipped_ratio * minibatch_deltas + ratio * minibatch_advantages, + clipped_ratio * minibatch_advantages, ) - squared_error_loss = 0.5 * (minibatch_v_targs - cur_values) ** 2 + value_loss = 0.5 * (minibatch_returns - cur_values) ** 2 entropy_bonus = dist_entropy.reshape(-1, 1) total_objective = torch.mean( clipped_surrogate_objective - - 0.5 * squared_error_loss + - value_loss + self.entropy_coefficient * entropy_bonus ) total_loss = -total_objective - - total_loss.backward() - nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5) - self.optimizer.step() + self.optimizer.zero_grad() - - -if __name__ == "__main__": - import gymnasium as gym - - env = gym.make("CartPole-v1") - - eval_env = gym.make("CartPole-v1", render_mode="rgb_array") - performance_evaluator = BasicPerformanceEvaluator( - env=eval_env, epoch_eval_interval=100 - ) - policy = ActorCriticPolicy(env) - algorithm = PPO(env=env, num_actions=env.action_space.n, policy=policy) - algorithm.register_performance_evaluator(performance_evaluator) - - algorithm.train_episodes(num_episodes=1000, prediction_only=False) + total_loss.backward() + if self.clip_grads: + nn.utils.clip_grad_norm_( + self.policy.parameters(), self.max_grad_norm + ) + self.optimizer.step() \ No newline at end of file diff --git a/src/gridmind/algorithms/function_approximation/temporal_difference/control/deep_q_learning.py b/src/gridmind/algorithms/function_approximation/temporal_difference/control/deep_q_learning.py index d8af432..e9a4961 100644 --- a/src/gridmind/algorithms/function_approximation/temporal_difference/control/deep_q_learning.py +++ b/src/gridmind/algorithms/function_approximation/temporal_difference/control/deep_q_learning.py @@ -4,6 +4,7 @@ from gridmind.algorithms.function_approximation.base_function_approximation_based_learning_algorithm import ( BaseFunctionApproximationBasedLearingAlgorithm, ) +from gridmind.config import get_save_dir from gridmind.policies.soft.q_derived.q_network_derived_epsilon_greedy_policy import ( QNetworkDerivedEpsilonGreedyPolicy, ) @@ -14,11 +15,6 @@ from tqdm import trange from datetime import datetime -try: - from data import SAVE_DATA_DIR -except ImportError: - SAVE_DATA_DIR = None - class DeepQLearning(BaseFunctionApproximationBasedLearingAlgorithm): def __init__( @@ -26,7 +22,7 @@ def __init__( env: Env, q_network: Optional[QNetwork] = None, step_size: float = 0.001, - discount_factor: float = 0.9, + discount_factor: float = 0.99, batch_size: int = 32, epsilon_decay: bool = True, epsilon_decay_rate: float = 0.0001, @@ -34,7 +30,7 @@ def __init__( epsilon_max: float = 1.0, feature_constructor: Optional[Callable] = None, summary_dir=None, - write_summary=True, + write_summary=False, replay_buffer_capacity: Optional[int] = None, target_network_update_frequency: int = 1000, ): @@ -57,20 +53,13 @@ def __init__( self.replay_buffer = SimpleReplayBuffer(capacity=replay_buffer_capacity) self._current_step = 0 env_name = self.env.spec.id if self.env.spec is not None else "unknown" - if SAVE_DATA_DIR is not None: - self.default_save_dir = os.path.join( - SAVE_DATA_DIR, - env_name, - self.name, - datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M-%S"), - ) - else: - self.default_save_dir = os.path.join( - "saved_models", - env_name, - self.name, - datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M-%S"), - ) + _base_save_dir = get_save_dir() if get_save_dir() is not None else "saved_models" + self.default_save_dir = os.path.join( + _base_save_dir, + env_name, + self.name, + datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M-%S"), + ) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.logger.info(f"Using device: {self.device}") @@ -124,6 +113,7 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): ) observation = next_observation + # Training starts as soon as batch_size samples exist; Mnih et al. 2015 use a ~50K-step warm-up. if self.replay_buffer.size() >= self.batch_size: ( observations, @@ -158,7 +148,7 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): q_values = ( self.q_online(observations) .gather(1, actions.unsqueeze(1)) - .squeeze() + .squeeze(-1) ) loss = torch.nn.functional.mse_loss(q_values, target_q_values) loss.backward() @@ -172,7 +162,7 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): # Update target network self.q_target.load_state_dict(self.q_online.state_dict()) - if self.summary_writer is not None: + if self.write_summary: self.summary_writer.add_scalar( "target_network_update_step", self.global_network_update_step, @@ -187,6 +177,7 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): def _select_action(self, observation): """Select an action using epsilon-greedy policy.""" + # Counts action-selection calls; epsilon reaches epsilon_min after (epsilon_max - epsilon_min) / epsilon_decay_rate steps. self._current_step += 1 if self.epsilon_decay: diff --git a/src/gridmind/algorithms/function_approximation/temporal_difference/control/episodic_semi_gradient_sarsa.py b/src/gridmind/algorithms/function_approximation/temporal_difference/control/episodic_semi_gradient_sarsa.py index a996640..6ecad81 100644 --- a/src/gridmind/algorithms/function_approximation/temporal_difference/control/episodic_semi_gradient_sarsa.py +++ b/src/gridmind/algorithms/function_approximation/temporal_difference/control/episodic_semi_gradient_sarsa.py @@ -25,7 +25,7 @@ def __init__( epsilon_decay: bool = True, feature_constructor: Callable = None, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ): super().__init__( "Episodic-Semi-Gradient-SARSA", @@ -129,13 +129,12 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): next_observation = self._preprocess(next_observation) next_action = self.policy.get_action(next_observation) - target_action_value = ( - reward - + self.discount_factor - * self.action_value_estimator(next_observation)[next_action] - if not terminated - else reward - ) + if not terminated: + with torch.no_grad(): + next_q = self.action_value_estimator(next_observation)[next_action].item() + target_action_value = reward + self.discount_factor * next_q + else: + target_action_value = reward action_value_pred = self.action_value_estimator(observation)[action] @@ -155,4 +154,4 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = False): action = next_action done = terminated or truncated - self.policy.set_network(self.action_value_estimator) + self.policy.set_network(self.action_value_estimator) diff --git a/src/gridmind/algorithms/function_approximation/temporal_difference/prediction/semi_gradient_td_0_prediction.py b/src/gridmind/algorithms/function_approximation/temporal_difference/prediction/semi_gradient_td_0_prediction.py index a9b273f..59b73a4 100644 --- a/src/gridmind/algorithms/function_approximation/temporal_difference/prediction/semi_gradient_td_0_prediction.py +++ b/src/gridmind/algorithms/function_approximation/temporal_difference/prediction/semi_gradient_td_0_prediction.py @@ -18,7 +18,7 @@ def __init__( discount_factor: float = 0.9, feature_constructor: Callable = None, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="Semi-gradient-TD-0-Prediction", @@ -82,20 +82,16 @@ def _train_episodes(self, num_episodes: int, prediction_only: bool = True): action ) - _input = observation - _next_input = next_observation - - _input = self._preprocess(_input) + _input = self._preprocess(observation) + value_pred = self.V(_input) if not terminated: - _next_input = self._preprocess(_next_input) - - target_value = ( - reward + self.discount_factor * self.V(_next_input) - if not terminated - else reward - ) - value_pred = self.V(_input) + _next_input = self._preprocess(next_observation) + with torch.no_grad(): + next_val = self.V(_next_input).item() + target_value = reward + self.discount_factor * next_val + else: + target_value = reward delta = self.step_size * (target_value - value_pred) diff --git a/src/gridmind/algorithms/tabular/monte_carlo/control/monte_carlo_on_policy_first_visit.py b/src/gridmind/algorithms/tabular/monte_carlo/control/monte_carlo_on_policy_first_visit.py index 1c91d39..360877d 100644 --- a/src/gridmind/algorithms/tabular/monte_carlo/control/monte_carlo_on_policy_first_visit.py +++ b/src/gridmind/algorithms/tabular/monte_carlo/control/monte_carlo_on_policy_first_visit.py @@ -10,7 +10,7 @@ def __init__( env: Env, policy: BasePolicy, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="MonteCarloOnPolicyFirstVisit", diff --git a/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_exploring_start.py b/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_exploring_start.py index 07af64c..a6e448a 100644 --- a/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_exploring_start.py +++ b/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_exploring_start.py @@ -19,7 +19,7 @@ def __init__( policy: Optional[BasePolicy] = None, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="MCES", env=env, summary_dir=summary_dir, write_summary=write_summary diff --git a/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy.py b/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy.py index 7cb6798..42c9b60 100644 --- a/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy.py +++ b/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy.py @@ -36,7 +36,7 @@ def __init__( behavior_policy: Optional[BasePolicy] = None, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="MCPolicyControl(off-policy)", diff --git a/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy_snb.py b/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy_snb.py index 1db30a4..7645ccb 100644 --- a/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy_snb.py +++ b/src/gridmind/algorithms/tabular/monte_carlo/monte_carlo_off_policy_snb.py @@ -22,7 +22,7 @@ def __init__( behavior_policy: Optional[BasePolicy] = None, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="MCPolicyControl(off-policy-SnB)", diff --git a/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction.py b/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction.py index d1465aa..4ff91ac 100644 --- a/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction.py +++ b/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction.py @@ -22,7 +22,7 @@ def __init__( policy: BasePolicy, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="MCEveryVisitPrediction", diff --git a/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction_incremental.py b/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction_incremental.py index 96ea4cb..de222f0 100644 --- a/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction_incremental.py +++ b/src/gridmind/algorithms/tabular/monte_carlo/prediction/monte_carlo_every_visit_prediction_incremental.py @@ -22,7 +22,7 @@ def __init__( step_size: float = 0.01, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="MCEveryVisitPredictionIncremental", diff --git a/src/gridmind/algorithms/tabular/n_step/control/n_step_sarsa.py b/src/gridmind/algorithms/tabular/n_step/control/n_step_sarsa.py index ce65806..f5ff69c 100644 --- a/src/gridmind/algorithms/tabular/n_step/control/n_step_sarsa.py +++ b/src/gridmind/algorithms/tabular/n_step/control/n_step_sarsa.py @@ -27,7 +27,7 @@ def __init__( q_initializer: str = "zero", epsilon_decay: bool = False, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( "N-Step-SARSA", diff --git a/src/gridmind/algorithms/tabular/n_step/prediction/n_step_td_prediction.py b/src/gridmind/algorithms/tabular/n_step/prediction/n_step_td_prediction.py index 070f809..a5d1758 100644 --- a/src/gridmind/algorithms/tabular/n_step/prediction/n_step_td_prediction.py +++ b/src/gridmind/algorithms/tabular/n_step/prediction/n_step_td_prediction.py @@ -19,7 +19,7 @@ def __init__( step_size: float = 0.01, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( "N-Step-TD-Prediction", diff --git a/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning.py b/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning.py index 2f0ee1d..593e64e 100644 --- a/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning.py +++ b/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning.py @@ -24,7 +24,7 @@ def __init__( epsilon_decay: bool = False, epsilon: float = 0.1, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( "Q-Learning", env=env, summary_dir=summary_dir, write_summary=write_summary diff --git a/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning_with_eligibility_trace.py b/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning_with_eligibility_trace.py new file mode 100644 index 0000000..77ed766 --- /dev/null +++ b/src/gridmind/algorithms/tabular/temporal_difference/control/q_learning_with_eligibility_trace.py @@ -0,0 +1,162 @@ +from collections import defaultdict +from typing import Optional +from gridmind.algorithms.base_learning_algorithm import BaseLearningAlgorithm + +from gridmind.policies.soft.q_derived.base_q_derived_soft_policy import ( + BaseQDerivedSoftPolicy, +) +from gridmind.policies.soft.q_derived.q_table_derived_epsilon_greedy_policy import ( + QTableDerivedEpsilonGreedyPolicy, +) +from gymnasium import Env +import numpy as np +from tqdm import tqdm + + +class QLearningWithEligibilityTrace(BaseLearningAlgorithm): + def __init__( + self, + env: Env, + policy: Optional[BaseQDerivedSoftPolicy] = None, + step_size: float = 0.1, + discount_factor: float = 0.9, + eligibility_trace_decay: float = 0.9, + q_initializer: str = "zero", + epsilon_decay: bool = False, + epsilon: float = 0.1, + summary_dir: Optional[str] = None, + write_summary: bool = False, + ) -> None: + super().__init__( + "Q-Learning with Eligibility Trace", + env=env, + summary_dir=summary_dir, + write_summary=write_summary, + ) + self.num_actions = self.env.action_space.n + self.epsilon_decay = epsilon_decay + self.epsilon = epsilon + + q_initializer = q_initializer.lower() + assert q_initializer in [ + "zero", + "random", + ], "q_initializer may only take the value 'zero' or 'random'" + + if q_initializer == "zero": + self.q_values = defaultdict(lambda: np.zeros(self.num_actions)) + else: + self.q_values = defaultdict(lambda: np.random.rand(self.num_actions)) + + self.policy = ( + policy + if policy is not None + else QTableDerivedEpsilonGreedyPolicy( + q_table=self.q_values, num_actions=self.num_actions + ) + ) + self.policy.set_epsilon(self.epsilon) + + self.step_size = step_size + self.discount_factor = discount_factor + + assert ( + 0.0 <= eligibility_trace_decay <= 1.0 + ), "eligibility_trace_decay must be in range 0 to 1." + + self.eligibility_trace_decay = eligibility_trace_decay + self.eligibility_traces = defaultdict(lambda: np.zeros(self.num_actions)) + + def _get_state_value_fn(self, force_functional_interface: bool = True): + raise Exception( + f"{self.name} computes only state-action values. Use get_state_action_values() to get state-action values." + ) + + def _get_state_action_value_fn(self, force_functional_interface: bool = True): + if not force_functional_interface: + return self.q_values + + return lambda s, a: self.q_values[s][a] + + def _get_policy(self): + return self.policy + + def _train_steps(self, num_steps: int, prediction_only: bool, *args, **kwargs): + raise NotImplementedError() + + def _train_episodes(self, num_episodes: int, prediction_only: bool = False): + if prediction_only: + raise Exception("This is a control-only implementation.") + + for i in tqdm(range(num_episodes)): + obs, info = self.env.reset() + done = False + + self.eligibility_traces.clear() + + action_mask = info.get("action_mask", None) + action = self.policy.get_action(obs, action_mask=action_mask) + + while not done: + next_obs, reward, terminated, truncated, info = self.env.step(action) + + next_action_mask = info.get("action_mask", None) + next_action = self.policy.get_action( + next_obs, action_mask=next_action_mask + ) + next_q_values = self.policy.get_q_values( + next_obs, action_mask=next_action_mask + ) + next_max_q = np.max(next_q_values) + next_action_q = next_q_values[next_action] + is_next_action_greedy = np.isclose( + next_action_q, next_max_q, rtol=1e-8, atol=1e-12 + ) + + td_target = reward + self.discount_factor * np.max(next_q_values) * ( + 1 - terminated + ) + td_error = td_target - self.policy.get_q_value( + obs, action, action_mask=action_mask + ) + + self.eligibility_traces[obs][action] = 1.0 + + states_to_prune = [] + for state in self.eligibility_traces: + self.q_values[state] = ( + self.q_values[state] + + self.step_size * td_error * self.eligibility_traces[state] + ) + + for action_index, action_value in enumerate(self.q_values[state]): + self.policy.update_q( + state=state, action=action_index, value=action_value + ) + + self.eligibility_traces[state] = ( + self.discount_factor + * self.eligibility_trace_decay + * self.eligibility_traces[state] + ) + + if np.all(self.eligibility_traces[state] < 1e-12): + states_to_prune.append(state) + + # Watkins' cutoff: zero all traces when a non-greedy action is taken + if not is_next_action_greedy: + self.eligibility_traces.clear() + else: + for state in states_to_prune: + del self.eligibility_traces[state] + + obs = next_obs + action = next_action + action_mask = next_action_mask + done = terminated or truncated + + if self.epsilon_decay: + self.policy.decay_epsilon() + + def set_policy(self, policy: BaseQDerivedSoftPolicy): + self.policy = policy diff --git a/src/gridmind/algorithms/tabular/temporal_difference/control/sarsa.py b/src/gridmind/algorithms/tabular/temporal_difference/control/sarsa.py index 982ae5b..44fcddc 100644 --- a/src/gridmind/algorithms/tabular/temporal_difference/control/sarsa.py +++ b/src/gridmind/algorithms/tabular/temporal_difference/control/sarsa.py @@ -25,7 +25,7 @@ def __init__( epsilon_decay: bool = False, feature_constructor: Callable = None, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( "SARSA", env=env, summary_dir=summary_dir, write_summary=write_summary diff --git a/src/gridmind/algorithms/tabular/temporal_difference/prediction/td_0_prediction.py b/src/gridmind/algorithms/tabular/temporal_difference/prediction/td_0_prediction.py index ac51745..5fa2c5c 100644 --- a/src/gridmind/algorithms/tabular/temporal_difference/prediction/td_0_prediction.py +++ b/src/gridmind/algorithms/tabular/temporal_difference/prediction/td_0_prediction.py @@ -18,7 +18,7 @@ def __init__( step_size: float = 0.1, discount_factor: float = 0.9, summary_dir: Optional[str] = None, - write_summary: bool = True, + write_summary: bool = False, ) -> None: super().__init__( name="TD-0-Prediction", diff --git a/src/gridmind/config.py b/src/gridmind/config.py new file mode 100644 index 0000000..c03cade --- /dev/null +++ b/src/gridmind/config.py @@ -0,0 +1,11 @@ +_save_dir: str | None = None + + +def set_save_dir(path: str) -> None: + """Set the project-wide default directory for saving policies and TensorBoard logs.""" + global _save_dir + _save_dir = path + + +def get_save_dir() -> str | None: + return _save_dir diff --git a/src/gridmind/feature_construction/__init__.py b/src/gridmind/feature_construction/__init__.py index ecedac1..756d488 100644 --- a/src/gridmind/feature_construction/__init__.py +++ b/src/gridmind/feature_construction/__init__.py @@ -3,6 +3,7 @@ EmbeddingFeatureExtractor, ) from gridmind.feature_construction.multi_hot import MultiHotEncoder +from gridmind.feature_construction.normalizer import MinMaxNormalizer from gridmind.feature_construction.polynomial import PolynomialFeatureConstructor from gridmind.feature_construction.state_aggregation import SimpleStateAggregator from gridmind.feature_construction.tile_coding import TileCoding @@ -11,6 +12,7 @@ __all__ = [ "OneHotEncoder", "EmbeddingFeatureExtractor", + "MinMaxNormalizer", "MultiHotEncoder", "PolynomialFeatureConstructor", "SimpleStateAggregator", diff --git a/src/gridmind/feature_construction/multi_hot.py b/src/gridmind/feature_construction/multi_hot.py index 4b768f4..62f3a45 100644 --- a/src/gridmind/feature_construction/multi_hot.py +++ b/src/gridmind/feature_construction/multi_hot.py @@ -11,9 +11,3 @@ def __call__(self, indices: int, *args, **kwds): multi_hot[indices] = 1 return multi_hot - - -if __name__ == "__main__": - encoder = MultiHotEncoder(10) - categories = np.array([1, 3, 5]) - print(encoder(categories)) diff --git a/src/gridmind/feature_construction/normalizer.py b/src/gridmind/feature_construction/normalizer.py new file mode 100644 index 0000000..423c512 --- /dev/null +++ b/src/gridmind/feature_construction/normalizer.py @@ -0,0 +1,178 @@ +""" +Min-Max normalization for continuous observations. + +This module provides a callable MinMaxNormalizer class for normalizing observations +to a target range (default [0, 1]) based on known lower and upper bounds. +""" + +from typing import Tuple, Union + +import numpy as np + + +class MinMaxNormalizer: + """ + Min-Max normalization that scales observations to a target range. + + Transforms observations from [low, high] to [target_min, target_max] using: + normalized = (obs - low) / (high - low) * (target_max - target_min) + target_min + + This is a callable class that can be used as a feature constructor for + environments with known observation bounds (e.g., MountainCar, Pendulum). + + Parameters + ---------- + low : np.ndarray or float + Lower bounds of the observation space. Can be a scalar (applied to all dimensions) + or an array matching observation shape. + high : np.ndarray or float + Upper bounds of the observation space. Can be a scalar (applied to all dimensions) + or an array matching observation shape. + clip : bool, optional + If True, clip observations to [low, high] before normalizing. + Useful for handling observations that may occasionally exceed bounds. + Default is False. + target_range : Tuple[float, float], optional + Target range for normalized values as (min, max). + Default is (0, 1). Use (-1, 1) for tanh-like outputs. + epsilon : float, optional + Small value added to denominator to prevent division by zero when low == high. + Default is 1e-8. + + Examples + -------- + >>> # Mountain Car environment bounds + >>> normalizer = MinMaxNormalizer( + ... low=np.array([-1.2, -0.07]), + ... high=np.array([0.6, 0.07]) + ... ) + >>> obs = np.array([-0.5, 0.0]) + >>> normalized = normalizer(obs) + >>> print(normalized) # [0.38888889, 0.5] + + >>> # Normalize to [-1, 1] range + >>> normalizer = MinMaxNormalizer( + ... low=-1.0, high=1.0, target_range=(-1, 1) + ... ) + >>> obs = np.array([0.0]) + >>> normalized = normalizer(obs) + >>> print(normalized) # [0.0] + + >>> # Batch normalization + >>> normalizer = MinMaxNormalizer(low=0.0, high=10.0) + >>> batch_obs = np.array([[0.0], [5.0], [10.0]]) + >>> normalized_batch = normalizer(batch_obs) + >>> print(normalized_batch) # [[0.0], [0.5], [1.0]] + """ + + def __init__( + self, + low: Union[np.ndarray, float], + high: Union[np.ndarray, float], + clip: bool = False, + target_range: Tuple[float, float] = (0.0, 1.0), + epsilon: float = 1e-8, + ): + """ + Initialize the MinMaxNormalizer. + + Parameters + ---------- + low : np.ndarray or float + Lower bounds of the observation space. + high : np.ndarray or float + Upper bounds of the observation space. + clip : bool, optional + Whether to clip observations to [low, high] before normalizing. + target_range : Tuple[float, float], optional + Target range for normalized values as (min, max). + epsilon : float, optional + Small value to prevent division by zero. + + Raises + ------ + ValueError + If any element of low >= high. + ValueError + If target_range[0] >= target_range[1]. + """ + # Convert to numpy arrays for consistent handling + self.low = np.asarray(low, dtype=np.float32) + self.high = np.asarray(high, dtype=np.float32) + self.clip = clip + self.epsilon = epsilon + + # Validate target_range + if len(target_range) != 2: + raise ValueError( + f"target_range must be a tuple of length 2, got {len(target_range)}" + ) + + self.target_min, self.target_max = target_range + + if self.target_min >= self.target_max: + raise ValueError( + f"target_range min ({self.target_min}) must be less than max ({self.target_max})" + ) + + # Validate that low < high + if np.any(self.low >= self.high): + raise ValueError( + f"All elements of low must be strictly less than high. " + f"Got low={self.low}, high={self.high}" + ) + + # Precompute scaling factors for efficiency. + # Use epsilon only as a floor to guard against near-zero ranges. + self.scale = (self.target_max - self.target_min) / np.maximum( + self.high - self.low, self.epsilon + ) + self.offset = self.target_min - self.low * self.scale + + def __call__(self, observation: np.ndarray) -> np.ndarray: + """ + Normalize observation(s) to the target range. + + Parameters + ---------- + observation : np.ndarray + Observation(s) to normalize. Can be: + - 1D array: single observation (shape: (obs_dim,)) + - 2D array: batch of observations (shape: (batch_size, obs_dim)) + + Returns + ------- + np.ndarray + Normalized observation(s) with the same shape as input. + Values will be in the range [target_min, target_max]. + + Examples + -------- + >>> normalizer = MinMaxNormalizer(low=0.0, high=10.0) + >>> # Single observation + >>> obs = np.array([5.0]) + >>> print(normalizer(obs)) # [0.5] + >>> # Batch of observations + >>> batch = np.array([[0.0], [5.0], [10.0]]) + >>> print(normalizer(batch)) # [[0.0], [0.5], [1.0]] + """ + # Convert to numpy array if not already + obs = np.asarray(observation, dtype=np.float32) + + # Clip if requested (handles out-of-bounds values) + if self.clip: + obs = np.clip(obs, self.low, self.high) + + # Apply normalization: obs * scale + offset + # Broadcasting handles both single observations and batches + normalized = obs * self.scale + self.offset + + return normalized + + def __repr__(self) -> str: + """String representation of the normalizer.""" + return ( + f"MinMaxNormalizer(low={self.low}, high={self.high}, " + f"clip={self.clip}, target_range=({self.target_min}, {self.target_max}), " + f"epsilon={self.epsilon})" + ) diff --git a/src/gridmind/feature_construction/polynomial.py b/src/gridmind/feature_construction/polynomial.py index 27f080c..07fa571 100644 --- a/src/gridmind/feature_construction/polynomial.py +++ b/src/gridmind/feature_construction/polynomial.py @@ -27,13 +27,3 @@ def __call__(self, state: Union[np.ndarray, numbers.Number], *args, **kwds): features = self._feature_constuctor(state) return features - - -if __name__ == "__main__": - state = np.array([2, 2, 3]) - fc = PolynomialFeatureConstructor(n=1) - features_1 = fc(state) - features_2 = fc(state) - - print(features_1) - print(features_2) diff --git a/src/gridmind/feature_construction/state_aggregation.py b/src/gridmind/feature_construction/state_aggregation.py index 3534216..c4ee3fc 100644 --- a/src/gridmind/feature_construction/state_aggregation.py +++ b/src/gridmind/feature_construction/state_aggregation.py @@ -7,9 +7,3 @@ def __init__(self, span: int): def __call__(self, state: int, *args, **kwds): return math.floor(state / self.span) - - -if __name__ == "__main__": - agg = SimpleStateAggregator(span=100) - - print(agg(900)) diff --git a/src/gridmind/feature_construction/tile_coding.py b/src/gridmind/feature_construction/tile_coding.py index 5c3ac40..092fe85 100644 --- a/src/gridmind/feature_construction/tile_coding.py +++ b/src/gridmind/feature_construction/tile_coding.py @@ -106,14 +106,3 @@ def __call__(self, floats, ints=[], readonly=False): tiles = self.tiles(self.ihtORsize, self.numtilings, floats, ints, readonly) tiles = np.array(tiles) return tiles - - -if __name__ == "__main__": - iht = IHT(4096) - tc = TileCoding(iht, 8) - tiles = tc([0.1, 0.1]) - print(tiles) - x = -0.5675576 - xdot = 0.0 - A = [1] - print(tc.tiles(iht, 8, [8 * x / (0.5 + 1.2), 8 * xdot / (0.07 + 0.07)], A)) diff --git a/src/gridmind/policies/greedy/stochastic_start_greedy_policy.py b/src/gridmind/policies/greedy/stochastic_start_greedy_policy.py index 7d19a66..b5b6c12 100644 --- a/src/gridmind/policies/greedy/stochastic_start_greedy_policy.py +++ b/src/gridmind/policies/greedy/stochastic_start_greedy_policy.py @@ -32,6 +32,17 @@ def get_action_prob(self, state, action): return action_probs + def get_all_action_probabilities(self, states): + import numpy as np + + action_probs_list = [] + for state in states: + greedy_action = self.get_action(state) + probs = [1.0 if a == greedy_action else 0.0 for a in range(self.num_actions)] + action_probs_list.append(probs) + + return np.array(action_probs_list).squeeze() + def update(self, state, action): assert ( action in self.action_space if self.action_space is not None else True diff --git a/src/gridmind/policies/parameterized/actor_critic_policy.py b/src/gridmind/policies/parameterized/actor_critic_policy.py index e95fac6..f35fd54 100644 --- a/src/gridmind/policies/parameterized/actor_critic_policy.py +++ b/src/gridmind/policies/parameterized/actor_critic_policy.py @@ -69,5 +69,10 @@ def get_action_prob(self, state, action): return action_prob + def get_all_action_probabilities(self, state): + logits = self.actor(state) + dist = Categorical(logits=logits) + return dist.probs + def update(self, state, action): raise NotImplementedError diff --git a/src/gridmind/policies/parameterized/atari/atari_policy.py b/src/gridmind/policies/parameterized/atari/atari_policy.py index 6d0edef..c72546b 100644 --- a/src/gridmind/policies/parameterized/atari/atari_policy.py +++ b/src/gridmind/policies/parameterized/atari/atari_policy.py @@ -88,12 +88,3 @@ def get_action_prob(self, state, action): def update(self, state, action): pass - - -if __name__ == "__main__": - model = AtariPolicy( - observation_shape=(4, 84, 84), num_actions=6, channel_first=True - ) # 4 stacked frames, 6 possible actions - sample_input = torch.zeros((1, 4, 84, 84)) # batch of 1, 4 channels, 84x84 image - output = model(sample_input) - print(output.shape) # torch.Size([1, 6]) diff --git a/src/gridmind/policies/parameterized/discrete_action_mlp_policy.py b/src/gridmind/policies/parameterized/discrete_action_mlp_policy.py index 0bc05f1..509249d 100644 --- a/src/gridmind/policies/parameterized/discrete_action_mlp_policy.py +++ b/src/gridmind/policies/parameterized/discrete_action_mlp_policy.py @@ -88,6 +88,11 @@ def get_action_prob(self, state, action): return action_probs[action] + def get_log_action_prob(self, state: torch.Tensor, action: int) -> torch.Tensor: + logits = self.forward(state) + log_probs = F.log_softmax(logits, dim=-1) + return log_probs[action] + def get_all_action_probabilities(self, states): action_probs = self.forward(states) diff --git a/src/gridmind/policies/random_policy.py b/src/gridmind/policies/random_policy.py index 888032e..fccc988 100644 --- a/src/gridmind/policies/random_policy.py +++ b/src/gridmind/policies/random_policy.py @@ -30,5 +30,14 @@ def get_action_prob(self, state, action): return action_probs + def get_all_action_probabilities(self, states): + import numpy as np + + uniform_prob = 1.0 / self.num_actions + action_probs_list = [ + [uniform_prob] * self.num_actions for _ in states + ] + return np.array(action_probs_list).squeeze() + def update(self, state, action): raise Exception("This policy is for prediction (value estimation only).") diff --git a/src/gridmind/policies/soft/q_derived/base_q_derived_soft_policy.py b/src/gridmind/policies/soft/q_derived/base_q_derived_soft_policy.py index f05c2ca..1fcda6a 100644 --- a/src/gridmind/policies/soft/q_derived/base_q_derived_soft_policy.py +++ b/src/gridmind/policies/soft/q_derived/base_q_derived_soft_policy.py @@ -88,3 +88,11 @@ def get_action_deterministic(self, state, action_mask=None): @abstractmethod def _get_greedy_action(self, state, action_mask=None): raise NotImplementedError() + + @abstractmethod + def get_q_value(self, state, action, action_mask=None): + raise NotImplementedError() + + @abstractmethod + def get_q_values(self, state, action_mask=None): + raise NotImplementedError() diff --git a/src/gridmind/policies/soft/q_derived/q_network_derived_epsilon_greedy_policy.py b/src/gridmind/policies/soft/q_derived/q_network_derived_epsilon_greedy_policy.py index 6fd560f..c775711 100644 --- a/src/gridmind/policies/soft/q_derived/q_network_derived_epsilon_greedy_policy.py +++ b/src/gridmind/policies/soft/q_derived/q_network_derived_epsilon_greedy_policy.py @@ -1,7 +1,11 @@ +from typing import Optional + +import numpy as np +import torch + from gridmind.policies.soft.q_derived.base_q_derived_soft_policy import ( BaseQDerivedSoftPolicy, ) -import torch class QNetworkDerivedEpsilonGreedyPolicy(BaseQDerivedSoftPolicy): @@ -46,9 +50,24 @@ def update_q(self, state, action, value: float): f"{self.__class__.__name__} does not support updating Q values directly." ) - def _get_greedy_action(self, state): + def _get_greedy_action(self, state, action_mask: Optional[np.ndarray] = None): state = state.to(self.device) - action = torch.argmax(self.Q(state)).cpu().detach().item() + q_values = self.Q(state) + + if action_mask is not None: + # Convert numpy mask to torch tensor and move to the same device as q_values + action_mask_tensor = torch.tensor( + action_mask, dtype=torch.bool, device=self.device + ) + # Mask invalid actions with -inf + masked_q_values = torch.where( + action_mask_tensor, + q_values, + torch.tensor(-torch.inf, device=self.device), + ) + action = torch.argmax(masked_q_values).cpu().detach().item() + else: + action = torch.argmax(q_values).cpu().detach().item() assert ( action in self.action_space if self.action_space is not None else True @@ -74,3 +93,17 @@ def decay_epsilon(self): if decayed_epsilon >= self.epsilon_min: self.set_epsilon(value=decayed_epsilon) + + def get_q_values(self, state, action_mask=None): + state = state.to(self.device) + q_values = self.Q(state).cpu().detach().numpy() + + if action_mask is not None: + masked_q_values = np.where(action_mask, q_values, -np.inf) + return masked_q_values + + return q_values + + def get_q_value(self, state, action, action_mask=None): + q_values = self.get_q_values(state, action_mask=action_mask) + return q_values[action] diff --git a/src/gridmind/policies/soft/q_derived/q_table_derived_epsilon_greedy_policy.py b/src/gridmind/policies/soft/q_derived/q_table_derived_epsilon_greedy_policy.py index 708bf5e..89c6b3f 100644 --- a/src/gridmind/policies/soft/q_derived/q_table_derived_epsilon_greedy_policy.py +++ b/src/gridmind/policies/soft/q_derived/q_table_derived_epsilon_greedy_policy.py @@ -70,3 +70,17 @@ def decay_epsilon(self): if decayed_epsilon >= self.epsilon_min: self.set_epsilon(value=decayed_epsilon) + + + def get_q_values(self, state, action_mask=None): + q_values = self.Q[state] + + if action_mask is not None: + masked_q_values = np.where(action_mask, q_values, -np.inf) + return masked_q_values + + return q_values + + def get_q_value(self, state, action, action_mask=None): + q_values = self.get_q_values(state, action_mask=action_mask) + return q_values[action] diff --git a/src/gridmind/utils/algorithm_util/prioritized_replay_buffer.py b/src/gridmind/utils/algorithm_util/prioritized_replay_buffer.py new file mode 100644 index 0000000..1dfbf1f --- /dev/null +++ b/src/gridmind/utils/algorithm_util/prioritized_replay_buffer.py @@ -0,0 +1,370 @@ +""" +Prioritized Experience Replay Buffer for Deep Reinforcement Learning. + +This implementation extends SimpleReplayBuffer and uses a sum-tree data structure +for efficient sampling with priorities, supporting proportional prioritization. + +Reference: Schaul et al., "Prioritized Experience Replay" (2015) +""" + +from typing import Optional, Tuple +import numpy as np +import random +from gridmind.utils.algorithm_util.simple_replay_buffer import SimpleReplayBuffer + + +class SumTree: + """ + Sum Tree data structure for efficient priority-based sampling. + + The tree structure allows O(log n) updates and O(log n) sampling, + which is crucial for large replay buffers. + + Structure: + - Parent node value = sum of children values + - Leaf nodes contain priorities + - Data indices are mapped to leaf positions + """ + + def __init__(self, capacity: int): + """ + Initialize the sum tree. + + Args: + capacity: Maximum number of experiences to store + """ + self.capacity = capacity + # Tree has capacity leaf nodes and capacity-1 internal nodes + # Total nodes = 2 * capacity - 1 + self.tree = np.zeros(2 * capacity - 1) + # Current number of stored experiences + self.n_entries = 0 + + def _propagate(self, idx: int, change: float): + """Propagate priority change up the tree iteratively.""" + while idx != 0: + idx = (idx - 1) // 2 + self.tree[idx] += change + + def _retrieve(self, idx: int, s: float) -> int: + """ + Retrieve the leaf index corresponding to a priority value. + + Performs an iterative binary search down the tree. + """ + while True: + left = 2 * idx + 1 + if left >= len(self.tree): + return idx + if s <= self.tree[left]: + idx = left + else: + s -= self.tree[left] + idx = left + 1 + + def total(self) -> float: + """Return the total sum of all priorities (root node value).""" + return self.tree[0] + + def add(self, priority: float, data_idx: int): + """ + Add new experience priority. + + Args: + priority: Priority value for this experience + data_idx: Index in the data buffer + """ + # Calculate tree index for this write position + # Leaf nodes start at index (capacity - 1) + tree_idx = data_idx + self.capacity - 1 + + # Update the tree with the new priority + self.update(tree_idx, priority) + + # Track number of entries + if self.n_entries < self.capacity: + self.n_entries += 1 + + def update(self, idx: int, priority: float): + """ + Update priority for a specific tree index. + + Args: + idx: Tree index to update + priority: New priority value + """ + # Calculate the change in priority + change = priority - self.tree[idx] + + # Update this node + self.tree[idx] = priority + + # Propagate the change up to the root + self._propagate(idx, change) + + def get(self, s: float) -> Tuple[int, float, int]: + """ + Get data index and priority for a cumulative priority value. + + Args: + s: Target cumulative priority value + + Returns: + Tuple of (tree_index, priority, data_index) + """ + tree_idx = self._retrieve(0, s) + data_idx = tree_idx - self.capacity + 1 + + return (tree_idx, self.tree[tree_idx], data_idx) + + +class PrioritizedReplayBuffer(SimpleReplayBuffer): + """ + Prioritized Experience Replay Buffer extending SimpleReplayBuffer. + + This buffer samples experiences based on their TD-error, allowing + the agent to learn more from surprising transitions. Includes + importance sampling weights to correct for the bias introduced + by non-uniform sampling. + + Key features: + - Proportional prioritization: P(i) ∝ p_i^α + - Importance sampling weights: w_i = (N * P(i))^(-β) + - Efficient O(log n) sampling and updates via sum-tree + - Handles both absolute and epsilon-based priorities + - Extends SimpleReplayBuffer for compatibility + """ + + def __init__( + self, + capacity: Optional[int] = None, + alpha: float = 0.6, + beta_start: float = 0.4, + beta_frames: int = 100000, + epsilon: float = 1e-6, + max_priority: float = 1.0, + ): + """ + Initialize the prioritized replay buffer. + + Args: + capacity: Maximum number of experiences to store + alpha: Controls how much prioritization is used (0 = uniform, 1 = full prioritization) + beta_start: Initial value for importance sampling weight exponent + beta_frames: Number of frames over which to anneal beta to 1 + epsilon: Small constant added to priorities to ensure non-zero probability + max_priority: Initial priority for new experiences + """ + if capacity is None: + raise ValueError("Capacity must be specified for PrioritizedReplayBuffer") + + super().__init__(capacity=capacity) + # Replace deque with a fixed-size list so data_idx stays stable + self.buffer = [None] * capacity + self._write_idx = 0 + self._size = 0 + self.tree = SumTree(capacity) + self.alpha = alpha + self.beta = beta_start + self.beta_start = beta_start + self.beta_frames = beta_frames + self.epsilon = epsilon + self.max_priority = max_priority + self.frame = 1 + + def _get_priority(self, td_error: float) -> float: + """ + Convert TD-error to priority. + + Priority is based on the absolute TD-error plus epsilon to ensure + all transitions have non-zero probability of being sampled. + + Args: + td_error: Temporal difference error + + Returns: + Priority value + """ + return (abs(td_error) + self.epsilon) ** self.alpha + + def store( + self, + state, + action, + reward, + next_state, + terminated, + truncated, + td_error: Optional[float] = None, + ): + """ + Store a transition in the buffer with priority. + + Args: + state: Current state + action: Action taken + reward: Reward received + next_state: Next state + terminated: Whether episode terminated + truncated: Whether episode was truncated + td_error: TD-error for priority calculation (if None, uses max_priority) + """ + data_idx = self._write_idx + + if td_error is None: + priority = self.max_priority + else: + priority = self._get_priority(td_error) + self.max_priority = max(self.max_priority, priority) + + self.buffer[data_idx] = (state, action, reward, next_state, terminated, truncated) + self._write_idx = (self._write_idx + 1) % self.capacity + self._size = min(self._size + 1, self.capacity) + + self.tree.add(priority, data_idx) + + def sample( + self, + batch_size: int = 1, + sequential: bool = False, + ) -> Tuple[ + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + ]: + """ + Sample a batch of experiences with importance sampling weights. + + The buffer is divided into batch_size segments, and one sample is drawn + from each segment. This stratified sampling ensures coverage of the + priority distribution. + + Args: + batch_size: Number of experiences to sample + sequential: Ignored for prioritized sampling (always uses priority-based sampling) + + Returns: + Tuple of (states, actions, rewards, next_states, terminated, truncated, + importance_weights, tree_indices) + """ + if sequential: + # Fall back to parent's sequential sampling without priorities + states, actions, rewards, next_states, terminated, truncated = ( + super().sample(batch_size, sequential=True) + ) + # Return uniform weights and dummy indices for sequential sampling + importance_weights = np.ones(batch_size, dtype=np.float32) + tree_indices = np.zeros(batch_size, dtype=np.int32) + return ( + states, + actions, + rewards, + next_states, + terminated, + truncated, + importance_weights, + tree_indices, + ) + + if batch_size > self.size(): + raise ValueError( + f"Batch size ({batch_size}) is greater than buffer size ({self.size()})." + ) + + batch = [] + tree_indices = np.zeros(batch_size, dtype=np.int32) + priorities = np.zeros(batch_size, dtype=np.float32) + + # Divide priority range into batch_size segments + segment_size = self.tree.total() / batch_size + + # Update beta (annealing for importance sampling) + self.beta = min( + 1.0, + self.beta_start + (1.0 - self.beta_start) * self.frame / self.beta_frames, + ) + + for i in range(batch_size): + # Sample uniformly from this segment + a = segment_size * i + b = segment_size * (i + 1) + value = random.uniform(a, b) + + # Get experience corresponding to this priority value + tree_idx, priority, data_idx = self.tree.get(value) + + # Get data from parent's buffer + batch.append(self.buffer[data_idx]) + tree_indices[i] = tree_idx + priorities[i] = priority + + # Calculate importance sampling weights + # w_i = (N * P(i))^(-β) / max_w + sampling_probabilities = priorities / self.tree.total() + importance_weights = (self.size() * sampling_probabilities) ** (-self.beta) + + # Normalize weights by max weight for stability + importance_weights /= importance_weights.max() + + # Unpack batch + states, actions, rewards, next_states, terminated, truncated = zip(*batch) + + # Convert to numpy arrays + states_arr = np.array(states) + actions_arr = np.array(actions) + rewards_arr = np.array(rewards) + next_states_arr = np.array(next_states) + terminated_arr = np.array(terminated) + truncated_arr = np.array(truncated) + + # Ensure proper shapes for 1D observations + if states_arr.ndim == 1: + states_arr = states_arr.reshape(-1, 1) + if next_states_arr.ndim == 1: + next_states_arr = next_states_arr.reshape(-1, 1) + + self.frame += 1 + + return ( + states_arr, + actions_arr, + rewards_arr, + next_states_arr, + terminated_arr, + truncated_arr, + importance_weights, + tree_indices, + ) + + def update_priorities(self, tree_indices: np.ndarray, td_errors: np.ndarray): + """ + Update priorities for sampled experiences based on new TD-errors. + + This is called after training on a batch to update priorities based + on the latest TD-errors. + + Args: + tree_indices: Tree indices of the sampled experiences + td_errors: New TD-errors for these experiences + """ + for idx, td_error in zip(tree_indices, td_errors): + priority = self._get_priority(td_error) + self.tree.update(idx, priority) + self.max_priority = max(self.max_priority, priority) + + def size(self): + return self._size + + def clear(self): + """Clear the buffer and reset to initial state.""" + self.buffer = [None] * self.capacity + self._write_idx = 0 + self._size = 0 + self.tree = SumTree(self.capacity) + self.max_priority = 1.0 +self.frame = 1 diff --git a/src/gridmind/utils/algorithm_util/simple_replay_buffer.py b/src/gridmind/utils/algorithm_util/simple_replay_buffer.py index f4fdf98..3024340 100644 --- a/src/gridmind/utils/algorithm_util/simple_replay_buffer.py +++ b/src/gridmind/utils/algorithm_util/simple_replay_buffer.py @@ -72,16 +72,3 @@ def pop(self, num_elements: int = 1): if num_elements > self.size(): raise ValueError("Number of elements to pop is greater than buffer size.") return [self.buffer.popleft() for _ in range(num_elements)] - - -if __name__ == "__main__": - buffer = SimpleReplayBuffer(None) - buffer.store(np.array([0, 0, 0]), 0, 1, np.array([1, 1, 1]), False, False) - buffer.store(np.array([2, 2, 2]), 1, 0.5, np.array([3, 3, 3]), True, False) - batch = buffer.sample(2) - print(batch) - print(buffer.size()) - batch_2 = buffer.sample(2, sequential=True) - print(batch_2) - buffer.clear() - print(buffer.size()) diff --git a/src/gridmind/utils/evo_util/selection.py b/src/gridmind/utils/evo_util/selection.py index a3565f1..16e9681 100644 --- a/src/gridmind/utils/evo_util/selection.py +++ b/src/gridmind/utils/evo_util/selection.py @@ -67,15 +67,3 @@ def random_selection(population: List[NeuroAgent], num_selection: int = 1): selected = random.sample(population, num_selection) return selected - - -if __name__ == "__main__": - agents = [ - NeuroAgent(fitness=None), - NeuroAgent(fitness=1.0), - NeuroAgent(fitness=3.0), - ] - - s = Selection.fitness_proportionate_selection(agents, 10) - - print([a.name for a in s]) diff --git a/src/gridmind/utils/logtools/null_logger.py b/src/gridmind/utils/logtools/null_logger.py new file mode 100644 index 0000000..27a1a3f --- /dev/null +++ b/src/gridmind/utils/logtools/null_logger.py @@ -0,0 +1,8 @@ +class NullWriter: + """No-op logger used when write_summary=False or torch is unavailable.""" + + def add_scalar(self, *args, **kwargs): + pass + + def close(self): + pass diff --git a/src/gridmind/utils/vis_util.py b/src/gridmind/utils/vis_util.py index 95211ec..0b63091 100644 --- a/src/gridmind/utils/vis_util.py +++ b/src/gridmind/utils/vis_util.py @@ -196,14 +196,3 @@ def load_video_as_tensor( del video_frames gc.collect() return None - - -if __name__ == "__main__": - # Example usage - feature1 = [0, 0, 1, 1] - feature2 = [0, 1, 0, 1] - state_values = [1.0, 0.5, 0.8, 0.2] - - print_value_table( - feature1, feature2, state_values, feature1_name="X-axis", feature2_name="Y-axis" - ) diff --git a/src/gridmind/wrappers/env_wrappers/frozenlake_env_wrapper.py b/src/gridmind/wrappers/env_wrappers/frozenlake_env_wrapper.py index a56f003..c5932a0 100644 --- a/src/gridmind/wrappers/env_wrappers/frozenlake_env_wrapper.py +++ b/src/gridmind/wrappers/env_wrappers/frozenlake_env_wrapper.py @@ -121,25 +121,3 @@ def reset(self): ).astype(float) return observation_encoded, info - - -if __name__ == "__main__": - env = FrozenLakeEnvWrapper(render_mode="human") - - observation = env.reset() - done = False - - while not done: - env.render() - action = env.action_space.sample() - ( - observation, - reward, - terminated, - truncated, - _, - ) = env.step(action) - done = terminated or truncated - print(f"Observation: {observation}, Reward: {reward}, Done: {done}") - - env.close() diff --git a/src/gridmind/wrappers/env_wrappers/grid_discretization_wrapper.py b/src/gridmind/wrappers/env_wrappers/grid_discretization_wrapper.py new file mode 100644 index 0000000..e52a449 --- /dev/null +++ b/src/gridmind/wrappers/env_wrappers/grid_discretization_wrapper.py @@ -0,0 +1,273 @@ +"""Grid-based discretization wrapper for continuous observation spaces. + +This module provides a simple, robust wrapper that discretizes continuous +observations into discrete states using uniform grid binning. +""" + +from typing import Union + +import gymnasium +import numpy as np +from gymnasium.spaces import Box, Discrete + +from gridmind.wrappers.env_wrappers.base_gym_wrapper import BaseGymWrapper + + +class GridDiscretizationWrapper(BaseGymWrapper): + """Discretizes continuous observation spaces using uniform grid binning. + + This wrapper transforms a continuous Box observation space into a discrete + state space by dividing each dimension into uniform bins. Multi-dimensional + observations are mapped to a single discrete state using row-major ordering. + + Algorithm: + For each dimension: + 1. Optionally clip observation to [low, high] bounds + 2. Normalize to [0, 1]: (obs - low) / (high - low) + 3. Multiply by num_bins and floor: int(normalized * num_bins) + 4. Clip bin index to [0, num_bins-1] + 5. Convert multi-dimensional bin indices to single discrete state + + State Mapping: + Uses row-major ordering to convert multi-dimensional bin indices to a + single integer state. For example, with bins_per_dim=[20, 15]: + state = bin_indices[0] * 15 + bin_indices[1] + Total states = 20 * 15 = 300 + + Parameters + ---------- + env : gymnasium.Env + The environment to wrap. Must have a Box observation space. + bins_per_dim : int or list of int + Number of bins per dimension. If int, uses same number of bins for all + dimensions. If list, must match observation space dimensionality. + clip : bool, optional + Whether to clip observations to [low, high] before discretizing. + Default: True. Recommended to prevent out-of-bounds issues. + + Raises + ------ + TypeError + If the observation space is not Box (continuous). + ValueError + If bins_per_dim dimensions don't match observation space, or if + bins_per_dim contains values <= 0. + + Examples + -------- + Basic usage with MountainCar (2D observation space): + + >>> import gymnasium as gym + >>> env = gym.make("MountainCar-v0") + >>> # Use 20 bins for both position and velocity + >>> wrapped = GridDiscretizationWrapper(env, bins_per_dim=20) + >>> print(wrapped.observation_space) + Discrete(400) # 20 * 20 = 400 states + + Different bins per dimension: + + >>> wrapped = GridDiscretizationWrapper(env, bins_per_dim=[20, 15]) + >>> print(wrapped.observation_space) + Discrete(300) # 20 * 15 = 300 states + + Usage in training: + + >>> obs, info = wrapped.reset() + >>> print(type(obs)) + # Discrete state, not continuous array + >>> action = wrapped.action_space.sample() + >>> obs, reward, terminated, truncated, info = wrapped.step(action) + + Attributes + ---------- + observation_space : gymnasium.spaces.Discrete + The discretized observation space with n = product of all bins_per_dim. + bins_per_dim : np.ndarray + Array of bin counts per dimension. + clip : bool + Whether observations are clipped to bounds. + obs_low : np.ndarray + Lower bounds of the original observation space. + obs_high : np.ndarray + Upper bounds of the original observation space. + obs_shape : tuple + Shape of the original observation space. + """ + + def __init__( + self, + env: gymnasium.Env, + bins_per_dim: Union[int, list[int]], + clip: bool = True, + ): + """Initialize the grid discretization wrapper. + + Parameters + ---------- + env : gymnasium.Env + The environment to wrap. + bins_per_dim : int or list of int + Number of bins per dimension. + clip : bool, optional + Whether to clip observations to bounds, by default True. + """ + super().__init__(env) + + # Validate observation space is continuous (Box) + if not isinstance(env.observation_space, Box): + raise TypeError( + f"GridDiscretizationWrapper requires Box observation space, " + f"got {type(env.observation_space).__name__}" + ) + + self.obs_low = env.observation_space.low + self.obs_high = env.observation_space.high + self.obs_shape = env.observation_space.shape + self.clip = clip + + # Handle bins_per_dim parameter + if isinstance(bins_per_dim, int): + # Use same bins for all dimensions + if bins_per_dim <= 0: + raise ValueError(f"bins_per_dim must be positive, got {bins_per_dim}") + self.bins_per_dim = np.full(self.obs_shape[0], bins_per_dim, dtype=np.int32) + else: + # Use specific bins per dimension + bins_array = np.array(bins_per_dim, dtype=np.int32) + if len(bins_array) != self.obs_shape[0]: + raise ValueError( + f"bins_per_dim length ({len(bins_array)}) must match " + f"observation dimensionality ({self.obs_shape[0]})" + ) + if np.any(bins_array <= 0): + raise ValueError( + f"All bins_per_dim values must be positive, got {bins_array}" + ) + self.bins_per_dim = bins_array + + # Compute total number of discrete states (product of all bins) + self.total_states = int(np.prod(self.bins_per_dim)) + + # Create new discrete observation space + self.observation_space = Discrete(self.total_states) + + # Precompute bin ranges for efficiency + # Each dimension divided into equal-width bins + self.bin_widths = (self.obs_high - self.obs_low) / self.bins_per_dim + + # Precompute multipliers for row-major state index calculation + # For [n1, n2, n3] bins: multipliers = [n2*n3, n3, 1] + self.state_multipliers = np.zeros(len(self.bins_per_dim), dtype=np.int32) + self.state_multipliers[-1] = 1 + for i in range(len(self.bins_per_dim) - 2, -1, -1): + self.state_multipliers[i] = ( + self.state_multipliers[i + 1] * self.bins_per_dim[i + 1] + ) + + def _discretize_observation(self, observation: np.ndarray) -> int: + """Convert continuous observation to discrete state index. + + Parameters + ---------- + observation : np.ndarray + Continuous observation from the environment. + + Returns + ------- + int + Discrete state index in range [0, total_states-1]. + """ + # Step 1: Optionally clip to bounds + if self.clip: + obs = np.clip(observation, self.obs_low, self.obs_high) + else: + obs = observation + + # Step 2: Normalize to [0, 1] for each dimension + # Handle potential division by zero for infinite bounds + with np.errstate(divide="ignore", invalid="ignore"): + normalized = (obs - self.obs_low) / (self.obs_high - self.obs_low) + # Replace NaN/Inf with 0 (happens when low == high or infinite bounds) + normalized = np.nan_to_num(normalized, nan=0.0, posinf=1.0, neginf=0.0) + + # Step 3: Convert to bin indices + # Multiply by num_bins and floor, then clip to valid range + bin_indices = (normalized * self.bins_per_dim).astype(np.int32) + bin_indices = np.clip(bin_indices, 0, self.bins_per_dim - 1) + + # Step 4: Convert multi-dimensional bin indices to single state index + # Using row-major ordering: state = sum(bin_indices[i] * multipliers[i]) + state = int(np.dot(bin_indices, self.state_multipliers)) + + return state + + def reset(self, **kwargs): + """Reset the environment and discretize the initial observation. + + Parameters + ---------- + **kwargs + Additional keyword arguments passed to the wrapped environment's reset. + + Returns + ------- + int + Discrete state index. + dict + Info dictionary from the environment. + """ + observation, info = self.env.reset(**kwargs) + discrete_state = self._discretize_observation(observation) + return discrete_state, info + + def step(self, action): + """Execute action and discretize the resulting observation. + + Parameters + ---------- + action + Action to execute in the environment. + + Returns + ------- + int + Discrete state index. + float + Reward received. + bool + Whether the episode terminated. + bool + Whether the episode was truncated. + dict + Info dictionary from the environment. + """ + observation, reward, terminated, truncated, info = self.env.step(action) + discrete_state = self._discretize_observation(observation) + return discrete_state, reward, terminated, truncated, info + + def get_observation_space(self): + """Get the discretized observation space. + + Returns + ------- + gymnasium.spaces.Discrete + Discrete observation space with n = product of all bins_per_dim. + """ + return self.observation_space + + def discretize_observation(self, observation: np.ndarray) -> int: + """Public method to discretize a continuous observation. + + Useful for testing or manual discretization. + + Parameters + ---------- + observation : np.ndarray + Continuous observation to discretize. + + Returns + ------- + int + Discrete state index. + """ + return self._discretize_observation(observation) diff --git a/src/gridmind/wrappers/value_estimator_wrappers/q_network_to_state_value_estimator_wrapper.py b/src/gridmind/wrappers/value_estimator_wrappers/q_network_to_state_value_estimator_wrapper.py new file mode 100644 index 0000000..3be3a5c --- /dev/null +++ b/src/gridmind/wrappers/value_estimator_wrappers/q_network_to_state_value_estimator_wrapper.py @@ -0,0 +1,40 @@ +from gridmind.policies.base_policy import BasePolicy +from gridmind.value_estimators.action_value_estimators.q_network import QNetwork +import torch + + +class QNetworkToStateValueEstimatorWrapper: + def __init__(self, q_network: QNetwork, policy:BasePolicy): + self.q_network = q_network + self.policy = policy + + def forward(self, x): + q_values = self.q_network.forward(x) + with torch.no_grad(): + action_probs = self.policy.get_all_action_probabilities(x) + state_value = torch.sum(action_probs * q_values, dim=-1) + return state_value + + def set_q_network(self, q_network: QNetwork): + self.q_network = q_network + + def set_policy(self, policy: BasePolicy): + self.policy = policy + + +if __name__ == "__main__": + from gridmind.policies.parameterized.discrete_action_mlp_policy import DiscreteActionMLPPolicy + + observation_shape = (4,) + num_actions = 2 + + q_network = QNetwork(observation_shape=observation_shape, num_hidden_layers=2, num_actions=num_actions) + policy = DiscreteActionMLPPolicy(observation_shape=observation_shape, num_actions=num_actions) + + wrapper = QNetworkToStateValueEstimatorWrapper(q_network=q_network, policy=policy) + + sample_input = torch.randn(1, *observation_shape) + state_value = wrapper.forward(sample_input) + + print("Estimated State Value:", state_value.item()) + \ No newline at end of file diff --git a/tests/test_actor_critic.py b/tests/test_actor_critic.py index 7e5346e..cd4bee8 100644 --- a/tests/test_actor_critic.py +++ b/tests/test_actor_critic.py @@ -6,7 +6,7 @@ def test_actor_critic_no_exceptions(): env = gym.make("CartPole-v1") - algorithm = ActorCritic(env=env) + algorithm = ActorCritic(env=env, write_summary=False) try: algorithm.train_episodes( diff --git a/tests/test_deep_q_learning.py b/tests/test_deep_q_learning.py index d9a2a1e..76d3eec 100644 --- a/tests/test_deep_q_learning.py +++ b/tests/test_deep_q_learning.py @@ -5,7 +5,7 @@ def test_deep_q_learning_no_exceptions(): env = gym.make("CartPole-v1") - algorithm = DeepQLearning(env=env) + algorithm = DeepQLearning(env=env, write_summary=False) try: algorithm.train_episodes( num_episodes=10, prediction_only=False, save_policy=False diff --git a/tests/test_neuroevolution.py b/tests/test_neuroevolution.py index 743655c..aa831e7 100644 --- a/tests/test_neuroevolution.py +++ b/tests/test_neuroevolution.py @@ -5,7 +5,7 @@ def test_sarsa_no_exceptions(): env = gym.make("CartPole-v1") - algorithm = NeuroEvolution(env=env) + algorithm = NeuroEvolution(env=env, write_summary=False) try: algorithm.train(num_generations=10) diff --git a/tests/test_ppo.py b/tests/test_ppo.py index 8bed019..73d6abe 100644 --- a/tests/test_ppo.py +++ b/tests/test_ppo.py @@ -6,7 +6,7 @@ def test_ppo_no_exceptions(): env = gym.make("CartPole-v1") - algorithm = PPO(env=env) + algorithm = PPO(env=env, write_summary=False) try: algorithm.train_episodes( diff --git a/tests/test_sarsa.py b/tests/test_sarsa.py index 247f80f..5b34060 100644 --- a/tests/test_sarsa.py +++ b/tests/test_sarsa.py @@ -5,7 +5,7 @@ def test_sarsa_no_exceptions(): env = gym.make("CartPole-v1") - algorithm = SARSA(env=env) + algorithm = SARSA(env=env, write_summary=False) try: algorithm.train_episodes(