Skip to content

Commit 65196b4

Browse files
theomgdev and claude committed
Refactor state management: internalize recurrent state into model (v2.4.1)
Previously, callers were responsible for carrying `final_state` between train_batch calls and passing it back as `initial_state`. This created fragile boilerplate in experiment_llm.py and leaked an implementation detail into the public API. Changes: - OdyssNet now always persists `self.state` after every forward pass, not only when Hebbian learning is active - `train_batch` drops `initial_state` / `return_state` in favour of a single `keep_state` flag; callers no longer hold state tensors - experiment_llm.py TBPTT loop updated to use `keep_state=(t_start > 0)` - generate() uses `model.reset_state()` before warm-up instead of threading a state variable through the function - Tests updated to assert `model.state` directly and cover the new API - Add `.claude/` to .gitignore Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 5e0490f commit 65196b4

6 files changed

Lines changed: 34 additions & 46 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Thumbs.db
4242
# AI
4343
CLAUDE.md
4444
GEMINI.md
45+
.claude/
4546

4647
# Migrations & Plans
4748
*[mM]igration*.md

examples/advanced/experiment_llm.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
# OPTIMIZER CONFIG
6363
RESET_OPTIMIZER_ON_LOAD = False
6464
OVERWRITE_LR_OF_CKPT = True
65-
LEARNING_RATE = 1e-4
65+
LEARNING_RATE = 1e-5
6666

6767
# TIE EMBEDDINGS (VRAM Saving & Parameter Sharing)
6868
TIE_EMBEDDINGS = False
@@ -213,14 +213,13 @@ def generate(model, tokenizer, start_str="The", length=None, temperature=0.8, to
213213
encoded = tokenizer.encode(start_str)
214214
input_seq = encoded.ids
215215

216-
current_state = None
217-
218216
# Warm up state (Native Thinking)
217+
model.reset_state(batch_size=1)
219218
x_in = torch.tensor(input_seq, dtype=torch.long, device=model.device).unsqueeze(0)
220219
steps_total = x_in.shape[1] * (THINK_GAP + 1)
221-
220+
222221
with torch.no_grad():
223-
_, current_state = model(x_in, steps=steps_total)
222+
model(x_in, steps=steps_total)
224223

225224
last_token_idx = input_seq[-1]
226225

@@ -230,7 +229,7 @@ def generate(model, tokenizer, start_str="The", length=None, temperature=0.8, to
230229

231230
x_next = torch.tensor([[last_token_idx]], dtype=torch.long, device=model.device)
232231

233-
preds, current_state = model(x_next, steps=total_step_single, current_state=current_state)
232+
preds, _ = model(x_next, steps=total_step_single)
234233

235234
logits = preds[0, 0, :]
236235

@@ -293,7 +292,8 @@ def initialize_system(vocab_size, num_neurons, device, input_count=-1, output_co
293292
vocab_size=vocab_size,
294293
vocab_mode='discrete',
295294
tie_embeddings=TIE_EMBEDDINGS,
296-
debug=debug
295+
debug=debug,
296+
hebb_type='synapse'
297297
)
298298

299299
trainer = OdyssNetTrainer(
@@ -608,34 +608,29 @@ def flatten_logits(out):
608608
total_thinking_steps = seq_len * (THINK_GAP + 1)
609609

610610
if TRUNCATED_BPTT_SEQ_LEN != -1 and TRUNCATED_BPTT_SEQ_LEN > 0:
611-
current_state = None
612611
batch_loss = 0
613612
steps_count = 0
614-
613+
615614
chunk_len = TRUNCATED_BPTT_SEQ_LEN
616-
615+
617616
for t_start in range(0, seq_len, chunk_len):
618617
t_end = min(t_start + chunk_len, seq_len)
619-
620-
# Extract sequence chunk
618+
621619
x_chunk = x[:, t_start:t_end]
622620
y_chunk_flat = y[:, t_start:t_end].reshape(-1)
623-
624-
# Thinking steps for the current chunk
621+
625622
actual_tokens = t_end - t_start
626623
chunk_thinking_steps = actual_tokens * (THINK_GAP + 1)
627624

628-
loss, current_state = trainer.train_batch(
625+
loss = trainer.train_batch(
629626
x_chunk,
630627
y_chunk_flat,
631628
thinking_steps=chunk_thinking_steps,
632629
full_sequence=True,
633630
output_transform=flatten_logits,
634-
initial_state=current_state,
635-
return_state=True
631+
keep_state=(t_start > 0),
636632
)
637633

638-
current_state = current_state.detach()
639634
batch_loss += loss
640635
steps_count += 1
641636

odyssnet/core/network.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -696,9 +696,10 @@ def _single_step(h_t_in, t_idx, x_input_info, hebb_W_contrib, hebb_mem_contrib):
696696
if return_sequence and (t + 1) % ratio == 0 and len(outputs) < max_outputs:
697697
outputs.append(h_t)
698698

699-
# Persist accumulated Hebbian state for the next forward call.
700-
if self.hebb_type is not None:
701-
with torch.no_grad():
699+
# Persist the recurrent state and Hebbian correlations for the next forward call.
700+
with torch.no_grad():
701+
self.state = h_t.detach()
702+
if self.hebb_type is not None:
702703
self.hebb_state_W.copy_(local_hebb_W.detach())
703704
self.hebb_state_mem.copy_(local_hebb_mem.detach())
704705

odyssnet/training/trainer.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def load_state_dict(self, state):
190190
continue
191191
self._persistent_grads[id(param)] = persisted.to(device=param.device, dtype=param.dtype)
192192

193-
def train_batch(self, input_features, target_values, thinking_steps, gradient_accumulation_steps=1, full_sequence=False, mask=None, output_transform=None, initial_state=None, return_state=False):
193+
def train_batch(self, input_features, target_values, thinking_steps, gradient_accumulation_steps=1, full_sequence=False, mask=None, output_transform=None, keep_state=False):
194194
"""
195195
Runs a single training step on a batch.
196196
"""
@@ -227,16 +227,12 @@ def train_batch(self, input_features, target_values, thinking_steps, gradient_ac
227227

228228
# Forward Pass (with AMP)
229229
with self._get_autocast_ctx():
230-
# Use initial_state if provided, otherwise reset
231-
if initial_state is not None:
232-
current_state_in = initial_state
233-
else:
230+
if not keep_state:
234231
self.model.reset_state(batch_size)
235-
current_state_in = None
236232

237-
all_states, final_state = self.model(x_input, steps=thinking_steps, current_state=current_state_in, return_sequence=full_sequence)
233+
all_states, h_t = self.model(x_input, steps=thinking_steps, return_sequence=full_sequence)
238234

239-
predicted_outputs = self._extract_outputs(all_states, final_state, full_sequence)
235+
predicted_outputs = self._extract_outputs(all_states, h_t, full_sequence)
240236

241237
# Optional Transform
242238
if output_transform:
@@ -345,8 +341,6 @@ def train_batch(self, input_features, target_values, thinking_steps, gradient_ac
345341
else:
346342
self._plateau_hook_triggered = False
347343

348-
if return_state:
349-
return loss_val, final_state
350344
return loss_val
351345

352346
def predict(self, input_features, thinking_steps, full_sequence=False):

tests/core/test_network.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,9 @@ def test_dropout_disabled_in_eval(self):
254254
model = OdyssNet(num_neurons=4, input_ids=[0], output_ids=[3], device="cpu", dropout_rate=0.5)
255255
model.eval()
256256
x = torch.randn(4, 4)
257+
model.reset_state(batch_size=4)
257258
out1, _ = model(x, steps=3)
259+
model.reset_state(batch_size=4)
258260
out2, _ = model(x, steps=3)
259261
assert torch.allclose(out1, out2), "Eval mode must produce deterministic outputs"
260262

tests/training/test_trainer.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -175,34 +175,29 @@ def test_synaptic_noise_applies_without_error(self):
175175
loss = t.train_batch(x, y, thinking_steps=2)
176176
assert isinstance(loss, float)
177177

178-
def test_return_state_flag(self):
178+
def test_state_persisted_after_train_batch(self):
179179
model = _model()
180180
t = _trainer(model)
181181
x = _batch()
182182
y = _targets()
183-
result = t.train_batch(x, y, thinking_steps=2, return_state=True)
184-
assert isinstance(result, tuple)
185-
loss, state = result
183+
loss = t.train_batch(x, y, thinking_steps=2)
186184
assert isinstance(loss, float)
187-
assert state.shape == (4, model.num_neurons)
185+
assert model.state.shape == (4, model.num_neurons)
188186

189-
def test_tbptt_chained_initial_state(self):
190-
# experiment_llm.py: return_state=True feeds the final state back as
191-
# initial_state for the next chunk (Truncated BPTT).
187+
def test_tbptt_keep_state(self):
188+
# keep_state=True carries model.state across chunks without reset (Truncated BPTT).
192189
model = _model()
193190
t = _trainer(model)
194191
x = _batch()
195192
y = _targets()
196193

197-
loss1, state1 = t.train_batch(x, y, thinking_steps=2, return_state=True)
198-
state1 = state1.detach()
194+
loss1 = t.train_batch(x, y, thinking_steps=2)
195+
assert isinstance(loss1, float)
199196

200-
# Second chunk starts from where the first chunk ended
201-
loss2, state2 = t.train_batch(
202-
x, y, thinking_steps=2, initial_state=state1, return_state=True
203-
)
197+
# Second chunk continues from where the first chunk ended
198+
loss2 = t.train_batch(x, y, thinking_steps=2, keep_state=True)
204199
assert isinstance(loss2, float)
205-
assert state2.shape == (4, model.num_neurons)
200+
assert model.state.shape == (4, model.num_neurons)
206201

207202
def test_output_transform_applied(self):
208203
# convergence_mnist_reverse_record.py uses output_transform to slice warmup

0 commit comments

Comments (0)