fix: constrained decoding cache bug, task rotation, add trainer tests (#199)

abrichr · claude · web-flow · commit 4ec7d5174e1c · 2026-03-28T14:46:11.000-04:00
Constrained decoding:
- Remove (.|\n)* prefix from action regex — Outlines can't compile
  it into a DFA efficiently. Model must output action directly.
- Fix cache sentinel: use False for failure (not []) so subsequent
  calls correctly return None instead of an empty logits_processor list.
  Prior bug: [] cached as "success" → model generated unconstrained.
- Upgrade warning to error level for visibility.

Task rotation:
- Fix _load_task_configs: check `not task_ids` once BEFORE the loop
  (was checking inside loop — only first task ever appended).

Tests (21 new):
- TestActionRegex: 8 valid actions match, 6 invalid texts rejected
- TestConstrainedDecodingCache: sentinel logic, regression for [] bug
- TestTaskRotation: all tasks loaded, explicit ids preserved, rotation

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/openadapt_evals/training/standalone/trainer.py b/openadapt_evals/training/standalone/trainer.py
@@ -90,59 +90,70 @@ def __init__(
 
     # --- Constrained decoding -------------------------------------------
 
-    # Regex that matches ALL valid action formats.  Allows a free-form
-    # "Thought: ..." prefix (the model's chain-of-thought) followed by
-    # exactly one action.  Outlines converts this to a token-level DFA.
+    # Regex matching valid action formats.  No free-text prefix — the
+    # model MUST output an action as its very first token.  This is
+    # intentional: constrained decoding forces structured output.
+    # If the model needs chain-of-thought, disable constrained_decoding
+    # and rely on prompt instructions instead.
     _ACTION_REGEX = (
-        r"(.|\n)*"  # allow any Thought prefix
-        r"(CLICK\(x=0\.\d{1,3},\s*y=0\.\d{1,3}\)"
+        r"CLICK\(x=0\.\d{1,3},\s*y=0\.\d{1,3}\)"
         r'|TYPE\(text="[^"]{0,200}"\)'
         r"|WAIT\(\)"
-        r"|DONE\(\))"
+        r"|DONE\(\)"
     )
+    # Sentinel: None = not yet attempted, list = success, False = failed
     _constrained_processor_cache: Any = None
 
     def _get_constrained_logits_processor(self) -> list | None:
         """Build an Outlines RegexLogitsProcessor for the action format.
 
         Returns a ``[LogitsProcessor]`` list suitable for passing to
         ``model.generate(logits_processor=...)``, or ``None`` if Outlines
-        is not installed.
+        is not installed or compilation fails.
 
         The processor is cached after first creation (the DFA compilation
         is expensive — ~2 seconds — but only happens once).
         """
-        if self._constrained_processor_cache is not None:
+        # Already attempted and failed
+        if self._constrained_processor_cache is False:
+            return None
+        # Already compiled successfully
+        if isinstance(self._constrained_processor_cache, list):
             return self._constrained_processor_cache
 
         try:
             from outlines.processors import RegexLogitsProcessor
+            tokenizer = (
+                self._processor.tokenizer
+                if hasattr(self._processor, "tokenizer")
+                else self._processor
+            )
             processor = RegexLogitsProcessor(
                 self._ACTION_REGEX,
-                tokenizer=self._processor.tokenizer
-                if hasattr(self._processor, "tokenizer")
-                else self._processor,
+                tokenizer=tokenizer,
             )
             self._constrained_processor_cache = [processor]
             logger.info(
                 "Outlines constrained decoding enabled "
-                "(action format regex compiled)"
+                "(action format regex compiled successfully)"
             )
             return self._constrained_processor_cache
         except ImportError:
-            logger.warning(
+            logger.error(
                 "constrained_decoding=True but 'outlines' is not installed. "
                 "Install with: pip install outlines>=0.1.0"
             )
-            self._constrained_processor_cache = []  # don't retry
+            self._constrained_processor_cache = False
             return None
         except Exception as exc:
-            logger.warning(
+            logger.error(
                 "Outlines RegexLogitsProcessor creation failed: %s. "
-                "Falling back to unconstrained generation.",
+                "Falling back to unconstrained generation. "
+                "This may be a tokenizer compatibility issue — try "
+                "updating outlines: pip install -U outlines",
                 exc,
             )
-            self._constrained_processor_cache = []
+            self._constrained_processor_cache = False
             return None
 
     # --- Task loading -----------------------------------------------------
@@ -156,9 +167,10 @@ def _load_task_configs(self) -> None:
         if not task_dir.exists():
             logger.warning("Task dir not found: %s", task_dir)
             return
+        auto_populate = not self._config.task_ids
         for tc in TaskConfig.from_dir(str(task_dir)):
             self._task_configs[tc.id] = tc
-            if not self._config.task_ids:
+            if auto_populate:
                 self._config.task_ids.append(tc.id)
         logger.info("Loaded %d task configs from %s", len(self._task_configs), task_dir)
 
diff --git a/tests/test_standalone_trainer.py b/tests/test_standalone_trainer.py
@@ -0,0 +1,159 @@
+"""Tests for the standalone GRPO trainer.
+
+Covers constrained decoding logic, task rotation, and config handling.
+No GPU or WAA server required — tests use mocks.
+"""
+
+from __future__ import annotations
+
+import re
+
+import pytest
+
+from openadapt_evals.training.standalone.config import TrainingConfig
+from openadapt_evals.training.standalone.trainer import GRPOTrainer
+
+
+# ---------------------------------------------------------------------------
+# Action regex tests
+# ---------------------------------------------------------------------------
+
+
+class TestActionRegex:
+    """Verify the action format regex matches valid actions and rejects junk."""
+
+    regex = GRPOTrainer._ACTION_REGEX
+
+    @pytest.mark.parametrize(
+        "action",
+        [
+            "CLICK(x=0.50, y=0.30)",
+            "CLICK(x=0.0, y=0.0)",
+            "CLICK(x=0.999, y=0.123)",
+            'TYPE(text="hello world")',
+            'TYPE(text="")',
+            'TYPE(text="notepad")',
+            "WAIT()",
+            "DONE()",
+        ],
+    )
+    def test_valid_actions_match(self, action: str) -> None:
+        assert re.match(self.regex, action), f"Expected match: {action!r}"
+
+    @pytest.mark.parametrize(
+        "text",
+        [
+            "** Let me think about this...",
+            "1. Analyze the user's goal",
+            "The user wants to open Task Manager",
+            "",
+            "CLICK",
+            "click(0.5, 0.3)",
+        ],
+    )
+    def test_invalid_text_rejected(self, text: str) -> None:
+        assert not re.match(self.regex, text), f"Should NOT match: {text!r}"
+
+
+# ---------------------------------------------------------------------------
+# Constrained decoding cache tests
+# ---------------------------------------------------------------------------
+
+
+class TestConstrainedDecodingCache:
+    """Test the caching logic for the Outlines logits processor."""
+
+    def test_cache_starts_as_none(self) -> None:
+        config = TrainingConfig()
+        trainer = GRPOTrainer(config)
+        assert trainer._constrained_processor_cache is None
+
+    def test_failed_cache_returns_none(self) -> None:
+        """When compilation fails, subsequent calls return None (not [])."""
+        config = TrainingConfig(constrained_decoding=True)
+        trainer = GRPOTrainer(config)
+        # Simulate a failed compilation
+        trainer._constrained_processor_cache = False
+        result = trainer._get_constrained_logits_processor()
+        assert result is None
+
+    def test_successful_cache_returns_list(self) -> None:
+        """When compilation succeeds, subsequent calls return the list."""
+        config = TrainingConfig(constrained_decoding=True)
+        trainer = GRPOTrainer(config)
+        # Simulate a successful compilation
+        trainer._constrained_processor_cache = ["mock_processor"]
+        result = trainer._get_constrained_logits_processor()
+        assert result == ["mock_processor"]
+
+    def test_empty_list_no_longer_caches_as_success(self) -> None:
+        """Regression test: empty list [] should NOT be treated as success.
+
+        Prior bug: failure cached [], which passes the `is not None` check,
+        so subsequent calls returned [] (no processors applied).
+        """
+        config = TrainingConfig(constrained_decoding=True)
+        trainer = GRPOTrainer(config)
+        # The old buggy behavior would cache [] on failure
+        # Verify the sentinel is False (not []) for failures
+        trainer._constrained_processor_cache = False
+        assert trainer._get_constrained_logits_processor() is None
+        # And a non-empty list is a valid success cache (with a processor in it)
+        trainer._constrained_processor_cache = ["real_processor"]
+        assert trainer._get_constrained_logits_processor() == ["real_processor"]
+
+
+# ---------------------------------------------------------------------------
+# Task rotation tests
+# ---------------------------------------------------------------------------
+
+
+class TestTaskRotation:
+    """Test that all tasks from task_dir are loaded, not just the first."""
+
+    def test_all_tasks_loaded_from_dir(self, tmp_path) -> None:
+        """Create multiple task YAMLs and verify all are loaded."""
+        import yaml
+
+        for i in range(3):
+            task = {
+                "name": f"Task {i}",
+                "id": f"task-{i}",
+                "setup": [],
+                "evaluate": [{"check": "screenshot", "description": "done"}],
+            }
+            (tmp_path / f"task_{i}.yaml").write_text(yaml.dump(task))
+
+        config = TrainingConfig(task_dir=str(tmp_path))
+        trainer = GRPOTrainer(config)
+        trainer._load_task_configs()
+
+        assert len(config.task_ids) == 3
+        assert set(config.task_ids) == {"task-0", "task-1", "task-2"}
+
+    def test_explicit_task_ids_not_overwritten(self, tmp_path) -> None:
+        """When task_ids is set explicitly, task_dir doesn't override it."""
+        import yaml
+
+        for i in range(3):
+            task = {"name": f"Task {i}", "id": f"task-{i}", "setup": [], "evaluate": []}
+            (tmp_path / f"task_{i}.yaml").write_text(yaml.dump(task))
+
+        config = TrainingConfig(
+            task_dir=str(tmp_path),
+            task_ids=["task-1"],  # explicit
+        )
+        trainer = GRPOTrainer(config)
+        trainer._load_task_configs()
+
+        # Should keep the explicit list, not auto-populate
+        assert config.task_ids == ["task-1"]
+        # But task_configs should still have all 3 loaded (for setup/eval)
+        assert len(trainer._task_configs) == 3
+
+    def test_task_rotation_in_training_loop(self) -> None:
+        """Verify step % len(task_ids) produces rotation."""
+        task_ids = ["a", "b", "c"]
+        num_steps = 9
+        selected = [task_ids[step % len(task_ids)] for step in range(num_steps)]
+        assert selected == ["a", "b", "c", "a", "b", "c", "a", "b", "c"]