fix(prompt_coevolution): fix 5 co-evolution coupling bugs (Amendment #4)

KhrulkovV · claude · KhrulkovV · commit bb8de98933f3 · 2026-03-16T19:18:52.000+03:00
1. prompt_id mismatch: write side used sha256(UUID), read side sha256(text) — never matched. Both now use prompt_text_to_id().
2. Dead archive: prompts with fewer than min_trials trials now get a default fitness of 0.01 (was 0.0, which prevented archive entry).
3. Champion-only selection: stochastic fitness-proportional sampling replaces single-best.
4. Timing mismatch: MainRunSyncHook blocks the prompt engine until the main run advances by 1 generation, or until its timeout (600 s by default) elapses.
5. pre_step_hook added to EvolutionEngine (optional, null by default).

All prior run data discarded (X1/X2 ran 4/3 gens on fallback prompts, no co-evolution occurred).
DBs 4-7 flushed. Full restart required.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/config/constants/evolution.yaml b/config/constants/evolution.yaml
@@ -8,3 +8,4 @@ num_parents: 2
 mutation_mode: rewrite
 max_generations: null
 strip_comments_and_docstrings: false
+pre_step_hook: null
diff --git a/config/evolution/default.yaml b/config/evolution/default.yaml
@@ -40,3 +40,4 @@ evolution_engine:
   config: ${engine_config}
   writer: ${writer}
   metrics_tracker: ${metrics_tracker}
+  pre_step_hook: ${pre_step_hook}
diff --git a/config/pipeline/prompt_evolution.yaml b/config/pipeline/prompt_evolution.yaml
@@ -14,6 +14,16 @@
 main_redis_db: ???
 main_redis_prefix: ???
 
+# Sync hook: blocks prompt run engine until main run advances by 1 gen
+pre_step_hook:
+  _target_: gigaevo.prompts.coevolution.sync.MainRunSyncHook
+  host: ${redis.host}
+  port: ${redis.port}
+  db: ${main_redis_db}
+  prefix: ${main_redis_prefix}
+  timeout: 600.0
+  poll_interval: 5.0
+
 prompt_stats_provider:
   _target_: gigaevo.prompts.coevolution.stats.RedisPromptStatsProvider
   host: ${redis.host}
diff --git a/gigaevo/evolution/engine/core.py b/gigaevo/evolution/engine/core.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+from collections.abc import Awaitable, Callable
 import contextlib
 from typing import TYPE_CHECKING
 
@@ -49,6 +50,7 @@ def __init__(
         config: EngineConfig,
         writer: LogWriter,
         metrics_tracker: MetricsTracker,
+        pre_step_hook: Callable[[], Awaitable[None]] | None = None,
     ):
         self.storage = storage
         self.strategy = strategy
@@ -67,6 +69,7 @@ def __init__(
         self.metrics = EngineMetrics()
         self.state = ProgramStateManager(self.storage)
         self._metrics_tracker = metrics_tracker
+        self._pre_step_hook = pre_step_hook
 
         logger.info(
             "[EvolutionEngine] Init | strategy={}, acceptor={}",
@@ -180,6 +183,9 @@ async def run(self) -> None:
 
     async def step(self) -> None:
         """One generation step (idle → mutate → idle → ingest → refresh → idle)."""
+        if self._pre_step_hook:
+            await self._pre_step_hook()
+
         # Phase 1: wait until engine is idle (no QUEUED/RUNNING programs)
         await self._await_idle()
         logger.debug("[EvolutionEngine] Phase 1: Idle confirmed")
diff --git a/gigaevo/prompts/coevolution/stages.py b/gigaevo/prompts/coevolution/stages.py
@@ -145,7 +145,12 @@ async def compute(self, program: Program) -> FloatDictContainer:
         prompt_id = execution_output.prompt_id
         stats = await self._stats_provider.get_stats(prompt_id)
 
-        fitness = stats.success_rate
+        if stats.trials < self._min_trials:
+            # Small positive default: lets an untested prompt enter the archive
+            # while ranking below any prompt whose measured rate exceeds 0.01.
+            fitness = 0.01
+        else:
+            fitness = stats.success_rate
         prompt_length = float(len(execution_output.prompt_text))
 
         logger.debug(
diff --git a/gigaevo/prompts/coevolution/sync.py b/gigaevo/prompts/coevolution/sync.py
@@ -0,0 +1,92 @@
+"""Synchronization hook for prompt co-evolution.
+
+MainRunSyncHook blocks the prompt run's engine until the main run advances
+by at least one generation. This prevents the lightweight prompt run from
+racing far ahead of the expensive main run.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import time
+
+from loguru import logger
+
+
+class MainRunSyncHook:
+    """Pre-step hook that blocks until the main run advances by 1 generation.
+
+    Polls the main run's ``engine:total_generations`` counter in Redis and
+    waits until it exceeds the value seen on the previous call.
+
+    Args:
+        host: Redis host of the main run
+        port: Redis port
+        db: Redis DB of the main run
+        prefix: Key prefix of the main run (e.g. "chains/hotpotqa")
+        timeout: Maximum seconds to wait before proceeding anyway
+        poll_interval: Seconds between polls
+    """
+
+    def __init__(
+        self,
+        host: str,
+        port: int,
+        db: int,
+        prefix: str,
+        timeout: float = 600.0,
+        poll_interval: float = 5.0,
+    ):
+        self._host = host
+        self._port = port
+        self._db = db
+        self._prefix = prefix
+        self._timeout = timeout
+        self._poll_interval = poll_interval
+        self._last_main_gen: int = -1  # -1 sentinel: first call passes even at gen 0
+        self._redis: object | None = None  # client is built lazily by _get_redis()
+
+    def _get_redis(self) -> object:
+        if self._redis is None:
+            from redis import asyncio as aioredis  # imported on first use only
+
+            self._redis = aioredis.Redis(
+                host=self._host,
+                port=self._port,
+                db=self._db,
+                decode_responses=True,  # replies arrive as str, not bytes
+            )
+        return self._redis
+
+    async def __call__(self) -> None:
+        """Poll until the main run's generation counter advances."""
+        r = self._get_redis()
+        key = f"{self._prefix}:run_state"
+        field = "engine:total_generations"
+        start = time.monotonic()  # reference point for the timeout check below
+
+        while True:
+            raw = await r.hget(key, field)  # type: ignore[attr-defined]
+            current_gen = int(raw) if raw else 0  # missing field counts as gen 0
+
+            if current_gen > self._last_main_gen:
+                logger.debug(
+                    "[MainRunSyncHook] Main run at gen {} (was {})",
+                    current_gen,
+                    self._last_main_gen,
+                )
+                self._last_main_gen = current_gen  # baseline for the next call
+                return
+
+            elapsed = time.monotonic() - start
+            if elapsed > self._timeout:
+                logger.warning(
+                    "[MainRunSyncHook] Timeout after {:.0f}s waiting for main gen > {} "
+                    "(current={}), proceeding",
+                    elapsed,
+                    self._last_main_gen,
+                    current_gen,
+                )
+                return  # gave up waiting; _last_main_gen is left unchanged
+
+            await asyncio.sleep(self._poll_interval)
diff --git a/gigaevo/prompts/fetcher.py b/gigaevo/prompts/fetcher.py
@@ -9,14 +9,15 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-import hashlib
 from pathlib import Path
+import random
 import time
 from typing import TYPE_CHECKING, Any
 
 from loguru import logger
 
 from gigaevo.prompts import load_prompt
+from gigaevo.prompts.coevolution.stats import prompt_text_to_id
 
 if TYPE_CHECKING:
     from gigaevo.database.program_storage import ProgramStorage
@@ -218,10 +219,13 @@ def _is_cache_stale(self) -> bool:
         return (time.monotonic() - self._cache_timestamp) >= self._cache_ttl
 
     def _refresh_champion(self) -> "_PromptPack | None":
-        """Read the current champion from the prompt run's Redis archive.
+        """Select a prompt from the prompt run's archive using fitness-proportional sampling.
+
+        Instead of always picking the single best, uses stochastic selection so
+        that multiple prompts accumulate trial data from the main run.
 
         Returns:
-            _PromptPack if a champion was found, None if archive is empty
+            _PromptPack if a prompt was selected, None if archive is empty
         """
         try:
             r = self._get_sync_redis()
@@ -235,46 +239,47 @@ def _refresh_champion(self) -> "_PromptPack | None":
                 )
                 return None
 
-            # Fetch all programs and find the champion
-            best_program_id: str | None = None
-            best_fitness: float = float("-inf")
-            best_code: str | None = None
+            # Collect all candidates with their fitness and code
+            import json
 
+            candidates: list[tuple[str, float, str]] = []  # (pid, fitness, code)
             for pid in program_ids:
                 program_key = f"{self._prompt_prefix}:program:{pid}"
                 raw = r.get(program_key)
                 if not raw:
                     continue
                 try:
-                    import json
-
                     data = json.loads(raw)
                     metrics = data.get("metrics", {})
-                    fitness = float(metrics.get(self._fitness_key, float("-inf")))
+                    fitness = float(metrics.get(self._fitness_key, 0.0))
                     code = data.get("code", "")
-                    if fitness > best_fitness and code:
-                        best_fitness = fitness
-                        best_program_id = pid
-                        best_code = code
+                    if code:
+                        candidates.append((pid, fitness, code))
                 except Exception as exc:
                     logger.debug(
                         f"[GigaEvoArchivePromptFetcher] Error parsing program {pid}: {exc}"
                     )
                     continue
 
-            if best_code is None or best_program_id is None:
+            if not candidates:
                 return None
 
-            # Execute the champion's entrypoint() to get the prompt pack
-            prompt_id = hashlib.sha256(best_program_id.encode()).hexdigest()[:16]
-            pack = self._execute_entrypoint(best_code, prompt_id)
+            # Fitness-proportional sampling (epsilon floor for zero-fitness prompts)
+            epsilon = 0.01
+            weights = [max(f, epsilon) for _, f, _ in candidates]
+            chosen_pid, chosen_fitness, chosen_code = random.choices(
+                candidates, weights=weights, k=1
+            )[0]
+
+            pack = self._execute_entrypoint(chosen_code)
             if pack is None:
                 return None
 
             logger.debug(
-                f"[GigaEvoArchivePromptFetcher] Champion: {best_program_id[:8]} "
-                f"fitness={best_fitness:.4f} prompt_id={prompt_id} "
-                f"has_user={pack.user is not None}"
+                f"[GigaEvoArchivePromptFetcher] Selected: {chosen_pid[:8]} "
+                f"fitness={chosen_fitness:.4f} prompt_id={pack.prompt_id} "
+                f"has_user={pack.user is not None} "
+                f"(from {len(candidates)} candidates)"
             )
             return pack
 
@@ -285,17 +290,19 @@ def _refresh_champion(self) -> "_PromptPack | None":
             )
             return None
 
-    def _execute_entrypoint(self, code: str, prompt_id: str) -> "_PromptPack | None":
+    def _execute_entrypoint(self, code: str) -> "_PromptPack | None":
         """Execute a program's entrypoint() in a clean namespace.
 
+        Computes prompt_id from the system prompt TEXT (not the program UUID)
+        so it matches the ID used by PromptFitnessStage on the read side.
+
         Args:
             code: Python source code with entrypoint() function that returns
                   either a str (system prompt only) or a dict with keys
                   "system" (required) and "user" (optional).
-            prompt_id: Pre-computed prompt_id to attach to the resulting pack.
 
         Returns:
-            _PromptPack with system/user texts, or None on error
+            _PromptPack with system/user texts and text-derived prompt_id, or None on error
         """
         try:
             namespace: dict[str, Any] = {}
@@ -313,7 +320,8 @@ def _execute_entrypoint(self, code: str, prompt_id: str) -> "_PromptPack | None"
                         "[GigaEvoArchivePromptFetcher] entrypoint() returned empty string"
                     )
                     return None
-                return _PromptPack(system=result, user=None, prompt_id=prompt_id)
+                pid = prompt_text_to_id(result)
+                return _PromptPack(system=result, user=None, prompt_id=pid)
             elif isinstance(result, dict):
                 system = result.get("system", "")
                 if not isinstance(system, str) or not system.strip():
@@ -327,7 +335,8 @@ def _execute_entrypoint(self, code: str, prompt_id: str) -> "_PromptPack | None"
                         "[GigaEvoArchivePromptFetcher] dict entrypoint() has invalid 'user' key — ignoring"
                     )
                     user = None
-                return _PromptPack(system=system, user=user, prompt_id=prompt_id)
+                pid = prompt_text_to_id(system)
+                return _PromptPack(system=system, user=user, prompt_id=pid)
             else:
                 logger.warning(
                     f"[GigaEvoArchivePromptFetcher] entrypoint() returned {type(result)}, "
diff --git a/tests/prompts/test_coevolution_stages.py b/tests/prompts/test_coevolution_stages.py
@@ -272,7 +272,7 @@ async def test_compute_with_stats(self, mock_stats_provider: MagicMock):
 
     @pytest.mark.asyncio
     async def test_compute_no_stats(self, mock_stats_provider: MagicMock):
-        """compute() returns 0.0 fitness when no stats available."""
+        """compute() returns optimistic default fitness when no stats available."""
         mock_stats_provider.get_stats = AsyncMock(
             return_value=PromptMutationStats(trials=0, successes=0, success_rate=0.0)
         )
@@ -285,7 +285,7 @@ async def test_compute_no_stats(self, mock_stats_provider: MagicMock):
 
         result = await stage.compute(program)
 
-        assert result.data["fitness"] == 0.0
+        assert result.data["fitness"] == 0.01  # optimistic default
         assert result.data["is_valid"] == 1.0
 
     @pytest.mark.asyncio
diff --git a/tests/prompts/test_fetcher.py b/tests/prompts/test_fetcher.py
@@ -220,11 +220,14 @@ def test_execute_entrypoint_str_return(self, tmp_prompts_dir: Path):
             fallback_prompts_dir=tmp_prompts_dir,
         )
         code = 'def entrypoint() -> str:\n    return "Hello system."'
-        pack = fetcher._execute_entrypoint(code, "abc123")
+        pack = fetcher._execute_entrypoint(code)
         assert pack is not None
         assert pack.system == "Hello system."
         assert pack.user is None
-        assert pack.prompt_id == "abc123"
+        # prompt_id is now derived from the text, not passed in
+        from gigaevo.prompts.coevolution.stats import prompt_text_to_id
+
+        assert pack.prompt_id == prompt_text_to_id("Hello system.")
 
     def test_execute_entrypoint_dict_return_with_user(self, tmp_prompts_dir: Path):
         """_execute_entrypoint() handles dict-returning entrypoint with user key."""
@@ -237,11 +240,13 @@ def test_execute_entrypoint_dict_return_with_user(self, tmp_prompts_dir: Path):
             "def entrypoint() -> dict:\n"
             '    return {"system": "System text.", "user": "User text {count}."}'
         )
-        pack = fetcher._execute_entrypoint(code, "xyz789")
+        pack = fetcher._execute_entrypoint(code)
         assert pack is not None
         assert pack.system == "System text."
         assert pack.user == "User text {count}."
-        assert pack.prompt_id == "xyz789"
+        from gigaevo.prompts.coevolution.stats import prompt_text_to_id
+
+        assert pack.prompt_id == prompt_text_to_id("System text.")
 
     def test_execute_entrypoint_dict_return_no_user(self, tmp_prompts_dir: Path):
         """_execute_entrypoint() handles dict with system only."""
@@ -251,7 +256,7 @@ def test_execute_entrypoint_dict_return_no_user(self, tmp_prompts_dir: Path):
             fallback_prompts_dir=tmp_prompts_dir,
         )
         code = 'def entrypoint() -> dict:\n    return {"system": "System only."}'
-        pack = fetcher._execute_entrypoint(code, "sys_only")
+        pack = fetcher._execute_entrypoint(code)
         assert pack is not None
         assert pack.system == "System only."
         assert pack.user is None
@@ -266,7 +271,7 @@ def test_execute_entrypoint_dict_missing_system_returns_none(
             fallback_prompts_dir=tmp_prompts_dir,
         )
         code = 'def entrypoint() -> dict:\n    return {"user": "Only user."}'
-        pack = fetcher._execute_entrypoint(code, "no_sys")
+        pack = fetcher._execute_entrypoint(code)
         assert pack is None
 
     def test_execute_entrypoint_invalid_type_returns_none(self, tmp_prompts_dir: Path):
@@ -277,7 +282,7 @@ def test_execute_entrypoint_invalid_type_returns_none(self, tmp_prompts_dir: Pat
             fallback_prompts_dir=tmp_prompts_dir,
         )
         code = "def entrypoint():\n    return 42"
-        pack = fetcher._execute_entrypoint(code, "bad_type")
+        pack = fetcher._execute_entrypoint(code)
         assert pack is None