chore: add eval scenarios for codeflash-docs tile

KRRT7 · KRRT7 · commit 869fbe176666 · 2026-02-14T21:29:22.000-05:00
5 scenarios testing: code serialization format, candidate lifecycle/DAG,
deterministic patches, effort levels/selection criteria, and function
representation/concurrency model.
diff --git a/tiles/codeflash-docs/evals/capabilities.json b/tiles/codeflash-docs/evals/capabilities.json
@@ -0,0 +1,118 @@
+{
+  "package_name": "codeflash-docs",
+  "total_capabilities": 16,
+  "capabilities": [
+    {
+      "id": 0,
+      "name": "pipeline-stage-ordering",
+      "description": "Know the correct ordering of codeflash pipeline stages: Discovery → Ranking → Context Extraction → Test Gen + Optimization (concurrent) → Baseline → Candidate Evaluation → PR",
+      "complexity": "basic",
+      "api_elements": ["Optimizer.run()", "FunctionOptimizer.optimize_function()"]
+    },
+    {
+      "id": 1,
+      "name": "function-to-optimize-fields",
+      "description": "Know FunctionToOptimize key fields (function_name, file_path, parents, starting_line/ending_line, is_async, is_method, language) and properties (qualified_name, top_level_parent_name, class_name)",
+      "complexity": "intermediate",
+      "api_elements": ["FunctionToOptimize", "FunctionParent", "models/function_types.py"]
+    },
+    {
+      "id": 2,
+      "name": "code-strings-markdown-format",
+      "description": "Know that code is serialized as markdown fenced blocks with language:filepath syntax (```python:filepath\\ncode\\n```) and parsed via CodeStringsMarkdown.parse_markdown_code()",
+      "complexity": "intermediate",
+      "api_elements": ["CodeStringsMarkdown", "CodeString", ".markdown", ".flat", "parse_markdown_code()"]
+    },
+    {
+      "id": 3,
+      "name": "read-writable-vs-read-only",
+      "description": "Distinguish read_writable_code (LLM can modify) from read_only_context_code (reference only) in CodeOptimizationContext",
+      "complexity": "basic",
+      "api_elements": ["CodeOptimizationContext", "read_writable_code", "read_only_context_code"]
+    },
+    {
+      "id": 4,
+      "name": "candidate-source-types",
+      "description": "Know OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE and when each is used",
+      "complexity": "intermediate",
+      "api_elements": ["OptimizedCandidateSource", "OptimizedCandidate"]
+    },
+    {
+      "id": 5,
+      "name": "candidate-forest-dag",
+      "description": "Know that candidates form a forest/DAG via parent_id references where refinements and repairs build on previous candidates",
+      "complexity": "intermediate",
+      "api_elements": ["parent_id", "OptimizedCandidate", "CandidateForest"]
+    },
+    {
+      "id": 6,
+      "name": "concurrent-testgen-optimization",
+      "description": "Know that test generation and LLM optimization run concurrently using concurrent.futures, not sequentially",
+      "complexity": "intermediate",
+      "api_elements": ["concurrent.futures", "FunctionOptimizer.optimize_function()"]
+    },
+    {
+      "id": 7,
+      "name": "deterministic-patch-values",
+      "description": "Know the specific fixed values used by deterministic patches: time=1761717605.108106, datetime=2021-01-01 02:05:10 UTC, uuid=12345678-1234-5678-9abc-123456789012, random seeded with 42",
+      "complexity": "advanced",
+      "api_elements": ["_apply_deterministic_patches()", "pytest_plugin.py"]
+    },
+    {
+      "id": 8,
+      "name": "test-type-enum",
+      "description": "Know the 6 TestType variants: EXISTING_UNIT_TEST, INSPIRED_REGRESSION, GENERATED_REGRESSION, REPLAY_TEST, CONCOLIC_COVERAGE_TEST, INIT_STATE_TEST",
+      "complexity": "basic",
+      "api_elements": ["TestType", "models/test_type.py"]
+    },
+    {
+      "id": 9,
+      "name": "ai-service-endpoints",
+      "description": "Know the AI service endpoints: /ai/optimize, /ai/optimize_line_profiler, /ai/refine, /ai/repair, /ai/adaptive_optimize, /ai/rewrite_jit",
+      "complexity": "intermediate",
+      "api_elements": ["AiServiceClient", "api/aiservice.py"]
+    },
+    {
+      "id": 10,
+      "name": "repair-request-structure",
+      "description": "Know that AIServiceCodeRepairRequest includes TestDiff objects with scope (RETURN_VALUE/STDOUT/DID_PASS), original vs candidate values, and test source code",
+      "complexity": "advanced",
+      "api_elements": ["AIServiceCodeRepairRequest", "TestDiff", "TestDiffScope"]
+    },
+    {
+      "id": 11,
+      "name": "effort-level-values",
+      "description": "Know specific effort level values: LOW gets 3 candidates, MEDIUM gets 5, HIGH gets 6 (N_OPTIMIZER_CANDIDATES)",
+      "complexity": "intermediate",
+      "api_elements": ["EffortLevel", "N_OPTIMIZER_CANDIDATES", "EFFORT_VALUES"]
+    },
+    {
+      "id": 12,
+      "name": "context-token-limits",
+      "description": "Know OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000 and TESTGEN_CONTEXT_TOKEN_LIMIT=16000 and that encoded_tokens_len() is used for counting",
+      "complexity": "basic",
+      "api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
+    },
+    {
+      "id": 13,
+      "name": "best-candidate-selection",
+      "description": "Know the selection criteria: highest speedup, then shortest diff for ties, and refinement weighted ranking (2*runtime + 1*diff)",
+      "complexity": "advanced",
+      "api_elements": ["BestOptimization", "REFINED_CANDIDATE_RANKING_WEIGHTS"]
+    },
+    {
+      "id": 14,
+      "name": "plugin-blocklists",
+      "description": "Know the pytest plugins blocklisted for behavioral tests (benchmark, codspeed, xdist, sugar) and the benchmarking blocklist (the same plugins plus cov and profiling)",
+      "complexity": "intermediate",
+      "api_elements": ["BEHAVIORAL_BLOCKLISTED_PLUGINS", "BENCHMARKING_BLOCKLISTED_PLUGINS"]
+    },
+    {
+      "id": 15,
+      "name": "result-type-usage",
+      "description": "Know that Result[L,R] from either.py uses Success(value)/Failure(error) with is_successful() check before unwrap()",
+      "complexity": "basic",
+      "api_elements": ["Result", "Success", "Failure", "is_successful", "either.py"]
+    }
+  ]
+}
diff --git a/tiles/codeflash-docs/evals/scenario-1/capability.txt b/tiles/codeflash-docs/evals/scenario-1/capability.txt
@@ -0,0 +1 @@
+Code serialization format and context splitting
diff --git a/tiles/codeflash-docs/evals/scenario-1/criteria.json b/tiles/codeflash-docs/evals/scenario-1/criteria.json
@@ -0,0 +1,21 @@
+{
+  "context": "Tests whether the agent knows the CodeStringsMarkdown serialization format and the distinction between read-writable and read-only code context in the codeflash pipeline.",
+  "type": "weighted_checklist",
+  "checklist": [
+    {
+      "name": "Markdown code block format",
+      "description": "Uses the correct fenced code block format with language:filepath syntax (```python:path/to/file.py) when constructing code for the AI service, NOT plain code blocks without file paths",
+      "max_score": 30
+    },
+    {
+      "name": "Read-writable vs read-only split",
+      "description": "Correctly separates code into read_writable_code (code the LLM can modify) and read_only_context_code (reference-only dependency code), NOT treating all code as modifiable",
+      "max_score": 35
+    },
+    {
+      "name": "parse_markdown_code usage",
+      "description": "Uses CodeStringsMarkdown.parse_markdown_code() to parse AI service responses back into structured code, NOT manual string splitting or regex",
+      "max_score": 35
+    }
+  ]
+}
diff --git a/tiles/codeflash-docs/evals/scenario-1/task.md b/tiles/codeflash-docs/evals/scenario-1/task.md
@@ -0,0 +1,35 @@
+# Format Code for AI Service Request
+
+## Context
+
+You are working on the codeflash optimization engine. The AI service accepts optimization requests with source code and dependency context. A function `calculate_total` in `analytics/metrics.py` needs to be optimized. It calls a helper `normalize_values` in the same file (both modifiable), and imports `BaseMetric` from `analytics/base.py` (not modifiable, just for reference).
+
+```python
+# analytics/metrics.py
+from analytics.base import BaseMetric
+
+def normalize_values(data: list[float]) -> list[float]:
+    max_val = max(data)
+    return [x / max_val for x in data]
+
+def calculate_total(metrics: list[BaseMetric]) -> float:
+    values = [m.value for m in metrics]
+    normalized = normalize_values(values)
+    return sum(normalized)
+```
+
+```python
+# analytics/base.py
+class BaseMetric:
+    def __init__(self, name: str, value: float):
+        self.name = name
+        self.value = value
+```
+
+## Task
+
+Write a Python function `prepare_optimization_payload` that constructs the code payload for an AI service optimization request for `calculate_total`. It should properly format the source code and the dependency code. Also provide a separate function that parses the AI service response back into structured code objects.
+
+## Expected Outputs
+
+- A Python file `payload_builder.py` with the payload construction and response parsing logic
diff --git a/tiles/codeflash-docs/evals/scenario-2/capability.txt b/tiles/codeflash-docs/evals/scenario-2/capability.txt
@@ -0,0 +1 @@
+Candidate source types and DAG relationships
diff --git a/tiles/codeflash-docs/evals/scenario-2/criteria.json b/tiles/codeflash-docs/evals/scenario-2/criteria.json
@@ -0,0 +1,26 @@
+{
+  "context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
+  "type": "weighted_checklist",
+  "checklist": [
+    {
+      "name": "Lists source types",
+      "description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
+      "max_score": 25
+    },
+    {
+      "name": "Parent ID linkage",
+      "description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
+      "max_score": 25
+    },
+    {
+      "name": "Refinement uses runtime data",
+      "description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
+      "max_score": 25
+    },
+    {
+      "name": "Repair uses test diffs",
+      "description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
+      "max_score": 25
+    }
+  ]
+}
diff --git a/tiles/codeflash-docs/evals/scenario-2/task.md b/tiles/codeflash-docs/evals/scenario-2/task.md
@@ -0,0 +1,13 @@
+# Document the Candidate Lifecycle
+
+## Context
+
+A new engineer is joining the codeflash team and needs to understand how optimization candidates are generated, improved, and related to each other throughout the pipeline. They've asked for a clear explanation of the different ways candidates are produced and how the system iterates on them.
+
+## Task
+
+Write a technical document explaining the full lifecycle of an optimization candidate in codeflash — from initial generation through improvement iterations. Cover all the different ways candidates can be created, what data is sent to the AI service for each type, and how candidates relate to each other structurally.
+
+## Expected Outputs
+
+- A markdown file `candidate-lifecycle.md`
diff --git a/tiles/codeflash-docs/evals/scenario-3/capability.txt b/tiles/codeflash-docs/evals/scenario-3/capability.txt
@@ -0,0 +1 @@
+Deterministic patch values and test execution architecture
diff --git a/tiles/codeflash-docs/evals/scenario-3/criteria.json b/tiles/codeflash-docs/evals/scenario-3/criteria.json
@@ -0,0 +1,31 @@
+{
+  "context": "Tests whether the agent knows the specific deterministic patch values used in codeflash's pytest plugin and the subprocess-based test execution architecture.",
+  "type": "weighted_checklist",
+  "checklist": [
+    {
+      "name": "Subprocess isolation",
+      "description": "States that tests run in a subprocess to isolate the test environment from the main codeflash process, NOT in the same process",
+      "max_score": 20
+    },
+    {
+      "name": "Fixed time value",
+      "description": "References the specific fixed timestamp 1761717605.108106 for time.time() or the fixed datetime 2021-01-01 02:05:10 UTC for datetime.now()",
+      "max_score": 20
+    },
+    {
+      "name": "Fixed UUID value",
+      "description": "References the specific fixed UUID 12345678-1234-5678-9abc-123456789012 for uuid4/uuid1",
+      "max_score": 20
+    },
+    {
+      "name": "Random seed",
+      "description": "States that random is seeded with 42 (NOT a different seed value)",
+      "max_score": 20
+    },
+    {
+      "name": "Plugin blocklists",
+      "description": "Mentions that behavioral tests block specific pytest plugins (at least 2 of: benchmark, codspeed, xdist, sugar) to ensure deterministic execution",
+      "max_score": 20
+    }
+  ]
+}
diff --git a/tiles/codeflash-docs/evals/scenario-3/task.md b/tiles/codeflash-docs/evals/scenario-3/task.md
@@ -0,0 +1,13 @@
+# Explain Test Reproducibility Guarantees
+
+## Context
+
+A codeflash user notices that their optimization candidate passes behavioral tests on one run but fails on the next. They suspect non-determinism in the test execution. They want to understand what guarantees codeflash provides for test reproducibility and how the system ensures consistent results.
+
+## Task
+
+Write a technical explanation of how codeflash ensures deterministic test execution. Cover the execution environment setup, what sources of non-determinism are controlled, and any specific values or configurations used. Also explain the test execution architecture.
+
+## Expected Outputs
+
+- A markdown file `test-reproducibility.md`
diff --git a/tiles/codeflash-docs/evals/scenario-4/capability.txt b/tiles/codeflash-docs/evals/scenario-4/capability.txt
@@ -0,0 +1 @@
+Effort level configuration and candidate selection criteria
diff --git a/tiles/codeflash-docs/evals/scenario-4/criteria.json b/tiles/codeflash-docs/evals/scenario-4/criteria.json
@@ -0,0 +1,26 @@
+{
+  "context": "Tests whether the agent knows the specific effort level values for candidate generation and the criteria used to select the best optimization candidate.",
+  "type": "weighted_checklist",
+  "checklist": [
+    {
+      "name": "Candidate counts by effort",
+      "description": "States correct N_OPTIMIZER_CANDIDATES values: LOW=3, MEDIUM=5, HIGH=6 (at least 2 of 3 correct)",
+      "max_score": 25
+    },
+    {
+      "name": "Speedup as primary selector",
+      "description": "States that the winning candidate is selected primarily by highest speedup ratio",
+      "max_score": 25
+    },
+    {
+      "name": "Diff length as tiebreaker",
+      "description": "States that for tied speedups, shortest diff length from original is used as tiebreaker",
+      "max_score": 25
+    },
+    {
+      "name": "Refinement ranking weights",
+      "description": "States that refinement candidates use weighted ranking with runtime weighted more heavily than diff (2:1 ratio or REFINED_CANDIDATE_RANKING_WEIGHTS=(2,1))",
+      "max_score": 25
+    }
+  ]
+}
diff --git a/tiles/codeflash-docs/evals/scenario-4/task.md b/tiles/codeflash-docs/evals/scenario-4/task.md
@@ -0,0 +1,18 @@
+# Design a Candidate Selection Dashboard
+
+## Context
+
+The codeflash team wants to build a dashboard that shows users how optimization candidates were evaluated and why a particular candidate won. The dashboard needs to display the selection process at each stage, from initial candidate pool through to the final winner.
+
+## Task
+
+Write a specification document for the dashboard that explains:
+1. How many candidates are generated at each effort level
+2. The exact criteria and order of operations used to pick the winning candidate
+3. How refinement candidates are ranked differently from initial candidates
+
+Include concrete examples showing how two hypothetical candidates would be compared.
+
+## Expected Outputs
+
+- A markdown file `selection-dashboard-spec.md`
diff --git a/tiles/codeflash-docs/evals/scenario-5/capability.txt b/tiles/codeflash-docs/evals/scenario-5/capability.txt
@@ -0,0 +1 @@
+Pipeline concurrency and FunctionToOptimize structure
diff --git a/tiles/codeflash-docs/evals/scenario-5/criteria.json b/tiles/codeflash-docs/evals/scenario-5/criteria.json
@@ -0,0 +1,26 @@
+{
+  "context": "Tests whether the agent knows the FunctionToOptimize data structure and the concurrent execution model for test generation and optimization.",
+  "type": "weighted_checklist",
+  "checklist": [
+    {
+      "name": "FunctionToOptimize fields",
+      "description": "Includes at least 4 of: function_name, file_path, parents (list of FunctionParent), starting_line, ending_line, is_async, is_method, language",
+      "max_score": 25
+    },
+    {
+      "name": "Qualified name property",
+      "description": "Mentions qualified_name as a property that produces the full dotted name including parent classes (e.g., MyClass.my_method)",
+      "max_score": 25
+    },
+    {
+      "name": "Concurrent execution",
+      "description": "States that test generation and LLM optimization run concurrently (in parallel), NOT sequentially one after the other",
+      "max_score": 25
+    },
+    {
+      "name": "Entry point identification",
+      "description": "Correctly identifies Optimizer.run() as the top-level entry point and FunctionOptimizer.optimize_function() as the per-function entry point",
+      "max_score": 25
+    }
+  ]
+}
diff --git a/tiles/codeflash-docs/evals/scenario-5/task.md b/tiles/codeflash-docs/evals/scenario-5/task.md
@@ -0,0 +1,17 @@
+# Design a Function Optimization Status Tracker
+
+## Context
+
+The codeflash team needs a status tracker that logs what happens to each function during an optimization run. For each function, it should record the function identity, which pipeline stages it passed through, and how long each stage took.
+
+## Task
+
+Write a design document explaining:
+1. What data structure represents a function being optimized, including its identity fields and how nested functions (methods inside classes) are represented
+2. The full name resolution strategy for identifying functions uniquely
+3. Which stages of the pipeline operate on a single function at a time vs. operating on multiple functions, and which per-function stages run concurrently vs. sequentially
+4. Where in the codebase the per-function optimization is orchestrated and what the top-level entry point is
+
+## Expected Outputs
+
+- A markdown file `status-tracker-design.md`
diff --git a/tiles/codeflash-docs/evals/summary.json b/tiles/codeflash-docs/evals/summary.json
diff --git a/tiles/codeflash-docs/evals/summary_infeasible.json b/tiles/codeflash-docs/evals/summary_infeasible.json

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Code serialization format and context splitting`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Candidate source types and DAG relationships`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Deterministic patch values and test execution architecture`