Skip to content

Commit 869fbe1

Browse files
committed
chore: add eval scenarios for codeflash-docs tile
5 scenarios testing: code serialization format, candidate lifecycle/DAG, deterministic patches, effort levels/selection criteria, and function representation/concurrency model.
1 parent ff2abd2 commit 869fbe1

18 files changed

Lines changed: 414 additions & 0 deletions

File tree

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
{
2+
"package_name": "codeflash-docs",
3+
"total_capabilities": 16,
4+
"capabilities": [
5+
{
6+
"id": 0,
7+
"name": "pipeline-stage-ordering",
8+
"description": "Know the correct ordering of codeflash pipeline stages: Discovery → Ranking → Context Extraction → Test Gen + Optimization (concurrent) → Baseline → Candidate Evaluation → PR",
9+
"complexity": "basic",
10+
"api_elements": ["Optimizer.run()", "FunctionOptimizer.optimize_function()"]
11+
},
12+
{
13+
"id": 1,
14+
"name": "function-to-optimize-fields",
15+
"description": "Know FunctionToOptimize key fields (function_name, file_path, parents, starting_line/ending_line, is_async, is_method, language) and properties (qualified_name, top_level_parent_name, class_name)",
16+
"complexity": "intermediate",
17+
"api_elements": ["FunctionToOptimize", "FunctionParent", "models/function_types.py"]
18+
},
19+
{
20+
"id": 2,
21+
"name": "code-strings-markdown-format",
22+
"description": "Know that code is serialized as markdown fenced blocks with language:filepath syntax (```python:filepath\\ncode\\n```) and parsed via CodeStringsMarkdown.parse_markdown_code()",
23+
"complexity": "intermediate",
24+
"api_elements": ["CodeStringsMarkdown", "CodeString", ".markdown", ".flat", "parse_markdown_code()"]
25+
},
26+
{
27+
"id": 3,
28+
"name": "read-writable-vs-read-only",
29+
"description": "Distinguish read_writable_code (LLM can modify) from read_only_context_code (reference only) in CodeOptimizationContext",
30+
"complexity": "basic",
31+
"api_elements": ["CodeOptimizationContext", "read_writable_code", "read_only_context_code"]
32+
},
33+
{
34+
"id": 4,
35+
"name": "candidate-source-types",
36+
"description": "Know OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE and when each is used",
37+
"complexity": "intermediate",
38+
"api_elements": ["OptimizedCandidateSource", "OptimizedCandidate"]
39+
},
40+
{
41+
"id": 5,
42+
"name": "candidate-forest-dag",
43+
"description": "Know that candidates form a forest/DAG via parent_id references where refinements and repairs build on previous candidates",
44+
"complexity": "intermediate",
45+
"api_elements": ["parent_id", "OptimizedCandidate", "CandidateForest"]
46+
},
47+
{
48+
"id": 6,
49+
"name": "concurrent-testgen-optimization",
50+
"description": "Know that test generation and LLM optimization run concurrently using concurrent.futures, not sequentially",
51+
"complexity": "intermediate",
52+
"api_elements": ["concurrent.futures", "FunctionOptimizer.optimize_function()"]
53+
},
54+
{
55+
"id": 7,
56+
"name": "deterministic-patch-values",
57+
"description": "Know the specific fixed values used by deterministic patches: time=1761717605.108106, datetime=2021-01-01 02:05:10 UTC, uuid=12345678-1234-5678-9abc-123456789012, random seeded with 42",
58+
"complexity": "advanced",
59+
"api_elements": ["_apply_deterministic_patches()", "pytest_plugin.py"]
60+
},
61+
{
62+
"id": 8,
63+
"name": "test-type-enum",
64+
"description": "Know the 6 TestType variants: EXISTING_UNIT_TEST, INSPIRED_REGRESSION, GENERATED_REGRESSION, REPLAY_TEST, CONCOLIC_COVERAGE_TEST, INIT_STATE_TEST",
65+
"complexity": "basic",
66+
"api_elements": ["TestType", "models/test_type.py"]
67+
},
68+
{
69+
"id": 9,
70+
"name": "ai-service-endpoints",
71+
"description": "Know the AI service endpoints: /ai/optimize, /ai/optimize_line_profiler, /ai/refine, /ai/repair, /ai/adaptive_optimize, /ai/rewrite_jit",
72+
"complexity": "intermediate",
73+
"api_elements": ["AiServiceClient", "api/aiservice.py"]
74+
},
75+
{
76+
"id": 10,
77+
"name": "repair-request-structure",
78+
"description": "Know that AIServiceCodeRepairRequest includes TestDiff objects with scope (RETURN_VALUE/STDOUT/DID_PASS), original vs candidate values, and test source code",
79+
"complexity": "advanced",
80+
"api_elements": ["AIServiceCodeRepairRequest", "TestDiff", "TestDiffScope"]
81+
},
82+
{
83+
"id": 11,
84+
"name": "effort-level-values",
85+
"description": "Know specific effort level values: LOW gets 3 candidates, MEDIUM gets 5, HIGH gets 6 (N_OPTIMIZER_CANDIDATES)",
86+
"complexity": "intermediate",
87+
"api_elements": ["EffortLevel", "N_OPTIMIZER_CANDIDATES", "EFFORT_VALUES"]
88+
},
89+
{
90+
"id": 12,
91+
"name": "context-token-limits",
92+
"description": "Know OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000 and TESTGEN_CONTEXT_TOKEN_LIMIT=16000 and that encoded_tokens_len() is used for counting",
93+
"complexity": "basic",
94+
"api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
95+
},
96+
{
97+
"id": 13,
98+
"name": "best-candidate-selection",
99+
"description": "Know the selection criteria: highest speedup, then shortest diff for ties, and refinement weighted ranking (2*runtime + 1*diff)",
100+
"complexity": "advanced",
101+
"api_elements": ["BestOptimization", "REFINED_CANDIDATE_RANKING_WEIGHTS"]
102+
},
103+
{
104+
"id": 14,
105+
"name": "plugin-blocklists",
106+
"description": "Know behavioral test blocklisted plugins (benchmark, codspeed, xdist, sugar) and benchmarking blocklist (adds cov, profiling)",
107+
"complexity": "intermediate",
108+
"api_elements": ["BEHAVIORAL_BLOCKLISTED_PLUGINS", "BENCHMARKING_BLOCKLISTED_PLUGINS"]
109+
},
110+
{
111+
"id": 15,
112+
"name": "result-type-usage",
113+
"description": "Know that Result[L,R] from either.py uses Success(value)/Failure(error) with is_successful() check before unwrap()",
114+
"complexity": "basic",
115+
"api_elements": ["Result", "Success", "Failure", "is_successful", "either.py"]
116+
}
117+
]
118+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Code serialization format and context splitting
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"context": "Tests whether the agent knows the CodeStringsMarkdown serialization format and the distinction between read-writable and read-only code context in the codeflash pipeline.",
3+
"type": "weighted_checklist",
4+
"checklist": [
5+
{
6+
"name": "Markdown code block format",
7+
"description": "Uses the correct fenced code block format with language:filepath syntax (```python:path/to/file.py) when constructing code for the AI service, NOT plain code blocks without file paths",
8+
"max_score": 30
9+
},
10+
{
11+
"name": "Read-writable vs read-only split",
12+
"description": "Correctly separates code into read_writable_code (code the LLM can modify) and read_only_context_code (reference-only dependency code), NOT treating all code as modifiable",
13+
"max_score": 35
14+
},
15+
{
16+
"name": "parse_markdown_code usage",
17+
"description": "Uses CodeStringsMarkdown.parse_markdown_code() to parse AI service responses back into structured code, NOT manual string splitting or regex",
18+
"max_score": 35
19+
}
20+
]
21+
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Format Code for AI Service Request
2+
3+
## Context
4+
5+
You are working on the codeflash optimization engine. The AI service accepts optimization requests with source code and dependency context. A function `calculate_total` in `analytics/metrics.py` needs to be optimized. It calls a helper `normalize_values` defined in the same file (both functions are modifiable), and the file imports `BaseMetric` from `analytics/base.py` (not modifiable, provided for reference only).
6+
7+
```python
8+
# analytics/metrics.py
9+
from analytics.base import BaseMetric
10+
11+
def normalize_values(data: list[float]) -> list[float]:
12+
max_val = max(data)
13+
return [x / max_val for x in data]
14+
15+
def calculate_total(metrics: list[BaseMetric]) -> float:
16+
values = [m.value for m in metrics]
17+
normalized = normalize_values(values)
18+
return sum(normalized)
19+
```
20+
21+
```python
22+
# analytics/base.py
23+
class BaseMetric:
24+
def __init__(self, name: str, value: float):
25+
self.name = name
26+
self.value = value
27+
```
28+
29+
## Task
30+
31+
Write a Python function `prepare_optimization_payload` that constructs the code payload for an AI service optimization request for `calculate_total`. It should properly format the source code and dependency code. Also provide a companion function that parses the AI service response back into structured code objects.
32+
33+
## Expected Outputs
34+
35+
- A Python file `payload_builder.py` with the payload construction and response parsing logic
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Candidate source types and DAG relationships
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
3+
"type": "weighted_checklist",
4+
"checklist": [
5+
{
6+
"name": "Lists source types",
7+
"description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
8+
"max_score": 25
9+
},
10+
{
11+
"name": "Parent ID linkage",
12+
"description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
13+
"max_score": 25
14+
},
15+
{
16+
"name": "Refinement uses runtime data",
17+
"description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
18+
"max_score": 25
19+
},
20+
{
21+
"name": "Repair uses test diffs",
22+
"description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
23+
"max_score": 25
24+
}
25+
]
26+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Document the Candidate Lifecycle
2+
3+
## Context
4+
5+
A new engineer is joining the codeflash team and needs to understand how optimization candidates are generated, improved, and related to each other throughout the pipeline. They've asked for a clear explanation of the different ways candidates are produced and how the system iterates on them.
6+
7+
## Task
8+
9+
Write a technical document explaining the full lifecycle of an optimization candidate in codeflash — from initial generation through improvement iterations. Cover all the different ways candidates can be created, what data is sent to the AI service for each type, and how candidates relate to each other structurally.
10+
11+
## Expected Outputs
12+
13+
- A markdown file `candidate-lifecycle.md`
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Deterministic patch values and test execution architecture
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"context": "Tests whether the agent knows the specific deterministic patch values used in codeflash's pytest plugin and the subprocess-based test execution architecture.",
3+
"type": "weighted_checklist",
4+
"checklist": [
5+
{
6+
"name": "Subprocess isolation",
7+
"description": "States that tests run in a subprocess to isolate the test environment from the main codeflash process, NOT in the same process",
8+
"max_score": 20
9+
},
10+
{
11+
"name": "Fixed time value",
12+
"description": "References the specific fixed timestamp 1761717605.108106 for time.time() or the fixed datetime 2021-01-01 02:05:10 UTC for datetime.now()",
13+
"max_score": 20
14+
},
15+
{
16+
"name": "Fixed UUID value",
17+
"description": "References the specific fixed UUID 12345678-1234-5678-9abc-123456789012 for uuid4/uuid1",
18+
"max_score": 20
19+
},
20+
{
21+
"name": "Random seed",
22+
"description": "States that random is seeded with 42 (NOT a different seed value)",
23+
"max_score": 20
24+
},
25+
{
26+
"name": "Plugin blocklists",
27+
"description": "Mentions that behavioral tests block specific pytest plugins (at least 2 of: benchmark, codspeed, xdist, sugar) to ensure deterministic execution",
28+
"max_score": 20
29+
}
30+
]
31+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Explain Test Reproducibility Guarantees
2+
3+
## Context
4+
5+
A codeflash user notices that their optimization candidate passes behavioral tests on one run but fails on the next. They suspect non-determinism in the test execution. They want to understand what guarantees codeflash provides for test reproducibility and how the system ensures consistent results.
6+
7+
## Task
8+
9+
Write a technical explanation of how codeflash ensures deterministic test execution. Cover the execution environment setup, what sources of non-determinism are controlled, and any specific values or configurations used. Also explain the test execution architecture.
10+
11+
## Expected Outputs
12+
13+
- A markdown file `test-reproducibility.md`

0 commit comments

Comments
 (0)