Skip to content

Commit 9ac71f2

Browse files
author
LoCoBench Bot
committed
Merge remote-tracking branch 'origin/ralph/gapfill-codereview'
# Conflicts: # configs/selected_benchmark_tasks.json
2 parents dbf648b + 9c33cfe commit 9ac71f2

67 files changed

Lines changed: 4366 additions & 39 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.beads/issues.jsonl

Lines changed: 33 additions & 0 deletions
Large diffs are not rendered by default.

benchmarks/ccb_codereview/cr-calcom-001/environment/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ RUN git clone --filter=blob:none --no-checkout https://github.com/calcom/cal.com
2525
git config user.name "Agent"
2626

2727
# Inject defects into the codebase
28-
COPY inject_defects.sh /tmp/inject_defects.sh
28+
COPY environment/inject_defects.sh /tmp/inject_defects.sh
2929
RUN chmod +x /tmp/inject_defects.sh && /tmp/inject_defects.sh && rm /tmp/inject_defects.sh
3030

3131
# Create directories for verifier (tests uploaded at runtime by Harbor verifier)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
FROM ubuntu:22.04
2+
3+
# Install common tools (python3 needed for inject_defects.sh)
4+
RUN apt-get update && apt-get install -y \
5+
git \
6+
curl \
7+
python3 \
8+
ripgrep \
9+
&& rm -rf /var/lib/apt/lists/*
10+
11+
# Install Node.js (for Claude Code CLI)
12+
RUN if ! command -v node &> /dev/null; then \
13+
curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
14+
apt-get install -y nodejs; \
15+
fi
16+
17+
# Install Claude Code CLI
18+
RUN npm install -g @anthropic-ai/claude-code
19+
20+
# Clone envoyproxy/envoy at pinned v1.33.0 tag (sparse checkout for relevant files)
21+
RUN git clone --filter=blob:none --no-checkout https://github.com/envoyproxy/envoy.git /workspace && \
22+
cd /workspace && \
23+
git checkout v1.33.0 && \
24+
git config user.email "agent@example.com" && \
25+
git config user.name "Agent"
26+
27+
# Inject defects into the codebase
28+
COPY inject_defects.sh /tmp/inject_defects.sh
29+
RUN chmod +x /tmp/inject_defects.sh && /tmp/inject_defects.sh && rm /tmp/inject_defects.sh
30+
31+
# Create directories for verifier (tests uploaded at runtime by Harbor verifier)
32+
RUN mkdir -p /workspace/tests /logs/verifier
33+
34+
WORKDIR /workspace
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/bin/bash
2+
# Inject defects into the Envoy proxy codebase for code review benchmarking
3+
# Each defect simulates a realistic bug that an AI code reviewer should catch
4+
# 6 defects across 4 files, 3 require cross-file reasoning
5+
6+
set -e
7+
cd /workspace
8+
9+
# ── Defect 1: Return Continue instead of StopIteration when delay is active ──
10+
# Cross-file: fault_filter returns Continue, but postDelayInjection() later calls
11+
# continueDecoding() via filter_manager.cc — the stream has already moved past
12+
# this filter, causing undefined behavior or assertion failure
13+
python3 -c "
14+
path = 'source/extensions/filters/http/fault/fault_filter.cc'
15+
with open(path) as f:
16+
content = f.read()
17+
18+
old = ''' if (maybeSetupDelay(headers)) {
19+
return Http::FilterHeadersStatus::StopIteration;
20+
}'''
21+
22+
new = ''' if (maybeSetupDelay(headers)) {
23+
return Http::FilterHeadersStatus::Continue;
24+
}'''
25+
26+
content = content.replace(old, new)
27+
with open(path, 'w') as f:
28+
f.write(content)
29+
print('INJECTED defect-1: fault filter returns Continue instead of StopIteration when delay active')
30+
"
31+
32+
# ── Defect 2: Invert header matching logic in HeaderUtility::matchHeaders ──
33+
# Cross-file: This utility is called by fault_filter (line 130), RBAC, ratelimit,
34+
# and other filters. Inverting it causes header-based fault injection to match
35+
# on the WRONG requests (those NOT matching the configured headers)
36+
python3 -c "
37+
path = 'source/common/http/header_utility.cc'
38+
with open(path) as f:
39+
content = f.read()
40+
41+
old = ''' if (!config_headers.empty()) {
42+
for (const HeaderDataPtr& cfg_header_data : config_headers) {
43+
if (!cfg_header_data->matchesHeaders(request_headers)) {
44+
return false;
45+
}
46+
}
47+
}
48+
49+
return true;'''
50+
51+
new = ''' if (!config_headers.empty()) {
52+
for (const HeaderDataPtr& cfg_header_data : config_headers) {
53+
if (cfg_header_data->matchesHeaders(request_headers)) {
54+
return false;
55+
}
56+
}
57+
}
58+
59+
return true;'''
60+
61+
content = content.replace(old, new)
62+
with open(path, 'w') as f:
63+
f.write(content)
64+
print('INJECTED defect-2: inverted header matching logic in matchHeaders')
65+
"
66+
67+
# ── Defect 3: Remove route cache clearing from ext_authz onComplete ──
68+
# When ext_authz modifies request headers (adding auth headers, removing
69+
# sensitive headers), the route cache must be cleared so downstream filters
70+
# and the router see the updated route. Without this, stale routes are used.
71+
python3 -c "
72+
path = 'source/extensions/filters/http/ext_authz/ext_authz.cc'
73+
with open(path) as f:
74+
content = f.read()
75+
76+
old = ''' // Any changes to request headers or query parameters can affect how the request is going to be
77+
// routed. If we are changing the headers we also need to clear the route
78+
// cache.
79+
if (config_->clearRouteCache() &&
80+
(!response->headers_to_set.empty() || !response->headers_to_append.empty() ||
81+
!response->headers_to_remove.empty() || !response->query_parameters_to_set.empty() ||
82+
!response->query_parameters_to_remove.empty())) {
83+
ENVOY_STREAM_LOG(debug, \"ext_authz is clearing route cache\", *decoder_callbacks_);
84+
decoder_callbacks_->downstreamCallbacks()->clearRouteCache();
85+
}'''
86+
87+
new = ''' // Any changes to request headers or query parameters can affect how the request is going to be
88+
// routed. Route cache clearing handled by downstream filters.'''
89+
90+
content = content.replace(old, new)
91+
with open(path, 'w') as f:
92+
f.write(content)
93+
print('INJECTED defect-3: removed route cache clearing from ext_authz onComplete')
94+
"
95+
96+
# ── Defect 4: Invert required response header check in filter_manager ──
97+
# Cross-file: filter_manager calls HeaderUtility::checkRequiredResponseHeaders()
98+
# (defined in header_utility.cc). Inverting the status check sends 502 for all
99+
# valid responses and lets responses missing :status through
100+
python3 -c "
101+
path = 'source/common/http/filter_manager.cc'
102+
with open(path) as f:
103+
content = f.read()
104+
105+
old = ''' const auto status = HeaderUtility::checkRequiredResponseHeaders(headers);
106+
if (!status.ok()) {
107+
// If the check failed, then we reply with BadGateway, and stop the further processing.
108+
sendLocalReply('''
109+
110+
new = ''' const auto status = HeaderUtility::checkRequiredResponseHeaders(headers);
111+
if (status.ok()) {
112+
// If the check failed, then we reply with BadGateway, and stop the further processing.
113+
sendLocalReply('''
114+
115+
content = content.replace(old, new)
116+
with open(path, 'w') as f:
117+
f.write(content)
118+
print('INJECTED defect-4: inverted required response header check in filter_manager')
119+
"
120+
121+
# ── Defect 5: Invert rate limit check causing active_faults gauge leak ──
122+
# In postDelayInjection, the gauge should be decremented when NO rate limit
123+
# is configured (fault is done). Inverting this causes the gauge to be
124+
# decremented when rate limit IS active (double-dec with onDestroy) and
125+
# NOT decremented when it's not (leak), eventually exhausting max_active_faults
126+
python3 -c "
127+
path = 'source/extensions/filters/http/fault/fault_filter.cc'
128+
with open(path) as f:
129+
content = f.read()
130+
131+
old = ''' ASSERT(fault_active_);
132+
ASSERT(delay_timer_ == nullptr);
133+
if (!isResponseRateLimitConfigured()) {
134+
config_->stats().active_faults_.dec();
135+
fault_active_ = false;
136+
}'''
137+
138+
new = ''' ASSERT(fault_active_);
139+
ASSERT(delay_timer_ == nullptr);
140+
if (isResponseRateLimitConfigured()) {
141+
config_->stats().active_faults_.dec();
142+
fault_active_ = false;
143+
}'''
144+
145+
content = content.replace(old, new)
146+
with open(path, 'w') as f:
147+
f.write(content)
148+
print('INJECTED defect-5: inverted rate limit check causes active_faults gauge leak')
149+
"
150+
151+
# ── Defect 6: Remove isRemovableHeader guard from ext_authz header removal ──
152+
# ext_authz removes headers based on the external auth response. Without the
153+
# isRemovableHeader check, :-prefixed pseudo-headers and Host can be removed,
154+
# making the request malformed and causing 400 errors or protocol violations
155+
python3 -c "
156+
path = 'source/extensions/filters/http/ext_authz/ext_authz.cc'
157+
with open(path) as f:
158+
content = f.read()
159+
160+
old = ''' // We don't allow removing any :-prefixed headers, nor Host, as removing them would make the
161+
// request malformed. checkDecoderHeaderMutation also performs this check, however, so only
162+
// perform this check explicitly if decoder header mutation rules is empty.
163+
if (!config_->hasDecoderHeaderMutationRules() &&
164+
!Http::HeaderUtility::isRemovableHeader(key)) {
165+
ENVOY_STREAM_LOG(trace, \"Ignoring invalid header removal '{}'.\", *decoder_callbacks_, key);
166+
continue;
167+
}'''
168+
169+
new = ''' // Header mutation rules handle validation if configured.'''
170+
171+
content = content.replace(old, new)
172+
with open(path, 'w') as f:
173+
f.write(content)
174+
print('INJECTED defect-6: removed isRemovableHeader guard from ext_authz header removal')
175+
"
176+
177+
echo "All 6 defects injected successfully"
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Code Review: Envoy Proxy HTTP Filter Chain
2+
3+
- **Repository**: envoyproxy/envoy
4+
- **Difficulty**: hard
5+
- **Category**: code-review
6+
- **Task Type**: repo-clone
7+
8+
## Description
9+
10+
You are reviewing a recently merged pull request that modifies Envoy's HTTP filter chain. The PR touches the fault injection filter, external authorization filter, header utility functions, and the core filter manager. The stated goal was to add observability improvements and fix edge cases in filter chain iteration, but several defects were introduced during the merge — both functional bugs and cross-component interaction errors.
11+
12+
Your task is to **find the defects, fix them in the code, and produce a structured review report**.
13+
14+
## Context
15+
16+
The changes span four core areas of Envoy's HTTP filter infrastructure:
17+
18+
1. **`source/extensions/filters/http/fault/fault_filter.cc`** — Fault injection filter: intercepts requests to inject delays and aborts based on configuration. Returns `StopIteration` to pause the filter chain during delay, then calls `continueDecoding()` when the timer fires. Manages an `active_faults_` gauge for tracking in-flight faults.
19+
20+
2. **`source/extensions/filters/http/ext_authz/ext_authz.cc`** — External authorization filter: sends requests to an external auth service, then modifies request headers based on the response. Clears the route cache when headers change so downstream filters see the updated route.
21+
22+
3. **`source/common/http/header_utility.cc`** — Header matching and validation utilities: provides `matchHeaders()` used by fault filter and RBAC, `checkRequiredResponseHeaders()` used by filter manager, and `isRemovableHeader()` used by ext_authz.
23+
24+
4. **`source/common/http/filter_manager.cc`** — Core filter chain orchestrator: iterates decoder/encoder filter chains, handles `StopIteration` vs `Continue` return values, checks required response headers after encoding, and manages the `decoder_filter_chain_aborted_` flag for local reply short-circuiting.
25+
26+
## Task
27+
28+
YOU MUST IMPLEMENT CODE CHANGES to complete this task.
29+
30+
Review the files listed above for the following types of defects:
31+
32+
- **Functional bugs**: Logic errors that cause incorrect behavior (e.g., inverted conditions, wrong return values, missing state updates).
33+
- **Cross-file interaction bugs**: Defects where a change in one file breaks assumptions in another file (e.g., header matching logic change affects which requests fault filter intercepts).
34+
- **Resource management bugs**: Gauge leaks, missing cleanup, or double-counting of active resources.
35+
36+
For each defect you find:
37+
38+
1. **Fix the code** by editing the affected file in `/workspace/`.
39+
2. **Record the defect** in your review report.
40+
41+
### Expected Output
42+
43+
After completing your review, write a JSON file at `/workspace/review.json` containing an array of defect objects:
44+
45+
```json
46+
[
47+
{
48+
"file": "source/extensions/filters/http/fault/fault_filter.cc",
49+
"line": 155,
50+
"severity": "critical",
51+
"description": "Brief description of what is wrong and why",
52+
"fix_applied": true
53+
}
54+
]
55+
```
56+
57+
Each entry must include:
58+
- `file` — Relative path from repository root
59+
- `line` — Approximate line number where the defect occurs
60+
- `severity` — One of: `critical`, `high`, `medium`, `low`
61+
- `description` — What the defect is and what impact it has
62+
- `fix_applied` — Boolean indicating whether you committed a fix
63+
64+
## Scoring
65+
66+
Your submission is scored on two equally weighted components:
67+
68+
1. **Detection score (50%)**: F1 score (harmonic mean of precision and recall) of your reported defects matched against the ground truth. A reported defect matches if it identifies the correct file and a related issue.
69+
2. **Fix score (50%)**: Proportion of defects where you both identified the issue and applied a correct code fix (verified by checking for expected code patterns in the modified files).
70+
71+
**Final score** = 0.5 × detection_F1 + 0.5 × fix_score
72+
73+
## Testing
74+
75+
- **Time limit**: 1200 seconds
76+
- Run `bash /workspace/tests/test.sh` to verify your changes
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
version = "1.0"
2+
[metadata]
3+
name = "cr-envoy-001"
4+
description = "Review an Envoy proxy PR for injected defects in the HTTP filter chain spanning fault injection, external authorization, header utilities, and filter manager"
5+
license = "Apache-2.0"
6+
7+
[task]
8+
id = "cr-envoy-001"
9+
repo = "envoy"
10+
category = "code-review"
11+
language = "cpp"
12+
difficulty = "hard"
13+
time_limit_sec = 1200
14+
15+
[verification]
16+
type = "test"
17+
command = "bash /tests/test.sh"
18+
19+
reward_type = "checklist"
20+
description = "F1 score for defect detection plus fix correctness"
21+
[environment]
22+
build_timeout_sec = 1800.0
23+
24+
[environment.setup_scripts]
25+
mcp_config = """#!/bin/bash
26+
# Setup Sourcegraph MCP if credentials provided
27+
if [ -n "$SOURCEGRAPH_ACCESS_TOKEN" ] && [ -n "$SOURCEGRAPH_URL" ]; then
28+
echo "Setting up Sourcegraph MCP configuration..."
29+
mkdir -p /root/.config/claude
30+
31+
cat > /root/.config/claude/mcp.json << 'EOF'
32+
{
33+
"mcpServers": {
34+
"sourcegraph": {
35+
"command": "npx",
36+
"args": ["-y", "@sourcegraph/mcp-server"],
37+
"env": {
38+
"SRC_ACCESS_TOKEN": "$SOURCEGRAPH_ACCESS_TOKEN",
39+
"SOURCEGRAPH_URL": "$SOURCEGRAPH_URL"
40+
}
41+
}
42+
}
43+
}
44+
EOF
45+
46+
echo "PASS MCP configuration created"
47+
else
48+
echo "No Sourcegraph credentials provided, MCP disabled"
49+
fi
50+
exit 0
51+
"""

0 commit comments

Comments
 (0)