Skip to content

Commit 92f7044

Browse files
committed
perf(diffctx): Arc<str> for Fragment::content + concurrent stdout drain in git
Reduces per-instance peak RSS ~140x on huggingface/transformers (22 GB → 153 MB) by sharing fragment content across pipeline stages via Arc<str> instead of cloning Strings. Also fixes a pipe-buffer deadlock in wait_with_timeout: stdout/stderr are now drained on background threads concurrently with the try_wait loop, rather than only after the child exits. Previously, any command whose output exceeded the 64 KB pipe buffer (e.g. git ls-files on large repos) would block until the global timeout. Also adds env-overridable BENCH_WORKERS / BENCH_BATCH_SIZE, a batched ProcessPoolExecutor that survives single-worker OOMs, and a Docker bench harness with sweep orchestrator and aggregation report.
1 parent 6e9ef40 commit 92f7044

16 files changed

Lines changed: 546 additions & 51 deletions

.dockerignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@ private/
3737
.zsh_history
3838
.sh_history
3939

40+
# Rust build artifacts (3.5G+)
41+
diffctx/target/
42+
target/
43+
44+
# Benchmark caches and results
45+
.cache/
46+
results/
47+
results_archive/
48+
4049
# Python artifacts
4150
__pycache__/
4251
*.py[cod]

Dockerfile.bench

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
FROM python:3.12-slim-bookworm AS rust-builder
2+
3+
RUN apt-get update && apt-get install -y --no-install-recommends \
4+
git build-essential pkg-config libssl-dev curl ca-certificates \
5+
&& rm -rf /var/lib/apt/lists/*
6+
7+
ENV RUSTUP_HOME=/usr/local/rustup CARGO_HOME=/usr/local/cargo PATH=/usr/local/cargo/bin:$PATH
8+
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
9+
sh -s -- -y --default-toolchain 1.92.0 --profile minimal
10+
11+
ENV VIRTUAL_ENV=/opt/venv
12+
RUN python3 -m venv $VIRTUAL_ENV
13+
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
14+
15+
RUN pip install --no-cache-dir "maturin>=1,<2"
16+
17+
WORKDIR /build/diffctx
18+
COPY diffctx/Cargo.toml diffctx/Cargo.lock diffctx/pyproject.toml ./
19+
COPY diffctx/src ./src
20+
COPY diffctx/python ./python
21+
22+
RUN maturin build --release --out /wheels
23+
24+
25+
FROM python:3.12-slim-bookworm AS runtime
26+
27+
RUN apt-get update && apt-get install -y --no-install-recommends \
28+
git ca-certificates \
29+
&& rm -rf /var/lib/apt/lists/*
30+
31+
ENV VIRTUAL_ENV=/opt/venv
32+
RUN python3 -m venv $VIRTUAL_ENV
33+
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
34+
35+
COPY --from=rust-builder /wheels/*.whl /tmp/wheels/
36+
RUN pip install --no-cache-dir /tmp/wheels/*.whl && rm -rf /tmp/wheels
37+
38+
WORKDIR /app
39+
COPY pyproject.toml README.md ./
40+
COPY src ./src
41+
COPY benchmarks ./benchmarks
42+
43+
RUN pip install --no-cache-dir -e ".[full]" && \
44+
pip install --no-cache-dir datasets rank-bm25
45+
46+
ENV CB_REPOS_DIR=/cache/contextbench_repos
47+
ENV PYTHONUNBUFFERED=1
48+
49+
VOLUME ["/cache/contextbench_repos", "/app/results"]
50+
51+
ENTRYPOINT ["python", "-m", "benchmarks"]
52+
CMD ["cb", "--help"]

benchmarks/common.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import uuid
1111
from pathlib import Path
1212

13-
WORKERS = 11
13+
WORKERS = int(os.environ.get("BENCH_WORKERS", "11"))
1414
RESULTS_DIR = Path("results")
1515

1616
LINES_RE = re.compile(r"^(\d+)-(\d+)$")
@@ -240,8 +240,10 @@ def parse_lines_field(lines_str: str) -> tuple[int, int] | None:
240240
def load_results(path: Path) -> list[dict]:
241241
data = json.loads(path.read_text())
242242
if isinstance(data, dict) and "results" in data:
243-
return data["results"]
244-
return data
243+
return list(data["results"])
244+
if isinstance(data, list):
245+
return data
246+
raise ValueError(f"unexpected results shape in {path}: {type(data).__name__}")
245247

246248

247249
def _git_commit_sha() -> str:
@@ -314,15 +316,24 @@ def _run_serial(worker_fn, run_args: list, collect: str) -> list:
314316

315317
def _run_pool(worker_fn, run_args: list, workers: int, collect: str) -> list:
316318
from concurrent.futures import ProcessPoolExecutor, as_completed
319+
from concurrent.futures.process import BrokenProcessPool
317320

321+
batch_size = int(os.environ.get("BENCH_BATCH_SIZE", str(max(workers * 4, 20))))
318322
results: list = []
319-
with ProcessPoolExecutor(max_workers=workers, initializer=_init_worker) as pool:
320-
futures = {pool.submit(worker_fn, a): a[0] for a in run_args}
321-
for future in as_completed(futures):
322-
try:
323-
_collect_result(results, future.result(), collect)
324-
except Exception as e:
325-
print(f" WORKER CRASH [{futures[future]}]: {type(e).__name__}: {e}", flush=True)
323+
for batch_start in range(0, len(run_args), batch_size):
324+
batch = run_args[batch_start : batch_start + batch_size]
325+
try:
326+
with ProcessPoolExecutor(max_workers=workers, initializer=_init_worker) as pool:
327+
futures = {pool.submit(worker_fn, a): a[0] for a in batch}
328+
for future in as_completed(futures):
329+
try:
330+
_collect_result(results, future.result(), collect)
331+
except BrokenProcessPool as e:
332+
print(f" WORKER CRASH [{futures[future]}]: {type(e).__name__}", flush=True)
333+
except Exception as e:
334+
print(f" WORKER CRASH [{futures[future]}]: {type(e).__name__}: {e}", flush=True)
335+
except BrokenProcessPool as e:
336+
print(f" POOL CRASH batch {batch_start}-{batch_start+len(batch)}: {e}", flush=True)
326337
return results
327338

328339

diffctx/src/fragmentation.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,12 @@ fn truncate_generated_fragments(file_frags: Vec<Fragment>) -> Vec<Fragment> {
148148
remaining
149149
);
150150
let new_end = frag.start_line() + max_lines - 1;
151+
let identifiers = extract_identifiers(&truncated_content, 2);
151152
Fragment {
152153
id: FragmentId::new(frag.id.path.clone(), frag.start_line(), new_end),
153154
kind: frag.kind,
154-
content: truncated_content.clone(),
155-
identifiers: extract_identifiers(&truncated_content, 2),
155+
content: Arc::from(truncated_content),
156+
identifiers,
156157
token_count: 0,
157158
symbol_name: frag.symbol_name,
158159
}
@@ -311,12 +312,13 @@ pub fn create_whole_file_fragment(
311312
let line_count = lines.len() as u32;
312313
let path_arc: Arc<str> = Arc::from(path.to_string_lossy().as_ref());
313314
let token_count = count_tokens(&content) + LIMITS.overhead_per_fragment;
315+
let identifiers = extract_identifiers(&content, 2);
314316

315317
Some(Fragment {
316318
id: FragmentId::new(path_arc, 1, line_count),
317319
kind: FragmentKind::Chunk,
318-
content: content.clone(),
319-
identifiers: extract_identifiers(&content, 2),
320+
content: Arc::from(content),
321+
identifiers,
320322
token_count,
321323
symbol_name: None,
322324
})

diffctx/src/git.rs

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -89,25 +89,25 @@ fn wait_with_timeout(
8989
_args: &[&str],
9090
) -> Result<std::process::Output> {
9191
let mut child = child;
92-
let start = std::time::Instant::now();
92+
let stdout_handle = child.stdout.take().map(|mut s| {
93+
std::thread::spawn(move || -> std::io::Result<Vec<u8>> {
94+
let mut buf = Vec::new();
95+
s.read_to_end(&mut buf)?;
96+
Ok(buf)
97+
})
98+
});
99+
let stderr_handle = child.stderr.take().map(|mut s| {
100+
std::thread::spawn(move || -> std::io::Result<Vec<u8>> {
101+
let mut buf = Vec::new();
102+
s.read_to_end(&mut buf)?;
103+
Ok(buf)
104+
})
105+
});
93106

94-
loop {
107+
let start = std::time::Instant::now();
108+
let status = loop {
95109
match child.try_wait() {
96-
Ok(Some(status)) => {
97-
let mut stdout = Vec::new();
98-
let mut stderr = Vec::new();
99-
if let Some(mut out) = child.stdout.take() {
100-
let _ = out.read_to_end(&mut stdout);
101-
}
102-
if let Some(mut err) = child.stderr.take() {
103-
let _ = err.read_to_end(&mut stderr);
104-
}
105-
return Ok(std::process::Output {
106-
status,
107-
stdout,
108-
stderr,
109-
});
110-
}
110+
Ok(Some(status)) => break status,
111111
Ok(None) => {
112112
if start.elapsed() >= timeout {
113113
let _ = child.kill();
@@ -118,7 +118,22 @@ fn wait_with_timeout(
118118
}
119119
Err(e) => return Err(GitError::Io(e)),
120120
}
121-
}
121+
};
122+
123+
let stdout = stdout_handle
124+
.and_then(|h| h.join().ok())
125+
.and_then(|r| r.ok())
126+
.unwrap_or_default();
127+
let stderr = stderr_handle
128+
.and_then(|h| h.join().ok())
129+
.and_then(|r| r.ok())
130+
.unwrap_or_default();
131+
132+
Ok(std::process::Output {
133+
status,
134+
stdout,
135+
stderr,
136+
})
122137
}
123138

124139
pub fn is_git_repo(path: &Path) -> bool {

diffctx/src/parsers/config_parser.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,12 @@ fn fragment_json(path: Arc<str>, content: &str) -> Vec<Fragment> {
8888
snippet.push('\n');
8989
}
9090

91+
let identifiers = extract_identifiers(&snippet, 2);
9192
fragments.push(Fragment {
9293
id: FragmentId::new(Arc::clone(&path), start_line, end_line),
9394
kind: FragmentKind::Chunk,
94-
content: snippet.clone(),
95-
identifiers: extract_identifiers(&snippet, 2),
95+
content: Arc::from(snippet),
96+
identifiers,
9697
token_count: 0,
9798
symbol_name: None,
9899
});
@@ -141,11 +142,12 @@ fn split_at_top_level_pattern(path: Arc<str>, content: &str, pattern: &Regex) ->
141142
snippet.push('\n');
142143
}
143144

145+
let identifiers = extract_identifiers(&snippet, 2);
144146
fragments.push(Fragment {
145147
id: FragmentId::new(Arc::clone(&path), start_line, end_line),
146148
kind: FragmentKind::Chunk,
147-
content: snippet.clone(),
148-
identifiers: extract_identifiers(&snippet, 2),
149+
content: Arc::from(snippet),
150+
identifiers,
149151
token_count: 0,
150152
symbol_name: None,
151153
});
@@ -176,11 +178,12 @@ fn make_single_fragment(path: Arc<str>, lines: &[&str]) -> Vec<Fragment> {
176178
snippet.push('\n');
177179
}
178180
let end_line = lines.len() as u32;
181+
let identifiers = extract_identifiers(&snippet, 2);
179182
vec![Fragment {
180183
id: FragmentId::new(path, 1, end_line),
181184
kind: FragmentKind::Chunk,
182-
content: snippet.clone(),
183-
identifiers: extract_identifiers(&snippet, 2),
185+
content: Arc::from(snippet),
186+
identifiers,
184187
token_count: 0,
185188
symbol_name: None,
186189
}]

diffctx/src/parsers/generic.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,13 @@ impl FragmentationStrategy for GenericStrategy {
3333
if !snippet.ends_with('\n') {
3434
snippet.push('\n');
3535
}
36+
let identifiers = extract_identifiers(&snippet, 2);
3637

3738
fragments.push(Fragment {
3839
id: FragmentId::new(Arc::clone(&path), start_line, end_line),
3940
kind: FragmentKind::Chunk,
40-
content: snippet.clone(),
41-
identifiers: extract_identifiers(&snippet, 2),
41+
content: Arc::from(snippet),
42+
identifiers,
4243
token_count: 0,
4344
symbol_name: None,
4445
});

diffctx/src/parsers/markdown.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,12 @@ impl FragmentationStrategy for MarkdownStrategy {
7373
snippet.push('\n');
7474
}
7575

76+
let identifiers = extract_identifiers(&snippet, 2);
7677
fragments.push(Fragment {
7778
id: FragmentId::new(Arc::clone(&path), start_line, end_line),
7879
kind: FragmentKind::Section,
79-
content: snippet.clone(),
80-
identifiers: extract_identifiers(&snippet, 2),
80+
content: Arc::from(snippet),
81+
identifiers,
8182
token_count: 0,
8283
symbol_name: None,
8384
});

diffctx/src/parsers/mod.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,12 @@ fn create_code_gap_fragments(
119119
continue;
120120
}
121121
if let Some(snippet) = create_snippet(lines, start, end) {
122+
let identifiers = crate::types::extract_identifiers(&snippet, 2);
122123
fragments.push(Fragment {
123124
id: crate::types::FragmentId::new(Arc::clone(&path), start, end),
124125
kind: crate::types::FragmentKind::Chunk,
125-
content: snippet.clone(),
126-
identifiers: crate::types::extract_identifiers(&snippet, 2),
126+
content: Arc::from(snippet),
127+
identifiers,
127128
token_count: 0,
128129
symbol_name: None,
129130
});

diffctx/src/parsers/tree_sitter_strategy.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,11 +1178,12 @@ fn create_and_append_fragment(
11781178
Some(s) => s,
11791179
None => return false,
11801180
};
1181+
let identifiers = extract_identifiers(&snippet, 2);
11811182
fragments.push(Fragment {
11821183
id: FragmentId::new(Arc::clone(path), start, end),
11831184
kind: FragmentKind::from_str(kind),
1184-
content: snippet.clone(),
1185-
identifiers: extract_identifiers(&snippet, 2),
1185+
content: Arc::from(snippet),
1186+
identifiers,
11861187
token_count: 0,
11871188
symbol_name: sym_name.map(|s| s.to_string()),
11881189
});
@@ -1318,11 +1319,12 @@ fn try_container_split(
13181319
}
13191320
let header_end = first_child_start - 1;
13201321
if let Some(snippet) = create_snippet(lines, start, header_end) {
1322+
let identifiers = extract_identifiers(&snippet, 2);
13211323
fragments.push(Fragment {
13221324
id: FragmentId::new(Arc::clone(path), start, header_end),
13231325
kind: FragmentKind::from_str(kind),
1324-
content: snippet.clone(),
1325-
identifiers: extract_identifiers(&snippet, 2),
1326+
content: Arc::from(snippet),
1327+
identifiers,
13261328
token_count: 0,
13271329
symbol_name: sym_name.map(|s| s.to_string()),
13281330
});

0 commit comments

Comments
 (0)