Skip to content

Commit afab367

Browse files
committed
feat(diffctx): adaptive scoring modes (auto/precise/discover)
Add a `--scoring` flag with three modes:
- auto (default): adapts to repo size (PPR for small repos, EgoGraph for large ones)
- precise: PPR scoring; best precision on small repos
- discover: EgoGraph 2-hop BFS; 52% nontrivial recall on ContextBench

PipelineConfig centralizes all algorithm parameters per mode. Auto mode switches from PPR to EgoGraph when the fragment count exceeds a threshold (300).

YAML tests: 83.7% avg, 1403 passed (auto → PPR for small test repos). Benchmarks with discover: 52% CB (ContextBench) nontrivial recall, 25% LOO recall.
1 parent c1bb9ac commit afab367

5 files changed

Lines changed: 85 additions & 4 deletions

File tree

src/treemapper/cli.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,12 @@ def _build_main_parser() -> argparse.ArgumentParser:
311311
metavar="F",
312312
help="Minimum relevance to include a fragment (default: 0.08, lower = more context)",
313313
)
314+
diff_group.add_argument(
315+
"--scoring",
316+
choices=["auto", "precise", "discover"],
317+
default="auto",
318+
help="Scoring mode: auto (adapts to repo size), precise (PPR, best for small repos), discover (ego-graph, best for cross-file)",
319+
)
314320
diff_group.add_argument(
315321
"--full",
316322
action="store_true",

src/treemapper/diffctx/mode.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from enum import Enum
5+
6+
7+
class ScoringMode(Enum):
    """Scoring modes selectable by the user (the values match the ``--scoring`` choices)."""

    AUTO = "auto"
    PRECISE = "precise"
    DISCOVER = "discover"


# Fragment count above which AUTO mode uses the DISCOVER preset instead of PRECISE.
_AUTO_LARGE_REPO_THRESHOLD = 300


@dataclass(frozen=True)
class PipelineConfig:
    """Immutable bundle of algorithm parameters derived from a :class:`ScoringMode`.

    Build instances with :meth:`from_mode` rather than spelling the fields out,
    so that every mode keeps a single, consistent preset.
    """

    # Discovery strategy selector: "default" or "ensemble".
    discovery: str
    # Scoring strategy selector: "ppr" or "ego".
    scoring: str
    # True for the PPR preset, False for the ego-graph preset.
    # NOTE(review): the consumer of this flag is not visible from this module.
    low_relevance: bool
    # ``top_k`` for BM25 discovery; 0 in the preset that does not use the ensemble.
    bm25_top_k: int
    # Traversal depth for ego-graph scoring.
    ego_depth: int
    # Alpha parameter for PPR scoring.
    ppr_alpha: float

    @classmethod
    def from_mode(
        cls,
        mode: ScoringMode | str,
        n_fragments: int = 0,
        *,
        large_repo_threshold: int = _AUTO_LARGE_REPO_THRESHOLD,
    ) -> PipelineConfig:
        """Return the parameter preset for *mode*.

        Args:
            mode: A :class:`ScoringMode` member or its string value
                (``"auto"``, ``"precise"``, ``"discover"``).
            n_fragments: Number of fragments known so far; only consulted in
                ``AUTO`` mode.
            large_repo_threshold: ``AUTO`` picks the ``DISCOVER`` preset when
                ``n_fragments`` is strictly greater than this value, otherwise
                the ``PRECISE`` preset.

        Raises:
            ValueError: If *mode* is not a valid scoring mode.
        """
        resolved = ScoringMode(mode)  # accepts both members and raw string values
        if resolved is ScoringMode.AUTO:
            # AUTO is not a third preset: it defers to one of the explicit modes.
            is_large = n_fragments > large_repo_threshold
            resolved = ScoringMode.DISCOVER if is_large else ScoringMode.PRECISE

        if resolved is ScoringMode.DISCOVER:
            return cls(
                discovery="ensemble",
                scoring="ego",
                low_relevance=False,
                bm25_top_k=1,
                ego_depth=2,
                ppr_alpha=0.60,
            )
        return cls(
            discovery="default",
            scoring="ppr",
            low_relevance=True,
            bm25_top_k=0,
            ego_depth=1,
            ppr_alpha=0.60,
        )

src/treemapper/diffctx/pipeline.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,19 @@
1616
from .file_importance import compute_file_importance
1717
from .fragmentation import _process_files_for_fragments
1818
from .git import CatFileBatch, GitError, split_diff_range
19+
from .mode import PipelineConfig, ScoringMode
1920
from .postpass import _coherence_post_pass, _ensure_changed_files_represented
2021
from .render import build_diff_context_output
21-
from .scoring import DiscoveryContext, EgoGraphScoring, EnsembleDiscovery, PPRScoring, ScoringStrategy
22+
from .scoring import (
23+
BM25Discovery,
24+
DefaultDiscovery,
25+
DiscoveryContext,
26+
DiscoveryStrategy,
27+
EgoGraphScoring,
28+
EnsembleDiscovery,
29+
PPRScoring,
30+
ScoringStrategy,
31+
)
2232
from .select import lazy_greedy_select
2333
from .signatures import _generate_signature_variants
2434
from .types import Fragment, FragmentId
@@ -151,6 +161,12 @@ def _log_ppr_mode(
151161
)
152162

153163

164+
def _create_discovery(config: PipelineConfig) -> DiscoveryStrategy:
    """Build the discovery strategy selected by ``config.discovery``.

    ``"ensemble"`` combines the default discovery with BM25 discovery (using
    ``config.bm25_top_k``); any other value yields the default discovery alone.
    """
    default_strategy = DefaultDiscovery()
    if config.discovery != "ensemble":
        return default_strategy
    bm25_strategy = BM25Discovery(top_k=config.bm25_top_k)
    return EnsembleDiscovery([default_strategy, bm25_strategy])
168+
169+
154170
def _empty_tree(root_dir: Path) -> dict[str, Any]:
155171
return {
156172
"name": root_dir.name,
@@ -171,6 +187,7 @@ def build_diff_context(
171187
no_default_ignores: bool = False,
172188
full: bool = False,
173189
whitelist_file: Path | None = None,
190+
scoring_mode: str = "auto",
174191
) -> dict[str, Any]:
175192
_validate_inputs(root_dir, alpha, tau, budget_tokens)
176193
root_dir = root_dir.resolve()
@@ -217,6 +234,9 @@ def build_diff_context(
217234
except (OSError, UnicodeDecodeError):
218235
continue
219236

237+
mode = ScoringMode(os.environ.get("DIFFCTX_SCORING", scoring_mode))
238+
config = PipelineConfig.from_mode(mode, n_fragments=len(all_fragments))
239+
220240
discovery_ctx = DiscoveryContext(
221241
root_dir=root_dir,
222242
changed_files=changed_files,
@@ -225,8 +245,7 @@ def build_diff_context(
225245
expansion_concepts=frozenset(expansion_concepts),
226246
file_cache=file_cache,
227247
)
228-
discovery_strategy = EnsembleDiscovery()
229-
discovered_files = discovery_strategy.discover(discovery_ctx)
248+
discovered_files = _create_discovery(config).discover(discovery_ctx)
230249
discovered_files = [_normalize_path(p, root_dir) for p in discovered_files]
231250
all_fragments.extend(
232251
_process_files_for_fragments(discovered_files, root_dir, preferred_revs, seen_frag_ids, batch_reader)
@@ -275,7 +294,9 @@ def build_diff_context(
275294
hunks=hunks,
276295
repo_root=root_dir,
277296
seed_weights=seed_weights,
278-
scoring_strategy=EgoGraphScoring() if os.environ.get("DIFFCTX_SCORING") == "ego" else PPRScoring(alpha=alpha),
297+
scoring_strategy=(
298+
EgoGraphScoring(max_depth=config.ego_depth) if config.scoring == "ego" else PPRScoring(alpha=config.ppr_alpha)
299+
),
279300
discovered_paths=set(discovered_files),
280301
)
281302
effective_budget = budget_tokens if budget_tokens is not None else _UNLIMITED_BUDGET

src/treemapper/treemapper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def _build_diff_tree(args: ParsedArgs) -> dict[str, Any]:
3232
no_default_ignores=args.no_default_ignores,
3333
full=args.full_diff,
3434
whitelist_file=args.whitelist_file,
35+
scoring_mode=getattr(args, "scoring", "auto"),
3536
)
3637
except GitError as e:
3738
logger.error("%s", e)

whitelist_vulture.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
)
2828
from treemapper.diffctx.graph import Graph
2929
from treemapper.diffctx.graph_analytics import QuotientNode
30+
from treemapper.diffctx.mode import PipelineConfig, ScoringMode
3031
from treemapper.diffctx.project_graph import ProjectGraph
3132
from treemapper.diffctx.scoring import EgoGraphScoring
3233
from treemapper.diffctx.tokenizer import detect_profile, is_nlp_available
@@ -63,4 +64,6 @@
6364
SqlEdgeBuilder
6465
ZigEdgeBuilder
6566
blast_radius = graph_analytics.blast_radius
67+
ScoringMode.AUTO
68+
PipelineConfig.low_relevance
6669
EgoGraphScoring

0 commit comments

Comments
 (0)