7272MAX_TOKENS = 16384
7373MAX_TOOL_CALLS = 40
7474TOOL_TIMEOUT_SEC = 30
75- TASK_TIMEOUT_SEC = 300 # 5 minutes per task
75+ TASK_TIMEOUT_SEC = 600 # 10 minutes per task (Deep Search calls take 30-60s each)
7676DEFAULT_CACHE_DIR = Path .home () / ".cache" / "ccb_repos"
7777
7878BACKENDS = ("local" , "deepsearch" , "hybrid" )
@@ -920,8 +920,9 @@ def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
9209203. Use search_imports to trace import/dependency chains.
9219214. Use grep/rg for exact string matches of identifiers.
9229225. Use find_symbols to locate definitions.
923- 6. Read candidate files to verify relevance.
924- 7. Be THOROUGH — recall matters more than precision for oracle generation.
923+ 6. Read candidate files to confirm they would need code changes (not just context).
924+ 7. Be PRECISE — only include files that would appear in the fix patch (PR diff). \
925+ Do not include files you read for understanding but that don't need changes.
9259268. For multi-repo tasks, search ALL repos, not just the most obvious one.
926927""" ,
927928
@@ -939,7 +940,8 @@ def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
9399402. Use deep_search for broad semantic exploration first.
9409413. Use sourcegraph_search for precise keyword/identifier matches.
9419424. Vary your queries — try different terms, packages, file patterns.
942- 5. Be THOROUGH — recall matters more than precision for oracle generation.
943+ 5. Be PRECISE — only include files that would appear in the fix patch (PR diff). \
944+ Do not include files you found during search that don't need code changes.
943945""" ,
944946
945947 "hybrid" : """\
@@ -971,24 +973,46 @@ def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
971973 local findings should be checked for completeness via search.
9729746. If local search finds nothing, ALWAYS try Sourcegraph search before \
973975 concluding a file doesn't exist. The file may be in a repo not cloned locally.
974- 7. Be THOROUGH — recall matters more than precision for oracle generation.
976+ 7. Be PRECISE — only include files that would appear in the fix patch (PR diff). \
977+ Do not include files you read for context that don't need code changes.
9759788. For multi-repo tasks, search ALL repos — including repos you might not \
976979 have locally. Use sourcegraph_search to find files across ALL indexed repos.
977- 9. Stay FOCUSED on the specific task question. Include only files directly \
978- relevant to answering the task. Do not include every tangentially related file.
979980""" ,
980981}
981982
982983SYSTEM_PROMPT_SUFFIX = """
983- ## Precision Guidelines
984- - Include ONLY source files that would need to be **read or modified** to address the task.
985- - Do NOT include test files, documentation, or configuration files unless the task \
986- explicitly asks about testing, docs, or configuration.
987- - Do NOT include files that merely import or reference the relevant code — only files \
988- that contain the logic central to the task.
989- - When in doubt, ask: "Would a developer need to open this file to fix/understand the issue?" \
990- If no, exclude it.
991- - Aim for 1-5 files for simple bugs, 3-10 for moderate tasks, 10+ only for large refactors.
984+ ## Evaluation Rubric — What "Relevant Files" Means
985+ You are evaluated against the set of files that would appear in the **actual fix \
986+ patch** (the git diff). This means:
987+
988+ - Include ONLY files that would need **code changes** (additions, modifications, or \
989+ deletions) to address the task. These are the files that would appear in a pull \
990+ request diff.
991+ - Do NOT include files you would merely **read** for context or understanding. Reading \
992+ a file to trace a call chain is part of your research process, but only report files \
993+ that need changes in your final answer.
994+ - Do NOT include callers, importers, or dependents of the changed code unless they \
995+ themselves require modification (e.g., updating an API call signature).
996+
997+ ## File Category Rules
998+ - **Source code**: Include if it needs code changes. Exclude if it only provides context.
999+ - **Test files**: Include ONLY if the tests themselves need modification (e.g., adding \
1000+ new test cases, updating assertions). Do NOT include tests that merely validate the fix.
1001+ - **Documentation**: Include ONLY if docs need updating (e.g., CHANGELOG, man pages, \
1002+ API docs). Most bug fixes do not require doc changes.
1003+ - **Configuration/build files**: Include ONLY if they need changes (e.g., CMakeLists.txt \
1004+ for new source files, package.json for new deps). Most fixes do not touch these.
1005+ - **Type definition files** (.d.ts, .h headers): Include if type signatures change.
1006+
1007+ ## Size Calibration
1008+ Over 60% of bug fix tasks require changes to only **1 file**. Be conservative:
1009+ - Simple bugs: 1 file (most common)
1010+ - Moderate bugs: 1-3 files
1011+ - Multi-component changes: 3-5 files
1012+ - Large refactors or features: 5-10+ files
1013+
1014+ When in doubt, ask: "Would this file appear in the PR diff?" If no, exclude it.
1015+ Err on the side of fewer, more precise predictions rather than broad coverage.
9921016
9931017## Output
9941018When you have identified all relevant files, output a JSON object with:
@@ -1017,8 +1041,9 @@ def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
10171041CLI_SYSTEM_PROMPTS = {
10181042 "local" : """\
10191043 You are a code context retrieval specialist. Given a task description and \
1020- local repository paths, identify ALL source files, symbols, and dependency \
1021- chains relevant to answering the task.
1044+ local repository paths, identify the source files that would need to be \
1045+ **modified** to address the task — i.e., files that would appear in the \
1046+ fix patch / pull request diff.
10221047
10231048You have access to LOCAL TOOLS ONLY:
10241049- Bash (read-only): run grep, rg (ripgrep), find, ls, wc, and other read-only shell commands
@@ -1032,60 +1057,114 @@ def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
103210571. Read the task carefully. Identify key entities (packages, types, functions).
103310582. Start broad: use Glob and Bash (find/ls) to understand repo structure.
103410593. Use Grep to trace import/dependency chains and find symbol definitions.
1035- 4. Read candidate files with Read to verify relevance.
1036- 5. Be THOROUGH — recall matters more than precision for oracle generation.
1060+ 4. Read candidate files with Read to confirm they contain code that needs changing.
1061+ 5. Be PRECISE — report only files that need code changes, not files you read \
1062+ for context. Most bugs require changing only 1 file.
103710636. For multi-repo tasks, search ALL repos, not just the most obvious one.
10381064""" ,
10391065
10401066 "deepsearch" : """\
10411067 You are a code context retrieval specialist. Given a task description, \
1042- identify ALL source files, symbols, and dependency chains relevant to \
1043- answering the task using Sourcegraph search tools .
1068+ identify the source files that would need to be **modified** to address \
1069+ the task — i.e., files that would appear in the fix patch / pull request diff .
10441070
10451071You have access to SEARCH TOOLS:
1046- - mcp__sourcegraph__sg_nls_search: AI-powered semantic code search
1072+ - Bash: run Deep Search via the ds_wrapper.sh script (see below)
10471073- mcp__sourcegraph__sg_keyword_search: keyword/regex code search with filters
10481074
1075+ ## Deep Search (MUST USE)
1076+ Deep Search is an AI-powered semantic code search that understands code \
1077+ structure, relationships, and natural language queries. It returns detailed \
1078+ answers with file paths and code explanations.
1079+
1080+ To run Deep Search, use Bash:
1081+ ```
1082+ bash /home/stephanie_jarmak/CodeContextBench/scripts/ds_wrapper.sh "your question here"
1083+ ```
1084+
1085+ CRITICAL: Always include the repository name in your Deep Search query so it \
1086+ searches the correct codebase. Use the repo names from the "Repositories" \
1087+ section below. Examples:
1088+ - "What files implement RBAC escalation checks in kubernetes/kubernetes?"
1089+ - "Where is the ProducerBatch class defined in apache/kafka?"
1090+ - "How does the xDS protocol work in envoyproxy/envoy?"
1091+
1092+ Deep Search takes 30-60 seconds to complete. Use it for:
1093+ - Broad exploration: "What files implement X in repo Y?"
1094+ - Architecture questions: "How does the RBAC system work in repo Y?"
1095+ - Dependency tracing: "What files depend on package X in repo Y?"
1096+ - Finding implementations: "Where is interface Y implemented in repo Z?"
1097+
10491098## Strategy
105010991. Read the task carefully. Identify key entities and concepts.
1051- 2. Use mcp__sourcegraph__sg_nls_search for broad semantic exploration first.
1052- 3. Use mcp__sourcegraph__sg_keyword_search for precise keyword/identifier matches.
1100+ 2. ALWAYS start with Deep Search for broad semantic exploration. Include the \
1101+ repo name in every query.
1102+ 3. Use mcp__sourcegraph__sg_keyword_search for precise identifier matches.
105311034. Vary your queries — try different terms, packages, file patterns.
1054- 5. Be THOROUGH — recall matters more than precision for oracle generation.
1104+ 5. Be PRECISE — report only files that need code changes, not files you found \
1105+ during search that are merely related. Most bugs require changing only 1 file.
10551106""" ,
10561107
10571108 "hybrid" : """\
10581109 You are a code context retrieval specialist. Given a task description and \
1059- local repository paths, identify ALL source files, symbols, and dependency \
1060- chains relevant to answering the task.
1110+ local repository paths, identify the source files that would need to be \
1111+ **modified** to address the task — i.e., files that would appear in the \
1112+ fix patch / pull request diff.
10611113
1062- You have access to BOTH local tools AND Sourcegraph search :
1114+ You have access to BOTH local tools AND Sourcegraph Deep Search :
10631115
10641116Local tools:
1065- - Bash (read-only): run grep, rg (ripgrep), find, ls, and other read-only shell commands
1117+ - Bash: run grep, rg (ripgrep), find, ls, and other read-only shell commands. \
1118+ Also used to invoke Deep Search (see below).
10661119- Read: read source file contents
10671120- Glob: find files by glob patterns
10681121- Grep: search file contents with regex patterns
10691122
10701123Sourcegraph tools:
1071- - mcp__sourcegraph__sg_nls_search : AI-powered semantic code search
1124+ - **Deep Search** (MUST USE FIRST) : AI-powered semantic code search via Bash
10721125- mcp__sourcegraph__sg_keyword_search: keyword/regex code search with filters
10731126
1127+ ## Deep Search (MUST USE FIRST)
1128+ Deep Search is an AI-powered semantic code search that understands code \
1129+ structure, relationships, and natural language queries. It returns detailed \
1130+ answers with file paths and code explanations. You MUST run at least one \
1131+ Deep Search query before using other tools.
1132+
1133+ To run Deep Search, use Bash:
1134+ ```
1135+ bash /home/stephanie_jarmak/CodeContextBench/scripts/ds_wrapper.sh "your question here"
1136+ ```
1137+
1138+ CRITICAL: Always include the repository name in your Deep Search query so it \
1139+ searches the correct codebase. Use the repo names from the "Repositories" \
1140+ section below. Examples:
1141+ - "What files implement RBAC escalation checks in kubernetes/kubernetes?"
1142+ - "Where is the ProducerBatch class defined in apache/kafka?"
1143+ - "How does the xDS protocol work in envoyproxy/envoy?"
1144+
1145+ Deep Search takes 30-60 seconds to complete. Use it for:
1146+ - Broad exploration: "What files implement X in repo Y?"
1147+ - Architecture questions: "How does the RBAC system work in repo Y?"
1148+ - Dependency tracing: "What files depend on package X in repo Y?"
1149+ - Finding implementations: "Where is interface Y implemented in repo Z?"
1150+
10741151## Strategy
107511521. Read the task carefully. Identify key entities and concepts.
1076- 2. ALWAYS start with Sourcegraph search to discover relevant files and repos — \
1077- even if local repos are available. The local repo set may be INCOMPLETE.
1153+ 2. ALWAYS start with Deep Search to discover relevant files and repos — \
1154+ even if local repos are available. The local repo set may be INCOMPLETE. \
1155+ Include the repo name in every Deep Search query. \
1156+ Run: bash /home/stephanie_jarmak/CodeContextBench/scripts/ds_wrapper.sh "your question about repo/name"
107811573. Use local Grep/Bash (rg) to verify and refine findings against actual files.
1079- 4. Use Grep for import tracing and symbol definitions .
1158+ 4. Use mcp__sourcegraph__sg_keyword_search for precise identifier matches .
108011595. Cross-check: anything found by search should be verified locally, and \
10811160 local findings should be checked for completeness via search.
1082- 6. If local search finds nothing, ALWAYS try Sourcegraph search before \
1083- concluding a file doesn't exist.
1084- 7. Be THOROUGH — recall matters more than precision for oracle generation.
1161+ 6. If local search finds nothing, ALWAYS try Deep Search or keyword search \
1162+ before concluding a file doesn't exist.
1163+ 7. Be PRECISE — report only files that need code changes, not files you read \
1164+ for context. Most bugs require changing only 1 file. Do not include callers, \
1165+ importers, or related modules unless they themselves need modification.
108511668. For multi-repo tasks, search ALL repos — including repos you might not \
10861167 have locally.
1087- 9. Stay FOCUSED on the specific task question. Include only files directly \
1088- relevant to answering the task.
10891168""" ,
10901169}
10911170
@@ -1111,17 +1190,18 @@ def run_agent_cli(
11111190
11121191 # -- Determine allowed tools based on backend --
11131192 local_tools = ["Bash(read-only:true)" , "Read" , "Glob" , "Grep" ]
1193+ # Deep Search needs Bash (not read-only) to call ds_wrapper.sh
1194+ ds_tools = ["Bash" , "Read" , "Glob" , "Grep" ]
11141195 sg_tools = [
11151196 "mcp__sourcegraph__sg_keyword_search" ,
1116- "mcp__sourcegraph__sg_nls_search" ,
11171197 ]
11181198
11191199 if backend == "local" :
11201200 allowed_tools = local_tools
11211201 elif backend == "deepsearch" :
1122- allowed_tools = sg_tools
1202+ allowed_tools = [ "Bash" ] + sg_tools
11231203 else : # hybrid
1124- allowed_tools = local_tools + sg_tools
1204+ allowed_tools = ds_tools + sg_tools
11251205
11261206 # -- Write system prompt to temp file (avoids ARG_MAX) --
11271207 sys_prompt_file = tempfile .NamedTemporaryFile (
@@ -1153,8 +1233,8 @@ def run_agent_cli(
11531233 mcp_file .close ()
11541234 mcp_config_path = mcp_file .name
11551235 else :
1156- log .warning ("SOURCEGRAPH_ACCESS_TOKEN not set; SG tools unavailable" )
1157- # Fall back to local-only tools
1236+ log .warning ("SOURCEGRAPH_ACCESS_TOKEN not set; SG/Deep Search tools unavailable" )
1237+ # Fall back to local-only tools (no Deep Search without token)
11581238 allowed_tools = local_tools
11591239
11601240 # -- Build CLI command --
@@ -1172,6 +1252,11 @@ def run_agent_cli(
11721252
11731253 # -- Environment: unset CLAUDECODE to avoid nesting detection --
11741254 env = {k : v for k , v in os .environ .items () if k != "CLAUDECODE" }
1255+ # Copy SOURCEGRAPH_ACCESS_TOKEN into SRC_ACCESS_TOKEN for ds_wrapper.sh (Deep Search) when SRC_ACCESS_TOKEN is unset
1256+ if "SRC_ACCESS_TOKEN" not in env :
1257+ sg_token = env .get ("SOURCEGRAPH_ACCESS_TOKEN" , "" )
1258+ if sg_token :
1259+ env ["SRC_ACCESS_TOKEN" ] = sg_token
11751260
11761261 if verbose :
11771262 log .info ("CLI cmd: claude -p <user_msg> --model %s --allowedTools %s" ,
@@ -1408,17 +1493,38 @@ def build_user_message(
14081493 if check_types :
14091494 parts .append (f"\n ## Expected Output Types\n { ', ' .join (check_types )} " )
14101495
1411- # Available repos
1412- parts .append ("\n ## Available Local Repositories" )
1496+ # Available repos — extract clean repo names for Deep Search queries
1497+ repo_names = set ()
1498+ parts .append ("\n ## Repositories" )
14131499 if repo_paths :
14141500 for name , path in sorted (repo_paths .items ()):
14151501 parts .append (f"- **{ name } **: `{ path } `" )
1502+ # Derive a short repo name for Deep Search: drop the "sg-evals/" prefix and any "--<version>" suffix
1503+ # e.g. "sg-evals/kubernetes--v1.32.0" -> "kubernetes" (the upstream org is not recovered)
1504+ # e.g. "django/django" -> "django/django"
1505+ clean = name .replace ("sg-evals/" , "" )
1506+ # Strip version suffix: "kubernetes--v1.32.0" -> "kubernetes"
1507+ if "--" in clean :
1508+ clean = clean .split ("--" )[0 ]
1509+ repo_names .add (clean )
14161510 else :
14171511 parts .append ("- *(none cloned locally)*" )
1512+
1513+ # Also collect org/repo names from github.com URLs in the task instruction and seed prompt
1514+ instruction = ctx .get ("instruction" , "" ) + " " + ctx .get ("seed_prompt" , "" )
1515+ for pattern in re .findall (r'github\.com/([a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+)' , instruction ):
1516+ repo_names .add (pattern .rstrip ("." ))
1517+
1518+ if repo_names :
1519+ parts .append (
1520+ f"\n **Use these repo names in Deep Search queries**: "
1521+ f"{ ', ' .join (sorted (repo_names ))} "
1522+ )
1523+
14181524 parts .append (
14191525 "\n **IMPORTANT**: The repos listed above may NOT be complete. "
14201526 "The task may involve additional repositories not cloned locally. "
1421- "You MUST use sourcegraph_search or deep_search to discover files "
1527+ "Use Deep Search and keyword search to discover files "
14221528 "in repositories beyond the local set. Do not assume local repos are exhaustive."
14231529 )
14241530
0 commit comments