Skip to content

Commit 501efb5

Browse files
HeshamHM28 and claude committed
fix(java): add progressive fallback for token limit in code context extraction
Previously, get_code_optimization_context_for_language() raised a hard ValueError whenever the extracted code context exceeded the 16,000-token limit, causing 93% of Java functions in large projects to fail optimization. The context grows this large because Java's helper traversal (max_depth=2) pulls in transitive dependencies, and type skeleton wrapping adds all class fields and constructors.

This commit adds a 4-stage progressive fallback strategy:

1. Full context (all helpers, Javadoc intact)
2. Remove cross-file helpers (keep same-file helpers only)
3. Strip Javadoc comments from all code
4. Remove all helpers (target code only)

Each stage is tried in order until the token limit is satisfied, with debug logging when a fallback is used. The testgen token limit is then checked separately: if the context that satisfied the optim limit is still too large for testgen, the remaining stages are tried for the testgen context.

Also extracts the code string building logic into a reusable _build_code_strings_for_language() helper and adds a _strip_javadoc_comments() utility for removing /** ... */ blocks while preserving other comments.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c11afd3 commit 501efb5

1 file changed

Lines changed: 180 additions & 69 deletions

File tree

codeflash/context/code_context_extractor.py

Lines changed: 180 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -208,39 +208,38 @@ def get_code_optimization_context(
208208
)
209209

210210

211-
def get_code_optimization_context_for_language(
212-
function_to_optimize: FunctionToOptimize,
213-
project_root_path: Path,
214-
optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
215-
testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
216-
) -> CodeOptimizationContext:
217-
"""Extract code optimization context for non-Python languages.
211+
def _strip_javadoc_comments(source: str) -> str:
212+
"""Strip Javadoc (/** ... */) comments from Java source code.
218213
219-
Uses the language support abstraction to extract code context and converts
220-
it to the CodeOptimizationContext format expected by the pipeline.
214+
Preserves single-line comments (//) and regular block comments (/* ... */).
215+
"""
216+
import re
221217

222-
This function supports multi-file context extraction, grouping helpers by file
223-
and creating proper CodeStringsMarkdown with file paths for multi-file replacement.
218+
return re.sub(r"/\*\*.*?\*/\s*", "", source, flags=re.DOTALL)
219+
220+
221+
def _build_code_strings_for_language(
222+
code_context,
223+
function_to_optimize: FunctionToOptimize,
224+
project_root_path: Path,
225+
include_cross_file_helpers: bool = True,
226+
strip_javadoc: bool = False,
227+
include_same_file_helpers: bool = True,
228+
) -> tuple[list[CodeString], list[FunctionSource], str]:
229+
"""Build CodeString list from a CodeContext with configurable reduction.
224230
225231
Args:
226-
function_to_optimize: The function to extract context for.
232+
code_context: CodeContext from language support.
233+
function_to_optimize: The target function.
227234
project_root_path: Root of the project.
228-
optim_token_limit: Token limit for optimization context.
229-
testgen_token_limit: Token limit for testgen context.
235+
include_cross_file_helpers: Whether to include helpers from other files.
236+
strip_javadoc: Whether to strip Javadoc comments from all code.
237+
include_same_file_helpers: Whether to include same-file helper methods.
230238
231239
Returns:
232-
CodeOptimizationContext with target code and dependencies.
240+
Tuple of (code_strings, helper_function_sources, read_only_context).
233241
234242
"""
235-
from codeflash.languages import get_language_support
236-
237-
# Get language support for this function
238-
language = Language(function_to_optimize.language)
239-
lang_support = get_language_support(language)
240-
241-
# Extract code context using language support
242-
code_context = lang_support.extract_code_context(function_to_optimize, project_root_path, project_root_path)
243-
244243
# Build imports string if available
245244
imports_code = "\n".join(code_context.imports) if code_context.imports else ""
246245

@@ -251,82 +250,194 @@ def get_code_optimization_context_for_language(
251250
target_relative_path = function_to_optimize.file_path
252251

253252
# Group helpers by file path
254-
helpers_by_file: dict[Path, list[HelperFunction]] = defaultdict(list)
253+
helpers_by_file: dict[Path, list] = defaultdict(list)
255254
helper_function_sources = []
256255

257256
for helper in code_context.helper_functions:
258257
helpers_by_file[helper.file_path].append(helper)
259258

260259
# Convert to FunctionSource for pipeline compatibility
261-
helper_function_sources.append(
262-
FunctionSource(
263-
file_path=helper.file_path,
264-
qualified_name=helper.qualified_name,
265-
fully_qualified_name=helper.qualified_name,
266-
only_function_name=helper.name,
267-
source_code=helper.source_code,
268-
jedi_definition=None,
269-
)
260+
should_include = (
261+
(helper.file_path == function_to_optimize.file_path and include_same_file_helpers)
262+
or (helper.file_path != function_to_optimize.file_path and include_cross_file_helpers)
270263
)
264+
if should_include:
265+
helper_function_sources.append(
266+
FunctionSource(
267+
file_path=helper.file_path,
268+
qualified_name=helper.qualified_name,
269+
fully_qualified_name=helper.qualified_name,
270+
only_function_name=helper.name,
271+
source_code=helper.source_code,
272+
jedi_definition=None,
273+
)
274+
)
271275

272-
# Build read-writable code (target file + same-file helpers + global variables)
273-
read_writable_code_strings = []
276+
# Build read-writable code (target file + same-file helpers)
277+
code_strings = []
274278

275279
# Combine target code with same-file helpers
276280
target_file_code = code_context.target_code
277-
same_file_helpers = helpers_by_file.get(function_to_optimize.file_path, [])
278-
if same_file_helpers:
279-
helper_code = "\n\n".join(h.source_code for h in same_file_helpers)
280-
target_file_code = target_file_code + "\n\n" + helper_code
281-
282-
# Note: code_context.read_only_context contains type definitions and global variables
283-
# These should be passed as read-only context to the AI, not prepended to the target code
284-
# If prepended to target code, the AI treats them as code to optimize and includes them in output
281+
if include_same_file_helpers:
282+
same_file_helpers = helpers_by_file.get(function_to_optimize.file_path, [])
283+
if same_file_helpers:
284+
helper_code = "\n\n".join(h.source_code for h in same_file_helpers)
285+
target_file_code = target_file_code + "\n\n" + helper_code
285286

286287
# Add imports to target file code
287288
if imports_code:
288289
target_file_code = imports_code + "\n\n" + target_file_code
289290

290-
read_writable_code_strings.append(
291+
if strip_javadoc:
292+
target_file_code = _strip_javadoc_comments(target_file_code)
293+
294+
code_strings.append(
291295
CodeString(code=target_file_code, file_path=target_relative_path, language=function_to_optimize.language)
292296
)
293297

294298
# Add helper files (cross-file helpers)
295-
for file_path, file_helpers in helpers_by_file.items():
296-
if file_path == function_to_optimize.file_path:
297-
continue # Already included in target file
299+
if include_cross_file_helpers:
300+
for file_path, file_helpers in helpers_by_file.items():
301+
if file_path == function_to_optimize.file_path:
302+
continue # Already included in target file
298303

299-
try:
300-
helper_relative_path = file_path.resolve().relative_to(project_root_path.resolve())
301-
except ValueError:
302-
helper_relative_path = file_path
304+
try:
305+
helper_relative_path = file_path.resolve().relative_to(project_root_path.resolve())
306+
except ValueError:
307+
helper_relative_path = file_path
308+
309+
combined_helper_code = "\n\n".join(h.source_code for h in file_helpers)
310+
if strip_javadoc:
311+
combined_helper_code = _strip_javadoc_comments(combined_helper_code)
312+
313+
code_strings.append(
314+
CodeString(
315+
code=combined_helper_code,
316+
file_path=helper_relative_path,
317+
language=function_to_optimize.language,
318+
)
319+
)
303320

304-
# Combine all helpers from this file
305-
combined_helper_code = "\n\n".join(h.source_code for h in file_helpers)
321+
read_only_context = code_context.read_only_context
322+
if strip_javadoc and read_only_context:
323+
read_only_context = _strip_javadoc_comments(read_only_context)
306324

307-
read_writable_code_strings.append(
308-
CodeString(
309-
code=combined_helper_code, file_path=helper_relative_path, language=function_to_optimize.language
310-
)
325+
return code_strings, helper_function_sources, read_only_context
326+
327+
328+
def get_code_optimization_context_for_language(
329+
function_to_optimize: FunctionToOptimize,
330+
project_root_path: Path,
331+
optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
332+
testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
333+
) -> CodeOptimizationContext:
334+
"""Extract code optimization context for non-Python languages.
335+
336+
Uses the language support abstraction to extract code context and converts
337+
it to the CodeOptimizationContext format expected by the pipeline.
338+
339+
This function supports multi-file context extraction, grouping helpers by file
340+
and creating proper CodeStringsMarkdown with file paths for multi-file replacement.
341+
342+
Applies progressive fallback when token limits are exceeded:
343+
1. Full context (all helpers, Javadoc intact)
344+
2. Remove cross-file helpers
345+
3. Strip Javadoc comments
346+
4. Remove all helpers (target code only)
347+
348+
Args:
349+
function_to_optimize: The function to extract context for.
350+
project_root_path: Root of the project.
351+
optim_token_limit: Token limit for optimization context.
352+
testgen_token_limit: Token limit for testgen context.
353+
354+
Returns:
355+
CodeOptimizationContext with target code and dependencies.
356+
357+
"""
358+
from codeflash.languages import get_language_support
359+
360+
# Get language support for this function
361+
language = Language(function_to_optimize.language)
362+
lang_support = get_language_support(language)
363+
364+
# Extract code context using language support
365+
code_context = lang_support.extract_code_context(function_to_optimize, project_root_path, project_root_path)
366+
367+
# Progressive fallback strategies, ordered from most to least context
368+
fallback_strategies = [
369+
{"include_cross_file_helpers": True, "strip_javadoc": False, "include_same_file_helpers": True},
370+
{"include_cross_file_helpers": False, "strip_javadoc": False, "include_same_file_helpers": True},
371+
{"include_cross_file_helpers": False, "strip_javadoc": True, "include_same_file_helpers": True},
372+
{"include_cross_file_helpers": False, "strip_javadoc": True, "include_same_file_helpers": False},
373+
]
374+
375+
fallback_descriptions = [
376+
"full context",
377+
"without cross-file helpers",
378+
"without cross-file helpers and Javadoc",
379+
"target code only (no helpers, no Javadoc)",
380+
]
381+
382+
code_strings = None
383+
helper_function_sources = None
384+
read_only_context = None
385+
386+
for i, strategy in enumerate(fallback_strategies):
387+
code_strings, helper_function_sources, read_only_context = _build_code_strings_for_language(
388+
code_context, function_to_optimize, project_root_path, **strategy
311389
)
312390

391+
read_writable_code = CodeStringsMarkdown(
392+
code_strings=code_strings, language=function_to_optimize.language
393+
)
394+
read_writable_tokens = encoded_tokens_len(read_writable_code.markdown)
395+
396+
if read_writable_tokens <= optim_token_limit:
397+
if i > 0:
398+
logger.debug(
399+
"Code context exceeded token limit, using fallback: %s (%d tokens)",
400+
fallback_descriptions[i],
401+
read_writable_tokens,
402+
)
403+
break
404+
else:
405+
raise ValueError("Read-writable code has exceeded token limit even after removing all helpers and Javadoc")
406+
313407
read_writable_code = CodeStringsMarkdown(
314-
code_strings=read_writable_code_strings, language=function_to_optimize.language
408+
code_strings=code_strings, language=function_to_optimize.language
315409
)
316410

317-
# Build testgen context (same as read_writable for non-Python)
411+
# Build testgen context with its own progressive fallback
412+
# Start from the same strategy level that worked for optim
413+
testgen_code_strings = code_strings
414+
testgen_helpers = helper_function_sources
415+
318416
testgen_context = CodeStringsMarkdown(
319-
code_strings=read_writable_code_strings.copy(), language=function_to_optimize.language
417+
code_strings=testgen_code_strings.copy(), language=function_to_optimize.language
320418
)
321-
322-
# Check token limits
323-
read_writable_tokens = encoded_tokens_len(read_writable_code.markdown)
324-
if read_writable_tokens > optim_token_limit:
325-
raise ValueError("Read-writable code has exceeded token limit, cannot proceed")
326-
327419
testgen_tokens = encoded_tokens_len(testgen_context.markdown)
420+
328421
if testgen_tokens > testgen_token_limit:
329-
raise ValueError("Testgen code context has exceeded token limit, cannot proceed")
422+
# Try remaining fallback strategies for testgen
423+
for j in range(i + 1, len(fallback_strategies)):
424+
testgen_code_strings, testgen_helpers, read_only_context = _build_code_strings_for_language(
425+
code_context, function_to_optimize, project_root_path, **fallback_strategies[j]
426+
)
427+
testgen_context = CodeStringsMarkdown(
428+
code_strings=testgen_code_strings.copy(), language=function_to_optimize.language
429+
)
430+
testgen_tokens = encoded_tokens_len(testgen_context.markdown)
431+
432+
if testgen_tokens <= testgen_token_limit:
433+
logger.debug(
434+
"Testgen context exceeded token limit, using fallback: %s (%d tokens)",
435+
fallback_descriptions[j],
436+
testgen_tokens,
437+
)
438+
break
439+
else:
440+
raise ValueError("Testgen code context has exceeded token limit even after removing all helpers and Javadoc")
330441

331442
# Generate code hash from all read-writable code
332443
code_hash = hashlib.sha256(read_writable_code.flat.encode("utf-8")).hexdigest()
@@ -336,7 +447,7 @@ def get_code_optimization_context_for_language(
336447
read_writable_code=read_writable_code,
337448
# Pass type definitions and globals as read-only context for the AI
338449
# This way the AI sees them as context but doesn't include them in optimized output
339-
read_only_context_code=code_context.read_only_context,
450+
read_only_context_code=read_only_context,
340451
hashing_code_context=read_writable_code.flat,
341452
hashing_code_context_hash=code_hash,
342453
helper_functions=helper_function_sources,

0 commit comments

Comments
 (0)