fix(java): add progressive fallback for token limit in code context extraction

HeshamHM28 · claude · HeshamHM28 · commit 501efb5da875 · 2026-02-13T00:43:35.000Z
Previously, get_code_optimization_context_for_language() would raise a
hard ValueError when the extracted code context exceeded the 16,000
token limit, causing 93% of Java functions in large projects to fail
optimization. This was because Java's helper traversal (max_depth=2)
pulls in transitive dependencies, and type skeleton wrapping adds all
class fields and constructors.

This commit adds a 4-stage progressive fallback strategy:
1. Full context (all helpers, Javadoc intact)
2. Remove cross-file helpers (keep same-file helpers only)
3. Strip Javadoc comments from all code
4. Remove all helpers (target code only)

Each stage is tried in order until the token limit is satisfied, with
debug logging when a fallback is used. The same fallback applies
independently to both optim and testgen token limits.

Also extracts the code string building logic into a reusable
_build_code_strings_for_language() helper and adds a _strip_javadoc_comments()
utility for removing /** ... */ blocks while preserving other comments.

Co-Authored-By: Claude Opus 4.6 &lt;noreply@anthropic.com&gt;
diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py
@@ -208,39 +208,38 @@ def get_code_optimization_context(
     )
 
 
-def get_code_optimization_context_for_language(
-    function_to_optimize: FunctionToOptimize,
-    project_root_path: Path,
-    optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
-    testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
-) -> CodeOptimizationContext:
-    """Extract code optimization context for non-Python languages.
+def _strip_javadoc_comments(source: str) -> str:
+    """Strip Javadoc (/** ... */) comments from Java source code.
 
-    Uses the language support abstraction to extract code context and converts
-    it to the CodeOptimizationContext format expected by the pipeline.
+    Preserves single-line comments (//) and regular block comments (/* ... */).
+    """
+    import re
 
-    This function supports multi-file context extraction, grouping helpers by file
-    and creating proper CodeStringsMarkdown with file paths for multi-file replacement.
+    return re.sub(r"/\*\*.*?\*/\s*", "", source, flags=re.DOTALL)
+
+
+def _build_code_strings_for_language(
+    code_context,
+    function_to_optimize: FunctionToOptimize,
+    project_root_path: Path,
+    include_cross_file_helpers: bool = True,
+    strip_javadoc: bool = False,
+    include_same_file_helpers: bool = True,
+) -> tuple[list[CodeString], list[FunctionSource], str]:
+    """Build CodeString list from a CodeContext with configurable reduction.
 
     Args:
-        function_to_optimize: The function to extract context for.
+        code_context: CodeContext from language support.
+        function_to_optimize: The target function.
         project_root_path: Root of the project.
-        optim_token_limit: Token limit for optimization context.
-        testgen_token_limit: Token limit for testgen context.
+        include_cross_file_helpers: Whether to include helpers from other files.
+        strip_javadoc: Whether to strip Javadoc comments from all code.
+        include_same_file_helpers: Whether to include same-file helper methods.
 
     Returns:
-        CodeOptimizationContext with target code and dependencies.
+        Tuple of (code_strings, helper_function_sources, read_only_context).
 
     """
-    from codeflash.languages import get_language_support
-
-    # Get language support for this function
-    language = Language(function_to_optimize.language)
-    lang_support = get_language_support(language)
-
-    # Extract code context using language support
-    code_context = lang_support.extract_code_context(function_to_optimize, project_root_path, project_root_path)
-
     # Build imports string if available
     imports_code = "\n".join(code_context.imports) if code_context.imports else ""
 
@@ -251,82 +250,194 @@ def get_code_optimization_context_for_language(
         target_relative_path = function_to_optimize.file_path
 
     # Group helpers by file path
-    helpers_by_file: dict[Path, list[HelperFunction]] = defaultdict(list)
+    helpers_by_file: dict[Path, list] = defaultdict(list)
     helper_function_sources = []
 
     for helper in code_context.helper_functions:
         helpers_by_file[helper.file_path].append(helper)
 
         # Convert to FunctionSource for pipeline compatibility
-        helper_function_sources.append(
-            FunctionSource(
-                file_path=helper.file_path,
-                qualified_name=helper.qualified_name,
-                fully_qualified_name=helper.qualified_name,
-                only_function_name=helper.name,
-                source_code=helper.source_code,
-                jedi_definition=None,
-            )
+        should_include = (
+            (helper.file_path == function_to_optimize.file_path and include_same_file_helpers)
+            or (helper.file_path != function_to_optimize.file_path and include_cross_file_helpers)
         )
+        if should_include:
+            helper_function_sources.append(
+                FunctionSource(
+                    file_path=helper.file_path,
+                    qualified_name=helper.qualified_name,
+                    fully_qualified_name=helper.qualified_name,
+                    only_function_name=helper.name,
+                    source_code=helper.source_code,
+                    jedi_definition=None,
+                )
+            )
 
-    # Build read-writable code (target file + same-file helpers + global variables)
-    read_writable_code_strings = []
+    # Build read-writable code (target file + same-file helpers)
+    code_strings = []
 
     # Combine target code with same-file helpers
     target_file_code = code_context.target_code
-    same_file_helpers = helpers_by_file.get(function_to_optimize.file_path, [])
-    if same_file_helpers:
-        helper_code = "\n\n".join(h.source_code for h in same_file_helpers)
-        target_file_code = target_file_code + "\n\n" + helper_code
-
-    # Note: code_context.read_only_context contains type definitions and global variables
-    # These should be passed as read-only context to the AI, not prepended to the target code
-    # If prepended to target code, the AI treats them as code to optimize and includes them in output
+    if include_same_file_helpers:
+        same_file_helpers = helpers_by_file.get(function_to_optimize.file_path, [])
+        if same_file_helpers:
+            helper_code = "\n\n".join(h.source_code for h in same_file_helpers)
+            target_file_code = target_file_code + "\n\n" + helper_code
 
     # Add imports to target file code
     if imports_code:
         target_file_code = imports_code + "\n\n" + target_file_code
 
-    read_writable_code_strings.append(
+    if strip_javadoc:
+        target_file_code = _strip_javadoc_comments(target_file_code)
+
+    code_strings.append(
         CodeString(code=target_file_code, file_path=target_relative_path, language=function_to_optimize.language)
     )
 
     # Add helper files (cross-file helpers)
-    for file_path, file_helpers in helpers_by_file.items():
-        if file_path == function_to_optimize.file_path:
-            continue  # Already included in target file
+    if include_cross_file_helpers:
+        for file_path, file_helpers in helpers_by_file.items():
+            if file_path == function_to_optimize.file_path:
+                continue  # Already included in target file
 
-        try:
-            helper_relative_path = file_path.resolve().relative_to(project_root_path.resolve())
-        except ValueError:
-            helper_relative_path = file_path
+            try:
+                helper_relative_path = file_path.resolve().relative_to(project_root_path.resolve())
+            except ValueError:
+                helper_relative_path = file_path
+
+            combined_helper_code = "\n\n".join(h.source_code for h in file_helpers)
+            if strip_javadoc:
+                combined_helper_code = _strip_javadoc_comments(combined_helper_code)
+
+            code_strings.append(
+                CodeString(
+                    code=combined_helper_code,
+                    file_path=helper_relative_path,
+                    language=function_to_optimize.language,
+                )
+            )
 
-        # Combine all helpers from this file
-        combined_helper_code = "\n\n".join(h.source_code for h in file_helpers)
+    read_only_context = code_context.read_only_context
+    if strip_javadoc and read_only_context:
+        read_only_context = _strip_javadoc_comments(read_only_context)
 
-        read_writable_code_strings.append(
-            CodeString(
-                code=combined_helper_code, file_path=helper_relative_path, language=function_to_optimize.language
-            )
+    return code_strings, helper_function_sources, read_only_context
+
+
+def get_code_optimization_context_for_language(
+    function_to_optimize: FunctionToOptimize,
+    project_root_path: Path,
+    optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
+    testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
+) -> CodeOptimizationContext:
+    """Extract code optimization context for non-Python languages.
+
+    Uses the language support abstraction to extract code context and converts
+    it to the CodeOptimizationContext format expected by the pipeline.
+
+    This function supports multi-file context extraction, grouping helpers by file
+    and creating proper CodeStringsMarkdown with file paths for multi-file replacement.
+
+    Applies progressive fallback when token limits are exceeded:
+    1. Full context (all helpers, Javadoc intact)
+    2. Remove cross-file helpers
+    3. Strip Javadoc comments
+    4. Remove all helpers (target code only)
+
+    Args:
+        function_to_optimize: The function to extract context for.
+        project_root_path: Root of the project.
+        optim_token_limit: Token limit for optimization context.
+        testgen_token_limit: Token limit for testgen context.
+
+    Returns:
+        CodeOptimizationContext with target code and dependencies.
+
+    """
+    from codeflash.languages import get_language_support
+
+    # Get language support for this function
+    language = Language(function_to_optimize.language)
+    lang_support = get_language_support(language)
+
+    # Extract code context using language support
+    code_context = lang_support.extract_code_context(function_to_optimize, project_root_path, project_root_path)
+
+    # Progressive fallback strategies, ordered from most to least context
+    fallback_strategies = [
+        {"include_cross_file_helpers": True, "strip_javadoc": False, "include_same_file_helpers": True},
+        {"include_cross_file_helpers": False, "strip_javadoc": False, "include_same_file_helpers": True},
+        {"include_cross_file_helpers": False, "strip_javadoc": True, "include_same_file_helpers": True},
+        {"include_cross_file_helpers": False, "strip_javadoc": True, "include_same_file_helpers": False},
+    ]
+
+    fallback_descriptions = [
+        "full context",
+        "without cross-file helpers",
+        "without cross-file helpers and Javadoc",
+        "target code only (no helpers, no Javadoc)",
+    ]
+
+    code_strings = None
+    helper_function_sources = None
+    read_only_context = None
+
+    for i, strategy in enumerate(fallback_strategies):
+        code_strings, helper_function_sources, read_only_context = _build_code_strings_for_language(
+            code_context, function_to_optimize, project_root_path, **strategy
         )
 
+        read_writable_code = CodeStringsMarkdown(
+            code_strings=code_strings, language=function_to_optimize.language
+        )
+        read_writable_tokens = encoded_tokens_len(read_writable_code.markdown)
+
+        if read_writable_tokens <= optim_token_limit:
+            if i > 0:
+                logger.debug(
+                    "Code context exceeded token limit, using fallback: %s (%d tokens)",
+                    fallback_descriptions[i],
+                    read_writable_tokens,
+                )
+            break
+    else:
+        raise ValueError("Read-writable code has exceeded token limit even after removing all helpers and Javadoc")
+
     read_writable_code = CodeStringsMarkdown(
-        code_strings=read_writable_code_strings, language=function_to_optimize.language
+        code_strings=code_strings, language=function_to_optimize.language
     )
 
-    # Build testgen context (same as read_writable for non-Python)
+    # Build testgen context with its own progressive fallback
+    # Start from the same strategy level that worked for optim
+    testgen_code_strings = code_strings
+    testgen_helpers = helper_function_sources
+
     testgen_context = CodeStringsMarkdown(
-        code_strings=read_writable_code_strings.copy(), language=function_to_optimize.language
+        code_strings=testgen_code_strings.copy(), language=function_to_optimize.language
     )
-
-    # Check token limits
-    read_writable_tokens = encoded_tokens_len(read_writable_code.markdown)
-    if read_writable_tokens > optim_token_limit:
-        raise ValueError("Read-writable code has exceeded token limit, cannot proceed")
-
     testgen_tokens = encoded_tokens_len(testgen_context.markdown)
+
     if testgen_tokens > testgen_token_limit:
-        raise ValueError("Testgen code context has exceeded token limit, cannot proceed")
+        # Try remaining fallback strategies for testgen
+        for j in range(i + 1, len(fallback_strategies)):
+            testgen_code_strings, testgen_helpers, read_only_context = _build_code_strings_for_language(
+                code_context, function_to_optimize, project_root_path, **fallback_strategies[j]
+            )
+            testgen_context = CodeStringsMarkdown(
+                code_strings=testgen_code_strings.copy(), language=function_to_optimize.language
+            )
+            testgen_tokens = encoded_tokens_len(testgen_context.markdown)
+
+            if testgen_tokens <= testgen_token_limit:
+                logger.debug(
+                    "Testgen context exceeded token limit, using fallback: %s (%d tokens)",
+                    fallback_descriptions[j],
+                    testgen_tokens,
+                )
+                break
+        else:
+            raise ValueError("Testgen code context has exceeded token limit even after removing all helpers and Javadoc")
 
     # Generate code hash from all read-writable code
     code_hash = hashlib.sha256(read_writable_code.flat.encode("utf-8")).hexdigest()
@@ -336,7 +447,7 @@ def get_code_optimization_context_for_language(
         read_writable_code=read_writable_code,
         # Pass type definitions and globals as read-only context for the AI
         # This way the AI sees them as context but doesn't include them in optimized output
-        read_only_context_code=code_context.read_only_context,
+        read_only_context_code=read_only_context,
         hashing_code_context=read_writable_code.flat,
         hashing_code_context_hash=code_hash,
         helper_functions=helper_function_sources,