Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 180 additions & 69 deletions codeflash/context/code_context_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,39 +208,38 @@ def get_code_optimization_context(
)


def get_code_optimization_context_for_language(
function_to_optimize: FunctionToOptimize,
project_root_path: Path,
optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
) -> CodeOptimizationContext:
"""Extract code optimization context for non-Python languages.
def _strip_javadoc_comments(source: str) -> str:
"""Strip Javadoc (/** ... */) comments from Java source code.

Uses the language support abstraction to extract code context and converts
it to the CodeOptimizationContext format expected by the pipeline.
Preserves single-line comments (//) and regular block comments (/* ... */).
"""
import re

This function supports multi-file context extraction, grouping helpers by file
and creating proper CodeStringsMarkdown with file paths for multi-file replacement.
return re.sub(r"/\*\*.*?\*/\s*", "", source, flags=re.DOTALL)


Comment on lines +216 to +220
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚡️Codeflash found 24% (0.24x) speedup for _strip_javadoc_comments in codeflash/context/code_context_extractor.py

⏱️ Runtime : 1.78 milliseconds 1.43 milliseconds (best of 241 runs)

📝 Explanation and details

The optimized code achieves a 24% speedup by replacing regex-based pattern matching with a manual string scanning approach using Python's built-in str.find() method.

Key Optimization:
The original implementation uses re.sub(r"/\*\*.*?\*/\s*", "", source, flags=re.DOTALL) which incurs significant overhead from:

  1. Regex compilation and pattern matching engine
  2. The re.DOTALL flag enabling . to match newlines
  3. Non-greedy matching (.*?) which requires backtracking

The optimized version eliminates this overhead by:

  1. Using str.find("/**") to locate Javadoc starts - a simple C-level string search
  2. Using str.find("*/", idx + 3) to find the closing delimiter
  3. Manually scanning whitespace with s[j].isspace() instead of regex \s*
  4. Building the result with string slicing and "".join(parts)

Performance Characteristics:

  • Small inputs (single/few Javadocs): 50-100% faster due to avoiding regex overhead entirely
  • Medium inputs (dozens of Javadocs): 30-70% faster, benefiting from simpler string operations
  • Large Javadocs (thousands of characters): Up to 796% faster on very large single comments, as str.find() is more efficient than regex backtracking
  • Many Javadocs (hundreds/thousands): Shows some regression (30-50% slower) because the manual loop has more overhead per iteration than regex's optimized matching, but the overall 24% improvement indicates this case is less common in real workloads

Trade-offs:
The optimization performs exceptionally well when Javadoc comments contain large amounts of text, or when there are relatively few comments to process. The slight regression on inputs with hundreds of consecutive small Javadocs is offset by dramatic gains on large comments and typical real-world Java source files with moderate documentation density.

The line profiler shows the optimized version spends most time in str.find() calls (32.1% combined) and whitespace scanning (28.7%), which are still faster than the regex engine's 97.3% time consumption in the original.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 54 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Click to see Generated Regression Tests
import pytest  # used for our unit tests
from codeflash.context.code_context_extractor import _strip_javadoc_comments

def test_remove_simple_javadoc():
    # Simple Javadoc comment before a class should be removed entirely along with following whitespace/newline.
    source = "/** Simple comment */\npublic class A {}"
    # After stripping, the Javadoc and its trailing newline/space should be gone.
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 4.00μs -> 2.47μs (61.8% faster)

def test_preserve_single_line_and_regular_block_comments():
    # Ensure that single-line (//) comments and regular block comments (/* ... */) are preserved,
    # while Javadoc (/** ... */) is removed.
    source = (
        "int x = 0; // single-line comment\n"
        "/* regular block comment */\n"
        "/** Javadoc to remove */\n"
        "int y = 1;"
    )
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 3.86μs -> 2.39μs (61.1% faster)
    # Regular comments remain; Javadoc is removed along with the newline trailing it.
    expected = "int x = 0; // single-line comment\n/* regular block comment */\nint y = 1;"

def test_remove_multiline_javadoc_with_leading_stars():
    # Javadoc often has stars on each line; these should be removed entirely.
    source = "/**\n * This is a Javadoc\n * spanning multiple lines\n */\nvoid m(){}"
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 3.85μs -> 2.24μs (72.0% faster)

def test_javadoc_with_comment_like_sequences_inside():
    # Javadoc might contain sequences that look like other comments; they must be swallowed as part of the Javadoc.
    source = "/** Contains /* and // inside the Javadoc */\nint z;"
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 3.71μs -> 2.30μs (61.3% faster)

def test_no_javadoc_returns_same_string():
    # If there is no Javadoc comment, the source should remain exactly the same.
    source = "/* not a javadoc */\n// just a line\nclass C {}"
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 2.66μs -> 1.28μs (107% faster)

def test_empty_string_returns_empty():
    # Empty input should return empty output.
    codeflash_output = _strip_javadoc_comments("") # 2.56μs -> 1.25μs (104% faster)

def test_none_raises_type_error():
    # Passing a non-string (None) should raise a TypeError due to re.sub expecting a string.
    with pytest.raises(TypeError):
        _strip_javadoc_comments(None) # 5.35μs -> 2.34μs (129% faster)

def test_javadoc_at_end_of_file_removes_trailing_whitespace():
    # Javadoc at EOF should be removed and trailing whitespace after it trimmed.
    source = "class C{}\n/** End comment */   "
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 3.54μs -> 2.58μs (37.2% faster)

def test_javadoc_adjacent_to_code_no_unwanted_space_removal():
    # When Javadoc is directly adjacent to code (no intervening whitespace), removal should not delete code tokens.
    source = "int a = 0;/**doc*/int b = 1;"
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 3.58μs -> 2.29μs (56.3% faster)

def test_large_scale_many_javadocs():
    # Build a large source string containing 1000 Javadoc comments interleaved with code lines.
    n = 1000  # scale up to 1000 as required
    # Each block: code line, then a Javadoc comment, then a newline.
    source = "".join(f"code{i};\n/** Comment {i} */\n" for i in range(n))
    # Expected result: each code line remains followed by its newline; Javadocs and the newline immediately after them are removed.
    expected = "".join(f"code{i};\n" for i in range(n))
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 235μs -> 367μs (36.0% slower)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
import re

# imports
import pytest
from codeflash.context.code_context_extractor import _strip_javadoc_comments

def test_single_javadoc_comment_simple():
    """Test removal of a simple single-line Javadoc comment."""
    source = "/** This is a Javadoc comment */ public class Foo {}"
    expected = "public class Foo {}"
    codeflash_output = _strip_javadoc_comments(source) # 4.21μs -> 2.53μs (66.1% faster)

def test_multiline_javadoc_comment():
    """Test removal of a multi-line Javadoc comment."""
    source = """/**
 * This is a multi-line Javadoc comment
 * with multiple lines
 */ public class Foo {}"""
    expected = "public class Foo {}"
    codeflash_output = _strip_javadoc_comments(source) # 4.20μs -> 2.52μs (66.8% faster)

def test_javadoc_with_trailing_whitespace():
    """Test that trailing whitespace after Javadoc is removed."""
    source = "/** Comment */   public void method() {}"
    expected = "public void method() {}"
    codeflash_output = _strip_javadoc_comments(source) # 3.68μs -> 2.47μs (49.1% faster)

def test_preserves_single_line_comments():
    """Test that single-line comments (//) are preserved."""
    source = "// This is a comment\npublic class Foo {}"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 2.78μs -> 1.30μs (114% faster)

def test_preserves_block_comments():
    """Test that regular block comments (/* ... */) are preserved."""
    source = "/* This is a block comment */ public class Foo {}"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 2.72μs -> 1.30μs (110% faster)

def test_multiple_javadoc_comments():
    """Test removal of multiple Javadoc comments in sequence."""
    source = "/** Comment 1 */ public class Foo { /** Comment 2 */ public void bar() {} }"
    expected = "public class Foo { public void bar() {} }"
    codeflash_output = _strip_javadoc_comments(source) # 4.02μs -> 3.10μs (29.7% faster)

def test_javadoc_before_class():
    """Test typical Javadoc comment before class declaration."""
    source = """/**
 * Main application class
 */
public class Application {}"""
    expected = "\npublic class Application {}"
    codeflash_output = _strip_javadoc_comments(source) # 3.67μs -> 2.37μs (54.7% faster)

def test_javadoc_before_method():
    """Test typical Javadoc comment before method declaration."""
    source = """/**
 * Processes data
 * @param data the input data
 * @return processed result
 */
public String process(String data) {}"""
    expected = "\npublic String process(String data) {}"
    codeflash_output = _strip_javadoc_comments(source) # 4.25μs -> 2.39μs (77.7% faster)

def test_code_between_javadoc_comments():
    """Test that code between multiple Javadoc comments is preserved."""
    source = "/** Doc 1 */ int x = 5; /** Doc 2 */ int y = 10;"
    expected = " int x = 5;  int y = 10;"
    codeflash_output = _strip_javadoc_comments(source) # 3.80μs -> 2.88μs (32.2% faster)

def test_javadoc_with_special_characters():
    """Test Javadoc comments containing special characters."""
    source = "/** Comment with @param, @return, and <html> tags */ class Foo {}"
    expected = " class Foo {}"
    codeflash_output = _strip_javadoc_comments(source) # 3.82μs -> 2.27μs (67.9% faster)

def test_empty_string():
    """Test with empty string input."""
    source = ""
    expected = ""
    codeflash_output = _strip_javadoc_comments(source) # 2.60μs -> 1.25μs (108% faster)

def test_only_javadoc_comment():
    """Test with only a Javadoc comment and nothing else."""
    source = "/** Just a comment */"
    expected = ""
    codeflash_output = _strip_javadoc_comments(source) # 3.31μs -> 1.93μs (71.6% faster)

def test_javadoc_with_newlines():
    """Test Javadoc comment spanning many lines with various newline styles."""
    source = "/**\n\n\n* Multiple newlines\n\n*/"
    expected = ""
    codeflash_output = _strip_javadoc_comments(source) # 3.37μs -> 1.89μs (78.0% faster)

def test_nested_asterisks_in_javadoc():
    """Test Javadoc containing nested asterisks."""
    source = "/** Comment with ** asterisks ** inside */ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.80μs -> 2.31μs (64.6% faster)

def test_javadoc_with_url():
    """Test Javadoc containing URL with slashes and asterisks."""
    source = "/** See https://example.com for details */ class Foo {}"
    expected = " class Foo {}"
    codeflash_output = _strip_javadoc_comments(source) # 3.68μs -> 2.33μs (58.1% faster)

def test_incomplete_javadoc_start():
    """Test that incomplete Javadoc start (/** without closing */) is not matched."""
    source = "/** Incomplete comment\nclass Foo {}"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 3.48μs -> 1.74μs (99.6% faster)

def test_incomplete_javadoc_end():
    """Test that incomplete Javadoc end (*/ without /** start) is not matched."""
    source = "class Foo {} Incomplete end */ of comment"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 2.67μs -> 1.32μs (102% faster)

def test_javadoc_only_opening():
    """Test with only /** opening with no closing."""
    source = "/** opening only"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 3.23μs -> 1.67μs (93.7% faster)

def test_single_slash_star_not_javadoc():
    """Test that single /* is not treated as Javadoc."""
    source = "/* This is not javadoc */ code"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 2.71μs -> 1.23μs (120% faster)

def test_javadoc_with_tabs():
    """Test Javadoc comment with tab characters."""
    source = "/**\t\t* Comment with tabs\t*/ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.61μs -> 2.34μs (54.0% faster)

def test_javadoc_adjacent_to_code_no_space():
    """Test Javadoc immediately adjacent to code without spaces."""
    source = "/**Doc*/code"
    expected = "code"
    codeflash_output = _strip_javadoc_comments(source) # 3.27μs -> 2.16μs (51.4% faster)

def test_multiple_consecutive_javadoc():
    """Test multiple consecutive Javadoc comments."""
    source = "/** Doc 1 */ /** Doc 2 */ /** Doc 3 */ code"
    expected = "  code"
    codeflash_output = _strip_javadoc_comments(source) # 3.88μs -> 3.17μs (22.3% faster)

def test_javadoc_with_closing_brace_inside():
    """Test Javadoc comment containing closing braces."""
    source = "/** Comment with } and { braces */ class Foo {}"
    expected = " class Foo {}"
    codeflash_output = _strip_javadoc_comments(source) # 3.55μs -> 2.23μs (59.4% faster)

def test_javadoc_with_regex_characters():
    """Test Javadoc containing regex special characters."""
    source = "/** Comment with [abc], (x|y), and . characters */ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.69μs -> 2.24μs (64.8% faster)

def test_single_character_javadoc():
    """Test Javadoc comment with only a single character."""
    source = "/** a */ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.27μs -> 2.15μs (52.0% faster)

def test_javadoc_with_only_whitespace():
    """Test Javadoc comment containing only whitespace."""
    source = "/**     */ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.34μs -> 2.11μs (58.8% faster)

def test_javadoc_with_newline_and_spaces():
    """Test Javadoc followed by newline and spaces."""
    source = "/** Doc */\n   \npublic class Foo {}"
    expected = "\n   \npublic class Foo {}"
    codeflash_output = _strip_javadoc_comments(source) # 3.43μs -> 2.44μs (40.4% faster)

def test_line_comment_looks_like_javadoc_start():
    """Test line comment containing /** pattern (should be preserved)."""
    source = "// This comment has /** in it\ncode"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 3.42μs -> 1.85μs (84.4% faster)

def test_block_comment_with_javadoc_inside():
    """Test regular block comment that contains /** (should be preserved)."""
    source = "/* This looks like /** but is not */ code"
    expected = source
    codeflash_output = _strip_javadoc_comments(source) # 3.56μs -> 2.36μs (50.6% faster)

def test_string_with_javadoc_pattern():
    """Test that Javadoc pattern inside a string is still removed (regex operates on raw text)."""
    source = 'String s = "/** not in string */"; code'
    expected = 'String s = "; code'
    codeflash_output = _strip_javadoc_comments(source) # 3.67μs -> 2.30μs (59.8% faster)

def test_very_long_javadoc():
    """Test removal of a very long Javadoc comment."""
    long_comment = "/** " + "x" * 1000 + " */"
    source = long_comment + " code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 11.5μs -> 3.41μs (237% faster)

def test_javadoc_with_asterisk_at_start_of_lines():
    """Test typical Javadoc with asterisks at the start of each line."""
    source = """/**
 * Line 1
 * Line 2
 * Line 3
 */
code"""
    expected = "\ncode"
    codeflash_output = _strip_javadoc_comments(source) # 3.75μs -> 2.30μs (63.0% faster)

def test_javadoc_immediately_followed_by_newline():
    """Test Javadoc comment immediately followed by newline (whitespace after */)."""
    source = "/** Comment */\ncode"
    expected = "\ncode"
    codeflash_output = _strip_javadoc_comments(source) # 3.37μs -> 2.13μs (58.3% faster)

def test_unicode_in_javadoc():
    """Test Javadoc containing Unicode characters."""
    source = "/** Comment with Unicode: \u00e9\u00e0\u00fc */ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.70μs -> 2.45μs (51.1% faster)

def test_empty_javadoc():
    """Test completely empty Javadoc comment."""
    source = "/**/ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.25μs -> 1.64μs (99.0% faster)

def test_whitespace_between_opening_and_closing():
    """Test Javadoc with only whitespace between opening and closing."""
    source = "/**   \n  \t  */ code"
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source) # 3.39μs -> 2.24μs (51.4% faster)

def test_many_javadoc_comments():
    """Test removal of 100 Javadoc comments in a single source string."""
    # Build a source string with 100 Javadoc comments interspersed with code
    parts = []
    for i in range(100):
        parts.append(f"/** Comment {i} */ code_{i};")
    source = "\n".join(parts)
    
    # Verify that all Javadoc comments are removed
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 27.0μs -> 40.8μs (33.9% slower)

def test_large_javadoc_comment():
    """Test removal of a Javadoc comment with a very large amount of content."""
    # Create a large Javadoc comment with 10000 lines of text
    large_content = "\n".join([f" * Line {i}" for i in range(1000)])
    source = f"/**\n{large_content}\n */ code"
    
    expected = " code"
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 102μs -> 13.0μs (685% faster)

def test_alternating_comments_and_code():
    """Test with 500 alternating Javadoc comments and code blocks."""
    parts = []
    for i in range(500):
        parts.append(f"/** Doc {i} */")
        parts.append(f"int x{i} = {i};")
    source = "\n".join(parts)
    
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 113μs -> 197μs (42.6% slower)

def test_nested_structure_with_many_javadocs():
    """Test removal of Javadoc comments from a complex nested structure."""
    source = ""
    for class_idx in range(10):
        source += f"/** Class {class_idx} */\nclass Class{class_idx} {{\n"
        for method_idx in range(20):
            source += f"  /** Method {method_idx} */\n"
            source += f"  public void method{method_idx}() {{}}\n"
        source += "}\n"
    
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 52.5μs -> 103μs (49.2% slower)

def test_performance_with_many_false_positives():
    """Test performance when there are many /* */ comments (not javadoc) to skip over."""
    # Create source with many regular block comments that shouldn't be removed
    parts = []
    for i in range(200):
        parts.append(f"/* Regular comment {i} */")
        parts.append(f"code_{i};")
    source = "\n".join(parts)
    
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 6.14μs -> 6.43μs (4.50% slower)

def test_single_long_line_with_many_javadocs():
    """Test processing of a very long single line with many Javadoc comments."""
    # Create a single very long line with multiple Javadoc comments
    parts = []
    for i in range(300):
        parts.append(f"/** Doc{i} */ x{i};")
    source = " ".join(parts)
    
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 60.2μs -> 112μs (46.7% slower)

def test_large_string_mostly_javadoc():
    """Test processing a large string that is mostly Javadoc comments."""
    # Create a source where most content is Javadoc
    large_javadoc = "/** " + ("x" * 50000) + " */"
    source = large_javadoc + " actual_code;"
    
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 412μs -> 46.0μs (796% faster)

def test_1000_javadoc_removals():
    """Test removal efficiency with 1000 Javadoc comments."""
    # Generate 1000 javadoc comments with varying content lengths
    parts = []
    for i in range(1000):
        content_length = (i % 100) + 1  # Vary content length from 1 to 100 chars
        javadoc = f"/** {'a' * content_length} */"
        parts.append(javadoc)
        parts.append(f"code{i};")
    source = "\n".join(parts)
    
    codeflash_output = _strip_javadoc_comments(source); result = codeflash_output # 604μs -> 446μs (35.5% faster)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To test or edit this optimization locally git merge codeflash/optimize-pr1473-2026-02-13T00.49.56

Click to see suggested changes
Suggested change
import re
This function supports multi-file context extraction, grouping helpers by file
and creating proper CodeStringsMarkdown with file paths for multi-file replacement.
return re.sub(r"/\*\*.*?\*/\s*", "", source, flags=re.DOTALL)
# Manual scan to remove "/** ... */" occurrences and any following whitespace,
# avoiding regex overhead while preserving the original behavior.
s = source
n = len(s)
i = 0
parts: list[str] = []
while True:
idx = s.find("/**", i)
if idx == -1:
parts.append(s[i:])
break
parts.append(s[i:idx])
# Find the closing '*/' that comes after the initial '/**'.
# (Start searching at idx + 3 so we don't reuse the second '*' of '/**'.)
end = s.find("*/", idx + 3)
if end == -1:
# No closing delimiter found; preserve the rest unchanged.
parts.append(s[idx:])
break
j = end + 2
# Skip any whitespace characters following the closing '*/' (equivalent to \s*).
while j < n and s[j].isspace():
j += 1
i = j
return "".join(parts)

Static Badge

def _build_code_strings_for_language(
code_context,
function_to_optimize: FunctionToOptimize,
project_root_path: Path,
include_cross_file_helpers: bool = True,
strip_javadoc: bool = False,
include_same_file_helpers: bool = True,
) -> tuple[list[CodeString], list[FunctionSource], str]:
"""Build CodeString list from a CodeContext with configurable reduction.

Args:
function_to_optimize: The function to extract context for.
code_context: CodeContext from language support.
function_to_optimize: The target function.
project_root_path: Root of the project.
optim_token_limit: Token limit for optimization context.
testgen_token_limit: Token limit for testgen context.
include_cross_file_helpers: Whether to include helpers from other files.
strip_javadoc: Whether to strip Javadoc comments from all code.
include_same_file_helpers: Whether to include same-file helper methods.

Returns:
CodeOptimizationContext with target code and dependencies.
Tuple of (code_strings, helper_function_sources, read_only_context).

"""
from codeflash.languages import get_language_support

# Get language support for this function
language = Language(function_to_optimize.language)
lang_support = get_language_support(language)

# Extract code context using language support
code_context = lang_support.extract_code_context(function_to_optimize, project_root_path, project_root_path)

# Build imports string if available
imports_code = "\n".join(code_context.imports) if code_context.imports else ""

Expand All @@ -251,82 +250,194 @@ def get_code_optimization_context_for_language(
target_relative_path = function_to_optimize.file_path

# Group helpers by file path
helpers_by_file: dict[Path, list[HelperFunction]] = defaultdict(list)
helpers_by_file: dict[Path, list] = defaultdict(list)
helper_function_sources = []

for helper in code_context.helper_functions:
helpers_by_file[helper.file_path].append(helper)

# Convert to FunctionSource for pipeline compatibility
helper_function_sources.append(
FunctionSource(
file_path=helper.file_path,
qualified_name=helper.qualified_name,
fully_qualified_name=helper.qualified_name,
only_function_name=helper.name,
source_code=helper.source_code,
jedi_definition=None,
)
should_include = (
(helper.file_path == function_to_optimize.file_path and include_same_file_helpers)
or (helper.file_path != function_to_optimize.file_path and include_cross_file_helpers)
)
if should_include:
helper_function_sources.append(
FunctionSource(
file_path=helper.file_path,
qualified_name=helper.qualified_name,
fully_qualified_name=helper.qualified_name,
only_function_name=helper.name,
source_code=helper.source_code,
jedi_definition=None,
)
)

# Build read-writable code (target file + same-file helpers + global variables)
read_writable_code_strings = []
# Build read-writable code (target file + same-file helpers)
code_strings = []

# Combine target code with same-file helpers
target_file_code = code_context.target_code
same_file_helpers = helpers_by_file.get(function_to_optimize.file_path, [])
if same_file_helpers:
helper_code = "\n\n".join(h.source_code for h in same_file_helpers)
target_file_code = target_file_code + "\n\n" + helper_code

# Note: code_context.read_only_context contains type definitions and global variables
# These should be passed as read-only context to the AI, not prepended to the target code
# If prepended to target code, the AI treats them as code to optimize and includes them in output
if include_same_file_helpers:
same_file_helpers = helpers_by_file.get(function_to_optimize.file_path, [])
if same_file_helpers:
helper_code = "\n\n".join(h.source_code for h in same_file_helpers)
target_file_code = target_file_code + "\n\n" + helper_code

# Add imports to target file code
if imports_code:
target_file_code = imports_code + "\n\n" + target_file_code

read_writable_code_strings.append(
if strip_javadoc:
target_file_code = _strip_javadoc_comments(target_file_code)

code_strings.append(
CodeString(code=target_file_code, file_path=target_relative_path, language=function_to_optimize.language)
)

# Add helper files (cross-file helpers)
for file_path, file_helpers in helpers_by_file.items():
if file_path == function_to_optimize.file_path:
continue # Already included in target file
if include_cross_file_helpers:
for file_path, file_helpers in helpers_by_file.items():
if file_path == function_to_optimize.file_path:
continue # Already included in target file

try:
helper_relative_path = file_path.resolve().relative_to(project_root_path.resolve())
except ValueError:
helper_relative_path = file_path
try:
helper_relative_path = file_path.resolve().relative_to(project_root_path.resolve())
except ValueError:
helper_relative_path = file_path

combined_helper_code = "\n\n".join(h.source_code for h in file_helpers)
if strip_javadoc:
combined_helper_code = _strip_javadoc_comments(combined_helper_code)

code_strings.append(
CodeString(
code=combined_helper_code,
file_path=helper_relative_path,
language=function_to_optimize.language,
)
)

# Combine all helpers from this file
combined_helper_code = "\n\n".join(h.source_code for h in file_helpers)
read_only_context = code_context.read_only_context
if strip_javadoc and read_only_context:
read_only_context = _strip_javadoc_comments(read_only_context)

read_writable_code_strings.append(
CodeString(
code=combined_helper_code, file_path=helper_relative_path, language=function_to_optimize.language
)
return code_strings, helper_function_sources, read_only_context


def get_code_optimization_context_for_language(
function_to_optimize: FunctionToOptimize,
project_root_path: Path,
optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
) -> CodeOptimizationContext:
"""Extract code optimization context for non-Python languages.

Uses the language support abstraction to extract code context and converts
it to the CodeOptimizationContext format expected by the pipeline.

This function supports multi-file context extraction, grouping helpers by file
and creating proper CodeStringsMarkdown with file paths for multi-file replacement.

Applies progressive fallback when token limits are exceeded:
1. Full context (all helpers, Javadoc intact)
2. Remove cross-file helpers
3. Strip Javadoc comments
4. Remove all helpers (target code only)

Args:
function_to_optimize: The function to extract context for.
project_root_path: Root of the project.
optim_token_limit: Token limit for optimization context.
testgen_token_limit: Token limit for testgen context.

Returns:
CodeOptimizationContext with target code and dependencies.

"""
from codeflash.languages import get_language_support

# Get language support for this function
language = Language(function_to_optimize.language)
lang_support = get_language_support(language)

# Extract code context using language support
code_context = lang_support.extract_code_context(function_to_optimize, project_root_path, project_root_path)

# Progressive fallback strategies, ordered from most to least context
fallback_strategies = [
{"include_cross_file_helpers": True, "strip_javadoc": False, "include_same_file_helpers": True},
{"include_cross_file_helpers": False, "strip_javadoc": False, "include_same_file_helpers": True},
{"include_cross_file_helpers": False, "strip_javadoc": True, "include_same_file_helpers": True},
{"include_cross_file_helpers": False, "strip_javadoc": True, "include_same_file_helpers": False},
]

fallback_descriptions = [
"full context",
"without cross-file helpers",
"without cross-file helpers and Javadoc",
"target code only (no helpers, no Javadoc)",
]

code_strings = None
helper_function_sources = None
read_only_context = None

for i, strategy in enumerate(fallback_strategies):
code_strings, helper_function_sources, read_only_context = _build_code_strings_for_language(
code_context, function_to_optimize, project_root_path, **strategy
)

read_writable_code = CodeStringsMarkdown(
code_strings=code_strings, language=function_to_optimize.language
)
read_writable_tokens = encoded_tokens_len(read_writable_code.markdown)

if read_writable_tokens <= optim_token_limit:
if i > 0:
logger.debug(
"Code context exceeded token limit, using fallback: %s (%d tokens)",
fallback_descriptions[i],
read_writable_tokens,
)
break
else:
raise ValueError("Read-writable code has exceeded token limit even after removing all helpers and Javadoc")

read_writable_code = CodeStringsMarkdown(
code_strings=read_writable_code_strings, language=function_to_optimize.language
code_strings=code_strings, language=function_to_optimize.language
)

# Build testgen context (same as read_writable for non-Python)
# Build testgen context with its own progressive fallback
# Start from the same strategy level that worked for optim
testgen_code_strings = code_strings
testgen_helpers = helper_function_sources

testgen_context = CodeStringsMarkdown(
code_strings=read_writable_code_strings.copy(), language=function_to_optimize.language
code_strings=testgen_code_strings.copy(), language=function_to_optimize.language
)

# Check token limits
read_writable_tokens = encoded_tokens_len(read_writable_code.markdown)
if read_writable_tokens > optim_token_limit:
raise ValueError("Read-writable code has exceeded token limit, cannot proceed")

testgen_tokens = encoded_tokens_len(testgen_context.markdown)

if testgen_tokens > testgen_token_limit:
raise ValueError("Testgen code context has exceeded token limit, cannot proceed")
# Try remaining fallback strategies for testgen
for j in range(i + 1, len(fallback_strategies)):
testgen_code_strings, testgen_helpers, read_only_context = _build_code_strings_for_language(
code_context, function_to_optimize, project_root_path, **fallback_strategies[j]
)
testgen_context = CodeStringsMarkdown(
code_strings=testgen_code_strings.copy(), language=function_to_optimize.language
)
testgen_tokens = encoded_tokens_len(testgen_context.markdown)

if testgen_tokens <= testgen_token_limit:
logger.debug(
"Testgen context exceeded token limit, using fallback: %s (%d tokens)",
fallback_descriptions[j],
testgen_tokens,
)
break
else:
raise ValueError("Testgen code context has exceeded token limit even after removing all helpers and Javadoc")

# Generate code hash from all read-writable code
code_hash = hashlib.sha256(read_writable_code.flat.encode("utf-8")).hexdigest()
Expand All @@ -336,7 +447,7 @@ def get_code_optimization_context_for_language(
read_writable_code=read_writable_code,
# Pass type definitions and globals as read-only context for the AI
# This way the AI sees them as context but doesn't include them in optimized output
read_only_context_code=code_context.read_only_context,
read_only_context_code=read_only_context,
hashing_code_context=read_writable_code.flat,
hashing_code_context_hash=code_hash,
helper_functions=helper_function_sources,
Expand Down
Loading
Loading