Fix MCP Java test instrumentation paths and use larger Java benchmark inputs

mohammedahmed18 · mohammedahmed18 · commit b5a730049153 · 2026-05-12T01:37:20.000+03:00
diff --git a/code_to_optimize/java/src/main/java/com/example/BubbleSort.java b/code_to_optimize/java/src/main/java/com/example/BubbleSort.java
@@ -36,119 +36,4 @@ public static int[] bubbleSort(int[] arr) {
         return result;
     }
 
-    /**
-     * Sort an array in descending order using bubble sort.
-     *
-     * @param arr Array to sort
-     * @return New sorted array (descending order)
-     */
-    public static int[] bubbleSortDescending(int[] arr) {
-        if (arr == null || arr.length == 0) {
-            return arr;
-        }
-
-        int[] result = new int[arr.length];
-        for (int i = 0; i < arr.length; i++) {
-            result[i] = arr[i];
-        }
-
-        int n = result.length;
-
-        for (int i = 0; i < n - 1; i++) {
-            for (int j = 0; j < n - i - 1; j++) {
-                if (result[j] < result[j + 1]) {
-                    int temp = result[j];
-                    result[j] = result[j + 1];
-                    result[j + 1] = temp;
-                }
-            }
-        }
-
-        return result;
-    }
-
-    /**
-     * Sort an array using insertion sort algorithm.
-     *
-     * @param arr Array to sort
-     * @return New sorted array
-     */
-    public static int[] insertionSort(int[] arr) {
-        if (arr == null || arr.length == 0) {
-            return arr;
-        }
-
-        int[] result = new int[arr.length];
-        for (int i = 0; i < arr.length; i++) {
-            result[i] = arr[i];
-        }
-
-        int n = result.length;
-
-        for (int i = 1; i < n; i++) {
-            int key = result[i];
-            int j = i - 1;
-
-            while (j >= 0 && result[j] > key) {
-                result[j + 1] = result[j];
-                j = j - 1;
-            }
-            result[j + 1] = key;
-        }
-
-        return result;
-    }
-
-    /**
-     * Sort an array using selection sort algorithm.
-     *
-     * @param arr Array to sort
-     * @return New sorted array
-     */
-    public static int[] selectionSort(int[] arr) {
-        if (arr == null || arr.length == 0) {
-            return arr;
-        }
-
-        int[] result = new int[arr.length];
-        for (int i = 0; i < arr.length; i++) {
-            result[i] = arr[i];
-        }
-
-        int n = result.length;
-
-        for (int i = 0; i < n - 1; i++) {
-            int minIdx = i;
-            for (int j = i + 1; j < n; j++) {
-                if (result[j] < result[minIdx]) {
-                    minIdx = j;
-                }
-            }
-
-            int temp = result[minIdx];
-            result[minIdx] = result[i];
-            result[i] = temp;
-        }
-
-        return result;
-    }
-
-    /**
-     * Check if an array is sorted in ascending order.
-     *
-     * @param arr Array to check
-     * @return true if sorted in ascending order
-     */
-    public static boolean isSorted(int[] arr) {
-        if (arr == null || arr.length <= 1) {
-            return true;
-        }
-
-        for (int i = 0; i < arr.length - 1; i++) {
-            if (arr[i] > arr[i + 1]) {
-                return false;
-            }
-        }
-        return true;
-    }
 }
diff --git a/code_to_optimize/java/src/test/java/com/example/BubbleSortTest.java b/code_to_optimize/java/src/test/java/com/example/BubbleSortTest.java
@@ -1,16 +1,20 @@
 package com.example;
 
 import org.junit.jupiter.api.Test;
+
+import java.util.Arrays;
+
 import static org.junit.jupiter.api.Assertions.*;
 
 /**
  * Tests for BubbleSort sorting algorithms.
  */
 class BubbleSortTest {
+    private static final int LARGE_SORT_SIZE = 5000; // element count of the generated inputs used by the large-array sort tests
 
     @Test
     void testBubbleSort() {
-        assertArrayEquals(new int[]{1, 2, 3, 4, 5}, BubbleSort.bubbleSort(new int[]{5, 3, 1, 4, 2}));
+        assertArrayEquals(ascendingRange(LARGE_SORT_SIZE), BubbleSort.bubbleSort(descendingRange(LARGE_SORT_SIZE)));
         assertArrayEquals(new int[]{1, 2, 3}, BubbleSort.bubbleSort(new int[]{3, 2, 1}));
         assertArrayEquals(new int[]{1}, BubbleSort.bubbleSort(new int[]{1}));
         assertArrayEquals(new int[]{}, BubbleSort.bubbleSort(new int[]{}));
@@ -19,56 +23,57 @@ void testBubbleSort() {
 
     @Test
     void testBubbleSortAlreadySorted() {
-        assertArrayEquals(new int[]{1, 2, 3, 4, 5}, BubbleSort.bubbleSort(new int[]{1, 2, 3, 4, 5}));
+        int[] sorted = ascendingRange(LARGE_SORT_SIZE); // input only; expected is built separately so aliasing cannot hide in-place corruption
+        assertArrayEquals(ascendingRange(LARGE_SORT_SIZE), BubbleSort.bubbleSort(sorted));
     }
 
     @Test
     void testBubbleSortWithDuplicates() {
-        assertArrayEquals(new int[]{1, 2, 2, 3, 3, 4}, BubbleSort.bubbleSort(new int[]{3, 2, 4, 1, 3, 2}));
+        int[] unsorted = duplicateHeavyRange(LARGE_SORT_SIZE);
+        assertArrayEquals(sortedCopy(unsorted), BubbleSort.bubbleSort(unsorted)); // expected order is taken from a sorted copy of the input
     }
 
     @Test
     void testBubbleSortWithNegatives() {
-        assertArrayEquals(new int[]{-5, -2, 0, 3, 7}, BubbleSort.bubbleSort(new int[]{3, -2, 7, 0, -5}));
+        int[] unsorted = mixedSignedRange(LARGE_SORT_SIZE);
+        assertArrayEquals(sortedCopy(unsorted), BubbleSort.bubbleSort(unsorted)); // expected order is taken from a sorted copy of the input
     }
 
-    @Test
-    void testBubbleSortDescending() {
-        assertArrayEquals(new int[]{5, 4, 3, 2, 1}, BubbleSort.bubbleSortDescending(new int[]{1, 3, 5, 2, 4}));
-        assertArrayEquals(new int[]{3, 2, 1}, BubbleSort.bubbleSortDescending(new int[]{1, 2, 3}));
-        assertArrayEquals(new int[]{}, BubbleSort.bubbleSortDescending(new int[]{}));
+    /** Builds the ascending sequence {@code 0, 1, ..., size - 1}. */
+    private static int[] ascendingRange(int size) {
+        int[] values = new int[size];
+        Arrays.setAll(values, idx -> idx);
+
+        return values;
     }
 
-    @Test
-    void testInsertionSort() {
-        assertArrayEquals(new int[]{1, 2, 3, 4, 5}, BubbleSort.insertionSort(new int[]{5, 3, 1, 4, 2}));
-        assertArrayEquals(new int[]{1, 2, 3}, BubbleSort.insertionSort(new int[]{3, 2, 1}));
-        assertArrayEquals(new int[]{1}, BubbleSort.insertionSort(new int[]{1}));
-        assertArrayEquals(new int[]{}, BubbleSort.insertionSort(new int[]{}));
+    /** Builds the descending sequence {@code size - 1, ..., 1, 0}. */
+    private static int[] descendingRange(int size) {
+        int[] values = new int[size];
+        Arrays.setAll(values, idx -> size - idx - 1);
+
+        return values;
     }
 
-    @Test
-    void testSelectionSort() {
-        assertArrayEquals(new int[]{1, 2, 3, 4, 5}, BubbleSort.selectionSort(new int[]{5, 3, 1, 4, 2}));
-        assertArrayEquals(new int[]{1, 2, 3}, BubbleSort.selectionSort(new int[]{3, 2, 1}));
-        assertArrayEquals(new int[]{1}, BubbleSort.selectionSort(new int[]{1}));
+    /** Builds a descending run reduced modulo 32, so at most 32 distinct values (0..31) appear. */
+    private static int[] duplicateHeavyRange(int size) {
+        int[] values = new int[size];
+        Arrays.setAll(values, idx -> (size - idx - 1) % 32);
+
+        return values;
     }
 
-    @Test
-    void testIsSorted() {
-        assertTrue(BubbleSort.isSorted(new int[]{1, 2, 3, 4, 5}));
-        assertTrue(BubbleSort.isSorted(new int[]{1}));
-        assertTrue(BubbleSort.isSorted(new int[]{}));
-        assertTrue(BubbleSort.isSorted(null));
-        assertFalse(BubbleSort.isSorted(new int[]{5, 3, 1}));
-        assertFalse(BubbleSort.isSorted(new int[]{1, 3, 2}));
+    /** Builds values of magnitude {@code size - index}: positive at even indexes, negative at odd ones. */
+    private static int[] mixedSignedRange(int size) {
+        int[] values = new int[size];
+        Arrays.setAll(values, idx -> (idx % 2 == 0) ? (size - idx) : -(size - idx));
+
+        return values;
     }
 
-    @Test
-    void testBubbleSortDoesNotMutateInput() {
-        int[] original = {5, 3, 1, 4, 2};
-        int[] copy = {5, 3, 1, 4, 2};
-        BubbleSort.bubbleSort(original);
-        assertArrayEquals(copy, original);
+    private static int[] sortedCopy(int[] arr) {
+        int[] reference = Arrays.copyOf(arr, arr.length); // sort a copy so the caller's array keeps its generated order
+        Arrays.sort(reference);
+        return reference;
     }
 }
diff --git a/mcp_server/runner.py b/mcp_server/runner.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import inspect
+from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -19,6 +20,12 @@ class TestingMode(str, Enum):
     BENCHMARKING = "benchmarking"
 
 
+@dataclass(frozen=True)
+class _ResolvedTestFile:  # pairs a caller-supplied test file with the file that is actually handed to the runner
+    original_path: Path  # resolved path of the test file as the caller supplied it
+    effective_path: Path  # path to execute: the instrumented file when one was written, otherwise the same as original_path
+
+
 def build_test_env(project_root: Path) -> dict[str, str]:
     env = make_env_with_project_root(project_root)
     env["CODEFLASH_TEST_ITERATION"] = "0"
@@ -29,18 +36,19 @@ def build_test_env(project_root: Path) -> dict[str, str]:
     return env
 
 
-def _build_test_files(test_file_paths: list[str], mode: TestingMode) -> TestFiles:
+def _build_test_files(test_files: list[_ResolvedTestFile], mode: TestingMode) -> TestFiles:
     from codeflash.models.models import TestFile, TestFiles
     from codeflash.models.test_type import TestType
 
     test_files_objs = []
-    for path_str in test_file_paths:
-        p = Path(path_str).resolve()
+    for test_file in test_files:
+        effective_path = test_file.effective_path.resolve()
+        original_path = test_file.original_path.resolve()
         test_files_objs.append(
             TestFile(
-                instrumented_behavior_file_path=p,
-                benchmarking_file_path=p if mode == TestingMode.BENCHMARKING else None,
-                original_file_path=p,
+                instrumented_behavior_file_path=effective_path,
+                benchmarking_file_path=effective_path if mode == TestingMode.BENCHMARKING else None,
+                original_file_path=original_path,
                 test_type=TestType.EXISTING_UNIT_TEST,
             )
         )
@@ -138,8 +146,31 @@ def _invoke_with_optional_test_framework(run_callable: object, *, test_framework
     return run_callable(**kwargs)
 
 
+def _resolve_test_files(test_file_paths: list[str]) -> list[_ResolvedTestFile]:  # no instrumentation: original and effective paths coincide
+    return [_ResolvedTestFile(original_path=p, effective_path=p) for p in (Path(raw).resolve() for raw in test_file_paths)]
+
+
+def _instrumented_test_path(test_path: Path, language: str, mode: TestingMode) -> Path:
+    """Return the sibling path used for a Java test's instrumented copy; other languages get ``test_path`` back."""
+    if language == "java":
+        marker = "__perfonlyinstrumented" if mode != TestingMode.BEHAVIORAL else "__perfinstrumented"
+        stem = test_path.stem
+        if not stem.endswith(marker):  # a name that already carries the marker is returned unchanged
+            return test_path.with_name(f"{stem}{marker}{test_path.suffix}")
+    return test_path
+
+
+def _reset_java_compilation_cache(language: str) -> None:
+    """Clear the Java test runner's ``CompilationCache``; does nothing for any other language."""
+    if language == "java":
+        # Imported inside the branch so the Java test-runner module is only loaded for Java runs.
+        from codeflash.languages.java.test_runner import CompilationCache
+
+        CompilationCache.clear()
+
+
 class _InstrumentedFiles:
-    """Context manager that instruments test files in-place and restores originals on exit."""
+    """Context manager that instruments MCP test files and restores originals on exit."""
 
     def __init__(
         self,
@@ -157,8 +188,17 @@ def __init__(
         self.language = language
         self.mode = mode
         self._backups: dict[Path, str] = {}
+        self._created_files: set[Path] = set()
 
-    def __enter__(self) -> list[str]:
+    def _write_instrumented_source(self, target_path: Path, code: str) -> None:
+        """Write ``code`` to ``target_path`` after recording how ``__exit__`` undoes it (restore content or delete)."""
+        if target_path.exists():
+            self._backups.setdefault(target_path, target_path.read_text(encoding="utf-8"))  # first snapshot wins on repeated writes
+        else:
+            self._created_files.add(target_path)
+        target_path.write_text(code, encoding="utf-8")
+
+    def __enter__(self) -> list[_ResolvedTestFile]:
         from codeflash.languages.current import set_current_language
         from codeflash.languages.registry import get_language_support
 
@@ -174,13 +214,14 @@ def __enter__(self) -> list[str]:
 
         instrument_mode = "behavior" if self.mode == TestingMode.BEHAVIORAL else "performance"
 
-        instrumented_paths: list[str] = []
+        instrumented_paths: list[_ResolvedTestFile] = []
         for test_file in self.test_file_paths:
             test_path = Path(test_file).resolve()
+            instrumented_path = _instrumented_test_path(test_path, self.language, self.mode)
 
             call_positions = _find_call_positions(test_path, func_to_optimize.function_name, self.language)
             if self.language == "python" and not call_positions:
-                instrumented_paths.append(test_file)
+                instrumented_paths.append(_ResolvedTestFile(original_path=test_path, effective_path=test_path))
                 continue
 
             success, code = lang_support.instrument_existing_test(
@@ -192,18 +233,23 @@ def __enter__(self) -> list[str]:
             )
 
             if success and code:
-                self._backups[test_path] = test_path.read_text(encoding="utf-8")
-                test_path.write_text(code, encoding="utf-8")
-                instrumented_paths.append(str(test_path))
+                self._write_instrumented_source(instrumented_path, code)
+                instrumented_paths.append(_ResolvedTestFile(original_path=test_path, effective_path=instrumented_path))
             else:
-                instrumented_paths.append(test_file)
+                instrumented_paths.append(_ResolvedTestFile(original_path=test_path, effective_path=test_path))
 
         return instrumented_paths
 
     def __exit__(self, *_exc: object) -> None:
+        # Write the saved pre-instrumentation text back into every file that was overwritten. NOTE(review): a failing write skips the cleanup below.
         for path, original_content in self._backups.items():
             path.write_text(original_content, encoding="utf-8")
+
+        # Delete files that did not exist before instrumentation; missing_ok tolerates ones that are already gone.
+        for path in self._created_files:
+            path.unlink(missing_ok=True)
         self._backups.clear()
+        self._created_files.clear()
 
 
 def run_and_parse(
@@ -225,11 +271,12 @@ def run_and_parse(
 
     set_current_language(language)
     lang_support = get_language_support(language)
+    _reset_java_compilation_cache(language)
 
     test_env = build_test_env(project_root)
     test_config = _build_test_config(project_root)
 
-    def _execute(effective_files: list[str]) -> tuple[TestResults, subprocess.CompletedProcess[str]]:
+    def _execute(effective_files: list[_ResolvedTestFile]) -> tuple[TestResults, subprocess.CompletedProcess[str]]:
         test_files_obj = _build_test_files(effective_files, mode)
 
         if mode == TestingMode.BEHAVIORAL:
@@ -281,4 +328,4 @@ def _execute(effective_files: list[str]) -> tuple[TestResults, subprocess.Comple
         ) as effective_files:
             return _execute(effective_files)
     else:
-        return _execute(test_files)
+        return _execute(_resolve_test_files(test_files))
diff --git a/mcp_server/test_mcp_workflow.py b/mcp_server/test_mcp_workflow.py