codeflash/codeflash/code_utils/code_utils.py at 782dba1f234c5e524db8876127946c3110eb6cbe · codeflash-ai/codeflash · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
from __future__ import annotations

import ast
import difflib
import os
import re
import shutil
import site
import sys
from contextlib import contextmanager
from functools import lru_cache
from pathlib import Path
from tempfile import TemporaryDirectory

import tomlkit

from codeflash.cli_cmds.console import logger, paneled_text
from codeflash.code_utils.config_parser import find_pyproject_toml

ImportErrorPattern = re.compile(r"ModuleNotFoundError.*$", re.MULTILINE)


def unified_diff_strings(code1: str, code2: str, fromfile: str = "original", tofile: str = "modified") -> str:
    """Return the unified diff between two code strings as a single string.

    :param code1: First code string (original).
    :param code2: Second code string (modified).
    :param fromfile: Label for the first code string.
    :param tofile: Label for the second code string.
    :return: Unified diff as a string.
    """
    code1_lines = code1.splitlines(keepends=True)
    code2_lines = code2.splitlines(keepends=True)

    diff = difflib.unified_diff(code1_lines, code2_lines, fromfile=fromfile, tofile=tofile, lineterm="")

    return "".join(diff)


def diff_length(a: str, b: str) -> int:
    """Compute the length (in characters) of the unified diff between two strings.

    Args:
        a (str): Original string.
        b (str): Modified string.

    Returns:
        int: Total number of characters in the diff.

    """
    # Split input strings into lines for line-by-line diff
    a_lines = a.splitlines(keepends=True)
    b_lines = b.splitlines(keepends=True)

    # Compute unified diff
    diff_lines = list(difflib.unified_diff(a_lines, b_lines, lineterm=""))

    # Join all lines with newline to calculate total diff length
    diff_text = "\n".join(diff_lines)

    return len(diff_text)


def create_rank_dictionary_compact(int_array: list[int]) -> dict[int, int]:
    """Create a dictionary from a list of ints, mapping the original index to its rank.

    This version uses a more compact, "Pythonic" implementation.

    Args:
        int_array: A list of integers.

    Returns:
        A dictionary where keys are original indices and values are the
        rank of the element in ascending order.

    """
    # Sort the indices of the array based on their corresponding values
    sorted_indices = sorted(range(len(int_array)), key=lambda i: int_array[i])

    # Create a dictionary mapping the original index to its rank (its position in the sorted list)
    return {original_index: rank for rank, original_index in enumerate(sorted_indices)}


@contextmanager
def custom_addopts() -> None:
    pyproject_file = find_pyproject_toml()
    original_content = None
    non_blacklist_plugin_args = ""

    try:
        # Read original file
        if pyproject_file.exists():
            with Path.open(pyproject_file, encoding="utf-8") as f:
                original_content = f.read()
                data = tomlkit.parse(original_content)
            # Backup original addopts
            original_addopts = data.get("tool", {}).get("pytest", {}).get("ini_options", {}).get("addopts", "")
            # nothing to do if no addopts present
            if original_addopts != "" and isinstance(original_addopts, list):
                original_addopts = [x.strip() for x in original_addopts]
                non_blacklist_plugin_args = re.sub(r"-n(?: +|=)\S+", "", " ".join(original_addopts)).split(" ")
                non_blacklist_plugin_args = [x for x in non_blacklist_plugin_args if x != ""]
                if non_blacklist_plugin_args != original_addopts:
                    data["tool"]["pytest"]["ini_options"]["addopts"] = non_blacklist_plugin_args
                    # Write modified file
                    with Path.open(pyproject_file, "w", encoding="utf-8") as f:
                        f.write(tomlkit.dumps(data))

        yield

    finally:
        # Restore original file
        if (
            original_content
            and pyproject_file.exists()
            and tuple(original_addopts) not in {(), tuple(non_blacklist_plugin_args)}
        ):
            with Path.open(pyproject_file, "w", encoding="utf-8") as f:
                f.write(original_content)


@contextmanager
def add_addopts_to_pyproject() -> None:
    pyproject_file = find_pyproject_toml()
    original_content = None
    try:
        # Read original file
        if pyproject_file.exists():
            with Path.open(pyproject_file, encoding="utf-8") as f:
                original_content = f.read()
                data = tomlkit.parse(original_content)
            data["tool"]["pytest"] = {}
            data["tool"]["pytest"]["ini_options"] = {}
            data["tool"]["pytest"]["ini_options"]["addopts"] = [
                "-n=auto",
                "-n",
                "1",
                "-n 1",
                "-n      1",
                "-n      auto",
            ]
            with Path.open(pyproject_file, "w", encoding="utf-8") as f:
                f.write(tomlkit.dumps(data))

        yield

    finally:
        # Restore original file
        with Path.open(pyproject_file, "w", encoding="utf-8") as f:
            f.write(original_content)


def encoded_tokens_len(s: str) -> int:
    """Return the approximate length of the encoded tokens.

    It's an approximation of BPE encoding (https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf).
    """
    return int(len(s) * 0.25)


def get_qualified_name(module_name: str, full_qualified_name: str) -> str:
    if not full_qualified_name:
        msg = "full_qualified_name cannot be empty"
        raise ValueError(msg)
    if not full_qualified_name.startswith(module_name):
        msg = f"{full_qualified_name} does not start with {module_name}"
        raise ValueError(msg)
    if module_name == full_qualified_name:
        msg = f"{full_qualified_name} is the same as {module_name}"
        raise ValueError(msg)
    return full_qualified_name[len(module_name) + 1 :]


def module_name_from_file_path(file_path: Path, project_root_path: Path, *, traverse_up: bool = False) -> str:
    try:
        relative_path = file_path.resolve().relative_to(project_root_path.resolve())
        return relative_path.with_suffix("").as_posix().replace("/", ".")
    except ValueError:
        if traverse_up:
            parent = file_path.parent
            while parent not in (project_root_path, parent.parent):
                try:
                    relative_path = file_path.relative_to(parent)
                    return relative_path.with_suffix("").as_posix().replace("/", ".")
                except ValueError:
                    parent = parent.parent
        msg = f"File {file_path} is not within the project root {project_root_path}."
        raise ValueError(msg)  # noqa: B904


def file_path_from_module_name(module_name: str, project_root_path: Path) -> Path:
    """Get file path from module path."""
    return project_root_path / (module_name.replace(".", os.sep) + ".py")


@lru_cache(maxsize=100)
def file_name_from_test_module_name(test_module_name: str, base_dir: Path) -> Path | None:
    partial_test_class = test_module_name
    while partial_test_class:
        test_path = file_path_from_module_name(partial_test_class, base_dir)
        if (base_dir / test_path).exists():
            return base_dir / test_path
        partial_test_class = ".".join(partial_test_class.split(".")[:-1])
    return None


def get_imports_from_file(
    file_path: Path | None = None, file_string: str | None = None, file_ast: ast.AST | None = None
) -> list[ast.Import | ast.ImportFrom]:
    assert sum([file_path is not None, file_string is not None, file_ast is not None]) == 1, (
        "Must provide exactly one of file_path, file_string, or file_ast"
    )
    if file_path:
        with file_path.open(encoding="utf8") as file:
            file_string = file.read()
    if file_ast is None:
        if file_string is None:
            logger.error("file_string cannot be None when file_ast is not provided")
            return []
        try:
            file_ast = ast.parse(file_string)
        except SyntaxError as e:
            logger.exception(f"Syntax error in code: {e}")
            return []
    return [node for node in ast.walk(file_ast) if isinstance(node, (ast.Import, ast.ImportFrom))]


def get_all_function_names(code: str) -> tuple[bool, list[str]]:
    try:
        module = ast.parse(code)
    except SyntaxError as e:
        logger.exception(f"Syntax error in code: {e}")
        return False, []

    function_names = [
        node.name for node in ast.walk(module) if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    return True, function_names


def get_run_tmp_file(file_path: Path) -> Path:
    if not hasattr(get_run_tmp_file, "tmpdir"):
        get_run_tmp_file.tmpdir = TemporaryDirectory(prefix="codeflash_")
    return Path(get_run_tmp_file.tmpdir.name) / file_path


def path_belongs_to_site_packages(file_path: Path) -> bool:
    site_packages = [Path(p) for p in site.getsitepackages()]
    return any(file_path.resolve().is_relative_to(site_package_path) for site_package_path in site_packages)


def is_class_defined_in_file(class_name: str, file_path: Path) -> bool:
    if not file_path.exists():
        return False
    with file_path.open(encoding="utf8") as file:
        source = file.read()
    tree = ast.parse(source)
    return any(isinstance(node, ast.ClassDef) and node.name == class_name for node in ast.walk(tree))


def validate_python_code(code: str) -> str:
    """Validate a string of Python code by attempting to compile it."""
    try:
        compile(code, "<string>", "exec")
    except SyntaxError as e:
        msg = f"Invalid Python code: {e.msg} (line {e.lineno}, column {e.offset})"
        raise ValueError(msg) from e
    return code


def cleanup_paths(paths: list[Path]) -> None:
    for path in paths:
        if path and path.exists():
            if path.is_dir():
                shutil.rmtree(path, ignore_errors=True)
            else:
                path.unlink(missing_ok=True)


def restore_conftest(path_to_content_map: dict[Path, str]) -> None:
    for path, file_content in path_to_content_map.items():
        path.write_text(file_content, encoding="utf8")


def exit_with_message(message: str, *, error_on_exit: bool = False) -> None:
    paneled_text(message, panel_args={"style": "red"})

    sys.exit(1 if error_on_exit else 0)