Skip to content

Commit 90d81cf

Browse files
Copilot and bashandbone authored
Improve delimiter chunking performance and governor checks for Python 3.13 CI (#335)
* Changes before error encountered
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/4434677c-67a3-47ee-82d6-dd1290f8b94c
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* chore: outline plan for review feedback
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* fix: align delimiter nesting parsing
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* refactor: streamline keyword hit tracking
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* test: use recording chunker for timeout check
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>
1 parent 01a39ed commit 90d81cf

3 files changed

Lines changed: 175 additions & 49 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,9 @@ dist/
4545
*.tgz
4646
coverage
4747
*.lcov
48+
.coverage*
49+
coverage.xml
50+
htmlcov/
4851
.env
4952
*.local.*
5053
*.local

src/codeweaver/engine/chunker/delimiter.py

Lines changed: 109 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -270,7 +270,7 @@ def _get_matches_with_fallback(
270270
List of delimiter matches
271271
"""
272272
governor.check_timeout()
273-
matches = self._find_delimiter_matches(content)
273+
matches = self._find_delimiter_matches(content, governor=governor)
274274

275275
if not matches:
276276
matches = self._fallback_paragraph_chunking(content)
@@ -340,14 +340,17 @@ def _enforce_chunk_limit(self, chunks: list[CodeChunk], file_path: Path | None)
340340
file_path=str(file_path) if file_path else None,
341341
)
342342

343-
def _find_delimiter_matches(self, content: str) -> list[DelimiterMatch]:
343+
def _find_delimiter_matches(
344+
self, content: str, *, governor: Any | None = None
345+
) -> list[DelimiterMatch]:
344346
"""Find all delimiter matches in content using two-phase matching.
345347
346348
Phase 1: Matches explicit start/end pairs (e.g., {...}, (...))
347349
Phase 2: Matches keyword delimiters with empty ends (e.g., function, def, class)
348350
349351
Args:
350352
content: Source code to scan
353+
governor: Optional resource governor for timeout checks between phases
351354
352355
Returns:
353356
List of DelimiterMatch objects ordered by position
@@ -364,6 +367,10 @@ def _find_delimiter_matches(self, content: str) -> list[DelimiterMatch]:
364367
# Phase 1: Handle explicit start/end pairs (existing logic)
365368
matches.extend(self._match_explicit_delimiters(content, explicit_delimiters))
366369

370+
# Check timeout between phases to avoid unbounded work
371+
if governor is not None:
372+
governor.check_timeout()
373+
367374
# Phase 2: Handle keyword delimiters with empty ends
368375
matches.extend(self._match_keyword_delimiters(content, keyword_delimiters))
369376

@@ -475,9 +482,17 @@ def _match_keyword_delimiters(
475482
for delimiter in keyword_delimiters:
476483
delimiter_map.setdefault(delimiter.start, []).append(delimiter)
477484

478-
for match in combined_pattern.finditer(content):
479-
matched_text = match.group(0)
480-
keyword_pos = match.start()
485+
keyword_hits = [
486+
(match.start(), match.group(0)) for match in combined_pattern.finditer(content)
487+
]
488+
# Precompute brace-nesting levels at all keyword positions in a single
489+
# O(n) forward pass. The previous approach called _calculate_nesting_level
490+
# per keyword match, each scanning from position 0, resulting in O(n * m)
491+
# total work that caused timeouts on large files (especially Python 3.13).
492+
keyword_positions = [pos for pos, _ in keyword_hits]
493+
nesting_at = self._precompute_nesting_levels(content, keyword_positions)
494+
495+
for keyword_pos, matched_text in keyword_hits:
481496

482497
# Skip if keyword is inside a string or comment
483498
if self._is_inside_string_or_comment(content, keyword_pos):
@@ -501,8 +516,8 @@ def _match_keyword_delimiters(
501516
)
502517

503518
if struct_end is not None:
504-
# Calculate nesting level by counting parent structures
505-
nesting_level = self._calculate_nesting_level(content, keyword_pos)
519+
# Look up precomputed nesting level (O(1) per keyword)
520+
nesting_level = nesting_at.get(keyword_pos, 0)
506521

507522
# Create a complete match from keyword to closing structure
508523
# This represents the entire construct (e.g., function...})
@@ -517,6 +532,84 @@ def _match_keyword_delimiters(
517532

518533
return matches
519534

535+
def _precompute_nesting_levels(
536+
self, content: str, positions: list[int]
537+
) -> dict[int, int]:
538+
"""Precompute brace-nesting levels at given positions in a single forward pass.
539+
540+
Replaces per-position calls to ``_calculate_nesting_level`` which each
541+
scanned from position 0, yielding O(n * m) total work. This method
542+
achieves the same result in O(n + m) by walking the content once and
543+
recording the running brace depth at each requested position.
544+
545+
Args:
546+
content: Source code
547+
positions: Character offsets whose nesting level is needed
548+
549+
Returns:
550+
Mapping from position to nesting level (0 = top-level)
551+
"""
552+
if not positions:
553+
return {}
554+
555+
result: dict[int, int] = {}
556+
sorted_positions = sorted(positions)
557+
pos_idx = 0
558+
brace_depth = 0
559+
content_len = len(content)
560+
pos = 0
561+
string_state = StringParseState(in_string=False, delimiter=None)
562+
563+
while pos < content_len:
564+
# Record nesting level for every target position we have reached
565+
while pos_idx < len(sorted_positions) and sorted_positions[pos_idx] <= pos:
566+
result[sorted_positions[pos_idx]] = brace_depth
567+
pos_idx += 1
568+
569+
if pos_idx >= len(sorted_positions):
570+
break # All positions recorded
571+
572+
pos, brace_depth, string_state = self._advance_nesting_state(
573+
content, pos, content_len, brace_depth, string_state
574+
)
575+
576+
# Any remaining positions beyond the end of content
577+
for p in sorted_positions[pos_idx:]:
578+
result[p] = brace_depth
579+
580+
return result
581+
582+
def _advance_nesting_state(
583+
self,
584+
content: str,
585+
pos: int,
586+
content_len: int,
587+
brace_depth: int,
588+
string_state: StringParseState,
589+
) -> tuple[int, int, StringParseState]:
590+
"""Advance parsing state for nesting-level precomputation."""
591+
char = content[pos]
592+
593+
# Track string boundaries
594+
if self._is_string_boundary(char):
595+
string_state = self._update_string_state(content, pos, char, string_state)
596+
597+
if string_state.in_string:
598+
return pos + 1, brace_depth, string_state
599+
600+
comment_skip = self._skip_comment(content, pos, content_len)
601+
if comment_skip is not None:
602+
if comment_skip == -1:
603+
return content_len, brace_depth, string_state
604+
return comment_skip, brace_depth, string_state
605+
606+
if char == "{":
607+
brace_depth += 1
608+
elif char == "}":
609+
brace_depth = max(0, brace_depth - 1)
610+
611+
return pos + 1, brace_depth, string_state
612+
520613
def _calculate_nesting_level(self, content: str, pos: int) -> int:
521614
"""Calculate nesting level at a given position by counting braces.
522615
@@ -527,45 +620,10 @@ def _calculate_nesting_level(self, content: str, pos: int) -> int:
527620
Returns:
528621
Nesting level (0 = top level, 1+ = nested)
529622
"""
530-
# Count opening and closing braces before this position
531-
# Ignore braces in strings and comments
532-
brace_depth = 0
533-
i = 0
534-
in_string = False
535-
string_char = None
536-
537-
while i < pos:
538-
c = content[i]
539-
540-
# Handle strings
541-
if c in ('"', "'", "`") and (i == 0 or content[i - 1] != "\\"):
542-
if not in_string:
543-
in_string = True
544-
string_char = c
545-
elif c == string_char:
546-
in_string = False
547-
string_char = None
548-
549-
# Handle comments (simplified - just check for // and /*)
550-
elif not in_string:
551-
if content[i : i + 2] == "//":
552-
# Skip to end of line
553-
next_newline = content.find("\n", i)
554-
i = next_newline if next_newline >= 0 else len(content)
555-
continue
556-
if content[i : i + 2] == "/*":
557-
# Skip to end of comment
558-
end_comment = content.find("*/", i + 2)
559-
i = end_comment + 2 if end_comment >= 0 else len(content)
560-
continue
561-
if c == "{":
562-
brace_depth += 1
563-
elif c == "}":
564-
brace_depth = max(0, brace_depth - 1)
565-
566-
i += 1
623+
if pos <= 0:
624+
return 0
567625

568-
return brace_depth
626+
return self._precompute_nesting_levels(content, [pos]).get(pos, 0)
569627

570628
def _find_next_structural_with_char(
571629
self, content: str, start: int, allowed: frozenset[str]
@@ -660,13 +718,14 @@ def _skip_comment(self, content: str, pos: int, content_len: int) -> int | None:
660718
Returns:
661719
New position after comment, -1 if comment to EOF, None if no comment
662720
"""
663-
if pos + 1 >= content_len:
721+
if pos >= content_len:
664722
return None
665723

666724
two_chars = content[pos : pos + 2]
725+
char = content[pos]
667726

668727
# Line comments
669-
if two_chars in ("//", "#"):
728+
if two_chars == "//" or char == "#":
670729
newline_pos = content.find("\n", pos)
671730
return -1 if newline_pos == -1 else newline_pos + 1
672731
# Block comments
@@ -833,12 +892,13 @@ def _skip_comment_in_matching(self, content: str, pos: int, content_len: int) ->
833892
Returns:
834893
New position, -1 if end reached, None if no comment
835894
"""
836-
if pos + 1 >= content_len:
895+
if pos >= content_len:
837896
return None
838897

839898
two_chars = content[pos : pos + 2]
899+
char = content[pos]
840900

841-
if two_chars in ("//", "#"):
901+
if two_chars == "//" or char == "#":
842902
newline = content.find("\n", pos)
843903
return -1 if newline == -1 else newline
844904
if two_chars == "/*":

tests/unit/engine/chunker/test_delimiter_edge_cases.py

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -336,6 +336,69 @@ def method_two(self):
336336
assert next_start >= current_end, "Chunks should not overlap"
337337

338338

339+
@pytest.mark.unit
340+
class TestGovernorChecks:
341+
"""Test resource governor integration points."""
342+
343+
def test_timeout_check_between_match_phases(
344+
self, delimiter_chunker: DelimiterChunker
345+
) -> None:
346+
"""Ensure match phase timeout checks are invoked between phases."""
347+
348+
class RecordingGovernor:
349+
def __init__(self, calls: list[str]) -> None:
350+
self._calls = calls
351+
352+
def check_timeout(self) -> None:
353+
self._calls.append("timeout")
354+
355+
calls: list[str] = []
356+
governor = RecordingGovernor(calls)
357+
358+
class RecordingChunker(DelimiterChunker):
359+
def __init__(self, *, governor: ChunkGovernor, calls: list[str]) -> None:
360+
super().__init__(governor=governor)
361+
self._calls = calls
362+
363+
def _match_explicit_delimiters(
364+
self, _: str, __: list[Delimiter]
365+
):
366+
self._calls.append("explicit")
367+
return []
368+
369+
def _match_keyword_delimiters(
370+
self, _: str, __: list[Delimiter]
371+
):
372+
self._calls.append("keyword")
373+
return []
374+
375+
recording_chunker = RecordingChunker(governor=delimiter_chunker.governor, calls=calls)
376+
recording_chunker._delimiters = [
377+
Delimiter(
378+
start="{",
379+
end="}",
380+
kind=DelimiterKind.BLOCK,
381+
priority=DelimiterKind.BLOCK.default_priority,
382+
inclusive=True,
383+
take_whole_lines=False,
384+
nestable=True,
385+
),
386+
Delimiter(
387+
start="def",
388+
end="",
389+
kind=DelimiterKind.FUNCTION,
390+
priority=DelimiterKind.FUNCTION.default_priority,
391+
inclusive=True,
392+
take_whole_lines=True,
393+
nestable=True,
394+
),
395+
]
396+
397+
recording_chunker._find_delimiter_matches("def foo():\n return 1\n", governor=governor)
398+
399+
assert calls == ["explicit", "timeout", "keyword"]
400+
401+
339402
@pytest.mark.unit
340403
class TestEdgeCaseContent:
341404
"""Test delimiter chunker with edge case content."""

0 commit comments

Comments (0)