Skip to content

Commit 90d81cf

Browse files
Copilot and bashandbone authored
Improve delimiter chunking performance and governor checks for Python 3.13 CI (#335)
* Changes before error encountered
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/4434677c-67a3-47ee-82d6-dd1290f8b94c
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* chore: outline plan for review feedback
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* fix: align delimiter nesting parsing
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* refactor: streamline keyword hit tracking
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

* test: use recording chunker for timeout check
  Agent-Logs-Url: https://github.com/knitli/codeweaver/sessions/945ba2ae-3268-455f-bcba-0fbd07f80b26
  Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>
1 parent 01a39ed commit 90d81cf

3 files changed

Lines changed: 175 additions & 49 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,9 @@ dist/
4545
*.tgz
4646
coverage
4747
*.lcov
48+
.coverage*
49+
coverage.xml
50+
htmlcov/
4851
.env
4952
*.local.*
5053
*.local

src/codeweaver/engine/chunker/delimiter.py

Lines changed: 109 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -270,7 +270,7 @@ def _get_matches_with_fallback(
270270
List of delimiter matches
271271
"""
272272
governor.check_timeout()
273-
matches = self._find_delimiter_matches(content)
273+
matches = self._find_delimiter_matches(content, governor=governor)
274274

275275
if not matches:
276276
matches = self._fallback_paragraph_chunking(content)
@@ -340,14 +340,17 @@ def _enforce_chunk_limit(self, chunks: list[CodeChunk], file_path: Path | None)
340340
file_path=str(file_path) if file_path else None,
341341
)
342342

343-
def _find_delimiter_matches(self, content: str) -> list[DelimiterMatch]:
343+
def _find_delimiter_matches(
344+
self, content: str, *, governor: Any | None = None
345+
) -> list[DelimiterMatch]:
344346
"""Find all delimiter matches in content using two-phase matching.
345347
346348
Phase 1: Matches explicit start/end pairs (e.g., {...}, (...))
347349
Phase 2: Matches keyword delimiters with empty ends (e.g., function, def, class)
348350
349351
Args:
350352
content: Source code to scan
353+
governor: Optional resource governor for timeout checks between phases
351354
352355
Returns:
353356
List of DelimiterMatch objects ordered by position
@@ -364,6 +367,10 @@ def _find_delimiter_matches(self, content: str) -> list[DelimiterMatch]:
364367
# Phase 1: Handle explicit start/end pairs (existing logic)
365368
matches.extend(self._match_explicit_delimiters(content, explicit_delimiters))
366369

370+
# Check timeout between phases to avoid unbounded work
371+
if governor is not None:
372+
governor.check_timeout()
373+
367374
# Phase 2: Handle keyword delimiters with empty ends
368375
matches.extend(self._match_keyword_delimiters(content, keyword_delimiters))
369376

@@ -475,9 +482,17 @@ def _match_keyword_delimiters(
475482
for delimiter in keyword_delimiters:
476483
delimiter_map.setdefault(delimiter.start, []).append(delimiter)
477484

478-
for match in combined_pattern.finditer(content):
479-
matched_text = match.group(0)
480-
keyword_pos = match.start()
485+
keyword_hits = [
486+
(match.start(), match.group(0)) for match in combined_pattern.finditer(content)
487+
]
488+
# Precompute brace-nesting levels at all keyword positions in a single
489+
# O(n) forward pass. The previous approach called _calculate_nesting_level
490+
# per keyword match, each scanning from position 0, resulting in O(n * m)
491+
# total work that caused timeouts on large files (especially Python 3.13).
492+
keyword_positions = [pos for pos, _ in keyword_hits]
493+
nesting_at = self._precompute_nesting_levels(content, keyword_positions)
494+
495+
for keyword_pos, matched_text in keyword_hits:
481496

482497
# Skip if keyword is inside a string or comment
483498
if self._is_inside_string_or_comment(content, keyword_pos):
@@ -501,8 +516,8 @@ def _match_keyword_delimiters(
501516
)
502517

503518
if struct_end is not None:
504-
# Calculate nesting level by counting parent structures
505-
nesting_level = self._calculate_nesting_level(content, keyword_pos)
519+
# Look up precomputed nesting level (O(1) per keyword)
520+
nesting_level = nesting_at.get(keyword_pos, 0)
506521

507522
# Create a complete match from keyword to closing structure
508523
# This represents the entire construct (e.g., function...})
@@ -517,6 +532,84 @@ def _match_keyword_delimiters(
517532

518533
return matches
519534

535+
def _precompute_nesting_levels(
536+
self, content: str, positions: list[int]
537+
) -> dict[int, int]:
538+
"""Precompute brace-nesting levels at given positions in a single forward pass.
539+
540+
Replaces per-position calls to ``_calculate_nesting_level`` which each
541+
scanned from position 0, yielding O(n * m) total work. This method
542+
achieves the same result in O(n + m) by walking the content once and
543+
recording the running brace depth at each requested position.
544+
545+
Args:
546+
content: Source code
547+
positions: Character offsets whose nesting level is needed
548+
549+
Returns:
550+
Mapping from position to nesting level (0 = top-level)
551+
"""
552+
if not positions:
553+
return {}
554+
555+
result: dict[int, int] = {}
556+
sorted_positions = sorted(positions)
557+
pos_idx = 0
558+
brace_depth = 0
559+
content_len = len(content)
560+
pos = 0
561+
string_state = StringParseState(in_string=False, delimiter=None)
562+
563+
while pos < content_len:
564+
# Record nesting level for every target position we have reached
565+
while pos_idx < len(sorted_positions) and sorted_positions[pos_idx] <= pos:
566+
result[sorted_positions[pos_idx]] = brace_depth
567+
pos_idx += 1
568+
569+
if pos_idx >= len(sorted_positions):
570+
break # All positions recorded
571+
572+
pos, brace_depth, string_state = self._advance_nesting_state(
573+
content, pos, content_len, brace_depth, string_state
574+
)
575+
576+
# Any remaining positions beyond the end of content
577+
for p in sorted_positions[pos_idx:]:
578+
result[p] = brace_depth
579+
580+
return result
581+
582+
def _advance_nesting_state(
583+
self,
584+
content: str,
585+
pos: int,
586+
content_len: int,
587+
brace_depth: int,
588+
string_state: StringParseState,
589+
) -> tuple[int, int, StringParseState]:
590+
"""Advance parsing state for nesting-level precomputation."""
591+
char = content[pos]
592+
593+
# Track string boundaries
594+
if self._is_string_boundary(char):
595+
string_state = self._update_string_state(content, pos, char, string_state)
596+
597+
if string_state.in_string:
598+
return pos + 1, brace_depth, string_state
599+
600+
comment_skip = self._skip_comment(content, pos, content_len)
601+
if comment_skip is not None:
602+
if comment_skip == -1:
603+
return content_len, brace_depth, string_state
604+
return comment_skip, brace_depth, string_state
605+
606+
if char == "{":
607+
brace_depth += 1
608+
elif char == "}":
609+
brace_depth = max(0, brace_depth - 1)
610+
611+
return pos + 1, brace_depth, string_state
612+
520613
def _calculate_nesting_level(self, content: str, pos: int) -> int:
521614
"""Calculate nesting level at a given position by counting braces.
522615
@@ -527,45 +620,10 @@ def _calculate_nesting_level(self, content: str, pos: int) -> int:
527620
Returns:
528621
Nesting level (0 = top level, 1+ = nested)
529622
"""
530-
# Count opening and closing braces before this position
531-
# Ignore braces in strings and comments
532-
brace_depth = 0
533-
i = 0
534-
in_string = False
535-
string_char = None
536-
537-
while i < pos:
538-
c = content[i]
539-
540-
# Handle strings
541-
if c in ('"', "'", "`") and (i == 0 or content[i - 1] != "\\"):
542-
if not in_string:
543-
in_string = True
544-
string_char = c
545-
elif c == string_char:
546-
in_string = False
547-
string_char = None
548-
549-
# Handle comments (simplified - just check for // and /*)
550-
elif not in_string:
551-
if content[i : i + 2] == "//":
552-
# Skip to end of line
553-
next_newline = content.find("\n", i)
554-
i = next_newline if next_newline >= 0 else len(content)
555-
continue
556-
if content[i : i + 2] == "/*":
557-
# Skip to end of comment
558-
end_comment = content.find("*/", i + 2)
559-
i = end_comment + 2 if end_comment >= 0 else len(content)
560-
continue
561-
if c == "{":
562-
brace_depth += 1
563-
elif c == "}":
564-
brace_depth = max(0, brace_depth - 1)
565-
566-
i += 1
623+
if pos <= 0:
624+
return 0
567625

568-
return brace_depth
626+
return self._precompute_nesting_levels(content, [pos]).get(pos, 0)
569627

570628
def _find_next_structural_with_char(
571629
self, content: str, start: int, allowed: frozenset[str]
@@ -660,13 +718,14 @@ def _skip_comment(self, content: str, pos: int, content_len: int) -> int | None:
660718
Returns:
661719
New position after comment, -1 if comment to EOF, None if no comment
662720
"""
663-
if pos + 1 >= content_len:
721+
if pos >= content_len:
664722
return None
665723

666724
two_chars = content[pos : pos + 2]
725+
char = content[pos]
667726

668727
# Line comments
669-
if two_chars in ("//", "#"):
728+
if two_chars == "//" or char == "#":
670729
newline_pos = content.find("\n", pos)
671730
return -1 if newline_pos == -1 else newline_pos + 1
672731
# Block comments
@@ -833,12 +892,13 @@ def _skip_comment_in_matching(self, content: str, pos: int, content_len: int) ->
833892
Returns:
834893
New position, -1 if end reached, None if no comment
835894
"""
836-
if pos + 1 >= content_len:
895+
if pos >= content_len:
837896
return None
838897

839898
two_chars = content[pos : pos + 2]
899+
char = content[pos]
840900

841-
if two_chars in ("//", "#"):
901+
if two_chars == "//" or char == "#":
842902
newline = content.find("\n", pos)
843903
return -1 if newline == -1 else newline
844904
if two_chars == "/*":

tests/unit/engine/chunker/test_delimiter_edge_cases.py

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -336,6 +336,69 @@ def method_two(self):
336336
assert next_start >= current_end, "Chunks should not overlap"
337337

338338

339+
@pytest.mark.unit
340+
class TestGovernorChecks:
341+
"""Test resource governor integration points."""
342+
343+
def test_timeout_check_between_match_phases(
344+
self, delimiter_chunker: DelimiterChunker
345+
) -> None:
346+
"""Ensure match phase timeout checks are invoked between phases."""
347+
348+
class RecordingGovernor:
349+
def __init__(self, calls: list[str]) -> None:
350+
self._calls = calls
351+
352+
def check_timeout(self) -> None:
353+
self._calls.append("timeout")
354+
355+
calls: list[str] = []
356+
governor = RecordingGovernor(calls)
357+
358+
class RecordingChunker(DelimiterChunker):
359+
def __init__(self, *, governor: ChunkGovernor, calls: list[str]) -> None:
360+
super().__init__(governor=governor)
361+
self._calls = calls
362+
363+
def _match_explicit_delimiters(
364+
self, _: str, __: list[Delimiter]
365+
):
366+
self._calls.append("explicit")
367+
return []
368+
369+
def _match_keyword_delimiters(
370+
self, _: str, __: list[Delimiter]
371+
):
372+
self._calls.append("keyword")
373+
return []
374+
375+
recording_chunker = RecordingChunker(governor=delimiter_chunker.governor, calls=calls)
376+
recording_chunker._delimiters = [
377+
Delimiter(
378+
start="{",
379+
end="}",
380+
kind=DelimiterKind.BLOCK,
381+
priority=DelimiterKind.BLOCK.default_priority,
382+
inclusive=True,
383+
take_whole_lines=False,
384+
nestable=True,
385+
),
386+
Delimiter(
387+
start="def",
388+
end="",
389+
kind=DelimiterKind.FUNCTION,
390+
priority=DelimiterKind.FUNCTION.default_priority,
391+
inclusive=True,
392+
take_whole_lines=True,
393+
nestable=True,
394+
),
395+
]
396+
397+
recording_chunker._find_delimiter_matches("def foo():\n return 1\n", governor=governor)
398+
399+
assert calls == ["explicit", "timeout", "keyword"]
400+
401+
339402
@pytest.mark.unit
340403
class TestEdgeCaseContent:
341404
"""Test delimiter chunker with edge case content."""

0 commit comments

Comments (0)