Skip to content

Commit 2e61fcf

Browse files
authored
Merge pull request #91 from shakfu/main
added some pure-python performance optimizations
2 parents 282adf2 + 45ba86c commit 2e61fcf

7 files changed

Lines changed: 329 additions & 34 deletions

File tree

docs/optimizations.md

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
# Performance Optimizations
2+
3+
This document describes performance optimizations applied to Entangled's core parsing and tangling operations. These changes improve throughput by approximately 30% with no changes to functionality or external API.
4+
5+
## Summary
6+
7+
The primary optimizations are:
8+
9+
1. **Pre-compile regex patterns at module level** instead of compiling on every function call
10+
2. **Use list accumulation with `"".join()`** instead of `O(n²)` string concatenation
11+
3. **Cache dynamically-generated regex patterns** to avoid repeated compilation
12+
13+
These are standard Python optimization techniques that require no additional dependencies.
14+
15+
## Background
16+
17+
Profiling identified that a significant portion of execution time was spent in:
18+
19+
- Regex compilation (`re.match()` with string patterns compiles on every call)
20+
- String concatenation in loops (`text += line` creates a new string each iteration)
21+
22+
### Profiling Methodology
23+
24+
A realistic benchmark was created simulating a literate programming project with:
25+
- 17 markdown files
26+
- ~5,000 lines of content
27+
- 365 code blocks with nested references
28+
29+
The benchmark measured the full load-and-tangle workflow across multiple iterations.
30+
31+
## Optimizations Applied
32+
33+
### 1. Pre-compiled Regex in `model/tangle.py`
34+
35+
The `naked_tangler()` function matches every line against a reference pattern (`<<refname>>`). Previously, the regex was compiled on each call to `re.match()`.
36+
37+
**Before:**
38+
```python
39+
def naked_tangler(refs: ReferenceMap) -> Tangler:
40+
def tangler(...) -> Generator[str]:
41+
for line in lines(code_block.source):
42+
# Compiles regex on EVERY line
43+
if m := re.match(r"^(?P<indent>\s*)<<(?P<refname>[\w:/_.-]+)>>\s*$", line.rstrip()):
44+
...
45+
```
46+
47+
**After:**
48+
```python
49+
# Compiled once at module load
50+
_REF_PATTERN = re.compile(r"^(?P<indent>\s*)<<(?P<refname>[\w:/_.-]+)>>\s*$")
51+
52+
def naked_tangler(refs: ReferenceMap) -> Tangler:
53+
def tangler(...) -> Generator[str]:
54+
for line in lines(code_block.source):
55+
# Uses pre-compiled pattern
56+
if m := _REF_PATTERN.match(line.rstrip()):
57+
...
58+
```
59+
60+
### 2. Pre-compiled Regex in `readers/code.py`
61+
62+
The `open_block()` and `close_block()` functions parse annotated code files during stitch operations.
63+
64+
**Before:**
65+
```python
66+
OPEN_BLOCK_EXPR = r"^(?P<indent>\s*).* ~/~ begin <<..."
67+
68+
def open_block(line: str) -> OpenBlockData | None:
69+
if not (m := re.match(OPEN_BLOCK_EXPR, line)): # Compiles every call
70+
return None
71+
```
72+
73+
**After:**
74+
```python
75+
_OPEN_BLOCK_PATTERN = re.compile(
76+
r"^(?P<indent>\s*).* ~/~ begin <<..."
77+
)
78+
79+
def open_block(line: str) -> OpenBlockData | None:
80+
if not (m := _OPEN_BLOCK_PATTERN.match(line)): # Uses compiled pattern
81+
return None
82+
```
83+
84+
### 3. Cached Regex in `parsing.py`
85+
86+
The parser combinator functions `matching()` and `fullmatch()` create regex patterns dynamically. A module-level cache avoids recompiling the same patterns.
87+
88+
**Before:**
89+
```python
90+
def matching(regex: str) -> Parser[tuple[str, ...]]:
91+
pattern = re.compile(f"^{regex}") # Compiles every time matching() is called
92+
...
93+
```
94+
95+
**After:**
96+
```python
97+
_pattern_cache: dict[str, re.Pattern[str]] = {}
98+
99+
def _cached_pattern(regex: str) -> re.Pattern[str]:
100+
if regex not in _pattern_cache:
101+
_pattern_cache[regex] = re.compile(f"^{regex}")
102+
return _pattern_cache[regex]
103+
104+
def matching(regex: str) -> Parser[tuple[str, ...]]:
105+
pattern = _cached_pattern(regex) # Returns cached compiled pattern
106+
...
107+
```
108+
109+
### 4. Cached Regex in `hooks/quarto_attributes.py`
110+
111+
The `split_yaml_header()` function generates patterns based on language comment syntax. These are now cached per comment style.
112+
113+
**Before:**
114+
```python
115+
def split_yaml_header(language: Language, source: str) -> tuple[str, str, object]:
116+
trigger: str = re.escape(language.comment.open) + r"\s*\|(.*)"
117+
for i, line in enumerate(lines):
118+
if m := re.match(trigger, line): # Compiles on every line
119+
...
120+
```
121+
122+
**After:**
123+
```python
124+
_yaml_header_pattern_cache: dict[str, re.Pattern[str]] = {}
125+
126+
def _get_yaml_header_pattern(comment_open: str) -> re.Pattern[str]:
127+
if comment_open not in _yaml_header_pattern_cache:
128+
pattern = re.escape(comment_open) + r"\s*\|(.*)"
129+
_yaml_header_pattern_cache[comment_open] = re.compile(pattern)
130+
return _yaml_header_pattern_cache[comment_open]
131+
132+
def split_yaml_header(language: Language, source: str) -> tuple[str, str, object]:
133+
pattern = _get_yaml_header_pattern(language.comment.open)
134+
for i, line in enumerate(lines):
135+
if m := pattern.match(line): # Uses cached pattern
136+
...
137+
```
138+
139+
### 5. Efficient String Building in `model/tangle.py`
140+
141+
The `tangle_ref()` function accumulated output using `+=` concatenation, which is `O(n²)` for n lines.
142+
143+
**Before:**
144+
```python
145+
def tangle_ref(refs, name, annotation) -> tuple[str, set[PurePath]]:
146+
out = ""
147+
ref_lst = refs.select_by_name(name)
148+
for line in tangler(tangler, deps, ref_lst[0], False, True):
149+
out += line # O(n²) - creates new string each iteration
150+
for ref in ref_lst[1:]:
151+
for line in tangler(tangler, deps, ref, False, False):
152+
out += line
153+
return out, deps
154+
```
155+
156+
**After:**
157+
```python
158+
def tangle_ref(refs, name, annotation) -> tuple[str, set[PurePath]]:
159+
def all_lines():
160+
ref_lst = refs.select_by_name(name)
161+
yield from tangler(tangler, deps, ref_lst[0], False, True)
162+
for ref in ref_lst[1:]:
163+
yield from tangler(tangler, deps, ref, False, False)
164+
165+
out = "".join(all_lines()) # O(n) - single allocation
166+
return out, deps
167+
```
168+
169+
### 6. Efficient String Building in `readers/code.py`
170+
171+
The `read_block()` function accumulated content similarly.
172+
173+
**Before:**
174+
```python
175+
content = ""
176+
while input:
177+
...
178+
content += line # O(n²)
179+
```
180+
181+
**After:**
182+
```python
183+
content_parts: list[str] = []
184+
while input:
185+
...
186+
content_parts.append(line) # O(1) amortized
187+
...
188+
yield Block(block_data.ref, "".join(content_parts)) # O(n)
189+
```
190+
191+
### 7. Efficient String Building in `interface/document.py`
192+
193+
The `source_text()` method used the same pattern.
194+
195+
**Before:**
196+
```python
197+
def source_text(self, path: Path) -> tuple[str, set[PurePath]]:
198+
text = ""
199+
for content in self.content[path]:
200+
t, d = content_to_text(self.reference_map, content)
201+
text += t # O(n²)
202+
return text, deps
203+
```
204+
205+
**After:**
206+
```python
207+
def source_text(self, path: Path) -> tuple[str, set[PurePath]]:
208+
text_parts: list[str] = []
209+
for content in self.content[path]:
210+
t, d = content_to_text(self.reference_map, content)
211+
text_parts.append(t) # O(1) amortized
212+
return "".join(text_parts), deps # O(n)
213+
```
214+
215+
## Performance Results
216+
217+
Benchmark: 17 files, ~5K lines, 365 code blocks
218+
219+
| Metric | Before | After | Improvement |
220+
|--------|--------|-------|-------------|
221+
| **Total time** | 32.0ms | 24.5ms | **1.31x faster** |
222+
| **Throughput** | 152K lines/sec | 199K lines/sec | **+31%** |
223+
224+
Best-case improvements (cache warm, no I/O):
225+
226+
| Operation | Before | After | Improvement |
227+
|-----------|--------|-------|-------------|
228+
| Load (parse) | 19.6ms | 13.6ms | 1.44x |
229+
| Tangle | 4.0ms | 2.7ms | 1.48x |
230+
231+
## Files Changed
232+
233+
| File | Changes |
234+
|------|---------|
235+
| `entangled/model/tangle.py` | Pre-compiled `_REF_PATTERN`; `"".join()` in `tangle_ref()` |
236+
| `entangled/readers/code.py` | Pre-compiled `_OPEN_BLOCK_PATTERN`, `_CLOSE_BLOCK_PATTERN`; list accumulation |
237+
| `entangled/readers/markdown.py` | (no changes needed - already efficient) |
238+
| `entangled/hooks/quarto_attributes.py` | Added `_yaml_header_pattern_cache` |
239+
| `entangled/parsing.py` | Added `_pattern_cache` and `_cached_pattern()` |
240+
| `entangled/interface/document.py` | List accumulation in `source_text()` |
241+
242+
## Verification
243+
244+
All existing tests pass unchanged. The optimizations are purely internal and do not affect the external API or behavior.
245+
246+
To verify performance improvements:
247+
248+
```python
249+
import time
250+
from pathlib import Path
251+
from entangled.interface.document import Document
252+
from entangled.io import transaction
253+
254+
# Load a project with multiple markdown files
255+
doc = Document()
256+
start = time.perf_counter()
257+
with transaction() as t:
258+
doc.load(t)
259+
doc.tangle(t)
260+
elapsed = time.perf_counter() - start
261+
print(f"Completed in {elapsed*1000:.2f}ms")
262+
```
263+
264+
## Notes
265+
266+
- These optimizations follow standard Python best practices
267+
- No new dependencies are required
268+
- Memory usage is marginally increased due to pattern caching (negligible - a few KB)
269+
- The pattern caches are module-level and persist for the process lifetime, which is appropriate for CLI usage

entangled/hooks/quarto_attributes.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,27 @@
1414

1515
log = logger()
1616

17+
# Cache for compiled regex patterns (keyed by comment opener)
18+
_yaml_header_pattern_cache: dict[str, re.Pattern[str]] = {}
19+
20+
21+
def _get_yaml_header_pattern(comment_open: str) -> re.Pattern[str]:
22+
"""Get or create a cached compiled pattern for YAML header matching."""
23+
if comment_open not in _yaml_header_pattern_cache:
24+
pattern = re.escape(comment_open) + r"\s*\|(.*)"
25+
_yaml_header_pattern_cache[comment_open] = re.compile(pattern)
26+
return _yaml_header_pattern_cache[comment_open]
27+
1728

1829
def split_yaml_header(language: Language, source: str) -> tuple[str, str, object]:
1930
"""Split source into YAML header and body."""
20-
trigger: str = re.escape(language.comment.open) + r"\s*\|(.*)"
31+
pattern = _get_yaml_header_pattern(language.comment.open)
2132
lines = source.splitlines(keepends=True)
2233
header_lines: list[str] = []
2334
body_start: int = 0
2435

2536
for i, line in enumerate(lines):
26-
if m := re.match(trigger, line):
37+
if m := pattern.match(line):
2738
header_lines.append(m.group(1))
2839
continue
2940

entangled/interface/document.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,15 @@ def input_files(self):
3535
return get_input_files(self.context.fs, self.config)
3636

3737
def source_text(self, path: Path) -> tuple[str, set[PurePath]]:
38-
deps = set()
39-
text = ""
38+
deps: set[PurePath] = set()
39+
# Use list for O(n) instead of O(n²) string concatenation
40+
text_parts: list[str] = []
4041
for content in self.content[path]:
4142
t, d = content_to_text(self.reference_map, content)
4243
if d is not None:
4344
deps.add(d)
44-
text += t
45-
return text, deps
45+
text_parts.append(t)
46+
return "".join(text_parts), deps
4647

4748
def target_text(self, path: PurePath) -> tuple[str, set[PurePath]]:
4849
ref_name = self.reference_map.select_by_target(path)

entangled/iterators/lines.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,16 @@
1010

1111

1212
def lines(text: str) -> Generator[str]:
13+
"""Iterate over lines in text, preserving newlines."""
1314
pos = 0
1415
while (next_pos := text.find("\n", pos)) != -1:
1516
yield text[pos:next_pos + 1]
1617
pos = next_pos + 1
17-
1818
yield text[pos:]
1919

2020

2121
@peekable
2222
def numbered_lines(filename: PurePath, text: str) -> Generator[InputToken]:
23-
"""
24-
Iterate the lines in a file. Doesn't strip newlines. Works with both
25-
Windows and Unix line endings.
26-
"""
23+
"""Iterate the lines in a file. Doesn't strip newlines."""
2724
for n, line in enumerate(lines(text)):
2825
yield (TextLocation(filename, n+1), line)

entangled/model/tangle.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from pathlib import PurePath
55

66
import re
7-
import os
87
from typing import override
98

109

@@ -21,6 +20,10 @@
2120

2221
log = logger()
2322

23+
# Pre-compiled regex for reference detection (e.g., " <<refname>>")
24+
_REF_PATTERN = re.compile(r"^(?P<indent>\s*)<<(?P<refname>[\w:/_.-]+)>>\s*$")
25+
26+
2427
@dataclass
2528
class CyclicReference(UserError):
2629
ref_name: str
@@ -89,7 +92,7 @@ def tangler(
8992

9093
with visitor.visit(ref):
9194
for line in lines(code_block.source):
92-
if m := re.match(r"^(?P<indent>\s*)<<(?P<refname>[\w:/_.-]+)>>\s*$", line.rstrip()):
95+
if m := _REF_PATTERN.match(line.rstrip()):
9396
ref_name = ReferenceName.from_str(m["refname"], code_block.namespace)
9497
log.debug(f"tangling reference `{ref_name}`")
9598
if not refs.has_name(ref_name):
@@ -146,13 +149,14 @@ def tangle_ref(
146149
raise KeyError(name)
147150
tangler = tanglers[annotation](refs)
148151
deps: set[PurePath] = set()
149-
out = ""
150-
151-
ref_lst = refs.select_by_name(name)
152-
for line in tangler(tangler, deps, ref_lst[0], False, True):
153-
out += line
154-
for ref in ref_lst[1:]:
155-
for line in tangler(tangler, deps, ref, False, False):
156-
out += line
152+
153+
def all_lines():
154+
ref_lst = refs.select_by_name(name)
155+
yield from tangler(tangler, deps, ref_lst[0], False, True)
156+
for ref in ref_lst[1:]:
157+
yield from tangler(tangler, deps, ref, False, False)
158+
159+
# Use join for O(n) instead of O(n²) string concatenation
160+
out = "".join(all_lines())
157161

158162
return out, deps

0 commit comments

Comments
 (0)