Skip to content

Commit 43ec09d

Browse files
authored
Feat/1.4.2 (#8)
* fix(perf): reduce redundant syscalls and hash computations * test(extractor): cover block/segment gate branches and hash reuse fallback
1 parent e7b6d43 commit 43ec09d

File tree

11 files changed

+300
-111
lines changed

11 files changed

+300
-111
lines changed

CHANGELOG.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,56 @@
11
# Changelog
22

3+
## [1.4.2] - 2026-02-17
4+
5+
### Overview
6+
7+
This patch release is a maintenance update. Determinism remains guaranteed: reports are stable and ordering is
8+
unchanged.
9+
10+
### Performance & Implementation Cleanup
11+
12+
- `process_file()` now uses a single `os.stat()` call to obtain both size (size guard) and `st_mtime_ns`/`st_size` (file
13+
stat signature), removing a redundant `os.path.getsize()` call.
14+
- Discovery logic was deduplicated by extracting `_discover_files()`; quiet/non-quiet behavior differs only by UI status
15+
wrapper, not by semantics or filtering.
16+
- Cache path wiring now precomputes `wire_map` so `_wire_filepath_from_runtime()` is evaluated once per key.
17+
18+
### Hash Reuse for Block/Segment Analysis
19+
20+
- `extract_blocks()` and `extract_segments()` accept optional `precomputed_hashes`. When provided, they reuse hashes
21+
instead of recomputing.
22+
- The extractor computes function body hashes once and passes them to both block and segment extraction when both
23+
analyses run for the same function.
24+
25+
### Scanner Efficiency (No Semantic Change)
26+
27+
- `iter_py_files()` now filters candidates before sorting, so only valid candidates are sorted. The final order remains
28+
deterministic and equivalent to previous behavior.
29+
30+
### Contract Tightening
31+
32+
- `precomputed_hashes` type strengthened: `list[str] | None` → `Sequence[str] | None` (read-only intent in the type
33+
contract).
34+
- Added `assert len(precomputed_hashes) == len(body)` in both `extract_blocks()` and `extract_segments()` to catch
35+
mismatched inputs early (development-time invariant).
36+
37+
### Testing & Determinism
38+
39+
- Byte-identical JSON reports verified across repeated runs; differences, when present, are limited to
40+
volatile/provenance meta fields (e.g., cache status/path, timestamps), while semantic payload remains stable.
41+
- Unit tests updated to mock `os.stat` instead of `os.path.getsize` where applicable (`test_process_file_stat_error`,
42+
`test_process_file_size_limit`).
43+
44+
### Notes
45+
46+
- No changes to:
47+
- detection semantics / fingerprints
48+
- baseline hash inputs (`payload_sha256` semantic payload)
49+
- exit code contract and precedence
50+
- schema versions (baseline v1.0, cache v1.2, report v1.1)
51+
52+
---
53+
354
## [1.4.1] - 2026-02-15
455

556
### CLI

codeclone/blocks.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from __future__ import annotations
1010

1111
import ast
12+
from collections.abc import Sequence
1213
from dataclasses import dataclass
1314

1415
from .blockhash import stmt_hash
@@ -45,12 +46,20 @@ def extract_blocks(
4546
cfg: NormalizationConfig,
4647
block_size: int,
4748
max_blocks: int,
49+
precomputed_hashes: Sequence[str] | None = None,
4850
) -> list[BlockUnit]:
4951
body = getattr(func_node, "body", None)
5052
if not isinstance(body, list) or len(body) < block_size:
5153
return []
5254

53-
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
55+
if precomputed_hashes is not None:
56+
assert len(precomputed_hashes) == len(body), (
57+
f"precomputed_hashes length {len(precomputed_hashes)} "
58+
f"!= body length {len(body)}"
59+
)
60+
stmt_hashes = precomputed_hashes
61+
else:
62+
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
5463

5564
blocks: list[BlockUnit] = []
5665
last_start: int | None = None
@@ -94,12 +103,20 @@ def extract_segments(
94103
cfg: NormalizationConfig,
95104
window_size: int,
96105
max_segments: int,
106+
precomputed_hashes: Sequence[str] | None = None,
97107
) -> list[SegmentUnit]:
98108
body = getattr(func_node, "body", None)
99109
if not isinstance(body, list) or len(body) < window_size:
100110
return []
101111

102-
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
112+
if precomputed_hashes is not None:
113+
assert len(precomputed_hashes) == len(body), (
114+
f"precomputed_hashes length {len(precomputed_hashes)} "
115+
f"!= body length {len(body)}"
116+
)
117+
stmt_hashes = precomputed_hashes
118+
else:
119+
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
103120

104121
segments: list[SegmentUnit] = []
105122

codeclone/cache.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -344,14 +344,14 @@ def save(self) -> None:
344344
try:
345345
self.path.parent.mkdir(parents=True, exist_ok=True)
346346
wire_files: dict[str, object] = {}
347-
for runtime_path in sorted(
348-
self.data["files"], key=self._wire_filepath_from_runtime
349-
):
347+
wire_map = {
348+
rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"]
349+
}
350+
for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
350351
entry = self.get_file_entry(runtime_path)
351352
if entry is None:
352353
continue
353-
wire_path = self._wire_filepath_from_runtime(runtime_path)
354-
wire_files[wire_path] = _encode_wire_file_entry(entry)
354+
wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry)
355355

356356
payload: dict[str, object] = {
357357
"py": current_python_tag(),

codeclone/cli.py

Lines changed: 39 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,14 @@ def process_file(
122122
"""
123123

124124
try:
125-
# Check file size
125+
# Single os.stat() for both size check and cache signature
126126
try:
127-
st_size = os.path.getsize(filepath)
128-
if st_size > MAX_FILE_SIZE:
127+
st = os.stat(filepath)
128+
if st.st_size > MAX_FILE_SIZE:
129129
return ProcessingResult(
130130
filepath=filepath,
131131
success=False,
132-
error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
132+
error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})",
133133
error_kind="file_too_large",
134134
)
135135
except OSError as e:
@@ -140,6 +140,8 @@ def process_file(
140140
error_kind="stat_error",
141141
)
142142

143+
stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size}
144+
143145
try:
144146
source = Path(filepath).read_text("utf-8")
145147
except UnicodeDecodeError as e:
@@ -157,7 +159,6 @@ def process_file(
157159
error_kind="source_read_error",
158160
)
159161

160-
stat = file_stat_signature(filepath)
161162
module_name = module_name_from_path(root, filepath)
162163

163164
units, blocks, segments = extract_units_from_source(
@@ -355,68 +356,44 @@ def _safe_future_result(
355356
return None, str(e)
356357

357358
# Discovery phase
358-
try:
359-
if args.quiet:
360-
for fp in iter_py_files(str(root_path)):
361-
files_found += 1
362-
stat, cached, warn = _get_cached_entry(fp)
363-
if warn:
364-
console.print(warn)
365-
files_skipped += 1
366-
continue
367-
if cached and cached.get("stat") == stat:
368-
cache_hits += 1
369-
all_units.extend(
370-
cast(
371-
list[GroupItem],
372-
cast(object, cached.get("units", [])),
373-
)
359+
def _discover_files() -> None:
360+
nonlocal files_found, cache_hits, files_skipped
361+
for fp in iter_py_files(str(root_path)):
362+
files_found += 1
363+
stat, cached, warn = _get_cached_entry(fp)
364+
if warn:
365+
console.print(warn)
366+
files_skipped += 1
367+
continue
368+
if cached and cached.get("stat") == stat:
369+
cache_hits += 1
370+
all_units.extend(
371+
cast(
372+
list[GroupItem],
373+
cast(object, cached.get("units", [])),
374374
)
375-
all_blocks.extend(
376-
cast(
377-
list[GroupItem],
378-
cast(object, cached.get("blocks", [])),
379-
)
375+
)
376+
all_blocks.extend(
377+
cast(
378+
list[GroupItem],
379+
cast(object, cached.get("blocks", [])),
380380
)
381-
all_segments.extend(
382-
cast(
383-
list[GroupItem],
384-
cast(object, cached.get("segments", [])),
385-
)
381+
)
382+
all_segments.extend(
383+
cast(
384+
list[GroupItem],
385+
cast(object, cached.get("segments", [])),
386386
)
387-
else:
388-
files_to_process.append(fp)
387+
)
388+
else:
389+
files_to_process.append(fp)
390+
391+
try:
392+
if args.quiet:
393+
_discover_files()
389394
else:
390395
with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
391-
for fp in iter_py_files(str(root_path)):
392-
files_found += 1
393-
stat, cached, warn = _get_cached_entry(fp)
394-
if warn:
395-
console.print(warn)
396-
files_skipped += 1
397-
continue
398-
if cached and cached.get("stat") == stat:
399-
cache_hits += 1
400-
all_units.extend(
401-
cast(
402-
list[GroupItem],
403-
cast(object, cached.get("units", [])),
404-
)
405-
)
406-
all_blocks.extend(
407-
cast(
408-
list[GroupItem],
409-
cast(object, cached.get("blocks", [])),
410-
)
411-
)
412-
all_segments.extend(
413-
cast(
414-
list[GroupItem],
415-
cast(object, cached.get("segments", [])),
416-
)
417-
)
418-
else:
419-
files_to_process.append(fp)
396+
_discover_files()
420397
except OSError as e:
421398
console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e)))
422399
sys.exit(ExitCode.CONTRACT_ERROR)

codeclone/extractor.py

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from contextlib import contextmanager
1717
from dataclasses import dataclass
1818

19+
from .blockhash import stmt_hash
1920
from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments
2021
from .cfg import CFGBuilder
2122
from .errors import ParseError
@@ -250,28 +251,42 @@ def extract_units_from_source(
250251
)
251252
)
252253

253-
# Block-level units (exclude __init__)
254-
if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
255-
blocks = extract_blocks(
256-
node,
257-
filepath=filepath,
258-
qualname=qualname,
259-
cfg=cfg,
260-
block_size=4,
261-
max_blocks=15,
262-
)
263-
block_units.extend(blocks)
264-
265-
# Segment-level units (windows within functions, for internal clones)
266-
if loc >= 30 and stmt_count >= 12:
267-
segments = extract_segments(
268-
node,
269-
filepath=filepath,
270-
qualname=qualname,
271-
cfg=cfg,
272-
window_size=6,
273-
max_segments=60,
274-
)
275-
segment_units.extend(segments)
254+
# Block-level and segment-level units share statement hashes
255+
needs_blocks = (
256+
not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10
257+
)
258+
needs_segments = loc >= 30 and stmt_count >= 12
259+
260+
if needs_blocks or needs_segments:
261+
body = getattr(node, "body", None)
262+
hashes: list[str] | None = None
263+
if isinstance(body, list):
264+
hashes = [stmt_hash(stmt, cfg) for stmt in body]
265+
266+
if needs_blocks:
267+
block_units.extend(
268+
extract_blocks(
269+
node,
270+
filepath=filepath,
271+
qualname=qualname,
272+
cfg=cfg,
273+
block_size=4,
274+
max_blocks=15,
275+
precomputed_hashes=hashes,
276+
)
277+
)
278+
279+
if needs_segments:
280+
segment_units.extend(
281+
extract_segments(
282+
node,
283+
filepath=filepath,
284+
qualname=qualname,
285+
cfg=cfg,
286+
window_size=6,
287+
max_segments=60,
288+
precomputed_hashes=hashes,
289+
)
290+
)
276291

277292
return units, block_units, segment_units

codeclone/scanner.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,9 @@ def iter_py_files(
7777
if root_str.startswith(sensitive + "/"):
7878
raise ValidationError(f"Cannot scan under sensitive directory: {root}")
7979

80-
file_count = 0
81-
for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)):
80+
# Collect and filter first, then sort — avoids sorting excluded paths
81+
candidates: list[Path] = []
82+
for p in rootp.rglob("*.py"):
8283
# Verify path is actually under root (prevent symlink attacks)
8384
try:
8485
p.resolve().relative_to(rootp)
@@ -90,12 +91,15 @@ def iter_py_files(
9091
if any(ex in parts for ex in excludes):
9192
continue
9293

93-
file_count += 1
94-
if file_count > max_files:
95-
raise ValidationError(
96-
f"File count exceeds limit of {max_files}. "
97-
"Use more specific root or increase limit."
98-
)
94+
candidates.append(p)
95+
96+
if len(candidates) > max_files:
97+
raise ValidationError(
98+
f"File count exceeds limit of {max_files}. "
99+
"Use more specific root or increase limit."
100+
)
101+
102+
for p in sorted(candidates, key=lambda path: str(path)):
99103
yield str(p)
100104

101105

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "codeclone"
7-
version = "1.4.1"
7+
version = "1.4.2"
88
description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
99
readme = { file = "README.md", content-type = "text/markdown" }
1010
license = { text = "MIT" }

0 commit comments

Comments
 (0)