Skip to content

Commit 43ec09d

Browse files
authored
Feat/1.4.2 (#8)
* fix(perf): reduce redundant syscalls and hash computations * test(extractor): cover block/segment gate branches and hash reuse fallback
1 parent e7b6d43 commit 43ec09d

File tree

11 files changed

+300
-111
lines changed

11 files changed

+300
-111
lines changed

CHANGELOG.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,56 @@
11
# Changelog
22

3+
## [1.4.2] - 2026-02-17
4+
5+
### Overview
6+
7+
This patch release is a maintenance update. Determinism remains guaranteed: reports are stable and ordering is
8+
unchanged.
9+
10+
### Performance & Implementation Cleanup
11+
12+
- `process_file()` now uses a single `os.stat()` call to obtain both size (size guard) and `st_mtime_ns`/`st_size` (file
13+
stat signature), removing a redundant `os.path.getsize()` call.
14+
- Discovery logic was deduplicated by extracting `_discover_files()`; quiet/non-quiet behavior differs only by UI status
15+
wrapper, not by semantics or filtering.
16+
- Cache path wiring now precomputes `wire_map` so `_wire_filepath_from_runtime()` is evaluated once per key.
17+
18+
### Hash Reuse for Block/Segment Analysis
19+
20+
- `extract_blocks()` and `extract_segments()` accept optional `precomputed_hashes`. When provided, they reuse hashes
21+
instead of recomputing.
22+
- The extractor computes function body hashes once and passes them to both block and segment extraction when both
23+
analyses run for the same function.
24+
25+
### Scanner Efficiency (No Semantic Change)
26+
27+
- `iter_py_files()` now filters candidates before sorting, so only valid candidates are sorted. The final order remains
28+
deterministic and equivalent to previous behavior.
29+
30+
### Contract Tightening
31+
32+
- `precomputed_hashes` type strengthened: `list[str] | None` → `Sequence[str] | None` (read-only intent in the type
33+
contract).
34+
- Added `assert len(precomputed_hashes) == len(body)` in both `extract_blocks()` and `extract_segments()` to catch
35+
mismatched inputs early (development-time invariant).
36+
37+
### Testing & Determinism
38+
39+
- Byte-identical JSON reports verified across repeated runs; differences, when present, are limited to
40+
volatile/provenance meta fields (e.g., cache status/path, timestamps), while semantic payload remains stable.
41+
- Unit tests updated to mock `os.stat` instead of `os.path.getsize` where applicable (`test_process_file_stat_error`,
42+
`test_process_file_size_limit`).
43+
44+
### Notes
45+
46+
- No changes to:
47+
- detection semantics / fingerprints
48+
- baseline hash inputs (`payload_sha256` semantic payload)
49+
- exit code contract and precedence
50+
- schema versions (baseline v1.0, cache v1.2, report v1.1)
51+
52+
---
53+
354
## [1.4.1] - 2026-02-15
455

556
### CLI

codeclone/blocks.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from __future__ import annotations
1010

1111
import ast
12+
from collections.abc import Sequence
1213
from dataclasses import dataclass
1314

1415
from .blockhash import stmt_hash
@@ -45,12 +46,20 @@ def extract_blocks(
4546
cfg: NormalizationConfig,
4647
block_size: int,
4748
max_blocks: int,
49+
precomputed_hashes: Sequence[str] | None = None,
4850
) -> list[BlockUnit]:
4951
body = getattr(func_node, "body", None)
5052
if not isinstance(body, list) or len(body) < block_size:
5153
return []
5254

53-
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
55+
if precomputed_hashes is not None:
56+
assert len(precomputed_hashes) == len(body), (
57+
f"precomputed_hashes length {len(precomputed_hashes)} "
58+
f"!= body length {len(body)}"
59+
)
60+
stmt_hashes = precomputed_hashes
61+
else:
62+
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
5463

5564
blocks: list[BlockUnit] = []
5665
last_start: int | None = None
@@ -94,12 +103,20 @@ def extract_segments(
94103
cfg: NormalizationConfig,
95104
window_size: int,
96105
max_segments: int,
106+
precomputed_hashes: Sequence[str] | None = None,
97107
) -> list[SegmentUnit]:
98108
body = getattr(func_node, "body", None)
99109
if not isinstance(body, list) or len(body) < window_size:
100110
return []
101111

102-
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
112+
if precomputed_hashes is not None:
113+
assert len(precomputed_hashes) == len(body), (
114+
f"precomputed_hashes length {len(precomputed_hashes)} "
115+
f"!= body length {len(body)}"
116+
)
117+
stmt_hashes = precomputed_hashes
118+
else:
119+
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
103120

104121
segments: list[SegmentUnit] = []
105122

codeclone/cache.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -344,14 +344,14 @@ def save(self) -> None:
344344
try:
345345
self.path.parent.mkdir(parents=True, exist_ok=True)
346346
wire_files: dict[str, object] = {}
347-
for runtime_path in sorted(
348-
self.data["files"], key=self._wire_filepath_from_runtime
349-
):
347+
wire_map = {
348+
rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"]
349+
}
350+
for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
350351
entry = self.get_file_entry(runtime_path)
351352
if entry is None:
352353
continue
353-
wire_path = self._wire_filepath_from_runtime(runtime_path)
354-
wire_files[wire_path] = _encode_wire_file_entry(entry)
354+
wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry)
355355

356356
payload: dict[str, object] = {
357357
"py": current_python_tag(),

codeclone/cli.py

Lines changed: 39 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,14 @@ def process_file(
122122
"""
123123

124124
try:
125-
# Check file size
125+
# Single os.stat() for both size check and cache signature
126126
try:
127-
st_size = os.path.getsize(filepath)
128-
if st_size > MAX_FILE_SIZE:
127+
st = os.stat(filepath)
128+
if st.st_size > MAX_FILE_SIZE:
129129
return ProcessingResult(
130130
filepath=filepath,
131131
success=False,
132-
error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
132+
error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})",
133133
error_kind="file_too_large",
134134
)
135135
except OSError as e:
@@ -140,6 +140,8 @@ def process_file(
140140
error_kind="stat_error",
141141
)
142142

143+
stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size}
144+
143145
try:
144146
source = Path(filepath).read_text("utf-8")
145147
except UnicodeDecodeError as e:
@@ -157,7 +159,6 @@ def process_file(
157159
error_kind="source_read_error",
158160
)
159161

160-
stat = file_stat_signature(filepath)
161162
module_name = module_name_from_path(root, filepath)
162163

163164
units, blocks, segments = extract_units_from_source(
@@ -355,68 +356,44 @@ def _safe_future_result(
355356
return None, str(e)
356357

357358
# Discovery phase
358-
try:
359-
if args.quiet:
360-
for fp in iter_py_files(str(root_path)):
361-
files_found += 1
362-
stat, cached, warn = _get_cached_entry(fp)
363-
if warn:
364-
console.print(warn)
365-
files_skipped += 1
366-
continue
367-
if cached and cached.get("stat") == stat:
368-
cache_hits += 1
369-
all_units.extend(
370-
cast(
371-
list[GroupItem],
372-
cast(object, cached.get("units", [])),
373-
)
359+
def _discover_files() -> None:
360+
nonlocal files_found, cache_hits, files_skipped
361+
for fp in iter_py_files(str(root_path)):
362+
files_found += 1
363+
stat, cached, warn = _get_cached_entry(fp)
364+
if warn:
365+
console.print(warn)
366+
files_skipped += 1
367+
continue
368+
if cached and cached.get("stat") == stat:
369+
cache_hits += 1
370+
all_units.extend(
371+
cast(
372+
list[GroupItem],
373+
cast(object, cached.get("units", [])),
374374
)
375-
all_blocks.extend(
376-
cast(
377-
list[GroupItem],
378-
cast(object, cached.get("blocks", [])),
379-
)
375+
)
376+
all_blocks.extend(
377+
cast(
378+
list[GroupItem],
379+
cast(object, cached.get("blocks", [])),
380380
)
381-
all_segments.extend(
382-
cast(
383-
list[GroupItem],
384-
cast(object, cached.get("segments", [])),
385-
)
381+
)
382+
all_segments.extend(
383+
cast(
384+
list[GroupItem],
385+
cast(object, cached.get("segments", [])),
386386
)
387-
else:
388-
files_to_process.append(fp)
387+
)
388+
else:
389+
files_to_process.append(fp)
390+
391+
try:
392+
if args.quiet:
393+
_discover_files()
389394
else:
390395
with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
391-
for fp in iter_py_files(str(root_path)):
392-
files_found += 1
393-
stat, cached, warn = _get_cached_entry(fp)
394-
if warn:
395-
console.print(warn)
396-
files_skipped += 1
397-
continue
398-
if cached and cached.get("stat") == stat:
399-
cache_hits += 1
400-
all_units.extend(
401-
cast(
402-
list[GroupItem],
403-
cast(object, cached.get("units", [])),
404-
)
405-
)
406-
all_blocks.extend(
407-
cast(
408-
list[GroupItem],
409-
cast(object, cached.get("blocks", [])),
410-
)
411-
)
412-
all_segments.extend(
413-
cast(
414-
list[GroupItem],
415-
cast(object, cached.get("segments", [])),
416-
)
417-
)
418-
else:
419-
files_to_process.append(fp)
396+
_discover_files()
420397
except OSError as e:
421398
console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e)))
422399
sys.exit(ExitCode.CONTRACT_ERROR)

codeclone/extractor.py

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from contextlib import contextmanager
1717
from dataclasses import dataclass
1818

19+
from .blockhash import stmt_hash
1920
from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments
2021
from .cfg import CFGBuilder
2122
from .errors import ParseError
@@ -250,28 +251,42 @@ def extract_units_from_source(
250251
)
251252
)
252253

253-
# Block-level units (exclude __init__)
254-
if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
255-
blocks = extract_blocks(
256-
node,
257-
filepath=filepath,
258-
qualname=qualname,
259-
cfg=cfg,
260-
block_size=4,
261-
max_blocks=15,
262-
)
263-
block_units.extend(blocks)
264-
265-
# Segment-level units (windows within functions, for internal clones)
266-
if loc >= 30 and stmt_count >= 12:
267-
segments = extract_segments(
268-
node,
269-
filepath=filepath,
270-
qualname=qualname,
271-
cfg=cfg,
272-
window_size=6,
273-
max_segments=60,
274-
)
275-
segment_units.extend(segments)
254+
# Block-level and segment-level units share statement hashes
255+
needs_blocks = (
256+
not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10
257+
)
258+
needs_segments = loc >= 30 and stmt_count >= 12
259+
260+
if needs_blocks or needs_segments:
261+
body = getattr(node, "body", None)
262+
hashes: list[str] | None = None
263+
if isinstance(body, list):
264+
hashes = [stmt_hash(stmt, cfg) for stmt in body]
265+
266+
if needs_blocks:
267+
block_units.extend(
268+
extract_blocks(
269+
node,
270+
filepath=filepath,
271+
qualname=qualname,
272+
cfg=cfg,
273+
block_size=4,
274+
max_blocks=15,
275+
precomputed_hashes=hashes,
276+
)
277+
)
278+
279+
if needs_segments:
280+
segment_units.extend(
281+
extract_segments(
282+
node,
283+
filepath=filepath,
284+
qualname=qualname,
285+
cfg=cfg,
286+
window_size=6,
287+
max_segments=60,
288+
precomputed_hashes=hashes,
289+
)
290+
)
276291

277292
return units, block_units, segment_units

codeclone/scanner.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,9 @@ def iter_py_files(
7777
if root_str.startswith(sensitive + "/"):
7878
raise ValidationError(f"Cannot scan under sensitive directory: {root}")
7979

80-
file_count = 0
81-
for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)):
80+
# Collect and filter first, then sort — avoids sorting excluded paths
81+
candidates: list[Path] = []
82+
for p in rootp.rglob("*.py"):
8283
# Verify path is actually under root (prevent symlink attacks)
8384
try:
8485
p.resolve().relative_to(rootp)
@@ -90,12 +91,15 @@ def iter_py_files(
9091
if any(ex in parts for ex in excludes):
9192
continue
9293

93-
file_count += 1
94-
if file_count > max_files:
95-
raise ValidationError(
96-
f"File count exceeds limit of {max_files}. "
97-
"Use more specific root or increase limit."
98-
)
94+
candidates.append(p)
95+
96+
if len(candidates) > max_files:
97+
raise ValidationError(
98+
f"File count exceeds limit of {max_files}. "
99+
"Use more specific root or increase limit."
100+
)
101+
102+
for p in sorted(candidates, key=lambda path: str(path)):
99103
yield str(p)
100104

101105

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "codeclone"
7-
version = "1.4.1"
7+
version = "1.4.2"
88
description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
99
readme = { file = "README.md", content-type = "text/markdown" }
1010
license = { text = "MIT" }

0 commit comments

Comments
 (0)