Skip to content

Commit 208aa74

Browse files
committed
feat(version_scanner): implement generic subdirectory filtering and layout-agnostic package naming
1 parent a7907a9 commit 208aa74

2 files changed

Lines changed: 148 additions & 22 deletions

File tree

scripts/version_scanner/tests/unit/test_version_scanner.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,3 +478,50 @@ def test_regex_examples_from_config():
478478
matched = True
479479
break
480480
assert matched, f"Example '{example}' in group '{name}' did not match any pattern."
481+
482+
483+
def test_scan_repository_layout_agnostic(tmp_path):
    """Check that target_packages filtering works outside the 'packages/' root.

    Two packages with identical matching content are created under different
    top-level directories ('generated' and 'handwritten'). Only the
    handwritten one is requested, so exactly one match is expected and it
    must be attributed to 'pkg_hand'.
    """
    import os

    from version_scanner import scan_repository

    # Create directories under different roots
    p1 = tmp_path / "generated" / "pkg_gen"
    p1.mkdir(parents=True)
    (p1 / "setup.py").write_text("python_requires = '>=3.7'\n")

    p2 = tmp_path / "handwritten" / "pkg_hand"
    p2.mkdir(parents=True)
    (p2 / "setup.py").write_text("python_requires = '>=3.7'\n")

    rules = [
        {"name": "python_requires_check", "pattern": "python_requires\\s*=\\s*['\"]>=3\\.7['\"]"}
    ]

    # Scan only handwritten package
    results = scan_repository(
        str(tmp_path),
        rules,
        target_packages=["handwritten/pkg_hand"]
    )

    assert len(results) == 1
    assert results[0]["package_name"] == "pkg_hand"
    # The scanner joins paths with the native separator; normalise to '/' so
    # the assertion does not depend on the platform's os.sep.
    found_path = results[0]["file_path"].replace(os.sep, "/")
    assert "handwritten/pkg_hand/setup.py" in found_path
509+
510+
511+
def test_scan_repository_package_name_roots(tmp_path):
    """Check package-name detection for a package under 'third_party/'.

    With no target_packages filter the whole tree is scanned; the single
    matching file must be reported with the directory directly below the
    'third_party' root as its package name.
    """
    import os

    from version_scanner import scan_repository

    # Create one package under a non-'packages' root (third_party)
    p1 = tmp_path / "third_party" / "pkg_third"
    p1.mkdir(parents=True)
    (p1 / "setup.py").write_text("python_requires = '>=3.7'\n")

    rules = [
        {"name": "python_requires_check", "pattern": "python_requires\\s*=\\s*['\"]>=3\\.7['\"]"}
    ]

    results = scan_repository(str(tmp_path), rules)

    assert len(results) == 1
    assert results[0]["package_name"] == "pkg_third"
    # The scanner joins paths with the native separator; normalise to '/' so
    # the assertion does not depend on the platform's os.sep.
    found_path = results[0]["file_path"].replace(os.sep, "/")
    assert "third_party/pkg_third/setup.py" in found_path

scripts/version_scanner/version_scanner.py

Lines changed: 101 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,60 @@
2727
import yaml
2828

2929
class ConfigManager:
30-
"""Handles loading and interpolation of regex configurations."""
30+
"""
31+
Handles loading, validation, and interpolation of the regex configuration rules.
32+
33+
Uses template-based regex configurations from a YAML file and interpolates them
34+
with dependency properties and computed version segments (e.g., major, minor) to
35+
generate active regex search patterns dynamically.
36+
"""
3137

3238
def __init__(self, config_path: str, dependency: str, version: str):
39+
"""
40+
Initializes the configuration manager.
41+
42+
Args:
43+
config_path: Path to the YAML configuration file containing regex templates.
44+
dependency: Name of the dependency to search for (e.g., "python", "protobuf").
45+
version: Specific target version string to search for (e.g., "3.7", "4.25.8").
46+
"""
3347
self.config_path = config_path
3448
self.dependency = dependency
3549
self.version = version
3650
self.variables = self._compute_variables()
3751

3852
def _compute_variables(self) -> Dict[str, str]:
39-
"""Compute variables for interpolation from version string."""
53+
"""
54+
Parses the version string and computes variables for template interpolation.
55+
56+
Splits the version string by '.' and generates segments like major, minor,
57+
patch, minor+1, and minor-1. These variables are used in regex templates
58+
(e.g., `{minor_plus_one}` to search for Python 3.8 when EOLing 3.7).
59+
60+
Examples:
61+
If version is "3.7" and dependency is "python":
62+
vars = {
63+
"name": "python",
64+
"version": "3.7",
65+
"major": "3",
66+
"minor": "7",
67+
"minor_plus_one": "8",
68+
"minor_minus_one": "6"
69+
}
70+
If version is "4.25.8" and dependency is "protobuf":
71+
vars = {
72+
"name": "protobuf",
73+
"version": "4.25.8",
74+
"major": "4",
75+
"minor": "25",
76+
"patch": "8",
77+
"minor_plus_one": "26",
78+
"minor_minus_one": "24"
79+
}
80+
81+
Returns:
82+
A dictionary mapping variable placeholder names to their resolved string values.
83+
"""
4084
vars = {
4185
"name": self.dependency,
4286
"version": self.version,
@@ -148,7 +192,22 @@ def format_match_for_csv(
148192
branch: str = "main"
149193
) -> Dict[str, str]:
150194
"""
151-
Format a match result for CSV output, adding GitHub links if requested.
195+
Formats a raw match dictionary for clean CSV presentation and imports.
196+
197+
Cleans long context lines by truncating them around the match location to prevent
198+
extreme cell overflow in spreadsheets. Optionally transforms line numbers into
199+
clickable `=HYPERLINK(...)` formulas linking directly to the exact file and line
200+
number in GitHub.
201+
202+
Args:
203+
match: A match dictionary containing 'file_path', 'repo_path', 'rule_name',
204+
'line_number', 'matched_string', and 'context_line'.
205+
github_repo: Optional GitHub repository base URL (e.g., "https://github.com/user/repo").
206+
If provided, triggers the hyperlink generation.
207+
branch: Optional branch name to build the GitHub blob URL (defaults to "main").
208+
209+
Returns:
210+
A copy of the match dictionary with formatted/truncated values, suitable for CSV writing.
152211
"""
153212
formatted = match.copy()
154213

@@ -368,16 +427,26 @@ def scan_repository(
368427
version_string: str = None
369428
) -> List[Dict[str, str]]:
370429
"""
371-
Scan repository for matching patterns.
430+
Scans the repository directory tree applying resolved regex patterns to files.
431+
432+
Walks the directory structure starting at the root path, checking filenames and
433+
file contents line-by-line against compiled patterns. Supports case-insensitive
434+
directory/file ignore patterns, dynamic package filter checks for layout-agnostic
435+
subfolders, and filename-based version string matching.
372436
373437
Args:
374-
root_path: Path to the repository root.
375-
rules: A list of dictionaries containing 'name' and 'pattern'.
376-
target_packages: A list of package paths to include (e.g., ['packages/pkg_a']).
377-
If None or empty, all packages are scanned.
438+
root_path: Absolute or relative path to the directory tree root to scan.
439+
rules: A list of dictionaries containing 'name' (rule name) and 'pattern'
440+
(regex search pattern string).
441+
target_packages: Optional list of specific subdirectory paths to restrict scanning
442+
(e.g., ['packages/pkg_a', 'generated/pkg_b']). If None or empty,
443+
performs a full recursive scan of the repository.
444+
ignore_dirs: Optional list of directory names or glob-like files to ignore (case-insensitive).
445+
version_string: Optional target version string (e.g. "3.7") to scan for in filenames.
378446
379447
Returns:
380-
A list of match details.
448+
A list of dictionaries detailing each match: 'file_path', 'repo_path',
449+
'package_name', 'rule_name', 'line_number', 'matched_string', 'context_line'.
381450
"""
382451
ignore_lower = {i.lower() for i in ignore_dirs} if ignore_dirs else set()
383452
results = []
@@ -408,17 +477,21 @@ def scan_repository(
408477
rel_root = os.path.relpath(root, root_path)
409478
parts = rel_root.split(os.sep)
410479

411-
# Monorepo filtering
412-
if target_packages and parts[0] == "packages":
413-
if len(parts) >= 2:
414-
current_package_path = os.path.join(parts[0], parts[1])
415-
if current_package_path not in target_packages:
416-
# Skip this directory and all subdirectories
417-
dirs[:] = []
418-
continue
419-
else:
420-
# We are in the "packages" directory itself. Continue to walk.
421-
pass
480+
# Layout-agnostic generic subdirectory filtering
481+
if target_packages:
482+
norm_targets = {os.path.normpath(tp) for tp in target_packages}
483+
is_valid_path = False
484+
for target in norm_targets:
485+
if (rel_root == "." or
486+
rel_root == target or
487+
rel_root.startswith(target + os.sep) or
488+
target.startswith(rel_root + os.sep)):
489+
is_valid_path = True
490+
break
491+
if not is_valid_path:
492+
# Skip searching this directory and all its descendants
493+
dirs[:] = []
494+
continue
422495

423496
for file in files:
424497
file_path = os.path.join(root, file)
@@ -438,7 +511,9 @@ def scan_repository(
438511

439512
package_name = ""
440513
path_parts = rel_file_path.split(os.sep)
441-
if len(path_parts) >= 2 and path_parts[0] == "packages":
514+
# Assume package name is the folder directly under standard package root directories
515+
package_roots = {"packages", "generated", "handwritten", "third_party"}
516+
if len(path_parts) >= 2 and path_parts[0] in package_roots:
442517
package_name = path_parts[1]
443518

444519
root_parts = os.path.abspath(root_path).split(os.sep)
@@ -531,7 +606,11 @@ def main():
531606
# Resolve target packages if filtering is requested
532607
target_packages = []
533608
if args.package:
534-
target_packages.append(os.path.join("packages", args.package))
609+
# If the folder exists under root path as-is, use it. Otherwise fallback to packages/ prefix.
610+
if os.path.exists(os.path.join(args.path, args.package)):
611+
target_packages.append(args.package)
612+
else:
613+
target_packages.append(os.path.join("packages", args.package))
535614
elif args.package_file:
536615
target_packages = read_package_file(args.package_file)
537616

0 commit comments

Comments
 (0)