-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcheck_duplicate_packages.py
More file actions
112 lines (91 loc) · 3.69 KB
/
check_duplicate_packages.py
File metadata and controls
112 lines (91 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import collections
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import DefaultDict, Iterable, List, Optional
try: # Optional dependency; fall back to lightweight parser if unavailable.
import yaml # type: ignore
except ModuleNotFoundError: # pragma: no cover - exercised only without PyYAML
yaml = None # type: ignore
@dataclass(frozen=True)
class PackageOccurrence:
    """One appearance of a package ID inside a catalog file.

    The dataclass is frozen, so instances are immutable, hashable and
    compared by value.
    """

    # Package identifier as read from the catalog entry.
    package_id: str
    # Catalog file in which the identifier was found.
    file_path: Path
    # Position of the entry among the IDs read from that file; the
    # collector in this file numbers entries starting at 1.
    index: int
def _parse_inline_scalar(raw: str) -> str:
    """Return the text of an inline YAML scalar without quotes or comments.

    Only the simple forms needed by the fallback parser are understood: a
    value wrapped in matching single or double quotes, or a plain value
    optionally followed by a ``# comment``.
    """
    value = raw.strip()
    # Tuple membership (not substring membership) so an empty value is not
    # mistaken for a quoted one.
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing != -1:
            # Quoted scalar: anything after the closing quote (for example
            # a trailing comment) is not part of the ID.
            return value[1:closing].strip()
    for position, char in enumerate(value):
        # A "#" starts a comment only at the beginning of the value or
        # after whitespace; "a#b" is an ordinary plain scalar.
        if char == "#" and (position == 0 or value[position - 1].isspace()):
            value = value[:position]
            break
    # Unbalanced quotes are still trimmed, as the parser always did.
    return value.strip().strip("\"'").strip()


def read_package_ids(path: Path) -> List[str]:
    """Return the non-empty package IDs listed in the catalog at *path*.

    When PyYAML is importable the file is parsed with ``yaml.safe_load`` and
    the ``id`` of every mapping under the top-level ``packages`` list is
    collected. A document that is empty, is not a mapping, or whose
    ``packages`` value is missing, null or not a list yields an empty result
    instead of raising.

    Without PyYAML a line-oriented fallback is used: every line of the form
    ``- id: <value>`` counts as a package entry, whatever its nesting level.
    Surrounding quotes and trailing ``# comments`` are removed so both code
    paths report the same IDs for the same file.

    Args:
        path: Catalog file to read (UTF-8).

    Returns:
        The IDs in file order, stripped of surrounding whitespace; entries
        whose ID is missing or blank are skipped.
    """
    if yaml is not None:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
        packages = data.get("packages") if isinstance(data, dict) else None
        if not isinstance(packages, list):
            return []
        parsed: List[str] = []
        for pkg in packages:
            if not isinstance(pkg, dict) or not pkg.get("id"):
                continue
            candidate = str(pkg["id"]).strip()
            # A whitespace-only ID is truthy but empty once stripped.
            if candidate:
                parsed.append(candidate)
        return parsed

    ids: List[str] = []
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            # Split into the list marker and the rest, tolerating any amount
            # of whitespace after the dash.
            parts = line.split(None, 1)
            if len(parts) != 2 or parts[0] != "-":
                continue
            entry = parts[1]
            if not entry.startswith("id:"):
                continue
            candidate = _parse_inline_scalar(entry[len("id:"):])
            if candidate:
                ids.append(candidate)
    return ids
def collect_duplicates(
    package_files: Iterable[Path]
) -> DefaultDict[str, List[PackageOccurrence]]:
    """Group package IDs that occur more than once across *package_files*.

    Each file is read with ``read_package_ids``; every ID is recorded with
    the file it came from and its 1-based position in that file's ID list.

    Args:
        package_files: Catalog files to scan, in the order they should be
            reported.

    Returns:
        A ``defaultdict(list)`` mapping each ID seen at least twice (within
        one file or across files) to all of its occurrences in scan order.
        IDs seen only once are omitted.
    """
    seen: DefaultDict[
        str, List[PackageOccurrence]] = collections.defaultdict(list)
    for catalog in package_files:
        position = 0
        for package_id in read_package_ids(catalog):
            position += 1
            seen[package_id].append(
                PackageOccurrence(package_id=package_id,
                                  file_path=catalog,
                                  index=position))

    repeated: DefaultDict[
        str, List[PackageOccurrence]] = collections.defaultdict(list)
    for package_id, hits in seen.items():
        if len(hits) >= 2:
            repeated[package_id] = hits
    return repeated
def iter_package_files(root: Path, glob: str) -> Iterable[Path]:
    """Return the catalog files under ``<root>/data/catalog/packages``.

    Args:
        root: Repository root containing the ``data/catalog/packages`` tree.
        glob: Pattern matched against entries of the package directory.

    Returns:
        The matching paths in sorted order.

    Raises:
        FileNotFoundError: If the package directory does not exist or is not
            a directory.
    """
    package_dir = root.joinpath("data", "catalog", "packages")
    if package_dir.is_dir():
        matches = list(package_dir.glob(glob))
        matches.sort()
        return matches
    raise FileNotFoundError(f"Package directory not found: {package_dir}")
def format_report(
        duplicates: DefaultDict[str, List[PackageOccurrence]]) -> str:
    """Render the duplicate occurrences as a plain-text report.

    IDs are listed in sorted order, each followed by one line per
    occurrence. Absolute file paths are shown relative to the current
    working directory when they lie inside it; paths outside the working
    directory are shown unchanged rather than raising ``ValueError``.
    Relative paths are always shown as given.

    Args:
        duplicates: Mapping of package ID to its occurrences.

    Returns:
        The report lines joined with newlines; an empty string when
        *duplicates* is empty.
    """
    cwd = Path.cwd()
    lines: List[str] = []
    for package_id in sorted(duplicates):
        occurrences = duplicates[package_id]
        lines.append(f"{package_id} ({len(occurrences)}x)")
        for occ in occurrences:
            shown = occ.file_path
            if shown.is_absolute():
                try:
                    shown = shown.relative_to(cwd)
                except ValueError:
                    # Not located under the working directory (e.g. the
                    # script was started from elsewhere): keep the
                    # absolute path.
                    pass
            lines.append(f" - {shown} [entry #{occ.index}]")
    return "\n".join(lines)
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the duplicate package ID check.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``1`` when no catalog files match, ``0`` when no duplicates are
        found, and otherwise ``1`` if ``--strict`` was given, else ``0``.

    Raises:
        FileNotFoundError: Propagated from ``iter_package_files`` when the
            package directory is missing under the chosen root.
    """
    # The default root is the parent of the directory holding this script.
    default_root = Path(__file__).resolve().parents[1]

    cli = argparse.ArgumentParser(
        description="Detect duplicate package entries across YAML catalogs.")
    cli.add_argument(
        "--root",
        type=Path,
        default=default_root,
        help="Repository root (defaults to project root)",
    )
    cli.add_argument(
        "--glob",
        default="*.yml",
        help="Glob pattern for package catalog files",
    )
    cli.add_argument(
        "--strict",
        action="store_true",
        help="Exit with non-zero status when duplicates are found",
    )
    options = cli.parse_args(argv)

    catalogs = list(iter_package_files(options.root, options.glob))
    if len(catalogs) == 0:
        print("No package catalog files found.", file=sys.stderr)
        return 1

    repeated = collect_duplicates(catalogs)
    if repeated:
        print("Duplicate package IDs detected:\n")
        print(format_report(repeated))
        # Duplicates only fail the run when the caller asked for it.
        if options.strict:
            return 1
        return 0

    print("No duplicate package IDs detected.")
    return 0
# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())