|
| 1 | +#!/usr/bin/env python |
| 2 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 3 | +# or more contributor license agreements. See the NOTICE file |
| 4 | +# distributed with this work for additional information |
| 5 | +# regarding copyright ownership. The ASF licenses this file |
| 6 | +# to you under the Apache License, Version 2.0 (the |
| 7 | +# "License"); you may not use this file except in compliance |
| 8 | +# with the License. You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, |
| 13 | +# software distributed under the License is distributed on an |
| 14 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | +# KIND, either express or implied. See the License for the |
| 16 | +# specific language governing permissions and limitations |
| 17 | +# under the License. |
| 18 | +# /// script |
| 19 | +# requires-python = ">=3.10" |
| 20 | +# dependencies = [ |
| 21 | +# "rich>=13.0.0", |
| 22 | +# ] |
| 23 | +# /// |
| 24 | +"""Check that no new ``raise AirflowException`` usages are introduced. |
| 25 | +
|
| 26 | +All *existing* usages are recorded in ``known_airflow_exceptions.txt`` next to |
| 27 | +this script as ``relative/path::N`` entries (one per file), where ``N`` is the |
| 28 | +maximum number of ``raise AirflowException`` occurrences allowed in that file. |
| 29 | +A file whose current count exceeds the recorded limit is treated as a violation |
| 30 | +– use a dedicated exception class instead. |
| 31 | +
|
| 32 | +Modes |
| 33 | +----- |
| 34 | +Default (files passed by prek/pre-commit): |
| 35 | + Check only the supplied files; fail if any file's count exceeds the limit. |
| 36 | + When a file's count has *decreased*, the allowlist entry is tightened |
| 37 | + automatically and the hook exits with a non-zero code so that pre-commit |
| 38 | + reports the modified allowlist — just stage |
| 39 | + ``scripts/ci/prek/known_airflow_exceptions.txt`` and re-run. |
| 40 | +
|
| 41 | +``--all-files``: |
| 42 | + Walk the whole repository and check every ``.py`` file. |
| 43 | +
|
| 44 | +``--cleanup``: |
| 45 | + Remove entries for files that no longer exist. Safe to run at any time; |
| 46 | + does not add new entries or raise limits. |
| 47 | +
|
| 48 | +``--generate``: |
| 49 | + Scan the whole repository and *rebuild* the allowlist from scratch. |
| 50 | + Intended for the initial setup or after a large-scale clean-up sprint. |
| 51 | +""" |
| 52 | + |
| 53 | +from __future__ import annotations |
| 54 | + |
| 55 | +import argparse |
| 56 | +import re |
| 57 | +from pathlib import Path |
| 58 | + |
| 59 | +from rich.console import Console |
| 60 | +from rich.panel import Panel |
| 61 | + |
| 62 | +console = Console() |
| 63 | + |
| 64 | +REPO_ROOT = Path(__file__).parents[3] |
| 65 | + |
| 66 | +# Match lines that actually raise AirflowException. Comment filtering is done |
| 67 | +# in _raise_lines() by skipping lines whose stripped form starts with "#". |
| 68 | +_RAISE_RE = re.compile(r"raise\s+AirflowException\b") |
| 69 | + |
| 70 | + |
| 71 | +class AllowlistManager: |
| 72 | + def __init__(self, allowlist_file: Path) -> None: |
| 73 | + self.allowlist_file = allowlist_file |
| 74 | + |
| 75 | + def load(self) -> dict[str, int]: |
| 76 | + """Return mapping of ``relative_path -> allowed_count``.""" |
| 77 | + if not self.allowlist_file.exists(): |
| 78 | + return {} |
| 79 | + |
| 80 | + result: dict[str, int] = {} |
| 81 | + for raw_line in self.allowlist_file.read_text().splitlines(): |
| 82 | + if not (stripped := raw_line.strip()): |
| 83 | + continue |
| 84 | + |
| 85 | + rel_str, _, count_str = stripped.rpartition("::") |
| 86 | + if not rel_str or not count_str: |
| 87 | + continue |
| 88 | + |
| 89 | + try: |
| 90 | + result[rel_str] = int(count_str) |
| 91 | + except ValueError: |
| 92 | + continue |
| 93 | + |
| 94 | + return result |
| 95 | + |
| 96 | + def save(self, counts: dict[str, int]) -> None: |
| 97 | + lines = [f"{rel}::{count}" for rel, count in sorted(counts.items())] |
| 98 | + self.allowlist_file.write_text("\n".join(lines) + "\n") |
| 99 | + |
| 100 | + def generate(self) -> int: |
| 101 | + console.print(f"Scanning [cyan]{REPO_ROOT}[/cyan] for raise AirflowException …") |
| 102 | + counts: dict[str, int] = {} |
| 103 | + for path in _iter_python_files(): |
| 104 | + n = len(_raise_lines(path)) |
| 105 | + if n > 0: |
| 106 | + counts[str(path.relative_to(REPO_ROOT))] = n |
| 107 | + |
| 108 | + self.save(counts) |
| 109 | + total = sum(counts.values()) |
| 110 | + console.print( |
| 111 | + f"[green]✓ Generated[/green] [cyan]{self.allowlist_file.relative_to(REPO_ROOT)}[/cyan] " |
| 112 | + f"with [bold]{len(counts)}[/bold] files / [bold]{total}[/bold] occurrences." |
| 113 | + ) |
| 114 | + return 0 |
| 115 | + |
| 116 | + def cleanup(self) -> int: |
| 117 | + allowlist = self.load() |
| 118 | + if not allowlist: |
| 119 | + console.print("[yellow]Allowlist is empty – nothing to clean up.[/yellow]") |
| 120 | + return 0 |
| 121 | + |
| 122 | + stale: list[str] = [rel for rel in allowlist if not (REPO_ROOT / rel).exists()] |
| 123 | + if stale: |
| 124 | + console.print( |
| 125 | + f"[yellow]Removing {len(stale)} stale entr{'y' if len(stale) == 1 else 'ies'}:[/yellow]" |
| 126 | + ) |
| 127 | + for s in sorted(stale): |
| 128 | + console.print(f" [dim]-[/dim] {s}") |
| 129 | + for s in stale: |
| 130 | + del allowlist[s] |
| 131 | + self.save(allowlist) |
| 132 | + console.print( |
| 133 | + f"\n[green]Updated[/green] [cyan]{self.allowlist_file.relative_to(REPO_ROOT)}[/cyan]" |
| 134 | + ) |
| 135 | + else: |
| 136 | + console.print("[green]✓ No stale entries found.[/green]") |
| 137 | + return 0 |
| 138 | + |
| 139 | + |
| 140 | +def _raise_lines(path: Path) -> list[str]: |
| 141 | + """Return stripped raise-lines from *path* that match the pattern.""" |
| 142 | + try: |
| 143 | + text = path.read_text(encoding="utf-8", errors="replace") |
| 144 | + except OSError: |
| 145 | + return [] |
| 146 | + |
| 147 | + result = [] |
| 148 | + for raw_line in text.splitlines(): |
| 149 | + stripped = raw_line.strip() |
| 150 | + # Skip comment lines |
| 151 | + if stripped.startswith("#"): |
| 152 | + continue |
| 153 | + if _RAISE_RE.search(raw_line): |
| 154 | + result.append(stripped) |
| 155 | + return result |
| 156 | + |
| 157 | + |
| 158 | +def _iter_python_files() -> list[Path]: |
| 159 | + return [ |
| 160 | + p.resolve() for p in REPO_ROOT.rglob("*.py") if ".tox" not in p.parts and "__pycache__" not in p.parts |
| 161 | + ] |
| 162 | + |
| 163 | + |
| 164 | +def _check_airflow_exception_usage( |
| 165 | + files: list[Path], allowlist: dict[str, int], manager: AllowlistManager |
| 166 | +) -> int: |
| 167 | + violations: list[tuple[Path, int, int]] = [] |
| 168 | + tightened: list[tuple[str, int, int]] = [] # (rel, old_count, new_count) |
| 169 | + |
| 170 | + for path in files: |
| 171 | + if not path.exists() or path.suffix != ".py": |
| 172 | + continue |
| 173 | + actual = len(_raise_lines(path)) |
| 174 | + rel = str(path.relative_to(REPO_ROOT)) |
| 175 | + allowed = allowlist.get(rel, 0) |
| 176 | + if actual > allowed: |
| 177 | + violations.append((path, actual, allowed)) |
| 178 | + elif actual < allowed: |
| 179 | + # Usage was reduced — tighten the allowlist entry so it can't creep back up. |
| 180 | + if actual == 0: |
| 181 | + del allowlist[rel] |
| 182 | + else: |
| 183 | + allowlist[rel] = actual |
| 184 | + tightened.append((rel, allowed, actual)) |
| 185 | + |
| 186 | + if tightened: |
| 187 | + manager.save(allowlist) |
| 188 | + console.print( |
| 189 | + f"[green]✓ Tightened {len(tightened)} entr{'y' if len(tightened) == 1 else 'ies'} " |
| 190 | + f"in [cyan]{manager.allowlist_file.relative_to(REPO_ROOT)}[/cyan][/green] " |
| 191 | + "(stage the updated file):" |
| 192 | + ) |
| 193 | + for rel, old, new in tightened: |
| 194 | + console.print(f" [cyan]{rel}[/cyan] {old} → {new}") |
| 195 | + |
| 196 | + if violations: |
| 197 | + console.print( |
| 198 | + Panel.fit( |
| 199 | + "New [bold]raise AirflowException[/bold] usage detected.\n" |
| 200 | + "Define a dedicated exception class or use an existing specific exception.\n" |
| 201 | + "If this usage is intentional and pre-existing, run:\n\n" |
| 202 | + " [cyan]uv run ./scripts/ci/prek/check_new_airflow_exception_usage.py --generate[/cyan]\n\n" |
| 203 | + "to regenerate the allowlist, then commit the updated\n" |
| 204 | + "[cyan]scripts/ci/prek/known_airflow_exceptions.txt[/cyan].", |
| 205 | + title="[red]❌ Check failed[/red]", |
| 206 | + border_style="red", |
| 207 | + ) |
| 208 | + ) |
| 209 | + for path, actual, allowed in violations: |
| 210 | + console.print(f" [cyan]{path.relative_to(REPO_ROOT)}[/cyan] count={actual} (allowed={allowed})") |
| 211 | + return 1 |
| 212 | + |
| 213 | + # Return 1 when the allowlist was tightened so pre-commit reports the file as modified |
| 214 | + # and prompts the user to stage the updated allowlist. |
| 215 | + return 1 if tightened else 0 |
| 216 | + |
| 217 | + |
| 218 | +def main(argv: list[str] | None = None) -> int: |
| 219 | + parser = argparse.ArgumentParser( |
| 220 | + description="Prevent new `raise AirflowException` usages.", |
| 221 | + formatter_class=argparse.RawDescriptionHelpFormatter, |
| 222 | + epilog=__doc__, |
| 223 | + ) |
| 224 | + parser.add_argument("files", nargs="*", metavar="FILE", help="Files to check (provided by prek)") |
| 225 | + parser.add_argument( |
| 226 | + "--all-files", |
| 227 | + action="store_true", |
| 228 | + help="Check every Python file in the repository", |
| 229 | + ) |
| 230 | + parser.add_argument( |
| 231 | + "--cleanup", |
| 232 | + action="store_true", |
| 233 | + help="Remove stale entries from the allowlist and exit", |
| 234 | + ) |
| 235 | + parser.add_argument( |
| 236 | + "--generate", |
| 237 | + action="store_true", |
| 238 | + help="Regenerate the allowlist from the current codebase and exit", |
| 239 | + ) |
| 240 | + args = parser.parse_args(argv) |
| 241 | + |
| 242 | + manager = AllowlistManager(Path(__file__).parent / "known_airflow_exceptions.txt") |
| 243 | + |
| 244 | + if args.generate: |
| 245 | + return manager.generate() |
| 246 | + |
| 247 | + if args.cleanup: |
| 248 | + return manager.cleanup() |
| 249 | + |
| 250 | + allowlist = manager.load() |
| 251 | + |
| 252 | + if args.all_files: |
| 253 | + return _check_airflow_exception_usage(_iter_python_files(), allowlist, manager) |
| 254 | + |
| 255 | + if not args.files: |
| 256 | + console.print( |
| 257 | + "[yellow]No files provided. Pass filenames or use --all-files to scan the whole repo.[/yellow]" |
| 258 | + ) |
| 259 | + return 0 |
| 260 | + |
| 261 | + return _check_airflow_exception_usage([Path(f).resolve() for f in args.files], allowlist, manager) |
| 262 | + |
| 263 | + |
| 264 | +if __name__ == "__main__": |
| 265 | + raise SystemExit(main()) |
0 commit comments