Skip to content

Commit 6f56aa1

Browse files
authored
build(prek-hook): Check whether new "raise AirflowException" is added (#55416)
* build(prek-hook): Check whether new "raise AirflowException" is added * feat: add white list * refactor: simplify the record to use count instead of raw content * fixup! refactor: simplify the record to use count instead of raw content * refactor: filter python files that matters * fixup! refactor: filter python files that matters
1 parent 645413b commit 6f56aa1

3 files changed

Lines changed: 704 additions & 0 deletions

File tree

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,12 @@ repos:
977977
language: python
978978
pass_filenames: true
979979
files: \.py$
980+
- id: check-no-new-airflow-exceptions
981+
name: Check that no new raise AirflowException usages are added
982+
entry: ./scripts/ci/prek/check_new_airflow_exception_usage.py
983+
language: python
984+
pass_filenames: true
985+
files: ^(airflow-core|airflow-ctl|task-sdk|providers|shared)/.*\.py$
980986
- id: bandit
981987
name: bandit
982988
description: "Bandit is a tool for finding common security issues in Python code"
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#!/usr/bin/env python
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
# /// script
19+
# requires-python = ">=3.10"
20+
# dependencies = [
21+
# "rich>=13.0.0",
22+
# ]
23+
# ///
24+
"""Check that no new ``raise AirflowException`` usages are introduced.
25+
26+
All *existing* usages are recorded in ``known_airflow_exceptions.txt`` next to
27+
this script as ``relative/path::N`` entries (one per file), where ``N`` is the
28+
maximum number of ``raise AirflowException`` occurrences allowed in that file.
29+
A file whose current count exceeds the recorded limit is treated as a violation
30+
– use a dedicated exception class instead.
31+
32+
Modes
33+
-----
34+
Default (files passed by prek/pre-commit):
35+
Check only the supplied files; fail if any file's count exceeds the limit.
36+
When a file's count has *decreased*, the allowlist entry is tightened
37+
automatically and the hook exits with a non-zero code so that pre-commit
38+
reports the modified allowlist — just stage
39+
``scripts/ci/prek/known_airflow_exceptions.txt`` and re-run.
40+
41+
``--all-files``:
42+
Walk the whole repository and check every ``.py`` file.
43+
44+
``--cleanup``:
45+
Remove entries for files that no longer exist. Safe to run at any time;
46+
does not add new entries or raise limits.
47+
48+
``--generate``:
49+
Scan the whole repository and *rebuild* the allowlist from scratch.
50+
Intended for the initial setup or after a large-scale clean-up sprint.
51+
"""
52+
53+
from __future__ import annotations
54+
55+
import argparse
56+
import re
57+
from pathlib import Path
58+
59+
from rich.console import Console
60+
from rich.panel import Panel
61+
62+
console = Console()
63+
64+
REPO_ROOT = Path(__file__).parents[3]
65+
66+
# Match lines that actually raise AirflowException. Comment filtering is done
67+
# in _raise_lines() by skipping lines whose stripped form starts with "#".
68+
_RAISE_RE = re.compile(r"raise\s+AirflowException\b")
69+
70+
71+
class AllowlistManager:
72+
def __init__(self, allowlist_file: Path) -> None:
73+
self.allowlist_file = allowlist_file
74+
75+
def load(self) -> dict[str, int]:
76+
"""Return mapping of ``relative_path -> allowed_count``."""
77+
if not self.allowlist_file.exists():
78+
return {}
79+
80+
result: dict[str, int] = {}
81+
for raw_line in self.allowlist_file.read_text().splitlines():
82+
if not (stripped := raw_line.strip()):
83+
continue
84+
85+
rel_str, _, count_str = stripped.rpartition("::")
86+
if not rel_str or not count_str:
87+
continue
88+
89+
try:
90+
result[rel_str] = int(count_str)
91+
except ValueError:
92+
continue
93+
94+
return result
95+
96+
def save(self, counts: dict[str, int]) -> None:
97+
lines = [f"{rel}::{count}" for rel, count in sorted(counts.items())]
98+
self.allowlist_file.write_text("\n".join(lines) + "\n")
99+
100+
def generate(self) -> int:
101+
console.print(f"Scanning [cyan]{REPO_ROOT}[/cyan] for raise AirflowException …")
102+
counts: dict[str, int] = {}
103+
for path in _iter_python_files():
104+
n = len(_raise_lines(path))
105+
if n > 0:
106+
counts[str(path.relative_to(REPO_ROOT))] = n
107+
108+
self.save(counts)
109+
total = sum(counts.values())
110+
console.print(
111+
f"[green]✓ Generated[/green] [cyan]{self.allowlist_file.relative_to(REPO_ROOT)}[/cyan] "
112+
f"with [bold]{len(counts)}[/bold] files / [bold]{total}[/bold] occurrences."
113+
)
114+
return 0
115+
116+
def cleanup(self) -> int:
117+
allowlist = self.load()
118+
if not allowlist:
119+
console.print("[yellow]Allowlist is empty – nothing to clean up.[/yellow]")
120+
return 0
121+
122+
stale: list[str] = [rel for rel in allowlist if not (REPO_ROOT / rel).exists()]
123+
if stale:
124+
console.print(
125+
f"[yellow]Removing {len(stale)} stale entr{'y' if len(stale) == 1 else 'ies'}:[/yellow]"
126+
)
127+
for s in sorted(stale):
128+
console.print(f" [dim]-[/dim] {s}")
129+
for s in stale:
130+
del allowlist[s]
131+
self.save(allowlist)
132+
console.print(
133+
f"\n[green]Updated[/green] [cyan]{self.allowlist_file.relative_to(REPO_ROOT)}[/cyan]"
134+
)
135+
else:
136+
console.print("[green]✓ No stale entries found.[/green]")
137+
return 0
138+
139+
140+
def _raise_lines(path: Path) -> list[str]:
141+
"""Return stripped raise-lines from *path* that match the pattern."""
142+
try:
143+
text = path.read_text(encoding="utf-8", errors="replace")
144+
except OSError:
145+
return []
146+
147+
result = []
148+
for raw_line in text.splitlines():
149+
stripped = raw_line.strip()
150+
# Skip comment lines
151+
if stripped.startswith("#"):
152+
continue
153+
if _RAISE_RE.search(raw_line):
154+
result.append(stripped)
155+
return result
156+
157+
158+
def _iter_python_files() -> list[Path]:
159+
return [
160+
p.resolve() for p in REPO_ROOT.rglob("*.py") if ".tox" not in p.parts and "__pycache__" not in p.parts
161+
]
162+
163+
164+
def _check_airflow_exception_usage(
165+
files: list[Path], allowlist: dict[str, int], manager: AllowlistManager
166+
) -> int:
167+
violations: list[tuple[Path, int, int]] = []
168+
tightened: list[tuple[str, int, int]] = [] # (rel, old_count, new_count)
169+
170+
for path in files:
171+
if not path.exists() or path.suffix != ".py":
172+
continue
173+
actual = len(_raise_lines(path))
174+
rel = str(path.relative_to(REPO_ROOT))
175+
allowed = allowlist.get(rel, 0)
176+
if actual > allowed:
177+
violations.append((path, actual, allowed))
178+
elif actual < allowed:
179+
# Usage was reduced — tighten the allowlist entry so it can't creep back up.
180+
if actual == 0:
181+
del allowlist[rel]
182+
else:
183+
allowlist[rel] = actual
184+
tightened.append((rel, allowed, actual))
185+
186+
if tightened:
187+
manager.save(allowlist)
188+
console.print(
189+
f"[green]✓ Tightened {len(tightened)} entr{'y' if len(tightened) == 1 else 'ies'} "
190+
f"in [cyan]{manager.allowlist_file.relative_to(REPO_ROOT)}[/cyan][/green] "
191+
"(stage the updated file):"
192+
)
193+
for rel, old, new in tightened:
194+
console.print(f" [cyan]{rel}[/cyan] {old}{new}")
195+
196+
if violations:
197+
console.print(
198+
Panel.fit(
199+
"New [bold]raise AirflowException[/bold] usage detected.\n"
200+
"Define a dedicated exception class or use an existing specific exception.\n"
201+
"If this usage is intentional and pre-existing, run:\n\n"
202+
" [cyan]uv run ./scripts/ci/prek/check_new_airflow_exception_usage.py --generate[/cyan]\n\n"
203+
"to regenerate the allowlist, then commit the updated\n"
204+
"[cyan]scripts/ci/prek/known_airflow_exceptions.txt[/cyan].",
205+
title="[red]❌ Check failed[/red]",
206+
border_style="red",
207+
)
208+
)
209+
for path, actual, allowed in violations:
210+
console.print(f" [cyan]{path.relative_to(REPO_ROOT)}[/cyan] count={actual} (allowed={allowed})")
211+
return 1
212+
213+
# Return 1 when the allowlist was tightened so pre-commit reports the file as modified
214+
# and prompts the user to stage the updated allowlist.
215+
return 1 if tightened else 0
216+
217+
218+
def main(argv: list[str] | None = None) -> int:
219+
parser = argparse.ArgumentParser(
220+
description="Prevent new `raise AirflowException` usages.",
221+
formatter_class=argparse.RawDescriptionHelpFormatter,
222+
epilog=__doc__,
223+
)
224+
parser.add_argument("files", nargs="*", metavar="FILE", help="Files to check (provided by prek)")
225+
parser.add_argument(
226+
"--all-files",
227+
action="store_true",
228+
help="Check every Python file in the repository",
229+
)
230+
parser.add_argument(
231+
"--cleanup",
232+
action="store_true",
233+
help="Remove stale entries from the allowlist and exit",
234+
)
235+
parser.add_argument(
236+
"--generate",
237+
action="store_true",
238+
help="Regenerate the allowlist from the current codebase and exit",
239+
)
240+
args = parser.parse_args(argv)
241+
242+
manager = AllowlistManager(Path(__file__).parent / "known_airflow_exceptions.txt")
243+
244+
if args.generate:
245+
return manager.generate()
246+
247+
if args.cleanup:
248+
return manager.cleanup()
249+
250+
allowlist = manager.load()
251+
252+
if args.all_files:
253+
return _check_airflow_exception_usage(_iter_python_files(), allowlist, manager)
254+
255+
if not args.files:
256+
console.print(
257+
"[yellow]No files provided. Pass filenames or use --all-files to scan the whole repo.[/yellow]"
258+
)
259+
return 0
260+
261+
return _check_airflow_exception_usage([Path(f).resolve() for f in args.files], allowlist, manager)
262+
263+
264+
if __name__ == "__main__":
265+
raise SystemExit(main())

0 commit comments

Comments
 (0)