|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# -------------------------------------------------------------------------------------------- |
| 4 | +# Copyright (c) Microsoft Corporation. All rights reserved. |
| 5 | +# Licensed under the MIT License. See License.txt in the project root for license information. |
| 6 | +# -------------------------------------------------------------------------------------------- |
| 7 | + |
| 8 | +"""Fail CI if forbidden raw GitHub URL is introduced in new diff lines.""" |
| 9 | + |
| 10 | +import argparse |
| 11 | +import fnmatch |
| 12 | +import json |
| 13 | +import re |
| 14 | +import subprocess |
| 15 | +import sys |
| 16 | +from pathlib import Path |
| 17 | + |
| 18 | + |
# Matches a raw.githubusercontent.com URL up to the first whitespace, quote,
# backtick, comma or closing bracket.  URI schemes and host names are
# case-insensitive, so the match ignores case; otherwise a spelling such as
# "HTTPS://Raw.GitHubUserContent.com/..." would slip past the check.
GITHUB_URL_PATTERN = re.compile(
    r"https?://raw\.githubusercontent\.com/[^\s\"'`,)}\]]*",
    re.IGNORECASE,
)
# Inline opt-out marker: "# external-url-exempt: <reason>" with a non-empty reason.
INLINE_SUPPRESSION_PATTERN = re.compile(
    r"#\s*external-url-exempt:\s*\S"
)
# A plain "<name>.<ext>" file name: letters, digits, "_" or "-", then a dot and
# an alphanumeric extension of 1 to 10 characters.
_FILENAME_PATTERN = re.compile(r"^[A-Za-z0-9_\-]+\.[A-Za-z0-9]{1,10}$")
# Base URL shown in the remediation message as the preferred hosting location.
RECOMMENDED_INTERNAL_URL = "https://azcliprod.blob.core.windows.net/cli"
# The scope configuration file lives next to this script.
SCOPE_CONFIG_PATH = Path(__file__).with_name("external_url_exclusions.json")

# Cache for the scope configuration read from external_url_exclusions.json:
# None until the first load, afterwards an (include, exclude) tuple of
# glob-pattern lists.
_SCOPE_CONFIG = None
| 32 | + |
| 33 | + |
def _normalize_patterns(scope: dict, key: str) -> list:
    """Return the glob patterns stored under *key* in *scope*, using "/" separators.

    A missing key yields an empty list and a bare string is treated as a
    one-element list.

    Raises:
        RuntimeError: If the value is neither a string nor a list of strings.
    """
    patterns = scope.get(key, [])
    if isinstance(patterns, str):
        patterns = [patterns]

    if not isinstance(patterns, list) or not all(isinstance(p, str) for p in patterns):
        raise RuntimeError(
            f"Invalid scope configuration in '{SCOPE_CONFIG_PATH}': '{key}' must be a string or array of strings"
        )

    # Patterns are written with either separator; they are matched against "/" paths.
    return [p.replace("\\", "/") for p in patterns]


def _load_scope_config():
    """Load scope configuration (include/exclude patterns) from the JSON file.

    The file must contain a JSON object.  Its optional "scope" member must be
    an object whose optional "include" and "exclude" members are each a string
    or an array of strings.

    Returns:
        A ``(include_patterns, exclude_patterns)`` tuple of lists of glob
        patterns with backslashes replaced by forward slashes.

    Raises:
        RuntimeError: If the file cannot be read or parsed, or if its content
            does not have the shape described above.
    """
    try:
        with SCOPE_CONFIG_PATH.open(encoding="utf-8") as input_file:
            config = json.load(input_file)
    except (OSError, ValueError) as ex:
        raise RuntimeError(f"Unable to load scope config from '{SCOPE_CONFIG_PATH}': {ex}") from ex

    if not isinstance(config, dict):
        raise RuntimeError(
            f"Invalid scope configuration in '{SCOPE_CONFIG_PATH}': expected a JSON object"
        )

    scope = config.get("scope", {})
    if not isinstance(scope, dict):
        raise RuntimeError(
            f"Invalid scope configuration in '{SCOPE_CONFIG_PATH}': 'scope' must be a JSON object"
        )

    # "include" is checked first so its error is reported before any "exclude" error.
    include = _normalize_patterns(scope, "include")
    exclude = _normalize_patterns(scope, "exclude")
    return include, exclude
| 74 | + |
| 75 | + |
def _get_scope_config():
    """Return the (include_patterns, exclude_patterns) tuple, loading it on first use.

    The result of ``_load_scope_config()`` is stored in the module-level
    ``_SCOPE_CONFIG`` so the JSON file is read at most once per process.
    """
    global _SCOPE_CONFIG  # pylint: disable=global-statement

    if _SCOPE_CONFIG is not None:
        return _SCOPE_CONFIG

    _SCOPE_CONFIG = _load_scope_config()
    return _SCOPE_CONFIG
| 84 | + |
| 85 | + |
def _matches_any(file_path: str, patterns: list) -> bool:
    """Return True if *file_path* matches any of the given glob patterns.

    Matching is case-sensitive on every platform.  ``fnmatch.fnmatch`` would
    first pass both arguments through ``os.path.normcase``, which makes the
    outcome depend on the operating system running the check.

    Args:
        file_path: Path to test, using "/" separators.
        patterns: Glob patterns in ``fnmatch`` syntax; ``*`` also matches "/".

    Returns:
        True when at least one pattern matches, False otherwise (including
        when *patterns* is empty).
    """
    return any(fnmatch.fnmatchcase(file_path, pattern) for pattern in patterns)
| 89 | + |
| 90 | + |
| 91 | + |
def _extract_filename_from_url(line: str) -> str:
    """Return the file name of the first GitHub URL in *line*.

    The last path segment of the URL (trailing slashes ignored) is returned
    when it looks like a plain file name such as ``map.json``.  In every other
    case, including when *line* holds no GitHub URL, the placeholder
    ``"xxx.xxx"`` is returned.
    """
    placeholder = "xxx.xxx"

    found = GITHUB_URL_PATTERN.search(line)
    if found is None:
        return placeholder

    trimmed = found.group(0).rstrip("/")
    _, slash, candidate = trimmed.rpartition("/")
    if not slash:
        candidate = ""

    return candidate if _FILENAME_PATTERN.match(candidate) else placeholder
| 105 | + |
| 106 | + |
def _should_flag(file_path: str) -> bool:
    """Tell whether *file_path* is in scope for the forbidden-URL check.

    A path is in scope when the include list is empty (meaning the entire
    codebase) or the path matches at least one include pattern, and the path
    matches none of the exclude patterns.
    """
    include_patterns, exclude_patterns = _get_scope_config()

    # A non-empty include list restricts the check to matching paths.
    if include_patterns and not _matches_any(file_path, include_patterns):
        return False

    return not _matches_any(file_path, exclude_patterns)
| 118 | + |
| 119 | + |
def _run_diff(src: str, tgt: str, cached: bool = False) -> str:
    """Run ``git diff`` and return its output as text.

    Args:
        src: Source ref/commit; used only when *cached* is False.
        tgt: Target ref/commit; used only when *cached* is False.
        cached: When True, diff the staged changes (``--cached``) instead of
            the ``tgt...src`` range.

    Returns:
        The diff text, produced with zero context lines and no colour codes.

    Raises:
        RuntimeError: If git exits with a non-zero status; the message is
            git's stderr, or "git diff failed" when stderr is empty.
    """
    cmd = [
        "git",
        "diff",
        "--unified=0",
        "--no-color",
        # Pin the header prefixes: the parser in this file looks for "+++ b/",
        # while diff.noprefix / diff.mnemonicPrefix in a user's git config
        # would otherwise change or drop them.
        "--src-prefix=a/",
        "--dst-prefix=b/",
    ]
    if cached:
        cmd.append("--cached")
    else:
        cmd.append(f"{tgt}...{src}")

    proc = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Decode explicitly instead of relying on the locale encoding, and
        # replace undecodable bytes, so that non-ASCII or binary-ish diff
        # content cannot abort the check with a UnicodeDecodeError.
        encoding="utf-8",
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip() or "git diff failed")
    return proc.stdout
| 137 | + |
| 138 | + |
def _find_violations(diff_text: str):
    """Collect added lines in *diff_text* that contain a forbidden GitHub URL.

    An added line is reported when it contains a raw GitHub URL, its file is
    in scope according to ``_should_flag``, and neither the line itself nor
    the added line directly before it carries an inline suppression comment.

    Args:
        diff_text: Output of ``git diff`` in unified format.

    Returns:
        A list of ``(file_path, stripped_line)`` tuples in diff order.
        ``file_path`` is ``"<unknown>"`` when no file header has been seen.
    """
    violations = []
    current_file = ""
    prev_added_line = ""
    # File headers ("--- a/x", "+++ b/x") only occur before the first hunk of
    # a file.  Tracking this keeps added content that itself starts with "++"
    # from being mistaken for a header and skipped.
    in_hunk = False

    for line in diff_text.splitlines():
        if line.startswith("diff --git "):
            in_hunk = False
            prev_added_line = ""
            continue

        if line.startswith("@@"):
            # A new hunk: the previous added line is no longer adjacent.
            in_hunk = True
            prev_added_line = ""
            continue

        if not in_hunk and line.startswith("+++"):
            if line.startswith("+++ b/"):
                # git appends a TAB to the header path when the path contains
                # a space; drop it so glob patterns see the real path.
                current_file = line[6:].rstrip("\t")
            prev_added_line = ""
            continue

        if not line.startswith("+"):
            prev_added_line = ""
            continue

        added_line = line[1:]
        if GITHUB_URL_PATTERN.search(added_line) and _should_flag(current_file):
            # Skip if the current line or the previous added line has a suppression comment
            suppressed = (
                INLINE_SUPPRESSION_PATTERN.search(added_line)
                or INLINE_SUPPRESSION_PATTERN.search(prev_added_line)
            )
            if not suppressed:
                violations.append((current_file or "<unknown>", added_line.strip()))

        prev_added_line = added_line

    return violations
| 164 | + |
| 165 | + |
def main() -> int:
    """Check a git diff for forbidden raw GitHub URLs and report the result.

    Command-line options: ``--src`` and ``--tgt`` select the refs to compare
    (defaults ``HEAD`` and ``HEAD~1``); ``--cached`` checks the staged changes
    instead.

    Returns:
        0 when no violation is found; 1 when at least one is found or when the
        scope configuration or the diff cannot be obtained.
    """
    parser = argparse.ArgumentParser(description="Check diff for forbidden raw GitHub URL usage.")
    parser.add_argument("--src", default="HEAD", help="Source ref/commit for git diff.")
    parser.add_argument("--tgt", default="HEAD~1", help="Target ref/commit for git diff.")
    parser.add_argument("--cached", action="store_true", help="Check staged changes in git index.")
    options = parser.parse_args()

    try:
        # Load the configuration up front so a broken file is reported here.
        _get_scope_config()
        diff_output = _run_diff(src=options.src, tgt=options.tgt, cached=options.cached)
    except Exception as ex:  # pylint: disable=broad-except
        if options.cached:
            failure = f"Unable to evaluate staged diff: {ex}"
        else:
            failure = f"Unable to evaluate diff between '{options.tgt}' and '{options.src}': {ex}"
        print(failure, file=sys.stderr)
        return 1

    found = _find_violations(diff_output)
    if found:
        print("ERROR: Found forbidden external GitHub URL(s) in this change:\n", file=sys.stderr)
        for file_path, content in found:
            filename = _extract_filename_from_url(content)
            report = (
                f" {file_path}: {content}\n"
                "\n"
                " To fix, follow one of the options below (in priority order):\n"
                "\n"
                " Option 1 (Preferred) — Host the file in the AME storage account\n"
                " ---------------------------------------------------------------\n"
                " Reach out to the Platform squad to upload the file to the shared\n"
                " Azure CLI storage account. Once uploaded, replace the raw GitHub\n"
                " URL with the internal blob URL. The resulting URL should look like:\n"
                "\n"
                f" {RECOMMENDED_INTERNAL_URL}/<module>/{filename}\n"
                "\n"
                " Option 2 (Fallback) — Suppress with an inline comment\n"
                " -----------------------------------------------------\n"
                " Only if the GitHub URL is required by design (e.g. the upstream\n"
                " repo IS the authoritative source), add an inline suppression\n"
                " comment on the line before or on the same line like:\n"
                "\n"
                " # external-url-exempt: <reason>\n"
                f" {content} \n"
            )
            print(report, file=sys.stderr)
        return 1

    print("No forbidden external GitHub URL found in added lines.")
    return 0
| 215 | + |
| 216 | + |
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
| 219 | + |
0 commit comments