Skip to content

Commit 0a57769

Browse files
authored
[Core] raw githubusercontent urls are updated to refer azcli blob to restrict external system access (#9826)
raw githubusercontent urls are updated to refer azcli blob to restrict external system access (#9826)
1 parent aad2e40 commit 0a57769

3 files changed

Lines changed: 251 additions & 0 deletions

File tree

azure-pipelines.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,24 @@ jobs:
150150
ADO_PULL_REQUEST_LATEST_COMMIT: HEAD
151151
ADO_PULL_REQUEST_TARGET_BRANCH: $(System.PullRequest.TargetBranch)
152152
153+
- job: CheckExternalUrls
154+
displayName: "Check External Source URLs"
155+
condition: and(succeeded(), eq(variables['Build.Reason'], 'PullRequest'))
156+
pool:
157+
name: ${{ variables.ubuntu_pool }}
158+
steps:
159+
- task: UsePythonVersion@0
160+
displayName: 'Use Python 3.13'
161+
inputs:
162+
versionSpec: 3.13
163+
- bash: |
164+
#!/usr/bin/env bash
165+
set -ev
166+
# External URL exclusions are maintained in scripts/ci/external_url_exclusions.json.
167+
git fetch origin --depth=1 $(System.PullRequest.TargetBranch)
168+
python scripts/ci/validate_external_source_urls.py --src=HEAD --tgt=origin/$(System.PullRequest.TargetBranch)
169+
displayName: 'Validate External Source URLs'
170+
153171
- job: AzdevLinterModifiedExtensions
154172
displayName: "azdev linter on Modified Extensions"
155173
condition: and(succeeded(), eq(variables['Build.Reason'], 'PullRequest'))
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"tool": "External URL Validation",
3+
"scope": {
4+
"include": ["src/**/*.py"],
5+
"exclude": [
6+
"**/tests/**",
7+
"**/vendored_sdks/**",
8+
"**/*help.py"
9+
]
10+
}
11+
}
12+
13+
14+
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
#!/usr/bin/env python
2+
3+
# --------------------------------------------------------------------------------------------
4+
# Copyright (c) Microsoft Corporation. All rights reserved.
5+
# Licensed under the MIT License. See License.txt in the project root for license information.
6+
# --------------------------------------------------------------------------------------------
7+
8+
"""Fail CI if forbidden raw GitHub URL is introduced in new diff lines."""
9+
10+
import argparse
11+
import fnmatch
12+
import json
13+
import re
14+
import subprocess
15+
import sys
16+
from pathlib import Path
17+
18+
19+
# Matches a raw.githubusercontent.com URL up to the first delimiter
# (whitespace, quote, backtick, comma or closing bracket/brace/paren).
# URL schemes and host names are case-insensitive, so the match ignores case;
# otherwise "https://RAW.githubusercontent.com/..." would slip past the check.
GITHUB_URL_PATTERN = re.compile(
    r"https?://raw\.githubusercontent\.com/[^\s\"'`,)}\]]*",
    re.IGNORECASE,
)
# Inline suppression comment: "# external-url-exempt: <reason>".
# At least one non-whitespace character of reason is required after the colon.
INLINE_SUPPRESSION_PATTERN = re.compile(
    r"#\s*external-url-exempt:\s*\S"
)
# A plain "name.ext" basename: letters, digits, "_" or "-", then a
# 1-10 character alphanumeric extension.
_FILENAME_PATTERN = re.compile(r"^[A-Za-z0-9_\-]+\.[A-Za-z0-9]{1,10}$")
# Base URL shown in the remediation message as the preferred hosting location.
RECOMMENDED_INTERNAL_URL = "https://azcliprod.blob.core.windows.net/cli"
# The scope configuration lives next to this script.
SCOPE_CONFIG_PATH = Path(__file__).with_name("external_url_exclusions.json")

# Scope configuration loaded from external_url_exclusions.json.
# Contains optional "include" and "exclude" glob-pattern lists.
_SCOPE_CONFIG = None
32+
33+
34+
def _load_scope_config():
    """Read and validate the include/exclude glob lists from the scope JSON file.

    The file at ``SCOPE_CONFIG_PATH`` must hold a JSON object. Its optional
    ``"scope"`` member must be an object whose optional ``"include"`` and
    ``"exclude"`` members are each a string or an array of strings.

    Returns:
        A ``(include_patterns, exclude_patterns)`` tuple of lists, with every
        backslash in a pattern replaced by a forward slash.

    Raises:
        RuntimeError: if the file cannot be read or parsed, or if its
            structure is not as described above.
    """
    try:
        with SCOPE_CONFIG_PATH.open(encoding="utf-8") as config_file:
            raw_config = json.load(config_file)
    except (OSError, ValueError) as ex:
        raise RuntimeError(f"Unable to load scope config from '{SCOPE_CONFIG_PATH}': {ex}") from ex

    def _invalid(detail):
        # Every structural error shares the same message prefix.
        return RuntimeError(f"Invalid scope configuration in '{SCOPE_CONFIG_PATH}': {detail}")

    if not isinstance(raw_config, dict):
        raise _invalid("expected a JSON object")

    scope = raw_config.get("scope", {})
    if not isinstance(scope, dict):
        raise _invalid("'scope' must be a JSON object")

    def _patterns(key):
        # A bare string is shorthand for a one-element list.
        value = scope.get(key, [])
        if isinstance(value, str):
            value = [value]
        if not (isinstance(value, list) and all(isinstance(item, str) for item in value)):
            raise _invalid(f"'{key}' must be a string or array of strings")
        return [item.replace("\\", "/") for item in value]

    include_patterns = _patterns("include")
    exclude_patterns = _patterns("exclude")
    return include_patterns, exclude_patterns
74+
75+
76+
def _get_scope_config():
    """Return the ``(include_patterns, exclude_patterns)`` tuple, loading it on first use.

    The value produced by ``_load_scope_config()`` is kept in the module-level
    ``_SCOPE_CONFIG``; once it has been loaded successfully, later calls return
    the stored value without reading the file again.
    """
    global _SCOPE_CONFIG  # pylint: disable=global-statement

    if _SCOPE_CONFIG is not None:
        return _SCOPE_CONFIG

    _SCOPE_CONFIG = _load_scope_config()
    return _SCOPE_CONFIG
84+
85+
86+
def _matches_any(file_path: str, patterns: list) -> bool:
    """Return True if *file_path* matches any of the given glob patterns.

    Matching uses ``fnmatch.fnmatchcase`` so the result is case-sensitive and
    identical on every platform (``fnmatch.fnmatch`` folds case on Windows).
    As with fnmatch in general, ``*`` may span ``/``.

    A ``**/`` path segment additionally matches zero directories, so
    ``src/**/*.py`` also covers ``src/setup.py`` and ``**/tests/**`` also
    covers a top-level ``tests/`` folder.

    Args:
        file_path: "/"-separated path to test.
        patterns: Glob patterns; an empty list never matches.
    """
    for pattern in patterns:
        # Split on every "**/" that forms a whole path segment (at the start of
        # the pattern or right after a "/"), then rebuild each combination with
        # the segment either kept or dropped.
        pieces = re.split(r"(?:^|(?<=/))\*\*/", pattern)
        variants = [pieces[0]]
        for piece in pieces[1:]:
            variants = [head + joiner + piece for head in variants for joiner in ("**/", "")]
        if any(fnmatch.fnmatchcase(file_path, variant) for variant in variants):
            return True
    return False
89+
90+
91+
92+
def _extract_filename_from_url(line: str) -> str:
    """Extract the file name from the first GitHub URL found in *line*.

    Any query string or fragment is ignored, so ``.../map.json?raw=1`` still
    yields ``map.json``.

    Returns the basename (e.g. ``map.json``) or the placeholder ``"xxx.xxx"``
    when *line* holds no matching URL or the URL's last path segment does not
    look like a plain ``name.ext`` file name.
    """
    match = GITHUB_URL_PATTERN.search(line)
    if match is None:
        return "xxx.xxx"

    # Strip "?query" and "#fragment" before looking at the last path segment.
    url_path = match.group(0).split("?", 1)[0].split("#", 1)[0].rstrip("/")
    # A matched URL always contains "/", so rsplit always yields a last segment.
    basename = url_path.rsplit("/", 1)[-1]
    if _FILENAME_PATTERN.match(basename):
        return basename
    return "xxx.xxx"
105+
106+
107+
def _should_flag(file_path: str) -> bool:
    """Tell whether *file_path* is in scope for the forbidden-URL check.

    A path is in scope when the include list is empty (meaning "entire
    codebase") or the path matches at least one include pattern, and the path
    matches no exclude pattern.
    """
    includes, excludes = _get_scope_config()

    # A non-empty include list acts as an allow-list.
    if includes and not _matches_any(file_path, includes):
        return False

    return not _matches_any(file_path, excludes)
118+
119+
120+
def _run_diff(src: str, tgt: str, cached: bool = False) -> str:
    """Run ``git diff`` with zero context lines and return its output text.

    Args:
        src: Source ref/commit; used only when *cached* is false.
        tgt: Target ref/commit; used only when *cached* is false.
        cached: When true, diff the staged changes (``--cached``) instead of
            ``tgt...src``.

    Returns:
        The diff text written by git to stdout.

    Raises:
        RuntimeError: if git exits with a non-zero status; the message is
            git's stderr, or ``"git diff failed"`` when stderr is empty.
    """
    cmd = [
        "git", "diff", "--unified=0", "--no-color",
        # The caller parses "+++ b/<path>" headers, so pin the output format
        # instead of inheriting user configuration (diff.noprefix,
        # diff.mnemonicPrefix, external diff drivers).
        "--no-ext-diff", "--src-prefix=a/", "--dst-prefix=b/",
    ]
    if cached:
        cmd.append("--cached")
    else:
        cmd.append(f"{tgt}...{src}")
    # Mark the end of revisions so a ref name is never mistaken for a path.
    cmd.append("--")

    proc = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Decode explicitly: the diff may contain bytes that are not valid in
        # the locale encoding, and one such byte must not abort the check.
        encoding="utf-8",
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip() or "git diff failed")
    return proc.stdout
137+
138+
139+
# Unified-diff hunk header, e.g. "@@ -12,0 +13,2 @@"; a missing count means 1.
_HUNK_HEADER_PATTERN = re.compile(r"^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@")


def _find_violations(diff_text: str):
    """Collect added diff lines that introduce a forbidden raw GitHub URL.

    An added line is reported when it contains a URL matching
    ``GITHUB_URL_PATTERN``, its file is in scope (``_should_flag``), and
    neither the line itself nor the added line directly before it carries an
    inline suppression comment.

    Hunk headers are used to count how many removed/added lines follow, so an
    added line whose own text starts with ``++`` (diff line ``+++...``) is
    scanned as content instead of being mistaken for a ``+++ b/<path>`` file
    header. Outside a counted hunk, lines are classified by prefix alone.

    Args:
        diff_text: Unified diff text.

    Returns:
        A list of ``(file_path, stripped_line)`` tuples; ``file_path`` is
        ``"<unknown>"`` when no file header has been seen.
    """
    violations = []
    current_file = ""
    prev_added_line = ""
    removed_left = 0  # "-" lines still expected in the current hunk
    added_left = 0  # "+" lines still expected in the current hunk

    for line in diff_text.splitlines():
        hunk_header = _HUNK_HEADER_PATTERN.match(line)
        if hunk_header:
            old_count, new_count = hunk_header.groups()
            removed_left = 1 if old_count is None else int(old_count)
            added_left = 1 if new_count is None else int(new_count)
            prev_added_line = ""
            continue

        if line.startswith("+") and added_left:
            # Inside a hunk every "+" line is content, whatever follows the "+".
            added_left -= 1
        elif line.startswith("-") and removed_left:
            removed_left -= 1
            prev_added_line = ""
            continue
        elif line.startswith(" ") and (removed_left or added_left):
            # Context line (only present when the diff was made with context).
            removed_left = max(removed_left - 1, 0)
            added_left = max(added_left - 1, 0)
            prev_added_line = ""
            continue
        elif line.startswith("+++ b/"):
            current_file = line[6:]
            prev_added_line = ""
            continue
        elif not line.startswith("+") or line.startswith("+++"):
            prev_added_line = ""
            continue

        added_line = line[1:]
        if GITHUB_URL_PATTERN.search(added_line) and _should_flag(current_file):
            # Skip if the current line or the previous added line has a suppression comment
            if not (INLINE_SUPPRESSION_PATTERN.search(added_line)
                    or INLINE_SUPPRESSION_PATTERN.search(prev_added_line)):
                violations.append((current_file or "<unknown>", added_line.strip()))

        prev_added_line = added_line

    return violations
164+
165+
166+
def main() -> int:
    """Command-line entry point: scan a git diff for newly added raw GitHub URLs.

    Returns:
        0 when no violation is found; 1 when violations are found or when the
        scope configuration or the diff cannot be obtained.
    """
    arg_parser = argparse.ArgumentParser(description="Check diff for forbidden raw GitHub URL usage.")
    arg_parser.add_argument("--src", default="HEAD", help="Source ref/commit for git diff.")
    arg_parser.add_argument("--tgt", default="HEAD~1", help="Target ref/commit for git diff.")
    arg_parser.add_argument("--cached", action="store_true", help="Check staged changes in git index.")
    options = arg_parser.parse_args()

    try:
        # Load the scope config first so a broken config fails the run before git is invoked.
        _get_scope_config()
        diff_text = _run_diff(src=options.src, tgt=options.tgt, cached=options.cached)
    except Exception as ex:  # pylint: disable=broad-except
        failure = (
            f"Unable to evaluate staged diff: {ex}"
            if options.cached
            else f"Unable to evaluate diff between '{options.tgt}' and '{options.src}': {ex}"
        )
        print(failure, file=sys.stderr)
        return 1

    violations = _find_violations(diff_text)
    if violations:
        print("ERROR: Found forbidden external GitHub URL(s) in this change:\n", file=sys.stderr)
        for file_path, content in violations:
            filename = _extract_filename_from_url(content)
            # One remediation message per violation, assembled piece by piece.
            guidance = "".join((
                f"  {file_path}: {content}\n",
                "\n",
                "  To fix, follow one of the options below (in priority order):\n",
                "\n",
                "  Option 1 (Preferred) — Host the file in the AME storage account\n",
                "  ---------------------------------------------------------------\n",
                "  Reach out to the Platform squad to upload the file to the shared\n",
                "  Azure CLI storage account. Once uploaded, replace the raw GitHub\n",
                "  URL with the internal blob URL. The resulting URL should look like:\n",
                "\n",
                f"    {RECOMMENDED_INTERNAL_URL}/<module>/{filename}\n",
                "\n",
                "  Option 2 (Fallback) — Suppress with an inline comment\n",
                "  -----------------------------------------------------\n",
                "  Only if the GitHub URL is required by design (e.g. the upstream\n",
                "  repo IS the authoritative source), add an inline suppression\n",
                "  comment on the line before or on the same line like:\n",
                "\n",
                "        # external-url-exempt: <reason>\n",
                f"        {content}  \n",
            ))
            print(guidance, file=sys.stderr)
        return 1

    print("No forbidden external GitHub URL found in added lines.")
    return 0
215+
216+
217+
# Run the check when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
219+

0 commit comments

Comments
 (0)