Skip to content

Commit df08471

Browse files
steves and Copilot authored
Use the rest api for repo checks (#61196)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: steves <54561+steves@users.noreply.github.com>
1 parent 54dba4e commit df08471

1 file changed

Lines changed: 150 additions & 2 deletions

File tree

src/links/scripts/check-links-external.ts

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* npm run check-links-external -- --max 100
1010
*
1111
* Environment variables:
12-
* GITHUB_TOKEN - For creating issue reports
12+
* GITHUB_TOKEN - For creating issue reports and GitHub API repo checks
1313
* ACTION_RUN_URL - Link to the action run
1414
* CREATE_REPORT - Whether to create an issue report (default: false)
1515
* REPORT_REPOSITORY - Repository to create report issues in
@@ -186,6 +186,134 @@ async function fetchWithTimeout(
186186
}
187187
}
188188

/**
 * First path segments on github.com that are site pages rather than account
 * names (e.g. github.com/settings/tokens). URLs under these are never
 * repositories, so there is no point asking the REST API about them.
 * The list is intentionally conservative; anything missing from it is still
 * handled by the caller's fallback to a direct HTTP check.
 */
const GITHUB_RESERVED_TOP_LEVEL_PATHS: ReadonlySet<string> = new Set([
  'about',
  'apps',
  'codespaces',
  'collections',
  'contact',
  'enterprise',
  'features',
  'login',
  'marketplace',
  'notifications',
  'organizations',
  'orgs',
  'pricing',
  'search',
  'security',
  'settings',
  'site',
  'sponsors',
  'topics',
])

/**
 * Return the owner/repo if the URL is exactly http(s)://github.com/<owner>/<repo>
 * (a trailing slash, query string, or fragment is tolerated), else null.
 *
 * Returns null for:
 *  - malformed URLs
 *  - non-http(s) schemes (e.g. git://github.com/owner/repo)
 *  - hosts other than github.com
 *  - paths with fewer or more than two segments
 *  - paths whose first segment is a reserved github.com site page
 */
function isGithubRepoRootUrl(url: string): { owner: string; repo: string } | null {
  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    // ignore malformed URLs
    return null
  }

  if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') return null
  if (parsed.hostname !== 'github.com') return null

  // filter(Boolean) drops the empty strings produced by leading/trailing slashes
  const segments = parsed.pathname.split('/').filter(Boolean)
  if (segments.length !== 2) return null

  const [owner, repo] = segments
  if (!owner || !repo) return null
  if (GITHUB_RESERVED_TOP_LEVEL_PATHS.has(owner.toLowerCase())) return null

  return { owner, repo }
}
203+
/** Outcome of checking a repository link through the GitHub REST API. */
type GithubRepoCheckResult = {
  ok: boolean
  statusCode?: number
  error?: string
  cached: boolean
  /**
   * True when the API answer is not definitive and the caller may retry the
   * link with a direct HTTP check. Absent on results served from the cache.
   */
  fallbackAllowed?: boolean
}

/**
 * Check a github.com/<owner>/<repo> URL via the REST API instead of hitting
 * the main website.
 *
 * A link is reported as OK when `GET /repos/<owner>/<repo>` succeeds and the
 * repository is not private. Note that `html_url` in the response is NOT
 * compared against the original link, so a repository the API still resolves
 * under its old name is reported as OK.
 *
 * @param url - The original link; used as the cache key.
 * @param owner - Repository owner taken from the link's path.
 * @param repo - Repository name taken from the link's path.
 * @param cache - Link cache; read first, and written only for successful checks.
 * @returns The check result. Never rejects: network errors, timeouts, and
 *   unreadable response bodies are reported as a failed result with
 *   `fallbackAllowed: true`.
 */
async function checkGithubRepoUrl(
  url: string,
  owner: string,
  repo: string,
  cache: CacheData,
): Promise<GithubRepoCheckResult> {
  // Check cache first
  const cached = cache.urls[url]
  if (cached && Date.now() - cached.timestamp < CACHE_MAX_AGE_MS) {
    return {
      ok: cached.ok,
      statusCode: cached.statusCode,
      error: cached.error,
      cached: true,
    }
  }

  const apiUrl = `https://api.github.com/repos/${owner}/${repo}`
  const headers: Record<string, string> = {
    'User-Agent': 'GitHub-Docs-Link-Checker/1.0',
    Accept: 'application/vnd.github+json',
    'X-GitHub-Api-Version': '2022-11-28',
  }
  if (process.env.GITHUB_TOKEN) {
    headers['Authorization'] = `Bearer ${process.env.GITHUB_TOKEN}`
  }

  const controller = new AbortController()
  const timeoutHandle = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS)

  try {
    const response = await fetch(apiUrl, {
      method: 'GET',
      signal: controller.signal,
      headers,
    })

    let result: GithubRepoCheckResult

    if (response.ok) {
      // The timer is still armed here, so a stalled body read is aborted too.
      const data = (await response.json()) as { private?: boolean }

      if (data.private) {
        // The token can see the repo but anonymous readers may not; let the
        // caller confirm with a direct HTTP check.
        result = {
          ok: false,
          statusCode: response.status,
          error: 'Repository is private',
          cached: false,
          fallbackAllowed: true,
        }
      } else {
        result = {
          ok: true,
          statusCode: response.status,
          cached: false,
          fallbackAllowed: false,
        }
      }
    } else {
      result = {
        ok: false,
        statusCode: response.status,
        error: `HTTP ${response.status}`,
        cached: false,
        // Auth, not-found, rate-limit, and server errors are not definitive.
        fallbackAllowed: [401, 403, 404, 429].includes(response.status) || response.status >= 500,
      }
    }

    // Only cache successful results. A failed API check may mean the URL is
    // not actually a repo (e.g. github.com/settings/tokens), so we leave the
    // cache empty for failures and let the checkUrl fallback handle caching.
    if (result.ok) {
      cache.urls[url] = {
        timestamp: Date.now(),
        ok: result.ok,
        statusCode: result.statusCode,
        error: result.error,
      }
    }

    return result
  } catch {
    // Abort (timeout), network failure, or an unparseable response body.
    return {
      ok: false,
      error: 'Request timed out or failed',
      cached: false,
      fallbackAllowed: true,
    }
  } finally {
    // Single cleanup point for every exit path.
    clearTimeout(timeoutHandle)
  }
}
316+
189317
/**
190318
* Extract all external links from content files
191319
*/
@@ -345,6 +473,7 @@ async function main() {
345473
}
346474
}
347475
}
476+
348477
const queuedUrlCount = Array.from(urlsByDomain.values()).reduce(
349478
(count, domainUrls) => count + domainUrls.length,
350479
0,
@@ -359,7 +488,26 @@ async function main() {
359488
async function checkDomainUrls(domainUrls: string[]): Promise<void> {
360489
for (const url of domainUrls) {
361490
const occurrences = allLinks.get(url)!
362-
const result = await checkUrl(url, db.data)
491+
const repoInfo = isGithubRepoRootUrl(url)
492+
let result: {
493+
ok: boolean
494+
statusCode?: number
495+
error?: string
496+
cached: boolean
497+
fallbackAllowed?: boolean
498+
}
499+
500+
if (repoInfo && process.env.GITHUB_TOKEN) {
501+
result = await checkGithubRepoUrl(url, repoInfo.owner, repoInfo.repo, db.data)
502+
// Fall back to direct HTTP checks only when the API result is not
503+
// definitive (e.g. API/network failures or private-repo responses).
504+
if (!result.ok && result.fallbackAllowed) {
505+
result = await checkUrl(url, db.data)
506+
}
507+
} else {
508+
result = await checkUrl(url, db.data)
509+
}
510+
363511
checkedCount++
364512

365513
if (result.cached) cachedCount++

0 commit comments

Comments
 (0)