Skip to content

Commit ef72ecf

Browse files
steves and Copilot authored
Minor improvements to external links report (#61184)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: steves <54561+steves@users.noreply.github.com>
1 parent e86b0ed commit ef72ecf

3 files changed

Lines changed: 157 additions & 23 deletions

File tree

src/links/lib/link-report.ts

Lines changed: 39 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ export interface LinkReport {
3232
title: string
3333
summary: string
3434
groups: GroupedBrokenLinks[]
35+
selfReferentialGroups?: GroupedBrokenLinks[]
3536
uniqueTargets: number
3637
totalOccurrences: number
3738
timestamp: string
@@ -96,6 +97,25 @@ ${statusInfo}${suggestion}**Found in ${count} file${plural}:**
9697
${tableRows}`
9798
},
9899

100+
// Self-referential links section: renders one sub-section per target URL,
// each with a table of the files and line numbers where that URL occurs.
//
// `title` is the section heading text; `groups` holds one entry per unique
// target, each carrying its list of occurrences. Returns a Markdown string.
selfReferentialLinks: (title: string, groups: GroupedBrokenLinks[]) => {
  const totalOccurrences = groups.reduce((sum, g) => sum + g.occurrences.length, 0)
  const rows = groups
    .map((g) => {
      // Count distinct files: a single file may contribute several occurrences
      // of the same target, and the "Found in N file(s)" line reports files.
      const uniqueFileCount = new Set(g.occurrences.map((occ) => occ.file)).size
      // One table row per occurrence (file path + its line numbers).
      const occRows = g.occurrences
        .map((occ) => `| \`${occ.file}\` | ${occ.lines.join(', ')} |`)
        .join('\n')
      return `### \`${g.target}\`\n\n**Found in ${uniqueFileCount} file${uniqueFileCount === 1 ? '' : 's'}:**\n\n| File | Line(s) |\n|------|---------|\n${occRows}`
    })
    .join('\n\n')
  // Heading carries both counts: unique URLs (groups) and total occurrences.
  return `## 🔗 ${title} (${groups.length} unique URL${groups.length === 1 ? '' : 's'}, ${totalOccurrences} occurrence${totalOccurrences === 1 ? '' : 's'})

The following links point to \`docs.github.com\`. Consider replacing them with relative internal links using the \`[AUTOTITLE](/path/to/article)\` syntax.

${rows}`
},
118+
99119
// Empty report
100120
noIssues: () => 'No issues found! 🎉',
101121

@@ -301,9 +321,12 @@ export function generateInternalLinkReport(
301321
*/
302322
export function generateExternalLinkReport(
303323
brokenLinks: BrokenLink[],
304-
options: { actionUrl?: string } = {},
324+
options: { actionUrl?: string; selfReferentialLinks?: BrokenLink[] } = {},
305325
): LinkReport {
306326
const groups = groupExternalLinksByDomain(brokenLinks)
327+
const selfReferentialGroups = options.selfReferentialLinks?.length
328+
? groupBrokenLinks(options.selfReferentialLinks)
329+
: undefined
307330
const count = groups.length
308331
const plural = count === 1 ? '' : 's'
309332

@@ -314,6 +337,7 @@ export function generateExternalLinkReport(
314337
? `Found **${brokenLinks.length}** broken external link${brokenLinks.length === 1 ? '' : 's'} across **${count}** domain${plural}.`
315338
: 'All external links are valid! ✅',
316339
groups,
340+
selfReferentialGroups,
317341
uniqueTargets: count,
318342
totalOccurrences: brokenLinks.length,
319343
timestamp: new Date().toISOString(),
@@ -360,14 +384,16 @@ function renderGroups(groups: GroupedBrokenLinks[], isExternal: boolean): string
360384
*/
361385
export function reportToMarkdown(report: LinkReport, isExternal = false): string {
362386
const parts: string[] = []
387+
const hasBrokenOrRedirectGroups = report.groups.length > 0
388+
const hasSelfReferentialGroups = Boolean(report.selfReferentialGroups?.length)
363389

364390
// Header
365391
parts.push(
366392
TEMPLATES.reportHeader(report.title, report.summary, report.timestamp, report.actionUrl),
367393
)
368394
parts.push('')
369395

370-
if (report.groups.length === 0) {
396+
if (!hasBrokenOrRedirectGroups && !hasSelfReferentialGroups) {
371397
parts.push(TEMPLATES.noIssues())
372398
return parts.join('\n')
373399
}
@@ -379,7 +405,17 @@ export function reportToMarkdown(report: LinkReport, isExternal = false): string
379405
}
380406

381407
// Groups
382-
parts.push(renderGroups(report.groups, isExternal))
408+
if (hasBrokenOrRedirectGroups) {
409+
parts.push(renderGroups(report.groups, isExternal))
410+
}
411+
412+
// Self-referential links section (external report only)
413+
if (hasSelfReferentialGroups) {
414+
parts.push(
415+
TEMPLATES.selfReferentialLinks('Potential Internal Links', report.selfReferentialGroups!),
416+
)
417+
parts.push('')
418+
}
383419

384420
return parts.join('\n')
385421
}

src/links/scripts/check-links-external.ts

Lines changed: 87 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,43 @@ interface CacheData {
6464
urls: Record<string, CacheEntry>
6565
}
6666

67+
/** A single place in a content file where an external link was found. */
interface LinkOccurrence {
  /** Path of the Markdown file that contains the link. */
  file: string
  /** Line number reported for the link within that file. */
  line: number
  /** The link target exactly as written, before any URL normalization. */
  href: string
}
72+
73+
/**
 * Normalize a URL for deduplication purposes:
 * - Remove the URL fragment (`#anchor`)
 * - Remove the trailing slash only for origin/root URLs
 *
 * For example, https://www.githubstatus.com and https://www.githubstatus.com/
 * are treated as the same URL.
 *
 * @param href - Link target as written in the content file.
 * @returns The fragment-free URL. When it parses as an absolute URL whose path
 *   is `/` with no query string, the bare origin is returned instead. Input
 *   that cannot be parsed is returned with only the fragment removed.
 */
function normalizeUrl(href: string): string {
  // Slice at the first '#' instead of indexing into `split('#')`, so the
  // result is always a `string` (also under `noUncheckedIndexedAccess`).
  const hashIndex = href.indexOf('#')
  const withoutFragment = hashIndex === -1 ? href : href.slice(0, hashIndex)

  try {
    const parsed = new URL(withoutFragment)
    // Only root URLs are collapsed; a trailing slash on a deeper path is kept
    // as written. Schemes with an opaque origin serialize `origin` as the
    // literal string "null", which must never be returned as a URL.
    if (parsed.pathname === '/' && !parsed.search && parsed.origin !== 'null') {
      return parsed.origin
    }
  } catch {
    // Not an absolute URL: keep the fragment-free input unchanged.
  }
  return withoutFragment
}
95+
96+
/**
 * Report whether a URL points at the `docs.github.com` host.
 *
 * @param url - URL string to inspect.
 * @returns `true` only when the parsed hostname is exactly `docs.github.com`;
 *   `false` for any other host and for strings that cannot be parsed as a URL.
 */
function isDocsGithubUrl(url: string): boolean {
  let hostname: string
  try {
    hostname = new URL(url).hostname
  } catch {
    // Unparseable input cannot be a docs.github.com link.
    return false
  }
  return hostname === 'docs.github.com'
}
103+
67104
/**
68105
* Sleep for a given number of milliseconds
69106
*/
@@ -150,11 +187,11 @@ async function fetchWithTimeout(
150187
/**
151188
* Extract all external links from content files
152189
*/
153-
async function extractAllExternalLinks(): Promise<Map<string, { file: string; line: number }[]>> {
154-
const links = new Map<string, { file: string; line: number }[]>()
190+
async function extractAllExternalLinks(): Promise<Map<string, LinkOccurrence[]>> {
191+
const links = new Map<string, LinkOccurrence[]>()
155192

156193
// Find all Markdown files
157-
const files = await glob('content/**/*.md')
194+
const files = await glob('content/**/*.md', { ignore: '**/README.md' })
158195
console.log(`Found ${files.length} Markdown files to scan`)
159196

160197
const extractStart = Date.now()
@@ -175,13 +212,13 @@ async function extractAllExternalLinks(): Promise<Map<string, { file: string; li
175212
if (!link.href.startsWith('https://')) continue
176213
if (isExcludedLink(link.href)) continue
177214

178-
// Normalize URL (remove anchors for checking)
179-
const url = link.href.split('#')[0]
215+
// Normalize URL (remove anchors and trailing slashes for checking)
216+
const url = normalizeUrl(link.href)
180217

181218
if (!links.has(url)) {
182219
links.set(url, [])
183220
}
184-
links.get(url)!.push({ file, line: link.line })
221+
links.get(url)!.push({ file, line: link.line, href: link.href })
185222
}
186223

187224
if ((i + 1) % 500 === 0) {
@@ -236,7 +273,19 @@ async function main() {
236273
// Extract all external links
237274
console.log('Extracting external links from content files...')
238275
const allLinks = await extractAllExternalLinks()
276+
277+
// Separate docs.github.com links — they're self-referential (this repo IS the docs site)
278+
// and will be reported separately as candidates for conversion to internal links.
279+
const selfReferentialLinks = new Map<string, LinkOccurrence[]>()
280+
for (const [url, occurrences] of allLinks) {
281+
if (isDocsGithubUrl(url)) {
282+
selfReferentialLinks.set(url, occurrences)
283+
allLinks.delete(url)
284+
}
285+
}
286+
239287
console.log(`Found ${allLinks.size} unique external URLs`)
288+
console.log(`Found ${selfReferentialLinks.size} self-referential docs.github.com URLs`)
240289
console.log('')
241290

242291
if (options.dryRun) {
@@ -276,7 +325,7 @@ async function main() {
276325
if (!result.ok) {
277326
for (const occ of occurrences) {
278327
brokenLinks.push({
279-
href: url,
328+
href: occ.href,
280329
file: occ.file,
281330
lines: [occ.line],
282331
statusCode: result.statusCode,
@@ -318,28 +367,46 @@ async function main() {
318367
chalk.blue(`Checked ${checkedCount} URLs in ${duration}s (${cachedCount} from cache)`),
319368
)
320369

321-
if (brokenLinks.length === 0) {
370+
// Build self-referential BrokenLink list for the report
371+
const selfReferentialBrokenLinks: BrokenLink[] = []
372+
for (const occurrences of selfReferentialLinks.values()) {
373+
for (const occ of occurrences) {
374+
selfReferentialBrokenLinks.push({ href: occ.href, file: occ.file, lines: [occ.line] })
375+
}
376+
}
377+
378+
if (brokenLinks.length === 0 && selfReferentialBrokenLinks.length === 0) {
322379
console.log(chalk.green('✅ All external links valid!'))
323380
process.exit(0)
324381
}
325382

326383
// Generate report
327384
const report = generateExternalLinkReport(brokenLinks, {
328385
actionUrl: process.env.ACTION_RUN_URL,
386+
selfReferentialLinks: selfReferentialBrokenLinks,
329387
})
330388

331-
console.log('')
332-
console.log(chalk.red(`❌ ${report.uniqueTargets} domain(s) with broken links`))
333-
console.log(chalk.red(` ${report.totalOccurrences} total occurrence(s)`))
389+
if (brokenLinks.length === 0) {
390+
console.log(chalk.green('✅ All external links valid!'))
391+
console.log(
392+
chalk.blue(
393+
`ℹ️ Found ${selfReferentialBrokenLinks.length} docs.github.com absolute link occurrence(s) to convert.`,
394+
),
395+
)
396+
} else {
397+
console.log('')
398+
console.log(chalk.red(`❌ ${report.uniqueTargets} domain(s) with broken links`))
399+
console.log(chalk.red(` ${report.totalOccurrences} total occurrence(s)`))
334400

335-
// Show summary by domain
336-
console.log('')
337-
console.log('Broken links by domain:')
338-
for (const group of report.groups.slice(0, 10)) {
339-
console.log(` ${group.target}: ${group.occurrences.length} occurrence(s)`)
340-
}
341-
if (report.groups.length > 10) {
342-
console.log(` ... and ${report.groups.length - 10} more domains`)
401+
// Show summary by domain
402+
console.log('')
403+
console.log('Broken links by domain:')
404+
for (const group of report.groups.slice(0, 10)) {
405+
console.log(` ${group.target}: ${group.occurrences.length} occurrence(s)`)
406+
}
407+
if (report.groups.length > 10) {
408+
console.log(` ... and ${report.groups.length - 10} more domains`)
409+
}
343410
}
344411

345412
// Write artifact
@@ -351,7 +418,7 @@ async function main() {
351418
const createReport = process.env.CREATE_REPORT === 'true'
352419
const reportRepository = process.env.REPORT_REPOSITORY || 'github/docs-content'
353420

354-
if (createReport && process.env.GITHUB_TOKEN) {
421+
if (brokenLinks.length > 0 && createReport && process.env.GITHUB_TOKEN) {
355422
console.log('')
356423
console.log('Creating issue report...')
357424

src/links/tests/link-report.ts

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,6 +174,15 @@ describe('generateExternalLinkReport', () => {
174174
expect(report.title).toContain('2 domains')
175175
expect(report.uniqueTargets).toBe(2)
176176
})
177+
178+
test('includes self-referential groups when provided', () => {
179+
const report = generateExternalLinkReport([], {
180+
selfReferentialLinks: [{ href: 'https://docs.github.com/en', file: 'a.md', lines: [1] }],
181+
})
182+
183+
expect(report.selfReferentialGroups).toHaveLength(1)
184+
expect(report.selfReferentialGroups?.[0].target).toBe('https://docs.github.com/en')
185+
})
177186
})
178187

179188
describe('reportToMarkdown', () => {
@@ -242,6 +251,28 @@ describe('reportToMarkdown', () => {
242251
expect(markdown).toContain('## ❌ Broken Links')
243252
expect(markdown).toContain('## ⚠️ Redirects to Update')
244253
})
254+
255+
test('includes potential internal links section with no broken links', () => {
256+
const report = generateExternalLinkReport([], {
257+
selfReferentialLinks: [{ href: 'https://docs.github.com/en', file: 'a.md', lines: [1] }],
258+
})
259+
const markdown = reportToMarkdown(report, true)
260+
261+
expect(markdown).toContain('Potential Internal Links')
262+
expect(markdown).not.toContain('No issues found')
263+
})
264+
265+
test('shows unique file count for potential internal links', () => {
266+
const report = generateExternalLinkReport([], {
267+
selfReferentialLinks: [
268+
{ href: 'https://docs.github.com/en', file: 'a.md', lines: [1] },
269+
{ href: 'https://docs.github.com/en', file: 'a.md', lines: [2] },
270+
],
271+
})
272+
const markdown = reportToMarkdown(report, true)
273+
274+
expect(markdown).toContain('Found in 1 file')
275+
})
245276
})
246277

247278
describe('generatePRComment', () => {

0 commit comments

Comments
 (0)