Skip to content

Commit 4d23eb6

Browse files
authored
Fix search index scrape crashing on records with empty titles (#59656)
1 parent 035cb17 commit 4d23eb6

File tree

5 files changed

+306
-17
lines changed

5 files changed

+306
-17
lines changed

content/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ Before adding a new SSH key to the ssh-agent to manage your keys, you should hav
162162
```powershell
163163
ssh-add c:/Users/YOU/.ssh/id_ed25519
164164
```
165+
165166
{% data reusables.ssh.add-public-key-to-github %}
166167
167168
> ### Troubleshooting SSH agent conflicts in Windows

src/search/scripts/scrape/lib/build-records-from-api.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,18 @@ export default async function buildRecordsFromApi(
416416
}
417417

418418
if (result.record) {
419+
// Validate required fields before adding to records
420+
if (!result.record.title) {
421+
failedPages.push({
422+
url: permalink.href,
423+
relativePath: permalink.relativePath,
424+
error: 'Record has empty title',
425+
errorType: 'Validation Error',
426+
})
427+
if (!noMarkers) process.stdout.write(chalk.red('✗'))
428+
return null
429+
}
430+
419431
// Apply popularity
420432
const pathArticle = permalink.relativePath.replace('/index.md', '').replace('.md', '')
421433
let popularity = (hasPopularPages && popularPages[pathArticle]) || 0.0

src/search/scripts/scrape/lib/search-index-records.ts

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import path from 'path'
22
import fsSync from 'fs'
33
import fs from 'fs/promises'
4-
import assert from 'assert'
4+
import chalk from 'chalk'
55
import { isArray, isString } from 'lodash-es'
66

77
import type { Record } from '@/search/scripts/scrape/types'
@@ -11,9 +11,9 @@ export async function writeIndexRecords(
1111
records: Record[],
1212
outDirectory: string,
1313
): Promise<string> {
14-
validateRecords(name, records)
14+
const validRecords = validateRecords(name, records)
1515

16-
const recordsObject = Object.fromEntries(records.map((record) => [record.objectID, record]))
16+
const recordsObject = Object.fromEntries(validRecords.map((record) => [record.objectID, record]))
1717
const content = JSON.stringify(recordsObject, undefined, 0)
1818

1919
// If the outDirectory doesn't exist, create it
@@ -27,30 +27,52 @@ export async function writeIndexRecords(
2727
return filePath
2828
}
2929

30-
function validateRecords(name: string, records: Record[]): true {
31-
assert(isString(name) && name.length, '`name` is required')
32-
assert(isArray(records) && records.length, '`records` must be a non-empty array')
30+
function validateRecords(name: string, records: Record[]): Record[] {
31+
if (!isString(name) || !name.length) {
32+
throw new Error('`name` is required')
33+
}
34+
if (!isArray(records) || !records.length) {
35+
throw new Error('`records` must be a non-empty array')
36+
}
3337

34-
// each ID is unique
38+
// each ID is unique — deduplicate rather than crash
3539
const objectIDs = records.map((record) => record.objectID)
3640
const dupes = countArrayValues(objectIDs)
3741
.filter(({ count }) => count > 1)
3842
.map(({ value }) => value)
39-
assert(!dupes.length, `every objectID must be unique. dupes: ${dupes.join('; ')}`)
43+
if (dupes.length) {
44+
console.warn(
45+
chalk.yellow(`⚠ Duplicate objectIDs found and will be deduplicated: ${dupes.join('; ')}`),
46+
)
47+
}
48+
49+
const seen = new Set<string>()
50+
const validRecords: Record[] = []
4051

4152
for (const record of records) {
42-
assert(
43-
isString(record.objectID) && record.objectID.length,
44-
`objectID must be a string. received: ${record.objectID}, ${JSON.stringify(record)}`,
45-
)
53+
if (!isString(record.objectID) || !record.objectID.length) {
54+
console.warn(
55+
chalk.yellow(
56+
`⚠ Skipping record with invalid objectID: ${JSON.stringify({ objectID: record.objectID, title: record.title })}`,
57+
),
58+
)
59+
continue
60+
}
4661

47-
assert(
48-
isString(record.title) && record.title.length,
49-
`title must be a string. received: ${record.title}, ${JSON.stringify(record)}`,
50-
)
62+
if (!isString(record.title) || !record.title.length) {
63+
console.warn(chalk.yellow(`⚠ Skipping record with empty title: ${record.objectID}`))
64+
continue
65+
}
66+
67+
if (seen.has(record.objectID)) {
68+
continue
69+
}
70+
seen.add(record.objectID)
71+
72+
validRecords.push(record)
5173
}
5274

53-
return true
75+
return validRecords
5476
}
5577

5678
function countArrayValues(arr: string[]) {
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import { describe, test, expect } from 'vitest'
2+
3+
import {
4+
articleApiResponseToRecord,
5+
extractFromMarkdown,
6+
type ArticleApiResponse,
7+
} from '@/search/scripts/scrape/lib/build-records-from-api'
8+
9+
describe('articleApiResponseToRecord', () => {
10+
function makeApiResponse(overrides: Partial<ArticleApiResponse> = {}): ArticleApiResponse {
11+
return {
12+
meta: {
13+
title: 'Test Page',
14+
intro: 'An intro paragraph.',
15+
product: 'test-product',
16+
breadcrumbs: [
17+
{ href: '/en', title: 'Home' },
18+
{ href: '/en/test', title: 'Test' },
19+
{ href: '/en/test/page', title: 'Test Page' },
20+
],
21+
...overrides.meta,
22+
},
23+
body: overrides.body ?? '## Getting started\n\nSome content here.',
24+
}
25+
}
26+
27+
test('converts API response to a search record', () => {
28+
const record = articleApiResponseToRecord('/en/test/page', makeApiResponse())
29+
30+
expect(record.objectID).toBe('/en/test/page')
31+
expect(record.title).toBe('Test Page')
32+
expect(record.intro).toBe('An intro paragraph.')
33+
expect(record.breadcrumbs).toBe('Home / Test')
34+
expect(record.toplevel).toBe('Home')
35+
expect(record.headings).toContain('Getting started')
36+
expect(record.content).toContain('Some content here.')
37+
})
38+
39+
test('returns empty title when API response has empty title', () => {
40+
const record = articleApiResponseToRecord(
41+
'/en/test/page',
42+
makeApiResponse({ meta: { title: '', intro: '', product: 'test' } }),
43+
)
44+
45+
expect(record.title).toBe('')
46+
})
47+
48+
test('excludes navigational headings', () => {
49+
const record = articleApiResponseToRecord(
50+
'/en/test/page',
51+
makeApiResponse({
52+
body: '## In this article\n\n## Real heading\n\n## Further reading\n\nContent.',
53+
}),
54+
)
55+
56+
expect(record.headings).toBe('Real heading')
57+
})
58+
59+
test('handles missing breadcrumbs', () => {
60+
const response = makeApiResponse()
61+
response.meta.breadcrumbs = undefined
62+
const record = articleApiResponseToRecord('/en/test/page', response)
63+
64+
expect(record.breadcrumbs).toBe('')
65+
expect(record.toplevel).toBe('')
66+
})
67+
68+
test('prepends intro to content when not already present', () => {
69+
const record = articleApiResponseToRecord(
70+
'/en/test/page',
71+
makeApiResponse({
72+
meta: { title: 'T', intro: 'Unique intro.', product: 'p' },
73+
body: 'Body text only.',
74+
}),
75+
)
76+
77+
expect(record.content).toMatch(/^Unique intro\.\nBody text only\.$/)
78+
})
79+
80+
test('does not duplicate intro when body already contains it', () => {
81+
const record = articleApiResponseToRecord(
82+
'/en/test/page',
83+
makeApiResponse({
84+
meta: { title: 'T', intro: 'Body text', product: 'p' },
85+
body: 'Body text only.',
86+
}),
87+
)
88+
89+
const occurrences = record.content.split('Body text').length - 1
90+
expect(occurrences).toBe(1)
91+
})
92+
})
93+
94+
describe('extractFromMarkdown', () => {
95+
test('extracts h2 headings and plain text content', () => {
96+
const md = '## Heading One\n\nParagraph text.\n\n## Heading Two\n\nMore text.'
97+
const result = extractFromMarkdown(md)
98+
99+
expect(result.headings).toBe('Heading One\nHeading Two')
100+
expect(result.content).toContain('Paragraph text.')
101+
expect(result.content).toContain('More text.')
102+
})
103+
104+
test('skips ignored heading slugs', () => {
105+
const md = '## In this article\n\n## Prerequisites\n\n## Real heading'
106+
const result = extractFromMarkdown(md)
107+
108+
expect(result.headings).toBe('Real heading')
109+
})
110+
111+
test('skips ignored heading texts in non-English', () => {
112+
const md = '## 本文内容\n\n## 先决条件\n\n## 真正的标题'
113+
const result = extractFromMarkdown(md)
114+
115+
expect(result.headings).toBe('真正的标题')
116+
})
117+
118+
test('ignores h1 and h3+ headings', () => {
119+
const md = '# H1 Title\n\n## H2 Heading\n\n### H3 Subheading'
120+
const result = extractFromMarkdown(md)
121+
122+
expect(result.headings).toBe('H2 Heading')
123+
})
124+
125+
test('preserves code block text in content', () => {
126+
const md = '## Setup\n\n```bash\nssh_url=git@github.com\n```'
127+
const result = extractFromMarkdown(md)
128+
129+
expect(result.content).toContain('ssh_url=git@github.com')
130+
})
131+
})
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import { describe, test, expect, vi, beforeEach } from 'vitest'
2+
import fs from 'fs/promises'
3+
import fsSync from 'fs'
4+
5+
import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records'
6+
import type { Record } from '@/search/scripts/scrape/types'
7+
8+
vi.mock('fs/promises')
9+
vi.mock('fs')
10+
11+
function makeRecord(overrides: Partial<Record> = {}): Record {
12+
return {
13+
objectID: '/en/test-page',
14+
breadcrumbs: 'Test',
15+
title: 'Test Page',
16+
headings: '',
17+
content: 'Some content',
18+
intro: '',
19+
toplevel: 'Test',
20+
...overrides,
21+
}
22+
}
23+
24+
describe('writeIndexRecords', () => {
25+
beforeEach(() => {
26+
vi.clearAllMocks()
27+
vi.mocked(fsSync.existsSync).mockReturnValue(true)
28+
vi.mocked(fs.writeFile).mockResolvedValue()
29+
})
30+
31+
test('writes valid records to JSON file', async () => {
32+
const records = [makeRecord(), makeRecord({ objectID: '/en/other-page', title: 'Other Page' })]
33+
34+
const result = await writeIndexRecords('test-index', records, '/tmp/out')
35+
36+
expect(result).toBe('/tmp/out/test-index-records.json')
37+
expect(fs.writeFile).toHaveBeenCalledOnce()
38+
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
39+
const parsed = JSON.parse(writtenJson)
40+
expect(Object.keys(parsed)).toEqual(['/en/test-page', '/en/other-page'])
41+
})
42+
43+
test('filters out records with empty titles', async () => {
44+
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
45+
const records = [makeRecord(), makeRecord({ objectID: '/en/bad-page', title: '' })]
46+
47+
await writeIndexRecords('test-index', records, '/tmp/out')
48+
49+
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
50+
const parsed = JSON.parse(writtenJson)
51+
expect(Object.keys(parsed)).toEqual(['/en/test-page'])
52+
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('empty title'))
53+
warnSpy.mockRestore()
54+
})
55+
56+
test('filters out records with missing objectID', async () => {
57+
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
58+
const records = [makeRecord(), makeRecord({ objectID: '', title: 'No ID' })]
59+
60+
await writeIndexRecords('test-index', records, '/tmp/out')
61+
62+
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
63+
const parsed = JSON.parse(writtenJson)
64+
expect(Object.keys(parsed)).toEqual(['/en/test-page'])
65+
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('invalid objectID'))
66+
warnSpy.mockRestore()
67+
})
68+
69+
test('deduplicates records with the same objectID', async () => {
70+
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
71+
const records = [
72+
makeRecord({ title: 'First' }),
73+
makeRecord({ title: 'Duplicate' }), // same objectID
74+
]
75+
76+
await writeIndexRecords('test-index', records, '/tmp/out')
77+
78+
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
79+
const parsed = JSON.parse(writtenJson)
80+
expect(parsed['/en/test-page'].title).toBe('First')
81+
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('Duplicate objectIDs'))
82+
warnSpy.mockRestore()
83+
})
84+
85+
test('does not log full record content for invalid objectID', async () => {
86+
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
87+
const records = [
88+
makeRecord(),
89+
makeRecord({
90+
objectID: '',
91+
title: 'No ID',
92+
content: 'A very long content string that should not appear in logs',
93+
}),
94+
]
95+
96+
await writeIndexRecords('test-index', records, '/tmp/out')
97+
98+
const warnMessage = warnSpy.mock.calls[0][0] as string
99+
expect(warnMessage).not.toContain('very long content string')
100+
warnSpy.mockRestore()
101+
})
102+
103+
test('throws when name is empty', async () => {
104+
await expect(writeIndexRecords('', [makeRecord()], '/tmp/out')).rejects.toThrow(
105+
'`name` is required',
106+
)
107+
})
108+
109+
test('throws when records array is empty', async () => {
110+
await expect(writeIndexRecords('test-index', [], '/tmp/out')).rejects.toThrow(
111+
'`records` must be a non-empty array',
112+
)
113+
})
114+
115+
test('creates output directory if it does not exist', async () => {
116+
vi.mocked(fsSync.existsSync).mockReturnValue(false)
117+
vi.mocked(fs.mkdir).mockResolvedValue(undefined)
118+
119+
await writeIndexRecords('test-index', [makeRecord()], '/tmp/new-dir')
120+
121+
expect(fs.mkdir).toHaveBeenCalledWith('/tmp/new-dir', { recursive: true })
122+
})
123+
})

0 commit comments

Comments
 (0)