Skip to content

Commit 2777fe3

Browse files
authored
Youtube video search anomaly (#703)
1 parent 9a4589f commit 2777fe3

8 files changed

Lines changed: 387 additions & 72 deletions

app/utils/__tests__/semantic-search.server.test.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,87 @@ test('semanticSearchKCD routes user query embeddings through CLOUDFLARE_AI_GATEW
7676
fetchSpy.mockRestore()
7777
}
7878
})
79+
80+
// Regression: YouTube matches whose metadata has a `url` but no `slug` must
// still receive a doc-level id derived from the `video` query parameter.
test('semanticSearchKCD canonicalizes YouTube results by video id from URL when slug is missing', async () => {
  using ignoredEnv = setEnv({
    CLOUDFLARE_ACCOUNT_ID: 'mock-account',
    CLOUDFLARE_API_TOKEN: 'mock-token',
    CLOUDFLARE_AI_GATEWAY_ID: 'runtime-search-gateway',
    CLOUDFLARE_AI_GATEWAY_AUTH_TOKEN: 'mock-gateway-auth-token',
    CLOUDFLARE_VECTORIZE_INDEX: 'mock-index',
  })

  // Every mocked endpoint answers with a 200 JSON payload.
  const jsonResponse = (payload: unknown) =>
    new Response(JSON.stringify(payload), {
      status: 200,
      headers: { 'Content-Type': 'application/json' },
    })

  // Two matches for two different videos; neither carries a `slug`.
  const legacyMatches = [
    {
      id: 'legacy-vector-a',
      score: 0.9,
      metadata: {
        type: 'youtube',
        title: 'Legacy video A',
        url: '/youtube?video=AAA111BBB22',
        snippet: 'First transcript chunk',
        chunkKind: 'transcript',
      },
    },
    {
      id: 'legacy-vector-b',
      score: 0.8,
      metadata: {
        type: 'youtube',
        title: 'Legacy video B',
        url: '/youtube?video=CCC333DDD44',
        snippet: 'Second transcript chunk',
        chunkKind: 'transcript',
      },
    },
  ]

  const fetchSpy = vi
    .spyOn(globalThis, 'fetch')
    .mockImplementation(async (input) => {
      const url = input instanceof Request ? input.url : String(input)

      // Embedding request for the user query.
      if (url.includes('/workers-ai/')) {
        return jsonResponse({
          result: {
            shape: [1, 3],
            data: [[0.1, 0.2, 0.3]],
          },
        })
      }

      // Vector index query.
      if (url.includes('/vectorize/')) {
        return jsonResponse({
          result: {
            count: 2,
            matches: legacyMatches,
          },
        })
      }

      // Any other request means the test is no longer hermetic.
      throw new Error(`Unexpected fetch URL in semantic search test: ${url}`)
    })

  try {
    const results = await semanticSearchKCD({
      query: 'legacy youtube',
      topK: 5,
    })

    expect(results).toHaveLength(2)
    const canonicalIds = results.map((result) => result.id)
    expect(canonicalIds).toEqual(['youtube:aaa111bbb22', 'youtube:ccc333ddd44'])
  } finally {
    fetchSpy.mockRestore()
  }
})

app/utils/semantic-search.server.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,12 @@ function getCanonicalResultId({
161161
}) {
162162
// The Vectorize index stores multiple chunk vectors per doc, so we need a
163163
// canonical, doc-level identifier to collapse duplicates in query results.
164+
if (type === 'youtube') {
165+
if (slug) return `${type}:${normalizeSlugForKey(slug)}`
166+
// Legacy vectors may omit `slug`; recover stable doc identity from URL.
167+
const youtubeVideoId = parseYoutubeVideoIdFromUrl(url)
168+
if (youtubeVideoId) return `${type}:${normalizeSlugForKey(youtubeVideoId)}`
169+
}
164170
if (type && slug) return `${type}:${normalizeSlugForKey(slug)}`
165171
const fromVectorId = parseDocRefFromVectorId(vectorId)
166172
if (fromVectorId) {

docs/agents/project-context.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ reference:
3939
It's populated on first request or via `npm run prime-cache:mocks`.
4040
- Content is filesystem-based: blog posts are MDX files in `content/blog/`.
4141
Changes to content files are auto-detected by the dev server's file watcher.
42+
- Semantic search caveat: YouTube auto-captions can include cue-only chunks like
43+
`[Music]`. The YouTube indexer filters these low-signal caption lines and
44+
merges tiny trailing transcript chunks at ingest time, but old vectors can
45+
still linger until the next YouTube reindex.
4246

4347
## Cloud / headless manual testing
4448

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import { expect, test } from 'vitest'
2+
import {
3+
chunkTranscriptEvents,
4+
type TranscriptEvent,
5+
} from '../youtube-transcript-chunking.ts'
6+
7+
// A final chunk shorter than `minChunkChars` is folded into its predecessor,
// and the merged chunk spans both events' time ranges.
test('chunkTranscriptEvents merges tiny trailing chunks into previous chunk', () => {
  const longLine = 'A'.repeat(40)
  const transcript: Array<TranscriptEvent> = [
    { startMs: 0, durationMs: 1000, text: longLine },
    { startMs: 1200, durationMs: 600, text: 'tail' },
  ]

  const result = chunkTranscriptEvents(transcript, {
    targetChars: 40,
    maxChunkChars: 80,
    minChunkChars: 20,
  })

  expect(result).toHaveLength(1)
  const [merged] = result
  expect(merged?.body).toContain(longLine)
  expect(merged?.body).toContain('tail')
  expect(merged?.startMs).toBe(0)
  expect(merged?.endMs).toBe(1800)
})

// With nothing to merge into, an undersized chunk is still emitted.
test('chunkTranscriptEvents keeps a tiny chunk when it is the only chunk', () => {
  const transcript: Array<TranscriptEvent> = [
    { startMs: 0, durationMs: 400, text: 'short' },
  ]

  const result = chunkTranscriptEvents(transcript, {
    targetChars: 40,
    maxChunkChars: 80,
    minChunkChars: 20,
  })

  expect(result).toHaveLength(1)
  const [only] = result
  expect(only?.body).toBe('short')
})

// A trailing chunk at or above `minChunkChars` stays separate.
test('chunkTranscriptEvents keeps trailing chunks that meet minimum size', () => {
  const firstLine = 'A'.repeat(40)
  const secondLine = 'B'.repeat(25)
  const transcript: Array<TranscriptEvent> = [
    { startMs: 0, durationMs: 800, text: firstLine },
    { startMs: 1000, durationMs: 700, text: secondLine },
  ]

  const result = chunkTranscriptEvents(transcript, {
    targetChars: 40,
    maxChunkChars: 80,
    minChunkChars: 20,
  })

  expect(result).toHaveLength(2)
  const [head, trailing] = result
  expect(head?.body).toBe(firstLine)
  expect(trailing?.body).toBe(secondLine)
})

// A line longer than `maxChunkChars` that follows a pending chunk must still
// be split into pieces no longer than `targetChars`.
test('chunkTranscriptEvents re-checks oversized lines after flush', () => {
  const transcript: Array<TranscriptEvent> = [
    { startMs: 0, durationMs: 800, text: 'A'.repeat(40) },
    { startMs: 1000, durationMs: 700, text: 'B'.repeat(120) },
  ]

  const result = chunkTranscriptEvents(transcript, {
    targetChars: 50,
    maxChunkChars: 80,
    minChunkChars: 0,
  })

  expect(result).toHaveLength(4)
  const [head, ...pieces] = result
  expect(head?.body).toBe('A'.repeat(40))
  const allWithinTarget = pieces.every((piece) => piece.body.length <= 50)
  expect(allWithinTarget).toBe(true)
})

// `targetChars: 0` must neither hang nor produce empty chunk bodies.
test('chunkTranscriptEvents guards non-positive targetChars', () => {
  const transcript: Array<TranscriptEvent> = [
    { startMs: 0, durationMs: 500, text: 'A'.repeat(12) },
  ]

  const result = chunkTranscriptEvents(transcript, {
    targetChars: 0,
    maxChunkChars: 8,
    minChunkChars: 0,
  })

  expect(result.length).toBeGreaterThan(0)
  const hasEmptyBody = result.some((piece) => piece.body.length < 1)
  expect(!hasEmptyBody).toBe(true)
})

// A negative `durationMs` must not yield an end timestamp before the start.
test('chunkTranscriptEvents keeps event end at or after start for negative durations', () => {
  const transcript: Array<TranscriptEvent> = [
    { startMs: 1000, durationMs: -900, text: 'negative duration' },
  ]

  const result = chunkTranscriptEvents(transcript, {
    targetChars: 40,
    maxChunkChars: 80,
    minChunkChars: 0,
  })

  expect(result).toHaveLength(1)
  const [only] = result
  expect(only?.startMs).toBe(1000)
  expect(only?.endMs).toBe(1000)
})
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import { expect, test } from 'vitest'
2+
import { isLowSignalYoutubeCaptionCueLine } from '../youtube-transcript-cue-filter.ts'
3+
4+
// Lines made up solely of bracketed cues are treated as low-signal.
test('isLowSignalYoutubeCaptionCueLine matches bracket-only cues', () => {
  const bracketOnlyLines = [
    '[Music]',
    '[Applause] [Music]',
    '[Laughter], [Applause]',
  ]
  for (const line of bracketOnlyLines) {
    expect(isLowSignalYoutubeCaptionCueLine(line)).toBe(true)
  }
})

// Cue words without brackets are matched too, regardless of capitalization.
test('isLowSignalYoutubeCaptionCueLine matches bare cue words', () => {
  const bareCueWords = ['music', 'Applause', 'inaudible']
  for (const word of bareCueWords) {
    expect(isLowSignalYoutubeCaptionCueLine(word)).toBe(true)
  }
})

// Real speech is kept, even when the line starts with a bracketed cue.
test('isLowSignalYoutubeCaptionCueLine keeps meaningful transcript lines', () => {
  const meaningfulLines = [
    'Remix makes progressive enhancement easier for this workflow.',
    '[Music] and then we talk about build tools and testing.',
  ]
  for (const line of meaningfulLines) {
    expect(isLowSignalYoutubeCaptionCueLine(line)).toBe(false)
  }
})

other/semantic-search/index-youtube-playlist.ts

Lines changed: 6 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ import {
1111
} from './cloudflare.ts'
1212
import { getSemanticSearchIgnoreList, isDocIdIgnored } from './ignore-list.ts'
1313
import { getJsonObject, putJsonObject } from './r2-manifest.ts'
14+
import {
15+
chunkTranscriptEvents,
16+
type TranscriptEvent,
17+
} from './youtube-transcript-chunking.ts'
18+
import { isLowSignalYoutubeCaptionCueLine } from './youtube-transcript-cue-filter.ts'
1419

1520
type DocType = 'youtube'
1621
type TranscriptSource = 'manual' | 'auto' | 'none'
@@ -63,12 +68,6 @@ type VideoEnrichedData = {
6368
transcriptSource: TranscriptSource
6469
}
6570

66-
/** One caption line together with its timing, as collected from a caption track. */
type TranscriptEvent = {
  /** Start offset of the event; used as the start of the chunk it opens. */
  startMs: number
  /** Length of the event; added to `startMs` to compute the event's end. */
  durationMs: number
  /** Caption text for this event. */
  text: string
}
71-
7271
type YoutubeChunkItem =
7372
| { kind: 'meta'; body: string }
7473
| {
@@ -844,6 +843,7 @@ async function fetchTranscriptFromTrack(track: CaptionTrack, label: string) {
844843
.replace(/\u200B/g, '')
845844
.trim()
846845
if (!line) continue
846+
if (isLowSignalYoutubeCaptionCueLine(line)) continue
847847
lines.push(line)
848848
transcriptEvents.push({ startMs, durationMs, text: line })
849849
}
@@ -853,72 +853,6 @@ async function fetchTranscriptFromTrack(track: CaptionTrack, label: string) {
853853
}
854854
}
855855

856-
/**
 * Groups caption events into transcript chunks for indexing.
 *
 * Events are processed in ascending `startMs` order. Each event's text is
 * passed through `normalizeText`; events whose normalized text is empty are
 * skipped. Lines are accumulated (joined with `\n`) until adding the next line
 * would push the chunk past `targetChars`, at which point the pending chunk is
 * emitted. A single line longer than `maxChunkChars` is sliced into
 * fixed-size pieces that all share the originating event's time range.
 *
 * @param events Caption events to chunk; the array is not mutated.
 * @param options.targetChars Preferred chunk size and slice size for oversized
 *   lines. Values that are not a finite number >= 1 fall back to
 *   `maxChunkChars`, then to `1`, for slicing.
 * @param options.maxChunkChars Line length above which a single line is split.
 * @param options.minChunkChars When greater than zero, a final chunk shorter
 *   than this is merged into the preceding chunk, provided the merged body
 *   stays within `maxChunkChars`. Defaults to `0` (no merging).
 * @returns Chunks in emission order, each with `startMs <= endMs`.
 */
function chunkTranscriptEvents(
  events: TranscriptEvent[],
  {
    targetChars = 3500,
    maxChunkChars = 5500,
    minChunkChars = 0,
  }: {
    targetChars?: number
    maxChunkChars?: number
    minChunkChars?: number
  } = {},
) {
  // Slice width for oversized lines. It must be a positive integer, otherwise
  // the slicing loop below would never advance.
  const splitSize =
    [targetChars, maxChunkChars]
      .map((limit) => Math.floor(limit))
      .find((limit) => Number.isFinite(limit) && limit >= 1) ?? 1

  const sorted = [...events].sort((a, b) => a.startMs - b.startMs)
  const chunks: Array<{
    body: string
    startMs: number
    endMs: number
  }> = []

  let currentLines: string[] = []
  let currentLen = 0
  let startMs: number | null = null
  let endMs = 0

  // Emits the pending chunk (if any) and always resets the accumulator, so a
  // body that normalizes to empty cannot leak into the next chunk.
  const flush = () => {
    if (!currentLines.length || startMs === null) return
    const body = normalizeText(currentLines.join('\n'))
    if (body) chunks.push({ body, startMs, endMs })
    currentLines = []
    currentLen = 0
    startMs = null
    endMs = 0
  }

  for (const e of sorted) {
    const line = normalizeText(e.text)
    if (!line) continue

    const eventStartMs = Math.max(0, Math.floor(e.startMs))
    // Clamp to the event's own start so a negative duration cannot produce an
    // end timestamp that precedes the start.
    const eventEndMs = Math.max(
      eventStartMs,
      Math.floor(e.startMs + (e.durationMs || 0)),
    )

    // Flush first, so the oversized check below also applies to a line that
    // arrives while a chunk is pending.
    const nextLen = currentLen + (currentLines.length ? 1 : 0) + line.length
    if (currentLines.length && nextLen > targetChars) {
      flush()
    }

    // A line that is too large on its own is sliced into standalone chunks.
    if (!currentLines.length && line.length > maxChunkChars) {
      for (let i = 0; i < line.length; i += splitSize) {
        const body = normalizeText(line.slice(i, i + splitSize))
        if (!body) continue
        chunks.push({ body, startMs: eventStartMs, endMs: eventEndMs })
      }
      continue
    }

    if (startMs === null) startMs = eventStartMs
    endMs = Math.max(endMs, eventEndMs)
    // One extra character accounts for the `\n` separator between lines.
    currentLen += (currentLines.length ? 1 : 0) + line.length
    currentLines.push(line)
  }

  flush()

  // Optionally fold an undersized final chunk into its predecessor.
  if (minChunkChars > 0 && chunks.length > 1) {
    const previous = chunks[chunks.length - 2]
    const last = chunks[chunks.length - 1]
    if (previous && last && last.body.length < minChunkChars) {
      const mergedBody = normalizeText(`${previous.body}\n${last.body}`)
      if (mergedBody && mergedBody.length <= maxChunkChars) {
        chunks.splice(-2, 2, {
          body: mergedBody,
          startMs: Math.min(previous.startMs, last.startMs),
          endMs: Math.max(previous.endMs, last.endMs),
        })
      }
    }
  }

  return chunks
}
921-
922856
async function fetchVideoEnrichedData({
923857
config,
924858
videoId,

0 commit comments

Comments
 (0)