YouTube video search anomaly (#703)

kentcdodds · web-flow · commit 2777fe36b724 · 2026-02-24T22:04:06.000-07:00
diff --git a/app/utils/__tests__/semantic-search.server.test.ts b/app/utils/__tests__/semantic-search.server.test.ts
@@ -76,3 +76,87 @@ test('semanticSearchKCD routes user query embeddings through CLOUDFLARE_AI_GATEW
 		fetchSpy.mockRestore()
 	}
 })
+
+test('semanticSearchKCD canonicalizes YouTube results by video id from URL when slug is missing', async () => {
+	using ignoredEnv = setEnv({
+		CLOUDFLARE_ACCOUNT_ID: 'mock-account',
+		CLOUDFLARE_API_TOKEN: 'mock-token',
+		CLOUDFLARE_AI_GATEWAY_ID: 'runtime-search-gateway',
+		CLOUDFLARE_AI_GATEWAY_AUTH_TOKEN: 'mock-gateway-auth-token',
+		CLOUDFLARE_VECTORIZE_INDEX: 'mock-index',
+	})
+
+	const fetchSpy = vi.spyOn(globalThis, 'fetch').mockImplementation(async (input) => {
+		const url = input instanceof Request ? input.url : String(input)
+
+		if (url.includes('/workers-ai/')) {
+			return new Response(
+				JSON.stringify({
+					result: {
+						shape: [1, 3],
+						data: [[0.1, 0.2, 0.3]],
+					},
+				}),
+				{
+					status: 200,
+					headers: { 'Content-Type': 'application/json' },
+				},
+			)
+		}
+
+		if (url.includes('/vectorize/')) {
+			return new Response(
+				JSON.stringify({
+					result: {
+						count: 2,
+						matches: [
+							{
+								id: 'legacy-vector-a',
+								score: 0.9,
+								metadata: {
+									type: 'youtube',
+									title: 'Legacy video A',
+									url: '/youtube?video=AAA111BBB22',
+									snippet: 'First transcript chunk',
+									chunkKind: 'transcript',
+								},
+							},
+							{
+								id: 'legacy-vector-b',
+								score: 0.8,
+								metadata: {
+									type: 'youtube',
+									title: 'Legacy video B',
+									url: '/youtube?video=CCC333DDD44',
+									snippet: 'Second transcript chunk',
+									chunkKind: 'transcript',
+								},
+							},
+						],
+					},
+				}),
+				{
+					status: 200,
+					headers: { 'Content-Type': 'application/json' },
+				},
+			)
+		}
+
+		throw new Error(`Unexpected fetch URL in semantic search test: ${url}`)
+	})
+
+	try {
+		const results = await semanticSearchKCD({
+			query: 'legacy youtube',
+			topK: 5,
+		})
+
+		expect(results).toHaveLength(2)
+		expect(results.map((r) => r.id)).toEqual([
+			'youtube:aaa111bbb22',
+			'youtube:ccc333ddd44',
+		])
+	} finally {
+		fetchSpy.mockRestore()
+	}
+})
diff --git a/app/utils/semantic-search.server.ts b/app/utils/semantic-search.server.ts
@@ -161,6 +161,12 @@ function getCanonicalResultId({
 }) {
 	// The Vectorize index stores multiple chunk vectors per doc, so we need a
 	// canonical, doc-level identifier to collapse duplicates in query results.
+	if (type === 'youtube') {
+		if (slug) return `${type}:${normalizeSlugForKey(slug)}`
+		// Legacy vectors may omit `slug`; recover stable doc identity from URL.
+		const youtubeVideoId = parseYoutubeVideoIdFromUrl(url)
+		if (youtubeVideoId) return `${type}:${normalizeSlugForKey(youtubeVideoId)}`
+	}
 	if (type && slug) return `${type}:${normalizeSlugForKey(slug)}`
 	const fromVectorId = parseDocRefFromVectorId(vectorId)
 	if (fromVectorId) {
diff --git a/docs/agents/project-context.md b/docs/agents/project-context.md
@@ -39,6 +39,10 @@ reference:
   It's populated on first request or via `npm run prime-cache:mocks`.
 - Content is filesystem-based: blog posts are MDX files in `content/blog/`.
   Changes to content files are auto-detected by the dev server's file watcher.
+- Semantic search caveat: YouTube auto-captions can include cue-only chunks like
+  `[Music]`. The YouTube indexer filters these low-signal caption lines and
+  merges tiny trailing transcript chunks at ingest time, but vectors indexed
+  before that change can still show up until the next YouTube reindex.
 
 ## Cloud / headless manual testing
 
diff --git a/other/semantic-search/__tests__/youtube-transcript-chunking.test.ts b/other/semantic-search/__tests__/youtube-transcript-chunking.test.ts
@@ -0,0 +1,106 @@
+import { expect, test } from 'vitest'
+import {
+	chunkTranscriptEvents,
+	type TranscriptEvent,
+} from '../youtube-transcript-chunking.ts'
+
+test('chunkTranscriptEvents merges tiny trailing chunks into previous chunk', () => {
+	const events: Array<TranscriptEvent> = [
+		{ startMs: 0, durationMs: 1000, text: 'A'.repeat(40) },
+		{ startMs: 1200, durationMs: 600, text: 'tail' },
+	]
+
+	const chunks = chunkTranscriptEvents(events, {
+		targetChars: 40,
+		maxChunkChars: 80,
+		minChunkChars: 20,
+	})
+
+	expect(chunks).toHaveLength(1)
+	expect(chunks[0]?.body).toContain('A'.repeat(40))
+	expect(chunks[0]?.body).toContain('tail')
+	expect(chunks[0]?.startMs).toBe(0)
+	expect(chunks[0]?.endMs).toBe(1800)
+})
+
+test('chunkTranscriptEvents keeps a tiny chunk when it is the only chunk', () => {
+	const events: Array<TranscriptEvent> = [
+		{ startMs: 0, durationMs: 400, text: 'short' },
+	]
+
+	const chunks = chunkTranscriptEvents(events, {
+		targetChars: 40,
+		maxChunkChars: 80,
+		minChunkChars: 20,
+	})
+
+	expect(chunks).toHaveLength(1)
+	expect(chunks[0]?.body).toBe('short')
+})
+
+test('chunkTranscriptEvents keeps trailing chunks that meet minimum size', () => {
+	const events: Array<TranscriptEvent> = [
+		{ startMs: 0, durationMs: 800, text: 'A'.repeat(40) },
+		{ startMs: 1000, durationMs: 700, text: 'B'.repeat(25) },
+	]
+
+	const chunks = chunkTranscriptEvents(events, {
+		targetChars: 40,
+		maxChunkChars: 80,
+		minChunkChars: 20,
+	})
+
+	expect(chunks).toHaveLength(2)
+	expect(chunks[0]?.body).toBe('A'.repeat(40))
+	expect(chunks[1]?.body).toBe('B'.repeat(25))
+})
+
+test('chunkTranscriptEvents re-checks oversized lines after flush', () => {
+	const events: Array<TranscriptEvent> = [
+		{ startMs: 0, durationMs: 800, text: 'A'.repeat(40) },
+		{ startMs: 1000, durationMs: 700, text: 'B'.repeat(120) },
+	]
+
+	const chunks = chunkTranscriptEvents(events, {
+		targetChars: 50,
+		maxChunkChars: 80,
+		minChunkChars: 0,
+	})
+
+	expect(chunks).toHaveLength(4)
+	expect(chunks[0]?.body).toBe('A'.repeat(40))
+	expect(chunks.slice(1).every((chunk) => (chunk.body.length || 0) <= 50)).toBe(
+		true,
+	)
+})
+
+test('chunkTranscriptEvents guards non-positive targetChars', () => {
+	const events: Array<TranscriptEvent> = [
+		{ startMs: 0, durationMs: 500, text: 'A'.repeat(12) },
+	]
+
+	const chunks = chunkTranscriptEvents(events, {
+		targetChars: 0,
+		maxChunkChars: 8,
+		minChunkChars: 0,
+	})
+
+	expect(chunks.length).toBeGreaterThan(0)
+	expect(chunks.every((chunk) => chunk.body.length >= 1)).toBe(true)
+})
+
+test('chunkTranscriptEvents keeps event end at or after start for negative durations', () => {
+	const events: Array<TranscriptEvent> = [
+		{ startMs: 1000, durationMs: -900, text: 'negative duration' },
+	]
+
+	const chunks = chunkTranscriptEvents(events, {
+		targetChars: 40,
+		maxChunkChars: 80,
+		minChunkChars: 0,
+	})
+
+	expect(chunks).toHaveLength(1)
+	expect(chunks[0]?.startMs).toBe(1000)
+	expect(chunks[0]?.endMs).toBe(1000)
+})
diff --git a/other/semantic-search/__tests__/youtube-transcript-cue-filter.test.ts b/other/semantic-search/__tests__/youtube-transcript-cue-filter.test.ts
@@ -0,0 +1,27 @@
+import { expect, test } from 'vitest'
+import { isLowSignalYoutubeCaptionCueLine } from '../youtube-transcript-cue-filter.ts'
+
+test('isLowSignalYoutubeCaptionCueLine matches bracket-only cues', () => {
+	expect(isLowSignalYoutubeCaptionCueLine('[Music]')).toBe(true)
+	expect(isLowSignalYoutubeCaptionCueLine('[Applause] [Music]')).toBe(true)
+	expect(isLowSignalYoutubeCaptionCueLine('[Laughter], [Applause]')).toBe(true)
+})
+
+test('isLowSignalYoutubeCaptionCueLine matches bare cue words', () => {
+	expect(isLowSignalYoutubeCaptionCueLine('music')).toBe(true)
+	expect(isLowSignalYoutubeCaptionCueLine('Applause')).toBe(true)
+	expect(isLowSignalYoutubeCaptionCueLine('inaudible')).toBe(true)
+})
+
+test('isLowSignalYoutubeCaptionCueLine keeps meaningful transcript lines', () => {
+	expect(
+		isLowSignalYoutubeCaptionCueLine(
+			'Remix makes progressive enhancement easier for this workflow.',
+		),
+	).toBe(false)
+	expect(
+		isLowSignalYoutubeCaptionCueLine(
+			'[Music] and then we talk about build tools and testing.',
+		),
+	).toBe(false)
+})
diff --git a/other/semantic-search/index-youtube-playlist.ts b/other/semantic-search/index-youtube-playlist.ts
@@ -11,6 +11,11 @@ import {
 } from './cloudflare.ts'
 import { getSemanticSearchIgnoreList, isDocIdIgnored } from './ignore-list.ts'
 import { getJsonObject, putJsonObject } from './r2-manifest.ts'
+import {
+	chunkTranscriptEvents,
+	type TranscriptEvent,
+} from './youtube-transcript-chunking.ts'
+import { isLowSignalYoutubeCaptionCueLine } from './youtube-transcript-cue-filter.ts'
 
 type DocType = 'youtube'
 type TranscriptSource = 'manual' | 'auto' | 'none'
@@ -63,12 +68,6 @@ type VideoEnrichedData = {
 	transcriptSource: TranscriptSource
 }
 
-type TranscriptEvent = {
-	startMs: number
-	durationMs: number
-	text: string
-}
-
 type YoutubeChunkItem =
 	| { kind: 'meta'; body: string }
 	| {
@@ -844,6 +843,7 @@ async function fetchTranscriptFromTrack(track: CaptionTrack, label: string) {
 			.replace(/\u200B/g, '')
 			.trim()
 		if (!line) continue
+		if (isLowSignalYoutubeCaptionCueLine(line)) continue
 		lines.push(line)
 		transcriptEvents.push({ startMs, durationMs, text: line })
 	}
@@ -853,72 +853,6 @@ async function fetchTranscriptFromTrack(track: CaptionTrack, label: string) {
 	}
 }
 
-function chunkTranscriptEvents(
-	events: TranscriptEvent[],
-	{
-		targetChars = 3500,
-		maxChunkChars = 5500,
-	}: { targetChars?: number; maxChunkChars?: number } = {},
-) {
-	const sorted = [...events].sort((a, b) => a.startMs - b.startMs)
-	const chunks: Array<{
-		body: string
-		startMs: number
-		endMs: number
-	}> = []
-
-	let currentLines: string[] = []
-	let currentLen = 0
-	let startMs: number | null = null
-	let endMs = 0
-
-	const flush = () => {
-		if (!currentLines.length || startMs === null) return
-		const body = normalizeText(currentLines.join('\n'))
-		if (!body) return
-		chunks.push({ body, startMs, endMs })
-		currentLines = []
-		currentLen = 0
-		startMs = null
-		endMs = 0
-	}
-
-	for (const e of sorted) {
-		const line = normalizeText(e.text)
-		if (!line) continue
-
-		// If we don't have a current chunk and this line is huge, split it.
-		if (!currentLines.length && line.length > maxChunkChars) {
-			const eStartMs = Math.max(0, Math.floor(e.startMs))
-			const eEndMs = Math.max(
-				eStartMs,
-				Math.floor(e.startMs + (e.durationMs || 0)),
-			)
-			for (let i = 0; i < line.length; i += targetChars) {
-				const part = line.slice(i, i + targetChars)
-				const body = normalizeText(part)
-				if (!body) continue
-				chunks.push({ body, startMs: eStartMs, endMs: eEndMs })
-			}
-			continue
-		}
-
-		const nextLen = currentLen + (currentLines.length ? 1 : 0) + line.length
-		if (currentLines.length && nextLen > targetChars) {
-			flush()
-		}
-
-		if (startMs === null) startMs = Math.max(0, Math.floor(e.startMs))
-		const eventEnd = Math.max(0, Math.floor(e.startMs + (e.durationMs || 0)))
-		endMs = Math.max(endMs, eventEnd)
-		currentLines.push(line)
-		currentLen = currentLen + (currentLines.length > 1 ? 1 : 0) + line.length
-	}
-
-	flush()
-	return chunks
-}
-
 async function fetchVideoEnrichedData({
 	config,
 	videoId,
diff --git a/other/semantic-search/youtube-transcript-chunking.ts b/other/semantic-search/youtube-transcript-chunking.ts
diff --git a/other/semantic-search/youtube-transcript-cue-filter.ts b/other/semantic-search/youtube-transcript-cue-filter.ts