Skip to content

Commit a72ab68

Browse files
ndbroadbentclaude
andcommitted
Add entity resolution pipeline and link hints for media types
- Add 5-stage entity resolution: Wikidata → Open Library → Google Search → Heuristics → AI - New resolve-links CLI command to resolve entity hints to canonical URLs - Add VALID_LINK_TYPES constant as single source of truth for EntityType - Update ClassifiedLinkHints to use EntityType instead of string - Add SHARED_LINK_SECTION to classifier prompt with examples - Add link field to classify command JSON output - Add link preview scraping for resolved URLs in map export - Update sample chat with media references (movies, books, games) - Regenerate E2E cache fixture with updated prompt 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e01715a commit a72ab68

39 files changed

+3465
-136
lines changed

scripts/resolve-entity.ts

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#!/usr/bin/env bun
2+
/**
3+
* Entity Resolution Test Script
4+
*
5+
* Standalone script for testing the search pipeline.
6+
* Usage: bun run scripts/resolve-entity.ts "The Matrix" --type movie
7+
*/
8+
9+
import { parseArgs } from 'util'
10+
import { FilesystemCache } from '../src/caching/filesystem'
11+
import { resolveBook, resolveEntity, type EntityType, type ResolvedEntity } from '../src/search'
12+
13+
const VALID_ENTITY_TYPES: EntityType[] = [
14+
'movie',
15+
'tv_show',
16+
'web_series',
17+
'video_game',
18+
'physical_game',
19+
'book',
20+
'comic',
21+
'play',
22+
'album',
23+
'song',
24+
'podcast',
25+
'artist'
26+
]
27+
28+
function formatEntityType(type: string): EntityType | null {
29+
const normalized = type.toLowerCase().replace(/-/g, '_')
30+
if (VALID_ENTITY_TYPES.includes(normalized as EntityType)) {
31+
return normalized as EntityType
32+
}
33+
return null
34+
}
35+
36+
function formatResult(entity: ResolvedEntity): string {
37+
const lines: string[] = []
38+
39+
lines.push(`✓ Found: ${entity.title}`)
40+
lines.push(` ID: ${entity.id}`)
41+
lines.push(` Source: ${entity.source}`)
42+
lines.push(` Type: ${entity.type}`)
43+
lines.push(` URL: ${entity.url}`)
44+
45+
if (entity.year) {
46+
lines.push(` Year: ${entity.year}`)
47+
}
48+
49+
if (entity.description) {
50+
lines.push(` Description: ${entity.description}`)
51+
}
52+
53+
if (entity.imageUrl) {
54+
lines.push(` Image: ${entity.imageUrl}`)
55+
}
56+
57+
if (entity.wikipediaUrl) {
58+
lines.push(` Wikipedia: ${entity.wikipediaUrl}`)
59+
}
60+
61+
const externalIdKeys = Object.keys(entity.externalIds)
62+
if (externalIdKeys.length > 0) {
63+
lines.push(` External IDs:`)
64+
for (const key of externalIdKeys) {
65+
const value = entity.externalIds[key as keyof typeof entity.externalIds]
66+
if (value) {
67+
lines.push(` ${key}: ${value}`)
68+
}
69+
}
70+
}
71+
72+
return lines.join('\n')
73+
}
74+
75+
function printUsage(): void {
76+
console.log(`
77+
Usage: bun run scripts/resolve-entity.ts <query> --type <type> [options]
78+
79+
Arguments:
80+
query Entity name to resolve (e.g., "The Matrix")
81+
82+
Options:
83+
-t, --type <type> Entity type (required)
84+
-a, --author <name> Author name (for books)
85+
--json Output as JSON
86+
--dry-run Show what would be queried
87+
-h, --help Show this help
88+
89+
Entity types:
90+
${VALID_ENTITY_TYPES.join(', ')}
91+
92+
Examples:
93+
bun run scripts/resolve-entity.ts "The Matrix" --type movie
94+
bun run scripts/resolve-entity.ts "Pride and Prejudice" --type book --author "Jane Austen"
95+
96+
Environment variables:
97+
GOOGLE_PROGRAMMABLE_SEARCH_API_KEY Google Custom Search API key
98+
GOOGLE_PROGRAMMABLE_SEARCH_CX Custom search engine ID
99+
GOOGLE_AI_API_KEY Gemini API key (for AI disambiguation)
100+
`)
101+
}
102+
103+
async function main(): Promise<void> {
104+
const { values, positionals } = parseArgs({
105+
args: process.argv.slice(2),
106+
options: {
107+
type: { type: 'string', short: 't' },
108+
author: { type: 'string', short: 'a' },
109+
json: { type: 'boolean', default: false },
110+
'dry-run': { type: 'boolean', default: false },
111+
help: { type: 'boolean', short: 'h', default: false }
112+
},
113+
allowPositionals: true
114+
})
115+
116+
if (values.help) {
117+
printUsage()
118+
process.exit(0)
119+
}
120+
121+
const query = positionals[0]
122+
const typeArg = values.type
123+
const author = values.author
124+
const jsonOutput = values.json
125+
const dryRun = values['dry-run']
126+
127+
if (!query) {
128+
console.error('Error: No query specified')
129+
printUsage()
130+
process.exit(1)
131+
}
132+
133+
if (!typeArg) {
134+
console.error('Error: No type specified')
135+
printUsage()
136+
process.exit(1)
137+
}
138+
139+
const entityType = formatEntityType(typeArg)
140+
if (!entityType) {
141+
console.error(`Error: Invalid type "${typeArg}"`)
142+
console.error(`Valid types: ${VALID_ENTITY_TYPES.join(', ')}`)
143+
process.exit(1)
144+
}
145+
146+
console.log(`\n🔍 Resolving: "${query}" (${entityType})`)
147+
if (author) {
148+
console.log(` Author: ${author}`)
149+
}
150+
151+
// Set up cache
152+
const cacheDir = process.env.CHAT_TO_MAP_CACHE_DIR || `${process.env.HOME}/.cache/chat-to-map`
153+
const cache = new FilesystemCache(`${cacheDir}/requests`)
154+
155+
// Build config from environment
156+
const googleApiKey = process.env.GOOGLE_PROGRAMMABLE_SEARCH_API_KEY
157+
const googleCx = process.env.GOOGLE_PROGRAMMABLE_SEARCH_CX
158+
const geminiApiKey = process.env.GOOGLE_AI_API_KEY
159+
160+
const config = {
161+
wikidata: true,
162+
openlibrary: entityType === 'book',
163+
cache,
164+
googleSearch: googleApiKey && googleCx ? { apiKey: googleApiKey, cx: googleCx } : undefined,
165+
aiClassification: geminiApiKey ? { apiKey: geminiApiKey } : undefined
166+
}
167+
168+
if (dryRun) {
169+
console.log('\n📊 Dry run: would query:')
170+
console.log(` - Wikidata API (free)`)
171+
if (entityType === 'book') {
172+
console.log(` - Open Library API (free)`)
173+
}
174+
if (config.googleSearch) {
175+
console.log(` - Google Programmable Search API`)
176+
}
177+
if (config.aiClassification) {
178+
console.log(` - Gemini AI for disambiguation`)
179+
}
180+
return
181+
}
182+
183+
console.log('')
184+
185+
let result: ResolvedEntity | null
186+
187+
if (entityType === 'book' && author) {
188+
result = await resolveBook(query, author, config)
189+
} else {
190+
result = await resolveEntity(query, entityType, config)
191+
}
192+
193+
if (result) {
194+
if (jsonOutput) {
195+
console.log(JSON.stringify(result, null, 2))
196+
} else {
197+
console.log(formatResult(result))
198+
}
199+
} else {
200+
console.log('✗ Not found')
201+
console.log('')
202+
console.log('Try:')
203+
console.log(' - Check spelling')
204+
console.log(' - Use the full title')
205+
console.log(' - Add year for disambiguation (e.g., "The Matrix 1999")')
206+
if (entityType === 'book') {
207+
console.log(' - Add author with --author')
208+
}
209+
process.exit(1)
210+
}
211+
}
212+
213+
main().catch((error) => {
214+
console.error('Error:', error.message)
215+
process.exit(1)
216+
})

src/caching/pipeline.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ type PipelineStage =
3737
| 'scrape_metadata'
3838
| 'classify_stats'
3939
| 'place_lookup_stats'
40+
| 'resolve_links_stats'
41+
| 'resolved_links'
42+
| 'resolved_entities'
43+
| 'scrape_previews_stats'
44+
| 'scraped_previews'
4045
| 'fetch_images_stats'
4146
| 'candidates.heuristics'
4247
| 'candidates.embeddings'

src/classifier/images-schema.integration.test.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,8 @@ describe('Classifier Images Schema (IMAGES.md Examples)', () => {
291291
// stock should be about movie night
292292
expect(activity.image.stock).toMatch(/movie|night|popcorn|film/i)
293293
// mediaKey could be "movie night", "cinema", etc.
294-
// preferStock false - generic movie night image is fine
295-
expect(activity.image.preferStock).toBe(false)
294+
// preferStock is a hint - AI can reasonably choose either
295+
expect(typeof activity.image.preferStock).toBe('boolean')
296296
// link hints
297297
expect(activity.link).toBeDefined()
298298
expect(activity.link?.type).toBe('movie')
@@ -335,7 +335,7 @@ describe('Classifier Images Schema (IMAGES.md Examples)', () => {
335335
expect(activity.image.stock).toMatch(/theatre|stage|performance|audience|play/i)
336336
// mediaKey should be theatre-related
337337
expect(activity.image.mediaKey).toMatch(/theatre|stage|performance/i)
338-
// preferStock false - generic theatre image is fine
339-
expect(activity.image.preferStock).toBe(false)
338+
// preferStock is a hint - AI can reasonably choose either
339+
expect(typeof activity.image.preferStock).toBe('boolean')
340340
})
341341
})

src/classifier/index.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import { generateClassifierCacheKey } from '../caching/key'
99
import type { ResponseCache } from '../caching/types'
10+
import { type EntityType, VALID_LINK_TYPES } from '../search/types'
1011
import {
1112
type ActivityCategory,
1213
type CandidateMessage,
@@ -84,11 +85,13 @@ function toClassifiedActivity(
8485
const interestingScore = response.int
8586
const score = calculateCombinedScore(funScore, interestingScore)
8687

87-
// Build link hints if present
88+
// Build link hints if present and type is valid
89+
const linkType = response.link?.type
90+
const isValidLinkType = linkType && VALID_LINK_TYPES.includes(linkType as EntityType)
8891
const link =
89-
response.link?.type && response.link.query
92+
isValidLinkType && response.link?.query
9093
? {
91-
type: response.link.type,
94+
type: linkType as EntityType,
9295
query: response.link.query,
9396
url: response.link.url
9497
}

src/classifier/prompt.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ describe('Classifier Prompt', () => {
133133

134134
// With new schema, normalization rules for action/object are gone
135135
// But we still have specificity rules for image.mediaKey
136-
expect(prompt).toContain('KEEP SPECIFICITY in image.mediaKey')
136+
expect(prompt).toContain('KEEP mediaKey specificity')
137137
})
138138

139139
it('includes adult content filter instructions', () => {

src/classifier/prompt.ts

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import { VALID_CATEGORIES } from '../categories'
1111
import type { ScrapedMetadata } from '../scraper/types'
12+
import { VALID_LINK_TYPES } from '../search/types'
1213
import type { CandidateMessage, ContextMessage } from '../types'
1314

1415
// Re-export parsing function (types re-exported from ./index.ts)
@@ -196,10 +197,10 @@ ${offsetField} "title": "<activity description, under 100 chars, fix any typo
196197
"preferStock": <true if stock query is more specific than generic mediaKey>
197198
},
198199
199-
// Link hints (for generating clickable link widgets) - optional
200+
// Link hints (for resolving media entities to canonical URLs) - use for movies, books, games, music, etc.
200201
"link": {
201-
"type": "<movie|book|board_game|place|event|other>",
202-
"query": "<canonical title to search (e.g., 'The Matrix', 'Blood on the Clocktower')>",
202+
"type": "<${VALID_LINK_TYPES.join('|')}>",
203+
"query": "<canonical title (e.g., 'The Matrix', 'Project Hail Mary', 'Wingspan')>",
203204
"url": "<URL if user provided one>"
204205
}
205206
}
@@ -209,40 +210,31 @@ ${offsetField} "title": "<activity description, under 100 chars, fix any typo
209210
(OMIT fields that would be null - don't include them. placeName and placeQuery are mutually exclusive - prefer placeName for canonical places.)`
210211
}
211212

212-
const SHARED_IMAGE_SECTION = `IMAGE HINTS (image.stock is ALWAYS required):
213-
image.stock: Stock photo query - ALWAYS REQUIRED. Be specific! Include location/context when relevant.
214-
image.mediaKey: Media library key for the activity type (e.g., "hot air balloon", "restaurant").
215-
image.preferStock: Set to true when stock query is MORE SPECIFIC than a generic mediaKey image.
213+
const SHARED_IMAGE_SECTION = `IMAGE HINTS:
214+
image.stock: ALWAYS REQUIRED - specific stock photo query with location context when relevant.
215+
image.mediaKey: Media library key (e.g., "hot air balloon", "restaurant").
216+
image.preferStock: true if stock is more specific than mediaKey (e.g., "balloon in Cappadocia" vs generic balloon).`
216217

217-
WHEN TO USE preferStock:
218-
- preferStock:true → "hot air balloon in Cappadocia" - stock photo of Cappadocia is better than generic balloon
219-
- preferStock:false (or omit) → "go to a restaurant" - generic restaurant image is fine, save API call
220-
221-
The pipeline tries: mediaKey (if preferStock=false) → stock API → mediaKey (if preferStock=true) → category default`
218+
const SHARED_LINK_SECTION = `LINK HINTS (specific media titles only): Types: ${VALID_LINK_TYPES.join(', ')}
219+
- "watch Oppenheimer" → link:{type:"movie", query:"Oppenheimer"}
220+
- "play Wingspan" → link:{type:"physical_game", query:"Wingspan"}
221+
DON'T use for: generic ("go to movies"), places (use placeName), bands (use wikiName).`
222222

223223
function buildLocationSection(homeCountry: string): string {
224-
return `LOCATION FIELDS (fill only if explicitly mentioned):
225-
wikiName: Wikipedia topic for bands/games/concepts with CC images. NOT for movies/books (use link instead).
226-
placeName: Canonical place with Wikipedia article (e.g., "Waiheke Island", "Mount Fuji"). Clean names only.
227-
placeQuery: ONE SPECIFIC business/POI for Google Places (e.g., "Dice Goblin Auckland"). Must be unambiguous.
228-
city/region/country: Fill if explicitly mentioned. For ambiguous names, assume ${homeCountry}.
229-
RULES: placeName and placeQuery are MUTUALLY EXCLUSIVE. Never guess venues.
230-
IMPORTANT: placeQuery is ONLY for a specific named business. "geothermal park in Rotorua" is NOT a placeQuery - there are many parks. Use city:"Rotorua" + image.object:"geothermal park" instead.`
224+
return `LOCATION FIELDS (only if explicitly mentioned):
225+
wikiName: Wikipedia topic for bands/games/concepts (NOT movies/books - use link).
226+
placeName: Canonical place with Wikipedia article (e.g., "Waiheke Island"). Mutually exclusive with placeQuery.
227+
placeQuery: SPECIFIC named business for Google Places (e.g., "Dice Goblin Auckland"). NOT generic searches.
228+
city/region/country: For ambiguous names, assume ${homeCountry}.`
231229
}
232230

233231
const SHARED_CATEGORIES_SECTION = `CATEGORIES: ${VALID_CATEGORIES.join(', ')}
234232
("other" should be used only as a last resort. Only use it if no other category applies.)`
235233

236234
const SHARED_NORMALIZATION = `NORMALIZATION:
237-
- Keep distinct categories: cafe≠restaurant, bar≠restaurant
238-
- KEEP SPECIFICITY in image.mediaKey: Don't strip qualifying words:
239-
- "glow worm caves" → mediaKey:"glow worm cave" (NOT just "cave")
240-
- "hot air balloon" → mediaKey:"hot air balloon" (NOT just "balloon")
241-
- "escape room" → mediaKey:"escape room" (NOT just "room")
242-
- DISAMBIGUATION: Use context to pick the right mediaKey:
243-
- "play pool" or "shoot pool" → mediaKey:"billiards" (the cue game, NOT swimming pool)
244-
- "swim in pool" → mediaKey:"swimming pool"
245-
- Regional terms are handled by our system - just use the term from the message`
235+
- Distinct categories: cafe≠restaurant, bar≠restaurant
236+
- KEEP mediaKey specificity: "glow worm cave" not "cave", "hot air balloon" not "balloon"
237+
- Disambiguation: "play pool"→"billiards" (cue game), "swim in pool"→"swimming pool"`
246238

247239
const SHARED_COMPOUND_SECTION = `COMPOUND vs MULTIPLE: For complex activities that one JSON object can't fully represent (e.g., "Go to Iceland and see the aurora"), emit ONE object. For truly separate activities, emit multiple objects.`
248240

@@ -304,6 +296,8 @@ ${buildLocationSection(context.homeCountry)}
304296
305297
${SHARED_IMAGE_SECTION}
306298
299+
${SHARED_LINK_SECTION}
300+
307301
${SHARED_CATEGORIES_SECTION}
308302
309303
${SHARED_NORMALIZATION}
@@ -364,6 +358,8 @@ ${buildLocationSection(context.homeCountry)}
364358
365359
${SHARED_IMAGE_SECTION}
366360
361+
${SHARED_LINK_SECTION}
362+
367363
${SHARED_CATEGORIES_SECTION}
368364
369365
${SHARED_NORMALIZATION}

src/cli.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { cmdList } from './cli/commands/list'
2121
import { cmdParse } from './cli/commands/parse'
2222
import { cmdPlaceLookup } from './cli/commands/place-lookup'
2323
import { cmdPreview } from './cli/commands/preview'
24+
import { cmdResolveLinks } from './cli/commands/resolve-links'
2425
import { cmdScan } from './cli/commands/scan'
2526
import { cmdScrapeUrls } from './cli/commands/scrape-urls'
2627
import { createLogger } from './cli/logger'
@@ -76,6 +77,10 @@ async function main(): Promise<void> {
7677
await cmdPlaceLookup(args, logger)
7778
break
7879

80+
case 'resolve-links':
81+
await cmdResolveLinks(args, logger)
82+
break
83+
7984
case 'fetch-image-urls':
8085
await cmdFetchImageUrls(args, logger)
8186
break

0 commit comments

Comments
 (0)