DocSpring
diff --git a/scripts/resolve-entity.ts b/scripts/resolve-entity.ts
Lines changed: 216 additions & 0 deletions
diff --git a/src/caching/pipeline.ts b/src/caching/pipeline.ts
Lines changed: 5 additions & 0 deletions
diff --git a/src/classifier/images-schema.integration.test.ts b/src/classifier/images-schema.integration.test.ts
Lines changed: 4 additions & 4 deletions
diff --git a/src/classifier/index.ts b/src/classifier/index.ts
Lines changed: 6 additions & 3 deletions
diff --git a/src/classifier/prompt.test.ts b/src/classifier/prompt.test.ts
Lines changed: 1 addition & 1 deletion
diff --git a/src/classifier/prompt.ts b/src/classifier/prompt.ts
Lines changed: 24 additions & 28 deletions
diff --git a/src/cli.ts b/src/cli.ts
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,216 @@
+#!/usr/bin/env bun
+/**
+ * Entity Resolution Test Script
+ *
+ * Standalone script for testing the search pipeline.
+ * Usage: bun run scripts/resolve-entity.ts "The Matrix" --type movie
+ */
+
+import { homedir } from 'os'
+import { join } from 'path'
+import { parseArgs } from 'util'
+import { FilesystemCache } from '../src/caching/filesystem'
+import { resolveBook, resolveEntity, type EntityType, type ResolvedEntity } from '../src/search'
+
+/**
+ * Entity types accepted by the `--type` flag.
+ *
+ * The order is the order in which the types are listed in the usage text
+ * and in the "Valid types" error message.
+ */
+const VALID_ENTITY_TYPES: EntityType[] = [
+  // Screen
+  'movie', 'tv_show', 'web_series',
+  // Games
+  'video_game', 'physical_game',
+  // Print and stage
+  'book', 'comic', 'play',
+  // Audio
+  'album', 'song', 'podcast', 'artist'
+]
+
+/**
+ * Normalizes a user-supplied type string and validates it against
+ * VALID_ENTITY_TYPES.
+ *
+ * Normalization trims surrounding whitespace, lower-cases the value, and
+ * collapses each run of hyphens and/or whitespace into a single underscore,
+ * so "TV-Show", "tv show" and " tv_show " all resolve to "tv_show".
+ *
+ * @param type - Raw value, e.g. the argument of the `--type` flag.
+ * @returns The matching entity type, or `null` when the value is not recognized.
+ */
+function formatEntityType(type: string): EntityType | null {
+  const normalized = type
+    .trim()
+    .toLowerCase()
+    .replace(/[-\s]+/g, '_')
+
+  // `find` hands back the typed list element itself, so no `as EntityType` cast is needed.
+  const match = VALID_ENTITY_TYPES.find((candidate) => candidate === normalized)
+  return match ?? null
+}
+
+/**
+ * Renders a resolved entity as an indented, human-readable multi-line report.
+ *
+ * Title, ID, source, type and URL are always printed. Year, description,
+ * image URL and Wikipedia URL are printed only when truthy. External IDs are
+ * listed under an "External IDs:" header, which is emitted only when at least
+ * one ID has a truthy value (so the header is never left dangling).
+ *
+ * @param entity - The entity returned by the resolver.
+ * @returns The report as a single string with lines joined by "\n".
+ */
+function formatResult(entity: ResolvedEntity): string {
+  const lines: string[] = [
+    `✓ Found: ${entity.title}`,
+    `  ID: ${entity.id}`,
+    `  Source: ${entity.source}`,
+    `  Type: ${entity.type}`,
+    `  URL: ${entity.url}`
+  ]
+
+  if (entity.year) {
+    lines.push(`  Year: ${entity.year}`)
+  }
+
+  if (entity.description) {
+    lines.push(`  Description: ${entity.description}`)
+  }
+
+  if (entity.imageUrl) {
+    lines.push(`  Image: ${entity.imageUrl}`)
+  }
+
+  if (entity.wikipediaUrl) {
+    lines.push(`  Wikipedia: ${entity.wikipediaUrl}`)
+  }
+
+  // Drop empty IDs before deciding whether to print the section header.
+  const externalIds = Object.entries(entity.externalIds).filter(([, value]) => Boolean(value))
+  if (externalIds.length > 0) {
+    lines.push(`  External IDs:`)
+    for (const [key, value] of externalIds) {
+      lines.push(`    ${key}: ${value}`)
+    }
+  }
+
+  return lines.join('\n')
+}
+
+/**
+ * Prints the command-line help to stdout: usage line, arguments, options,
+ * the list of valid entity types, examples, and every environment variable
+ * the script reads (including the cache directory override).
+ */
+function printUsage(): void {
+  console.log(`
+Usage: bun run scripts/resolve-entity.ts <query> --type <type> [options]
+
+Arguments:
+  query                Entity name to resolve (e.g., "The Matrix")
+
+Options:
+  -t, --type <type>    Entity type (required)
+  -a, --author <name>  Author name (for books)
+  --json               Output as JSON
+  --dry-run            Show what would be queried
+  -h, --help           Show this help
+
+Entity types:
+  ${VALID_ENTITY_TYPES.join(', ')}
+
+Examples:
+  bun run scripts/resolve-entity.ts "The Matrix" --type movie
+  bun run scripts/resolve-entity.ts "Pride and Prejudice" --type book --author "Jane Austen"
+
+Environment variables:
+  GOOGLE_PROGRAMMABLE_SEARCH_API_KEY  Google Custom Search API key
+  GOOGLE_PROGRAMMABLE_SEARCH_CX       Custom search engine ID
+  GOOGLE_AI_API_KEY                   Gemini API key (for AI disambiguation)
+  CHAT_TO_MAP_CACHE_DIR               Cache directory (default: ~/.cache/chat-to-map)
+`)
+}
+
+/**
+ * CLI entry point.
+ *
+ * Parses the command line, validates the query and entity type, builds the
+ * resolver configuration from environment variables, and then either prints
+ * which services would be queried (`--dry-run`) or resolves the entity and
+ * prints it as text or JSON.
+ *
+ * Exit codes: 0 on `--help`; 1 when the query or type is missing, the type is
+ * invalid, or nothing was found. A successful run or a dry run returns normally.
+ *
+ * With `--json`, stdout carries only the JSON document; all human-readable
+ * status output is written to stderr so the result can be piped to other tools.
+ */
+async function main(): Promise<void> {
+  const { values, positionals } = parseArgs({
+    args: process.argv.slice(2),
+    options: {
+      type: { type: 'string', short: 't' },
+      author: { type: 'string', short: 'a' },
+      json: { type: 'boolean', default: false },
+      'dry-run': { type: 'boolean', default: false },
+      help: { type: 'boolean', short: 'h', default: false }
+    },
+    allowPositionals: true
+  })
+
+  if (values.help) {
+    printUsage()
+    process.exit(0)
+  }
+
+  const query = positionals[0]
+  const typeArg = values.type
+  const author = values.author
+  const jsonOutput = values.json
+  const dryRun = values['dry-run']
+
+  if (!query) {
+    console.error('Error: No query specified')
+    printUsage()
+    process.exit(1)
+  }
+
+  if (!typeArg) {
+    console.error('Error: No type specified')
+    printUsage()
+    process.exit(1)
+  }
+
+  const entityType = formatEntityType(typeArg)
+  if (!entityType) {
+    console.error(`Error: Invalid type "${typeArg}"`)
+    console.error(`Valid types: ${VALID_ENTITY_TYPES.join(', ')}`)
+    process.exit(1)
+  }
+
+  // Status output: stdout normally, stderr in --json mode so that stdout
+  // stays a single parseable JSON document.
+  const info = (message: string): void => {
+    if (jsonOutput) {
+      console.error(message)
+    } else {
+      console.log(message)
+    }
+  }
+
+  info(`\n🔍 Resolving: "${query}" (${entityType})`)
+  if (author) {
+    info(`   Author: ${author}`)
+  }
+
+  // Set up cache. homedir() is used instead of $HOME directly so the default
+  // does not become "undefined/.cache/..." when HOME is not set.
+  const cacheDir = process.env.CHAT_TO_MAP_CACHE_DIR || join(homedir(), '.cache', 'chat-to-map')
+  const cache = new FilesystemCache(join(cacheDir, 'requests'))
+
+  // Build config from environment
+  const googleApiKey = process.env.GOOGLE_PROGRAMMABLE_SEARCH_API_KEY
+  const googleCx = process.env.GOOGLE_PROGRAMMABLE_SEARCH_CX
+  const geminiApiKey = process.env.GOOGLE_AI_API_KEY
+
+  const config = {
+    wikidata: true,
+    openlibrary: entityType === 'book',
+    cache,
+    // Google search needs both the API key and the engine ID.
+    googleSearch: googleApiKey && googleCx ? { apiKey: googleApiKey, cx: googleCx } : undefined,
+    aiClassification: geminiApiKey ? { apiKey: geminiApiKey } : undefined
+  }
+
+  if (dryRun) {
+    info('\n📊 Dry run: would query:')
+    info(`   - Wikidata API (free)`)
+    if (entityType === 'book') {
+      info(`   - Open Library API (free)`)
+    }
+    if (config.googleSearch) {
+      info(`   - Google Programmable Search API`)
+    }
+    if (config.aiClassification) {
+      info(`   - Gemini AI for disambiguation`)
+    }
+    return
+  }
+
+  info('')
+
+  let result: ResolvedEntity | null
+
+  // The book-specific resolver is only used when an author was supplied.
+  if (entityType === 'book' && author) {
+    result = await resolveBook(query, author, config)
+  } else {
+    result = await resolveEntity(query, entityType, config)
+  }
+
+  if (result) {
+    if (jsonOutput) {
+      console.log(JSON.stringify(result, null, 2))
+    } else {
+      console.log(formatResult(result))
+    }
+  } else {
+    info('✗ Not found')
+    info('')
+    info('Try:')
+    info('  - Check spelling')
+    info('  - Use the full title')
+    info('  - Add year for disambiguation (e.g., "The Matrix 1999")')
+    if (entityType === 'book') {
+      info('  - Add author with --author')
+    }
+    process.exit(1)
+  }
+}
+
+// Run the CLI; any uncaught failure is reported on stderr and exits with status 1.
+main().catch((error: unknown) => {
+  // A rejection value is not guaranteed to be an Error, so only read
+  // `.message` from real Error instances and stringify anything else.
+  console.error('Error:', error instanceof Error ? error.message : String(error))
+  process.exit(1)
+})
@@ -37,6 +37,11 @@ type PipelineStage =
   | 'scrape_metadata'
   | 'classify_stats'
   | 'place_lookup_stats'
+  | 'resolve_links_stats'
+  | 'resolved_links'
+  | 'resolved_entities'
+  | 'scrape_previews_stats'
+  | 'scraped_previews'
   | 'fetch_images_stats'
   | 'candidates.heuristics'
   | 'candidates.embeddings'
 
@@ -291,8 +291,8 @@ describe('Classifier Images Schema (IMAGES.md Examples)', () => {
     // stock should be about movie night
     expect(activity.image.stock).toMatch(/movie|night|popcorn|film/i)
     // mediaKey could be "movie night", "cinema", etc.
-    // preferStock false - generic movie night image is fine
-    expect(activity.image.preferStock).toBe(false)
+    // preferStock is a hint - AI can reasonably choose either
+    expect(typeof activity.image.preferStock).toBe('boolean')
     // link hints
     expect(activity.link).toBeDefined()
     expect(activity.link?.type).toBe('movie')
@@ -335,7 +335,7 @@ describe('Classifier Images Schema (IMAGES.md Examples)', () => {
     expect(activity.image.stock).toMatch(/theatre|stage|performance|audience|play/i)
     // mediaKey should be theatre-related
     expect(activity.image.mediaKey).toMatch(/theatre|stage|performance/i)
-    // preferStock false - generic theatre image is fine
-    expect(activity.image.preferStock).toBe(false)
+    // preferStock is a hint - AI can reasonably choose either
+    expect(typeof activity.image.preferStock).toBe('boolean')
   })
 })
@@ -7,6 +7,7 @@
 
 import { generateClassifierCacheKey } from '../caching/key'
 import type { ResponseCache } from '../caching/types'
+import { type EntityType, VALID_LINK_TYPES } from '../search/types'
 import {
   type ActivityCategory,
   type CandidateMessage,
@@ -84,11 +85,13 @@ function toClassifiedActivity(
   const interestingScore = response.int
   const score = calculateCombinedScore(funScore, interestingScore)
 
-  // Build link hints if present
+  // Build link hints if present and type is valid
+  const linkType = response.link?.type
+  const isValidLinkType = linkType && VALID_LINK_TYPES.includes(linkType as EntityType)
   const link =
-    response.link?.type && response.link.query
+    isValidLinkType && response.link?.query
       ? {
-          type: response.link.type,
+          type: linkType as EntityType,
           query: response.link.query,
           url: response.link.url
         }
 
@@ -133,7 +133,7 @@ describe('Classifier Prompt', () => {
 
       // With new schema, normalization rules for action/object are gone
       // But we still have specificity rules for image.mediaKey
-      expect(prompt).toContain('KEEP SPECIFICITY in image.mediaKey')
+      expect(prompt).toContain('KEEP mediaKey specificity')
     })
 
     it('includes adult content filter instructions', () => {
 
@@ -9,6 +9,7 @@
 
 import { VALID_CATEGORIES } from '../categories'
 import type { ScrapedMetadata } from '../scraper/types'
+import { VALID_LINK_TYPES } from '../search/types'
 import type { CandidateMessage, ContextMessage } from '../types'
 
 // Re-export parsing function (types re-exported from ./index.ts)
@@ -196,10 +197,10 @@ ${offsetField}    "title": "<activity description, under 100 chars, fix any typo
       "preferStock": <true if stock query is more specific than generic mediaKey>
     },
 
-    // Link hints (for generating clickable link widgets) - optional
+    // Link hints (for resolving media entities to canonical URLs) - use for movies, books, games, music, etc.
     "link": {
-      "type": "<movie|book|board_game|place|event|other>",
-      "query": "<canonical title to search (e.g., 'The Matrix', 'Blood on the Clocktower')>",
+      "type": "<${VALID_LINK_TYPES.join('|')}>",
+      "query": "<canonical title (e.g., 'The Matrix', 'Project Hail Mary', 'Wingspan')>",
       "url": "<URL if user provided one>"
     }
   }
@@ -209,40 +210,31 @@ ${offsetField}    "title": "<activity description, under 100 chars, fix any typo
 (OMIT fields that would be null - don't include them. placeName and placeQuery are mutually exclusive - prefer placeName for canonical places.)`
 }
 
-const SHARED_IMAGE_SECTION = `IMAGE HINTS (image.stock is ALWAYS required):
-image.stock: Stock photo query - ALWAYS REQUIRED. Be specific! Include location/context when relevant.
-image.mediaKey: Media library key for the activity type (e.g., "hot air balloon", "restaurant").
-image.preferStock: Set to true when stock query is MORE SPECIFIC than a generic mediaKey image.
+const SHARED_IMAGE_SECTION = `IMAGE HINTS:
+image.stock: ALWAYS REQUIRED - specific stock photo query with location context when relevant.
+image.mediaKey: Media library key (e.g., "hot air balloon", "restaurant").
+image.preferStock: true if stock is more specific than mediaKey (e.g., "balloon in Cappadocia" vs generic balloon).`
 
-WHEN TO USE preferStock:
-- preferStock:true → "hot air balloon in Cappadocia" - stock photo of Cappadocia is better than generic balloon
-- preferStock:false (or omit) → "go to a restaurant" - generic restaurant image is fine, save API call
-
-The pipeline tries: mediaKey (if preferStock=false) → stock API → mediaKey (if preferStock=true) → category default`
+const SHARED_LINK_SECTION = `LINK HINTS (specific media titles only): Types: ${VALID_LINK_TYPES.join(', ')}
+- "watch Oppenheimer" → link:{type:"movie", query:"Oppenheimer"}
+- "play Wingspan" → link:{type:"physical_game", query:"Wingspan"}
+DON'T use for: generic ("go to movies"), places (use placeName), bands (use wikiName).`
 
 function buildLocationSection(homeCountry: string): string {
-  return `LOCATION FIELDS (fill only if explicitly mentioned):
-wikiName: Wikipedia topic for bands/games/concepts with CC images. NOT for movies/books (use link instead).
-placeName: Canonical place with Wikipedia article (e.g., "Waiheke Island", "Mount Fuji"). Clean names only.
-placeQuery: ONE SPECIFIC business/POI for Google Places (e.g., "Dice Goblin Auckland"). Must be unambiguous.
-city/region/country: Fill if explicitly mentioned. For ambiguous names, assume ${homeCountry}.
-RULES: placeName and placeQuery are MUTUALLY EXCLUSIVE. Never guess venues.
-IMPORTANT: placeQuery is ONLY for a specific named business. "geothermal park in Rotorua" is NOT a placeQuery - there are many parks. Use city:"Rotorua" + image.object:"geothermal park" instead.`
+  return `LOCATION FIELDS (only if explicitly mentioned):
+wikiName: Wikipedia topic for bands/games/concepts (NOT movies/books - use link).
+placeName: Canonical place with Wikipedia article (e.g., "Waiheke Island"). Mutually exclusive with placeQuery.
+placeQuery: SPECIFIC named business for Google Places (e.g., "Dice Goblin Auckland"). NOT generic searches.
+city/region/country: For ambiguous names, assume ${homeCountry}.`
 }
 
 const SHARED_CATEGORIES_SECTION = `CATEGORIES: ${VALID_CATEGORIES.join(', ')}
 ("other" should be used only as a last resort. Only use it if no other category applies.)`
 
 const SHARED_NORMALIZATION = `NORMALIZATION:
-- Keep distinct categories: cafe≠restaurant, bar≠restaurant
-- KEEP SPECIFICITY in image.mediaKey: Don't strip qualifying words:
-  - "glow worm caves" → mediaKey:"glow worm cave" (NOT just "cave")
-  - "hot air balloon" → mediaKey:"hot air balloon" (NOT just "balloon")
-  - "escape room" → mediaKey:"escape room" (NOT just "room")
-- DISAMBIGUATION: Use context to pick the right mediaKey:
-  - "play pool" or "shoot pool" → mediaKey:"billiards" (the cue game, NOT swimming pool)
-  - "swim in pool" → mediaKey:"swimming pool"
-- Regional terms are handled by our system - just use the term from the message`
+- Distinct categories: cafe≠restaurant, bar≠restaurant
+- KEEP mediaKey specificity: "glow worm cave" not "cave", "hot air balloon" not "balloon"
+- Disambiguation: "play pool"→"billiards" (cue game), "swim in pool"→"swimming pool"`
 
 const SHARED_COMPOUND_SECTION = `COMPOUND vs MULTIPLE: For complex activities that one JSON object can't fully represent (e.g., "Go to Iceland and see the aurora"), emit ONE object. For truly separate activities, emit multiple objects.`
 
@@ -304,6 +296,8 @@ ${buildLocationSection(context.homeCountry)}
 
 ${SHARED_IMAGE_SECTION}
 
+${SHARED_LINK_SECTION}
+
 ${SHARED_CATEGORIES_SECTION}
 
 ${SHARED_NORMALIZATION}
@@ -364,6 +358,8 @@ ${buildLocationSection(context.homeCountry)}
 
 ${SHARED_IMAGE_SECTION}
 
+${SHARED_LINK_SECTION}
+
 ${SHARED_CATEGORIES_SECTION}
 
 ${SHARED_NORMALIZATION}
 
@@ -21,6 +21,7 @@ import { cmdList } from './cli/commands/list'
 import { cmdParse } from './cli/commands/parse'
 import { cmdPlaceLookup } from './cli/commands/place-lookup'
 import { cmdPreview } from './cli/commands/preview'
+import { cmdResolveLinks } from './cli/commands/resolve-links'
 import { cmdScan } from './cli/commands/scan'
 import { cmdScrapeUrls } from './cli/commands/scrape-urls'
 import { createLogger } from './cli/logger'
@@ -76,6 +77,10 @@ async function main(): Promise<void> {
         await cmdPlaceLookup(args, logger)
         break
 
+      case 'resolve-links':
+        await cmdResolveLinks(args, logger)
+        break
+
       case 'fetch-image-urls':
         await cmdFetchImageUrls(args, logger)
         break