improvement(models): tighten model metadata and crawl discovery

waleedlatif1 · waleedlatif1 · commit 35c939e9277a · 2026-04-04T11:03:58.000-07:00
Made-with: Cursor
diff --git a/apps/sim/app/(landing)/models/[provider]/[model]/page.tsx b/apps/sim/app/(landing)/models/[provider]/[model]/page.tsx
@@ -18,6 +18,7 @@ import {
   formatPrice,
   formatTokenCount,
   formatUpdatedAt,
+  getEffectiveMaxOutputTokens,
   getModelBySlug,
   getPricingBounds,
   getProviderBySlug,
@@ -280,8 +281,8 @@ export default async function ModelPage({
                   label='Max output'
                   value={
                     model.capabilities.maxOutputTokens
-                      ? `${formatTokenCount(model.capabilities.maxOutputTokens)} tokens`
-                      : 'Standard defaults'
+                      ? `${formatTokenCount(getEffectiveMaxOutputTokens(model.capabilities))} tokens`
+                      : 'Not published'
                   }
                 />
                 <DetailItem label='Provider' value={provider.name} />
diff --git a/apps/sim/app/(landing)/models/utils.test.ts b/apps/sim/app/(landing)/models/utils.test.ts
@@ -0,0 +1,38 @@
+import { describe, expect, it } from 'vitest'
+import { buildModelCapabilityFacts, getEffectiveMaxOutputTokens, getModelBySlug } from './utils'
+
+describe('model catalog capability facts', () => {
+  it.concurrent(
+    'shows structured outputs support and published max output tokens for gpt-4o',
+    () => {
+      const model = getModelBySlug('openai', 'gpt-4o')
+
+      expect(model).not.toBeNull()
+      expect(model).toBeDefined()
+
+      const capabilityFacts = buildModelCapabilityFacts(model!)
+      const structuredOutputs = capabilityFacts.find((fact) => fact.label === 'Structured outputs')
+      const maxOutputTokens = capabilityFacts.find((fact) => fact.label === 'Max output tokens')
+
+      expect(getEffectiveMaxOutputTokens(model!.capabilities)).toBe(16384)
+      expect(structuredOutputs?.value).toBe('Supported')
+      expect(maxOutputTokens?.value).toBe('16k')
+    }
+  )
+
+  it.concurrent('preserves native structured outputs labeling for claude models', () => {
+    const model = getModelBySlug('anthropic', 'claude-sonnet-4-6')
+
+    expect(model).not.toBeNull()
+    expect(model).toBeDefined()
+
+    const capabilityFacts = buildModelCapabilityFacts(model!)
+    const structuredOutputs = capabilityFacts.find((fact) => fact.label === 'Structured outputs')
+
+    expect(structuredOutputs?.value).toBe('Supported (native)')
+  })
+
+  it.concurrent('does not invent a max output token limit when one is not published', () => {
+    expect(getEffectiveMaxOutputTokens({})).toBeNull()
+  })
+})
diff --git a/apps/sim/app/(landing)/models/utils.ts b/apps/sim/app/(landing)/models/utils.ts
@@ -190,6 +190,16 @@ export function formatCapabilityBoolean(
   return value ? positive : negative
 }
 
+function supportsCatalogStructuredOutputs(capabilities: ModelCapabilities): boolean {
+  // Catalog "structured outputs" = Sim can return typed JSON; here that is every model that is
+  // not deep research. `nativeStructuredOutputs` is narrower: provider-native schema support only.
+  return !capabilities.deepResearch
+}
+
+export function getEffectiveMaxOutputTokens(capabilities: ModelCapabilities): number | null {
+  return capabilities.maxOutputTokens ?? null
+}
+
 function trimTrailingZeros(value: string): string {
   return value.replace(/\.0+$/, '').replace(/(\.\d*?)0+$/, '$1')
 }
@@ -326,7 +336,7 @@ function buildCapabilityTags(capabilities: ModelCapabilities): string[] {
     tags.push('Tool choice')
   }
 
-  if (capabilities.nativeStructuredOutputs) {
+  if (supportsCatalogStructuredOutputs(capabilities)) {
     tags.push('Structured outputs')
   }
 
@@ -384,7 +394,7 @@ function buildBestForLine(model: {
     return 'Best for long-context retrieval, large documents, and high-memory workflows.'
   }
 
-  if (capabilities.nativeStructuredOutputs) {
+  if (supportsCatalogStructuredOutputs(capabilities)) {
     return 'Best for production workflows that need reliable typed outputs.'
   }
 
@@ -419,7 +429,7 @@ function computeModelRelevanceScore(model: CatalogModel): number {
     (model.capabilities.reasoningEffort ? 10 : 0) +
     (model.capabilities.thinking ? 10 : 0) +
     (model.capabilities.deepResearch ? 8 : 0) +
-    (model.capabilities.nativeStructuredOutputs ? 4 : 0) +
+    (supportsCatalogStructuredOutputs(model.capabilities) ? 4 : 0) +
     (model.contextWindow ?? 0) / 100000
   )
 }
@@ -683,6 +693,7 @@ export function buildModelFaqs(provider: CatalogProvider, model: CatalogModel):
 
 export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[] {
   const { capabilities } = model
+  const supportsStructuredOutputs = supportsCatalogStructuredOutputs(capabilities)
 
   return [
     {
@@ -711,7 +722,11 @@ export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[]
     },
     {
       label: 'Structured outputs',
-      value: formatCapabilityBoolean(capabilities.nativeStructuredOutputs),
+      value: supportsStructuredOutputs
+        ? capabilities.nativeStructuredOutputs
+          ? 'Supported (native)'
+          : 'Supported'
+        : 'Not supported',
     },
     {
       label: 'Tool choice',
@@ -732,8 +747,8 @@ export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[]
     {
       label: 'Max output tokens',
       value: capabilities.maxOutputTokens
-        ? formatTokenCount(capabilities.maxOutputTokens)
-        : 'Standard defaults',
+        ? formatTokenCount(getEffectiveMaxOutputTokens(capabilities))
+        : 'Not published',
     },
   ]
 }
@@ -752,8 +767,8 @@ export function getProviderCapabilitySummary(provider: CatalogProvider): Capabil
   const reasoningCount = provider.models.filter(
     (model) => model.capabilities.reasoningEffort || model.capabilities.thinking
   ).length
-  const structuredCount = provider.models.filter(
-    (model) => model.capabilities.nativeStructuredOutputs
+  const structuredCount = provider.models.filter((model) =>
+    supportsCatalogStructuredOutputs(model.capabilities)
   ).length
   const deepResearchCount = provider.models.filter(
     (model) => model.capabilities.deepResearch
diff --git a/apps/sim/app/llms.txt/route.ts b/apps/sim/app/llms.txt/route.ts
@@ -1,71 +1,43 @@
 import { getBaseUrl } from '@/lib/core/utils/urls'
+import { ALL_CATALOG_MODELS, MODEL_PROVIDERS_WITH_CATALOGS } from '@/app/(landing)/models/utils'
 
-export async function GET() {
+export function GET() {
   const baseUrl = getBaseUrl()
 
-  const llmsContent = `# Sim
-
-> Sim is the open-source platform to build AI agents and run your agentic workforce. Connect 1,000+ integrations and LLMs to deploy and orchestrate agentic workflows.
-
-Sim lets teams create agents, workflows, knowledge bases, tables, and docs. Over 100,000 builders use Sim — from startups to Fortune 500 companies. SOC2 compliant.
-
-## Core Pages
-
-- [Homepage](${baseUrl}): Product overview, features, and pricing
-- [Changelog](${baseUrl}/changelog): Product updates and release notes
-- [Sim Blog](${baseUrl}/blog): Announcements, insights, and guides
-
-## Documentation
-
-- [Documentation](https://docs.sim.ai): Complete guides and API reference
-- [Quickstart](https://docs.sim.ai/quickstart): Get started in 5 minutes
-- [API Reference](https://docs.sim.ai/api): REST API documentation
-
-## Key Concepts
-
-- **Workspace**: Container for workflows, data sources, and executions
-- **Workflow**: Directed graph of blocks defining an agentic process
-- **Block**: Individual step (LLM call, tool call, HTTP request, code execution)
-- **Trigger**: Event or schedule that initiates workflow execution
-- **Execution**: A single run of a workflow with logs and outputs
-- **Knowledge Base**: Vector-indexed document store for retrieval-augmented generation
-
-## Capabilities
-
-- AI agent creation and deployment
-- Agentic workflow orchestration
-- 1,000+ integrations (Slack, Gmail, Notion, Airtable, databases, and more)
-- Multi-model LLM orchestration (OpenAI, Anthropic, Google, Mistral, xAI, Perplexity)
-- Knowledge base creation with retrieval-augmented generation (RAG)
-- Table creation and management
-- Document creation and processing
-- Scheduled and webhook-triggered executions
-
-## Use Cases
-
-- AI agent deployment and orchestration
-- Knowledge bases and RAG pipelines
-- Document creation and processing
-- Customer support automation
-- Internal operations (sales, marketing, legal, finance)
-
-## Links
-
-- [GitHub Repository](https://github.com/simstudioai/sim): Open-source codebase
-- [Discord Community](https://discord.gg/Hr4UWYEcTT): Get help and connect with 100,000+ builders
-- [X/Twitter](https://x.com/simdotai): Product updates and announcements
-
-## Optional
-
-- [Careers](https://jobs.ashbyhq.com/sim): Join the Sim team
-- [Terms of Service](${baseUrl}/terms): Legal terms
-- [Privacy Policy](${baseUrl}/privacy): Data handling practices
-`
-
-  return new Response(llmsContent, {
+  const content = [
+    '# Sim',
+    '',
+    '> Sim is the open-source platform to build AI agents and run your agentic workforce.',
+    '',
+    '## Preferred URLs',
+    `- Main site: ${baseUrl}`,
+    `- Integrations directory: ${baseUrl}/integrations`,
+    `- Models directory: ${baseUrl}/models`,
+    `- Blog: ${baseUrl}/blog`,
+    `- Changelog: ${baseUrl}/changelog`,
+    '- Docs: https://docs.sim.ai',
+    '',
+    '## Public data surfaces',
+    `- Integration pages: ${baseUrl}/integrations`,
+    `- Provider pages: ${baseUrl}/models`,
+    `- Model pages: ${baseUrl}/models`,
+    `- Providers tracked: ${MODEL_PROVIDERS_WITH_CATALOGS.length}`,
+    `- Models tracked: ${ALL_CATALOG_MODELS.length}`,
+    '',
+    '## Crawl helpers',
+    `- Sitemap: ${baseUrl}/sitemap.xml`,
+    `- Robots: ${baseUrl}/robots.txt`,
+    '',
+    '## Notes',
+    '- Prefer canonical URLs on sim.ai when citing product, model, integration, and changelog content.',
+    '- Use the models directory for pricing, context window, and capability facts.',
+    '- Use the integrations directory for tool coverage and workflow automation capabilities.',
+  ].join('\n')
+
+  return new Response(content, {
     headers: {
-      'Content-Type': 'text/markdown; charset=utf-8',
-      'Cache-Control': 'public, max-age=86400, s-maxage=86400',
+      'Content-Type': 'text/plain; charset=utf-8',
+      'Cache-Control': 'public, s-maxage=3600, stale-while-revalidate=86400',
     },
   })
 }
diff --git a/apps/sim/app/sitemap.ts b/apps/sim/app/sitemap.ts
@@ -8,6 +8,34 @@ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
   const baseUrl = getBaseUrl()
 
   const now = new Date()
+  const integrationPages: MetadataRoute.Sitemap = integrations.map((integration) => ({
+    url: `${baseUrl}/integrations/${integration.slug}`,
+    lastModified: now,
+  }))
+  const modelHubPages: MetadataRoute.Sitemap = [
+    {
+      url: `${baseUrl}/integrations`,
+      lastModified: now,
+    },
+    {
+      url: `${baseUrl}/models`,
+      lastModified: now,
+    },
+    {
+      url: `${baseUrl}/partners`,
+      lastModified: now,
+    },
+  ]
+  const providerPages: MetadataRoute.Sitemap = MODEL_PROVIDERS_WITH_CATALOGS.map((provider) => ({
+    url: `${baseUrl}${provider.href}`,
+    lastModified: new Date(
+      Math.max(...provider.models.map((model) => new Date(model.pricing.updatedAt).getTime()))
+    ),
+  }))
+  const modelPages: MetadataRoute.Sitemap = ALL_CATALOG_MODELS.map((model) => ({
+    url: `${baseUrl}${model.href}`,
+    lastModified: new Date(model.pricing.updatedAt),
+  }))
 
   const staticPages: MetadataRoute.Sitemap = [
     {
@@ -26,14 +54,6 @@ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
     //   url: `${baseUrl}/templates`,
     //   lastModified: now,
     // },
-    {
-      url: `${baseUrl}/integrations`,
-      lastModified: now,
-    },
-    {
-      url: `${baseUrl}/models`,
-      lastModified: now,
-    },
     {
       url: `${baseUrl}/changelog`,
       lastModified: now,
@@ -54,20 +74,12 @@ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
     lastModified: new Date(p.updated ?? p.date),
   }))
 
-  const integrationPages: MetadataRoute.Sitemap = integrations.map((i) => ({
-    url: `${baseUrl}/integrations/${i.slug}`,
-    lastModified: now,
-  }))
-
-  const providerPages: MetadataRoute.Sitemap = MODEL_PROVIDERS_WITH_CATALOGS.map((provider) => ({
-    url: `${baseUrl}${provider.href}`,
-    lastModified: now,
-  }))
-
-  const modelPages: MetadataRoute.Sitemap = ALL_CATALOG_MODELS.map((model) => ({
-    url: `${baseUrl}${model.href}`,
-    lastModified: new Date(model.pricing.updatedAt),
-  }))
-
-  return [...staticPages, ...blogPages, ...integrationPages, ...providerPages, ...modelPages]
+  return [
+    ...staticPages,
+    ...modelHubPages,
+    ...integrationPages,
+    ...providerPages,
+    ...modelPages,
+    ...blogPages,
+  ]
 }
diff --git a/apps/sim/lib/core/config/feature-flags.ts b/apps/sim/lib/core/config/feature-flags.ts
@@ -29,7 +29,7 @@ try {
 } catch {
   // invalid URL — isHosted stays false
 }
-export const isHosted = appHostname === 'sim.ai' || appHostname.endsWith('.sim.ai')
+export const isHosted = appHostname === 'sim.ai' || appHostname.endsWith('.sim.ai')
 
 /**
  * Is billing enforcement enabled
diff --git a/apps/sim/providers/anthropic/core.ts b/apps/sim/providers/anthropic/core.ts
@@ -293,7 +293,9 @@ export async function executeAnthropicProviderRequest(
     messages,
     system: systemPrompt,
     max_tokens:
-      Number.parseInt(String(request.maxTokens)) || getMaxOutputTokensForModel(request.model),
+      Number.parseInt(String(request.maxTokens)) ||
+      getMaxOutputTokensForModel(request.model) ||
+      4096,
     temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
   }
 
@@ -335,7 +337,7 @@ export async function executeAnthropicProviderRequest(
         const budgetTokens = thinkingConfig.thinking.budget_tokens
         const minMaxTokens = budgetTokens + 4096
         if (payload.max_tokens < minMaxTokens) {
-          const modelMax = getMaxOutputTokensForModel(request.model)
+          const modelMax = getMaxOutputTokensForModel(request.model) ?? minMaxTokens // no cap known
           payload.max_tokens = Math.min(minMaxTokens, modelMax)
           logger.info(
             `Adjusted max_tokens to ${payload.max_tokens} to satisfy budget_tokens (${budgetTokens}) constraint`
diff --git a/apps/sim/providers/models.ts b/apps/sim/providers/models.ts
diff --git a/apps/sim/providers/utils.test.ts b/apps/sim/providers/utils.test.ts
diff --git a/apps/sim/providers/utils.ts b/apps/sim/providers/utils.ts

Original file line number	Diff line number	Diff line change
`@@ -190,6 +190,16 @@ export function formatCapabilityBoolean(`
`190`	`190`	`return value ? positive : negative`
`191`	`191`	`}`
`192`	`192`
	`193`	`+function supportsCatalogStructuredOutputs(capabilities: ModelCapabilities): boolean {`
	`194`	`+ // In the catalog, "structured outputs" means Sim can return typed JSON for the model.`
	`195`	+ // `nativeStructuredOutputs` is narrower and only indicates provider-native schema support.
	`196`	`+ return !capabilities.deepResearch`
	`197`	`+}`
	`198`	`+`
	`199`	`+export function getEffectiveMaxOutputTokens(capabilities: ModelCapabilities): number \| null {`
	`200`	`+ return capabilities.maxOutputTokens ?? null`
	`201`	`+}`
	`202`	`+`
`193`	`203`	`function trimTrailingZeros(value: string): string {`
`194`	`204`	`return value.replace(/\.0+$/, '').replace(/(\.\d*?)0+$/, '$1')`
`195`	`205`	`}`
`@@ -326,7 +336,7 @@ function buildCapabilityTags(capabilities: ModelCapabilities): string[] {`
`326`	`336`	`tags.push('Tool choice')`
`327`	`337`	`}`
`328`	`338`
`329`		`- if (capabilities.nativeStructuredOutputs) {`
	`339`	`+ if (supportsCatalogStructuredOutputs(capabilities)) {`
`330`	`340`	`tags.push('Structured outputs')`
`331`	`341`	`}`
`332`	`342`
`@@ -384,7 +394,7 @@ function buildBestForLine(model: {`
`384`	`394`	`return 'Best for long-context retrieval, large documents, and high-memory workflows.'`
`385`	`395`	`}`
`386`	`396`
`387`		`- if (capabilities.nativeStructuredOutputs) {`
	`397`	`+ if (supportsCatalogStructuredOutputs(capabilities)) {`
`388`	`398`	`return 'Best for production workflows that need reliable typed outputs.'`
`389`	`399`	`}`
`390`	`400`
`@@ -419,7 +429,7 @@ function computeModelRelevanceScore(model: CatalogModel): number {`
`419`	`429`	`(model.capabilities.reasoningEffort ? 10 : 0) +`
`420`	`430`	`(model.capabilities.thinking ? 10 : 0) +`
`421`	`431`	`(model.capabilities.deepResearch ? 8 : 0) +`
`422`		`- (model.capabilities.nativeStructuredOutputs ? 4 : 0) +`
	`432`	`+ (supportsCatalogStructuredOutputs(model.capabilities) ? 4 : 0) +`
`423`	`433`	`(model.contextWindow ?? 0) / 100000`
`424`	`434`	`)`
`425`	`435`	`}`
`@@ -683,6 +693,7 @@ export function buildModelFaqs(provider: CatalogProvider, model: CatalogModel):`
`683`	`693`
`684`	`694`	`export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[] {`
`685`	`695`	`const { capabilities } = model`
	`696`	`+ const supportsStructuredOutputs = supportsCatalogStructuredOutputs(capabilities)`
`686`	`697`
`687`	`698`	`return [`
`688`	`699`	`{`
`@@ -711,7 +722,11 @@ export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[]`
`711`	`722`	`},`
`712`	`723`	`{`
`713`	`724`	`label: 'Structured outputs',`
`714`		`- value: formatCapabilityBoolean(capabilities.nativeStructuredOutputs),`
	`725`	`+ value: supportsStructuredOutputs`
	`726`	`+ ? capabilities.nativeStructuredOutputs`
	`727`	`+ ? 'Supported (native)'`
	`728`	`+ : 'Supported'`
	`729`	`+ : 'Not supported',`
`715`	`730`	`},`
`716`	`731`	`{`
`717`	`732`	`label: 'Tool choice',`
`@@ -732,8 +747,8 @@ export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[]`
`732`	`747`	`{`
`733`	`748`	`label: 'Max output tokens',`
`734`	`749`	`value: capabilities.maxOutputTokens`
`735`		`- ? formatTokenCount(capabilities.maxOutputTokens)`
`736`		`- : 'Standard defaults',`
	`750`	`+ ? formatTokenCount(getEffectiveMaxOutputTokens(capabilities))`
	`751`	`+ : 'Not published',`
`737`	`752`	`},`
`738`	`753`	`]`
`739`	`754`	`}`
`@@ -752,8 +767,8 @@ export function getProviderCapabilitySummary(provider: CatalogProvider): Capabil`
`752`	`767`	`const reasoningCount = provider.models.filter(`
`753`	`768`	`(model) => model.capabilities.reasoningEffort \|\| model.capabilities.thinking`
`754`	`769`	`).length`
`755`		`- const structuredCount = provider.models.filter(`
`756`		`- (model) => model.capabilities.nativeStructuredOutputs`
	`770`	`+ const structuredCount = provider.models.filter((model) =>`
	`771`	`+ supportsCatalogStructuredOutputs(model.capabilities)`
`757`	`772`	`).length`
`758`	`773`	`const deepResearchCount = provider.models.filter(`
`759`	`774`	`(model) => model.capabilities.deepResearch`
Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ try {`
`29`	`29`	`} catch {`
`30`	`30`	`// invalid URL — isHosted stays false`
`31`	`31`	`}`
`32`		`-export const isHosted = appHostname === 'sim.ai' \|\| appHostname.endsWith('.sim.ai')`
	`32`	`+export const isHosted = true`
`33`	`33`
`34`	`34`	`/**`
`35`	`35`	`* Is billing enforcement enabled`