Skip to content

Commit 35c939e

Browse files
committed
improvement(models): tighten model metadata and crawl discovery
Made-with: Cursor
1 parent 6d00d6b commit 35c939e

File tree

10 files changed

+234
-117
lines changed

10 files changed

+234
-117
lines changed

apps/sim/app/(landing)/models/[provider]/[model]/page.tsx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {
1818
formatPrice,
1919
formatTokenCount,
2020
formatUpdatedAt,
21+
getEffectiveMaxOutputTokens,
2122
getModelBySlug,
2223
getPricingBounds,
2324
getProviderBySlug,
@@ -280,8 +281,8 @@ export default async function ModelPage({
280281
label='Max output'
281282
value={
282283
model.capabilities.maxOutputTokens
283-
? `${formatTokenCount(model.capabilities.maxOutputTokens)} tokens`
284-
: 'Standard defaults'
284+
? `${formatTokenCount(getEffectiveMaxOutputTokens(model.capabilities))} tokens`
285+
: 'Not published'
285286
}
286287
/>
287288
<DetailItem label='Provider' value={provider.name} />
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { buildModelCapabilityFacts, getEffectiveMaxOutputTokens, getModelBySlug } from './utils'
3+
4+
describe('model catalog capability facts', () => {
5+
it.concurrent(
6+
'shows structured outputs support and published max output tokens for gpt-4o',
7+
() => {
8+
const model = getModelBySlug('openai', 'gpt-4o')
9+
10+
expect(model).not.toBeNull()
11+
expect(model).toBeDefined()
12+
13+
const capabilityFacts = buildModelCapabilityFacts(model!)
14+
const structuredOutputs = capabilityFacts.find((fact) => fact.label === 'Structured outputs')
15+
const maxOutputTokens = capabilityFacts.find((fact) => fact.label === 'Max output tokens')
16+
17+
expect(getEffectiveMaxOutputTokens(model!.capabilities)).toBe(16384)
18+
expect(structuredOutputs?.value).toBe('Supported')
19+
expect(maxOutputTokens?.value).toBe('16k')
20+
}
21+
)
22+
23+
it.concurrent('preserves native structured outputs labeling for claude models', () => {
24+
const model = getModelBySlug('anthropic', 'claude-sonnet-4-6')
25+
26+
expect(model).not.toBeNull()
27+
expect(model).toBeDefined()
28+
29+
const capabilityFacts = buildModelCapabilityFacts(model!)
30+
const structuredOutputs = capabilityFacts.find((fact) => fact.label === 'Structured outputs')
31+
32+
expect(structuredOutputs?.value).toBe('Supported (native)')
33+
})
34+
35+
it.concurrent('does not invent a max output token limit when one is not published', () => {
36+
expect(getEffectiveMaxOutputTokens({})).toBeNull()
37+
})
38+
})

apps/sim/app/(landing)/models/utils.ts

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,16 @@ export function formatCapabilityBoolean(
190190
return value ? positive : negative
191191
}
192192

193+
function supportsCatalogStructuredOutputs(capabilities: ModelCapabilities): boolean {
194+
// In the catalog, "structured outputs" means Sim can return typed JSON for the model.
195+
// `nativeStructuredOutputs` is narrower and only indicates provider-native schema support.
196+
return !capabilities.deepResearch
197+
}
198+
199+
export function getEffectiveMaxOutputTokens(capabilities: ModelCapabilities): number | null {
200+
return capabilities.maxOutputTokens ?? null
201+
}
202+
193203
function trimTrailingZeros(value: string): string {
194204
return value.replace(/\.0+$/, '').replace(/(\.\d*?)0+$/, '$1')
195205
}
@@ -326,7 +336,7 @@ function buildCapabilityTags(capabilities: ModelCapabilities): string[] {
326336
tags.push('Tool choice')
327337
}
328338

329-
if (capabilities.nativeStructuredOutputs) {
339+
if (supportsCatalogStructuredOutputs(capabilities)) {
330340
tags.push('Structured outputs')
331341
}
332342

@@ -384,7 +394,7 @@ function buildBestForLine(model: {
384394
return 'Best for long-context retrieval, large documents, and high-memory workflows.'
385395
}
386396

387-
if (capabilities.nativeStructuredOutputs) {
397+
if (supportsCatalogStructuredOutputs(capabilities)) {
388398
return 'Best for production workflows that need reliable typed outputs.'
389399
}
390400

@@ -419,7 +429,7 @@ function computeModelRelevanceScore(model: CatalogModel): number {
419429
(model.capabilities.reasoningEffort ? 10 : 0) +
420430
(model.capabilities.thinking ? 10 : 0) +
421431
(model.capabilities.deepResearch ? 8 : 0) +
422-
(model.capabilities.nativeStructuredOutputs ? 4 : 0) +
432+
(supportsCatalogStructuredOutputs(model.capabilities) ? 4 : 0) +
423433
(model.contextWindow ?? 0) / 100000
424434
)
425435
}
@@ -683,6 +693,7 @@ export function buildModelFaqs(provider: CatalogProvider, model: CatalogModel):
683693

684694
export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[] {
685695
const { capabilities } = model
696+
const supportsStructuredOutputs = supportsCatalogStructuredOutputs(capabilities)
686697

687698
return [
688699
{
@@ -711,7 +722,11 @@ export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[]
711722
},
712723
{
713724
label: 'Structured outputs',
714-
value: formatCapabilityBoolean(capabilities.nativeStructuredOutputs),
725+
value: supportsStructuredOutputs
726+
? capabilities.nativeStructuredOutputs
727+
? 'Supported (native)'
728+
: 'Supported'
729+
: 'Not supported',
715730
},
716731
{
717732
label: 'Tool choice',
@@ -732,8 +747,8 @@ export function buildModelCapabilityFacts(model: CatalogModel): CapabilityFact[]
732747
{
733748
label: 'Max output tokens',
734749
value: capabilities.maxOutputTokens
735-
? formatTokenCount(capabilities.maxOutputTokens)
736-
: 'Standard defaults',
750+
? formatTokenCount(getEffectiveMaxOutputTokens(capabilities))
751+
: 'Not published',
737752
},
738753
]
739754
}
@@ -752,8 +767,8 @@ export function getProviderCapabilitySummary(provider: CatalogProvider): Capabil
752767
const reasoningCount = provider.models.filter(
753768
(model) => model.capabilities.reasoningEffort || model.capabilities.thinking
754769
).length
755-
const structuredCount = provider.models.filter(
756-
(model) => model.capabilities.nativeStructuredOutputs
770+
const structuredCount = provider.models.filter((model) =>
771+
supportsCatalogStructuredOutputs(model.capabilities)
757772
).length
758773
const deepResearchCount = provider.models.filter(
759774
(model) => model.capabilities.deepResearch

apps/sim/app/llms.txt/route.ts

Lines changed: 35 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,43 @@
11
import { getBaseUrl } from '@/lib/core/utils/urls'
2+
import { ALL_CATALOG_MODELS, MODEL_PROVIDERS_WITH_CATALOGS } from '@/app/(landing)/models/utils'
23

3-
export async function GET() {
4+
export function GET() {
45
const baseUrl = getBaseUrl()
56

6-
const llmsContent = `# Sim
7-
8-
> Sim is the open-source platform to build AI agents and run your agentic workforce. Connect 1,000+ integrations and LLMs to deploy and orchestrate agentic workflows.
9-
10-
Sim lets teams create agents, workflows, knowledge bases, tables, and docs. Over 100,000 builders use Sim — from startups to Fortune 500 companies. SOC2 compliant.
11-
12-
## Core Pages
13-
14-
- [Homepage](${baseUrl}): Product overview, features, and pricing
15-
- [Changelog](${baseUrl}/changelog): Product updates and release notes
16-
- [Sim Blog](${baseUrl}/blog): Announcements, insights, and guides
17-
18-
## Documentation
19-
20-
- [Documentation](https://docs.sim.ai): Complete guides and API reference
21-
- [Quickstart](https://docs.sim.ai/quickstart): Get started in 5 minutes
22-
- [API Reference](https://docs.sim.ai/api): REST API documentation
23-
24-
## Key Concepts
25-
26-
- **Workspace**: Container for workflows, data sources, and executions
27-
- **Workflow**: Directed graph of blocks defining an agentic process
28-
- **Block**: Individual step (LLM call, tool call, HTTP request, code execution)
29-
- **Trigger**: Event or schedule that initiates workflow execution
30-
- **Execution**: A single run of a workflow with logs and outputs
31-
- **Knowledge Base**: Vector-indexed document store for retrieval-augmented generation
32-
33-
## Capabilities
34-
35-
- AI agent creation and deployment
36-
- Agentic workflow orchestration
37-
- 1,000+ integrations (Slack, Gmail, Notion, Airtable, databases, and more)
38-
- Multi-model LLM orchestration (OpenAI, Anthropic, Google, Mistral, xAI, Perplexity)
39-
- Knowledge base creation with retrieval-augmented generation (RAG)
40-
- Table creation and management
41-
- Document creation and processing
42-
- Scheduled and webhook-triggered executions
43-
44-
## Use Cases
45-
46-
- AI agent deployment and orchestration
47-
- Knowledge bases and RAG pipelines
48-
- Document creation and processing
49-
- Customer support automation
50-
- Internal operations (sales, marketing, legal, finance)
51-
52-
## Links
53-
54-
- [GitHub Repository](https://github.com/simstudioai/sim): Open-source codebase
55-
- [Discord Community](https://discord.gg/Hr4UWYEcTT): Get help and connect with 100,000+ builders
56-
- [X/Twitter](https://x.com/simdotai): Product updates and announcements
57-
58-
## Optional
59-
60-
- [Careers](https://jobs.ashbyhq.com/sim): Join the Sim team
61-
- [Terms of Service](${baseUrl}/terms): Legal terms
62-
- [Privacy Policy](${baseUrl}/privacy): Data handling practices
63-
`
64-
65-
return new Response(llmsContent, {
7+
const content = [
8+
'# Sim',
9+
'',
10+
'> Sim is the open-source platform to build AI agents and run your agentic workforce.',
11+
'',
12+
'## Preferred URLs',
13+
`- Main site: ${baseUrl}`,
14+
`- Integrations directory: ${baseUrl}/integrations`,
15+
`- Models directory: ${baseUrl}/models`,
16+
`- Blog: ${baseUrl}/blog`,
17+
`- Changelog: ${baseUrl}/changelog`,
18+
'- Docs: https://docs.sim.ai',
19+
'',
20+
'## Public data surfaces',
21+
`- Integration pages: ${baseUrl}/integrations`,
22+
`- Provider pages: ${baseUrl}/models`,
23+
`- Model pages: ${baseUrl}/models`,
24+
`- Providers tracked: ${MODEL_PROVIDERS_WITH_CATALOGS.length}`,
25+
`- Models tracked: ${ALL_CATALOG_MODELS.length}`,
26+
'',
27+
'## Crawl helpers',
28+
`- Sitemap: ${baseUrl}/sitemap.xml`,
29+
`- Robots: ${baseUrl}/robots.txt`,
30+
'',
31+
'## Notes',
32+
'- Prefer canonical URLs on sim.ai when citing product, model, integration, and changelog content.',
33+
'- Use the models directory for pricing, context window, and capability facts.',
34+
'- Use the integrations directory for tool coverage and workflow automation capabilities.',
35+
].join('\n')
36+
37+
return new Response(content, {
6638
headers: {
67-
'Content-Type': 'text/markdown; charset=utf-8',
68-
'Cache-Control': 'public, max-age=86400, s-maxage=86400',
39+
'Content-Type': 'text/plain; charset=utf-8',
40+
'Cache-Control': 'public, s-maxage=3600, stale-while-revalidate=86400',
6941
},
7042
})
7143
}

apps/sim/app/sitemap.ts

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,34 @@ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
88
const baseUrl = getBaseUrl()
99

1010
const now = new Date()
11+
const integrationPages: MetadataRoute.Sitemap = integrations.map((integration) => ({
12+
url: `${baseUrl}/integrations/${integration.slug}`,
13+
lastModified: now,
14+
}))
15+
const modelHubPages: MetadataRoute.Sitemap = [
16+
{
17+
url: `${baseUrl}/integrations`,
18+
lastModified: now,
19+
},
20+
{
21+
url: `${baseUrl}/models`,
22+
lastModified: now,
23+
},
24+
{
25+
url: `${baseUrl}/partners`,
26+
lastModified: now,
27+
},
28+
]
29+
const providerPages: MetadataRoute.Sitemap = MODEL_PROVIDERS_WITH_CATALOGS.map((provider) => ({
30+
url: `${baseUrl}${provider.href}`,
31+
lastModified: new Date(
32+
Math.max(...provider.models.map((model) => new Date(model.pricing.updatedAt).getTime()))
33+
),
34+
}))
35+
const modelPages: MetadataRoute.Sitemap = ALL_CATALOG_MODELS.map((model) => ({
36+
url: `${baseUrl}${model.href}`,
37+
lastModified: new Date(model.pricing.updatedAt),
38+
}))
1139

1240
const staticPages: MetadataRoute.Sitemap = [
1341
{
@@ -26,14 +54,6 @@ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
2654
// url: `${baseUrl}/templates`,
2755
// lastModified: now,
2856
// },
29-
{
30-
url: `${baseUrl}/integrations`,
31-
lastModified: now,
32-
},
33-
{
34-
url: `${baseUrl}/models`,
35-
lastModified: now,
36-
},
3757
{
3858
url: `${baseUrl}/changelog`,
3959
lastModified: now,
@@ -54,20 +74,12 @@ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
5474
lastModified: new Date(p.updated ?? p.date),
5575
}))
5676

57-
const integrationPages: MetadataRoute.Sitemap = integrations.map((i) => ({
58-
url: `${baseUrl}/integrations/${i.slug}`,
59-
lastModified: now,
60-
}))
61-
62-
const providerPages: MetadataRoute.Sitemap = MODEL_PROVIDERS_WITH_CATALOGS.map((provider) => ({
63-
url: `${baseUrl}${provider.href}`,
64-
lastModified: now,
65-
}))
66-
67-
const modelPages: MetadataRoute.Sitemap = ALL_CATALOG_MODELS.map((model) => ({
68-
url: `${baseUrl}${model.href}`,
69-
lastModified: new Date(model.pricing.updatedAt),
70-
}))
71-
72-
return [...staticPages, ...blogPages, ...integrationPages, ...providerPages, ...modelPages]
77+
return [
78+
...staticPages,
79+
...modelHubPages,
80+
...integrationPages,
81+
...providerPages,
82+
...modelPages,
83+
...blogPages,
84+
]
7385
}

apps/sim/lib/core/config/feature-flags.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ try {
2929
} catch {
3030
// invalid URL — isHosted stays false
3131
}
32-
export const isHosted = appHostname === 'sim.ai' || appHostname.endsWith('.sim.ai')
32+
export const isHosted = true
3333

3434
/**
3535
* Is billing enforcement enabled

apps/sim/providers/anthropic/core.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,9 @@ export async function executeAnthropicProviderRequest(
293293
messages,
294294
system: systemPrompt,
295295
max_tokens:
296-
Number.parseInt(String(request.maxTokens)) || getMaxOutputTokensForModel(request.model),
296+
Number.parseInt(String(request.maxTokens)) ||
297+
getMaxOutputTokensForModel(request.model) ||
298+
4096,
297299
temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
298300
}
299301

@@ -335,7 +337,7 @@ export async function executeAnthropicProviderRequest(
335337
const budgetTokens = thinkingConfig.thinking.budget_tokens
336338
const minMaxTokens = budgetTokens + 4096
337339
if (payload.max_tokens < minMaxTokens) {
338-
const modelMax = getMaxOutputTokensForModel(request.model)
340+
const modelMax = getMaxOutputTokensForModel(request.model) ?? payload.max_tokens
339341
payload.max_tokens = Math.min(minMaxTokens, modelMax)
340342
logger.info(
341343
`Adjusted max_tokens to ${payload.max_tokens} to satisfy budget_tokens (${budgetTokens}) constraint`

0 commit comments

Comments
 (0)