llm-cli/models.ts at main · david-crespo/llm-cli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import { ValidationError } from "@cliffy/command"
import { type TokenCounts } from "./types.ts"
import $ from "@david/dax"

/**
 * Description of one model: how to address it and what it costs.
 *
 * Token prices (`input`, `output`, `input_cached`) are per million tokens.
 * `getCost` adds them to `search_cost`, which is in dollars.
 */
export type Model = {
  /** Provider name, e.g. "anthropic", "google" or "openrouter" */
  provider: string
  /** Key provided to API call */
  key: string
  /**
   * ID for display and usability purposes. `resolveModel` matches user input
   * against both this and `key`.
   */
  id: string
  /** Set on the entry `resolveModel` returns when no model argument is given */
  default?: true
  // Token prices, per million tokens
  /** Price of input tokens */
  input: number
  /** Price of output tokens */
  output: number
  /**
   * Price of input tokens that were cache hits. When absent, `getCost` bills
   * every input token at the `input` price.
   */
  input_cached?: number
  /** Cost per web search in dollars */
  search_cost?: number
}

/**
 * Known models. The order matters: preferred models go first.
 *
 * `resolveModel` looks for an exact `key`/`id` match first and otherwise picks
 * the first entry whose `key` or `id` contains the requested string. The same
 * string can appear in several entries, in which case the earliest one wins.
 * For example, "flash" resolves to gemini-3-flash rather than the flash-lite
 * entry listed after it, and "gpt" resolves to gpt-5.5.
 *
 * NOTE(review): "mini" is a substring of "gemini", so with the current order
 * it resolves to gemini-3.1-pro rather than gpt-5.4-mini. Confirm that is
 * intended, or type "5.4-mini" to select the OpenAI model.
 *
 * id is doing double duty as both a human-readable nickname and a unique ID.
 */
export const models: Model[] = [
  {
    provider: "anthropic",
    key: "claude-opus-4-7",
    id: "opus-4.7",
    input: 5,
    input_cached: 0.50,
    output: 25,
    search_cost: 0.01,
    default: true,
  },
  {
    provider: "anthropic",
    key: "claude-sonnet-4-6",
    id: "sonnet-4.6",
    input: 3,
    input_cached: 0.30,
    output: 15,
    search_cost: 0.01,
  },
  {
    provider: "anthropic",
    key: "claude-haiku-4-5",
    id: "haiku-4.5",
    input: 1,
    input_cached: 0.1,
    output: 5,
    search_cost: 0.01,
  },
  {
    provider: "google",
    key: "gemini-3.1-pro-preview",
    id: "gemini-3.1-pro",
    input: 2.00,
    input_cached: 0.50,
    output: 12.00,
    // 5,000 search queries per month (free), then (Coming soon) $14 / 1,000 search queries
    search_cost: 0,
  },
  {
    provider: "google",
    key: "gemini-3-flash-preview",
    id: "gemini-3-flash",
    input: .50,
    input_cached: 0.05,
    output: 3.00,
    // 1,500 RPD (free, limit shared with lite), then $35 / 1,000 grounded prompts
    search_cost: 0,
  },
  {
    provider: "google",
    key: "gemini-3.1-flash-lite-preview",
    id: "gemini-3-flash-lite",
    input: .25,
    input_cached: 0.025,
    output: 1.50,
    // 1,500 RPD (free, limit shared with lite), then $35 / 1,000 grounded prompts
    search_cost: 0,
  },
  {
    provider: "openai",
    key: "gpt-5.5",
    id: "gpt-5.5",
    input: 5.00,
    input_cached: 0.50,
    output: 30,
    search_cost: 0.01,
  },
  {
    provider: "openai",
    key: "gpt-5.4-mini",
    id: "gpt-5.4-mini",
    input: 0.75,
    input_cached: 0.075,
    output: 4.50,
    search_cost: 0.01,
  },
  {
    provider: "openai",
    key: "gpt-5.4-pro",
    id: "gpt-5.4-pro",
    input: 30, // no caching, yikes
    output: 180,
    search_cost: 0.01,
  },
  {
    provider: "groq",
    key: "moonshotai/kimi-k2-instruct-0905",
    id: "kimi-k2",
    input: 1.00,
    input_cached: 0.50,
    output: 3.00,
  },
  {
    provider: "openrouter",
    key: "moonshotai/kimi-k2.6",
    id: "kimi-k2.6",
    // prices actually vary but let's go with moonshot's own as a middle ground
    // https://openrouter.ai/moonshotai/kimi-k2.6
    input: 0.95,
    input_cached: 0.16,
    output: 4.00,
  },
  {
    provider: "cerebras",
    key: "zai-glm-4.7",
    id: "glm-4.7",
    input: 2.25,
    output: 2.75,
  },
  {
    provider: "cerebras",
    key: "gpt-oss-120b",
    id: "gpt-oss-120b",
    input: 0.35,
    output: 0.75,
  },
]

/**
 * Resolve a user-supplied model name to an entry in `models`.
 *
 * With no argument, the entry flagged `default` is returned. Otherwise the
 * lowercased argument is compared against every `key` and `id`: an exact match
 * wins, and failing that the first entry containing the argument as a
 * substring is used, so the order of `models` decides ties.
 *
 * @param modelArg Name or name fragment typed by the user, if any
 * @returns The matching model
 * @throws ValidationError when no model matches
 */
export function resolveModel(modelArg: string | undefined) {
  if (modelArg === undefined) {
    return models.find((candidate) => candidate.default)!
  }

  const needle = modelArg.toLowerCase()

  // Exact matches take priority. Without this step a short name could never
  // be selected while a longer name containing it is listed earlier.
  const exact = models.find(({ key, id }) => key === needle || id === needle)
  if (exact) return exact

  // Fall back to the first entry that merely contains the argument.
  const partial = models.find(({ key, id }) =>
    key.includes(needle) || id.includes(needle)
  )
  if (partial) return partial

  // TODO: print the list of models as part of this error, not just the help.
  throw new ValidationError(
    `Model '${modelArg}' not found. Use the models command to list models.`,
  )
}

/** Token prices on a model are quoted per this many tokens. */
const M = 1_000_000

/**
 * Compute the cost of a request from its token counts and web searches.
 *
 * Token prices on `model` are per million tokens; `search_cost` is charged
 * once per search. When the model declares a cache price, tokens reported in
 * `tokens.input_cache_hit` are billed at that price and only the rest of
 * `tokens.input` at the regular input price. The subtraction treats cache
 * hits as already included in `tokens.input`.
 *
 * @param model Pricing information
 * @param tokens Token counts for the request
 * @param searches Number of web searches performed (defaults to 0)
 * @returns Total cost in the same unit as the model's prices
 */
export function getCost(model: Model, tokens: TokenCounts, searches = 0): number {
  const { input, output, input_cached, search_cost } = model

  // Compare against undefined rather than relying on truthiness, so that a
  // cache price of 0 (free cache hits) is honored instead of silently falling
  // back to the full input price.
  const hasCachePrice = input_cached !== undefined

  // Without a cache price every input token is billed at the regular price.
  // `|| 0` also covers a missing or NaN hit count.
  const cachedTokens = hasCachePrice ? (tokens.input_cache_hit || 0) : 0
  const cachedPrice = input_cached ?? 0

  const tokenCost = (cachedPrice * cachedTokens) +
    (input * (tokens.input - cachedTokens)) + (output * tokens.output)

  return tokenCost / M + (search_cost ?? 0) * searches
}

/**
 * Base instruction text: response-style rules followed by details about the
 * user's environment. The name suggests it is the base of the system prompt
 * sent to models; how it is consumed is outside this file.
 *
 * The template is evaluated once, when this module is loaded, so the embedded
 * date is fixed at that moment. `toISOString()` reports UTC, so near midnight
 * the date can differ from the local calendar date.
 *
 * NOTE(review): `$.dedent` presumably strips the common leading indentation;
 * confirm against the dax documentation.
 */
export const systemBase = $.dedent`
  - Answer the question precisely, without much elaboration
  - Write natural prose for a sophisticated reader, without unnecessary bullets or headings
  - Avoid referring to yourself in the first person. You are a computer program, not a person.
  - When asked to write code, primarily output code, with minimal explanation unless requested
  - When given code to modify, prefer diff output rather than rewriting the full input unless the input is short
  - Your answers MUST be in markdown format
  - Put code within a triple-backtick fence block with a language key (like \`\`\`rust)
  - Never put markdown prose (or bullets or whatever) in a fenced code block
  - When the answer is based on search, include citations directly in the response text when relevant

  Tailor answers to the user:
  - OS: macOS
  - Terminal: Ghostty
  - Text editor: Helix
  - Shell: zsh
  - Programming languages: TypeScript and Rust
  - Today's date is ${new Date().toISOString().slice(0, 10)}
`