Skip to content

Commit 7504daa

Browse files
committed
fix(stats): batch honeycomb backfill
1 parent 9ac0f3e commit 7504daa

2 files changed

Lines changed: 125 additions & 122 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"dev:desktop": "bun --cwd packages/desktop dev",
1111
"dev:web": "bun --cwd packages/app dev",
1212
"dev:console": "ulimit -n 10240 2>/dev/null; bun run --cwd packages/console/app dev",
13-
"dev:stats": "bun sst shell --stage=production -- bun run --cwd packages/stats/app dev",
13+
"dev:stats": "bun sst shell --stage=dev -- bun run --cwd packages/stats/app dev",
1414
"dev:storybook": "bun --cwd packages/storybook storybook",
1515
"lint": "oxlint",
1616
"typecheck": "bun turbo typecheck",

packages/stats/core/src/honeycomb-backfill.ts

Lines changed: 124 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ import {
1919
statPeriodKey,
2020
synthesizeAllTierRows,
2121
toStatBaseRow,
22-
UPSERT_CHUNK_SIZE,
2322
type StatBaseAggregate,
2423
} from "./domain/stat"
2524

2625
const DAY_MS = 86_400_000
26+
const DEFAULT_UPSERT_CHUNK_SIZE = 100
2727
const DEFAULT_TIERS = ["Go", "Free", "Paid"]
2828
const FREE_MODELS = new Set(["gpt-5-nano", "grok-code", "big-pickle"])
2929

@@ -44,6 +44,7 @@ type ImportOptions = {
4444
directories: string[]
4545
dryRun: boolean
4646
periodStart: Date | undefined
47+
upsertChunkSize: number
4748
files: Partial<Record<ImportKey, string[]>>
4849
}
4950
type ModelAggregate = StatBaseAggregate & { provider: string; model: string; provider_model: string }
@@ -168,6 +169,7 @@ async function importFiles(args: string[]) {
168169
providerRows: providerRows.length,
169170
geoRows: geoRows.length,
170171
dryRun: opts.dryRun,
172+
upsertChunkSize: opts.upsertChunkSize,
171173
},
172174
null,
173175
2,
@@ -178,9 +180,9 @@ async function importFiles(args: string[]) {
178180
if (!opts.databaseUrl) fail("DATABASE_URL is required unless --dry-run is set")
179181

180182
const db = drizzle({ client: new Client({ url: opts.databaseUrl }) })
181-
await upsertModelRows(db, modelRows)
182-
await upsertProviderRows(db, providerRows)
183-
await upsertGeoRows(db, geoRows)
183+
await upsertModelRows(db, modelRows, opts.upsertChunkSize)
184+
await upsertProviderRows(db, providerRows, opts.upsertChunkSize)
185+
await upsertGeoRows(db, geoRows, opts.upsertChunkSize)
184186
}
185187

186188
function buildQueries(limit: number, tiers: string[]): QuerySpec[] {
@@ -783,125 +785,125 @@ function isRecord(value: unknown): value is Record<string, unknown> {
783785
return typeof value === "object" && value !== null && !Array.isArray(value)
784786
}
785787

786-
async function upsertModelRows(db: ReturnType<typeof drizzle>, rows: ModelStatRow[]) {
787-
await Promise.all(
788-
chunks(rows, UPSERT_CHUNK_SIZE).map((chunk) =>
789-
db
790-
.insert(modelStat)
791-
.values(chunk)
792-
.onDuplicateKeyUpdate({
793-
set: {
794-
provider_model: inserted("provider_model"),
795-
sessions: inserted("sessions"),
796-
requests: inserted("requests"),
797-
input_tokens: inserted("input_tokens"),
798-
output_tokens: inserted("output_tokens"),
799-
reasoning_tokens: inserted("reasoning_tokens"),
800-
cache_read_tokens: inserted("cache_read_tokens"),
801-
total_tokens: inserted("total_tokens"),
802-
input_cost_microcents: inserted("input_cost_microcents"),
803-
output_cost_microcents: inserted("output_cost_microcents"),
804-
total_cost_microcents: inserted("total_cost_microcents"),
805-
avg_duration_ms: inserted("avg_duration_ms"),
806-
p50_duration_ms: inserted("p50_duration_ms"),
807-
p95_duration_ms: inserted("p95_duration_ms"),
808-
avg_ttfb_ms: inserted("avg_ttfb_ms"),
809-
p50_ttfb_ms: inserted("p50_ttfb_ms"),
810-
p95_ttfb_ms: inserted("p95_ttfb_ms"),
811-
avg_output_tps: inserted("avg_output_tps"),
812-
success_count: inserted("success_count"),
813-
error_count: inserted("error_count"),
814-
sample_count: inserted("sample_count"),
815-
rank_by_tokens: inserted("rank_by_tokens"),
816-
rank_by_requests: inserted("rank_by_requests"),
817-
rank_by_cost: inserted("rank_by_cost"),
818-
},
819-
}),
820-
),
821-
)
788+
async function upsertModelRows(db: ReturnType<typeof drizzle>, rows: ModelStatRow[], chunkSize: number) {
789+
const batches = chunks(rows, chunkSize)
790+
console.log(JSON.stringify({ table: "model_stat", batches: batches.length, chunkSize }))
791+
for (const chunk of batches) {
792+
await db
793+
.insert(modelStat)
794+
.values(chunk)
795+
.onDuplicateKeyUpdate({
796+
set: {
797+
provider_model: inserted("provider_model"),
798+
sessions: inserted("sessions"),
799+
requests: inserted("requests"),
800+
input_tokens: inserted("input_tokens"),
801+
output_tokens: inserted("output_tokens"),
802+
reasoning_tokens: inserted("reasoning_tokens"),
803+
cache_read_tokens: inserted("cache_read_tokens"),
804+
total_tokens: inserted("total_tokens"),
805+
input_cost_microcents: inserted("input_cost_microcents"),
806+
output_cost_microcents: inserted("output_cost_microcents"),
807+
total_cost_microcents: inserted("total_cost_microcents"),
808+
avg_duration_ms: inserted("avg_duration_ms"),
809+
p50_duration_ms: inserted("p50_duration_ms"),
810+
p95_duration_ms: inserted("p95_duration_ms"),
811+
avg_ttfb_ms: inserted("avg_ttfb_ms"),
812+
p50_ttfb_ms: inserted("p50_ttfb_ms"),
813+
p95_ttfb_ms: inserted("p95_ttfb_ms"),
814+
avg_output_tps: inserted("avg_output_tps"),
815+
success_count: inserted("success_count"),
816+
error_count: inserted("error_count"),
817+
sample_count: inserted("sample_count"),
818+
rank_by_tokens: inserted("rank_by_tokens"),
819+
rank_by_requests: inserted("rank_by_requests"),
820+
rank_by_cost: inserted("rank_by_cost"),
821+
},
822+
})
823+
}
822824
}
823825

824-
async function upsertProviderRows(db: ReturnType<typeof drizzle>, rows: ProviderStatRow[]) {
825-
await Promise.all(
826-
chunks(rows, UPSERT_CHUNK_SIZE).map((chunk) =>
827-
db
828-
.insert(providerStat)
829-
.values(chunk)
830-
.onDuplicateKeyUpdate({
831-
set: {
832-
sessions: inserted("sessions"),
833-
requests: inserted("requests"),
834-
input_tokens: inserted("input_tokens"),
835-
output_tokens: inserted("output_tokens"),
836-
reasoning_tokens: inserted("reasoning_tokens"),
837-
cache_read_tokens: inserted("cache_read_tokens"),
838-
total_tokens: inserted("total_tokens"),
839-
input_cost_microcents: inserted("input_cost_microcents"),
840-
output_cost_microcents: inserted("output_cost_microcents"),
841-
total_cost_microcents: inserted("total_cost_microcents"),
842-
avg_duration_ms: inserted("avg_duration_ms"),
843-
p50_duration_ms: inserted("p50_duration_ms"),
844-
p95_duration_ms: inserted("p95_duration_ms"),
845-
avg_ttfb_ms: inserted("avg_ttfb_ms"),
846-
p50_ttfb_ms: inserted("p50_ttfb_ms"),
847-
p95_ttfb_ms: inserted("p95_ttfb_ms"),
848-
avg_output_tps: inserted("avg_output_tps"),
849-
success_count: inserted("success_count"),
850-
error_count: inserted("error_count"),
851-
sample_count: inserted("sample_count"),
852-
market_share_tokens: inserted("market_share_tokens"),
853-
market_share_requests: inserted("market_share_requests"),
854-
market_share_sessions: inserted("market_share_sessions"),
855-
rank_by_tokens: inserted("rank_by_tokens"),
856-
rank_by_requests: inserted("rank_by_requests"),
857-
rank_by_sessions: inserted("rank_by_sessions"),
858-
rank_by_cost: inserted("rank_by_cost"),
859-
},
860-
}),
861-
),
862-
)
826+
async function upsertProviderRows(db: ReturnType<typeof drizzle>, rows: ProviderStatRow[], chunkSize: number) {
827+
const batches = chunks(rows, chunkSize)
828+
console.log(JSON.stringify({ table: "provider_stat", batches: batches.length, chunkSize }))
829+
for (const chunk of batches) {
830+
await db
831+
.insert(providerStat)
832+
.values(chunk)
833+
.onDuplicateKeyUpdate({
834+
set: {
835+
sessions: inserted("sessions"),
836+
requests: inserted("requests"),
837+
input_tokens: inserted("input_tokens"),
838+
output_tokens: inserted("output_tokens"),
839+
reasoning_tokens: inserted("reasoning_tokens"),
840+
cache_read_tokens: inserted("cache_read_tokens"),
841+
total_tokens: inserted("total_tokens"),
842+
input_cost_microcents: inserted("input_cost_microcents"),
843+
output_cost_microcents: inserted("output_cost_microcents"),
844+
total_cost_microcents: inserted("total_cost_microcents"),
845+
avg_duration_ms: inserted("avg_duration_ms"),
846+
p50_duration_ms: inserted("p50_duration_ms"),
847+
p95_duration_ms: inserted("p95_duration_ms"),
848+
avg_ttfb_ms: inserted("avg_ttfb_ms"),
849+
p50_ttfb_ms: inserted("p50_ttfb_ms"),
850+
p95_ttfb_ms: inserted("p95_ttfb_ms"),
851+
avg_output_tps: inserted("avg_output_tps"),
852+
success_count: inserted("success_count"),
853+
error_count: inserted("error_count"),
854+
sample_count: inserted("sample_count"),
855+
market_share_tokens: inserted("market_share_tokens"),
856+
market_share_requests: inserted("market_share_requests"),
857+
market_share_sessions: inserted("market_share_sessions"),
858+
rank_by_tokens: inserted("rank_by_tokens"),
859+
rank_by_requests: inserted("rank_by_requests"),
860+
rank_by_sessions: inserted("rank_by_sessions"),
861+
rank_by_cost: inserted("rank_by_cost"),
862+
},
863+
})
864+
}
863865
}
864866

865-
async function upsertGeoRows(db: ReturnType<typeof drizzle>, rows: GeoStatRow[]) {
866-
await Promise.all(
867-
chunks(rows, UPSERT_CHUNK_SIZE).map((chunk) =>
868-
db
869-
.insert(geoStat)
870-
.values(chunk)
871-
.onDuplicateKeyUpdate({
872-
set: {
873-
continent: inserted("continent"),
874-
sessions: inserted("sessions"),
875-
requests: inserted("requests"),
876-
input_tokens: inserted("input_tokens"),
877-
output_tokens: inserted("output_tokens"),
878-
reasoning_tokens: inserted("reasoning_tokens"),
879-
cache_read_tokens: inserted("cache_read_tokens"),
880-
total_tokens: inserted("total_tokens"),
881-
input_cost_microcents: inserted("input_cost_microcents"),
882-
output_cost_microcents: inserted("output_cost_microcents"),
883-
total_cost_microcents: inserted("total_cost_microcents"),
884-
avg_duration_ms: inserted("avg_duration_ms"),
885-
p50_duration_ms: inserted("p50_duration_ms"),
886-
p95_duration_ms: inserted("p95_duration_ms"),
887-
avg_ttfb_ms: inserted("avg_ttfb_ms"),
888-
p50_ttfb_ms: inserted("p50_ttfb_ms"),
889-
p95_ttfb_ms: inserted("p95_ttfb_ms"),
890-
avg_output_tps: inserted("avg_output_tps"),
891-
success_count: inserted("success_count"),
892-
error_count: inserted("error_count"),
893-
sample_count: inserted("sample_count"),
894-
market_share_tokens: inserted("market_share_tokens"),
895-
market_share_requests: inserted("market_share_requests"),
896-
market_share_sessions: inserted("market_share_sessions"),
897-
rank_by_tokens: inserted("rank_by_tokens"),
898-
rank_by_requests: inserted("rank_by_requests"),
899-
rank_by_sessions: inserted("rank_by_sessions"),
900-
rank_by_cost: inserted("rank_by_cost"),
901-
},
902-
}),
903-
),
904-
)
867+
async function upsertGeoRows(db: ReturnType<typeof drizzle>, rows: GeoStatRow[], chunkSize: number) {
868+
const batches = chunks(rows, chunkSize)
869+
console.log(JSON.stringify({ table: "geo_stat", batches: batches.length, chunkSize }))
870+
for (const chunk of batches) {
871+
await db
872+
.insert(geoStat)
873+
.values(chunk)
874+
.onDuplicateKeyUpdate({
875+
set: {
876+
continent: inserted("continent"),
877+
sessions: inserted("sessions"),
878+
requests: inserted("requests"),
879+
input_tokens: inserted("input_tokens"),
880+
output_tokens: inserted("output_tokens"),
881+
reasoning_tokens: inserted("reasoning_tokens"),
882+
cache_read_tokens: inserted("cache_read_tokens"),
883+
total_tokens: inserted("total_tokens"),
884+
input_cost_microcents: inserted("input_cost_microcents"),
885+
output_cost_microcents: inserted("output_cost_microcents"),
886+
total_cost_microcents: inserted("total_cost_microcents"),
887+
avg_duration_ms: inserted("avg_duration_ms"),
888+
p50_duration_ms: inserted("p50_duration_ms"),
889+
p95_duration_ms: inserted("p95_duration_ms"),
890+
avg_ttfb_ms: inserted("avg_ttfb_ms"),
891+
p50_ttfb_ms: inserted("p50_ttfb_ms"),
892+
p95_ttfb_ms: inserted("p95_ttfb_ms"),
893+
avg_output_tps: inserted("avg_output_tps"),
894+
success_count: inserted("success_count"),
895+
error_count: inserted("error_count"),
896+
sample_count: inserted("sample_count"),
897+
market_share_tokens: inserted("market_share_tokens"),
898+
market_share_requests: inserted("market_share_requests"),
899+
market_share_sessions: inserted("market_share_sessions"),
900+
rank_by_tokens: inserted("rank_by_tokens"),
901+
rank_by_requests: inserted("rank_by_requests"),
902+
rank_by_sessions: inserted("rank_by_sessions"),
903+
rank_by_cost: inserted("rank_by_cost"),
904+
},
905+
})
906+
}
905907
}
906908

907909
function parseImportOptions(args: string[]): ImportOptions {
@@ -917,6 +919,7 @@ function parseImportOptions(args: string[]): ImportOptions {
917919
directories: flags.get("dir") ?? flags.get("directory") ?? [],
918920
dryRun: flags.has("dry-run"),
919921
periodStart: parseDateFlag(flags, "period-start"),
922+
upsertChunkSize: parseIntegerFlag(flags, "upsert-chunk-size") ?? DEFAULT_UPSERT_CHUNK_SIZE,
920923
files,
921924
}
922925
}
@@ -969,8 +972,8 @@ function parseListFlag(flags: Map<string, string[]>, name: string) {
969972
function usage(): never {
970973
fail(`Usage:
971974
bun src/honeycomb-backfill.ts queries [--tiers Go,Free,Paid] [--limit 1000]
972-
bun src/honeycomb-backfill.ts import [--dry-run] [--database-url URL] --dir downloads
973-
bun src/honeycomb-backfill.ts import [--dry-run] [--database-url URL] --model-day file.csv [--model-day more.csv] ...`)
975+
bun src/honeycomb-backfill.ts import [--dry-run] [--upsert-chunk-size 100] [--database-url URL] --dir downloads
976+
bun src/honeycomb-backfill.ts import [--dry-run] [--upsert-chunk-size 100] [--database-url URL] --model-day file.csv [--model-day more.csv] ...`)
974977
}
975978

976979
function fail(message: string): never {

0 commit comments

Comments
 (0)