@@ -19,11 +19,11 @@ import {
1919 statPeriodKey ,
2020 synthesizeAllTierRows ,
2121 toStatBaseRow ,
22- UPSERT_CHUNK_SIZE ,
2322 type StatBaseAggregate ,
2423} from "./domain/stat"
2524
2625const DAY_MS = 86_400_000
26+ const DEFAULT_UPSERT_CHUNK_SIZE = 100
2727const DEFAULT_TIERS = [ "Go" , "Free" , "Paid" ]
2828const FREE_MODELS = new Set ( [ "gpt-5-nano" , "grok-code" , "big-pickle" ] )
2929
@@ -44,6 +44,7 @@ type ImportOptions = {
4444 directories : string [ ]
4545 dryRun : boolean
4646 periodStart : Date | undefined
47+ upsertChunkSize : number
4748 files : Partial < Record < ImportKey , string [ ] > >
4849}
4950type ModelAggregate = StatBaseAggregate & { provider : string ; model : string ; provider_model : string }
@@ -168,6 +169,7 @@ async function importFiles(args: string[]) {
168169 providerRows : providerRows . length ,
169170 geoRows : geoRows . length ,
170171 dryRun : opts . dryRun ,
172+ upsertChunkSize : opts . upsertChunkSize ,
171173 } ,
172174 null ,
173175 2 ,
@@ -178,9 +180,9 @@ async function importFiles(args: string[]) {
178180 if ( ! opts . databaseUrl ) fail ( "DATABASE_URL is required unless --dry-run is set" )
179181
180182 const db = drizzle ( { client : new Client ( { url : opts . databaseUrl } ) } )
181- await upsertModelRows ( db , modelRows )
182- await upsertProviderRows ( db , providerRows )
183- await upsertGeoRows ( db , geoRows )
183+ await upsertModelRows ( db , modelRows , opts . upsertChunkSize )
184+ await upsertProviderRows ( db , providerRows , opts . upsertChunkSize )
185+ await upsertGeoRows ( db , geoRows , opts . upsertChunkSize )
184186}
185187
186188function buildQueries ( limit : number , tiers : string [ ] ) : QuerySpec [ ] {
@@ -783,125 +785,125 @@ function isRecord(value: unknown): value is Record<string, unknown> {
783785 return typeof value === "object" && value !== null && ! Array . isArray ( value )
784786}
785787
786- async function upsertModelRows ( db : ReturnType < typeof drizzle > , rows : ModelStatRow [ ] ) {
787- await Promise . all (
788- chunks ( rows , UPSERT_CHUNK_SIZE ) . map ( ( chunk ) =>
789- db
790- . insert ( modelStat )
791- . values ( chunk )
792- . onDuplicateKeyUpdate ( {
793- set : {
794- provider_model : inserted ( "provider_model" ) ,
795- sessions : inserted ( "sessions " ) ,
796- requests : inserted ( "requests " ) ,
797- input_tokens : inserted ( "input_tokens " ) ,
798- output_tokens : inserted ( "output_tokens " ) ,
799- reasoning_tokens : inserted ( "reasoning_tokens " ) ,
800- cache_read_tokens : inserted ( "cache_read_tokens " ) ,
801- total_tokens : inserted ( "total_tokens " ) ,
802- input_cost_microcents : inserted ( "input_cost_microcents " ) ,
803- output_cost_microcents : inserted ( "output_cost_microcents " ) ,
804- total_cost_microcents : inserted ( "total_cost_microcents " ) ,
805- avg_duration_ms : inserted ( "avg_duration_ms " ) ,
806- p50_duration_ms : inserted ( "p50_duration_ms " ) ,
807- p95_duration_ms : inserted ( "p95_duration_ms " ) ,
808- avg_ttfb_ms : inserted ( "avg_ttfb_ms " ) ,
809- p50_ttfb_ms : inserted ( "p50_ttfb_ms " ) ,
810- p95_ttfb_ms : inserted ( "p95_ttfb_ms " ) ,
811- avg_output_tps : inserted ( "avg_output_tps " ) ,
812- success_count : inserted ( "success_count " ) ,
813- error_count : inserted ( "error_count " ) ,
814- sample_count : inserted ( "sample_count " ) ,
815- rank_by_tokens : inserted ( "rank_by_tokens " ) ,
816- rank_by_requests : inserted ( "rank_by_requests " ) ,
817- rank_by_cost : inserted ( "rank_by_cost " ) ,
818- } ,
819- } ) ,
820- ) ,
821- )
788+ async function upsertModelRows ( db : ReturnType < typeof drizzle > , rows : ModelStatRow [ ] , chunkSize : number ) {
789+ const batches = chunks ( rows , chunkSize )
790+ console . log ( JSON . stringify ( { table : "model_stat" , batches : batches . length , chunkSize } ) )
791+ for ( const chunk of batches ) {
792+ await db
793+ . insert ( modelStat )
794+ . values ( chunk )
795+ . onDuplicateKeyUpdate ( {
796+ set : {
797+ provider_model : inserted ( "provider_model " ) ,
798+ sessions : inserted ( "sessions " ) ,
799+ requests : inserted ( "requests " ) ,
800+ input_tokens : inserted ( "input_tokens " ) ,
801+ output_tokens : inserted ( "output_tokens " ) ,
802+ reasoning_tokens : inserted ( "reasoning_tokens " ) ,
803+ cache_read_tokens : inserted ( "cache_read_tokens " ) ,
804+ total_tokens : inserted ( "total_tokens " ) ,
805+ input_cost_microcents : inserted ( "input_cost_microcents " ) ,
806+ output_cost_microcents : inserted ( "output_cost_microcents " ) ,
807+ total_cost_microcents : inserted ( "total_cost_microcents " ) ,
808+ avg_duration_ms : inserted ( "avg_duration_ms " ) ,
809+ p50_duration_ms : inserted ( "p50_duration_ms " ) ,
810+ p95_duration_ms : inserted ( "p95_duration_ms " ) ,
811+ avg_ttfb_ms : inserted ( "avg_ttfb_ms " ) ,
812+ p50_ttfb_ms : inserted ( "p50_ttfb_ms " ) ,
813+ p95_ttfb_ms : inserted ( "p95_ttfb_ms " ) ,
814+ avg_output_tps : inserted ( "avg_output_tps " ) ,
815+ success_count : inserted ( "success_count " ) ,
816+ error_count : inserted ( "error_count " ) ,
817+ sample_count : inserted ( "sample_count " ) ,
818+ rank_by_tokens : inserted ( "rank_by_tokens " ) ,
819+ rank_by_requests : inserted ( "rank_by_requests " ) ,
820+ rank_by_cost : inserted ( "rank_by_cost" ) ,
821+ } ,
822+ } )
823+ }
822824}
823825
824- async function upsertProviderRows ( db : ReturnType < typeof drizzle > , rows : ProviderStatRow [ ] ) {
825- await Promise . all (
826- chunks ( rows , UPSERT_CHUNK_SIZE ) . map ( ( chunk ) =>
827- db
828- . insert ( providerStat )
829- . values ( chunk )
830- . onDuplicateKeyUpdate ( {
831- set : {
832- sessions : inserted ( "sessions" ) ,
833- requests : inserted ( "requests " ) ,
834- input_tokens : inserted ( "input_tokens " ) ,
835- output_tokens : inserted ( "output_tokens " ) ,
836- reasoning_tokens : inserted ( "reasoning_tokens " ) ,
837- cache_read_tokens : inserted ( "cache_read_tokens " ) ,
838- total_tokens : inserted ( "total_tokens " ) ,
839- input_cost_microcents : inserted ( "input_cost_microcents " ) ,
840- output_cost_microcents : inserted ( "output_cost_microcents " ) ,
841- total_cost_microcents : inserted ( "total_cost_microcents " ) ,
842- avg_duration_ms : inserted ( "avg_duration_ms " ) ,
843- p50_duration_ms : inserted ( "p50_duration_ms " ) ,
844- p95_duration_ms : inserted ( "p95_duration_ms " ) ,
845- avg_ttfb_ms : inserted ( "avg_ttfb_ms " ) ,
846- p50_ttfb_ms : inserted ( "p50_ttfb_ms " ) ,
847- p95_ttfb_ms : inserted ( "p95_ttfb_ms " ) ,
848- avg_output_tps : inserted ( "avg_output_tps " ) ,
849- success_count : inserted ( "success_count " ) ,
850- error_count : inserted ( "error_count " ) ,
851- sample_count : inserted ( "sample_count " ) ,
852- market_share_tokens : inserted ( "market_share_tokens " ) ,
853- market_share_requests : inserted ( "market_share_requests " ) ,
854- market_share_sessions : inserted ( "market_share_sessions " ) ,
855- rank_by_tokens : inserted ( "rank_by_tokens " ) ,
856- rank_by_requests : inserted ( "rank_by_requests " ) ,
857- rank_by_sessions : inserted ( "rank_by_sessions " ) ,
858- rank_by_cost : inserted ( "rank_by_cost " ) ,
859- } ,
860- } ) ,
861- ) ,
862- )
826+ async function upsertProviderRows ( db : ReturnType < typeof drizzle > , rows : ProviderStatRow [ ] , chunkSize : number ) {
827+ const batches = chunks ( rows , chunkSize )
828+ console . log ( JSON . stringify ( { table : "provider_stat" , batches : batches . length , chunkSize } ) )
829+ for ( const chunk of batches ) {
830+ await db
831+ . insert ( providerStat )
832+ . values ( chunk )
833+ . onDuplicateKeyUpdate ( {
834+ set : {
835+ sessions : inserted ( "sessions " ) ,
836+ requests : inserted ( "requests " ) ,
837+ input_tokens : inserted ( "input_tokens " ) ,
838+ output_tokens : inserted ( "output_tokens " ) ,
839+ reasoning_tokens : inserted ( "reasoning_tokens " ) ,
840+ cache_read_tokens : inserted ( "cache_read_tokens " ) ,
841+ total_tokens : inserted ( "total_tokens " ) ,
842+ input_cost_microcents : inserted ( "input_cost_microcents " ) ,
843+ output_cost_microcents : inserted ( "output_cost_microcents " ) ,
844+ total_cost_microcents : inserted ( "total_cost_microcents " ) ,
845+ avg_duration_ms : inserted ( "avg_duration_ms " ) ,
846+ p50_duration_ms : inserted ( "p50_duration_ms " ) ,
847+ p95_duration_ms : inserted ( "p95_duration_ms " ) ,
848+ avg_ttfb_ms : inserted ( "avg_ttfb_ms " ) ,
849+ p50_ttfb_ms : inserted ( "p50_ttfb_ms " ) ,
850+ p95_ttfb_ms : inserted ( "p95_ttfb_ms " ) ,
851+ avg_output_tps : inserted ( "avg_output_tps " ) ,
852+ success_count : inserted ( "success_count " ) ,
853+ error_count : inserted ( "error_count " ) ,
854+ sample_count : inserted ( "sample_count " ) ,
855+ market_share_tokens : inserted ( "market_share_tokens " ) ,
856+ market_share_requests : inserted ( "market_share_requests " ) ,
857+ market_share_sessions : inserted ( "market_share_sessions " ) ,
858+ rank_by_tokens : inserted ( "rank_by_tokens " ) ,
859+ rank_by_requests : inserted ( "rank_by_requests " ) ,
860+ rank_by_sessions : inserted ( "rank_by_sessions " ) ,
861+ rank_by_cost : inserted ( "rank_by_cost" ) ,
862+ } ,
863+ } )
864+ }
863865}
864866
865- async function upsertGeoRows ( db : ReturnType < typeof drizzle > , rows : GeoStatRow [ ] ) {
866- await Promise . all (
867- chunks ( rows , UPSERT_CHUNK_SIZE ) . map ( ( chunk ) =>
868- db
869- . insert ( geoStat )
870- . values ( chunk )
871- . onDuplicateKeyUpdate ( {
872- set : {
873- continent : inserted ( "continent" ) ,
874- sessions : inserted ( "sessions " ) ,
875- requests : inserted ( "requests " ) ,
876- input_tokens : inserted ( "input_tokens " ) ,
877- output_tokens : inserted ( "output_tokens " ) ,
878- reasoning_tokens : inserted ( "reasoning_tokens " ) ,
879- cache_read_tokens : inserted ( "cache_read_tokens " ) ,
880- total_tokens : inserted ( "total_tokens " ) ,
881- input_cost_microcents : inserted ( "input_cost_microcents " ) ,
882- output_cost_microcents : inserted ( "output_cost_microcents " ) ,
883- total_cost_microcents : inserted ( "total_cost_microcents " ) ,
884- avg_duration_ms : inserted ( "avg_duration_ms " ) ,
885- p50_duration_ms : inserted ( "p50_duration_ms " ) ,
886- p95_duration_ms : inserted ( "p95_duration_ms " ) ,
887- avg_ttfb_ms : inserted ( "avg_ttfb_ms " ) ,
888- p50_ttfb_ms : inserted ( "p50_ttfb_ms " ) ,
889- p95_ttfb_ms : inserted ( "p95_ttfb_ms " ) ,
890- avg_output_tps : inserted ( "avg_output_tps " ) ,
891- success_count : inserted ( "success_count " ) ,
892- error_count : inserted ( "error_count " ) ,
893- sample_count : inserted ( "sample_count " ) ,
894- market_share_tokens : inserted ( "market_share_tokens " ) ,
895- market_share_requests : inserted ( "market_share_requests " ) ,
896- market_share_sessions : inserted ( "market_share_sessions " ) ,
897- rank_by_tokens : inserted ( "rank_by_tokens " ) ,
898- rank_by_requests : inserted ( "rank_by_requests " ) ,
899- rank_by_sessions : inserted ( "rank_by_sessions " ) ,
900- rank_by_cost : inserted ( "rank_by_cost " ) ,
901- } ,
902- } ) ,
903- ) ,
904- )
867+ async function upsertGeoRows ( db : ReturnType < typeof drizzle > , rows : GeoStatRow [ ] , chunkSize : number ) {
868+ const batches = chunks ( rows , chunkSize )
869+ console . log ( JSON . stringify ( { table : "geo_stat" , batches : batches . length , chunkSize } ) )
870+ for ( const chunk of batches ) {
871+ await db
872+ . insert ( geoStat )
873+ . values ( chunk )
874+ . onDuplicateKeyUpdate ( {
875+ set : {
876+ continent : inserted ( "continent " ) ,
877+ sessions : inserted ( "sessions " ) ,
878+ requests : inserted ( "requests " ) ,
879+ input_tokens : inserted ( "input_tokens " ) ,
880+ output_tokens : inserted ( "output_tokens " ) ,
881+ reasoning_tokens : inserted ( "reasoning_tokens " ) ,
882+ cache_read_tokens : inserted ( "cache_read_tokens " ) ,
883+ total_tokens : inserted ( "total_tokens " ) ,
884+ input_cost_microcents : inserted ( "input_cost_microcents " ) ,
885+ output_cost_microcents : inserted ( "output_cost_microcents " ) ,
886+ total_cost_microcents : inserted ( "total_cost_microcents " ) ,
887+ avg_duration_ms : inserted ( "avg_duration_ms " ) ,
888+ p50_duration_ms : inserted ( "p50_duration_ms " ) ,
889+ p95_duration_ms : inserted ( "p95_duration_ms " ) ,
890+ avg_ttfb_ms : inserted ( "avg_ttfb_ms " ) ,
891+ p50_ttfb_ms : inserted ( "p50_ttfb_ms " ) ,
892+ p95_ttfb_ms : inserted ( "p95_ttfb_ms " ) ,
893+ avg_output_tps : inserted ( "avg_output_tps " ) ,
894+ success_count : inserted ( "success_count " ) ,
895+ error_count : inserted ( "error_count " ) ,
896+ sample_count : inserted ( "sample_count " ) ,
897+ market_share_tokens : inserted ( "market_share_tokens " ) ,
898+ market_share_requests : inserted ( "market_share_requests " ) ,
899+ market_share_sessions : inserted ( "market_share_sessions " ) ,
900+ rank_by_tokens : inserted ( "rank_by_tokens " ) ,
901+ rank_by_requests : inserted ( "rank_by_requests " ) ,
902+ rank_by_sessions : inserted ( "rank_by_sessions " ) ,
903+ rank_by_cost : inserted ( "rank_by_cost" ) ,
904+ } ,
905+ } )
906+ }
905907}
906908
907909function parseImportOptions ( args : string [ ] ) : ImportOptions {
@@ -917,6 +919,7 @@ function parseImportOptions(args: string[]): ImportOptions {
917919 directories : flags . get ( "dir" ) ?? flags . get ( "directory" ) ?? [ ] ,
918920 dryRun : flags . has ( "dry-run" ) ,
919921 periodStart : parseDateFlag ( flags , "period-start" ) ,
922+ upsertChunkSize : parseIntegerFlag ( flags , "upsert-chunk-size" ) ?? DEFAULT_UPSERT_CHUNK_SIZE ,
920923 files,
921924 }
922925}
@@ -969,8 +972,8 @@ function parseListFlag(flags: Map<string, string[]>, name: string) {
969972function usage ( ) : never {
970973 fail ( `Usage:
971974 bun src/honeycomb-backfill.ts queries [--tiers Go,Free,Paid] [--limit 1000]
972- bun src/honeycomb-backfill.ts import [--dry-run] [--database-url URL] --dir downloads
973- bun src/honeycomb-backfill.ts import [--dry-run] [--database-url URL] --model-day file.csv [--model-day more.csv] ...` )
975+ bun src/honeycomb-backfill.ts import [--dry-run] [--upsert-chunk-size 100] [-- database-url URL] --dir downloads
976+ bun src/honeycomb-backfill.ts import [--dry-run] [--upsert-chunk-size 100] [-- database-url URL] --model-day file.csv [--model-day more.csv] ...` )
974977}
975978
976979function fail ( message : string ) : never {
0 commit comments