@@ -33,6 +33,7 @@ use quickwit_actors::{
3333 Actor , ActorContext , ActorExitStatus , Command , Handler , Mailbox , QueueCapacity ,
3434} ;
3535use quickwit_common:: io:: IoControls ;
36+ use quickwit_common:: metrics:: GaugeGuard ;
3637use quickwit_common:: runtimes:: RuntimeType ;
3738use quickwit_common:: temp_dir:: TempDirectory ;
3839use quickwit_config:: IndexingSettings ;
@@ -151,10 +152,10 @@ impl IndexerState {
151152 other_split_opt : & ' a mut Option < IndexedSplitBuilder > ,
152153 counter : & ' a mut IndexerCounters ,
153154 ctx : & ActorContext < Indexer > ,
154- ) -> anyhow:: Result < & ' a mut IndexedSplitBuilder > {
155+ ) -> anyhow:: Result < ( & ' a mut IndexedSplitBuilder , bool ) > {
155156 let num_splits = splits. len ( ) ;
156157 match splits. entry ( partition_id) {
157- Entry :: Occupied ( indexed_split) => Ok ( indexed_split. into_mut ( ) ) ,
158+ Entry :: Occupied ( indexed_split) => Ok ( ( indexed_split. into_mut ( ) , false ) ) ,
158159 Entry :: Vacant ( vacant_entry) => {
159160 if num_splits as u32 >= self . max_num_partitions . get ( ) {
160161 // In order to avoid exceeding max_num_partitions, we map the document to the
@@ -172,11 +173,11 @@ impl IndexerState {
172173 ) ?;
173174 * other_split_opt = Some ( new_other_split) ;
174175 }
175- Ok ( other_split_opt. as_mut ( ) . unwrap ( ) )
176+ Ok ( ( other_split_opt. as_mut ( ) . unwrap ( ) , true ) )
176177 } else {
177178 let indexed_split =
178179 self . create_indexed_split_builder ( partition_id, last_delete_opstamp, ctx) ?;
179- Ok ( vacant_entry. insert ( indexed_split) )
180+ Ok ( ( vacant_entry. insert ( indexed_split) , true ) )
180181 }
181182 }
182183 }
@@ -236,7 +237,11 @@ impl IndexerState {
236237 publish_lock,
237238 publish_token_opt,
238239 last_delete_opstamp,
239- memory_usage : ByteSize ( 0 ) ,
240+ memory_usage : GaugeGuard :: from_gauge (
241+ & quickwit_common:: metrics:: MEMORY_METRICS
242+ . in_flight_data
243+ . index_writer ,
244+ ) ,
240245 } ;
241246 Ok ( workbench)
242247 }
@@ -294,7 +299,7 @@ impl IndexerState {
294299 . source_delta
295300 . extend ( batch. checkpoint_delta )
296301 . context ( "batch delta does not follow indexer checkpoint" ) ?;
297- let mut memory_usage_delta: u64 = 0 ;
302+ let mut memory_usage_delta: i64 = 0 ;
298303 counters. num_doc_batches_in_workbench += 1 ;
299304 for doc in batch. docs {
300305 let ProcessedDoc {
@@ -304,7 +309,7 @@ impl IndexerState {
304309 num_bytes,
305310 } = doc;
306311 counters. num_docs_in_workbench += 1 ;
307- let indexed_split: & mut IndexedSplitBuilder = self . get_or_create_indexed_split (
312+ let ( indexed_split, split_created ) = self . get_or_create_indexed_split (
308313 partition,
309314 * last_delete_opstamp,
310315 indexed_splits,
@@ -313,6 +318,11 @@ impl IndexerState {
313318 ctx,
314319 ) ?;
315320 let mem_usage_before = indexed_split. index_writer . mem_usage ( ) as u64 ;
321+ if split_created {
322+ // The split was just created. We need to account for the initial index writer's
323+ // memory usage.
324+ memory_usage_delta += mem_usage_before as i64 ;
325+ }
316326 indexed_split. split_attrs . uncompressed_docs_size_in_bytes += num_bytes as u64 ;
317327 indexed_split. split_attrs . num_docs += 1 ;
318328 if let Some ( timestamp) = timestamp_opt {
@@ -324,10 +334,10 @@ impl IndexerState {
324334 . add_document ( doc)
325335 . context ( "failed to add document" ) ?;
326336 let mem_usage_after = indexed_split. index_writer . mem_usage ( ) as u64 ;
327- memory_usage_delta += mem_usage_after - mem_usage_before;
337+ memory_usage_delta += mem_usage_after as i64 - mem_usage_before as i64 ;
328338 ctx. record_progress ( ) ;
329339 }
330- * memory_usage = ByteSize ( memory_usage . as_u64 ( ) + memory_usage_delta) ;
340+ memory_usage. add ( memory_usage_delta) ;
331341 Ok ( ( ) )
332342 }
333343}
@@ -353,7 +363,7 @@ struct IndexingWorkbench {
353363 // We use this value to set the `delete_opstamp` of the workbench splits.
354364 last_delete_opstamp : u64 ,
355365 // Number of bytes declared as used by tantivy.
356- memory_usage : ByteSize ,
366+ memory_usage : GaugeGuard ,
357367}
358368
359369pub struct Indexer {
@@ -570,9 +580,9 @@ impl Indexer {
570580
571581 fn memory_usage ( & self ) -> ByteSize {
572582 if let Some ( workbench) = & self . indexing_workbench_opt {
573- workbench. memory_usage
583+ ByteSize ( workbench. memory_usage . get ( ) as u64 )
574584 } else {
575- ByteSize ( 0 )
585+ ByteSize ( 0u64 )
576586 }
577587 }
578588
@@ -591,7 +601,8 @@ impl Indexer {
591601 ctx,
592602 )
593603 . await ?;
594- if self . memory_usage ( ) >= self . indexer_state . indexing_settings . resources . heap_size {
604+ let memory_usage = self . memory_usage ( ) ;
605+ if memory_usage >= self . indexer_state . indexing_settings . resources . heap_size {
595606 self . send_to_serializer ( CommitTrigger :: MemoryLimit , ctx)
596607 . await ?;
597608 }
@@ -623,6 +634,7 @@ impl Indexer {
623634 publish_token_opt,
624635 batch_parent_span,
625636 indexing_permit,
637+ memory_usage,
626638 ..
627639 } ) = self . indexing_workbench_opt . take ( )
628640 else {
@@ -674,6 +686,7 @@ impl Indexer {
674686 publish_token_opt,
675687 commit_trigger,
676688 batch_parent_span,
689+ memory_usage,
677690 } ,
678691 )
679692 . await ?;
@@ -883,7 +896,7 @@ mod tests {
883896 let body_field = schema. get_field ( "body" ) . unwrap ( ) ;
884897 let indexing_directory = TempDirectory :: for_test ( ) ;
885898 let mut indexing_settings = IndexingSettings :: for_test ( ) ;
886- indexing_settings. resources . heap_size = ByteSize :: mb ( 5 ) ;
899+ indexing_settings. resources . heap_size = ByteSize :: mb ( 16 ) ;
887900 let ( index_serializer_mailbox, index_serializer_inbox) = universe. create_test_mailbox ( ) ;
888901 let mut metastore = MetastoreServiceClient :: mock ( ) ;
889902 metastore. expect_publish_splits ( ) . never ( ) ;
@@ -1214,7 +1227,8 @@ mod tests {
12141227 let body_field = schema. get_field ( "body" ) . unwrap ( ) ;
12151228
12161229 let indexing_directory = TempDirectory :: for_test ( ) ;
1217- let indexing_settings = IndexingSettings :: for_test ( ) ;
1230+ let mut indexing_settings = IndexingSettings :: for_test ( ) ;
1231+ indexing_settings. resources . heap_size = ByteSize :: mb ( 100 ) ;
12181232 let ( index_serializer_mailbox, index_serializer_inbox) = universe. create_test_mailbox ( ) ;
12191233 let mut metastore = MetastoreServiceClient :: mock ( ) ;
12201234 metastore. expect_publish_splits ( ) . never ( ) ;
@@ -1310,7 +1324,8 @@ mod tests {
13101324 Arc :: new ( serde_json:: from_str :: < DefaultDocMapper > ( DOCMAPPER_SIMPLE_JSON ) . unwrap ( ) ) ;
13111325 let body_field = doc_mapper. schema ( ) . get_field ( "body" ) . unwrap ( ) ;
13121326 let indexing_directory = TempDirectory :: for_test ( ) ;
1313- let indexing_settings = IndexingSettings :: for_test ( ) ;
1327+ let mut indexing_settings = IndexingSettings :: for_test ( ) ;
1328+ indexing_settings. resources . heap_size = ByteSize :: gb ( 5 ) ;
13141329 let mut metastore = MetastoreServiceClient :: mock ( ) ;
13151330 metastore
13161331 . expect_last_delete_opstamp ( )
0 commit comments