@@ -19,7 +19,6 @@ use std::collections::HashMap;
1919use std:: collections:: HashSet ;
2020use std:: sync:: Arc ;
2121
22- use databend_common_base:: runtime:: GLOBAL_MEM_STAT ;
2322use databend_common_base:: runtime:: execute_futures_in_parallel;
2423use databend_common_catalog:: plan:: ReclusterParts ;
2524use databend_common_catalog:: plan:: ReclusterTask ;
@@ -237,16 +236,11 @@ impl ReclusterMutator {
237236 blocks_map. entry ( stats. level ) . or_default ( ) . push ( idx) ;
238237 }
239238
240- // Compute memory threshold and maximum number of blocks allowed for reclustering.
239+ // Use the configured recluster budget as a stable scheduling target. Runtime
240+ // memory usage is intentionally not folded in here because sort spill can
241+ // absorb pressure and the available memory snapshot changes during execution.
241242 let settings = self . ctx . get_settings ( ) ;
242- let avail_memory_usage =
243- settings. get_max_memory_usage ( ) ? - GLOBAL_MEM_STAT . get_memory_usage ( ) as u64 ;
244- let memory_threshold = settings
245- . get_recluster_block_size ( ) ?
246- . min ( avail_memory_usage * 30 / 100 ) as usize ;
247- // specify a rather small value, so that `recluster_block_size` might be tuned to lower value.
248- let max_blocks_num =
249- ( memory_threshold / self . block_thresholds . max_bytes_per_block ) . max ( 2 ) * self . max_tasks ;
243+ let memory_threshold = settings. get_recluster_block_size ( ) ? as usize ;
250244 let block_per_seg = self . block_thresholds . block_per_segment ;
251245
252246 // Prepare task generation parameters
@@ -327,7 +321,12 @@ impl ReclusterMutator {
327321 break ;
328322 }
329323
330- // Select blocks for reclustering based on depth threshold and max block size
324+ let max_blocks_num_per_node =
325+ self . max_blocks_num_per_node ( total_bytes as usize , block_count, memory_threshold) ;
326+ let max_blocks_num = max_blocks_num_per_node * self . max_tasks ;
327+ // Fetch enough candidates for all workers. The per-node quota is based
328+ // on the observed block size in selected segments instead of the worst
329+ // allowed block size, otherwise distributed recluster can under-select.
331330 let mut selected_idx =
332331 self . fetch_max_depth ( points_map, self . depth_threshold , max_blocks_num) ?;
333332 if selected_idx. is_empty ( ) {
@@ -337,19 +336,60 @@ impl ReclusterMutator {
337336 selected_idx = IndexSet :: from_iter ( small_blocks) ;
338337 }
339338
340- // Process selected blocks into recluster tasks based on memory threshold
341- let mut task_bytes = 0 ;
339+ let max_total_bytes = memory_threshold. saturating_mul ( self . max_tasks ) ;
340+ // Keep the first, highest-depth blocks within the total execution budget.
341+ // This is a second-stage guard after candidate selection: the average
342+ // block size is only an estimate, while task generation uses real bytes.
343+ let selected_total_bytes =
344+ Self :: limit_selected_blocks_by_budget ( & mut selected_idx, & blocks, max_total_bytes) ;
345+ let selected_block_count = selected_idx. len ( ) ;
346+ if selected_block_count < 2 {
347+ continue ;
348+ }
349+
350+ let min_blocks_per_task = ( max_blocks_num_per_node / 2 ) . max ( 2 ) ;
351+ let target_tasks_by_blocks = selected_block_count / min_blocks_per_task;
352+ let target_tasks_by_memory = selected_total_bytes. div_ceil ( memory_threshold) ;
353+ // A recluster task needs at least two blocks, so this caps parallelism
354+ // when the selected candidate set is too small for every worker.
355+ let max_tasks_by_blocks = selected_block_count / 2 ;
356+ let target_tasks = target_tasks_by_blocks
357+ . max ( target_tasks_by_memory)
358+ . max ( 1 )
359+ . min ( self . max_tasks )
360+ . min ( max_tasks_by_blocks) ;
361+ let target_task_bytes = selected_total_bytes. div_ceil ( target_tasks) ;
362+ let target_task_blocks = selected_block_count. div_ceil ( target_tasks) ;
363+
364+ // Process selected blocks into recluster tasks based on memory and parallelism targets.
365+ let mut task_bytes = 0usize ;
342366 let mut task_rows = 0 ;
343367 let mut task_compressed = 0 ;
344368 let mut task_indices = Vec :: new ( ) ;
345369 let mut selected_blocks = Vec :: new ( ) ;
346- for idx in selected_idx {
370+ for ( processed_blocks , idx) in selected_idx. into_iter ( ) . enumerate ( ) {
347371 let block = blocks[ idx] . 1 . clone ( ) ;
348372 let block_size = block. block_size as usize ;
349373 let row_count = block. row_count as usize ;
350374
351- // If memory threshold exceeded, generate a new task and reset accumulators
352- if task_bytes + block_size > memory_threshold && selected_blocks. len ( ) > 1 {
375+ let remaining_tasks = target_tasks. saturating_sub ( tasks. len ( ) + 1 ) ;
376+ let remaining_blocks = selected_block_count. saturating_sub ( processed_blocks) ;
377+ // Only split for parallelism when the remaining blocks can still
378+ // satisfy the minimum task size for the tasks left to create.
379+ let has_enough_remaining_blocks =
380+ remaining_blocks >= remaining_tasks * min_blocks_per_task;
381+ let should_split_for_memory = task_bytes. saturating_add ( block_size)
382+ > memory_threshold
383+ && selected_blocks. len ( ) > 1 ;
384+ // Memory split is the hard safety guard. Parallel split is a load
385+ // balancing target and is allowed only while preserving task size.
386+ let should_split_for_parallelism = tasks. len ( ) + 1 < target_tasks
387+ && selected_blocks. len ( ) >= min_blocks_per_task
388+ && has_enough_remaining_blocks
389+ && ( task_bytes >= target_task_bytes
390+ || selected_blocks. len ( ) >= target_task_blocks) ;
391+
392+ if should_split_for_memory || should_split_for_parallelism {
353393 selected_blocks_idx. extend ( std:: mem:: take ( & mut task_indices) ) ;
354394
355395 tasks. push ( self . generate_task (
@@ -489,6 +529,41 @@ impl ReclusterMutator {
489529 }
490530 }
491531
532+ fn max_blocks_num_per_node (
533+ & self ,
534+ total_bytes : usize ,
535+ block_count : usize ,
536+ memory_threshold : usize ,
537+ ) -> usize {
538+ let avg_block_bytes = ( total_bytes / block_count) . max ( 1 ) ;
539+ // Clamp the observed average to normal block thresholds so tiny fragments
540+ // do not inflate the candidate count and unusually large blocks do not
541+ // make the distributed selection overly conservative.
542+ let target_block_bytes = avg_block_bytes
543+ . max ( self . block_thresholds . min_bytes_per_block )
544+ . min ( self . block_thresholds . max_bytes_per_block ) ;
545+ ( memory_threshold / target_block_bytes) . max ( 2 )
546+ }
547+
548+ fn limit_selected_blocks_by_budget (
549+ selected_idx : & mut IndexSet < usize > ,
550+ blocks : & [ ( BlockIndex , Arc < BlockMeta > ) ] ,
551+ max_total_bytes : usize ,
552+ ) -> usize {
553+ let mut total_bytes = 0usize ;
554+ let mut keep_blocks = 0 ;
555+ for idx in selected_idx. iter ( ) . copied ( ) {
556+ let block_size = blocks[ idx] . 1 . block_size as usize ;
557+ if keep_blocks >= 2 && total_bytes. saturating_add ( block_size) > max_total_bytes {
558+ break ;
559+ }
560+ total_bytes = total_bytes. saturating_add ( block_size) ;
561+ keep_blocks += 1 ;
562+ }
563+ selected_idx. truncate ( keep_blocks) ;
564+ total_bytes
565+ }
566+
492567 pub fn select_segments (
493568 & self ,
494569 compact_segments : & [ ( SegmentLocation , Arc < CompactSegmentInfo > ) ] ,
0 commit comments