@@ -26,7 +26,7 @@ use crate::cluster::transport::{ClusterTransport, MetaStoreTransport, WorkerConn
2626use crate :: config:: injection:: { DIService , Injector } ;
2727use crate :: config:: is_router;
2828#[ allow( unused_imports) ]
29- use crate :: config:: { Config , ConfigObj } ;
29+ use crate :: config:: { Config , ConfigObj , RepartitionStrategy } ;
3030use crate :: metastore:: chunks:: chunk_file_name;
3131use crate :: metastore:: job:: { Job , JobRunnerPool , JobStatus , JobType } ;
3232use crate :: metastore:: {
@@ -945,17 +945,71 @@ impl Cluster for ClusterImpl {
945945 . filter ( |c| !c. get_row ( ) . in_memory ( ) )
946946 . collect :: < Vec < _ > > ( ) ;
947947
948- if self . config_obj . batch_repartition_enabled ( ) {
949- // FIXME: one job per partition that batches all persisted chunks, but
950- // keyed on a chunk (RepartitionChunk), not the partition. We reuse the
951- // existing job type instead of a dedicated per-partition JobType so an
952- // older binary stays able to deserialize it across `latest`/`release`
953- // channel switches (a new variant would make its whole job shard
954- // unreadable). The anchor is the smallest persisted chunk id so add_job
955- // dedups to a single job per partition; the worker resolves the
956- // partition from it and processes the anchor last (see
957- // repartition_partition_chunks). An old binary just repartitions this
958- // one chunk and drains the rest via its own per-chunk path.
948+ if self . config_obj . repartition_strategy ( ) == RepartitionStrategy :: Range {
949+ // Slice the parent's persisted chunks into RepartitionRange jobs. Walk ALL
950+ // chunks (active and inactive) sorted by id so the [start, end] boundaries
951+ // stay pinned to chunk ids and don't shift when chunks deactivate; cut a
952+ // range once its rows reach max_rows or its chunk count reaches the fan-in
953+ // cap, so a range never merges an unbounded number of chunks at once. A
954+ // range is only scheduled when it still has an active chunk; the end is
955+ // carried as the job's data, not its dedup key, so a tail that extends the
956+ // trailing range dedups on the start.
957+ // Clamp to >= 1 so a misconfigured 0 cap is treated as 1. The inner loop always adds
958+ // a chunk before it checks the caps, so every range makes progress; a cap of 1 still
959+ // degrades to one chunk per range (no merge gain), so the sane range is >= 2.
960+ let max_rows = self . config_obj . repartition_merge_max_rows ( ) . max ( 1 ) ;
961+ let max_files = self . config_obj . repartition_merge_max_input_files ( ) . max ( 1 ) ;
962+ let mut all = self
963+ . meta_store
964+ . get_chunks_by_partition ( p. get_id ( ) , true )
965+ . await ?
966+ . into_iter ( )
967+ . filter ( |c| !c. get_row ( ) . in_memory ( ) )
968+ . collect :: < Vec < _ > > ( ) ;
969+ all. sort_by_key ( |c| c. get_id ( ) ) ;
970+
971+ let mut i = 0 ;
972+ while i < all. len ( ) {
973+ let start = all[ i] . get_id ( ) ;
974+ let mut rows = 0u64 ;
975+ let mut count = 0usize ;
976+ let mut end = start;
977+ let mut has_active = false ;
978+ while i < all. len ( ) {
979+ let c = & all[ i] ;
980+ rows += c. get_row ( ) . get_row_count ( ) ;
981+ count += 1 ;
982+ end = c. get_id ( ) ;
983+ has_active |= c. get_row ( ) . active ( ) ;
984+ i += 1 ;
985+ if rows >= max_rows || count >= max_files {
986+ break ;
987+ }
988+ }
989+ if has_active {
990+ let node =
991+ pick_worker_by_ids ( self . config_obj . as_ref ( ) , [ start, end] ) . to_string ( ) ;
992+ let job = self
993+ . meta_store
994+ . add_job ( Job :: new (
995+ RowKey :: Table ( TableId :: Chunks , start) ,
996+ JobType :: RepartitionRange ( end) ,
997+ node. clone ( ) ,
998+ ) )
999+ . await ?;
1000+ if job. is_some ( ) {
1001+ self . notify_job_runner ( node) . await ?;
1002+ }
1003+ }
1004+ }
1005+ } else if self . config_obj . repartition_strategy ( ) == RepartitionStrategy :: PerPartition {
1006+ // One job per partition, keyed on a chunk (RepartitionChunk) rather than a
1007+ // dedicated per-partition JobType so an older binary can still deserialize it
1008+ // across `latest`/`release` channel switches. The anchor is the smallest
1009+ // persisted chunk id so add_job dedups to a single job per partition; the
1010+ // worker resolves the partition from it and merges all its chunks (see
1011+ // repartition_partition_chunks). An old binary just repartitions this one
1012+ // chunk and drains the rest via its own per-chunk path.
9591013 if let Some ( anchor_chunk_id) = chunks. iter ( ) . map ( |c| c. get_id ( ) ) . min ( ) {
9601014 let node = self . node_name_by_partition ( p) ;
9611015 let job = self
0 commit comments