cube-js
diff --git a/rust/cubestore/cubestore/src/cluster/ingestion/job_processor.rs b/rust/cubestore/cubestore/src/cluster/ingestion/job_processor.rs
Lines changed: 24 additions & 0 deletions
diff --git a/rust/cubestore/cubestore/src/cluster/ingestion/job_runner.rs b/rust/cubestore/cubestore/src/cluster/ingestion/job_runner.rs
Lines changed: 41 additions & 0 deletions
diff --git a/rust/cubestore/cubestore/src/cluster/mod.rs b/rust/cubestore/cubestore/src/cluster/mod.rs
Lines changed: 69 additions & 12 deletions
diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
Lines changed: 102 additions & 10 deletions
diff --git a/rust/cubestore/cubestore/src/metastore/job.rs b/rust/cubestore/cubestore/src/metastore/job.rs
Lines changed: 8 additions & 0 deletions
@@ -260,6 +260,30 @@ impl JobIsolatedProcessor {
                     Self::fail_job_row_key(job)
                 }
             }
+            JobType::RepartitionRange(end_chunk_id) => {
+                if let RowKey::Table(TableId::Chunks, start_chunk_id) = job.row_reference() {
+                    let start_chunk_id = *start_chunk_id;
+                    let end_chunk_id = *end_chunk_id;
+                    let data_loaded_size = DataLoadedSize::new();
+                    app_metrics::JOBS_REPARTITION_CHUNK.add(1);
+                    let r = self
+                        .chunk_store
+                        .repartition_chunk_range(
+                            start_chunk_id,
+                            end_chunk_id,
+                            data_loaded_size.clone(),
+                        )
+                        .await;
+                    if let Err(e) = r {
+                        app_metrics::JOBS_REPARTITION_CHUNK_FAILURES.add(1);
+                        return Err(e);
+                    }
+                    app_metrics::JOBS_REPARTITION_CHUNK_COMPLETED.add(1);
+                    Ok(JobProcessResult::new(data_loaded_size.get()))
+                } else {
+                    Self::fail_job_row_key(job)
+                }
+            }
             _ => Err(CubeError::internal(format!(
                 "Job {:?} cannot be processed in separate process",
                 job.job_type()
 
@@ -397,6 +397,47 @@ impl JobRunner {
                     Self::fail_job_row_key(job)
                 }
             }
+            JobType::RepartitionRange(_) => {
+                if let RowKey::Table(TableId::Chunks, start_chunk_id) = job.row_reference() {
+                    let start_chunk_id = *start_chunk_id;
+                    let process_rate_limiter = self.process_rate_limiter.clone();
+                    let timeout = Some(Duration::from_secs(self.config_obj.import_job_timeout()));
+                    let metastore = self.meta_store.clone();
+                    let job_to_move = job.clone();
+                    let job_processor = self.job_processor.clone();
+                    Ok(cube_ext::spawn(async move {
+                        let wait_ms = process_rate_limiter
+                            .wait_for_allow(TaskType::Job, timeout)
+                            .await?;
+                        let chunk = metastore.get_chunk(start_chunk_id).await?;
+                        let (_, _, table, _) = metastore
+                            .get_partition_for_compaction(chunk.get_row().get_partition_id())
+                            .await?;
+                        let table_id = table.get_id();
+                        let trace_obj = metastore.get_trace_obj_by_table_id(table_id).await?;
+                        let trace_index = TraceIndex {
+                            table_id: Some(table_id),
+                            trace_obj,
+                        };
+                        match job_processor.process_job(job_to_move).await {
+                            Ok(job_res) => {
+                                process_rate_limiter
+                                    .commit_task_usage(
+                                        TaskType::Job,
+                                        job_res.data_loaded_size() as i64,
+                                        wait_ms,
+                                        trace_index,
+                                    )
+                                    .await;
+                                Ok(())
+                            }
+                            Err(e) => Err(e),
+                        }
+                    }))
+                } else {
+                    Self::fail_job_row_key(job)
+                }
+            }
         }
     }
 
 
@@ -26,7 +26,7 @@ use crate::cluster::transport::{ClusterTransport, MetaStoreTransport, WorkerConn
 use crate::config::injection::{DIService, Injector};
 use crate::config::is_router;
 #[allow(unused_imports)]
-use crate::config::{Config, ConfigObj};
+use crate::config::{Config, ConfigObj, RepartitionStrategy};
 use crate::metastore::chunks::chunk_file_name;
 use crate::metastore::job::{Job, JobRunnerPool, JobStatus, JobType};
 use crate::metastore::{
@@ -945,17 +945,74 @@ impl Cluster for ClusterImpl {
             .filter(|c| !c.get_row().in_memory())
             .collect::<Vec<_>>();
 
-        if self.config_obj.batch_repartition_enabled() {
-            // FIXME: one job per partition that batches all persisted chunks, but
-            // keyed on a chunk (RepartitionChunk), not the partition. We reuse the
-            // existing job type instead of a dedicated per-partition JobType so an
-            // older binary stays able to deserialize it across `latest`/`release`
-            // channel switches (a new variant would make its whole job shard
-            // unreadable). The anchor is the smallest persisted chunk id so add_job
-            // dedups to a single job per partition; the worker resolves the
-            // partition from it and processes the anchor last (see
-            // repartition_partition_chunks). An old binary just repartitions this
-            // one chunk and drains the rest via its own per-chunk path.
+        if self.config_obj.repartition_strategy() == RepartitionStrategy::Range {
+            // Slice the parent's persisted chunks into RepartitionRange jobs. Walk ALL
+            // chunks (active and inactive) sorted by id so the [start, end] boundaries
+            // stay pinned to chunk ids and don't shift when chunks deactivate; cut a
+            // range once its rows reach max_rows or its chunk count reaches the fan-in
+            // cap, so a range never merges an unbounded number of chunks at once. A
+            // range is only scheduled when it still has an active chunk; the end is
+            // carried as the job's data, not its dedup key, so a tail that extends the
+            // trailing range dedups on the start.
+            let max_rows = self.config_obj.repartition_merge_max_rows();
+            let max_files = self.config_obj.repartition_merge_max_input_files();
+            let mut all = self
+                .meta_store
+                .get_chunks_by_partition(p.get_id(), true)
+                .await?
+                .into_iter()
+                .filter(|c| !c.get_row().in_memory())
+                .collect::<Vec<_>>();
+            all.sort_by_key(|c| c.get_id());
+
+            let mut i = 0;
+            while i < all.len() {
+                let start = all[i].get_id();
+                let mut rows = 0u64;
+                let mut count = 0usize;
+                let mut end = start;
+                let mut has_active = false;
+                while i < all.len() {
+                    let c = &all[i];
+                    rows += c.get_row().get_row_count();
+                    count += 1;
+                    end = c.get_id();
+                    has_active |= c.get_row().active();
+                    i += 1;
+                    if rows >= max_rows || count >= max_files {
+                        break;
+                    }
+                }
+                if has_active {
+                    let node =
+                        pick_worker_by_ids(self.config_obj.as_ref(), [start, end]).to_string();
+                    let job = self
+                        .meta_store
+                        .add_job(Job::new(
+                            RowKey::Table(TableId::Chunks, start),
+                            JobType::RepartitionRange(end),
+                            node.clone(),
+                        ))
+                        .await?;
+                    if job.is_some() {
+                        self.notify_job_runner(node).await?;
+                    }
+                }
+            }
+        } else if self.config_obj.repartition_strategy() == RepartitionStrategy::PerPartition
+            || self.config_obj.batch_repartition_enabled()
+        {
+            // One job per partition that batches all persisted chunks, but keyed on a
+            // chunk (RepartitionChunk), not the partition. We reuse the existing job
+            // type instead of a dedicated per-partition JobType so an older binary stays
+            // able to deserialize it across `latest`/`release` channel switches (a new
+            // variant would make its whole job shard unreadable). The anchor is the
+            // smallest persisted chunk id so add_job dedups to a single job per
+            // partition; the worker resolves the partition from it and processes the
+            // anchor last (see repartition_partition_chunks). The PerPartition strategy
+            // requires this single-job form: a per-chunk job under it would re-merge the
+            // whole partition. An old binary just repartitions this one chunk and drains
+            // the rest via its own per-chunk path.
             if let Some(anchor_chunk_id) = chunks.iter().map(|c| c.get_id()).min() {
                 let node = self.node_name_by_partition(p);
                 let job = self
 
@@ -361,6 +361,37 @@ pub struct Config {
     injector: Arc<Injector>,
 }
 
+/// How an inactive parent's persisted chunks are repartitioned into its children.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum RepartitionStrategy {
+    /// One chunk at a time, each split independently into the children (the default).
+    PerChunk,
+    /// One job per partition that k-way merges all its chunks in groups and splits the
+    /// merged stream into the children at the wal-split limit.
+    PerPartition,
+    /// Many jobs sliced at schedule time, each merging an inclusive `[start, end]`
+    /// chunk-id range; the worker for a job is picked from its range bounds.
+    Range,
+}
+
+impl FromStr for RepartitionStrategy {
+    type Err = String;
+
+    /// Parses a strategy name case-insensitively, ignoring surrounding whitespace and
+    /// accepting `_`, `-` or no separator; `Err` carries a message quoting the input.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.trim().to_lowercase().as_str() {
+            "per_chunk" | "perchunk" | "per-chunk" => Ok(Self::PerChunk),
+            "per_partition" | "perpartition" | "per-partition" => Ok(Self::PerPartition),
+            "range" => Ok(Self::Range),
+            _ => Err(format!(
+                "unknown repartition strategy '{}' (expected per_chunk, per_partition or range)",
+                s
+            )),
+        }
+    }
+}
+
 #[automock]
 pub trait ConfigObj: DIService {
     fn partition_split_threshold(&self) -> u64;
@@ -525,12 +556,17 @@ pub trait ConfigObj: DIService {
     /// `Some(0)` disables prefetching.
     fn repartition_prefetch_budget_bytes(&self) -> Option<u64>;
 
-    /// Max number of persisted input chunks merged in one group when repartitioning
-    /// an inactive parent. `Some(m >= 2)` streams the parent's chunks through a k-way
-    /// merge in groups of up to m and splits each group into the active children at
-    /// the wal-split limit, capping each merge+swap group at m chunks. `None` /
-    /// `Some(0)` / `Some(1)` disable the merge path.
-    fn repartition_merge_max_input_files(&self) -> Option<usize>;
+    /// Which repartition strategy to use for an inactive parent's persisted chunks.
+    /// Defaults to PerChunk.
+    fn repartition_strategy(&self) -> RepartitionStrategy;
+
+    /// Cap on the number of chunks merged together in one Merge group / RepartitionRange.
+    /// Bounds the concurrent parquet readers, the k-way merge width, the swap size and
+    /// (since a range downloads its chunks sequentially) the per-job download time.
+    fn repartition_merge_max_input_files(&self) -> usize;
+
+    /// Cap on the total rows merged together in one Merge group / RepartitionRange.
+    fn repartition_merge_max_rows(&self) -> u64;
 
     fn allow_decimal128(&self) -> bool;
 
@@ -681,7 +717,9 @@ pub struct ConfigObjImpl {
     pub batch_repartition_enabled: bool,
     pub repartition_chunks_time_budget_secs: u64,
     pub repartition_prefetch_budget_bytes: Option<u64>,
-    pub repartition_merge_max_input_files: Option<usize>,
+    pub repartition_strategy: RepartitionStrategy,
+    pub repartition_merge_max_input_files: usize,
+    pub repartition_merge_max_rows: u64,
     pub allow_decimal128: bool,
     pub enable_remove_orphaned_remote_files: bool,
     pub enable_startup_warmup: bool,
@@ -1003,9 +1041,15 @@ impl ConfigObj for ConfigObjImpl {
     fn repartition_prefetch_budget_bytes(&self) -> Option<u64> {
         self.repartition_prefetch_budget_bytes
     }
-    fn repartition_merge_max_input_files(&self) -> Option<usize> {
+    fn repartition_strategy(&self) -> RepartitionStrategy {
+        self.repartition_strategy
+    }
+    fn repartition_merge_max_input_files(&self) -> usize {
         self.repartition_merge_max_input_files
     }
+    fn repartition_merge_max_rows(&self) -> u64 {
+        self.repartition_merge_max_rows
+    }
 
     fn allow_decimal128(&self) -> bool {
         self.allow_decimal128
@@ -1308,6 +1352,24 @@ where
     })
 }
 
+// Reads CUBESTORE_REPARTITION_STRATEGY. An unset variable yields per_chunk silently.
+// A value that fails to parse is deliberately non-fatal (unlike env_optparse): it is
+// logged as a warning and per_chunk is used, so a typo never takes the process down.
+fn env_repartition_strategy() -> RepartitionStrategy {
+    let raw = match env::var("CUBESTORE_REPARTITION_STRATEGY") {
+        Ok(raw) => raw,
+        // Unset (or unreadable) variable: use the default without logging.
+        Err(_) => return RepartitionStrategy::PerChunk,
+    };
+    raw.parse::<RepartitionStrategy>().unwrap_or_else(|e| {
+        log::warn!(
+            "Ignoring CUBESTORE_REPARTITION_STRATEGY: {}; using per_chunk",
+            e
+        );
+        RepartitionStrategy::PerChunk
+    })
+}
+
 impl Config {
     fn calculate_cache_compaction_trigger_size(cache_max_size: usize) -> usize {
         let trigger_size = match cache_max_size >> 20 {
@@ -1622,8 +1684,14 @@ impl Config {
                 repartition_prefetch_budget_bytes: env_optparse_size(
                     "CUBESTORE_REPARTITION_PREFETCH_BUDGET",
                 ),
-                repartition_merge_max_input_files: env_optparse(
+                repartition_strategy: env_repartition_strategy(),
+                repartition_merge_max_input_files: env_parse(
                     "CUBESTORE_REPARTITION_MERGE_MAX_INPUT_FILES",
+                    50,
+                ),
+                repartition_merge_max_rows: env_parse(
+                    "CUBESTORE_REPARTITION_MERGE_MAX_ROWS",
+                    4_000_000,
                 ),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
@@ -1866,7 +1934,9 @@ impl Config {
                 batch_repartition_enabled: true,
                 repartition_chunks_time_budget_secs: 60,
                 repartition_prefetch_budget_bytes: None,
-                repartition_merge_max_input_files: None,
+                repartition_strategy: RepartitionStrategy::PerChunk,
+                repartition_merge_max_input_files: 50,
+                repartition_merge_max_rows: 4_000_000,
                 allow_decimal128: false,
                 enable_remove_orphaned_remote_files: false,
                 enable_startup_warmup: true,
@@ -2728,3 +2798,25 @@ pub async fn uses_remote_metastore(i: &Injector) -> bool {
 pub fn is_router(c: &dyn ConfigObj) -> bool {
     !c.worker_bind_address().is_some()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Every accepted spelling parses to its variant regardless of case; others fail.
+    #[test]
+    fn repartition_strategy_from_str() {
+        for &(input, expected) in &[
+            ("per_chunk", RepartitionStrategy::PerChunk),
+            ("perchunk", RepartitionStrategy::PerChunk),
+            ("per-chunk", RepartitionStrategy::PerChunk),
+            ("per_partition", RepartitionStrategy::PerPartition),
+            ("perpartition", RepartitionStrategy::PerPartition),
+            ("Per-Partition", RepartitionStrategy::PerPartition),
+            ("RANGE", RepartitionStrategy::Range),
+        ] {
+            assert_eq!(input.parse::<RepartitionStrategy>(), Ok(expected), "{}", input);
+        }
+        assert!("nonsense".parse::<RepartitionStrategy>().is_err());
+    }
+}
@@ -21,6 +21,12 @@ pub enum JobType {
     RepartitionChunk,
     InMemoryChunksCompaction,
     NodeInMemoryChunksCompaction(/*node*/ String),
+    // Repartition an inclusive [start, end] chunk-id range of an inactive parent in
+    // one merge+swap. row_reference carries the start chunk; the end is data only and
+    // is deliberately excluded from the job index key (see key_to_bytes) so a tail
+    // that extends the trailing range dedups on the start instead of spawning a
+    // second job for the same start.
+    RepartitionRange(/*end_chunk_id*/ u64),
 }
 
 fn get_job_type_index(j: &JobType) -> u32 {
@@ -35,6 +41,7 @@ fn get_job_type_index(j: &JobType) -> u32 {
         JobType::RepartitionChunk => 8,
         JobType::InMemoryChunksCompaction => 9,
         JobType::NodeInMemoryChunksCompaction(_) => 10,
+        JobType::RepartitionRange(_) => 11,
     }
 }
 
@@ -49,6 +56,7 @@ fn get_job_type_priority(j: &JobType) -> u32 {
         JobType::MultiPartitionSplit => 1000,
         JobType::FinishMultiSplit => 1000,
         JobType::RepartitionChunk => 1000,
+        JobType::RepartitionRange(_) => 1000,
         JobType::InMemoryChunksCompaction => 10000,
         JobType::NodeInMemoryChunksCompaction(_) => 10000,
     }