cube-js
diff --git a/rust/cubestore/cubestore/src/cluster/ingestion/job_processor.rs b/rust/cubestore/cubestore/src/cluster/ingestion/job_processor.rs
Lines changed: 33 additions & 12 deletions
diff --git a/rust/cubestore/cubestore/src/cluster/ingestion/job_runner.rs b/rust/cubestore/cubestore/src/cluster/ingestion/job_runner.rs
Lines changed: 41 additions & 0 deletions
diff --git a/rust/cubestore/cubestore/src/cluster/mod.rs b/rust/cubestore/cubestore/src/cluster/mod.rs
Lines changed: 66 additions & 12 deletions
@@ -1,5 +1,5 @@
 use crate::config::injection::DIService;
-use crate::config::{Config, ConfigObj};
+use crate::config::{Config, ConfigObj, RepartitionStrategy};
 use crate::import::ImportService;
 use crate::metastore::job::{Job, JobType};
 use crate::metastore::table::Table;
@@ -222,17 +222,14 @@ impl JobIsolatedProcessor {
                     }
                     let data_loaded_size = DataLoadedSize::new();
                     app_metrics::JOBS_REPARTITION_CHUNK.add(1);
-                    // FIXME: a RepartitionChunk job whose chunk_id is used as an
-                    // anchor for the whole partition (batch_repartition_enabled).
-                    // We deliberately overload the existing RepartitionChunk type
-                    // instead of introducing a dedicated per-partition JobType:
-                    // a new JobType variant cannot be deserialized by an older
-                    // binary, which would make the whole job shard unreadable when
-                    // a deployment switches between the `latest` and `release`
-                    // channels (version can move both ways at any time). Reusing a
-                    // known type keeps both directions safe — an old binary just
-                    // repartitions the single anchor chunk. See repartition_partition_chunks.
-                    let r = if self.config_obj.batch_repartition_enabled() {
+                    // PerPartition reuses the RepartitionChunk job with the smallest chunk
+                    // as the anchor (one job per partition) and merges all its chunks;
+                    // PerChunk gets one job per chunk and repartitions just that chunk. We
+                    // overload the existing RepartitionChunk type (no new JobType) so an
+                    // older binary stays able to deserialize it across channel switches.
+                    let r = if self.config_obj.repartition_strategy()
+                        == RepartitionStrategy::PerPartition
+                    {
                         let partition_id = chunk.get_row().get_partition_id();
                         let time_budget = Duration::from_secs(
                             self.config_obj.repartition_chunks_time_budget_secs(),
@@ -260,6 +257,30 @@ impl JobIsolatedProcessor {
                     Self::fail_job_row_key(job)
                 }
             }
+            JobType::RepartitionRange(end_chunk_id) => {
+                if let RowKey::Table(TableId::Chunks, start_chunk_id) = job.row_reference() {
+                    let start_chunk_id = *start_chunk_id;
+                    let end_chunk_id = *end_chunk_id;
+                    let data_loaded_size = DataLoadedSize::new();
+                    app_metrics::JOBS_REPARTITION_CHUNK.add(1);
+                    let r = self
+                        .chunk_store
+                        .repartition_chunk_range(
+                            start_chunk_id,
+                            end_chunk_id,
+                            data_loaded_size.clone(),
+                        )
+                        .await;
+                    if let Err(e) = r {
+                        app_metrics::JOBS_REPARTITION_CHUNK_FAILURES.add(1);
+                        return Err(e);
+                    }
+                    app_metrics::JOBS_REPARTITION_CHUNK_COMPLETED.add(1);
+                    Ok(JobProcessResult::new(data_loaded_size.get()))
+                } else {
+                    Self::fail_job_row_key(job)
+                }
+            }
             _ => Err(CubeError::internal(format!(
                 "Job {:?} cannot be processed in separate process",
                 job.job_type()
 
@@ -397,6 +397,47 @@ impl JobRunner {
                     Self::fail_job_row_key(job)
                 }
             }
+            JobType::RepartitionRange(_) => {
+                if let RowKey::Table(TableId::Chunks, start_chunk_id) = job.row_reference() {
+                    let start_chunk_id = *start_chunk_id;
+                    let process_rate_limiter = self.process_rate_limiter.clone();
+                    let timeout = Some(Duration::from_secs(self.config_obj.import_job_timeout()));
+                    let metastore = self.meta_store.clone();
+                    let job_to_move = job.clone();
+                    let job_processor = self.job_processor.clone();
+                    Ok(cube_ext::spawn(async move {
+                        let wait_ms = process_rate_limiter
+                            .wait_for_allow(TaskType::Job, timeout)
+                            .await?;
+                        let chunk = metastore.get_chunk(start_chunk_id).await?;
+                        let (_, _, table, _) = metastore
+                            .get_partition_for_compaction(chunk.get_row().get_partition_id())
+                            .await?;
+                        let table_id = table.get_id();
+                        let trace_obj = metastore.get_trace_obj_by_table_id(table_id).await?;
+                        let trace_index = TraceIndex {
+                            table_id: Some(table_id),
+                            trace_obj,
+                        };
+                        match job_processor.process_job(job_to_move).await {
+                            Ok(job_res) => {
+                                process_rate_limiter
+                                    .commit_task_usage(
+                                        TaskType::Job,
+                                        job_res.data_loaded_size() as i64,
+                                        wait_ms,
+                                        trace_index,
+                                    )
+                                    .await;
+                                Ok(())
+                            }
+                            Err(e) => Err(e),
+                        }
+                    }))
+                } else {
+                    Self::fail_job_row_key(job)
+                }
+            }
             // Defense-in-depth: start_processing_job never selects an Unknown job, so
             // this arm is not a live path — it just guarantees we never panic on one.
             JobType::Unknown => Err(CubeError::internal(format!(
 
@@ -26,7 +26,7 @@ use crate::cluster::transport::{ClusterTransport, MetaStoreTransport, WorkerConn
 use crate::config::injection::{DIService, Injector};
 use crate::config::is_router;
 #[allow(unused_imports)]
-use crate::config::{Config, ConfigObj};
+use crate::config::{Config, ConfigObj, RepartitionStrategy};
 use crate::metastore::chunks::chunk_file_name;
 use crate::metastore::job::{Job, JobRunnerPool, JobStatus, JobType};
 use crate::metastore::{
@@ -945,17 +945,71 @@ impl Cluster for ClusterImpl {
             .filter(|c| !c.get_row().in_memory())
             .collect::<Vec<_>>();
 
-        if self.config_obj.batch_repartition_enabled() {
-            // FIXME: one job per partition that batches all persisted chunks, but
-            // keyed on a chunk (RepartitionChunk), not the partition. We reuse the
-            // existing job type instead of a dedicated per-partition JobType so an
-            // older binary stays able to deserialize it across `latest`/`release`
-            // channel switches (a new variant would make its whole job shard
-            // unreadable). The anchor is the smallest persisted chunk id so add_job
-            // dedups to a single job per partition; the worker resolves the
-            // partition from it and processes the anchor last (see
-            // repartition_partition_chunks). An old binary just repartitions this
-            // one chunk and drains the rest via its own per-chunk path.
+        if self.config_obj.repartition_strategy() == RepartitionStrategy::Range {
+            // Slice the parent's persisted chunks into RepartitionRange jobs. Walk ALL
+            // chunks (active and inactive) sorted by id so the [start, end] boundaries
+            // stay pinned to chunk ids and don't shift when chunks deactivate; cut a
+            // range once its rows reach max_rows or its chunk count reaches the fan-in
+            // cap, so a range never merges an unbounded number of chunks at once. A
+            // range is only scheduled when it still has an active chunk; the end is
+            // carried as the job's data, not its dedup key, so a tail that extends the
+            // trailing range dedups on the start.
+            // Clamp to >= 1 so a misconfigured 0 cap doesn't break the inner loop before
+            // adding any chunk. Even 1 degrades to one chunk per range (no merge gain);
+            // the sane range is >= 2.
+            let max_rows = self.config_obj.repartition_merge_max_rows().max(1);
+            let max_files = self.config_obj.repartition_merge_max_input_files().max(1);
+            let mut all = self
+                .meta_store
+                .get_chunks_by_partition(p.get_id(), true)
+                .await?
+                .into_iter()
+                .filter(|c| !c.get_row().in_memory())
+                .collect::<Vec<_>>();
+            all.sort_by_key(|c| c.get_id());
+
+            let mut i = 0;
+            while i < all.len() {
+                let start = all[i].get_id();
+                let mut rows = 0u64;
+                let mut count = 0usize;
+                let mut end = start;
+                let mut has_active = false;
+                while i < all.len() {
+                    let c = &all[i];
+                    rows += c.get_row().get_row_count();
+                    count += 1;
+                    end = c.get_id();
+                    has_active |= c.get_row().active();
+                    i += 1;
+                    if rows >= max_rows || count >= max_files {
+                        break;
+                    }
+                }
+                if has_active {
+                    let node =
+                        pick_worker_by_ids(self.config_obj.as_ref(), [start, end]).to_string();
+                    let job = self
+                        .meta_store
+                        .add_job(Job::new(
+                            RowKey::Table(TableId::Chunks, start),
+                            JobType::RepartitionRange(end),
+                            node.clone(),
+                        ))
+                        .await?;
+                    if job.is_some() {
+                        self.notify_job_runner(node).await?;
+                    }
+                }
+            }
+        } else if self.config_obj.repartition_strategy() == RepartitionStrategy::PerPartition {
+            // One job per partition, keyed on a chunk (RepartitionChunk) rather than a
+            // dedicated per-partition JobType so an older binary can still deserialize it
+            // across `latest`/`release` channel switches. The anchor is the smallest
+            // persisted chunk id so add_job dedups to a single job per partition; the
+            // worker resolves the partition from it and merges all its chunks (see
+            // repartition_partition_chunks). An old binary just repartitions this one
+            // chunk and drains the rest via its own per-chunk path.
             if let Some(anchor_chunk_id) = chunks.iter().map(|c| c.get_id()).min() {
                 let node = self.node_name_by_partition(p);
                 let job = self