forked from apache/datafusion-comet
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcomet_partitioning.rs
More file actions
71 lines (64 loc) · 2.87 KB
/
comet_partitioning.rs
File metadata and controls
71 lines (64 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use arrow::row::{OwnedRow, RowConverter};
use datafusion::physical_expr::{LexOrdering, PhysicalExpr};
use std::sync::Arc;
/// Describes how rows are assigned to output partitions.
///
/// Every variant other than `SinglePartition` carries its partition count as a `usize`
/// field, which `partition_count` returns.
#[derive(Debug, Clone)]
pub enum CometPartitioning {
/// A single output partition; `partition_count` reports 1 for this variant.
SinglePartition,
/// Allocate rows based on a hash of one or more expressions and the specified number of
/// partitions. Args are 1) the expressions to hash on, and 2) the number of partitions.
Hash(Vec<Arc<dyn PhysicalExpr>>, usize),
/// Allocate rows based on the lexical order of one or more expressions and the specified number of
/// partitions. Args are 1) the LexOrdering to use to compare values and split into partitions,
/// 2) the number of partitions, 3) the RowConverter used to view incoming RecordBatches as Arrow
/// Rows for comparing to 4) OwnedRows that represent the boundaries of each partition, used with
/// LexOrdering to bin each value in the RecordBatch to a partition.
RangePartitioning(LexOrdering, usize, Arc<RowConverter>, Vec<OwnedRow>),
/// Round robin partitioning. Distributes rows across partitions by sorting them by hash
/// (computed from columns) and then assigning partitions sequentially. Args are:
/// 1) number of partitions, 2) max columns to hash (0 means no limit).
RoundRobin(usize, usize),
}
impl CometPartitioning {
pub fn partition_count(&self) -> usize {
use CometPartitioning::*;
match self {
SinglePartition => 1,
Hash(_, n) | RangePartitioning(_, n, _, _) | RoundRobin(n, _) => *n,
}
}
}
/// Computes the non-negative remainder ("positive modulo") of `hash` with respect to `n`.
///
/// The `u32` hash is reinterpreted bit-for-bit as a signed 32-bit integer before the
/// remainder is taken, and a negative remainder is shifted into `0..n`, so the result
/// can be used directly as a partition index. The unit test in this file checks the
/// output against partition ids produced by Spark for `n = 200`.
///
/// `n` is narrowed to `i32` for the computation, so callers are expected to pass a
/// partition count in `1..=i32::MAX`.
///
/// # Panics
///
/// Panics if `n` is zero once narrowed to `i32` (in particular when `n == 0`).
pub(super) fn pmod(hash: u32, n: usize) -> usize {
    // Reinterpret the hash bits as signed: hashes with the top bit set become negative.
    let hash = hash as i32;
    let n = n as i32;
    // For a positive `n`, `rem_euclid` always lands in `0..n`, so the cast is lossless.
    hash.rem_euclid(n) as usize
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Verifies `pmod` against partition ids produced by Spark for 200 partitions.
    #[test]
    fn test_pmod() {
        const NUM_PARTITIONS: usize = 200;
        // Each pair is (hash, expected partition from Spark with n=200).
        let cases: [(u32, usize); 5] = [
            (0x99f0149d, 69),
            (0x9c67b85d, 5),
            (0xc8008529, 193),
            (0xa05b5d7b, 171),
            (0xcd1e64fb, 115),
        ];
        let actual: Vec<usize> = cases
            .iter()
            .map(|&(hash, _)| pmod(hash, NUM_PARTITIONS))
            .collect();
        let wanted: Vec<usize> = cases.iter().map(|&(_, partition)| partition).collect();
        assert_eq!(actual, wanted);
    }
}