Skip to content

Commit c3733d4

Browse files
authored
feat: Histogram should honour "buckets" option (#4485)
Signed-off-by: Gordon Smith <GordonJSmith@gmail.com>
1 parent c8ef69b commit c3733d4

2 files changed

Lines changed: 100 additions & 2 deletions

File tree

packages/dataflow/src/activities/histogram.ts

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,11 @@ import { scalar } from "../observers/observer.ts";
55
export type HistogramFn<T> = (row: T) => number;
66
export type HistogramRow<T> = { from: number, to: number, value: T[] };
77
export type OptionA = { buckets: number };
8+
/**
9+
* Histogram options specifying the minimum value and bucket range (size).
10+
* @property min - The minimum value for the first bucket's lower bound
11+
* @property range - The size/width of each bucket (distance between bucket boundaries)
12+
*/
813
export type OptionB = { min: number, range: number };
914
export type Options = OptionA | OptionB;
1015

@@ -21,8 +26,17 @@ function histogramGen<T = any>(callbackFn: HistogramFn<T>, options: Options): It
2126
if (isOptionA(options)) {
2227
source = Array.isArray(_source) ? _source : [..._source];
2328
const minMax = scalar(extent(callbackFn))(source);
24-
if (minMax === undefined) {
25-
return undefined;
29+
if (minMax === undefined || minMax[0] === undefined || minMax[1] === undefined) {
30+
// For empty sources with buckets option, generate empty buckets with NaN bounds
31+
const buckets = options.buckets;
32+
for (let i = 0; i < buckets; ++i) {
33+
yield {
34+
from: NaN,
35+
to: NaN,
36+
value: []
37+
};
38+
}
39+
return;
2640
}
2741
min = minMax[0];
2842
const max = minMax[1];
@@ -37,7 +51,9 @@ function histogramGen<T = any>(callbackFn: HistogramFn<T>, options: Options): It
3751
const histogram: { [key: number]: T[] } = {};
3852

3953
let maxBucketID = 0;
54+
let hasData = false;
4055
for (const row of source) {
56+
hasData = true;
4157
const value = callbackFn(row);
4258
const bucketID = Math.floor((value - min) / bucketSize);
4359
if (maxBucketID < bucketID) {
@@ -49,6 +65,11 @@ function histogramGen<T = any>(callbackFn: HistogramFn<T>, options: Options): It
4965
histogram[bucketID].push(row);
5066
}
5167

68+
// No rows were consumed: with OptionB (min/range) an empty source yields no buckets at all
69+
if (!hasData) {
70+
return;
71+
}
72+
5273
const lastBucket = histogram[maxBucketID];
5374
const from = min + maxBucketID * bucketSize;
5475

packages/dataflow/tests/histogram.spec.ts

Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,84 @@ describe("histogram", () => {
4141
const data = [1, 12, 13, 13, 3, 14, 19, 6];
4242
const h = [...histogram(data, n => n, { buckets: 3 })];
4343
expect(h).to.have.length;
44+
expect(h.length).to.equal(3);
4445
const h2 = [...histogram(data, n => n, { min: 0, range: 5 })];
4546
expect(h2).to.have.length;
47+
expect(h2.length).to.equal(4);
48+
});
49+
50+
51+
it("sparse buckets", () => {
52+
const data = [1, 6, 13, 13, 3, 14, 19, 4];
53+
const h = [...histogram(data, n => n, { buckets: 3 })];
54+
expect(h).to.have.length;
55+
expect(h.length).to.equal(3);
56+
for (let i = 0; i < h.length; i++) {
57+
switch (i) {
58+
case 0:
59+
expect(h[i].from).to.equal(1);
60+
expect(h[i].to).to.equal(7);
61+
expect(h[i].value).to.deep.equal([1, 6, 3, 4]);
62+
break;
63+
case 1:
64+
expect(h[i].from).to.equal(7);
65+
expect(h[i].to).to.equal(13);
66+
expect(h[i].value).to.deep.equal([]);
67+
break;
68+
case 2:
69+
expect(h[i].from).to.be.equal(13);
70+
expect(h[i].to).to.be.equal(19);
71+
expect(h[i].value).to.deep.equal([13, 13, 14, 19]);
72+
break;
73+
}
74+
}
75+
const data2 = [1, 13, 12, 13, 3, 14, 19, 4];
76+
const h2 = [...histogram(data2, n => n, { min: 0, range: 5 })];
77+
expect(h2).to.have.length;
78+
expect(h2.length).to.equal(4);
79+
for (let i = 0; i < h2.length; i++) {
80+
switch (i) {
81+
case 0:
82+
expect(h2[i].from).to.equal(0);
83+
expect(h2[i].to).to.equal(5);
84+
expect(h2[i].value).to.deep.equal([1, 3, 4]);
85+
break;
86+
case 1:
87+
expect(h2[i].from).to.equal(5);
88+
expect(h2[i].to).to.equal(10);
89+
expect(h2[i].value).to.deep.equal([]);
90+
break;
91+
case 2:
92+
expect(h2[i].from).to.be.equal(10);
93+
expect(h2[i].to).to.be.equal(15);
94+
expect(h2[i].value).to.deep.equal([13, 12, 13, 14]);
95+
break;
96+
case 3:
97+
expect(h2[i].from).to.be.equal(15);
98+
expect(h2[i].to).to.be.equal(20);
99+
expect(h2[i].value).to.deep.equal([19]);
100+
break;
101+
}
102+
}
103+
});
104+
105+
it("empty source with buckets", () => {
106+
const h = [...histogram<number>(n => n, { buckets: 10 })([])];
107+
expect(h.length).to.equal(10);
108+
for (const b of h) {
109+
expect(b.from).to.be.NaN;
110+
expect(b.to).to.be.NaN;
111+
expect(b.value).to.deep.equal([]);
112+
}
113+
});
114+
115+
it("empty source with buckets 2", () => {
116+
const data: number[] = [];
117+
const h = [...histogram(data, n => n, { buckets: 3 })];
118+
expect(h).to.have.length;
119+
expect(h.length).to.equal(3);
120+
const h2 = [...histogram(data, n => n, { min: 0, range: 5 })];
121+
expect(h2).to.have.length;
122+
expect(h2.length).to.equal(0);
46123
});
47124
});

0 commit comments

Comments
 (0)