Skip to content

Commit e9355d3

Browse files
committed
Merge branch 'literal-test-of-specification'
2 parents e1de29f + 7cbb72d commit e9355d3

11 files changed

Lines changed: 609 additions & 244 deletions

File tree

README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ The tests are thorough.
2828
* `original.py` was written during development to test all features as they were added.
2929
* `autogenerated.py` is from the language-independent testing suite ([histogrammar-multilang](https://github.com/histogrammar/histogrammar-multilang)), which provides greater coverage, value-explicitness in the test script, and cross-language agreement.
3030
* `testnumpy.py` tests numerical agreement between the conventional implementation and the Numpy implementation, which are very different. Also tests much larger datasets and infinity/NaN handling.
31+
* Contrary to its name, `testnumpy.py` also compares its implementation with the literal code given in [the specification](http://histogrammar.org/docs/specification/).
3132

3233
Primitive implementation is mature. Notes in the "Numpy" column are rough speedup factors for a range of tests.
3334

@@ -37,23 +38,19 @@ Primitive implementation is mature. Notes in the "Numpy" column are rough speedu
3738
| Sum | done | 40-100X |
3839
| Average | done | 40-100X |
3940
| Deviate | done | 40-80X |
40-
| AbsoluteErr | done | 40-100X |
4141
| Minimize | done | 50-150X |
4242
| Maximize | done | 50-150X |
43-
| Quantile | done | 1-5X |
43+
| Bag | done | 1.5-2X |
4444
| Bin | done | 5-25X (100 bins) |
4545
| SparselyBin | done | 4-5X (~100 bins) |
4646
| CentrallyBin | done | 25-40X (10 bins) |
47-
| AdaptivelyBin | done | not possible |
47+
| IrregularlyBin | done | 1-4X (10 plots) |
4848
| Categorize | done | 1.5X |
4949
| Fraction | done | 4-20X (100 bins) |
5050
| Stack | done | 2-12X (10 plots) |
51-
| Partition | done | 1-4X (10 plots) |
5251
| Select | done | 4-20X (100 bins) |
5352
| Limit | done | pass-through |
5453
| Label | done | pass-through |
5554
| UntypedLabel | done | pass-through |
5655
| Index | done | pass-through |
5756
| Branch | done | pass-through |
58-
| Bag | done | 1.5-2X |
59-
| Sample | done | 1-1.5X |

histogrammar/primitives/categorize.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class Categorize(Factory, Container):
2929
"""
3030

3131
@staticmethod
32-
def ed(entries, contentType, **pairs):
32+
def ed(entries, contentType, pairsAsDict=None, **pairs):
3333
"""Create a Categorize that is only capable of being added.
3434
3535
Parameters:
@@ -48,7 +48,11 @@ def ed(entries, contentType, **pairs):
4848

4949
out = Categorize(None, None)
5050
out.entries = float(entries)
51-
out.pairs = pairs
51+
if pairsAsDict is None:
52+
out.pairs = {}
53+
else:
54+
out.pairs = pairsAsDict
55+
out.pairs.update(pairs)
5256
out.contentType = contentType
5357
return out.specialize()
5458

@@ -74,6 +78,8 @@ def __init__(self, quantity, value=Count()):
7478
self.quantity = serializable(quantity)
7579
self.value = value
7680
self.pairs = {}
81+
if value is not None:
82+
self.contentType = str(value.factory.name)
7783
super(Categorize, self).__init__()
7884
self.specialize()
7985

histogrammar/primitives/centrallybin.py

Lines changed: 5 additions & 210 deletions
Original file line numberDiff line numberDiff line change
@@ -26,38 +26,28 @@ class CentrallyBin(Factory, Container):
2626
"""Split a quantity into bins defined by irregularly spaced bin centers, with exactly one sub-aggregator filled per datum (the closest one).
2727
2828
Unlike irregular bins defined by explicit ranges, irregular bins defined by bin centers are guaranteed to fully partition the space with no gaps and no overlaps. It could be viewed as cluster scoring in one dimension.
29-
30-
The first and last bins cover semi-infinite domains, so it is unclear how to interpret them as part of the probability density function (PDF). Finite-width bins approximate the PDF in piecewise steps, but the first and last bins could be taken as zero (an underestimate) or as uniform from the most extreme point to the inner bin edge (an overestimate, but one that is compensated by underestimating the region just beyond the extreme point). For the sake of the latter interpretation, the minimum and maximum values are accumulated along with the bin values.
3129
"""
3230

3331
@staticmethod
34-
def ed(entries, bins, min, max, nanflow):
32+
def ed(entries, bins, nanflow):
3533
"""Create a CentrallyBin that is only capable of being added.
3634
3735
Parameters:
3836
entries (float): the number of entries.
3937
bins (list of float, :doc:`Container <histogrammar.defs.Container>` pairs): the list of bin centers and their accumulated data.
40-
min (float): the lowest value of the quantity observed or NaN if no data were observed.
41-
max (float): the highest value of the quantity observed or NaN if no data were observed.
4238
nanflow (:doc:`Container <histogrammar.defs.Container>`): the filled nanflow bin.
4339
"""
4440
if not isinstance(entries, numbers.Real) and entries not in ("nan", "inf", "-inf"):
4541
raise TypeError("entries ({0}) must be a number".format(entries))
4642
if not isinstance(bins, (list, tuple)) and not all(isinstance(v, (list, tuple)) and len(v) == 2 and isinstance(v[0], numbers.Real) and isinstance(v[1], Container) for v in bins):
4743
raise TypeError("bins ({0}) must be a list of number, Container pairs".format(bins))
48-
if not isinstance(min, numbers.Real) and entries not in ("nan", "inf", "-inf"):
49-
raise TypeError("min ({0}) must be a number".format(min))
50-
if not isinstance(max, numbers.Real) and entries not in ("nan", "inf", "-inf"):
51-
raise TypeError("max ({0}) must be a number".format(max))
5244
if not isinstance(nanflow, Container):
5345
raise TypeError("nanflow ({0}) must be a Container".format(nanflow))
5446
if entries < 0.0:
5547
raise ValueError("entries ({0}) cannot be negative".format(entries))
5648
out = CentrallyBin(bins, None, None, nanflow)
5749
out.entries = float(entries)
5850
out.bins = bins
59-
out.min = float(min)
60-
out.max = float(max)
6151
return out.specialize()
6252

6353
@staticmethod
@@ -77,8 +67,6 @@ def __init__(self, bins, quantity, value=Count(), nanflow=Count()):
7767
Other parameters:
7868
entries (float): the number of entries, initially 0.0.
7969
bins (list of float, :doc:`Container <histogrammar.defs.Container>` pairs): the bin centers and sub-aggregators in each bin.
80-
min (float): the lowest value of the quantity observed, initially NaN.
81-
max (float): the highest value of the quantity observed, initially NaN.
8270
"""
8371

8472
if not isinstance(bins, (list, tuple)) and not all(isinstance(v, (list, tuple)) and len(v) == 2 and isinstance(v[0], numbers.Real) and isinstance(v[1], Container) for v in bins):
@@ -95,8 +83,6 @@ def __init__(self, bins, quantity, value=Count(), nanflow=Count()):
9583
self.bins = None
9684
else:
9785
self.bins = [(x, value.zero()) for x in sorted(bins)]
98-
self.min = float("nan")
99-
self.max = float("nan")
10086

10187
self.quantity = serializable(quantity)
10288
self.value = value
@@ -111,8 +97,6 @@ def histogram(self):
11197
out.entries = self.entries
11298
for i, v in self.bins:
11399
out.bins[i] = Count.ed(v.entries)
114-
out.min = self.min
115-
out.max = self.max
116100
return out.specialize()
117101

118102
@property
@@ -174,160 +158,6 @@ def range(self, center):
174158
else:
175159
return (below + center)/2.0, (above + center)/2.0
176160

177-
def pdf(self, *xs):
178-
"""Probability distribution function (PDF) of one sample point.
179-
180-
Computed as the ``entries`` of the corresponding bin divided by total number of entries divided by bin width.
181-
"""
182-
if len(xs) == 0:
183-
return self.pdfTimesEntries(xs[0]) / self.entries
184-
else:
185-
return [x / self.entries for x in self.pdfTimesEntries(*xs)]
186-
187-
def cdf(self, *xs):
188-
"""Cumulative distribution function (CDF, or "accumulation function") of one sample point.
189-
190-
Computed by adding bin contents from minus infinity to the point in question. This is a continuous, piecewise linear function.
191-
"""
192-
if len(xs) == 0:
193-
return self.cdfTimesEntries(xs[0]) / self.entries
194-
else:
195-
return [x / self.entries for x in self.cdfTimesEntries(*xs)]
196-
197-
def qf(self, *xs):
198-
"""Quantile function (QF, or "inverse of the accumulation function") of one sample point.
199-
200-
Computed like the CDF, but solving for the point in question, rather than integrating up to it. This is a continuous, piecewise linear function.
201-
"""
202-
if len(xs) == 0:
203-
return self.qfTimesEntries(xs[0]) * self.entries
204-
else:
205-
return [x * self.entries for x in self.qfTimesEntries(*xs)]
206-
207-
def pdfTimesEntries(self, x, *xs):
208-
"""PDF without the non-unity number of entries removed (no division by zero when ``entries`` is zero)."""
209-
210-
xs = [x] + list(xs)
211-
212-
if len(self.bins) == 0 or math.isnan(self.min) or math.isnan(self.max):
213-
out = [0.0] * len(xs)
214-
215-
elif len(self.bins) == 1:
216-
out = [float("inf") if x == self.bins[0][0] else 0.0 for x in xs]
217-
218-
else:
219-
out = [0.0] * len(xs)
220-
221-
left = self.min
222-
for i in xrange(len(self.bins)):
223-
if i < len(self.bins) - 1:
224-
right = (self.bins[i][0] + self.bins[i + 1][0]) / 2.0
225-
else:
226-
right = self.max
227-
228-
entries = self.bins[i][1].entries
229-
230-
for j, x in enumerate(xs):
231-
if left <= x and x < right:
232-
out[j] = entries / (right - left)
233-
234-
left = right
235-
236-
if len(xs) == 1:
237-
return out[0]
238-
else:
239-
return out
240-
241-
def cdfTimesEntries(self, x, *xs):
242-
"""CDF without the non-unity number of entries removed (no division by zero when ``entries`` is zero)."""
243-
244-
xs = [x] + list(xs)
245-
246-
if len(self.bins) == 0 or math.isnan(self.min) or math.isnan(self.max):
247-
out = [0.0] * len(xs)
248-
249-
elif len(self.bins) == 1:
250-
out = []
251-
for x in xs:
252-
if x < self.bins[0][0]:
253-
out.append(0.0)
254-
elif x == self.bins[0][0]:
255-
out.append(self.bins[0][1].entries / 2.0)
256-
else:
257-
out.append(self.bins[0][1].entries)
258-
259-
else:
260-
out = [0.0] * len(xs)
261-
262-
left = self.min
263-
cumulative = 0.0
264-
for i in xrange(len(self.bins)):
265-
if i < len(self.bins) - 1:
266-
right = (self.bins[i][0] + self.bins[i + 1][0]) / 2.0
267-
else:
268-
right = self.max
269-
270-
entries = self.bins[i][1].entries
271-
272-
for j, x in enumerate(xs):
273-
if left <= x and x < right:
274-
out[j] = cumulative + entries * (x - left)/(right - left)
275-
276-
left = right
277-
cumulative += entries
278-
279-
for j, x in enumerate(xs):
280-
if x >= self.max:
281-
out[j] = cumulative
282-
283-
if len(xs) == 1:
284-
return out[0]
285-
else:
286-
return out
287-
288-
def qfTimesEntries(self, y, *ys):
289-
"""QF without the non-unity number of entries removed (no division by zero when ``entries`` is zero)."""
290-
291-
ys = [y] + list(ys)
292-
293-
if len(self.bins) == 0 or math.isnan(self.min) or math.isnan(self.max):
294-
out = [float("nan")] * len(ys)
295-
296-
elif len(self.bins) == 1:
297-
out = [self.bins[0][0]] * len(ys)
298-
299-
else:
300-
out = [self.min] * len(ys)
301-
302-
left = self.min
303-
cumulative = 0.0
304-
for i in xrange(len(self.bins)):
305-
if i < len(self.bins) - 1:
306-
right = (self.bins[i][0] + self.bins[i + 1][0]) / 2.0
307-
else:
308-
right = self.max
309-
310-
entries = self.bins[i][1].entries
311-
312-
low = cumulative
313-
high = cumulative + entries
314-
315-
for j, y in enumerate(ys):
316-
if low <= y and y < high:
317-
out[j] = left + (right - left)*(y - low)/(high - low)
318-
319-
left = right
320-
cumulative += entries
321-
322-
for j, y in enumerate(ys):
323-
if y >= cumulative:
324-
out[j] = self.max
325-
326-
if len(ys) == 1:
327-
return out[0]
328-
else:
329-
return out
330-
331161
@inheritdoc(Container)
332162
def zero(self):
333163
return CentrallyBin([c for c, v in self.bins], self.quantity, self.value, self.nanflow.zero())
@@ -342,8 +172,6 @@ def __add__(self, other):
342172
out = CentrallyBin([c for c, v in self.bins], self.quantity, self.value, self.nanflow + other.nanflow)
343173
out.entries = self.entries + other.entries
344174
out.bins = newbins
345-
out.min = minplus(self.min, other.min)
346-
out.max = maxplus(self.max, other.max)
347175
return out.specialize()
348176

349177
@inheritdoc(Container)
@@ -362,10 +190,6 @@ def fill(self, datum, weight=1.0):
362190

363191
# no possibility of exception from here on out (for rollback)
364192
self.entries += weight
365-
if math.isnan(self.min) or q < self.min:
366-
self.min = q
367-
if math.isnan(self.max) or q > self.max:
368-
self.max = q
369193

370194
def _numpy(self, data, weights, shape):
371195
q = self.quantity(data)
@@ -421,23 +245,6 @@ def _numpy(self, data, weights, shape):
421245
self.bins[index][1]._numpy(data, subweights, shape)
422246

423247
# no possibility of exception from here on out (for rollback)
424-
425-
q = q[weights > 0.0]
426-
427-
if math.isnan(self.min):
428-
if q.shape[0] > 0:
429-
self.min = float(q.min())
430-
else:
431-
if q.shape[0] > 0:
432-
self.min = min(self.min, float(q.min()))
433-
434-
if math.isnan(self.max):
435-
if q.shape[0] > 0:
436-
self.max = float(q.max())
437-
else:
438-
if q.shape[0] > 0:
439-
self.max = max(self.max, float(q.max()))
440-
441248
self.entries += float(newentries)
442249

443250
@property
@@ -458,8 +265,6 @@ def toJsonFragment(self, suppressName):
458265
"entries": floatToJson(self.entries),
459266
"bins:type": self.bins[0][1].name,
460267
"bins": [{"center": floatToJson(c), "value": v.toJsonFragment(True)} for c, v in self.bins],
461-
"min": floatToJson(self.min),
462-
"max": floatToJson(self.max),
463268
"nanflow:type": self.nanflow.name,
464269
"nanflow": self.nanflow.toJsonFragment(False),
465270
}, **{"name": None if suppressName else self.quantity.name,
@@ -468,7 +273,7 @@ def toJsonFragment(self, suppressName):
468273
@staticmethod
469274
@inheritdoc(Factory)
470275
def fromJsonFragment(json, nameFromParent):
471-
if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "bins:type", "bins", "min", "max", "nanflow:type", "nanflow"], ["name", "bins:name"]):
276+
if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "bins:type", "bins", "nanflow:type", "nanflow"], ["name", "bins:name"]):
472277
if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
473278
entries = float(json["entries"])
474279
else:
@@ -505,23 +310,13 @@ def fromJsonFragment(json, nameFromParent):
505310
else:
506311
raise JsonFormatException(binpair, "CentrallyBin.bins {0}".format(i))
507312

508-
if json["min"] in ("nan", "inf", "-inf") or isinstance(json["min"], numbers.Real):
509-
min = float(json["min"])
510-
else:
511-
raise JsonFormatException(json, "CentrallyBin.min")
512-
513-
if json["max"] in ("nan", "inf", "-inf") or isinstance(json["max"], numbers.Real):
514-
max = float(json["max"])
515-
else:
516-
raise JsonFormatException(json, "CentrallyBin.max")
517-
518313
if isinstance(json["nanflow:type"], basestring):
519314
nanflowFactory = Factory.registered[json["nanflow:type"]]
520315
else:
521316
raise JsonFormatException(json, "CentrallyBin.nanflow:type")
522317
nanflow = nanflowFactory.fromJsonFragment(json["nanflow"], None)
523318

524-
out = CentrallyBin.ed(entries, bins, min, max, nanflow)
319+
out = CentrallyBin.ed(entries, bins, nanflow)
525320
out.quantity.name = nameFromParent if name is None else name
526321
return out.specialize()
527322

@@ -532,11 +327,11 @@ def __repr__(self):
532327
return "<CentrallyBin bins={0} size={1} nanflow={2}>".format(self.bins[0][1].name, len(self.bins), self.nanflow.name)
533328

534329
def __eq__(self, other):
535-
return isinstance(other, CentrallyBin) and self.quantity == other.quantity and numeq(self.entries, other.entries) and self.bins == other.bins and numeq(self.min, other.min) and numeq(self.max, other.max) and self.nanflow == other.nanflow
330+
return isinstance(other, CentrallyBin) and self.quantity == other.quantity and numeq(self.entries, other.entries) and self.bins == other.bins and self.nanflow == other.nanflow
536331

537332
def __ne__(self, other): return not self == other
538333

539334
def __hash__(self):
540-
return hash((self.quantity, self.entries, tuple(self.bins), self.min, self.max, self.nanflow))
335+
return hash((self.quantity, self.entries, tuple(self.bins), self.nanflow))
541336

542337
Factory.register(CentrallyBin)

0 commit comments

Comments
 (0)