-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathplot_digitize.py
More file actions
120 lines (98 loc) · 3.61 KB
/
plot_digitize.py
File metadata and controls
120 lines (98 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
.. _l-example-digitize:
========================
numpy.digitize as a tree
========================
.. index:: digitize, decision tree, onnx, onnxruntime
Function :epkg:`numpy:digitize` transforms a real variable
into a discrete one by returning the index of the bucket each
value falls into. This bucket can be efficiently retrieved by doing a
binary search over the bins, which is equivalent to walking down a
decision tree. Function :func:`digitize2tree
<mlinsights.mltree.tree_digitize.digitize2tree>` builds that tree.
.. contents::
:local:
Simple example
==============
"""
import warnings
import numpy
from pandas import DataFrame, pivot_table
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from sklearn.tree import export_text
from skl2onnx import to_onnx
from cpyquickhelper.numbers.speed_measure import measure_time
from mlinsights.mltree import digitize2tree
from tqdm import tqdm
# Increasing bin edges and a few sample values to discretize.
bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
x = numpy.array([0.2, 6.4, 3.0, 1.6])
# ``predict`` is given the samples as a single-column 2D array.
x_2d = x.reshape((-1, 1))

# Reference answer from numpy, then the same bins expressed as a tree.
expected = numpy.digitize(x, bins, right=True)
tree = digitize2tree(bins, right=True)
pred = tree.predict(x_2d)
print(expected, pred)

##########################################
# The tree looks like the following.

print(export_text(tree, feature_names=['x']))
#######################################
# Benchmark
# =========
#
# Let's measure the processing time. *numpy* should be
# much faster than *scikit-learn*, as the latter adds many verifications.
# However, the benchmark also includes a conversion of the tree into
# ONNX and measures the processing time with :epkg:`onnxruntime`.
obs = []  # one timing record per (input size, bin count, backend)
for shape in tqdm([1, 10, 100, 1000, 10000, 100000]):
    x = numpy.random.random(shape).astype(numpy.float32)
    # The tree and the ONNX session are both fed a single-column 2D view
    # of x; it does not depend on the bins, so it is built once per size.
    x_2d = x.reshape((-1, 1))
    # Small inputs are timed with more repetitions than large ones.
    if shape < 1000:
        repeat = number = 100
    else:
        repeat = number = 10

    for n_bins in [1, 10, 100]:
        bins = (numpy.arange(n_bins) / n_bins).astype(numpy.float32)

        # Backend 1: numpy.digitize applied to the flat vector.
        ti = measure_time(
            "numpy.digitize(x, bins, right=True)",
            context={'numpy': numpy, "x": x, "bins": bins},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'numpy'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)

        # Backend 2: the same bins turned into a tree, queried with predict.
        tree = digitize2tree(bins, right=True)
        ti = measure_time(
            "tree.predict(x)",
            context={'numpy': numpy, "x": x_2d, "tree": tree},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'sklearn'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)

        # Backend 3: the tree converted to ONNX and run with onnxruntime.
        # FutureWarning is silenced during the conversion only.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            onx = to_onnx(tree, x_2d)
        # The execution provider is named explicitly: recent onnxruntime
        # releases refuse to create a session without it when the build
        # ships more than the CPU provider, and pinning the CPU keeps the
        # comparison with numpy and scikit-learn on the same device.
        sess = InferenceSession(
            onx.SerializeToString(),
            providers=["CPUExecutionProvider"])
        ti = measure_time(
            "sess.run(None, {'X': x})",
            context={'numpy': numpy, "x": x_2d, "sess": sess},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'ort'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)
# Gather every timing record into a table, then lay out the ``average``
# field with one row per input size and one column per (n_bins, name) pair.
df = DataFrame(obs)
piv = pivot_table(
    df,
    values=["average"],
    index="shape",
    columns=["n_bins", "name"],
)
print(piv)
##########################################
# Plotting
# ========
n_bins = sorted(set(df.n_bins))  # distinct bin counts, ascending
fig, ax = plt.subplots(1, len(n_bins), figsize=(14, 4))
# One panel per bin count: each panel shows, on log-log axes, the
# ``average`` field against the input size for every backend name.
for panel, nb in zip(ax, n_bins):
    subset = df[df.n_bins == nb]
    piv = pivot_table(
        data=subset,
        index="shape",
        columns=["name"],
        values=["average"],
    )
    piv.plot(
        title="Benchmark digitize / onnxruntime\nn_bins=%d" % nb,
        logx=True,
        logy=True,
        ax=panel,
    )
plt.show()