Skip to content

Commit 42ed1ba

Browse files
authored
Merge pull request #328 from githubnext/autoloop/perf-comparison
[Autoloop: perf-comparison]
2 parents 7272e5f + 32be405 commit 42ed1ba

98 files changed

Lines changed: 4311 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

benchmarks/pandas/bench_at_iat.py

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access.

Run as a script to print one JSON object with the keys ``function``,
``mean_ms``, ``iterations`` and ``total_ms``.
"""
import json
import time

import pandas as pd

N = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(n=N, warmup=WARMUP, iterations=ITERATIONS):
    """Time scalar lookups of the middle row of an ``n``-row Series and DataFrame.

    Every round performs four lookups: ``Series.at``, ``Series.iat``,
    ``DataFrame.at`` and ``DataFrame.iat``.

    Args:
        n: Number of rows; must be at least 1 so the middle label exists.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``n`` or ``iterations`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    labels = [f"r{i}" for i in range(n)]
    values = [i * 1.5 for i in range(n)]

    s = pd.Series(values, index=labels)
    df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels)

    mid_pos = n // 2
    mid_label = f"r{mid_pos}"

    for _ in range(warmup):
        _ = s.at[mid_label]
        _ = s.iat[mid_pos]
        _ = df.at[mid_label, "a"]
        _ = df.iat[mid_pos, 0]

    start = time.perf_counter()
    for _ in range(iterations):
        _ = s.at[mid_label]
        _ = s.iat[mid_pos]
        _ = df.at[mid_label, "a"]
        _ = df.iat[mid_pos, 0]
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "at_iat",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
"""
Benchmark: Series.autocorr(lag) — lag-N autocorrelation for a 100k-element numeric Series.

Mirrors tsb autoCorr.
Benchmarks lag=1, lag=5, and lag=20.
Outputs JSON: {"function": "autocorr", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import math
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50
LAGS = (1, 5, 20)


def run_benchmark(size=SIZE, warmup=WARMUP, iterations=ITERATIONS, lags=LAGS):
    """Time ``Series.autocorr`` for each lag in ``lags`` on a ``size``-element Series.

    Args:
        size: Number of elements in the generated Series.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.
        lags: Lags evaluated, in order, during every round.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    data = [math.sin(i * 0.05) + (i % 7) * 0.01 for i in range(size)]
    s = pd.Series(data)

    for _ in range(warmup):
        for lag in lags:
            s.autocorr(lag=lag)

    start = time.perf_counter()
    for _ in range(iterations):
        for lag in lags:
            s.autocorr(lag=lag)
    total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "autocorr",
        "mean_ms": total_ms / iterations,
        "iterations": iterations,
        "total_ms": total_ms,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,50 @@
"""
Benchmark: pandas Series.convert_dtypes() and DataFrame.convert_dtypes()

Creates a 50k-row dataset with object-dtype numeric, boolean, and string
columns, then measures how fast pandas can infer and convert to best dtypes.
"""
import json
import time

import numpy as np
import pandas as pd

N = 50_000
WARMUP = 3
ITERATIONS = 20


def run_benchmark(n=N, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``convert_dtypes`` on two object Series and a four-column DataFrame.

    Args:
        n: Number of rows in every generated column.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Columns with periodic missing values (same structure as the TypeScript version).
    int_data = [None if i % 17 == 0 else i for i in range(n)]
    float_data = [None if i % 13 == 0 else i * 1.5 for i in range(n)]
    str_data = [None if i % 11 == 0 else f"str_{i}" for i in range(n)]
    bool_data = [None if i % 7 == 0 else (i % 2 == 0) for i in range(n)]

    int_series = pd.Series(int_data, dtype=object)
    float_series = pd.Series(float_data, dtype=object)

    # NOTE(review): unlike the two Series above, the frame is built from the raw
    # lists without dtype=object, so pandas picks each column's dtype itself.
    # Confirm this matches what the TypeScript benchmark converts.
    df = pd.DataFrame({
        "int_col": int_data,
        "float_col": float_data,
        "str_col": str_data,
        "bool_col": bool_data,
    })

    for _ in range(warmup):
        int_series.convert_dtypes()
        float_series.convert_dtypes()
        df.convert_dtypes()

    start = time.perf_counter()
    for _ in range(iterations):
        int_series.convert_dtypes()
        float_series.convert_dtypes()
        df.convert_dtypes()
    total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "convert_dtypes",
        "mean_ms": total_ms / iterations,
        "iterations": iterations,
        "total_ms": total_ms,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,32 @@
"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)"""
import json
import time

import pandas as pd

N = 300
WARMUP = 3
ITERATIONS = 10


def run_benchmark(n=N, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``pd.merge(left, right, how="cross")`` on two ``n``-row DataFrames.

    Args:
        n: Number of rows in each input frame (the result has ``n * n`` rows).
        warmup: Number of untimed merges executed before measuring.
        iterations: Number of timed merges; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    left = pd.DataFrame({
        "id_a": list(range(n)),
        "val_a": [i * 1.5 for i in range(n)],
    })
    right = pd.DataFrame({
        "id_b": list(range(n)),
        "val_b": [i * 2.5 for i in range(n)],
    })

    for _ in range(warmup):
        pd.merge(left, right, how="cross")

    start = time.perf_counter()
    for _ in range(iterations):
        pd.merge(left, right, how="cross")
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "cross_join",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,56 @@
"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
import json
import time

import numpy as np
import pandas as pd

SIZE = 100_000
NUM_BINS = 20
WARMUP = 5
ITERATIONS = 50


def cut_bins_to_frame(data, num_bins):
    """Bin ``data`` into ``num_bins`` equal-width bins and summarise the bins.

    Args:
        data: 1-D numeric array-like accepted by ``pd.cut``.
        num_bins: Number of equal-width bins.

    Returns:
        Tuple ``(summary, count_dict, edges)``: ``summary`` is a DataFrame with
        the columns ``bin``, ``left``, ``right``, ``count`` and ``frequency``;
        ``count_dict`` maps each bin label to its count; ``edges`` is a
        DataFrame with the ``left`` and ``right`` edge of every bin.
    """
    cut_result = pd.cut(data, num_bins)
    # pd.cut returns a Categorical for non-Series input, and
    # Categorical.value_counts() has no `sort` keyword. Counting through a
    # Series keeps sort=False valid for either return type.
    counts = pd.Series(cut_result).value_counts(sort=False)

    # Summary DataFrame equivalent to cutBinsToFrame.
    summary = pd.DataFrame({
        "bin": counts.index.astype(str),
        "left": [iv.left for iv in counts.index],
        "right": [iv.right for iv in counts.index],
        "count": counts.values,
        "frequency": counts.values / len(data),
    })
    # cutBinCounts equivalent: counts dict.
    count_dict = dict(zip(counts.index.astype(str), counts.values))
    # binEdges equivalent: DataFrame of interval edges. The edges are derived
    # again on purpose so each equivalent does its own work, as in the
    # original script.
    edges = pd.DataFrame({
        "left": [iv.left for iv in counts.index],
        "right": [iv.right for iv in counts.index],
    })
    return summary, count_dict, edges


def run_benchmark(size=SIZE, num_bins=NUM_BINS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``cut_bins_to_frame`` on ``size`` generated values.

    Args:
        size: Number of generated values.
        num_bins: Number of bins passed to ``cut_bins_to_frame``.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    data = np.array([(i % 1000) * 0.1 for i in range(size)])

    for _ in range(warmup):
        cut_bins_to_frame(data, num_bins)

    start = time.perf_counter()
    for _ in range(iterations):
        cut_bins_to_frame(data, num_bins)
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "cut_bins_to_frame",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,50 @@
"""
Benchmark: DataFrame-to-DataFrame element-wise comparisons.

The existing dataframe_compare benchmark tests scalar comparisons only.
This tests df1.eq(df2), df1.ne(df2), df1.gt(df2), df1.le(df2) (DataFrame vs DataFrame).
Mirrors tsb dataFrameEq(df1, df2), dataFrameNe, dataFrameGt, dataFrameLe.

Outputs JSON: {"function": "dataframe_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import numpy as np
import pandas as pd

SIZE = 50_000
WARMUP = 5
ITERATIONS = 50


def run_benchmark(size=SIZE, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``eq``, ``ne``, ``gt`` and ``le`` between two ``size``-row DataFrames.

    Args:
        size: Number of rows in each of the two three-column frames.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    df1 = pd.DataFrame({
        "a": np.array([(i * 1.7) % 1000 for i in range(size)]),
        "b": np.array([(i * 2.3) % 1000 for i in range(size)]),
        "c": np.array([i % 100 for i in range(size)]),
    })

    df2 = pd.DataFrame({
        "a": np.array([(i * 2.1) % 1000 for i in range(size)]),
        "b": np.array([(i * 1.9) % 1000 for i in range(size)]),
        "c": np.array([(i + 7) % 100 for i in range(size)]),
    })

    for _ in range(warmup):
        df1.eq(df2)
        df1.ne(df2)
        df1.gt(df2)
        df1.le(df2)

    start = time.perf_counter()
    for _ in range(iterations):
        df1.eq(df2)
        df1.ne(df2)
        df1.gt(df2)
        df1.le(df2)
    total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_compare_pair",
        "mean_ms": total_ms / iterations,
        "iterations": iterations,
        "total_ms": total_ms,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
"""Benchmark: DataFrame.itertuples() — iterate over rows as namedtuples."""
import time

import pandas as pd

ROWS = 1_000
WARMUP = 5
ITERATIONS = 50


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time a full pass over ``DataFrame.itertuples()`` on a ``rows``-row frame.

    Each timed pass is measured on its own and the passes are summed, so only
    the iteration itself is counted.

    Args:
        rows: Number of rows in the three-column frame.
        warmup: Number of untimed passes executed before measuring.
        iterations: Number of timed passes; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``
        (unrounded floats).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    df = pd.DataFrame({
        "x": [i * 1.5 for i in range(rows)],
        "y": [i * 2.5 for i in range(rows)],
        "z": [i * 3.5 for i in range(rows)],
    })

    for _ in range(warmup):
        for _row in df.itertuples():
            pass

    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        for _row in df.itertuples():
            pass
        times.append(time.perf_counter() - t0)

    total = sum(times)  # seconds
    return {
        "function": "dataframe_itertuples",
        "mean_ms": (total / iterations) * 1000,
        "iterations": iterations,
        "total_ms": total * 1000,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    result = run_benchmark()
    mean_ms = result["mean_ms"]
    total_ms = result["total_ms"]
    iterations = result["iterations"]
    # Same hand-formatted line as the original script: timings fixed to six decimals.
    print(
        f'{{"function": "dataframe_itertuples", "mean_ms": {mean_ms:.6f}, '
        f'"iterations": {iterations}, "total_ms": {total_ms:.6f}}}'
    )


if __name__ == "__main__":
    main()
Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,40 @@
"""
Benchmark: pandas equivalents of named column-wise functions on a DataFrame.

Mirrors tsb dataFrameTransform with string names like "mean", "cumsum",
and ["sum", "mean"] applied column-wise.

Uses 10k-row DataFrame to match the TypeScript benchmark.
"""
import json
import time

import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 20


def apply_named_functions(df):
    """Apply "mean", "cumsum" and ["sum", "mean"] column-wise to ``df``.

    ``DataFrame.transform`` raises ``ValueError`` when the function's result
    does not keep the input's index, which is the case for the reducers
    "mean" and "sum". Those names therefore go through ``DataFrame.agg``,
    while the shape-preserving "cumsum" stays on ``DataFrame.transform``.

    NOTE(review): confirm that reducing to one value per column is what tsb
    dataFrameTransform does for "mean" / ["sum", "mean"].

    Returns:
        Tuple ``(mean, cumsum, sum_and_mean)`` of the three pandas results.
    """
    return (
        df.agg("mean"),
        df.transform("cumsum"),
        df.agg(["sum", "mean"]),
    )


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``apply_named_functions`` on a generated ``rows``-row DataFrame.

    Args:
        rows: Number of rows in the three-column frame.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    a = [(i % 100) * 1.5 + 1 for i in range(rows)]
    b = [((i * 3) % 200) * 0.5 + 2 for i in range(rows)]
    c = [((i * 7) % 50) * 2.0 + 0.5 for i in range(rows)]
    df = pd.DataFrame({"a": a, "b": b, "c": c})

    for _ in range(warmup):
        apply_named_functions(df)

    start = time.perf_counter()
    for _ in range(iterations):
        apply_named_functions(df)
    total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_transform_named",
        "mean_ms": total_ms / iterations,
        "iterations": iterations,
        "total_ms": total_ms,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()
Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
"""
Benchmark: DataFrame.update() — in-place-style DataFrame value update.

Mirrors tsb dataFrameUpdate.
Overwrites non-null values from `other` into `self`.
Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""

import json
import time

import numpy as np
import pandas as pd

N = 10_000
WARMUP = 20
ITERATIONS = 200


def run_benchmark(n=N, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``DataFrame.copy()`` followed by ``DataFrame.update(other)``.

    The copy is inside the timed loop because ``update`` mutates its target;
    every round therefore starts from the same unmodified frame.

    Args:
        n: Number of rows in both frames.
        warmup: Number of untimed rounds executed before measuring.
        iterations: Number of timed rounds; must be at least 1.

    Returns:
        dict with ``function``, ``mean_ms``, ``iterations`` and ``total_ms``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Build two DataFrames; `other` has NaN in ~2/3 of rows (so 1/3 rows are updated).
    a_data = [i * 1.0 for i in range(n)]
    b_data = [i * 2.0 for i in range(n)]
    a_other = [i * 10.0 if i % 3 == 0 else np.nan for i in range(n)]
    b_other = [i * 20.0 if i % 3 == 0 else np.nan for i in range(n)]

    df = pd.DataFrame({"a": a_data, "b": b_data})
    other = pd.DataFrame({"a": a_other, "b": b_other})

    for _ in range(warmup):
        dc = df.copy()
        dc.update(other)

    start = time.perf_counter()
    for _ in range(iterations):
        dc = df.copy()
        dc.update(other)
    total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_update",
        "mean_ms": total_ms / iterations,
        "iterations": iterations,
        "total_ms": total_ms,
    }


def main():
    """Run the benchmark with the module defaults and print the JSON result."""
    print(json.dumps(run_benchmark()))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)