githubnext
diff --git a/‎benchmarks/pandas/bench_at_iat.py‎
Lines changed: 37 additions & 0 deletions b/‎benchmarks/pandas/bench_at_iat.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_autocorr.py‎
Lines changed: 37 additions & 0 deletions b/‎benchmarks/pandas/bench_autocorr.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_convert_dtypes.py‎
Lines changed: 50 additions & 0 deletions b/‎benchmarks/pandas/bench_convert_dtypes.py‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_cross_join.py‎
Lines changed: 32 additions & 0 deletions b/‎benchmarks/pandas/bench_cross_join.py‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_cut_bins_to_frame.py‎
Lines changed: 56 additions & 0 deletions b/‎benchmarks/pandas/bench_cut_bins_to_frame.py‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_dataframe_compare_pair.py‎
Lines changed: 50 additions & 0 deletions b/‎benchmarks/pandas/bench_dataframe_compare_pair.py‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_dataframe_itertuples.py‎
Lines changed: 29 additions & 0 deletions b/‎benchmarks/pandas/bench_dataframe_itertuples.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_dataframe_transform_named.py‎
Lines changed: 40 additions & 0 deletions b/‎benchmarks/pandas/bench_dataframe_transform_named.py‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎benchmarks/pandas/bench_dataframe_update.py‎
Lines changed: 48 additions & 0 deletions b/‎benchmarks/pandas/bench_dataframe_update.py‎
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,37 @@
+"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access"""
+import json
+import time
+import pandas as pd
+
+N = 100_000
+WARMUP = 3
+ITERATIONS = 10
+
+labels = [f"r{i}" for i in range(N)]
+values = [i * 1.5 for i in range(N)]
+
+s = pd.Series(values, index=labels)
+df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels)
+
+mid_label = f"r{N // 2}"
+
+for _ in range(WARMUP):
+    _ = s.at[mid_label]
+    _ = s.iat[N // 2]
+    _ = df.at[mid_label, "a"]
+    _ = df.iat[N // 2, 0]
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    _ = s.at[mid_label]
+    _ = s.iat[N // 2]
+    _ = df.at[mid_label, "a"]
+    _ = df.iat[N // 2, 0]
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "at_iat",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
@@ -0,0 +1,37 @@
+"""
+Benchmark: Series.autocorr(lag) — lag-N autocorrelation for a 100k-element numeric Series.
+
+Mirrors tsb autoCorr.
+Benchmarks lag=1, lag=5, and lag=20.
+Outputs JSON: {"function": "autocorr", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import math
+import time
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+data = [math.sin(i * 0.05) + (i % 7) * 0.01 for i in range(SIZE)]
+s = pd.Series(data)
+
+for _ in range(WARMUP):
+    s.autocorr(lag=1)
+    s.autocorr(lag=5)
+    s.autocorr(lag=20)
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.autocorr(lag=1)
+    s.autocorr(lag=5)
+    s.autocorr(lag=20)
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "autocorr",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
@@ -0,0 +1,50 @@
+"""
+Benchmark: pandas Series.convert_dtypes() and DataFrame.convert_dtypes()
+
+Creates a 50k-row dataset with object-dtype numeric, boolean, and string
+columns, then measures how fast pandas can infer and convert to best dtypes.
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+N = 50_000
+WARMUP = 3
+ITERATIONS = 20
+
+# Object-dtype arrays (same structure as the TypeScript version)
+int_data = [None if i % 17 == 0 else i for i in range(N)]
+float_data = [None if i % 13 == 0 else i * 1.5 for i in range(N)]
+str_data = [None if i % 11 == 0 else f"str_{i}" for i in range(N)]
+bool_data = [None if i % 7 == 0 else (i % 2 == 0) for i in range(N)]
+
+int_series = pd.Series(int_data, dtype=object)
+float_series = pd.Series(float_data, dtype=object)
+
+df = pd.DataFrame({
+    "int_col": int_data,
+    "float_col": float_data,
+    "str_col": str_data,
+    "bool_col": bool_data,
+})
+
+# Warm-up
+for _ in range(WARMUP):
+    int_series.convert_dtypes()
+    float_series.convert_dtypes()
+    df.convert_dtypes()
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    int_series.convert_dtypes()
+    float_series.convert_dtypes()
+    df.convert_dtypes()
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "convert_dtypes",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
@@ -0,0 +1,32 @@
+"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)"""
+import json
+import time
+import pandas as pd
+
+N = 300
+WARMUP = 3
+ITERATIONS = 10
+
+left = pd.DataFrame({
+    "id_a": list(range(N)),
+    "val_a": [i * 1.5 for i in range(N)],
+})
+right = pd.DataFrame({
+    "id_b": list(range(N)),
+    "val_b": [i * 2.5 for i in range(N)],
+})
+
+for _ in range(WARMUP):
+    pd.merge(left, right, how="cross")
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    pd.merge(left, right, how="cross")
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "cross_join",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
@@ -0,0 +1,56 @@
+"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
+import json, time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+NUM_BINS = 20
+WARMUP = 5
+ITERATIONS = 50
+
+data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])
+
+for _ in range(WARMUP):
+    # pandas equivalent of cutBinsToFrame: cut + value_counts on the categorical result
+    cut_result = pd.cut(data, NUM_BINS)
+    # Summary DataFrame equivalent to cutBinsToFrame
+    counts = cut_result.value_counts(sort=False)
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    # cutBinCounts equivalent: counts dict
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    # binEdges equivalent: DataFrame of interval edges
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    cut_result = pd.cut(data, NUM_BINS)
+    counts = cut_result.value_counts(sort=False)
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "cut_bins_to_frame",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
@@ -0,0 +1,50 @@
+"""
+Benchmark: DataFrame-to-DataFrame element-wise comparisons.
+
+The existing dataframe_compare benchmark tests scalar comparisons only.
+This tests df1.eq(df2), df1.ne(df2), df1.gt(df2), df1.le(df2) (DataFrame vs DataFrame).
+Mirrors tsb dataFrameEq(df1, df2), dataFrameNe, dataFrameGt, dataFrameLe.
+
+Outputs JSON: {"function": "dataframe_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+SIZE = 50_000
+WARMUP = 5
+ITERATIONS = 50
+
+df1 = pd.DataFrame({
+    "a": np.array([(i * 1.7) % 1000 for i in range(SIZE)]),
+    "b": np.array([(i * 2.3) % 1000 for i in range(SIZE)]),
+    "c": np.array([i % 100 for i in range(SIZE)]),
+})
+
+df2 = pd.DataFrame({
+    "a": np.array([(i * 2.1) % 1000 for i in range(SIZE)]),
+    "b": np.array([(i * 1.9) % 1000 for i in range(SIZE)]),
+    "c": np.array([(i + 7) % 100 for i in range(SIZE)]),
+})
+
+for _ in range(WARMUP):
+    df1.eq(df2)
+    df1.ne(df2)
+    df1.gt(df2)
+    df1.le(df2)
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    df1.eq(df2)
+    df1.ne(df2)
+    df1.gt(df2)
+    df1.le(df2)
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "dataframe_compare_pair",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
@@ -0,0 +1,29 @@
+"""Benchmark: DataFrame.itertuples() — iterate over rows as namedtuples."""
+import time
+import pandas as pd
+
+ROWS = 1_000
+WARMUP = 5
+ITERATIONS = 50
+
+df = pd.DataFrame({
+    "x": [i * 1.5 for i in range(ROWS)],
+    "y": [i * 2.5 for i in range(ROWS)],
+    "z": [i * 3.5 for i in range(ROWS)],
+})
+
+for _ in range(WARMUP):
+    for _row in df.itertuples():
+        pass
+
+times = []
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    for _row in df.itertuples():
+        pass
+    times.append(time.perf_counter() - t0)
+
+total = sum(times)
+mean_ms = (total / ITERATIONS) * 1000
+total_ms = total * 1000
+print(f'{{"function": "dataframe_itertuples", "mean_ms": {mean_ms:.6f}, "iterations": {ITERATIONS}, "total_ms": {total_ms:.6f}}}')
@@ -0,0 +1,40 @@
+"""
+Benchmark: pandas DataFrame.transform() with named aggregation strings.
+
+Mirrors tsb dataFrameTransform with string names like "mean", "cumsum",
+and ["sum", "mean"] applied column-wise.
+
+Uses 10k-row DataFrame to match the TypeScript benchmark.
+"""
+import json
+import time
+import pandas as pd
+
+ROWS = 10_000
+WARMUP = 3
+ITERATIONS = 20
+
+a = [(i % 100) * 1.5 + 1 for i in range(ROWS)]
+b = [((i * 3) % 200) * 0.5 + 2 for i in range(ROWS)]
+c = [((i * 7) % 50) * 2.0 + 0.5 for i in range(ROWS)]
+df = pd.DataFrame({"a": a, "b": b, "c": c})
+
+# Warm-up
+for _ in range(WARMUP):
+    df.transform("mean")
+    df.transform("cumsum")
+    df.transform(["sum", "mean"])
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    df.transform("mean")
+    df.transform("cumsum")
+    df.transform(["sum", "mean"])
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "dataframe_transform_named",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
@@ -0,0 +1,48 @@
+"""
+Benchmark: DataFrame.update() — in-place-style DataFrame value update.
+
+Mirrors tsb dataFrameUpdate.
+Overwrites non-null values from `other` into `self`.
+Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+N = 10_000
+WARMUP = 20
+ITERATIONS = 200
+
+# Build two DataFrames; `other` has NaN in ~2/3 of rows (so 1/3 rows are updated).
+a_data = [i * 1.0 for i in range(N)]
+b_data = [i * 2.0 for i in range(N)]
+a_other = [i * 10.0 if i % 3 == 0 else np.nan for i in range(N)]
+b_other = [i * 20.0 if i % 3 == 0 else np.nan for i in range(N)]
+
+df = pd.DataFrame({"a": a_data, "b": b_data})
+other = pd.DataFrame({"a": a_other, "b": b_other})
+
+# Warm-up
+for _ in range(WARMUP):
+    dc = df.copy()
+    dc.update(other)
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    dc = df.copy()
+    dc.update(other)
+total_ms = (time.perf_counter() - start) * 1000
+
+print(
+    json.dumps(
+        {
+            "function": "dataframe_update",
+            "mean_ms": total_ms / ITERATIONS,
+            "iterations": ITERATIONS,
+            "total_ms": total_ms,
+        }
+    )
+)