srpraneeth
diff --git a/benchmarks/_common.py b/benchmarks/_common.py
Lines changed: 2 additions & 4 deletions
diff --git a/benchmarks/baselines/manual_sharded.py b/benchmarks/baselines/manual_sharded.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/baselines/naive_iterable.py b/benchmarks/baselines/naive_iterable.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/gen_data.py b/benchmarks/gen_data.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/report.py b/benchmarks/report.py
Lines changed: 309 additions & 126 deletions
diff --git a/benchmarks/run.py b/benchmarks/run.py
Lines changed: 27 additions & 12 deletions
diff --git a/benchmarks/scenarios/__init__.py b/benchmarks/scenarios/__init__.py
Lines changed: 7 additions & 7 deletions
diff --git a/benchmarks/scenarios/s1_throughput.py b/benchmarks/scenarios/s1_throughput.py
Lines changed: 3 additions & 1 deletion
diff --git a/benchmarks/scenarios/s3_single_large.py b/benchmarks/scenarios/s3_single_large.py
Lines changed: 20 additions & 8 deletions
diff --git a/benchmarks/scenarios/s4_rank_sharding.py b/benchmarks/scenarios/s4_rank_sharding.py
Lines changed: 7 additions & 3 deletions
@@ -7,7 +7,7 @@
 import os
 import statistics
 import time
-from typing import Callable
+from collections.abc import Callable
 
 try:
     import psutil
@@ -152,9 +152,7 @@ def verify_manifest(data_dir: str) -> dict:
             for chunk in iter(lambda: f.read(65536), b""):
                 h.update(chunk)
         if h.hexdigest() != entry["sha256"]:
-            raise ValueError(
-                f"Checksum mismatch for {entry['name']} — re-run gen_data.py"
-            )
+            raise ValueError(f"Checksum mismatch for {entry['name']} — re-run gen_data.py")
     return manifest
 
 
 
@@ -9,8 +9,8 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterator
 from glob import glob
-from typing import Iterator
 
 import pyarrow.parquet as pq
 import torch.utils.data as tud
 
@@ -8,8 +8,8 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterator
 from glob import glob
-from typing import Iterator
 
 import pyarrow.parquet as pq
 import torch.utils.data as tud
 
@@ -95,7 +95,7 @@ def generate(out_dir: str, dataset: str, fmt: str = "parquet") -> dict:
         table = make_table(n_rows, row_id_offset=total)
 
         if cfg.get("sorted_by_label"):
-            import pyarrow.compute as pc
+
             table = table.sort_by("label")
 
         if fmt == "parquet":
 
@@ -38,28 +38,44 @@
 BASELINE_PATH = os.path.join(RESULTS_DIR, "baseline.json")
 
 CI_DATASETS = {
-    "S1": "tiny", "S2": "unequal", "S3": "single_large",
-    "S4": "tiny", "S5": "tiny",    "S6": "root",
-    "S7": "tiny", "S8": "tiny",
+    "S1": "tiny",
+    "S2": "unequal",
+    "S3": "single_large",
+    "S4": "tiny",
+    "S5": "tiny",
+    "S6": "root",
+    "S7": "tiny",
+    "S8": "tiny",
 }
 DEFAULT_DATASETS = {
-    "S1": "large", "S2": "unequal", "S3": "single_large",
-    "S4": "large", "S5": "large",   "S6": "root",
-    "S7": "small", "S8": "small",
+    "S1": "large",
+    "S2": "unequal",
+    "S3": "single_large",
+    "S4": "large",
+    "S5": "large",
+    "S6": "root",
+    "S7": "small",
+    "S8": "small",
 }
 
 
 def _run_metadata() -> dict:
     import importlib.metadata
+
     try:
         version = importlib.metadata.version("torch-dataloader-utils")
     except Exception:
         version = "dev"
     try:
         import subprocess
-        git_sha = subprocess.check_output(
-            ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
-        ).decode().strip()
+
+        git_sha = (
+            subprocess.check_output(
+                ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
+            )
+            .decode()
+            .strip()
+        )
     except Exception:
         git_sha = "unknown"
     return {
@@ -134,7 +150,6 @@ def main() -> int:
             module, uses_root = ALL_SCENARIOS[sid]
             ds = dataset_map[sid]
             d = _dataset_dir(args.data_dir, sid, ds, uses_root)
-            check_dir = d if uses_root else d
             # For root-dir scenarios, verify the subdatasets that exist
             if uses_root:
                 for sub in ["tiny", "small", "medium", "large"]:
@@ -158,11 +173,11 @@ def main() -> int:
         module, uses_root = ALL_SCENARIOS[sid]
         ds = dataset_map[sid]
         d = _dataset_dir(args.data_dir, sid, ds, uses_root)
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(f"Running {sid}: {module.__name__.split('.')[-1]}")
         print(f"  data_dir : {d}")
         print(f"  n_runs   : {n_runs}  n_warmup: {n_warmup}")
-        print(f"{'='*60}")
+        print(f"{'=' * 60}")
         try:
             result = module.run(d, n_warmup=n_warmup, n_runs=n_runs)
             all_results["scenarios"][sid] = result
 
@@ -13,12 +13,12 @@
 # uses_root_dir=True  → scenario receives the parent data dir (contains dataset subdirs)
 # uses_root_dir=False → scenario receives the single-dataset subdirectory directly
 ALL_SCENARIOS: dict[str, tuple] = {
-    "S1": (s1_throughput,         False),
-    "S2": (s2_unequal,            False),
-    "S3": (s3_single_large,       False),
-    "S4": (s4_rank_sharding,      False),
-    "S5": (s5_column_projection,  False),
+    "S1": (s1_throughput, False),
+    "S2": (s2_unequal, False),
+    "S3": (s3_single_large, False),
+    "S4": (s4_rank_sharding, False),
+    "S5": (s5_column_projection, False),
     "S6": (s6_predicate_pushdown, True),
-    "S7": (s7_startup_latency,    True),
-    "S8": (s8_format_comparison,  True),
+    "S7": (s7_startup_latency, True),
+    "S8": (s8_format_comparison, True),
 }
@@ -14,7 +14,9 @@
 from benchmarks._common import load_manifest, measure, parquet_glob, passthrough
 from torch_dataloader_utils import StructuredDataset
 
-DESCRIPTION = "Baseline throughput sweep across num_workers on equal-sized files. All three implementations."
+DESCRIPTION = (
+    "Baseline throughput sweep across num_workers on equal-sized files. All three implementations."
+)
 DATASET = "small"
 WORKER_COUNTS = [0, 2, 4, 8]
 BATCH_SIZE = 1024
 
@@ -13,7 +13,7 @@
 import statistics
 import time
 
-from benchmarks._common import parquet_glob, passthrough, load_manifest, run_epoch
+from benchmarks._common import load_manifest, parquet_glob, passthrough, run_epoch
 from torch_dataloader_utils import StructuredDataset
 
 DESCRIPTION = (
@@ -60,24 +60,35 @@ def run(data_dir: str, n_warmup: int = 1, n_runs: int = 5) -> dict:
     # Warmup: prime OS disk cache
     for _ in range(n_warmup):
         loader, _ = StructuredDataset.create_dataloader(
-            path=parquet_glob(data_dir), format="parquet", num_workers=0,
-            batch_size=BATCH_SIZE, split_bytes=SPLIT_BYTES,
-            output_format="arrow", collate_fn=passthrough,
+            path=parquet_glob(data_dir),
+            format="parquet",
+            num_workers=0,
+            batch_size=BATCH_SIZE,
+            split_bytes=SPLIT_BYTES,
+            output_format="arrow",
+            collate_fn=passthrough,
         )
         run_epoch(loader)
 
     def _lib_worker(w: int):
         loader, _ = StructuredDataset.create_dataloader(
-            path=parquet_glob(data_dir), format="parquet", num_workers=0,
-            batch_size=BATCH_SIZE, split_bytes=SPLIT_BYTES,
-            num_ranks=NUM_WORKERS, rank=w,
-            output_format="arrow", collate_fn=passthrough,
+            path=parquet_glob(data_dir),
+            format="parquet",
+            num_workers=0,
+            batch_size=BATCH_SIZE,
+            split_bytes=SPLIT_BYTES,
+            num_ranks=NUM_WORKERS,
+            rank=w,
+            output_format="arrow",
+            collate_fn=passthrough,
         )
         return loader
 
     def _manual_worker(w: int):
         from glob import glob
+
         import pyarrow.parquet as pq
+
         files = sorted(glob(f"{data_dir}/*.parquet"))
         my_files = files[w::NUM_WORKERS]
 
@@ -88,6 +99,7 @@ def __iter__(self_):
                     for batch in pf.iter_batches(BATCH_SIZE):
                         if batch.num_rows > 0:
                             yield batch
+
         return _W()
 
     lib_stats = _simulate_parallel(_lib_worker, NUM_WORKERS, n_runs)
 
@@ -11,7 +11,7 @@
 
 from __future__ import annotations
 
-from benchmarks._common import parquet_glob, passthrough, load_manifest, measure
+from benchmarks._common import load_manifest, measure, parquet_glob, passthrough
 from torch_dataloader_utils import StructuredDataset
 
 DESCRIPTION = (
@@ -34,7 +34,11 @@ def run(data_dir: str, n_warmup: int = 1, n_runs: int = 5) -> dict:
         "dataset": DATASET,
         "total_rows": total,
         "num_workers": NUM_WORKERS,
-        "config": {"num_workers": NUM_WORKERS, "batch_size": BATCH_SIZE, "rank_counts": RANK_COUNTS},
+        "config": {
+            "num_workers": NUM_WORKERS,
+            "batch_size": BATCH_SIZE,
+            "rank_counts": RANK_COUNTS,
+        },
         "this_library": {},
         "naive_ddp": {},
     }
@@ -89,7 +93,7 @@ def _loader(nr=nr):
 
         # naive_ddp at this num_ranks: reads all rows but delivers only 1/nr fraction
         naive_elapsed = naive_stats["elapsed_sec"]["median"]
-        naive_rps = naive_stats["rows_per_sec"]["median"]
+        _naive_rps = naive_stats["rows_per_sec"]["median"]
         results["naive_ddp"][f"num_ranks={nr}"] = {
             "rows_received": actual_rows,
             "fraction_of_total": round(actual_rows / total, 4),