From 12d434e0775ee7bb244e705943324f06f7cbf007 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 30 Mar 2026 09:24:38 -0700 Subject: [PATCH 1/4] Fix parquet loading crash from datasets version mismatch When local parquet files contain HF datasets metadata written by a different version of the `datasets` library, `load_dataset("parquet")` can raise a TypeError during feature deserialization. Fall back to reading via PyArrow directly in that case. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- .../specdec_bench/specdec_bench/datasets/speed.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index e3429126d9..49157e9037 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -716,7 +716,18 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data } else: data_files = {"test": [str(config_name_or_dataset_path_path)]} - dataset = load_dataset("parquet", data_files=data_files, split="test") + try: + dataset = load_dataset("parquet", data_files=data_files, split="test") + except TypeError: + # Fallback: parquet metadata may be incompatible with the installed + # ``datasets`` version. Read via PyArrow and convert directly. + import pyarrow + import pyarrow.parquet as pq + from datasets import Dataset as HFDataset + + tables = [pq.read_table(f) for f in data_files["test"]] + table = pyarrow.concat_tables(tables) if len(tables) > 1 else tables[0] + dataset = HFDataset(table) if self.num_samples is not None: dataset = dataset.select(range(self.num_samples)) return dataset From 30bcb261cc249cc54b8197808e4eec1c760fa686 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Tue, 31 Mar 2026 09:54:37 -0700 Subject: [PATCH 2/4] Strip HF metadata from arrow table in parquet fallback The PyArrow fallback still failed because HFDataset(table) parses the huggingface metadata embedded in the arrow schema, hitting the same TypeError. Strip that metadata before constructing the Dataset. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- examples/specdec_bench/specdec_bench/datasets/speed.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index 49157e9037..a22de37d11 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -727,6 +727,15 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data tables = [pq.read_table(f) for f in data_files["test"]] table = pyarrow.concat_tables(tables) if len(tables) > 1 else tables[0] + # Strip HF metadata from the schema to avoid Feature parsing errors + schema = table.schema + if schema.metadata and b"huggingface" in schema.metadata: + new_meta = { + k: v + for k, v in schema.metadata.items() + if k != b"huggingface" + } + table = table.replace_schema_metadata(new_meta or None) dataset = HFDataset(table) if self.num_samples is not None: dataset = dataset.select(range(self.num_samples)) From 455759c5e3af013f1b6ee5d17965370b6b0541d0 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Tue, 31 Mar 2026 09:55:28 -0700 Subject: [PATCH 3/4] Relax datasets version pin to avoid conflict with TRT-LLM The tensorrt_llm 1.3.0rc5 container pins datasets==3.1.0. The previous pin (>=4.4.0) caused concurrent pip installs across ranks to race and corrupt the datasets package, breaking tensorrt_llm imports entirely. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- examples/specdec_bench/requirements_speed.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/specdec_bench/requirements_speed.txt b/examples/specdec_bench/requirements_speed.txt index 5b0117e3a7..549a5d73e8 100644 --- a/examples/specdec_bench/requirements_speed.txt +++ b/examples/specdec_bench/requirements_speed.txt @@ -1,4 +1,4 @@ -datasets>=4.4.0,<5.0.0 +datasets>=3.1.0 rich>=14.2.0 seaborn>=0.13.2 tiktoken>=0.12.0 From 95a9395fa8e116ff8cf586e220f5e14242d950f3 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 6 Apr 2026 12:40:10 -0700 Subject: [PATCH 4/4] fix: catch ValueError in parquet fallback for HF datasets compat HF datasets raises ValueError (not just TypeError) when it encounters unknown feature types in embedded parquet metadata. Catch both so the PyArrow fallback triggers correctly. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- examples/specdec_bench/specdec_bench/datasets/speed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index a22de37d11..fe544bb353 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -718,7 +718,7 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data data_files = {"test": [str(config_name_or_dataset_path_path)]} try: dataset = load_dataset("parquet", data_files=data_files, split="test") - except TypeError: + except (TypeError, ValueError): # Fallback: parquet metadata may be incompatible with the installed # ``datasets`` version. Read via PyArrow and convert directly. import pyarrow