Skip to content

Commit 1c01e02

Browse files
committed
removed dataset holdout logic
Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com>
1 parent c3f93ef commit 1c01e02

2 files changed

Lines changed: 12 additions & 340 deletions

File tree

examples/llm_ptq/hf_ptq.py

Lines changed: 11 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@
7171
)
7272
from modelopt.torch.utils.dataset_utils import (
7373
create_forward_loop,
74-
get_calib_and_holdout_dataloaders,
7574
get_dataset_dataloader,
7675
get_max_batch_size,
7776
get_supported_datasets,
@@ -204,10 +203,9 @@ def make_calib_dataloader(
204203
tokenizer: PreTrainedTokenizerBase | None,
205204
device: torch.device,
206205
model_type: str | None,
207-
) -> tuple[DataLoader | _DeviceDataLoader, str | None, Path | None]:
206+
) -> tuple[DataLoader | _DeviceDataLoader, str | None]:
208207
calib_dataloader = None
209208
first_text_speech_dataset = None
210-
holdout_path = None
211209
if args.specdec_offline_dataset is not None:
212210
offline_data_path = Path(args.specdec_offline_dataset)
213211
dumped_files = sorted(str(p) for p in offline_data_path.glob("*.pt"))
@@ -286,28 +284,15 @@ def make_calib_dataloader(
286284
args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
287285
)
288286

289-
if args.holdout_size > 0:
290-
calib_dataloader, holdout_path = get_calib_and_holdout_dataloaders(
291-
dataset_name=args.dataset,
292-
tokenizer=tokenizer,
293-
batch_size=args.batch_size,
294-
calib_size=args.calib_size,
295-
holdout_size=args.holdout_size,
296-
max_sample_length=args.calib_seq,
297-
device=device,
298-
include_labels=include_labels,
299-
save_dir=args.calib_data_dir,
300-
)
301-
else:
302-
calib_dataloader = get_dataset_dataloader(
303-
dataset_name=args.dataset,
304-
tokenizer=tokenizer,
305-
batch_size=args.batch_size,
306-
num_samples=args.calib_size,
307-
device=device,
308-
include_labels=include_labels,
309-
)
310-
return calib_dataloader, first_text_speech_dataset, holdout_path
287+
calib_dataloader = get_dataset_dataloader(
288+
dataset_name=args.dataset,
289+
tokenizer=tokenizer,
290+
batch_size=args.batch_size,
291+
num_samples=args.calib_size,
292+
device=device,
293+
include_labels=include_labels,
294+
)
295+
return calib_dataloader, first_text_speech_dataset
311296

312297

313298
def auto_quantize(
@@ -1049,7 +1034,7 @@ def quantize_main(
10491034

10501035
print(f"Use calib batch_size {args.batch_size}")
10511036

1052-
calib_dataloader, first_text_speech_dataset, holdout_path = make_calib_dataloader(
1037+
calib_dataloader, first_text_speech_dataset = make_calib_dataloader(
10531038
args, language_model, processor, tokenizer, device, model_type
10541039
)
10551040

@@ -1205,26 +1190,6 @@ def parse_args() -> argparse.Namespace:
12051190
type=str,
12061191
default="512",
12071192
)
1208-
parser.add_argument(
1209-
"--holdout_size",
1210-
help=(
1211-
"Number of holdout samples to save as a .pt file for evaluation. "
1212-
"Holdout samples are drawn from the same dataset immediately after "
1213-
"the calibration samples so there is no overlap. 0 disables holdout."
1214-
),
1215-
type=int,
1216-
default=0,
1217-
)
1218-
parser.add_argument(
1219-
"--calib_data_dir",
1220-
help=(
1221-
"Directory to save/load calib.pt and holdout.pt. "
1222-
"If both files exist, data is reloaded from disk instead of re-downloading. "
1223-
"Defaults to --export_path if not specified."
1224-
),
1225-
type=str,
1226-
default=None,
1227-
)
12281193
parser.add_argument(
12291194
"--calib_seq",
12301195
help="Maximum sequence length for calibration.",

0 commit comments

Comments (0)