Skip to content

Commit c531a18

Browse files
Making changes for code uniformity and removing unnecessary code
1 parent 21a4cc8 commit c531a18

15 files changed

Lines changed: 175 additions & 142 deletions

examples/benchmarks/ort_inference_performance.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,15 +9,16 @@
99
python3 examples/benchmarks/ort_inference_performance.py --model_source in-house
1010
1111
HuggingFace models:
12-
python3 examples/benchmarks/ort_inference_performance.py --model_source huggingface --model_identifier bert-base-uncased
13-
python3 examples/benchmarks/ort_inference_performance.py --model_source huggingface --model_identifier microsoft/resnet-50
12+
python3 examples/benchmarks/ort_inference_performance.py \
13+
--model_source huggingface --model_identifier bert-base-uncased
14+
python3 examples/benchmarks/ort_inference_performance.py \
15+
--model_source huggingface --model_identifier microsoft/resnet-50
1416
1517
Environment variables:
1618
HF_TOKEN: HuggingFace token for gated models (optional)
1719
"""
1820

1921
import argparse
20-
import os
2122

2223
from superbench.benchmarks import BenchmarkRegistry, Platform
2324
from superbench.common.utils import logger
@@ -76,7 +77,10 @@ def run_huggingface_benchmark(model_identifier, precision='float16', batch_size=
7677
'--model_source', type=str, default='in-house', choices=['in-house', 'huggingface'],
7778
help='Source of the model: in-house (default) or huggingface'
7879
)
79-
parser.add_argument('--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier')
80+
parser.add_argument(
81+
'--model_identifier', type=str, default='bert-base-uncased',
82+
help='HuggingFace model identifier'
83+
)
8084
parser.add_argument('--precision', type=str, default='float16', choices=['float32', 'float16', 'int8'])
8185
parser.add_argument('--batch_size', type=int, default=32)
8286
parser.add_argument('--seq_length', type=int, default=512)

examples/benchmarks/pytorch_huggingface_models.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@
1717
"""
1818

1919
import argparse
20-
import os
2120

2221
from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
2322
from superbench.common.utils import logger
@@ -57,11 +56,11 @@ def run_huggingface_benchmark(model_key, distributed=False, precision='float32',
5756
# Build parameters with HuggingFace model source
5857
parameters = (
5958
f"{model_config['parameters']} "
60-
f"--duration {duration} "
61-
f"--precision {precision} "
62-
f"--run_count 2 "
63-
f"--model_source huggingface "
64-
f"--model_identifier {hf_identifier}"
59+
f'--duration {duration} '
60+
f'--precision {precision} '
61+
f'--run_count 2 '
62+
f'--model_source huggingface '
63+
f'--model_identifier {hf_identifier}'
6564
)
6665

6766
if distributed:

examples/benchmarks/tensorrt_inference_performance.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,15 +9,16 @@
99
python3 examples/benchmarks/tensorrt_inference_performance.py --model_source in-house
1010
1111
HuggingFace models:
12-
python3 examples/benchmarks/tensorrt_inference_performance.py --model_source huggingface --model_identifier bert-base-uncased
13-
python3 examples/benchmarks/tensorrt_inference_performance.py --model_source huggingface --model_identifier microsoft/resnet-50
12+
python3 examples/benchmarks/tensorrt_inference_performance.py \
13+
--model_source huggingface --model_identifier bert-base-uncased
14+
python3 examples/benchmarks/tensorrt_inference_performance.py \
15+
--model_source huggingface --model_identifier microsoft/resnet-50
1416
1517
Environment variables:
1618
HF_TOKEN: HuggingFace token for gated models (optional)
1719
"""
1820

1921
import argparse
20-
import os
2122

2223
from superbench.benchmarks import BenchmarkRegistry, Platform
2324
from superbench.common.utils import logger
@@ -76,14 +77,20 @@ def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32,
7677
'--model_source', type=str, default='in-house', choices=['in-house', 'huggingface'],
7778
help='Source of the model: in-house (default) or huggingface'
7879
)
79-
parser.add_argument('--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier')
80+
parser.add_argument(
81+
'--model_identifier', type=str, default='bert-base-uncased',
82+
help='HuggingFace model identifier'
83+
)
8084
parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'int8'])
8185
parser.add_argument('--batch_size', type=int, default=32)
8286
parser.add_argument('--seq_length', type=int, default=512)
8387
parser.add_argument('--iterations', type=int, default=2048)
8488
args = parser.parse_args()
8589

8690
if args.model_source == 'huggingface':
87-
run_huggingface_benchmark(args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations)
91+
run_huggingface_benchmark(
92+
args.model_identifier, args.precision, args.batch_size,
93+
args.seq_length, args.iterations
94+
)
8895
else:
8996
run_inhouse_benchmark()

superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,8 @@
44
"""Hugging Face model loader for benchmarking."""
55

66
import os
7-
import warnings
87
from pathlib import Path
9-
from typing import Optional, Tuple, Dict, Any, List
8+
from typing import Optional, Tuple
109

1110
import torch
1211
from transformers import (
@@ -173,7 +172,7 @@ def load_model(
173172
if 'not found' in str(e).lower() or '404' in str(e):
174173
raise ModelNotFoundError(
175174
f"Model '{model_identifier}' not found on Hugging Face Hub. "
176-
f"Please check the model ID at https://huggingface.co/models"
175+
f'Please check the model ID at https://huggingface.co/models'
177176
) from e
178177
raise ModelLoadError(f"Failed to load model '{model_identifier}': {e}") from e
179178
except Exception as e:
@@ -206,7 +205,7 @@ def load_model_from_config(
206205
# Validate config
207206
is_valid, error = config.validate()
208207
if not is_valid:
209-
raise ValueError(f"Invalid configuration: {error}")
208+
raise ValueError(f'Invalid configuration: {error}')
210209

211210
if device is None:
212211
device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -247,7 +246,7 @@ def _get_torch_dtype(self, dtype_str: str) -> torch.dtype:
247246
if dtype_str.lower() not in dtype_map:
248247
raise ValueError(
249248
f"Invalid dtype '{dtype_str}'. "
250-
f"Must be one of {list(dtype_map.keys())}"
249+
f'Must be one of {list(dtype_map.keys())}'
251250
)
252251

253252
return dtype_map[dtype_str.lower()]

superbench/benchmarks/micro_benchmarks/model_source_config.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
"""Configuration classes for model source and loading."""
55

66
from dataclasses import dataclass, field
7-
from typing import Optional, Dict, Any
7+
from typing import Optional, Dict, Any, Tuple
88

99

1010
@dataclass
@@ -55,14 +55,14 @@ def __post_init__(self):
5555
if self.torch_dtype not in valid_dtypes:
5656
raise ValueError(
5757
f"Invalid torch_dtype '{self.torch_dtype}'. "
58-
f"Must be one of {valid_dtypes}."
58+
f'Must be one of {valid_dtypes}.'
5959
)
6060

6161
# Validate identifier is provided
6262
if not self.identifier:
63-
raise ValueError("Model identifier must be provided.")
63+
raise ValueError('Model identifier must be provided.')
6464

65-
def validate(self) -> tuple[bool, str]:
65+
def validate(self) -> Tuple[bool, str]:
6666
"""Validate configuration parameters.
6767
6868
Returns:
@@ -74,7 +74,7 @@ def validate(self) -> tuple[bool, str]:
7474
if not self.identifier or not self.identifier.strip():
7575
return (
7676
False,
77-
"HuggingFace model identifier cannot be empty"
77+
'HuggingFace model identifier cannot be empty'
7878
)
7979

8080
return (True, '')
@@ -94,5 +94,5 @@ def __repr__(self) -> str:
9494
f"ModelSourceConfig(source='{self.source}', "
9595
f"identifier='{self.identifier}', "
9696
f"torch_dtype='{self.torch_dtype}', "
97-
f"hf_token={token_status})"
97+
f'hf_token={token_status})'
9898
)

superbench/benchmarks/micro_benchmarks/ort_inference_performance.py

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -175,7 +175,6 @@ def _preprocess_huggingface_models(self):
175175
Returns:
176176
bool: True if preprocessing succeeds.
177177
"""
178-
import torch
179178
import os
180179

181180
if not self._args.model_identifier:
@@ -192,7 +191,7 @@ def _preprocess_huggingface_models(self):
192191
if hf_token:
193192
load_kwargs['token'] = hf_token
194193
hf_config = AutoConfig.from_pretrained(
195-
self._args.model_identifier, **load_kwargs
194+
self._args.model_identifier, trust_remote_code=True, **load_kwargs
196195
)
197196

198197
precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32'
@@ -232,8 +231,13 @@ def _preprocess_huggingface_models(self):
232231
proc_output_path = self.__model_cache_path / f'rank_{proc_rank}'
233232
proc_output_path.mkdir(parents=True, exist_ok=True)
234233

235-
# Include precision in model name to match expected filename format
236-
model_name_with_precision = f'{model_name}.{self._args.precision.value}'
234+
# For INT8, export as float32 first then quantize (matching in-house model behavior).
235+
# For other precisions, include precision in the model name directly.
236+
if self._args.precision == Precision.INT8:
237+
export_precision = Precision.FLOAT32.value
238+
else:
239+
export_precision = self._args.precision.value
240+
model_name_with_precision = f'{model_name}.{export_precision}'
237241

238242
# Export directly to final destination to avoid path issues with external data
239243
onnx_path = exporter.export_huggingface_model(
@@ -251,15 +255,15 @@ def _preprocess_huggingface_models(self):
251255
# Apply INT8 quantization if requested (matching in-house model behavior)
252256
if self._args.precision == Precision.INT8:
253257
from onnxruntime.quantization import quantize_dynamic
254-
quantized_path = str(proc_output_path / f'{model_name}.{self._args.precision.value}.onnx')
258+
quantized_path = str(proc_output_path / f'{model_name}.{Precision.INT8.value}.onnx')
255259
quantize_dynamic(onnx_path, quantized_path)
256-
logger.info(f'Applied INT8 quantization to HuggingFace model')
260+
logger.info('Applied INT8 quantization to HuggingFace model')
257261

258262
# Update model list and cache path for benchmarking
259263
self._args.pytorch_models = [model_name]
260264
self.__model_cache_path = proc_output_path
261265

262-
logger.info(f'Successfully prepared HuggingFace model for ORT inference')
266+
logger.info('Successfully prepared HuggingFace model for ORT inference')
263267
return True
264268

265269
except Exception as e:
@@ -280,15 +284,13 @@ def _benchmark(self):
280284
f'CUDAExecutionProvider is not available (available: {available}).'
281285
)
282286
return False
283-
providers = ['CUDAExecutionProvider']
284-
logger.info(f'ORT Inference - using providers: {providers}')
285287

286288
for model in self._args.pytorch_models:
287289
sess_options = ort.SessionOptions()
288290
sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level]
289291
file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision)
290292
ort_sess = ort.InferenceSession(
291-
f'{self.__model_cache_path / file_name}', sess_options, providers=providers
293+
f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider']
292294
)
293295

294296
elapse_times = self.__inference(ort_sess)

superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py

Lines changed: 29 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,7 @@ def _preprocess(self):
128128
f'--onnx={onnx_model}',
129129
# build options
130130
f'--optShapes=input:{input_shape}',
131-
f'--memPoolSize=workspace:8192M',
131+
'--memPoolSize=workspace:8192M',
132132
None if self._args.precision == 'fp32' else f'--{self._args.precision}',
133133
# inference options
134134
f'--iterations={self._args.iterations}',
@@ -145,9 +145,7 @@ def _preprocess_huggingface_models(self):
145145
Returns:
146146
bool: True if preprocessing succeeds.
147147
"""
148-
import torch
149148
import os
150-
import time
151149
from transformers import AutoConfig
152150

153151
if not self._args.model_identifier:
@@ -163,7 +161,7 @@ def _preprocess_huggingface_models(self):
163161
load_kwargs['token'] = hf_token
164162

165163
hf_config = AutoConfig.from_pretrained(
166-
self._args.model_identifier, **load_kwargs
164+
self._args.model_identifier, trust_remote_code=True, **load_kwargs
167165
)
168166
precision_str = self._args.precision # already a string: 'fp16', 'fp32', 'int8'
169167
fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
@@ -173,27 +171,43 @@ def _preprocess_huggingface_models(self):
173171
return False
174172

175173
# Step 2: Download and load the full model
176-
logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
177174

178-
loader = HuggingFaceModelLoader()
179-
hf_model, hf_config, tokenizer = loader.load_model(
180-
model_identifier=self._args.model_identifier,
175+
# Get GPU rank to create unique file paths and avoid race conditions
176+
# when multiple processes export the same model simultaneously
177+
gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0')
178+
proc_rank = os.getenv('PROC_RANK', gpu_rank)
179+
180+
# Create model source config - load on CPU to avoid accelerate dispatching
181+
# model across multiple GPUs which causes device mismatch during ONNX export.
182+
# TensorRT handles precision internally via --fp16/--int8 flags,
183+
# so the ONNX model is always exported in float32.
184+
model_config = ModelSourceConfig(
185+
source='huggingface',
186+
identifier=self._args.model_identifier,
187+
hf_token=hf_token,
181188
torch_dtype='float32',
182-
device='cpu',
183189
device_map=None,
184190
)
191+
192+
logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
193+
194+
# Load model from HuggingFace on CPU
195+
loader = HuggingFaceModelLoader()
196+
hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu')
185197
exporter = torch2onnxExporter()
186198

187-
# Get process rank for unique directory
188-
proc_rank = os.environ.get('PROC_RANK', os.environ.get('CUDA_VISIBLE_DEVICES', '0'))
199+
model_name = self._args.model_identifier.replace('/', '_')
200+
201+
# Prepare output path - use proc_rank subdirectory to avoid race conditions
202+
# when multiple processes export the same model simultaneously
189203
output_dir = f'/tmp/tensorrt_onnx_rank_{proc_rank}'
190204
os.makedirs(output_dir, exist_ok=True)
191205

192206
onnx_path = exporter.export_huggingface_model(
193207
model=hf_model,
194-
model_name=self._args.model_identifier.replace('/', '_'),
208+
model_name=model_name,
195209
batch_size=self._args.batch_size,
196-
seq_length=getattr(self._args, 'seq_length', 512),
210+
seq_length=self._args.seq_length,
197211
output_dir=output_dir,
198212
)
199213

@@ -236,7 +250,7 @@ def _preprocess_huggingface_models(self):
236250
self.__bin_path,
237251
f'--onnx={onnx_path}',
238252
f'--optShapes={input_shapes}',
239-
f'--memPoolSize=workspace:8192M',
253+
'--memPoolSize=workspace:8192M',
240254
None if self._args.precision == 'fp32' else f'--{self._args.precision}',
241255
f'--iterations={self._args.iterations}',
242256
'--percentile=99',
@@ -246,7 +260,7 @@ def _preprocess_huggingface_models(self):
246260
# Store model name for result processing
247261
self._args.pytorch_models = [self._args.model_identifier.replace('/', '_')]
248262

249-
logger.info(f'Successfully prepared HuggingFace model for TensorRT inference')
263+
logger.info('Successfully prepared HuggingFace model for TensorRT inference')
250264
return True
251265

252266
except Exception as e:

0 commit comments

Comments
 (0)