microsoft
diff --git a/examples/benchmarks/ort_inference_performance.py b/examples/benchmarks/ort_inference_performance.py
Lines changed: 8 additions & 4 deletions
diff --git a/examples/benchmarks/pytorch_huggingface_models.py b/examples/benchmarks/pytorch_huggingface_models.py
Lines changed: 5 additions & 6 deletions
diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py
Lines changed: 12 additions & 5 deletions
diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
Lines changed: 4 additions & 5 deletions
diff --git a/superbench/benchmarks/micro_benchmarks/model_source_config.py b/superbench/benchmarks/micro_benchmarks/model_source_config.py
Lines changed: 6 additions & 6 deletions
diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
Lines changed: 12 additions & 10 deletions
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
Lines changed: 29 additions & 15 deletions
@@ -9,15 +9,16 @@
     python3 examples/benchmarks/ort_inference_performance.py --model_source in-house
 
   HuggingFace models:
-    python3 examples/benchmarks/ort_inference_performance.py --model_source huggingface --model_identifier bert-base-uncased
-    python3 examples/benchmarks/ort_inference_performance.py --model_source huggingface --model_identifier microsoft/resnet-50
+    python3 examples/benchmarks/ort_inference_performance.py \
+      --model_source huggingface --model_identifier bert-base-uncased
+    python3 examples/benchmarks/ort_inference_performance.py \
+      --model_source huggingface --model_identifier microsoft/resnet-50
 
 Environment variables:
   HF_TOKEN: HuggingFace token for gated models (optional)
 """
 
 import argparse
-import os
 
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.common.utils import logger
@@ -76,7 +77,10 @@ def run_huggingface_benchmark(model_identifier, precision='float16', batch_size=
         '--model_source', type=str, default='in-house', choices=['in-house', 'huggingface'],
         help='Source of the model: in-house (default) or huggingface'
     )
-    parser.add_argument('--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier')
+    parser.add_argument(
+        '--model_identifier', type=str, default='bert-base-uncased',
+        help='HuggingFace model identifier'
+    )
     parser.add_argument('--precision', type=str, default='float16', choices=['float32', 'float16', 'int8'])
     parser.add_argument('--batch_size', type=int, default=32)
     parser.add_argument('--seq_length', type=int, default=512)
 
@@ -17,7 +17,6 @@
 """
 
 import argparse
-import os
 
 from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
 from superbench.common.utils import logger
@@ -57,11 +56,11 @@ def run_huggingface_benchmark(model_key, distributed=False, precision='float32',
     # Build parameters with HuggingFace model source
     parameters = (
         f"{model_config['parameters']} "
-        f"--duration {duration} "
-        f"--precision {precision} "
-        f"--run_count 2 "
-        f"--model_source huggingface "
-        f"--model_identifier {hf_identifier}"
+        f'--duration {duration} '
+        f'--precision {precision} '
+        f'--run_count 2 '
+        f'--model_source huggingface '
+        f'--model_identifier {hf_identifier}'
     )
 
     if distributed:
 
@@ -9,15 +9,16 @@
     python3 examples/benchmarks/tensorrt_inference_performance.py --model_source in-house
 
   HuggingFace models:
-    python3 examples/benchmarks/tensorrt_inference_performance.py --model_source huggingface --model_identifier bert-base-uncased
-    python3 examples/benchmarks/tensorrt_inference_performance.py --model_source huggingface --model_identifier microsoft/resnet-50
+    python3 examples/benchmarks/tensorrt_inference_performance.py \
+      --model_source huggingface --model_identifier bert-base-uncased
+    python3 examples/benchmarks/tensorrt_inference_performance.py \
+      --model_source huggingface --model_identifier microsoft/resnet-50
 
 Environment variables:
   HF_TOKEN: HuggingFace token for gated models (optional)
 """
 
 import argparse
-import os
 
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.common.utils import logger
@@ -76,14 +77,20 @@ def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32,
         '--model_source', type=str, default='in-house', choices=['in-house', 'huggingface'],
         help='Source of the model: in-house (default) or huggingface'
     )
-    parser.add_argument('--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier')
+    parser.add_argument(
+        '--model_identifier', type=str, default='bert-base-uncased',
+        help='HuggingFace model identifier'
+    )
     parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'int8'])
     parser.add_argument('--batch_size', type=int, default=32)
     parser.add_argument('--seq_length', type=int, default=512)
     parser.add_argument('--iterations', type=int, default=2048)
     args = parser.parse_args()
 
     if args.model_source == 'huggingface':
-        run_huggingface_benchmark(args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations)
+        run_huggingface_benchmark(
+            args.model_identifier, args.precision, args.batch_size,
+            args.seq_length, args.iterations
+        )
     else:
         run_inhouse_benchmark()
@@ -4,9 +4,8 @@
 """Hugging Face model loader for benchmarking."""
 
 import os
-import warnings
 from pathlib import Path
-from typing import Optional, Tuple, Dict, Any, List
+from typing import Optional, Tuple
 
 import torch
 from transformers import (
@@ -173,7 +172,7 @@ def load_model(
             if 'not found' in str(e).lower() or '404' in str(e):
                 raise ModelNotFoundError(
                     f"Model '{model_identifier}' not found on Hugging Face Hub. "
-                    f"Please check the model ID at https://huggingface.co/models"
+                    f'Please check the model ID at https://huggingface.co/models'
                 ) from e
             raise ModelLoadError(f"Failed to load model '{model_identifier}': {e}") from e
         except Exception as e:
@@ -206,7 +205,7 @@ def load_model_from_config(
         # Validate config
         is_valid, error = config.validate()
         if not is_valid:
-            raise ValueError(f"Invalid configuration: {error}")
+            raise ValueError(f'Invalid configuration: {error}')
 
         if device is None:
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -247,7 +246,7 @@ def _get_torch_dtype(self, dtype_str: str) -> torch.dtype:
         if dtype_str.lower() not in dtype_map:
             raise ValueError(
                 f"Invalid dtype '{dtype_str}'. "
-                f"Must be one of {list(dtype_map.keys())}"
+                f'Must be one of {list(dtype_map.keys())}'
             )
 
         return dtype_map[dtype_str.lower()]
 
@@ -4,7 +4,7 @@
 """Configuration classes for model source and loading."""
 
 from dataclasses import dataclass, field
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Tuple
 
 
 @dataclass
@@ -55,14 +55,14 @@ def __post_init__(self):
         if self.torch_dtype not in valid_dtypes:
             raise ValueError(
                 f"Invalid torch_dtype '{self.torch_dtype}'. "
-                f"Must be one of {valid_dtypes}."
+                f'Must be one of {valid_dtypes}.'
             )
 
         # Validate identifier is provided
         if not self.identifier:
-            raise ValueError("Model identifier must be provided.")
+            raise ValueError('Model identifier must be provided.')
 
-    def validate(self) -> tuple[bool, str]:
+    def validate(self) -> Tuple[bool, str]:
         """Validate configuration parameters.
 
         Returns:
@@ -74,7 +74,7 @@ def validate(self) -> tuple[bool, str]:
             if not self.identifier or not self.identifier.strip():
                 return (
                     False,
-                    "HuggingFace model identifier cannot be empty"
+                    'HuggingFace model identifier cannot be empty'
                 )
 
         return (True, '')
@@ -94,5 +94,5 @@ def __repr__(self) -> str:
             f"ModelSourceConfig(source='{self.source}', "
             f"identifier='{self.identifier}', "
             f"torch_dtype='{self.torch_dtype}', "
-            f"hf_token={token_status})"
+            f'hf_token={token_status})'
         )
@@ -175,7 +175,6 @@ def _preprocess_huggingface_models(self):
         Returns:
             bool: True if preprocessing succeeds.
         """
-        import torch
         import os
 
         if not self._args.model_identifier:
@@ -192,7 +191,7 @@ def _preprocess_huggingface_models(self):
             if hf_token:
                 load_kwargs['token'] = hf_token
             hf_config = AutoConfig.from_pretrained(
-                self._args.model_identifier, **load_kwargs
+                self._args.model_identifier, trust_remote_code=True, **load_kwargs
             )
 
             precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32'
@@ -232,8 +231,13 @@ def _preprocess_huggingface_models(self):
             proc_output_path = self.__model_cache_path / f'rank_{proc_rank}'
             proc_output_path.mkdir(parents=True, exist_ok=True)
 
-            # Include precision in model name to match expected filename format
-            model_name_with_precision = f'{model_name}.{self._args.precision.value}'
+            # For INT8, export as float32 first then quantize (matching in-house model behavior).
+            # For other precisions, include precision in the model name directly.
+            if self._args.precision == Precision.INT8:
+                export_precision = Precision.FLOAT32.value
+            else:
+                export_precision = self._args.precision.value
+            model_name_with_precision = f'{model_name}.{export_precision}'
 
             # Export directly to final destination to avoid path issues with external data
             onnx_path = exporter.export_huggingface_model(
@@ -251,15 +255,15 @@ def _preprocess_huggingface_models(self):
             # Apply INT8 quantization if requested (matching in-house model behavior)
             if self._args.precision == Precision.INT8:
                 from onnxruntime.quantization import quantize_dynamic
-                quantized_path = str(proc_output_path / f'{model_name}.{self._args.precision.value}.onnx')
+                quantized_path = str(proc_output_path / f'{model_name}.{Precision.INT8.value}.onnx')
                 quantize_dynamic(onnx_path, quantized_path)
-                logger.info(f'Applied INT8 quantization to HuggingFace model')
+                logger.info('Applied INT8 quantization to HuggingFace model')
 
             # Update model list and cache path for benchmarking
             self._args.pytorch_models = [model_name]
             self.__model_cache_path = proc_output_path
 
-            logger.info(f'Successfully prepared HuggingFace model for ORT inference')
+            logger.info('Successfully prepared HuggingFace model for ORT inference')
             return True
 
         except Exception as e:
@@ -280,15 +284,13 @@ def _benchmark(self):
                 f'CUDAExecutionProvider is not available (available: {available}).'
             )
             return False
-        providers = ['CUDAExecutionProvider']
-        logger.info(f'ORT Inference - using providers: {providers}')
 
         for model in self._args.pytorch_models:
             sess_options = ort.SessionOptions()
             sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level]
             file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision)
             ort_sess = ort.InferenceSession(
-                f'{self.__model_cache_path / file_name}', sess_options, providers=providers
+                f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider']
             )
 
             elapse_times = self.__inference(ort_sess)
 
@@ -128,7 +128,7 @@ def _preprocess(self):
                 f'--onnx={onnx_model}',
                 # build options
                 f'--optShapes=input:{input_shape}',
-                f'--memPoolSize=workspace:8192M',
+                '--memPoolSize=workspace:8192M',
                 None if self._args.precision == 'fp32' else f'--{self._args.precision}',
                 # inference options
                 f'--iterations={self._args.iterations}',
@@ -145,9 +145,7 @@ def _preprocess_huggingface_models(self):
         Returns:
             bool: True if preprocessing succeeds.
         """
-        import torch
         import os
-        import time
         from transformers import AutoConfig
 
         if not self._args.model_identifier:
@@ -163,7 +161,7 @@ def _preprocess_huggingface_models(self):
                 load_kwargs['token'] = hf_token
 
             hf_config = AutoConfig.from_pretrained(
-                self._args.model_identifier, **load_kwargs
+                self._args.model_identifier, trust_remote_code=True, **load_kwargs
             )
             precision_str = self._args.precision    # already a string: 'fp16', 'fp32', 'int8'
             fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
@@ -173,27 +171,43 @@ def _preprocess_huggingface_models(self):
                 return False
 
             # Step 2: Download and load the full model
-            logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
 
-            loader = HuggingFaceModelLoader()
-            hf_model, hf_config, tokenizer = loader.load_model(
-                model_identifier=self._args.model_identifier,
+            # Pick a per-process tag (PROC_RANK, else CUDA_VISIBLE_DEVICES, else '0') so
+            # concurrent processes exporting the same model write to distinct paths
+            gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0')
+            proc_rank = os.getenv('PROC_RANK', gpu_rank)
+
+            # Create model source config - load on CPU to avoid accelerate dispatching
+            # model across multiple GPUs which causes device mismatch during ONNX export.
+            # TensorRT handles precision internally via --fp16/--int8 flags,
+            # so the ONNX model is always exported in float32.
+            model_config = ModelSourceConfig(
+                source='huggingface',
+                identifier=self._args.model_identifier,
+                hf_token=hf_token,
                 torch_dtype='float32',
-                device='cpu',
                 device_map=None,
             )
+
+            logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
+
+            # Load model from HuggingFace on CPU
+            loader = HuggingFaceModelLoader()
+            hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu')
             exporter = torch2onnxExporter()
 
-            # Get process rank for unique directory
-            proc_rank = os.environ.get('PROC_RANK', os.environ.get('CUDA_VISIBLE_DEVICES', '0'))
+            model_name = self._args.model_identifier.replace('/', '_')
+
+            # Prepare output path - use proc_rank subdirectory to avoid race conditions
+            # when multiple processes export the same model simultaneously
             output_dir = f'/tmp/tensorrt_onnx_rank_{proc_rank}'
             os.makedirs(output_dir, exist_ok=True)
 
             onnx_path = exporter.export_huggingface_model(
                 model=hf_model,
-                model_name=self._args.model_identifier.replace('/', '_'),
+                model_name=model_name,
                 batch_size=self._args.batch_size,
-                seq_length=getattr(self._args, 'seq_length', 512),
+                seq_length=self._args.seq_length,
                 output_dir=output_dir,
             )
 
@@ -236,7 +250,7 @@ def _preprocess_huggingface_models(self):
                 self.__bin_path,
                 f'--onnx={onnx_path}',
                 f'--optShapes={input_shapes}',
-                f'--memPoolSize=workspace:8192M',
+                '--memPoolSize=workspace:8192M',
                 None if self._args.precision == 'fp32' else f'--{self._args.precision}',
                 f'--iterations={self._args.iterations}',
                 '--percentile=99',
@@ -246,7 +260,7 @@ def _preprocess_huggingface_models(self):
             # Store model name for result processing
             self._args.pytorch_models = [self._args.model_identifier.replace('/', '_')]
 
-            logger.info(f'Successfully prepared HuggingFace model for TensorRT inference')
+            logger.info('Successfully prepared HuggingFace model for TensorRT inference')
             return True
 
         except Exception as e: