microsoft
diff --git a/‎docs/source/how-to/configure-workflows/metrics-configuration.md‎
Lines changed: 25 additions & 0 deletions b/‎docs/source/how-to/configure-workflows/metrics-configuration.md‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎olive/data/component/pre_process_data.py‎
Lines changed: 88 additions & 0 deletions b/‎olive/data/component/pre_process_data.py‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎olive/data/container/huggingface_container.py‎
Lines changed: 3 additions & 0 deletions b/‎olive/data/container/huggingface_container.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎olive/evaluator/accuracy.py‎
Lines changed: 60 additions & 0 deletions b/‎olive/evaluator/accuracy.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎olive/evaluator/lmeval_ort.py‎
Lines changed: 73 additions & 1 deletion b/‎olive/evaluator/lmeval_ort.py‎
Lines changed: 73 additions & 1 deletion
diff --git a/‎olive/evaluator/metric.py‎
Lines changed: 9 additions & 1 deletion b/‎olive/evaluator/metric.py‎
Lines changed: 9 additions & 1 deletion
@@ -128,3 +128,28 @@ If you have multiple metrics to evaluate, you can configure them in the followin
 ```{Note}
 If you have more than one metric, you need to specify `priority: {RANK}`, which Olive will use to determine the best model.
 ```
+
+## Speech Evaluation Metrics (WER and RTFx)
+
+Olive supports Word Error Rate (WER) and Real-Time Factor (RTFx) as built-in accuracy sub-types for evaluating speech/ASR models.
+
+### Using WER with the accuracy metric type
+
+WER is available as an accuracy sub-type when your data pipeline returns text predictions and references. The example below reports WER together with RTFx:
+
+```json
+{
+    "name": "speech_accuracy",
+    "type": "accuracy",
+    "data_config": "speech_data_config",
+    "sub_types": [
+        {"name": "wer", "priority": 1, "higher_is_better": false},
+        {"name": "rtfx", "priority": 2, "higher_is_better": true}
+    ]
+}
+```
+
+```{Note}
+- `wer` (Word Error Rate): Measures transcription errors. Lower is better (defaults to `higher_is_better: false`).
+- `rtfx` (inverse Real-Time Factor): Ratio of audio duration to inference time, so a value above 1 means faster than real time. Higher is better (defaults to `higher_is_better: true`).
+```
@@ -291,3 +291,91 @@ def _tokenizer_and_align_labels(examples):
 
     tokenized_datasets = _huggingface_pre_process_helper(dataset, _tokenizer_and_align_labels, max_samples, **kwargs)
     return ClassificationDataset(tokenized_datasets, label_col="label", max_samples=max_samples)
+
+
+@Registry.register_pre_process()
+def speech_transcription_pre_process(
+    dataset,
+    audio_col: str = "audio",
+    text_col: str = "text",
+    sample_rate: int = 16000,
+    max_samples: Optional[int] = None,
+    limit: Optional[float] = None,
+    seed: int = 42,
+    **kwargs,
+):
+    """Pre-process data for speech transcription (ASR) evaluation.
+
+    Loads audio arrays and reference transcription text from a HuggingFace dataset.
+    Returns a dataset of (audio_array, reference_text) pairs suitable for WER evaluation.
+
+    Args:
+        dataset: HuggingFace dataset with audio and text columns.
+        audio_col: Name of the audio column. Defaults to "audio".
+        text_col: Name of the reference text column. Defaults to "text".
+        sample_rate: Target sample rate for audio. Defaults to 16000.
+        max_samples: Maximum number of samples (deprecated, use limit). Defaults to None.
+        limit: Sampling limit following Olive convention:
+            If >= 1: use first N samples.
+            If 0 < limit < 1: randomly sample that percentage.
+            If 0 or None: use all samples.
+        seed: Random seed for percentage-based sampling. Defaults to 42.
+        **kwargs: Additional arguments.
+
+    """
+    from datasets import Audio
+
+    dataset = dataset.cast_column(audio_col, Audio(sampling_rate=sample_rate))
+
+    # Apply sampling: prefer limit over max_samples
+    effective_limit = limit if limit is not None else (max_samples if max_samples else 0)
+    if effective_limit and effective_limit != 0:
+        from random import Random
+
+        total = len(dataset)
+        if 0 < effective_limit < 1:
+            n = max(1, int(total * effective_limit))
+            rng = Random(seed)
+            indices = sorted(rng.sample(range(total), min(n, total)))
+            dataset = dataset.select(indices)
+        elif effective_limit >= 1:
+            n = min(int(effective_limit), total)
+            dataset = dataset.select(range(n))
+
+    class SpeechTranscriptionDataset:
+        """Dataset that returns (audio_array, reference_text) pairs.
+
+        Note: Use batch_size=1 in dataloader config as audio samples have variable lengths.
+        """
+
+        def __init__(self, hf_dataset, audio_column, text_column):
+            self.dataset = hf_dataset
+            self.audio_column = audio_column
+            self.text_column = text_column
+
+        def __len__(self):
+            return len(self.dataset)
+
+        def __getitem__(self, idx):
+            item = self.dataset[idx]
+            import numpy as np
+
+            audio_array = np.array(item[self.audio_column]["array"], dtype=np.float32)
+            reference_text = item[self.text_column]
+            return audio_array, reference_text
+
+        @staticmethod
+        def collate_fn(batch):
+            """Collate variable-length audio batches. Use with batch_size=1 or pad audio."""
+            import numpy as np
+
+            # batch_size=1 is expected for speech evaluation (variable-length audio)
+            if len(batch) == 1:
+                audio, text = batch[0]
+                return (np.expand_dims(audio, 0), [text])
+            # For batch_size > 1, return as lists (no padding)
+            audios = [item[0] for item in batch]
+            texts = [item[1] for item in batch]
+            return (audios, texts)
+
+    return SpeechTranscriptionDataset(dataset, audio_col, text_col)
@@ -38,4 +38,7 @@ class HuggingfaceContainer(DataContainer):
             DataComponentType.PRE_PROCESS_DATA.value: "audio_classification_pre_process",
             DataComponentType.POST_PROCESS_DATA.value: "text_classification_post_process",
         },
+        "speech-transcription": {
+            DataComponentType.PRE_PROCESS_DATA.value: "speech_transcription_pre_process",
+        },
     }
@@ -26,6 +26,7 @@ class AccuracyBase(AutoConfigClass):
         "recall": torchmetrics.Recall,
         "auroc": torchmetrics.AUROC,
         "perplexity": torchmetrics.text.perplexity.Perplexity,
+        "wer": torchmetrics.text.WordErrorRate,
     }
 
     def __init__(self, config: Optional[Union[ConfigBase, dict[str, Any]]] = None) -> None:
@@ -157,3 +158,62 @@ def measure(self, model_output, target):
             perplexity.update(logits, targets)
         result = perplexity.compute()
         return result.item()
+
+
+class WordErrorRate(AccuracyBase):
+    """Word Error Rate metric for speech/ASR evaluation.
+
+    Expects model_output.preds to be a list of predicted transcription strings
+    and target to be a list of reference transcription strings.
+    """
+
+    name: Optional[str] = "wer"
+
+    @classmethod
+    def _default_config(cls) -> dict[str, ConfigParam]:
+        return {}
+
+    def measure(self, model_output, target):
+        preds = model_output.preds
+        refs = target
+        # Ensure inputs are lists of strings
+        if isinstance(preds, str):
+            preds = [preds]
+        elif not isinstance(preds, list):
+            preds = list(preds)
+        if isinstance(refs, str):
+            refs = [refs]
+        elif not isinstance(refs, list):
+            refs = list(refs)
+
+        wer = torchmetrics.text.WordErrorRate(**self.config_dict)
+        result = wer(preds, refs)
+        return result.item()
+
+
+class RealTimeFactor(AccuracyBase):
+    """Real-Time Factor (RTFx) metric for speech/ASR evaluation.
+
+    RTFx = total_audio_duration / total_inference_time.
+    A value > 1 means faster than real-time (e.g., RTFx=5 means 5x faster).
+    Timing metadata is provided via model_output.logits dict.
+    """
+
+    name: Optional[str] = "rtfx"
+
+    @classmethod
+    def _default_config(cls) -> dict[str, ConfigParam]:
+        return {}
+
+    def measure(self, model_output, target):
+        timing = model_output.logits
+        if not isinstance(timing, dict) or "total_audio_duration" not in timing:
+            raise ValueError(
+                "RTFx metric requires timing metadata from text-based inference path. "
+                "Ensure the metric is used with speech evaluation (WER + RTFx together)."
+            )
+        total_audio = timing["total_audio_duration"]
+        total_inference = timing["total_inference_time"]
+        if total_inference == 0:
+            return float("inf")
+        return round(total_audio / total_inference, 2)
@@ -509,7 +509,12 @@ def __init__(
                 self.max_length = max_length
             else:
                 self.max_length = genai_config["search"]["max_length"]
-            self._eot_token_id = genai_config["model"]["eos_token_id"]
+            eot = genai_config["model"]["eos_token_id"]
+            # eos_token_id can be a list (e.g. [1, 106] for Gemma4) or a scalar.
+            # Store all EOS IDs for generate_until stop detection,
+            # and first/scalar for loglikelihood (TemplateLM.eot_token_id expects int).
+            self._eos_token_ids = list(eot) if isinstance(eot, list) else [eot]
+            self._eot_token_id = self._eos_token_ids[0]
         self.params = og.GeneratorParams(self.model)
         self.params.set_search_options(max_length=self.max_length, past_present_share_buffer=False)
 
@@ -575,3 +580,70 @@ def model_call(self, input_ids: torch.Tensor, cont_len: int = 0) -> torch.Tensor
 
     def complete(self):
         pass
+
+    def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
+        """Generate text until a stop sequence is reached.
+
+        Used by benchmarks like MMLU Pro (CoT variant) that score by generating
+        chain-of-thought text and extracting the answer with a regex filter.
+        """
+        results = []
+        for request in tqdm(requests, disable=disable_tqdm, desc="Running generate_until requests"):
+            context = request.args[0]
+            gen_kwargs = request.args[1]
+
+            until = gen_kwargs.get("until", [])
+            max_gen_toks = gen_kwargs.get("max_gen_toks", 256)
+            if isinstance(until, str):
+                until = [until]
+
+            input_ids = self.tok_encode(context)
+            max_new_tokens = min(max_gen_toks, self.max_length - len(input_ids))
+            if max_new_tokens <= 0:
+                results.append("")
+                continue
+
+            params = og.GeneratorParams(self.model)
+            params.set_search_options(
+                max_length=len(input_ids) + max_new_tokens,
+                past_present_share_buffer=False,
+                batch_size=1,
+            )
+            if gen_kwargs.get("temperature", 0.0) == 0.0:
+                params.set_search_options(do_sample=False)
+            else:
+                params.set_search_options(
+                    do_sample=True,
+                    temperature=gen_kwargs["temperature"],
+                )
+
+            generator = og.Generator(self.model, params)
+            generator.append_tokens([input_ids])
+
+            eos_ids = self._eos_token_ids
+
+            generated_ids = []
+            # Decode periodically to check for stop sequences
+            decode_interval = 16
+            while not generator.is_done():
+                generator.generate_next_token()
+                token_id = generator.get_next_tokens()[0]
+                generated_ids.append(token_id)
+                if token_id in eos_ids:
+                    break
+                # Check stop sequences periodically by decoding
+                if until and len(generated_ids) % decode_interval == 0:
+                    partial_text = self.tokenizer.decode(generated_ids)
+                    if any(stop_seq in partial_text for stop_seq in until):
+                        break
+
+            generated_text = self.tokenizer.decode(generated_ids)
+
+            # Truncate at the first stop sequence
+            for stop_seq in until:
+                idx = generated_text.find(stop_seq)
+                if idx != -1:
+                    generated_text = generated_text[:idx]
+
+            results.append(generated_text)
+        return results
@@ -38,6 +38,8 @@ class AccuracySubType(StrEnumBase):
     RECALL = "recall"
     AUROC = "auroc"
     PERPLEXITY = "perplexity"
+    WER = "wer"
+    RTFX = "rtfx"
 
 
 class LatencySubType(StrEnumBase):
@@ -206,7 +208,13 @@ def validate_sub_types(cls, v, info):
             # metric_config
             metric_config_cls = None
             if info.data["type"] == MetricType.ACCURACY:
-                item["higher_is_better"] = item.get("higher_is_better", True)
+                # Error rate metrics (WER) default to higher_is_better=False
+                _error_rate_metrics = {"wer"}
+                item_name = item["name"] if isinstance(item["name"], str) else item["name"].value
+                if item_name in _error_rate_metrics:
+                    item["higher_is_better"] = item.get("higher_is_better", False)
+                else:
+                    item["higher_is_better"] = item.get("higher_is_better", True)
                 if info.data["backend"] == "torch_metrics":
                     metric_config_cls = AccuracyBase.registry[item["name"]].get_config_class()
                 elif info.data["backend"] == "huggingface_metrics":
Original file line number	Diff line number	Diff line change
`@@ -38,4 +38,7 @@ class HuggingfaceContainer(DataContainer):`
`38`	`38`	`DataComponentType.PRE_PROCESS_DATA.value: "audio_classification_pre_process",`
`39`	`39`	`DataComponentType.POST_PROCESS_DATA.value: "text_classification_post_process",`
`40`	`40`	`},`
	`41`	`+ "speech-transcription": {`
	`42`	`+ DataComponentType.PRE_PROCESS_DATA.value: "speech_transcription_pre_process",`
	`43`	`+ },`
`41`	`44`	`}`