Refine recipe telemetry metadata

bmehta001 · Copilot · bmehta001 · commit 4a91dda21598 · 2026-05-22T09:57:42.000-05:00
Avoid treating the full workflow config as a telemetry override and keep recipe metadata deterministic and privacy-preserving across Path and model source inputs.

Files changed:
- olive/cli/run.py
- olive/telemetry/recipe_telemetry.py
- olive/workflows/run/run.py
- test/cli/test_cli.py
- test/workflows/test_workflow_run.py

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/olive/cli/run.py b/olive/cli/run.py
@@ -49,19 +49,22 @@ def register_subcommand(parser: ArgumentParser):
 
     @action
     def run(self):
+        from copy import deepcopy
         from pathlib import Path
 
         from olive.common.config_utils import load_config_file
         from olive.workflows import run as olive_run
 
         # allow the run_config to be a dict already (for api use)
         run_config_input = self.args.run_config
-        run_config = run_config_input
-        if not isinstance(run_config, dict):
-            run_config = load_config_file(run_config)
+        run_config = (
+            deepcopy(run_config_input) if isinstance(run_config_input, dict) else load_config_file(run_config_input)
+        )
+        config_overrides = {}
         if input_model_config := get_input_model_config(self.args, required=False):
             print("Replacing input model config in run config")
             run_config["input_model"] = input_model_config
+            config_overrides["input_model"] = input_model_config
 
         for arg_key, rc_key in [("output_path", "output_dir"), ("log_level", "log_severity_level")]:
             if (arg_value := getattr(self.args, arg_key)) is not None:
@@ -70,21 +73,26 @@ def run(self):
                 run_config.get("engine", {}).pop(rc_key, None)
                 # add value to run config directly
                 run_config[rc_key] = arg_value
+                config_overrides[rc_key] = arg_value
+
+        recipe_telemetry_metadata = {
+            "recipe_command": "WorkflowRun",
+            "recipe_source": "config_dict" if isinstance(run_config_input, dict) else "config_file",
+            "recipe_format": "dict"
+            if isinstance(run_config_input, dict)
+            else Path(run_config_input).suffix.lstrip(".").lower() or "unknown",
+            "execution_mode": "list_required_packages" if self.args.list_required_packages else "run",
+            "package_config_provided": bool(self.args.package_config),
+        }
+        if config_overrides:
+            recipe_telemetry_metadata["config_overrides"] = config_overrides
 
         workflow_output = olive_run(
             run_config,
             list_required_packages=self.args.list_required_packages,
             tempdir=self.args.tempdir,
             package_config=self.args.package_config,
-            recipe_telemetry_metadata={
-                "recipe_command": "WorkflowRun",
-                "recipe_source": "config_dict" if isinstance(run_config_input, dict) else "config_file",
-                "recipe_format": "dict"
-                if isinstance(run_config_input, dict)
-                else Path(run_config_input).suffix.lstrip(".").lower() or "unknown",
-                "execution_mode": "list_required_packages" if self.args.list_required_packages else "run",
-                "package_config_provided": bool(self.args.package_config),
-            },
+            recipe_telemetry_metadata=recipe_telemetry_metadata,
         )
 
         if self.args.list_required_packages is True:
diff --git a/olive/telemetry/recipe_telemetry.py b/olive/telemetry/recipe_telemetry.py
@@ -4,6 +4,7 @@
 # --------------------------------------------------------------------------
 import functools
 import json
+import re
 from copy import deepcopy
 from os import PathLike
 from pathlib import Path, PurePosixPath, PureWindowsPath
@@ -14,10 +15,9 @@
 from olive.package_config import OlivePackageConfig
 from olive.systems.common import SystemType
 from olive.telemetry.telemetry import is_ci_environment
-from olive.workflows.run.config import RunConfig
 
 if TYPE_CHECKING:
-    from olive.engine.config import RunPassConfig
+    from olive.workflows.run.config import RunConfig
 
 RECIPE_HASH_REDACTED_VALUE = "<resource>"
 CONFIG_REFERENCE_REDACTED_VALUE = "<reference>"
@@ -45,14 +45,18 @@
     "adapter_path",
     "user_script",
 }
+HF_MODEL_IDENTIFIER_KEYS = {"model_path", "_name_or_path"}
 CONFIG_REFERENCE_KEYS = {"host", "target", "evaluator"}
+LOCAL_MODEL_FILE_SUFFIXES = {".bin", ".model", ".onnx", ".pb", ".pt", ".pth", ".safetensors", ".tflite"}
+HF_CACHE_MODEL_PATTERN = re.compile(r"(?:^|[\\/])models--([^\\/]+)--([^\\/]+)(?:[\\/]|$)")
+HF_REPO_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*(/[A-Za-z0-9][A-Za-z0-9._-]*)?$")
 _NO_OVERRIDE = object()
 
 
 def _build_recipe_result_metadata(
     run_config_input: Union[str, Path, dict],
     run_config_telemetry_input: Optional[Any],
-    run_config: Optional[RunConfig],
+    run_config: Optional["RunConfig"],
     recipe_telemetry_metadata: Optional[dict[str, Any]],
     *,
     list_required_packages: bool,
@@ -65,9 +69,17 @@ def _build_recipe_result_metadata(
     metadata.setdefault("recipe_format", default_format)
     metadata.setdefault("execution_mode", "list_required_packages" if list_required_packages else "run")
     metadata.setdefault("package_config_provided", package_config_provided)
-    metadata.setdefault("config_overrides", _build_config_overrides(run_config_telemetry_input))
+    config_overrides = metadata.pop("config_overrides", _NO_OVERRIDE)
+    if config_overrides is _NO_OVERRIDE:
+        config_overrides = _build_config_overrides(run_config_telemetry_input)
+    elif not isinstance(config_overrides, str):
+        config_overrides = _build_config_overrides(config_overrides)
+    if config_overrides is not None:
+        metadata["config_overrides"] = config_overrides
     if package_config_provided:
-        metadata.setdefault("package_config_overrides", _build_package_config_overrides(package_config_input))
+        package_config_overrides = _build_package_config_overrides(package_config_input)
+        if package_config_overrides is not None:
+            metadata.setdefault("package_config_overrides", package_config_overrides)
     metadata["is_ci"] = is_ci_environment()
 
     if run_config is None:
@@ -78,7 +90,7 @@ def _build_recipe_result_metadata(
     model_metadata = _extract_input_model_metadata(run_config_json["input_model"])
     target_metadata = _extract_target_metadata(run_config)
     host_metadata = _extract_host_metadata(run_config)
-    pass_types = [pass_config.type for pass_config in _get_used_passes_configs(run_config)]
+    pass_types = _get_used_pass_types(run_config)
 
     metadata.setdefault("recipe_name", metadata.get("recipe_command") or run_config.workflow_id)
     metadata.setdefault("workflow_id", run_config.workflow_id)
@@ -208,34 +220,44 @@ def _load_config_input_for_telemetry(config_input: Any) -> Optional[Any]:
     return None
 
 
-def _sanitize_config_snapshot(value: Any, key: Optional[str] = None) -> Any:
+def _sanitize_config_snapshot(value: Any, key: Optional[str] = None, model_type: Optional[str] = None) -> Any:
+    if key in HF_MODEL_IDENTIFIER_KEYS:
+        if str(model_type).lower() == "hfmodel":
+            hf_model_id = _extract_huggingface_model_id(value)
+            if hf_model_id:
+                return hf_model_id
+        return RECIPE_HASH_REDACTED_VALUE
     if key in CONFIG_SNAPSHOT_REDACTED_KEYS or _is_path_like_key(key):
         return RECIPE_HASH_REDACTED_VALUE
     if key in CONFIG_REFERENCE_KEYS and isinstance(value, str):
         return CONFIG_REFERENCE_REDACTED_VALUE
 
     if isinstance(value, dict):
+        child_model_type = _get_model_type(value) or model_type
         if key == "systems":
-            return [_sanitize_config_snapshot(system, "system") for system in value.values()]
+            return [_sanitize_config_snapshot(system, "system", child_model_type) for system in value.values()]
         if key == "passes":
             passes = []
             for pass_configs in value.values():
                 if isinstance(pass_configs, list):
                     passes.extend(pass_configs)
                 else:
                     passes.append(pass_configs)
-            return [_sanitize_config_snapshot(pass_config, "pass") for pass_config in passes]
+            return [_sanitize_config_snapshot(pass_config, "pass", child_model_type) for pass_config in passes]
         if key == "evaluators":
-            return [_sanitize_config_snapshot(evaluator, "evaluator_config") for evaluator in value.values()]
+            return [
+                _sanitize_config_snapshot(evaluator, "evaluator_config", child_model_type)
+                for evaluator in value.values()
+            ]
         return {
-            child_key: _sanitize_config_snapshot(child_value, child_key)
+            child_key: _sanitize_config_snapshot(child_value, child_key, child_model_type)
             for child_key, child_value in value.items()
             if child_value is not None
         }
     if isinstance(value, list):
-        return [_sanitize_config_snapshot(item, key) for item in value]
+        return [_sanitize_config_snapshot(item, key, model_type) for item in value]
     if isinstance(value, tuple):
-        return [_sanitize_config_snapshot(item, key) for item in value]
+        return [_sanitize_config_snapshot(item, key, model_type) for item in value]
     if isinstance(value, Path):
         return RECIPE_HASH_REDACTED_VALUE
     if callable(value):
@@ -255,6 +277,35 @@ def _is_path_like_key(key: Optional[str]) -> bool:
     )
 
 
+def _get_model_type(config: dict[str, Any]) -> Optional[str]:
+    model_type = config.get("type")
+    return str(model_type).lower() if model_type is not None else None
+
+
+def _extract_huggingface_model_id(model_identifier: Any) -> Optional[str]:
+    if not isinstance(model_identifier, str):
+        return None
+
+    identifier = model_identifier.strip()
+    if not identifier:
+        return None
+
+    if identifier.startswith("https://huggingface.co/"):
+        parts = identifier.removeprefix("https://huggingface.co/").strip("/").split("/")
+        if len(parts) >= 2:
+            return f"{parts[0]}/{parts[1]}"
+        if parts and parts[0]:
+            return parts[0]
+
+    if match := HF_CACHE_MODEL_PATTERN.search(identifier):
+        return f"{match.group(1)}/{match.group(2)}"
+
+    if HF_REPO_ID_PATTERN.match(identifier) and not _has_local_model_file_suffix(identifier):
+        return identifier
+
+    return None
+
+
 def _extract_input_model_metadata(input_model_config: dict[str, Any]) -> dict[str, Optional[str]]:
     model_config = input_model_config.get("config", {})
     model_attributes = model_config.get("model_attributes", {})
@@ -290,19 +341,26 @@ def _classify_input_model_source(model_identifier: Any) -> str:
 
 
 def _is_explicit_local_model_path(identifier: str) -> bool:
+    if _has_local_model_file_suffix(identifier):
+        return True
     return (
         identifier.startswith(("./", "../", ".\\", "..\\", "~/", "~\\", "/", "\\\\"))
         or PureWindowsPath(identifier).is_absolute()
         or PurePosixPath(identifier).is_absolute()
     )
 
 
-def _extract_target_metadata(run_config: RunConfig) -> dict[str, Optional[str]]:
+def _has_local_model_file_suffix(identifier: str) -> bool:
+    suffix = PureWindowsPath(identifier).suffix or PurePosixPath(identifier).suffix
+    return suffix.lower() in LOCAL_MODEL_FILE_SUFFIXES
+
+
+def _extract_target_metadata(run_config: "RunConfig") -> dict[str, Optional[str]]:
     target_system = run_config.engine.target
     return _extract_system_metadata(target_system, "target")
 
 
-def _extract_host_metadata(run_config: RunConfig) -> dict[str, Optional[str]]:
+def _extract_host_metadata(run_config: "RunConfig") -> dict[str, Optional[str]]:
     host_system = run_config.engine.host
     if host_system is None:
         return {
@@ -340,9 +398,9 @@ def _set_metadata_if_present(metadata: dict[str, Any], values: dict[str, Optiona
             metadata.setdefault(key, value)
 
 
-def _get_used_passes_configs(run_config: RunConfig) -> list["RunPassConfig"]:
+def _get_used_pass_types(run_config: "RunConfig") -> list[str]:
     return (
-        [pass_config for _, pass_configs in run_config.passes.items() for pass_config in pass_configs]
+        [pass_config.type for _, pass_configs in run_config.passes.items() for pass_config in pass_configs]
         if run_config.passes
         else []
     )
@@ -363,4 +421,12 @@ def _redact_recipe_hash_keys(value: Any, key: Optional[str] = None) -> Any:
     elif isinstance(value, list):
         for index, item in enumerate(value):
             value[index] = _redact_recipe_hash_keys(item, key)
+    elif isinstance(value, tuple):
+        return [_redact_recipe_hash_keys(item, key) for item in value]
+    elif isinstance(value, Path):
+        return RECIPE_HASH_REDACTED_VALUE
+    elif callable(value):
+        return CONFIG_CALLABLE_REDACTED_VALUE
+    elif hasattr(value, "value") and isinstance(value.value, (str, int, float, bool)):
+        return value.value
     return value
diff --git a/olive/workflows/run/run.py b/olive/workflows/run/run.py
@@ -160,11 +160,6 @@ def run(
     # set tempdir
     set_tempdir(tempdir)
 
-    try:
-        run_config_telemetry_input = _load_config_input_for_telemetry(run_config)
-    except Exception:
-        run_config_telemetry_input = None
-
     package_config_input = package_config
     try:
         package_config_telemetry_input = (
@@ -215,7 +210,7 @@ def run(
         if emit_recipe_telemetry:
             metadata = _build_recipe_result_metadata(
                 run_config,
-                run_config_telemetry_input,
+                None,
                 parsed_run_config,
                 recipe_telemetry_metadata,
                 list_required_packages=list_required_packages,
diff --git a/test/cli/test_cli.py b/test/cli/test_cli.py
@@ -163,6 +163,15 @@ def test_workflow_run_command_with_overrides(mock_run, tmp_path):
             "recipe_format": "json",
             "execution_mode": "run",
             "package_config_provided": False,
+            "config_overrides": {
+                "input_model": {
+                    "type": "HfModel",
+                    "model_path": "hf-internal-testing/tiny-random-LlamaForCausalLM",
+                    "load_kwargs": {"attn_implementation": "eager", "trust_remote_code": False},
+                },
+                "output_dir": str(Path("new_output_path").resolve()),
+                "log_severity_level": 2,
+            },
         },
     )
 
diff --git a/test/workflows/test_workflow_run.py b/test/workflows/test_workflow_run.py
@@ -195,12 +195,41 @@ def test_run_logs_recipe_result_success(_, mock_run_engine, mock_log_recipe_resu
     assert metadata["is_ci"] is False
     assert metadata["recipe_hash"]
     assert "input_model_name_hash" not in metadata
+    assert "config_overrides" not in metadata
 
+
+@patch("olive.workflows.run.run.log_recipe_result")
+@patch("olive.workflows.run.run.run_engine")
+def test_run_logs_config_overrides_when_recipe_metadata_provides_overrides(mock_run_engine, mock_log_recipe_result):
+    config = {
+        "input_model": {
+            "type": "HfModel",
+            "model_path": "Qwen/Qwen2.5-0.5B-Instruct",
+            "task": "text-generation",
+        }
+    }
+    mock_run_engine.return_value = object()
+
+    olive_run(
+        config,
+        recipe_telemetry_metadata={
+            "recipe_name": "WorkflowRun",
+            "config_overrides": {
+                "input_model": {
+                    "type": "HfModel",
+                    "model_path": "Qwen/Qwen2.5-0.5B-Instruct",
+                },
+                "engine": {"target": "local_system"},
+                "data_path": Path("data"),
+            },
+        },
+    )
+
+    metadata = mock_log_recipe_result.call_args.kwargs["metadata"]
     config_overrides = json.loads(metadata["config_overrides"])
-    assert config_overrides["input_model"]["model_path"] == "<resource>"
+    assert config_overrides["input_model"]["model_path"] == "Qwen/Qwen2.5-0.5B-Instruct"
     assert config_overrides["engine"]["target"] == "<reference>"
-    assert config_overrides["systems"][0]["type"] == "LocalSystem"
-    assert config_overrides["systems"][0]["accelerators"][0]["execution_providers"] == ["CUDAExecutionProvider"]
+    assert config_overrides["data_path"] == "<resource>"
 
 
 @patch("olive.workflows.run.run.log_error")
@@ -389,6 +418,7 @@ def test_classify_input_model_source_does_not_depend_on_local_filesystem(tmp_pat
 
     assert _classify_input_model_source("bert-base-uncased") == "string_name"
     assert _classify_input_model_source("./model.onnx") == "local_file"
+    assert _classify_input_model_source("model.onnx") == "local_file"
 
 
 def test_recipe_hash_does_not_depend_on_local_model_path_presence(tmp_path, monkeypatch):
@@ -402,3 +432,12 @@ def test_recipe_hash_does_not_depend_on_local_model_path_presence(tmp_path, monk
     (tmp_path / "bert-base-uncased").mkdir()
 
     assert _build_recipe_hash(config) == recipe_hash
+
+
+def test_recipe_hash_handles_path_values():
+    config = {
+        "input_model": {"type": "HfModel", "config": {"model_path": Path("model")}},
+        "custom_value": Path("custom"),
+    }
+
+    assert _build_recipe_hash(config)