From 8cc37df2ede749db32fee5679d8066d5ff2c7354 Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Tue, 23 Jun 2026 14:06:16 +0800
Subject: [PATCH] fix(perf): resolve concrete EP for the analyzer instead of
 ep=None (#931)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In `winml perf` without `--ep`, the EP was never resolved before the build,
so the static analyzer ran with `ep=None` and aggregated across all EPs (and
logged "analyze_onnx called with ep=None — results will aggregate all EPs"),
even though inference runs on a single device's EP.

Resolve a concrete device + EP from the request and pass it down to the build:

- PerfBenchmark resolves device + EP internally (_resolve_device_ep), at the
  start of _load_model so an unavailable/invalid combo fails fast before the
  export/optimize/quantize/compile pipeline runs. BenchmarkConfig keeps only the
  raw request; the resolved values live on the instance and drive from_pretrained
  / from_onnx.
- _perf_modules derives a concrete EP from the resolved device when none is given
  (explicit EPs are kept verbatim; downstream stages normalize aliases).
- The CLI no longer pre-resolves: it just builds the config / dispatches.

Verified end-to-end on a CPU build: analyzer target goes from "None on cpu" to
"OpenVINOExecutionProvider on cpu" and the ep=None warning no longer fires.

Follow-up #939 tracks folding _perf_modules into PerfBenchmark to unify the two
resolution sites.
---
 src/winml/modelkit/commands/perf.py     |  68 ++++++++++++--
 tests/unit/commands/test_perf_cli.py    | 117 +++++++++++++++++++++++-
 tests/unit/commands/test_perf_module.py |  23 +++++
 3 files changed, 198 insertions(+), 10 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 2f63e5652..95fc3b060 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -303,6 +303,46 @@ def __init__(self, config: BenchmarkConfig) -> None:
         self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
         self._memory: dict[str, float] | None = None
+        # Concrete device + EP resolved from the config's request, populated by
+        # _resolve_device_ep() on the first call (before the build). The config
+        # keeps the raw request (e.g. "auto"); these hold what actually drives
+        # the build and inference.
+        self._resolved_device: str | None = None
+        self._resolved_ep: EPNameOrAlias | None = None
+
+    def _resolve_device_ep(self) -> None:
+        """Resolve the concrete target device + EP, failing fast on a bad combo.
+
+        Idempotent: resolves once, then returns cached values. Called at the
+        start of model loading so an unavailable/invalid device+EP raises here —
+        before the export/optimize/quantize/compile pipeline runs — instead of
+        only surfacing at session.compile(). Deriving a concrete EP also lets the
+        build's static analyzer target one EP instead of aggregating across all
+        of them (WinMLAutoModel itself stays permissive: ep=None is a valid
+        library mode).
+
+        Raises:
+            ValueError: If the requested device/EP combination is unavailable
+                or invalid (propagated from ``resolve_device``).
+        """
+        if self._resolved_device is not None:
+            return
+
+        from ..sysinfo import resolve_device, resolve_eps
+
+        # resolve_device() availability-checks even when --ep is explicit, so a
+        # named-but-absent EP is caught here too.
+        resolved_device, _ = resolve_device(device=self.config.device, ep=self.config.ep)
+        if self.config.ep is not None:
+            # Keep the user's EP (alias or canonical) verbatim — downstream
+            # stages normalize it. Only derive one when the user gave none.
+            resolved_ep: EPNameOrAlias | None = self.config.ep
+        else:
+            device_eps = resolve_eps(resolved_device)
+            resolved_ep = device_eps[0] if device_eps else None
+
+        self._resolved_device = resolved_device
+        self._resolved_ep = resolved_ep
 
     @property
     def _is_composite(self) -> bool:
@@ -478,6 +518,10 @@ def _load_model(self) -> None:
         from ..config import WinMLBuildConfig
         from ..models import WinMLAutoModel
 
+        # Resolve the concrete device + EP first so a bad combo fails fast,
+        # before from_pretrained/from_onnx kick off the build pipeline.
+        self._resolve_device_ep()
+
         model_id = self.config.model_id
         model_path = Path(model_id)
         is_onnx = model_path.suffix.lower() == ".onnx"
@@ -500,9 +544,9 @@ def _load_model(self) -> None:
         common_kwargs: dict[str, Any] = {
             "task": self.config.task,
             "config": override,
-            "device": self.config.device,
+            "device": self._resolved_device or self.config.device,
             "precision": self.config.precision,
-            "ep": self.config.ep,
+            "ep": self._resolved_ep,
             "provider_options": self.config.ep_options,
             "use_cache": use_cache,
             "force_rebuild": force_rebuild,
@@ -538,7 +582,7 @@ def _resolve_adapter_luid(self) -> str | None:
             return None
 
         assert self._model is not None
-        device = self._single.device or self.config.device
+        device = self._single.device or self._resolved_device or self.config.device
         if device == "cpu":
             return None
 
@@ -739,7 +783,8 @@ def _perf_modules(
         monitor: If True, wrap each per-module benchmark with HWMonitor.
         device: Target device policy ("auto", "cpu", "gpu", "npu").
         ep: Explicit execution provider (e.g., "qnn", "dml"). Overrides
-            device-to-provider mapping when set.
+            device-to-provider mapping when set. When ``None``, a concrete EP is
+            derived from the resolved device so the analyzer targets one EP.
         ep_options: Runtime EP provider options (e.g. QNN
             ``htp_performance_mode``) forwarded to each per-module session.
         precision: Precision mode passed through to the build stage.
@@ -752,10 +797,17 @@ def _perf_modules(
 
     from ..build import build_hf_model
     from ..config import SubmoduleClassNotFoundError, generate_hf_build_config
-    from ..sysinfo import resolve_device
+    from ..sysinfo import resolve_device, resolve_eps
     from .build import _instantiate_parent_model
 
     resolved_device, _ = resolve_device(device=device, ep=ep)
+    # Derive a concrete EP when none was given so each per-module build's static
+    # analyzer targets one EP instead of ep=None (which aggregates across all
+    # EPs and warns; see #931). An explicit EP is kept verbatim — downstream
+    # stages normalize it.
+    if ep is None:
+        device_eps = resolve_eps(resolved_device)
+        ep = device_eps[0] if device_eps else None
 
     console.print(f"[dim]Generating module configs for {module_class}...[/dim]")
 
@@ -1514,6 +1566,8 @@ def perf(
                 "[yellow]Warning:[/yellow] --shape-config is not supported "
                 "in --module mode and will be ignored."
             )
+        # _perf_modules resolves the device + derives a concrete EP internally
+        # (it will fold into PerfBenchmark — see #939).
         _perf_modules(
             hf_model=hf_model,
             module_class=module_class,
@@ -1559,7 +1613,9 @@ def perf(
     if output is None:
         output = generate_output_path(hf_model)
 
-    # Create config
+    # Create config. The raw device/EP request is passed through unchanged;
+    # PerfBenchmark resolves the concrete device + EP internally (failing fast
+    # before the build), so the CLI does not pre-resolve here.
     config = BenchmarkConfig(
         model_id=hf_model,
         task=task,
diff --git a/tests/unit/commands/test_perf_cli.py b/tests/unit/commands/test_perf_cli.py
index 9e05c3c57..1b6f7746e 100644
--- a/tests/unit/commands/test_perf_cli.py
+++ b/tests/unit/commands/test_perf_cli.py
@@ -29,10 +29,20 @@
 
 @pytest.fixture(autouse=True)
 def mock_resolve_device():
-    """Mock resolve_device to avoid hardware detection in all perf CLI tests."""
-    with patch(
-        "winml.modelkit.sysinfo.resolve_device",
-        return_value=("cpu", ["cpu"]),
+    """Mock device/EP resolution to avoid hardware detection in all perf CLI tests.
+
+    perf() resolves the device (and, when --ep is omitted, derives a concrete EP
+    via resolve_eps) up front, so both are stubbed to a deterministic CPU result.
+    """
+    with (
+        patch(
+            "winml.modelkit.sysinfo.resolve_device",
+            return_value=("cpu", ["cpu"]),
+        ),
+        patch(
+            "winml.modelkit.sysinfo.resolve_eps",
+            return_value=["CPUExecutionProvider"],
+        ),
     ):
         yield
 
@@ -495,6 +505,105 @@ def test_cli_ep_options_invalid_format_rejected(
         assert result.exit_code != 0
         assert "KEY=VALUE" in result.output
 
+    def test_load_model_no_ep_derives_concrete_ep(self, tmp_path: Path) -> None:
+        """Without an EP, PerfBenchmark resolves a concrete one before building.
+
+        Regression guard: previously ep stayed None down to the build, so the
+        static analyzer ran with ep=None and aggregated across all EPs (and
+        logged a warning). PerfBenchmark now resolves the EP from the device
+        (autouse fixture stubs resolve_eps -> ["CPUExecutionProvider"]) and
+        passes it to from_onnx. The config keeps the raw request (ep=None);
+        the resolved value lives on the instance.
+        """
+        onnx_file = tmp_path / "model.onnx"
+        onnx_file.write_bytes(b"fake onnx")
+
+        config = BenchmarkConfig(model_id=str(onnx_file), task="image-classification")
+        benchmark = PerfBenchmark(config)
+
+        with patch(
+            "winml.modelkit.models.auto.WinMLAutoModel.from_onnx",
+            return_value=MagicMock(),
+        ) as mock_from_onnx:
+            benchmark._load_model()
+
+        assert mock_from_onnx.call_args.kwargs["ep"] == "CPUExecutionProvider"
+        assert benchmark._resolved_ep == "CPUExecutionProvider"
+        assert config.ep is None
+
+    def test_load_model_explicit_ep_passed_through_verbatim(self, tmp_path: Path) -> None:
+        """An explicit EP reaches from_onnx unchanged (no normalization).
+
+        Downstream build/session stages normalize aliases themselves, so
+        PerfBenchmark must not rewrite the user's value (e.g. 'qnn' stays 'qnn').
+        """
+        onnx_file = tmp_path / "model.onnx"
+        onnx_file.write_bytes(b"fake onnx")
+
+        config = BenchmarkConfig(
+            model_id=str(onnx_file), task="image-classification", device="npu", ep="qnn"
+        )
+        benchmark = PerfBenchmark(config)
+
+        with patch(
+            "winml.modelkit.models.auto.WinMLAutoModel.from_onnx",
+            return_value=MagicMock(),
+        ) as mock_from_onnx:
+            benchmark._load_model()
+
+        assert mock_from_onnx.call_args.kwargs["ep"] == "qnn"
+        assert benchmark._resolved_ep == "qnn"
+
+    def test_load_model_unavailable_device_ep_fails_before_build(self, tmp_path: Path) -> None:
+        """An unavailable device/EP combo fails before the build pipeline runs.
+
+        PerfBenchmark resolves device+EP at the start of _load_model, so an
+        unavailable combo (resolve_device raises ValueError) surfaces before
+        from_onnx kicks off the build — the user does not wait for the whole
+        build only to fail at session.compile().
+        """
+        onnx_file = tmp_path / "model.onnx"
+        onnx_file.write_bytes(b"fake onnx")
+
+        config = BenchmarkConfig(model_id=str(onnx_file), task="image-classification", device="npu")
+        benchmark = PerfBenchmark(config)
+
+        with (
+            patch(
+                "winml.modelkit.sysinfo.resolve_device",
+                side_effect=ValueError("no compatible EP is available"),
+            ),
+            patch("winml.modelkit.models.auto.WinMLAutoModel.from_onnx") as mock_from_onnx,
+            pytest.raises(ValueError, match="no compatible EP is available"),
+        ):
+            benchmark._load_model()
+
+        mock_from_onnx.assert_not_called()
+
+    def test_cli_unavailable_device_ep_surfaces_error(
+        self, runner: CliRunner, tmp_path: Path
+    ) -> None:
+        """The CLI surfaces the fail-fast resolution error with a non-zero exit."""
+        onnx_file = tmp_path / "model.onnx"
+        onnx_file.write_bytes(b"fake onnx")
+
+        with (
+            patch(
+                "winml.modelkit.sysinfo.resolve_device",
+                side_effect=ValueError("no compatible EP is available"),
+            ),
+            patch("winml.modelkit.models.auto.WinMLAutoModel.from_onnx") as mock_from_onnx,
+        ):
+            result = runner.invoke(
+                perf,
+                ["-m", str(onnx_file), "--device", "npu", "-o", str(tmp_path / "out.json")],
+                obj={},
+            )
+
+        assert result.exit_code != 0
+        assert "no compatible EP is available" in result.output
+        mock_from_onnx.assert_not_called()
+
     def test_help_shows_ep_options(self, runner: CliRunner) -> None:
         result = runner.invoke(perf, ["--help"])
         assert result.exit_code == 0
diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py
index 9d3628dbd..4240897a6 100644
--- a/tests/unit/commands/test_perf_module.py
+++ b/tests/unit/commands/test_perf_module.py
@@ -10,6 +10,7 @@
 from typing import TYPE_CHECKING
 from unittest.mock import ANY, MagicMock, patch
 
+import pytest
 from click.testing import CliRunner
 
 from winml.modelkit.cli import main
@@ -20,6 +21,28 @@
     from pathlib import Path
 
 
+@pytest.fixture(autouse=True)
+def _mock_device_resolution():
+    """Stub perf()'s up-front device/EP resolution so module tests stay hermetic.
+
+    perf() calls resolve_device() (and resolve_eps() when --ep is omitted) before
+    branching into module mode. Tests that need a specific device override
+    resolve_device locally inside their own ``with patch(...)`` block, which
+    nests over (and wins against) this autouse default.
+    """
+    with (
+        patch(
+            "winml.modelkit.sysinfo.resolve_device",
+            return_value=("cpu", ["cpu"]),
+        ),
+        patch(
+            "winml.modelkit.sysinfo.resolve_eps",
+            return_value=["CPUExecutionProvider"],
+        ),
+    ):
+        yield
+
+
 class TestPerfModuleFlag:
     """Tests for --module flag on winml perf."""