From 8cc37df2ede749db32fee5679d8066d5ff2c7354 Mon Sep 17 00:00:00 2001 From: xieofxie Date: Tue, 23 Jun 2026 14:06:16 +0800 Subject: [PATCH] fix(perf): resolve concrete EP for the analyzer instead of ep=None (#931) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `winml perf` without `--ep`, the EP was never resolved before the build, so the static analyzer ran with `ep=None` and aggregated across all EPs (and logged "analyze_onnx called with ep=None — results will aggregate all EPs"), even though inference runs on a single device's EP. Resolve a concrete device + EP from the request and pass it down to the build: - PerfBenchmark resolves device + EP internally (_resolve_device_ep), at the start of _load_model so an unavailable/invalid combo fails fast before the export/optimize/quantize/compile pipeline runs. BenchmarkConfig keeps only the raw request; the resolved values live on the instance and drive from_pretrained / from_onnx. - _perf_modules derives a concrete EP from the resolved device when none is given (explicit EPs are kept verbatim; downstream stages normalize aliases). - The CLI no longer pre-resolves: it just builds the config / dispatches. Verified end-to-end on a CPU build: analyzer target goes from "None on cpu" to "OpenVINOExecutionProvider on cpu" and the ep=None warning no longer fires. Follow-up #939 tracks folding _perf_modules into PerfBenchmark to unify the two resolution sites. --- src/winml/modelkit/commands/perf.py | 68 ++++++++++++-- tests/unit/commands/test_perf_cli.py | 117 +++++++++++++++++++++++- tests/unit/commands/test_perf_module.py | 23 +++++ 3 files changed, 198 insertions(+), 10 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 2f63e5652..95fc3b060 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -303,6 +303,46 @@ def __init__(self, config: BenchmarkConfig) -> None: self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None self._inputs: dict[str, np.ndarray] | None = None self._memory: dict[str, float] | None = None + # Concrete device + EP resolved from the config's request, populated by + # _resolve_device_ep() on the first call (before the build). The config + # keeps the raw request (e.g. "auto"); these hold what actually drives + # the build and inference. + self._resolved_device: str | None = None + self._resolved_ep: EPNameOrAlias | None = None + + def _resolve_device_ep(self) -> None: + """Resolve the concrete target device + EP, failing fast on a bad combo. + + Idempotent: resolves once, then returns cached values. Called at the + start of model loading so an unavailable/invalid device+EP raises here — + before the export/optimize/quantize/compile pipeline runs — instead of + only surfacing at session.compile(). Deriving a concrete EP also lets the + build's static analyzer target one EP instead of aggregating across all + of them (WinMLAutoModel itself stays permissive: ep=None is a valid + library mode). + + Raises: + ValueError: If the requested device/EP combination is unavailable + or invalid (propagated from ``resolve_device``). + """ + if self._resolved_device is not None: + return + + from ..sysinfo import resolve_device, resolve_eps + + # resolve_device() availability-checks even when --ep is explicit, so a + # named-but-absent EP is caught here too. + resolved_device, _ = resolve_device(device=self.config.device, ep=self.config.ep) + if self.config.ep is not None: + # Keep the user's EP (alias or canonical) verbatim — downstream + # stages normalize it. Only derive one when the user gave none. + resolved_ep: EPNameOrAlias | None = self.config.ep + else: + device_eps = resolve_eps(resolved_device) + resolved_ep = device_eps[0] if device_eps else None + + self._resolved_device = resolved_device + self._resolved_ep = resolved_ep @property def _is_composite(self) -> bool: @@ -478,6 +518,10 @@ def _load_model(self) -> None: from ..config import WinMLBuildConfig from ..models import WinMLAutoModel + # Resolve the concrete device + EP first so a bad combo fails fast, + # before from_pretrained/from_onnx kick off the build pipeline. + self._resolve_device_ep() + model_id = self.config.model_id model_path = Path(model_id) is_onnx = model_path.suffix.lower() == ".onnx" @@ -500,9 +544,9 @@ def _load_model(self) -> None: common_kwargs: dict[str, Any] = { "task": self.config.task, "config": override, - "device": self.config.device, + "device": self._resolved_device or self.config.device, "precision": self.config.precision, - "ep": self.config.ep, + "ep": self._resolved_ep, "provider_options": self.config.ep_options, "use_cache": use_cache, "force_rebuild": force_rebuild, @@ -538,7 +582,7 @@ def _resolve_adapter_luid(self) -> str | None: return None assert self._model is not None - device = self._single.device or self.config.device + device = self._single.device or self._resolved_device or self.config.device if device == "cpu": return None @@ -739,7 +783,8 @@ def _perf_modules( monitor: If True, wrap each per-module benchmark with HWMonitor. device: Target device policy ("auto", "cpu", "gpu", "npu"). ep: Explicit execution provider (e.g., "qnn", "dml"). Overrides - device-to-provider mapping when set. + device-to-provider mapping when set. When ``None``, a concrete EP is + derived from the resolved device so the analyzer targets one EP. ep_options: Runtime EP provider options (e.g. QNN ``htp_performance_mode``) forwarded to each per-module session. precision: Precision mode passed through to the build stage. @@ -752,10 +797,17 @@ def _perf_modules( from ..build import build_hf_model from ..config import SubmoduleClassNotFoundError, generate_hf_build_config - from ..sysinfo import resolve_device + from ..sysinfo import resolve_device, resolve_eps from .build import _instantiate_parent_model resolved_device, _ = resolve_device(device=device, ep=ep) + # Derive a concrete EP when none was given so each per-module build's static + # analyzer targets one EP instead of ep=None (which aggregates across all + # EPs and warns; see #931). An explicit EP is kept verbatim — downstream + # stages normalize it. + if ep is None: + device_eps = resolve_eps(resolved_device) + ep = device_eps[0] if device_eps else None console.print(f"[dim]Generating module configs for {module_class}...[/dim]") @@ -1514,6 +1566,8 @@ def perf( "[yellow]Warning:[/yellow] --shape-config is not supported " "in --module mode and will be ignored." ) + # _perf_modules resolves the device + derives a concrete EP internally + # (it will fold into PerfBenchmark — see #939). _perf_modules( hf_model=hf_model, module_class=module_class, @@ -1559,7 +1613,9 @@ def perf( if output is None: output = generate_output_path(hf_model) - # Create config + # Create config. The raw device/EP request is passed through unchanged; + # PerfBenchmark resolves the concrete device + EP internally (failing fast + # before the build), so the CLI does not pre-resolve here. config = BenchmarkConfig( model_id=hf_model, task=task, diff --git a/tests/unit/commands/test_perf_cli.py b/tests/unit/commands/test_perf_cli.py index 9e05c3c57..1b6f7746e 100644 --- a/tests/unit/commands/test_perf_cli.py +++ b/tests/unit/commands/test_perf_cli.py @@ -29,10 +29,20 @@ @pytest.fixture(autouse=True) def mock_resolve_device(): - """Mock resolve_device to avoid hardware detection in all perf CLI tests.""" - with patch( - "winml.modelkit.sysinfo.resolve_device", - return_value=("cpu", ["cpu"]), + """Mock device/EP resolution to avoid hardware detection in all perf CLI tests. + + perf() resolves the device (and, when --ep is omitted, derives a concrete EP + via resolve_eps) up front, so both are stubbed to a deterministic CPU result. + """ + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + return_value=("cpu", ["cpu"]), + ), + patch( + "winml.modelkit.sysinfo.resolve_eps", + return_value=["CPUExecutionProvider"], + ), ): yield @@ -495,6 +505,105 @@ def test_cli_ep_options_invalid_format_rejected( assert result.exit_code != 0 assert "KEY=VALUE" in result.output + def test_load_model_no_ep_derives_concrete_ep(self, tmp_path: Path) -> None: + """Without an EP, PerfBenchmark resolves a concrete one before building. + + Regression guard: previously ep stayed None down to the build, so the + static analyzer ran with ep=None and aggregated across all EPs (and + logged a warning). PerfBenchmark now resolves the EP from the device + (autouse fixture stubs resolve_eps -> ["CPUExecutionProvider"]) and + passes it to from_onnx. The config keeps the raw request (ep=None); + the resolved value lives on the instance. + """ + onnx_file = tmp_path / "model.onnx" + onnx_file.write_bytes(b"fake onnx") + + config = BenchmarkConfig(model_id=str(onnx_file), task="image-classification") + benchmark = PerfBenchmark(config) + + with patch( + "winml.modelkit.models.auto.WinMLAutoModel.from_onnx", + return_value=MagicMock(), + ) as mock_from_onnx: + benchmark._load_model() + + assert mock_from_onnx.call_args.kwargs["ep"] == "CPUExecutionProvider" + assert benchmark._resolved_ep == "CPUExecutionProvider" + assert config.ep is None + + def test_load_model_explicit_ep_passed_through_verbatim(self, tmp_path: Path) -> None: + """An explicit EP reaches from_onnx unchanged (no normalization). + + Downstream build/session stages normalize aliases themselves, so + PerfBenchmark must not rewrite the user's value (e.g. 'qnn' stays 'qnn'). + """ + onnx_file = tmp_path / "model.onnx" + onnx_file.write_bytes(b"fake onnx") + + config = BenchmarkConfig( + model_id=str(onnx_file), task="image-classification", device="npu", ep="qnn" + ) + benchmark = PerfBenchmark(config) + + with patch( + "winml.modelkit.models.auto.WinMLAutoModel.from_onnx", + return_value=MagicMock(), + ) as mock_from_onnx: + benchmark._load_model() + + assert mock_from_onnx.call_args.kwargs["ep"] == "qnn" + assert benchmark._resolved_ep == "qnn" + + def test_load_model_unavailable_device_ep_fails_before_build(self, tmp_path: Path) -> None: + """An unavailable device/EP combo fails before the build pipeline runs. + + PerfBenchmark resolves device+EP at the start of _load_model, so an + unavailable combo (resolve_device raises ValueError) surfaces before + from_onnx kicks off the build — the user does not wait for the whole + build only to fail at session.compile(). + """ + onnx_file = tmp_path / "model.onnx" + onnx_file.write_bytes(b"fake onnx") + + config = BenchmarkConfig(model_id=str(onnx_file), task="image-classification", device="npu") + benchmark = PerfBenchmark(config) + + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + side_effect=ValueError("no compatible EP is available"), + ), + patch("winml.modelkit.models.auto.WinMLAutoModel.from_onnx") as mock_from_onnx, + pytest.raises(ValueError, match="no compatible EP is available"), + ): + benchmark._load_model() + + mock_from_onnx.assert_not_called() + + def test_cli_unavailable_device_ep_surfaces_error( + self, runner: CliRunner, tmp_path: Path + ) -> None: + """The CLI surfaces the fail-fast resolution error with a non-zero exit.""" + onnx_file = tmp_path / "model.onnx" + onnx_file.write_bytes(b"fake onnx") + + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + side_effect=ValueError("no compatible EP is available"), + ), + patch("winml.modelkit.models.auto.WinMLAutoModel.from_onnx") as mock_from_onnx, + ): + result = runner.invoke( + perf, + ["-m", str(onnx_file), "--device", "npu", "-o", str(tmp_path / "out.json")], + obj={}, + ) + + assert result.exit_code != 0 + assert "no compatible EP is available" in result.output + mock_from_onnx.assert_not_called() + def test_help_shows_ep_options(self, runner: CliRunner) -> None: result = runner.invoke(perf, ["--help"]) assert result.exit_code == 0 diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py index 9d3628dbd..4240897a6 100644 --- a/tests/unit/commands/test_perf_module.py +++ b/tests/unit/commands/test_perf_module.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING from unittest.mock import ANY, MagicMock, patch +import pytest from click.testing import CliRunner from winml.modelkit.cli import main @@ -20,6 +21,28 @@ from pathlib import Path +@pytest.fixture(autouse=True) +def _mock_device_resolution(): + """Stub perf()'s up-front device/EP resolution so module tests stay hermetic. + + perf() calls resolve_device() (and resolve_eps() when --ep is omitted) before + branching into module mode. Tests that need a specific device override + resolve_device locally inside their own ``with patch(...)`` block, which + nests over (and wins against) this autouse default. + """ + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + return_value=("cpu", ["cpu"]), + ), + patch( + "winml.modelkit.sysinfo.resolve_eps", + return_value=["CPUExecutionProvider"], + ), + ): + yield + + class TestPerfModuleFlag: """Tests for --module flag on winml perf."""