From 7938dbfbbebabaa411c5a9a244d7d3bfa37734be Mon Sep 17 00:00:00 2001 From: Hualiang Xie Date: Thu, 11 Jun 2026 14:56:26 +0800 Subject: [PATCH 1/5] add model path to perf result --- src/winml/modelkit/commands/perf.py | 7 +++++++ src/winml/modelkit/models/winml/base.py | 9 +++++++++ .../modelkit/session/qairt/qairt_session.py | 1 + src/winml/modelkit/session/session.py | 17 +++++++++++++++++ tests/e2e/test_perf_e2e.py | 11 ++++++----- 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 5f6141114..4747f489c 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -126,6 +126,10 @@ class BenchmarkResult: actual_task: str = "" actual_ep: EPName | None = None + # ONNX model ORT actually loaded (may be an EPContext model, differing + # from the input model_id when compiled or a cached one is reused) + running_model_path: str = "" + # Hardware monitor metrics (from HWMonitor.to_dict()) hw_monitor: dict[str, Any] | None = None @@ -134,6 +138,7 @@ def to_dict(self) -> dict[str, Any]: result = { "benchmark_info": { "model_id": self.config.model_id, + "running_model_path": self.running_model_path, "task": self.actual_task, "device": self.actual_device, "ep": self.actual_ep, @@ -515,6 +520,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult: actual_device=self._model.device, actual_task=self._model.task or self.config.task or "auto-detected", actual_ep=self._model.ep_name, + running_model_path=str(self._model.running_model_path), # Hardware monitor metrics (only present when --monitor is used) hw_monitor=getattr(self, "_hw_metrics", None), ) @@ -707,6 +713,7 @@ def _perf_modules( mod_stats = stats result_entry: dict[str, Any] = { "module_path": module_path, + "running_model_path": str(session.running_model_path), "mean_ms": round(mod_stats.mean_ms, 3), "p50_ms": round(mod_stats.p50_ms, 3), "p90_ms": round(mod_stats.p90_ms, 3), diff --git a/src/winml/modelkit/models/winml/base.py b/src/winml/modelkit/models/winml/base.py index 43854390b..426c861be 100644 --- a/src/winml/modelkit/models/winml/base.py +++ b/src/winml/modelkit/models/winml/base.py @@ -104,6 +104,15 @@ def onnx_path(self) -> Path: """Path to the ONNX model file.""" return self._onnx_path + @property + def running_model_path(self) -> Path: + """Path to the ONNX model the session actually loads. + + Differs from ``onnx_path`` when the session compiles or reuses an + EPContext model. Falls back to ``onnx_path`` before compilation. + """ + return self._session.running_model_path + def _format_inputs( self, data: torch.Tensor | np.ndarray | list | dict | None = None, diff --git a/src/winml/modelkit/session/qairt/qairt_session.py b/src/winml/modelkit/session/qairt/qairt_session.py index ed2caccb5..19c7a23a6 100644 --- a/src/winml/modelkit/session/qairt/qairt_session.py +++ b/src/winml/modelkit/session/qairt/qairt_session.py @@ -233,6 +233,7 @@ def _create_inference_session(self) -> None: import onnxruntime as ort sess_options, _, _ = self._build_session_options(self._device) + self._running_model_path = self._ctx_path self._session = ort.InferenceSession(str(self._ctx_path), sess_options=sess_options) self._state = SessionState.COMPILED diff --git a/src/winml/modelkit/session/session.py b/src/winml/modelkit/session/session.py index dc2bca5f4..34f6c3f09 100644 --- a/src/winml/modelkit/session/session.py +++ b/src/winml/modelkit/session/session.py @@ -183,6 +183,10 @@ def __init__( # Single session (one session = one EP) self._session: ort.InferenceSession | None = None + # ONNX model ORT actually loads (set during compile()). May differ from + # _onnx_path when an EPContext model is compiled or a cached one reused. + self._running_model_path: Path | None = None + # Cached I/O metadata (lazy-loaded) self._io_config: dict | None = None @@ -245,6 +249,9 @@ def compile(self) -> None: # Some EPs don't support compilation - fall back to original logger.warning("ModelCompiler failed, using original: %s", e) + # Record the model ORT actually loads (original or EPContext). + self._running_model_path = model_path + try: # Create InferenceSession. # EP is either configured via add_provider_for_devices (WinML EP @@ -505,6 +512,16 @@ def is_compiled(self) -> bool: """Check if session is compiled.""" return self._session is not None + @property + def running_model_path(self) -> Path: + """Path to the ONNX model ORT actually loads. + + May differ from the input ``onnx_path`` when an EPContext model is + compiled or a cached one is reused. Falls back to the input path + before ``compile()`` runs. + """ + return self._running_model_path or self._onnx_path + @property def perf_stats(self) -> PerfStats | None: """Performance statistics (None if not in perf() context). diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py index c82a4cf27..c46ddcf5b 100644 --- a/tests/e2e/test_perf_e2e.py +++ b/tests/e2e/test_perf_e2e.py @@ -31,7 +31,7 @@ import json import sys -from typing import TYPE_CHECKING +from pathlib import Path import pytest from click.testing import CliRunner @@ -41,10 +41,6 @@ from winml.modelkit.utils.constants import EP_ALIASES -if TYPE_CHECKING: - from pathlib import Path - - pytestmark = [pytest.mark.e2e] @@ -214,6 +210,11 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str): assert binfo["warmup"] == 1 assert binfo["device"] == "cpu" + # The real ONNX model ORT loaded is recorded and points at a file + running_model = Path(binfo["running_model_path"]) + assert running_model.suffix == ".onnx" + assert running_model.exists() + # Verify latency stats are populated latency = data["latency_ms"] assert latency["mean"] > 0 From 389f06458ab94027bcb8e9432734c433621ded46 Mon Sep 17 00:00:00 2001 From: Hualiang Xie Date: Thu, 11 Jun 2026 15:15:10 +0800 Subject: [PATCH 2/5] Add model precision from io_config to perf result --- src/winml/modelkit/commands/perf.py | 7 +++++++ tests/e2e/test_perf_e2e.py | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 4747f489c..807650b96 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -101,6 +101,10 @@ class BenchmarkResult: output_names: list[str] = field(default_factory=list) output_shapes: list[list[int]] = field(default_factory=list) + # Resolved model precision from io_config (None if the model does not + # expose one). Distinct from the requested config.precision policy. + model_precision: str | None = None + # Latency stats (milliseconds) mean_ms: float = 0.0 min_ms: float = 0.0 @@ -172,6 +176,8 @@ def to_dict(self) -> dict[str, Any]: }, "raw_samples_ms": [round(s, 3) for s in self.raw_samples_ms], } + if self.model_precision: + result["model_info"]["precision"] = self.model_precision if self.hw_monitor: result["hw_monitor"] = self.hw_monitor return result @@ -502,6 +508,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult: input_types=[str(t) for t in io_config["input_types"]], output_names=io_config["output_names"], output_shapes=[list(s) if s else [] for s in io_config["output_shapes"]], + model_precision=io_config.get("precision"), # Latency stats mean_ms=stats.mean_ms, min_ms=stats.min_ms, diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py index c46ddcf5b..52d6b8bf2 100644 --- a/tests/e2e/test_perf_e2e.py +++ b/tests/e2e/test_perf_e2e.py @@ -228,6 +228,12 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str): assert isinstance(minfo["output_names"], list) assert len(minfo["output_names"]) >= 1 + # Precision is best-effort: emitted only when derivable from the graph, + # and a non-empty string when present. + if "precision" in minfo: + assert isinstance(minfo["precision"], str) + assert minfo["precision"] + # Verify raw samples count matches iterations assert len(data["raw_samples_ms"]) == 3 From 6427b41a6d2ec8da4fccfbc08f4b8c57a8a6fa4e Mon Sep 17 00:00:00 2001 From: Hualiang Xie Date: Thu, 11 Jun 2026 15:33:58 +0800 Subject: [PATCH 3/5] Fix mypy: annotate to_dict result as dict[str, Any] --- src/winml/modelkit/commands/perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 807650b96..090ad18e3 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -139,7 +139,7 @@ class BenchmarkResult: def to_dict(self) -> dict[str, Any]: """Convert to dictionary for JSON serialization.""" - result = { + result: dict[str, Any] = { "benchmark_info": { "model_id": self.config.model_id, "running_model_path": self.running_model_path, From 8050ca58023a5e15e8929e8b4634045a1f3a3006 Mon Sep 17 00:00:00 2001 From: Hualiang Xie Date: Fri, 12 Jun 2026 11:07:39 +0800 Subject: [PATCH 4/5] update test --- tests/e2e/test_perf_e2e.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py index 52d6b8bf2..b0ec2e6ab 100644 --- a/tests/e2e/test_perf_e2e.py +++ b/tests/e2e/test_perf_e2e.py @@ -209,6 +209,7 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str): assert binfo["iterations"] == 3 assert binfo["warmup"] == 1 assert binfo["device"] == "cpu" + assert binfo["precision"] == "auto" # The real ONNX model ORT loaded is recorded and points at a file running_model = Path(binfo["running_model_path"]) @@ -227,12 +228,7 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str): assert len(minfo["input_names"]) >= 1 assert isinstance(minfo["output_names"], list) assert len(minfo["output_names"]) >= 1 - - # Precision is best-effort: emitted only when derivable from the graph, - # and a non-empty string when present. - if "precision" in minfo: - assert isinstance(minfo["precision"], str) - assert minfo["precision"] + assert minfo["precision"] == "fp32" # Verify raw samples count matches iterations assert len(data["raw_samples_ms"]) == 3 From 4e9932c5ef5a80df2439cfe175e3d7bbae15015d Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 16:19:34 +0800 Subject: [PATCH 5/5] address comments --- src/winml/modelkit/commands/perf.py | 3 +- .../modelkit/session/qairt/qairt_session.py | 4 +- src/winml/modelkit/session/session.py | 8 +- tests/unit/commands/test_perf_module.py | 96 +++++++++++++++++++ 4 files changed, 104 insertions(+), 7 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 090ad18e3..1c9b3cf3e 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -158,6 +158,7 @@ def to_dict(self) -> dict[str, Any]: "input_types": self.input_types, "output_names": self.output_names, "output_shapes": self.output_shapes, + "precision": self.model_precision, }, "latency_ms": { "mean": round(self.mean_ms, 3), @@ -176,8 +177,6 @@ def to_dict(self) -> dict[str, Any]: }, "raw_samples_ms": [round(s, 3) for s in self.raw_samples_ms], } - if self.model_precision: - result["model_info"]["precision"] = self.model_precision if self.hw_monitor: result["hw_monitor"] = self.hw_monitor return result diff --git a/src/winml/modelkit/session/qairt/qairt_session.py b/src/winml/modelkit/session/qairt/qairt_session.py index 19c7a23a6..47dfe2b9f 100644 --- a/src/winml/modelkit/session/qairt/qairt_session.py +++ b/src/winml/modelkit/session/qairt/qairt_session.py @@ -233,8 +233,10 @@ def _create_inference_session(self) -> None: import onnxruntime as ort sess_options, _, _ = self._build_session_options(self._device) - self._running_model_path = self._ctx_path self._session = ort.InferenceSession(str(self._ctx_path), sess_options=sess_options) + # Record the loaded model only after the session is successfully + # created, so a failed load leaves running_model_path unset. + self._running_model_path = self._ctx_path self._state = SessionState.COMPILED actual_providers = self._session.get_providers() diff --git a/src/winml/modelkit/session/session.py b/src/winml/modelkit/session/session.py index 34f6c3f09..8bacf7135 100644 --- a/src/winml/modelkit/session/session.py +++ b/src/winml/modelkit/session/session.py @@ -249,9 +249,6 @@ def compile(self) -> None: # Some EPs don't support compilation - fall back to original logger.warning("ModelCompiler failed, using original: %s", e) - # Record the model ORT actually loads (original or EPContext). - self._running_model_path = model_path - try: # Create InferenceSession. # EP is either configured via add_provider_for_devices (WinML EP @@ -282,8 +279,11 @@ def compile(self) -> None: actual_providers, ) - # Store session + # Store session. Record the model ORT actually loaded (original or + # EPContext) only after the session is successfully created, so a + # failed compile leaves running_model_path unset rather than stale. self._session = session + self._running_model_path = model_path self._state = SessionState.COMPILED # Resolve device label from the primary provider ORT actually selected diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py index 051bae425..f08f5d3d7 100644 --- a/tests/unit/commands/test_perf_module.py +++ b/tests/unit/commands/test_perf_module.py @@ -6,6 +6,7 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch @@ -218,3 +219,98 @@ def test_device_and_ep_forwarded_through_module_path(self, tmp_path: Path) -> No session_kwargs = mock_session_cls.call_args.kwargs assert session_kwargs["device"] == "npu" assert session_kwargs["ep"] == "qnn" + + def test_running_model_path_in_module_result(self, tmp_path: Path) -> None: + """A completed module benchmark records running_model_path in its + per-instance result entry. + + Unlike the forwarding test above (which short-circuits the benchmark + loop via a RuntimeError), this drives a successful run so result_entry + is actually populated, then reads it back from the JSON report. + """ + fake_cfg = MagicMock() + fake_cfg.loader.model_type = "bert" + fake_cfg.loader.module_path = "encoder.layer.0" + + fake_build_result = MagicMock() + fake_build_result.final_onnx_path = tmp_path / "model.onnx" + + # Stats yielded by `with session.perf(...) as stats` — needs real + # numbers since result_entry rounds/divides them. + fake_stats = MagicMock() + fake_stats.mean_ms = 1.0 + fake_stats.p50_ms = 1.0 + fake_stats.p90_ms = 1.0 + fake_stats.p95_ms = 1.0 + fake_stats.p99_ms = 1.0 + fake_stats.min_ms = 1.0 + fake_stats.max_ms = 1.0 + fake_stats.samples_ms = [1.0, 1.0] + + running_model_path = tmp_path / "model_cpu_ctx.onnx" + fake_session = MagicMock() + fake_session.perf.return_value.__enter__.return_value = fake_stats + fake_session.running_model_path = running_model_path + + fake_loader_cfg = MagicMock() + fake_loader_cfg.task = "fill-mask" + + out_path = tmp_path / "out.json" + + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + return_value=("npu", "qnn"), + ), + patch( + "winml.modelkit.config.generate_hf_build_config", + return_value=[fake_cfg], + ), + patch( + "winml.modelkit.loader.resolve_loader_config", + return_value=(fake_loader_cfg, MagicMock(), MagicMock()), + ), + patch( + "winml.modelkit.commands.build._instantiate_parent_model", + return_value=MagicMock(), + ), + patch( + "winml.modelkit.build.build_hf_model", + return_value=fake_build_result, + ), + patch( + "winml.modelkit.session.WinMLSession", + return_value=fake_session, + ), + patch( + "winml.modelkit.commands.perf.generate_random_inputs", + return_value={}, + ), + ): + runner = CliRunner() + result = runner.invoke( + main, + [ + "perf", + "-m", + "fake/model", + "--module", + "BertLayer", + "--device", + "npu", + "--ep", + "qnn", + "--iterations", + "1", + "--warmup", + "0", + "-o", + str(out_path), + ], + ) + + assert result.exit_code == 0, result.output + + report = json.loads(out_path.read_text(encoding="utf-8")) + instance = report["instances"][0] + assert instance["running_model_path"] == str(running_model_path)