From 7938dbfbbebabaa411c5a9a244d7d3bfa37734be Mon Sep 17 00:00:00 2001
From: Hualiang Xie <hualxie@microsoft.com>
Date: Thu, 11 Jun 2026 14:56:26 +0800
Subject: [PATCH 1/5] Add running model path to perf result

---
 src/winml/modelkit/commands/perf.py             |  7 +++++++
 src/winml/modelkit/models/winml/base.py         |  9 +++++++++
 .../modelkit/session/qairt/qairt_session.py     |  1 +
 src/winml/modelkit/session/session.py           | 17 +++++++++++++++++
 tests/e2e/test_perf_e2e.py                      | 11 ++++++-----
 5 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 5f6141114..4747f489c 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -126,6 +126,10 @@ class BenchmarkResult:
     actual_task: str = ""
     actual_ep: EPName | None = None
 
+    # ONNX model ORT actually loaded (may be an EPContext model, differing
+    # from the input model_id when compiled or a cached one is reused)
+    running_model_path: str = ""
+
     # Hardware monitor metrics (from HWMonitor.to_dict())
     hw_monitor: dict[str, Any] | None = None
 
@@ -134,6 +138,7 @@ def to_dict(self) -> dict[str, Any]:
         result = {
             "benchmark_info": {
                 "model_id": self.config.model_id,
+                "running_model_path": self.running_model_path,
                 "task": self.actual_task,
                 "device": self.actual_device,
                 "ep": self.actual_ep,
@@ -515,6 +520,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             actual_device=self._model.device,
             actual_task=self._model.task or self.config.task or "auto-detected",
             actual_ep=self._model.ep_name,
+            running_model_path=str(self._model.running_model_path),
             # Hardware monitor metrics (only present when --monitor is used)
             hw_monitor=getattr(self, "_hw_metrics", None),
         )
@@ -707,6 +713,7 @@ def _perf_modules(
                 mod_stats = stats
                 result_entry: dict[str, Any] = {
                     "module_path": module_path,
+                    "running_model_path": str(session.running_model_path),
                     "mean_ms": round(mod_stats.mean_ms, 3),
                     "p50_ms": round(mod_stats.p50_ms, 3),
                     "p90_ms": round(mod_stats.p90_ms, 3),
diff --git a/src/winml/modelkit/models/winml/base.py b/src/winml/modelkit/models/winml/base.py
index 43854390b..426c861be 100644
--- a/src/winml/modelkit/models/winml/base.py
+++ b/src/winml/modelkit/models/winml/base.py
@@ -104,6 +104,15 @@ def onnx_path(self) -> Path:
         """Path to the ONNX model file."""
         return self._onnx_path
 
+    @property
+    def running_model_path(self) -> Path:
+        """Path to the ONNX model the session actually loads.
+
+        Differs from ``onnx_path`` when the session compiles or reuses an
+        EPContext model. Falls back to ``onnx_path`` before compilation.
+        """
+        return self._session.running_model_path
+
     def _format_inputs(
         self,
         data: torch.Tensor | np.ndarray | list | dict | None = None,
diff --git a/src/winml/modelkit/session/qairt/qairt_session.py b/src/winml/modelkit/session/qairt/qairt_session.py
index ed2caccb5..19c7a23a6 100644
--- a/src/winml/modelkit/session/qairt/qairt_session.py
+++ b/src/winml/modelkit/session/qairt/qairt_session.py
@@ -233,6 +233,7 @@ def _create_inference_session(self) -> None:
         import onnxruntime as ort
 
         sess_options, _, _ = self._build_session_options(self._device)
+        self._running_model_path = self._ctx_path
         self._session = ort.InferenceSession(str(self._ctx_path), sess_options=sess_options)
         self._state = SessionState.COMPILED
 
diff --git a/src/winml/modelkit/session/session.py b/src/winml/modelkit/session/session.py
index dc2bca5f4..34f6c3f09 100644
--- a/src/winml/modelkit/session/session.py
+++ b/src/winml/modelkit/session/session.py
@@ -183,6 +183,10 @@ def __init__(
         # Single session (one session = one EP)
         self._session: ort.InferenceSession | None = None
 
+        # ONNX model ORT actually loads (set during compile()). May differ from
+        # _onnx_path when an EPContext model is compiled or a cached one reused.
+        self._running_model_path: Path | None = None
+
         # Cached I/O metadata (lazy-loaded)
         self._io_config: dict | None = None
 
@@ -245,6 +249,9 @@ def compile(self) -> None:
                     # Some EPs don't support compilation - fall back to original
                     logger.warning("ModelCompiler failed, using original: %s", e)
 
+        # Record the model ORT actually loads (original or EPContext).
+        self._running_model_path = model_path
+
         try:
             # Create InferenceSession.
             # EP is either configured via add_provider_for_devices (WinML EP
@@ -505,6 +512,16 @@ def is_compiled(self) -> bool:
         """Check if session is compiled."""
         return self._session is not None
 
+    @property
+    def running_model_path(self) -> Path:
+        """Path to the ONNX model ORT actually loads.
+
+        May differ from the input ``onnx_path`` when an EPContext model is
+        compiled or a cached one is reused. Falls back to the input path
+        before ``compile()`` runs.
+        """
+        return self._running_model_path or self._onnx_path
+
     @property
     def perf_stats(self) -> PerfStats | None:
         """Performance statistics (None if not in perf() context).
diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py
index c82a4cf27..c46ddcf5b 100644
--- a/tests/e2e/test_perf_e2e.py
+++ b/tests/e2e/test_perf_e2e.py
@@ -31,7 +31,7 @@
 
 import json
 import sys
-from typing import TYPE_CHECKING
+from pathlib import Path
 
 import pytest
 from click.testing import CliRunner
@@ -41,10 +41,6 @@
 from winml.modelkit.utils.constants import EP_ALIASES
 
 
-if TYPE_CHECKING:
-    from pathlib import Path
-
-
 pytestmark = [pytest.mark.e2e]
 
 
@@ -214,6 +210,11 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str):
         assert binfo["warmup"] == 1
         assert binfo["device"] == "cpu"
 
+        # The real ONNX model ORT loaded is recorded and points at a file
+        running_model = Path(binfo["running_model_path"])
+        assert running_model.suffix == ".onnx"
+        assert running_model.exists()
+
         # Verify latency stats are populated
         latency = data["latency_ms"]
         assert latency["mean"] > 0

From 389f06458ab94027bcb8e9432734c433621ded46 Mon Sep 17 00:00:00 2001
From: Hualiang Xie <hualxie@microsoft.com>
Date: Thu, 11 Jun 2026 15:15:10 +0800
Subject: [PATCH 2/5] Add model precision from io_config to perf result

---
 src/winml/modelkit/commands/perf.py | 7 +++++++
 tests/e2e/test_perf_e2e.py          | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 4747f489c..807650b96 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -101,6 +101,10 @@ class BenchmarkResult:
     output_names: list[str] = field(default_factory=list)
     output_shapes: list[list[int]] = field(default_factory=list)
 
+    # Resolved model precision from io_config (None if the model does not
+    # expose one). Distinct from the requested config.precision policy.
+    model_precision: str | None = None
+
     # Latency stats (milliseconds)
     mean_ms: float = 0.0
     min_ms: float = 0.0
@@ -172,6 +176,8 @@ def to_dict(self) -> dict[str, Any]:
             },
             "raw_samples_ms": [round(s, 3) for s in self.raw_samples_ms],
         }
+        if self.model_precision:
+            result["model_info"]["precision"] = self.model_precision
         if self.hw_monitor:
             result["hw_monitor"] = self.hw_monitor
         return result
@@ -502,6 +508,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             input_types=[str(t) for t in io_config["input_types"]],
             output_names=io_config["output_names"],
             output_shapes=[list(s) if s else [] for s in io_config["output_shapes"]],
+            model_precision=io_config.get("precision"),
             # Latency stats
             mean_ms=stats.mean_ms,
             min_ms=stats.min_ms,
diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py
index c46ddcf5b..52d6b8bf2 100644
--- a/tests/e2e/test_perf_e2e.py
+++ b/tests/e2e/test_perf_e2e.py
@@ -228,6 +228,12 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str):
         assert isinstance(minfo["output_names"], list)
         assert len(minfo["output_names"]) >= 1
 
+        # Precision is best-effort: emitted only when derivable from the graph,
+        # and a non-empty string when present.
+        if "precision" in minfo:
+            assert isinstance(minfo["precision"], str)
+            assert minfo["precision"]
+
         # Verify raw samples count matches iterations
         assert len(data["raw_samples_ms"]) == 3
 

From 6427b41a6d2ec8da4fccfbc08f4b8c57a8a6fa4e Mon Sep 17 00:00:00 2001
From: Hualiang Xie <hualxie@microsoft.com>
Date: Thu, 11 Jun 2026 15:33:58 +0800
Subject: [PATCH 3/5] Fix mypy: annotate to_dict result as dict[str, Any]

---
 src/winml/modelkit/commands/perf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 807650b96..090ad18e3 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -139,7 +139,7 @@ class BenchmarkResult:
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for JSON serialization."""
-        result = {
+        result: dict[str, Any] = {
             "benchmark_info": {
                 "model_id": self.config.model_id,
                 "running_model_path": self.running_model_path,

From 8050ca58023a5e15e8929e8b4634045a1f3a3006 Mon Sep 17 00:00:00 2001
From: Hualiang Xie <hualxie@microsoft.com>
Date: Fri, 12 Jun 2026 11:07:39 +0800
Subject: [PATCH 4/5] Assert exact precision values in perf e2e test

---
 tests/e2e/test_perf_e2e.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py
index 52d6b8bf2..b0ec2e6ab 100644
--- a/tests/e2e/test_perf_e2e.py
+++ b/tests/e2e/test_perf_e2e.py
@@ -209,6 +209,7 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str):
         assert binfo["iterations"] == 3
         assert binfo["warmup"] == 1
         assert binfo["device"] == "cpu"
+        assert binfo["precision"] == "auto"
 
         # The real ONNX model ORT loaded is recorded and points at a file
         running_model = Path(binfo["running_model_path"])
@@ -227,12 +228,7 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str):
         assert len(minfo["input_names"]) >= 1
         assert isinstance(minfo["output_names"], list)
         assert len(minfo["output_names"]) >= 1
-
-        # Precision is best-effort: emitted only when derivable from the graph,
-        # and a non-empty string when present.
-        if "precision" in minfo:
-            assert isinstance(minfo["precision"], str)
-            assert minfo["precision"]
+        assert minfo["precision"] == "fp32"
 
         # Verify raw samples count matches iterations
         assert len(data["raw_samples_ms"]) == 3

From 4e9932c5ef5a80df2439cfe175e3d7bbae15015d Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 16:19:34 +0800
Subject: [PATCH 5/5] Address review: set running_model_path after load, always emit precision

---
 src/winml/modelkit/commands/perf.py           |  3 +-
 .../modelkit/session/qairt/qairt_session.py   |  4 +-
 src/winml/modelkit/session/session.py         |  8 +-
 tests/unit/commands/test_perf_module.py       | 96 +++++++++++++++++++
 4 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 090ad18e3..1c9b3cf3e 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -158,6 +158,7 @@ def to_dict(self) -> dict[str, Any]:
                 "input_types": self.input_types,
                 "output_names": self.output_names,
                 "output_shapes": self.output_shapes,
+                "precision": self.model_precision,
             },
             "latency_ms": {
                 "mean": round(self.mean_ms, 3),
@@ -176,8 +177,6 @@ def to_dict(self) -> dict[str, Any]:
             },
             "raw_samples_ms": [round(s, 3) for s in self.raw_samples_ms],
         }
-        if self.model_precision:
-            result["model_info"]["precision"] = self.model_precision
         if self.hw_monitor:
             result["hw_monitor"] = self.hw_monitor
         return result
diff --git a/src/winml/modelkit/session/qairt/qairt_session.py b/src/winml/modelkit/session/qairt/qairt_session.py
index 19c7a23a6..47dfe2b9f 100644
--- a/src/winml/modelkit/session/qairt/qairt_session.py
+++ b/src/winml/modelkit/session/qairt/qairt_session.py
@@ -233,8 +233,10 @@ def _create_inference_session(self) -> None:
         import onnxruntime as ort
 
         sess_options, _, _ = self._build_session_options(self._device)
-        self._running_model_path = self._ctx_path
         self._session = ort.InferenceSession(str(self._ctx_path), sess_options=sess_options)
+        # Record the loaded model only after the session is successfully
+        # created, so a failed load leaves running_model_path unset.
+        self._running_model_path = self._ctx_path
         self._state = SessionState.COMPILED
 
         actual_providers = self._session.get_providers()
diff --git a/src/winml/modelkit/session/session.py b/src/winml/modelkit/session/session.py
index 34f6c3f09..8bacf7135 100644
--- a/src/winml/modelkit/session/session.py
+++ b/src/winml/modelkit/session/session.py
@@ -249,9 +249,6 @@ def compile(self) -> None:
                     # Some EPs don't support compilation - fall back to original
                     logger.warning("ModelCompiler failed, using original: %s", e)
 
-        # Record the model ORT actually loads (original or EPContext).
-        self._running_model_path = model_path
-
         try:
             # Create InferenceSession.
             # EP is either configured via add_provider_for_devices (WinML EP
@@ -282,8 +279,11 @@ def compile(self) -> None:
             actual_providers,
         )
 
-        # Store session
+        # Store session. Record the model ORT actually loaded (original or
+        # EPContext) only after the session is successfully created, so a
+        # failed compile leaves running_model_path unset rather than stale.
         self._session = session
+        self._running_model_path = model_path
         self._state = SessionState.COMPILED
 
         # Resolve device label from the primary provider ORT actually selected
diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py
index 051bae425..f08f5d3d7 100644
--- a/tests/unit/commands/test_perf_module.py
+++ b/tests/unit/commands/test_perf_module.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING
 from unittest.mock import MagicMock, patch
 
@@ -218,3 +219,98 @@ def test_device_and_ep_forwarded_through_module_path(self, tmp_path: Path) -> No
         session_kwargs = mock_session_cls.call_args.kwargs
         assert session_kwargs["device"] == "npu"
         assert session_kwargs["ep"] == "qnn"
+
+    def test_running_model_path_in_module_result(self, tmp_path: Path) -> None:
+        """A completed module benchmark records running_model_path in its
+        per-instance result entry.
+
+        Unlike the forwarding test above (which short-circuits the benchmark
+        loop via a RuntimeError), this drives a successful run so result_entry
+        is actually populated, then reads it back from the JSON report.
+        """
+        fake_cfg = MagicMock()
+        fake_cfg.loader.model_type = "bert"
+        fake_cfg.loader.module_path = "encoder.layer.0"
+
+        fake_build_result = MagicMock()
+        fake_build_result.final_onnx_path = tmp_path / "model.onnx"
+
+        # Stats yielded by `with session.perf(...) as stats` — needs real
+        # numbers since result_entry rounds/divides them.
+        fake_stats = MagicMock()
+        fake_stats.mean_ms = 1.0
+        fake_stats.p50_ms = 1.0
+        fake_stats.p90_ms = 1.0
+        fake_stats.p95_ms = 1.0
+        fake_stats.p99_ms = 1.0
+        fake_stats.min_ms = 1.0
+        fake_stats.max_ms = 1.0
+        fake_stats.samples_ms = [1.0, 1.0]
+
+        running_model_path = tmp_path / "model_cpu_ctx.onnx"
+        fake_session = MagicMock()
+        fake_session.perf.return_value.__enter__.return_value = fake_stats
+        fake_session.running_model_path = running_model_path
+
+        fake_loader_cfg = MagicMock()
+        fake_loader_cfg.task = "fill-mask"
+
+        out_path = tmp_path / "out.json"
+
+        with (
+            patch(
+                "winml.modelkit.sysinfo.resolve_device",
+                return_value=("npu", "qnn"),
+            ),
+            patch(
+                "winml.modelkit.config.generate_hf_build_config",
+                return_value=[fake_cfg],
+            ),
+            patch(
+                "winml.modelkit.loader.resolve_loader_config",
+                return_value=(fake_loader_cfg, MagicMock(), MagicMock()),
+            ),
+            patch(
+                "winml.modelkit.commands.build._instantiate_parent_model",
+                return_value=MagicMock(),
+            ),
+            patch(
+                "winml.modelkit.build.build_hf_model",
+                return_value=fake_build_result,
+            ),
+            patch(
+                "winml.modelkit.session.WinMLSession",
+                return_value=fake_session,
+            ),
+            patch(
+                "winml.modelkit.commands.perf.generate_random_inputs",
+                return_value={},
+            ),
+        ):
+            runner = CliRunner()
+            result = runner.invoke(
+                main,
+                [
+                    "perf",
+                    "-m",
+                    "fake/model",
+                    "--module",
+                    "BertLayer",
+                    "--device",
+                    "npu",
+                    "--ep",
+                    "qnn",
+                    "--iterations",
+                    "1",
+                    "--warmup",
+                    "0",
+                    "-o",
+                    str(out_path),
+                ],
+            )
+
+        assert result.exit_code == 0, result.output
+
+        report = json.loads(out_path.read_text(encoding="utf-8"))
+        instance = report["instances"][0]
+        assert instance["running_model_path"] == str(running_model_path)