Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion src/winml/modelkit/commands/perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ class BenchmarkResult:
output_names: list[str] = field(default_factory=list)
output_shapes: list[list[int]] = field(default_factory=list)

# Resolved model precision from io_config (None if the model does not
# expose one). Distinct from the requested config.precision policy.
model_precision: str | None = None

# Latency stats (milliseconds)
mean_ms: float = 0.0
min_ms: float = 0.0
Expand All @@ -126,14 +130,19 @@ class BenchmarkResult:
actual_task: str = ""
actual_ep: EPName | None = None

# ONNX model ORT actually loaded (may be an EPContext model, differing
# from the input model_id when compiled or a cached one is reused)
running_model_path: str = ""

# Hardware monitor metrics (from HWMonitor.to_dict())
hw_monitor: dict[str, Any] | None = None

def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
result = {
result: dict[str, Any] = {
"benchmark_info": {
"model_id": self.config.model_id,
"running_model_path": self.running_model_path,
"task": self.actual_task,
"device": self.actual_device,
"ep": self.actual_ep,
Expand Down Expand Up @@ -167,6 +176,8 @@ def to_dict(self) -> dict[str, Any]:
},
"raw_samples_ms": [round(s, 3) for s in self.raw_samples_ms],
}
if self.model_precision:
result["model_info"]["precision"] = self.model_precision
if self.hw_monitor:
result["hw_monitor"] = self.hw_monitor
return result
Expand Down Expand Up @@ -497,6 +508,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
input_types=[str(t) for t in io_config["input_types"]],
output_names=io_config["output_names"],
output_shapes=[list(s) if s else [] for s in io_config["output_shapes"]],
model_precision=io_config.get("precision"),
# Latency stats
mean_ms=stats.mean_ms,
min_ms=stats.min_ms,
Expand All @@ -515,6 +527,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
actual_device=self._model.device,
actual_task=self._model.task or self.config.task or "auto-detected",
actual_ep=self._model.ep_name,
running_model_path=str(self._model.running_model_path),
# Hardware monitor metrics (only present when --monitor is used)
hw_monitor=getattr(self, "_hw_metrics", None),
)
Expand Down Expand Up @@ -707,6 +720,7 @@ def _perf_modules(
mod_stats = stats
result_entry: dict[str, Any] = {
"module_path": module_path,
"running_model_path": str(session.running_model_path),
"mean_ms": round(mod_stats.mean_ms, 3),
"p50_ms": round(mod_stats.p50_ms, 3),
"p90_ms": round(mod_stats.p90_ms, 3),
Expand Down
9 changes: 9 additions & 0 deletions src/winml/modelkit/models/winml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ def onnx_path(self) -> Path:
"""Path to the ONNX model file."""
return self._onnx_path

@property
def running_model_path(self) -> Path:
    """Path to the ONNX model the session actually loads.

    Thin delegate: the value is read from the wrapped session's
    ``running_model_path`` on every access and is not cached here.

    Differs from ``onnx_path`` when the session compiles or reuses an
    EPContext model. Falls back to ``onnx_path`` before compilation.
    """
    # NOTE(review): assumes ``self._session`` is already set when this is
    # read, and that the session's fallback path equals this model's
    # ``onnx_path`` -- confirm against the constructor.
    return self._session.running_model_path

def _format_inputs(
self,
data: torch.Tensor | np.ndarray | list | dict | None = None,
Expand Down
1 change: 1 addition & 0 deletions src/winml/modelkit/session/qairt/qairt_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def _create_inference_session(self) -> None:
import onnxruntime as ort

sess_options, _, _ = self._build_session_options(self._device)
self._running_model_path = self._ctx_path
self._session = ort.InferenceSession(str(self._ctx_path), sess_options=sess_options)
self._state = SessionState.COMPILED

Expand Down
17 changes: 17 additions & 0 deletions src/winml/modelkit/session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ def __init__(
# Single session (one session = one EP)
self._session: ort.InferenceSession | None = None

# ONNX model ORT actually loads (set during compile()). May differ from
# _onnx_path when an EPContext model is compiled or a cached one reused.
self._running_model_path: Path | None = None

# Cached I/O metadata (lazy-loaded)
self._io_config: dict | None = None

Expand Down Expand Up @@ -245,6 +249,9 @@ def compile(self) -> None:
# Some EPs don't support compilation - fall back to original
logger.warning("ModelCompiler failed, using original: %s", e)

# Record the model ORT actually loads (original or EPContext).
self._running_model_path = model_path

try:
# Create InferenceSession.
# EP is either configured via add_provider_for_devices (WinML EP
Expand Down Expand Up @@ -505,6 +512,16 @@ def is_compiled(self) -> bool:
"""Check if session is compiled."""
return self._session is not None

@property
def running_model_path(self) -> Path:
    """Path to the ONNX model ORT actually loads.

    May differ from the input ``onnx_path`` when an EPContext model is
    compiled or a cached one is reused. Falls back to the input path
    while no running path has been recorded, i.e. before ``compile()``
    runs.
    """
    # Compare against None explicitly instead of relying on truthiness, so
    # the fallback triggers only when no running path has been recorded.
    if self._running_model_path is not None:
        return self._running_model_path
    return self._onnx_path

@property
def perf_stats(self) -> PerfStats | None:
"""Performance statistics (None if not in perf() context).
Expand Down
17 changes: 12 additions & 5 deletions tests/e2e/test_perf_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

import json
import sys
from typing import TYPE_CHECKING
from pathlib import Path

import pytest
from click.testing import CliRunner
Expand All @@ -41,10 +41,6 @@
from winml.modelkit.utils.constants import EP_ALIASES


if TYPE_CHECKING:
from pathlib import Path


pytestmark = [pytest.mark.e2e]


Expand Down Expand Up @@ -214,6 +210,11 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str):
assert binfo["warmup"] == 1
assert binfo["device"] == "cpu"

# The real ONNX model ORT loaded is recorded and points at a file
running_model = Path(binfo["running_model_path"])
assert running_model.suffix == ".onnx"
assert running_model.exists()

# Verify latency stats are populated
latency = data["latency_ms"]
assert latency["mean"] > 0
Expand All @@ -227,6 +228,12 @@ def test_benchmark_cpu(self, tmp_path: Path, model_arg: str):
assert isinstance(minfo["output_names"], list)
assert len(minfo["output_names"]) >= 1

# Precision is best-effort: emitted only when derivable from the graph,
# and a non-empty string when present.
if "precision" in minfo:
assert isinstance(minfo["precision"], str)
assert minfo["precision"]

# Verify raw samples count matches iterations
assert len(data["raw_samples_ms"]) == 3

Expand Down
Loading