From a0b72d324471e8271f67ed87af1d10f2578d41d9 Mon Sep 17 00:00:00 2001 From: hualxie Date: Wed, 10 Jun 2026 17:20:23 +0800 Subject: [PATCH 1/9] fix(perf): support composite (dual-encoder) models in winml perf `winml perf` assumed every model exposes a single `io_config`/`_session`, so composite models (CLIP/SigLIP zero-shot-image-classification) crashed with `AttributeError: ... has no attribute io_config` during input generation. Make `PerfBenchmark` composite-aware: - `_aggregate_io_config()` unions the sub-models' inputs (their union is exactly the composite forward() kwargs) for input generation/display. - Time the full `forward()` pass via an external PerfStats; single-session models keep recording pure-ORT time inside session.perf(). The monitored loop is refactored to take a run-iteration callable so both paths share it. - Device/EP/task are resolved from a representative sub-model. - `_probe_composite_outputs()` runs one forward() and introspects the result so reported outputs are the composite's task-level tensors (e.g. logits_per_image) rather than a deduped union of sub-model ONNX outputs. Add tests/unit/commands/test_perf_composite.py covering aggregation, output describing/probing, input generation, device/EP/task resolution, and the full-forward timing path. 
--- src/winml/modelkit/commands/perf.py | 240 ++++++++++++++++++-- tests/unit/commands/test_perf_composite.py | 249 +++++++++++++++++++++ 2 files changed, 466 insertions(+), 23 deletions(-) create mode 100644 tests/unit/commands/test_perf_composite.py diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index b53578d3d..88e2ac07f 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -14,6 +14,7 @@ from __future__ import annotations +import contextlib import json import logging from dataclasses import dataclass, field @@ -33,6 +34,8 @@ if TYPE_CHECKING: + from collections.abc import Callable, Iterable + from ..models.winml.base import WinMLPreTrainedModel from ..session.stats import PerfStats @@ -258,6 +261,81 @@ def _resolve_shape( return tuple(resolved) +def _aggregate_io_config(sub_models: Iterable[Any]) -> dict[str, Any]: + """Merge a composite model's sub-model io_configs into one unified view. + + Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT + session; they orchestrate several sub-models. For benchmarking we present a + unified io_config whose inputs are the union of every sub-model's inputs + (deduplicated by name, order preserved) -- which is exactly the set of + keyword arguments the composite's ``forward()`` consumes. Outputs are + likewise unioned for display and result-reporting purposes. 
+ """ + agg: dict[str, Any] = { + "input_names": [], + "input_shapes": [], + "input_types": [], + "output_names": [], + "output_shapes": [], + "output_types": [], + "precision": None, + } + seen_in: set[str] = set() + seen_out: set[str] = set() + for sub in sub_models: + io = sub.io_config + for name, shape, dtype in zip( + io["input_names"], io["input_shapes"], io["input_types"], strict=True + ): + if name not in seen_in: + seen_in.add(name) + agg["input_names"].append(name) + agg["input_shapes"].append(shape) + agg["input_types"].append(dtype) + out_types = io.get("output_types") or [None] * len(io["output_names"]) + for name, shape, dtype in zip( + io["output_names"], io["output_shapes"], out_types, strict=False + ): + if name not in seen_out: + seen_out.add(name) + agg["output_names"].append(name) + agg["output_shapes"].append(shape) + agg["output_types"].append(dtype) + if agg["precision"] is None: + agg["precision"] = io.get("precision") + return agg + + +def _describe_outputs(output: Any) -> tuple[list[str], list[list[int]], list[str | None]]: + """Extract ``(names, shapes, dtypes)`` from a model ``forward()`` result. + + Architecture-agnostic: handles HuggingFace ``ModelOutput`` / ``dict`` + (named fields), plain sequences (positional ``output_N`` names), and a + single tensor. ``None`` fields and non-array values are skipped. Used to + report a composite model's real task-level outputs (e.g. ``logits``) + rather than its sub-models' raw ONNX outputs. 
+ """ + if hasattr(output, "items"): + pairs = list(output.items()) + elif isinstance(output, (list, tuple)): + pairs = [(f"output_{i}", value) for i, value in enumerate(output)] + else: + pairs = [("output_0", output)] + + names: list[str] = [] + shapes: list[list[int]] = [] + types: list[str | None] = [] + for name, value in pairs: + shape = getattr(value, "shape", None) + if value is None or shape is None: + continue + names.append(name) + shapes.append([int(dim) for dim in shape]) + dtype = getattr(value, "dtype", None) + types.append(str(dtype) if dtype is not None else None) + return names, shapes, types + + # ============================================================================= # Benchmark Engine # ============================================================================= @@ -281,6 +359,77 @@ def __init__(self, config: BenchmarkConfig) -> None: self.config = config self._model: WinMLPreTrainedModel | None = None self._inputs: dict[str, np.ndarray] | None = None + self._io_config: dict[str, Any] | None = None + + @property + def _is_composite(self) -> bool: + """Composite models orchestrate multiple sub-sessions (e.g. 
CLIP/SigLIP).""" + return hasattr(self._model, "sub_models") + + def _resolved_io_config(self) -> dict[str, Any]: + """Unified io_config (aggregated across sub-models for composites).""" + if self._io_config is None: + assert self._model is not None + if self._is_composite: + self._io_config = _aggregate_io_config(self._model.sub_models.values()) + else: + self._io_config = self._model.io_config + return self._io_config + + def _compile_model(self) -> None: + """Compile the underlying ORT session(s) so device/EP are resolved.""" + assert self._model is not None + if self._is_composite: + for sub in self._model.sub_models.values(): + sub._session.compile() + else: + self._model._session.compile() + + def _resolved_device(self) -> str: + """Actual device bound after compile (representative sub-model for composites).""" + assert self._model is not None + if self._is_composite: + return next(iter(self._model.sub_models.values())).device + return self._model.device + + def _resolved_ep(self) -> EPName | None: + """Primary EP bound after compile (representative sub-model for composites).""" + assert self._model is not None + if self._is_composite: + return next(iter(self._model.sub_models.values())).ep_name + return self._model.ep_name + + def _resolved_task(self) -> str | None: + """Resolved task; composites fall back to the requested task.""" + assert self._model is not None + if self._is_composite: + return self.config.task + return self._model.task or self.config.task + + def _probe_composite_outputs(self) -> None: + """Overwrite a composite's reported outputs with its real forward() result. + + Runs one ``forward()`` pass (an extra warmup) and introspects the + returned object so the displayed/reported outputs reflect the + composite's task-level tensors (e.g. ``logits_per_image``) instead of + the deduplicated union of its sub-models' raw ONNX outputs. Falls back + to the aggregated view if the probe fails or yields nothing. 
+ """ + assert self._model is not None + assert self._inputs is not None + try: + output = self._model(**self._inputs) + except Exception: # best-effort display only; never fail the run + logger.debug("Composite output probe failed; keeping aggregated view", exc_info=True) + return + + names, shapes, types = _describe_outputs(output) + if not names: + return + io = self._resolved_io_config() + io["output_names"] = names + io["output_shapes"] = shapes + io["output_types"] = types def run(self) -> BenchmarkResult: """Execute full benchmark pipeline. @@ -297,16 +446,21 @@ def run(self) -> BenchmarkResult: logger.info("Generating benchmark inputs") self._generate_inputs() - # Compile session early so model.device is resolved for display - self._model._session.compile() + # Compile session(s) early so model.device is resolved for display + self._compile_model() + + # Composite forward() returns task-level outputs (e.g. logits) that + # don't map to any single sub-model's ONNX outputs; probe the real ones. 
+ if self._is_composite: + self._probe_composite_outputs() # Print model info before benchmark starts _print_model_info( - self._model.io_config, - task=self._model.task or self.config.task, + self._resolved_io_config(), + task=self._resolved_task(), req_device=self.config.device, - act_device=self._model.device, - ep_name=self._model.ep_name, + act_device=self._resolved_device(), + ep_name=self._resolved_ep(), ) # [3] Run benchmark @@ -378,12 +532,18 @@ def _load_model(self) -> None: def _generate_inputs(self) -> None: """Generate random inputs based on model io_config.""" assert self._model is not None - io_config = self._model.io_config + io_config = self._resolved_io_config() self._inputs = generate_random_inputs( io_config=io_config, batch_size=self.config.batch_size, ) + def _composite_run_iteration(self, stats: PerfStats) -> None: + """Time one full composite forward() pass (orchestrates all sub-sessions).""" + assert self._model is not None + assert self._inputs is not None + stats.record(lambda: self._model(**self._inputs)) + def _run_benchmark(self) -> PerfStats: """Execute benchmark iterations with timing.""" if self.config.monitor: @@ -394,9 +554,21 @@ def _run_benchmark_simple(self) -> PerfStats: """Execute benchmark without live monitoring.""" assert self._model is not None assert self._inputs is not None - session = self._model._session total_iterations = self.config.warmup + self.config.iterations + # Composite models have no single ORT session; time the full forward() + # pass with an external PerfStats instead of the session's perf() hook. 
+ if self._is_composite: + from ..session.stats import PerfStats + + stats = PerfStats(warmup=self.config.warmup) + for i in range(total_iterations): + self._composite_run_iteration(stats) + if (i + 1) % max(1, total_iterations // 10) == 0: + logger.debug("Progress: %d/%d", i + 1, total_iterations) + return stats + + session = self._model._session with session.perf(warmup=self.config.warmup) as stats: _run_simple_loop(session, self._inputs, total_iterations) @@ -413,10 +585,10 @@ def _run_benchmark_monitored(self) -> PerfStats: from ..session.monitor.ep_monitor import NullEPMonitor from ..session.monitor.hw_monitor import HWMonitor from ..session.monitor.vitisai_monitor import VitisAIMonitor + from ..session.stats import PerfStats assert self._model is not None assert self._inputs is not None - session = self._model._session total_iterations = self.config.warmup + self.config.iterations if not HWMonitor.is_available(): @@ -430,31 +602,49 @@ def _run_benchmark_monitored(self) -> PerfStats: # GPU when --device gpu is specified, NPU when --device npu, etc. # ep_name lets the monitor resolve the exact LUID via ORT's autoEP # metadata so we follow the adapter the session actually binds to. - monitor_device = self._model.device or self.config.device or "auto" + ep_name = self._resolved_ep() + monitor_device = self._resolved_device() or self.config.device or "auto" hw_monitor = HWMonitor( poll_interval_ms=_HW_POLL_INTERVAL_MS, device=monitor_device, - ep_name=session.ep_name, + ep_name=ep_name, ) # EP-specific proof-of-execution monitor. # When QNN/OpenVINO monitors become real, add entries here. 
_ep_monitors: dict[EPName, Any] = {"VitisAIExecutionProvider": VitisAIMonitor} - monitor_cls = _ep_monitors.get(session.ep_name) if session.ep_name else None + monitor_cls = _ep_monitors.get(ep_name) if ep_name else None ep_monitor: Any if monitor_cls and monitor_cls.is_available(): ep_monitor = monitor_cls() else: ep_monitor = NullEPMonitor() + # Composite models time the full forward() pass via an external + # PerfStats; single-session models record pure-ORT time inside the + # session's perf() context. The run callable abstracts that difference. + if self._is_composite: + stats_cm: Any = contextlib.nullcontext(PerfStats(warmup=self.config.warmup)) + else: + stats_cm = self._model._session.perf(warmup=self.config.warmup) + with ( - session.perf(warmup=self.config.warmup) as stats, + stats_cm as stats, hw_monitor as hw, ep_monitor as ep_mon, ): + if self._is_composite: + + def run_iteration() -> None: + self._composite_run_iteration(stats) + else: + session = self._model._session + + def run_iteration() -> None: + session.run(self._inputs) + _run_monitored_loop( - session, - self._inputs, + run_iteration, stats, hw, total_iterations=total_iterations, @@ -474,7 +664,7 @@ def _run_benchmark_monitored(self) -> PerfStats: def _collect_results(self, stats: PerfStats) -> BenchmarkResult: """Collect benchmark results from PerfStats.""" assert self._model is not None - io_config = self._model.io_config + io_config = self._resolved_io_config() # Calculate throughput mean_latency_sec = stats.mean_ms / 1000.0 @@ -512,9 +702,9 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult: samples_per_sec=samples_per_sec, batches_per_sec=batches_per_sec, # Actual values (resolved after build + compile) - actual_device=self._model.device, - actual_task=self._model.task or self.config.task or "auto-detected", - actual_ep=self._model.ep_name, + actual_device=self._resolved_device(), + actual_task=self._resolved_task() or "auto-detected", + actual_ep=self._resolved_ep(), # 
Hardware monitor metrics (only present when --monitor is used) hw_monitor=getattr(self, "_hw_metrics", None), ) @@ -961,8 +1151,7 @@ def _print_model_info( def _run_monitored_loop( - session: Any, - inputs: dict[str, Any], + run_iteration: Callable[[], Any], stats: PerfStats, hw: Any, *, @@ -971,7 +1160,12 @@ def _run_monitored_loop( model_id: str, device: str, ) -> None: - """Run the benchmark iteration loop with live hardware monitoring.""" + """Run the benchmark iteration loop with live hardware monitoring. + + ``run_iteration`` runs (and times into ``stats``) a single inference. For + single-session models it invokes ``session.run`` inside the session's + perf() context; for composite models it records a full ``forward()`` pass. + """ display = LiveMonitorDisplay( total_iterations=total_iterations, warmup=warmup, @@ -981,7 +1175,7 @@ def _run_monitored_loop( ) with display: for i in range(total_iterations): - session.run(inputs) + run_iteration() latest_latency = stats.all_samples_ms[-1] if stats.all_samples_ms else 0 display.update( diff --git a/tests/unit/commands/test_perf_composite.py b/tests/unit/commands/test_perf_composite.py new file mode 100644 index 000000000..42c9defef --- /dev/null +++ b/tests/unit/commands/test_perf_composite.py @@ -0,0 +1,249 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for winml perf support of composite (multi-session) models. + +Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ONNX +session; they orchestrate several sub-models. The perf benchmark must +aggregate their io_configs and time the full ``forward()`` pass rather +than reaching for a single ``_session``. 
+ +Regression guard: previously ``PerfBenchmark`` assumed every model exposed +``io_config`` / ``_session`` and raised ``AttributeError`` on composites. +""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any + +import numpy as np + +from winml.modelkit.commands.perf import ( + BenchmarkConfig, + PerfBenchmark, + _aggregate_io_config, + _describe_outputs, +) + + +def _make_sub_model( + input_names: list[str], + input_shapes: list[list[int | None]], + input_types: list[str], + output_names: list[str], + output_shapes: list[list[int | None]], + *, + device: str = "GPU", + ep_name: str = "OpenVINOExecutionProvider", + precision: str | None = "fp16", +) -> Any: + """Build a minimal stand-in for a WinMLAutoModel sub-component.""" + io_config = { + "input_names": input_names, + "input_shapes": input_shapes, + "input_types": input_types, + "output_names": output_names, + "output_shapes": output_shapes, + "output_types": ["float32"] * len(output_names), + "precision": precision, + } + compiled: dict[str, bool] = {"compiled": False} + + def _compile() -> None: + compiled["compiled"] = True + + return SimpleNamespace( + io_config=io_config, + device=device, + ep_name=ep_name, + _session=SimpleNamespace(compile=_compile), + _compiled_flag=compiled, + ) + + +class _FakeComposite: + """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``).""" + + def __init__(self, sub_models: dict[str, Any]) -> None: + self.sub_models = sub_models + self.call_log: list[dict[str, np.ndarray]] = [] + + def __call__(self, **kwargs: np.ndarray) -> dict[str, np.ndarray]: + self.call_log.append(kwargs) + # Mimics a composite's task-level forward() output (e.g. SigLIP): + # tensors that exist on no single sub-model's ONNX graph. 
+ return { + "logits_per_image": np.zeros((1, 1), dtype=np.float32), + "image_embeds": np.zeros((1, 768), dtype=np.float32), + "text_embeds": np.zeros((1, 768), dtype=np.float32), + } + + +def _siglip_like() -> _FakeComposite: + image_encoder = _make_sub_model( + input_names=["pixel_values"], + input_shapes=[[1, 3, 224, 224]], + input_types=["float32"], + output_names=["image_embeds"], + output_shapes=[[1, 768]], + ) + text_encoder = _make_sub_model( + input_names=["input_ids", "attention_mask"], + input_shapes=[[1, 64], [1, 64]], + input_types=["int64", "int64"], + output_names=["text_embeds"], + output_shapes=[[1, 768]], + ) + return _FakeComposite({"image-encoder": image_encoder, "text-encoder": text_encoder}) + + +class TestAggregateIoConfig: + """Unit tests for the io_config union helper.""" + + def test_union_dedupes_by_name_preserving_order(self) -> None: + model = _siglip_like() + agg = _aggregate_io_config(model.sub_models.values()) + + assert agg["input_names"] == ["pixel_values", "input_ids", "attention_mask"] + assert agg["input_shapes"] == [[1, 3, 224, 224], [1, 64], [1, 64]] + assert agg["input_types"] == ["float32", "int64", "int64"] + assert agg["output_names"] == ["image_embeds", "text_embeds"] + + def test_shared_input_name_is_not_duplicated(self) -> None: + # Both encoders consume "attention_mask" -> it must appear once. 
+ a = _make_sub_model( + ["input_ids", "attention_mask"], + [[1, 8], [1, 8]], + ["int64", "int64"], + ["a"], + [[1, 4]], + ) + b = _make_sub_model( + ["attention_mask", "token_type_ids"], + [[1, 8], [1, 8]], + ["int64", "int64"], + ["b"], + [[1, 4]], + ) + agg = _aggregate_io_config([a, b]) + assert agg["input_names"] == ["input_ids", "attention_mask", "token_type_ids"] + + def test_precision_taken_from_first_sub_model(self) -> None: + a = _make_sub_model(["x"], [[1]], ["float32"], ["y"], [[1]], precision="int8") + b = _make_sub_model(["z"], [[1]], ["float32"], ["w"], [[1]], precision="fp16") + assert _aggregate_io_config([a, b])["precision"] == "int8" + + +class TestPerfBenchmarkComposite: + """PerfBenchmark must transparently handle composite models.""" + + def _benchmark(self) -> tuple[PerfBenchmark, _FakeComposite]: + config = BenchmarkConfig( + model_id="google/siglip-base-patch16-224", + task="zero-shot-image-classification", + device="gpu", + iterations=3, + warmup=1, + ) + bench = PerfBenchmark(config) + model = _siglip_like() + bench._model = model # bypass _load_model (no HF download in unit tests) + return bench, model + + def test_detects_composite(self) -> None: + bench, _ = self._benchmark() + assert bench._is_composite is True + + def test_resolved_io_config_is_aggregated_and_cached(self) -> None: + bench, _ = self._benchmark() + io = bench._resolved_io_config() + assert io["input_names"] == ["pixel_values", "input_ids", "attention_mask"] + # Cached: second call returns the same object. 
+ assert bench._resolved_io_config() is io + + def test_compile_compiles_every_sub_session(self) -> None: + bench, model = self._benchmark() + bench._compile_model() + assert all(s._compiled_flag["compiled"] for s in model.sub_models.values()) + + def test_generate_inputs_covers_all_sub_model_inputs(self) -> None: + bench, _ = self._benchmark() + bench._generate_inputs() + assert set(bench._inputs) == {"pixel_values", "input_ids", "attention_mask"} + assert bench._inputs["pixel_values"].shape == (1, 3, 224, 224) + assert bench._inputs["input_ids"].shape == (1, 64) + + def test_resolved_device_ep_task_from_sub_model(self) -> None: + bench, _ = self._benchmark() + assert bench._resolved_device() == "GPU" + assert bench._resolved_ep() == "OpenVINOExecutionProvider" + assert bench._resolved_task() == "zero-shot-image-classification" + + def test_simple_benchmark_times_full_forward(self) -> None: + bench, model = self._benchmark() + bench._generate_inputs() + stats = bench._run_benchmark_simple() + + # warmup(1) + iterations(3) == 4 forward() calls; stats excludes warmup. + assert len(model.call_log) == 4 + assert stats.count == 3 + # forward() received the generated inputs as kwargs. + assert set(model.call_log[0]) == {"pixel_values", "input_ids", "attention_mask"} + + def test_probe_replaces_outputs_with_real_forward_result(self) -> None: + # The aggregated view reports the image encoder's raw ONNX outputs; + # probing must replace them with the composite forward()'s outputs. 
+ bench, _ = self._benchmark() + bench._generate_inputs() + assert bench._resolved_io_config()["output_names"] == ["image_embeds", "text_embeds"] + + bench._probe_composite_outputs() + io = bench._resolved_io_config() + assert io["output_names"] == ["logits_per_image", "image_embeds", "text_embeds"] + assert io["output_shapes"] == [[1, 1], [1, 768], [1, 768]] + + def test_collect_results_reports_probed_outputs(self) -> None: + bench, _ = self._benchmark() + bench._generate_inputs() + bench._probe_composite_outputs() + stats = bench._run_benchmark_simple() + result = bench._collect_results(stats) + + assert result.input_names == ["pixel_values", "input_ids", "attention_mask"] + # Real composite outputs, not the deduped sub-model ONNX outputs. + assert result.output_names == ["logits_per_image", "image_embeds", "text_embeds"] + assert result.actual_device == "GPU" + assert result.actual_ep == "OpenVINOExecutionProvider" + assert result.actual_task == "zero-shot-image-classification" + + +class TestDescribeOutputs: + """Unit tests for the architecture-agnostic forward()-output describer.""" + + def test_dict_output_named_fields(self) -> None: + out = { + "logits": np.zeros((2, 5), dtype=np.float32), + "embeds": np.zeros((2, 8), dtype=np.float32), + } + names, shapes, types = _describe_outputs(out) + assert names == ["logits", "embeds"] + assert shapes == [[2, 5], [2, 8]] + assert all("float32" in t for t in types) + + def test_skips_none_and_non_array_fields(self) -> None: + out = {"a": np.zeros((1, 3)), "b": None, "c": "not-an-array"} + names, shapes, _ = _describe_outputs(out) + assert names == ["a"] + assert shapes == [[1, 3]] + + def test_sequence_output_positional_names(self) -> None: + names, shapes, _ = _describe_outputs([np.zeros((1, 4)), np.zeros((1, 2))]) + assert names == ["output_0", "output_1"] + assert shapes == [[1, 4], [1, 2]] + + def test_single_tensor_output(self) -> None: + names, shapes, _ = _describe_outputs(np.zeros((3, 3))) + assert names == 
["output_0"] + assert shapes == [[3, 3]] From 0bd7bb4751c22e751d58a1ca54184411242ed84f Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 11 Jun 2026 09:48:01 +0800 Subject: [PATCH 2/9] mypy --- src/winml/modelkit/commands/perf.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 8e5c5542c..548e27d8b 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -20,7 +20,7 @@ from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import click import numpy as np @@ -37,6 +37,7 @@ from collections.abc import Callable, Iterable from ..models.winml.base import WinMLPreTrainedModel + from ..models.winml.composite_model import WinMLCompositeModel from ..session.stats import PerfStats logger = logging.getLogger(__name__) @@ -366,12 +367,16 @@ def _is_composite(self) -> bool: """Composite models orchestrate multiple sub-sessions (e.g. 
CLIP/SigLIP).""" return hasattr(self._model, "sub_models") + def _sub_models(self) -> dict[str, WinMLPreTrainedModel]: + """Sub-models of a composite model (only valid when ``_is_composite``).""" + return cast("WinMLCompositeModel", self._model).sub_models + def _resolved_io_config(self) -> dict[str, Any]: """Unified io_config (aggregated across sub-models for composites).""" if self._io_config is None: assert self._model is not None if self._is_composite: - self._io_config = _aggregate_io_config(self._model.sub_models.values()) + self._io_config = _aggregate_io_config(self._sub_models().values()) else: self._io_config = self._model.io_config return self._io_config @@ -380,7 +385,7 @@ def _compile_model(self) -> None: """Compile the underlying ORT session(s) so device/EP are resolved.""" assert self._model is not None if self._is_composite: - for sub in self._model.sub_models.values(): + for sub in self._sub_models().values(): sub._session.compile() else: self._model._session.compile() @@ -389,14 +394,14 @@ def _resolved_device(self) -> str: """Actual device bound after compile (representative sub-model for composites).""" assert self._model is not None if self._is_composite: - return next(iter(self._model.sub_models.values())).device + return next(iter(self._sub_models().values())).device return self._model.device def _resolved_ep(self) -> EPName | None: """Primary EP bound after compile (representative sub-model for composites).""" assert self._model is not None if self._is_composite: - return next(iter(self._model.sub_models.values())).ep_name + return next(iter(self._sub_models().values())).ep_name return self._model.ep_name def _resolved_task(self) -> str | None: @@ -542,7 +547,9 @@ def _composite_run_iteration(self, stats: PerfStats) -> None: """Time one full composite forward() pass (orchestrates all sub-sessions).""" assert self._model is not None assert self._inputs is not None - stats.record(lambda: self._model(**self._inputs)) + model = self._model + 
inputs = self._inputs + stats.record(lambda: model(**inputs)) def _run_benchmark(self) -> PerfStats: """Execute benchmark iterations with timing.""" @@ -639,9 +646,10 @@ def run_iteration() -> None: self._composite_run_iteration(stats) else: session = self._model._session + inputs = self._inputs def run_iteration() -> None: - session.run(self._inputs) + session.run(inputs) _run_monitored_loop( run_iteration, @@ -659,7 +667,7 @@ def run_iteration() -> None: if ep_dict: # NullEPMonitor returns {}, real monitors return data self._hw_metrics["ep_proof"] = ep_dict - return stats + return cast("PerfStats", stats) def _collect_results(self, stats: PerfStats) -> BenchmarkResult: """Collect benchmark results from PerfStats.""" From 3fc29435d0b56361706926064364b669e5860d6a Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 14:01:58 +0800 Subject: [PATCH 3/9] what's now --- src/winml/modelkit/commands/perf.py | 297 +++++++--------- tests/unit/commands/test_perf_composite.py | 392 +++++++++++---------- 2 files changed, 331 insertions(+), 358 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 548e27d8b..614c59a49 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -14,7 +14,6 @@ from __future__ import annotations -import contextlib import json import logging from dataclasses import dataclass, field @@ -34,7 +33,7 @@ if TYPE_CHECKING: - from collections.abc import Callable, Iterable + from collections.abc import Callable from ..models.winml.base import WinMLPreTrainedModel from ..models.winml.composite_model import WinMLCompositeModel @@ -262,81 +261,6 @@ def _resolve_shape( return tuple(resolved) -def _aggregate_io_config(sub_models: Iterable[Any]) -> dict[str, Any]: - """Merge a composite model's sub-model io_configs into one unified view. - - Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT - session; they orchestrate several sub-models. 
For benchmarking we present a - unified io_config whose inputs are the union of every sub-model's inputs - (deduplicated by name, order preserved) -- which is exactly the set of - keyword arguments the composite's ``forward()`` consumes. Outputs are - likewise unioned for display and result-reporting purposes. - """ - agg: dict[str, Any] = { - "input_names": [], - "input_shapes": [], - "input_types": [], - "output_names": [], - "output_shapes": [], - "output_types": [], - "precision": None, - } - seen_in: set[str] = set() - seen_out: set[str] = set() - for sub in sub_models: - io = sub.io_config - for name, shape, dtype in zip( - io["input_names"], io["input_shapes"], io["input_types"], strict=True - ): - if name not in seen_in: - seen_in.add(name) - agg["input_names"].append(name) - agg["input_shapes"].append(shape) - agg["input_types"].append(dtype) - out_types = io.get("output_types") or [None] * len(io["output_names"]) - for name, shape, dtype in zip( - io["output_names"], io["output_shapes"], out_types, strict=False - ): - if name not in seen_out: - seen_out.add(name) - agg["output_names"].append(name) - agg["output_shapes"].append(shape) - agg["output_types"].append(dtype) - if agg["precision"] is None: - agg["precision"] = io.get("precision") - return agg - - -def _describe_outputs(output: Any) -> tuple[list[str], list[list[int]], list[str | None]]: - """Extract ``(names, shapes, dtypes)`` from a model ``forward()`` result. - - Architecture-agnostic: handles HuggingFace ``ModelOutput`` / ``dict`` - (named fields), plain sequences (positional ``output_N`` names), and a - single tensor. ``None`` fields and non-array values are skipped. Used to - report a composite model's real task-level outputs (e.g. ``logits``) - rather than its sub-models' raw ONNX outputs. 
- """ - if hasattr(output, "items"): - pairs = list(output.items()) - elif isinstance(output, (list, tuple)): - pairs = [(f"output_{i}", value) for i, value in enumerate(output)] - else: - pairs = [("output_0", output)] - - names: list[str] = [] - shapes: list[list[int]] = [] - types: list[str | None] = [] - for name, value in pairs: - shape = getattr(value, "shape", None) - if value is None or shape is None: - continue - names.append(name) - shapes.append([int(dim) for dim in shape]) - dtype = getattr(value, "dtype", None) - types.append(str(dtype) if dtype is not None else None) - return names, shapes, types - - # ============================================================================= # Benchmark Engine # ============================================================================= @@ -372,93 +296,83 @@ def _sub_models(self) -> dict[str, WinMLPreTrainedModel]: return cast("WinMLCompositeModel", self._model).sub_models def _resolved_io_config(self) -> dict[str, Any]: - """Unified io_config (aggregated across sub-models for composites).""" + """I/O config of the (single-session) model being benchmarked.""" if self._io_config is None: assert self._model is not None - if self._is_composite: - self._io_config = _aggregate_io_config(self._sub_models().values()) - else: - self._io_config = self._model.io_config + self._io_config = self._model.io_config return self._io_config def _compile_model(self) -> None: - """Compile the underlying ORT session(s) so device/EP are resolved.""" + """Compile the underlying ORT session so device/EP are resolved.""" assert self._model is not None - if self._is_composite: - for sub in self._sub_models().values(): - sub._session.compile() - else: - self._model._session.compile() + self._model._session.compile() def _resolved_device(self) -> str: - """Actual device bound after compile (representative sub-model for composites).""" + """Actual device bound after compile.""" assert self._model is not None - if self._is_composite: - 
return next(iter(self._sub_models().values())).device return self._model.device def _resolved_ep(self) -> EPName | None: - """Primary EP bound after compile (representative sub-model for composites).""" + """Primary EP bound after compile.""" assert self._model is not None - if self._is_composite: - return next(iter(self._sub_models().values())).ep_name return self._model.ep_name def _resolved_task(self) -> str | None: - """Resolved task; composites fall back to the requested task.""" + """Resolved task; falls back to the requested task.""" assert self._model is not None - if self._is_composite: - return self.config.task return self._model.task or self.config.task - def _probe_composite_outputs(self) -> None: - """Overwrite a composite's reported outputs with its real forward() result. + def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]: + """Execute full benchmark pipeline. - Runs one ``forward()`` pass (an extra warmup) and introspects the - returned object so the displayed/reported outputs reflect the - composite's task-level tensors (e.g. ``logits_per_image``) instead of - the deduplicated union of its sub-models' raw ONNX outputs. Falls back - to the aggregated view if the probe fails or yields nothing. + Returns: + A single ``BenchmarkResult`` for single-session models, or a + ``{sub_model_name: BenchmarkResult}`` mapping for composite models + (e.g. CLIP/SigLIP dual-encoders). Composite models have no single + ORT session, so each sub-model is benchmarked individually rather + than timing the aggregate ``forward()`` pass. 
""" + # [1] Load model + logger.info("Loading model: %s", self.config.model_id) + self._load_model() assert self._model is not None - assert self._inputs is not None - try: - output = self._model(**self._inputs) - except Exception: # best-effort display only; never fail the run - logger.debug("Composite output probe failed; keeping aggregated view", exc_info=True) - return - names, shapes, types = _describe_outputs(output) - if not names: - return - io = self._resolved_io_config() - io["output_names"] = names - io["output_shapes"] = shapes - io["output_types"] = types + if self._is_composite: + return self._run_sub_models() + return self._run_single() - def run(self) -> BenchmarkResult: - """Execute full benchmark pipeline. + def _run_sub_models(self) -> dict[str, BenchmarkResult]: + """Benchmark each sub-model of a composite individually. + + Each sub-model is itself a single-session ``WinMLAutoModel``, so it is + benchmarked through the standard single-model pipeline by spawning a + child ``PerfBenchmark`` with the already-loaded sub-model. Results are + keyed by sub-model name for per-component reporting. + """ + results: dict[str, BenchmarkResult] = {} + for name, sub in self._sub_models().items(): + logger.info("Benchmarking sub-model '%s'", name) + Console(stderr=True).print(f"\n[bold]Sub-model:[/bold] {name}") + child = PerfBenchmark(self.config) + child._model = sub + results[name] = child._run_single() + return results + + def _run_single(self) -> BenchmarkResult: + """Benchmark the loaded single-session model. 
Returns: BenchmarkResult with timing statistics """ - # [1] Load model - logger.info("Loading model: %s", self.config.model_id) - self._load_model() assert self._model is not None # [2] Generate inputs logger.info("Generating benchmark inputs") self._generate_inputs() - # Compile session(s) early so model.device is resolved for display + # Compile session early so model.device is resolved for display self._compile_model() - # Composite forward() returns task-level outputs (e.g. logits) that - # don't map to any single sub-model's ONNX outputs; probe the real ones. - if self._is_composite: - self._probe_composite_outputs() - # Print model info before benchmark starts _print_model_info( self._resolved_io_config(), @@ -543,14 +457,6 @@ def _generate_inputs(self) -> None: batch_size=self.config.batch_size, ) - def _composite_run_iteration(self, stats: PerfStats) -> None: - """Time one full composite forward() pass (orchestrates all sub-sessions).""" - assert self._model is not None - assert self._inputs is not None - model = self._model - inputs = self._inputs - stats.record(lambda: model(**inputs)) - def _run_benchmark(self) -> PerfStats: """Execute benchmark iterations with timing.""" if self.config.monitor: @@ -563,18 +469,6 @@ def _run_benchmark_simple(self) -> PerfStats: assert self._inputs is not None total_iterations = self.config.warmup + self.config.iterations - # Composite models have no single ORT session; time the full forward() - # pass with an external PerfStats instead of the session's perf() hook. 
- if self._is_composite: - from ..session.stats import PerfStats - - stats = PerfStats(warmup=self.config.warmup) - for i in range(total_iterations): - self._composite_run_iteration(stats) - if (i + 1) % max(1, total_iterations // 10) == 0: - logger.debug("Progress: %d/%d", i + 1, total_iterations) - return stats - session = self._model._session with session.perf(warmup=self.config.warmup) as stats: _run_simple_loop(session, self._inputs, total_iterations) @@ -592,7 +486,6 @@ def _run_benchmark_monitored(self) -> PerfStats: from ..session.monitor.ep_monitor import NullEPMonitor from ..session.monitor.hw_monitor import HWMonitor from ..session.monitor.vitisai_monitor import VitisAIMonitor - from ..session.stats import PerfStats assert self._model is not None assert self._inputs is not None @@ -627,29 +520,16 @@ def _run_benchmark_monitored(self) -> PerfStats: else: ep_monitor = NullEPMonitor() - # Composite models time the full forward() pass via an external - # PerfStats; single-session models record pure-ORT time inside the - # session's perf() context. The run callable abstracts that difference. 
- if self._is_composite: - stats_cm: Any = contextlib.nullcontext(PerfStats(warmup=self.config.warmup)) - else: - stats_cm = self._model._session.perf(warmup=self.config.warmup) - with ( - stats_cm as stats, + self._model._session.perf(warmup=self.config.warmup) as stats, hw_monitor as hw, ep_monitor as ep_mon, ): - if self._is_composite: - - def run_iteration() -> None: - self._composite_run_iteration(stats) - else: - session = self._model._session - inputs = self._inputs + session = self._model._session + inputs = self._inputs - def run_iteration() -> None: - session.run(inputs) + def run_iteration() -> None: + session.run(inputs) _run_monitored_loop( run_iteration, @@ -1084,6 +964,74 @@ def write_json_report(result: BenchmarkResult, output_path: Path) -> None: json.dump(result.to_dict(), f, indent=2) +def _composite_report_dict( + results: dict[str, BenchmarkResult], + *, + model_id: str, + task: str | None, +) -> dict[str, Any]: + """Build the combined JSON report for a composite model's sub-models.""" + return { + "model_id": model_id, + "task": task, + "component_count": len(results), + "components": {name: result.to_dict() for name, result in results.items()}, + } + + +def report_composite_results( + results: dict[str, BenchmarkResult], + *, + console: Console, + json_mode: bool, + output_path: Path, + model_id: str, + task: str | None, +) -> None: + """Display and persist per-sub-model results for a composite model. + + Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT + session; each sub-model is benchmarked individually (like ``--module``) + and reported as its own summary row rather than timing the aggregate + ``forward()`` pass. The combined JSON nests each sub-model's full + ``BenchmarkResult.to_dict()`` under ``components``. 
+ """ + combined = _composite_report_dict(results, model_id=model_id, task=task) + + if json_mode: + click.echo(json.dumps(combined, indent=2)) + else: + table = Table(title="Per-Sub-Model Perf", show_header=True) + table.add_column("Sub-Model", style="cyan") + table.add_column("Task") + table.add_column("Device") + table.add_column("Mean (ms)", justify="right") + table.add_column("P90 (ms)", justify="right") + table.add_column("Min (ms)", justify="right") + table.add_column("Max (ms)", justify="right") + for name, result in results.items(): + device_str = _device_string( + result.config.device, result.actual_device, result.actual_ep + ) + table.add_row( + name, + result.actual_task, + device_str, + f"{result.mean_ms:.2f}", + f"{result.p90_ms:.2f}", + f"{result.min_ms:.2f}", + f"{result.max_ms:.2f}", + ) + console.print() + console.print(table) + console.print() + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("w", encoding="utf-8") as f: + json.dump(combined, f, indent=2) + + def generate_output_path(model_id: str, *, module_class: str | None = None) -> Path: r"""Generate default output path under the user's cache directory. @@ -1503,6 +1451,21 @@ def perf( benchmark = PerfBenchmark(config) result = benchmark.run() + # Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT + # session; each sub-model is benchmarked individually and reported as + # its own row (like --module), not as one aggregate forward() timing. 
+ if isinstance(result, dict): + report_composite_results( + result, + console=console, + json_mode=json_mode, + output_path=output, + model_id=hf_model, + task=task, + ) + console.print(f"[green]Results saved to:[/green] {output}") + return + # Display results if json_mode: click.echo(json.dumps(result.to_dict(), indent=2)) diff --git a/tests/unit/commands/test_perf_composite.py b/tests/unit/commands/test_perf_composite.py index 42c9defef..1e99da930 100644 --- a/tests/unit/commands/test_perf_composite.py +++ b/tests/unit/commands/test_perf_composite.py @@ -5,9 +5,9 @@ """Tests for winml perf support of composite (multi-session) models. Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ONNX -session; they orchestrate several sub-models. The perf benchmark must -aggregate their io_configs and time the full ``forward()`` pass rather -than reaching for a single ``_session``. +session; they orchestrate several sub-models. ``winml perf`` benchmarks +each sub-model individually (like ``--module``) and reports one row per +sub-model rather than timing the aggregate ``forward()`` pass. Regression guard: previously ``PerfBenchmark`` assumed every model exposed ``io_config`` / ``_session`` and raised ``AttributeError`` on composites. 
@@ -15,32 +15,99 @@ from __future__ import annotations -from types import SimpleNamespace -from typing import Any +import json +from contextlib import contextmanager +from typing import TYPE_CHECKING, Any -import numpy as np +from rich.console import Console from winml.modelkit.commands.perf import ( BenchmarkConfig, + BenchmarkResult, PerfBenchmark, - _aggregate_io_config, - _describe_outputs, + report_composite_results, ) +from winml.modelkit.session.stats import PerfStats -def _make_sub_model( +if TYPE_CHECKING: + from collections.abc import Generator + from pathlib import Path + + +class _FakeSession: + """Stand-in for a WinMLSession that times runs via a real PerfStats.""" + + def __init__(self, io_config: dict[str, Any], device: str, ep_name: str) -> None: + self.io_config = io_config + self.device = device + self.ep_name = ep_name + self.compiled = False + self.run_log: list[dict[str, Any]] = [] + self._perf_stats: PerfStats | None = None + + def compile(self) -> None: + self.compiled = True + + @contextmanager + def perf(self, warmup: int = 0) -> Generator[PerfStats, None, None]: + self._perf_stats = PerfStats(warmup=warmup) + try: + yield self._perf_stats + finally: + self._perf_stats = None + + def run(self, inputs: dict[str, Any]) -> dict[str, Any]: + self.run_log.append(inputs) + if self._perf_stats is not None: + self._perf_stats.record(lambda: None) + return {} + + +class _FakeSubModel: + """Stand-in for a single-session WinMLAutoModel sub-component.""" + + def __init__( + self, + io_config: dict[str, Any], + task: str, + *, + device: str = "GPU", + ep_name: str = "OpenVINOExecutionProvider", + ) -> None: + self._session = _FakeSession(io_config, device, ep_name) + self.task = task + + @property + def io_config(self) -> dict[str, Any]: + return self._session.io_config + + @property + def device(self) -> str: + return self._session.device + + @property + def ep_name(self) -> str: + return self._session.ep_name + + +class _FakeComposite: + """Stand-in 
for a WinMLCompositeModel (duck-typed via ``sub_models``).""" + + def __init__(self, sub_models: dict[str, Any]) -> None: + self.sub_models = sub_models + + +def _io_config( input_names: list[str], - input_shapes: list[list[int | None]], + input_shapes: list[list[int]], input_types: list[str], output_names: list[str], - output_shapes: list[list[int | None]], + output_shapes: list[list[int]], *, - device: str = "GPU", - ep_name: str = "OpenVINOExecutionProvider", precision: str | None = "fp16", -) -> Any: - """Build a minimal stand-in for a WinMLAutoModel sub-component.""" - io_config = { +) -> dict[str, Any]: + return { "input_names": input_names, "input_shapes": input_shapes, "input_types": input_types, @@ -49,201 +116,144 @@ def _make_sub_model( "output_types": ["float32"] * len(output_names), "precision": precision, } - compiled: dict[str, bool] = {"compiled": False} - def _compile() -> None: - compiled["compiled"] = True - return SimpleNamespace( - io_config=io_config, - device=device, - ep_name=ep_name, - _session=SimpleNamespace(compile=_compile), - _compiled_flag=compiled, +def _siglip_like() -> _FakeComposite: + image_encoder = _FakeSubModel( + _io_config( + ["pixel_values"], + [[1, 3, 224, 224]], + ["float32"], + ["image_embeds"], + [[1, 768]], + ), + task="image-feature-extraction", + ) + text_encoder = _FakeSubModel( + _io_config( + ["input_ids", "attention_mask"], + [[1, 64], [1, 64]], + ["int64", "int64"], + ["text_embeds"], + [[1, 768]], + ), + task="feature-extraction", ) + return _FakeComposite({"image-encoder": image_encoder, "text-encoder": text_encoder}) -class _FakeComposite: - """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``).""" +def _composite_benchmark() -> tuple[PerfBenchmark, _FakeComposite]: + config = BenchmarkConfig( + model_id="google/siglip-base-patch16-224", + task="zero-shot-image-classification", + device="gpu", + iterations=3, + warmup=1, + ) + bench = PerfBenchmark(config) + model = _siglip_like() + 
bench._model = model # bypass _load_model (no HF download in unit tests) + return bench, model - def __init__(self, sub_models: dict[str, Any]) -> None: - self.sub_models = sub_models - self.call_log: list[dict[str, np.ndarray]] = [] - def __call__(self, **kwargs: np.ndarray) -> dict[str, np.ndarray]: - self.call_log.append(kwargs) - # Mimics a composite's task-level forward() output (e.g. SigLIP): - # tensors that exist on no single sub-model's ONNX graph. - return { - "logits_per_image": np.zeros((1, 1), dtype=np.float32), - "image_embeds": np.zeros((1, 768), dtype=np.float32), - "text_embeds": np.zeros((1, 768), dtype=np.float32), - } +class TestPerfBenchmarkComposite: + """PerfBenchmark benchmarks each sub-model of a composite individually.""" + def test_detects_composite(self) -> None: + bench, _ = _composite_benchmark() + assert bench._is_composite is True -def _siglip_like() -> _FakeComposite: - image_encoder = _make_sub_model( - input_names=["pixel_values"], - input_shapes=[[1, 3, 224, 224]], - input_types=["float32"], - output_names=["image_embeds"], - output_shapes=[[1, 768]], - ) - text_encoder = _make_sub_model( - input_names=["input_ids", "attention_mask"], - input_shapes=[[1, 64], [1, 64]], - input_types=["int64", "int64"], - output_names=["text_embeds"], - output_shapes=[[1, 768]], - ) - return _FakeComposite({"image-encoder": image_encoder, "text-encoder": text_encoder}) + def test_run_returns_result_per_sub_model(self) -> None: + bench, _ = _composite_benchmark() + results = bench._run_sub_models() + assert set(results) == {"image-encoder", "text-encoder"} + assert all(isinstance(r, BenchmarkResult) for r in results.values()) -class TestAggregateIoConfig: - """Unit tests for the io_config union helper.""" + def test_each_sub_model_reports_its_own_io(self) -> None: + # No aggregation: each result carries only its sub-model's inputs. 
+ bench, _ = _composite_benchmark() + results = bench._run_sub_models() - def test_union_dedupes_by_name_preserving_order(self) -> None: - model = _siglip_like() - agg = _aggregate_io_config(model.sub_models.values()) + assert results["image-encoder"].input_names == ["pixel_values"] + assert results["text-encoder"].input_names == ["input_ids", "attention_mask"] + assert results["image-encoder"].output_names == ["image_embeds"] + assert results["text-encoder"].output_names == ["text_embeds"] - assert agg["input_names"] == ["pixel_values", "input_ids", "attention_mask"] - assert agg["input_shapes"] == [[1, 3, 224, 224], [1, 64], [1, 64]] - assert agg["input_types"] == ["float32", "int64", "int64"] - assert agg["output_names"] == ["image_embeds", "text_embeds"] + def test_each_sub_model_reports_its_own_task(self) -> None: + bench, _ = _composite_benchmark() + results = bench._run_sub_models() - def test_shared_input_name_is_not_duplicated(self) -> None: - # Both encoders consume "attention_mask" -> it must appear once. 
- a = _make_sub_model( - ["input_ids", "attention_mask"], - [[1, 8], [1, 8]], - ["int64", "int64"], - ["a"], - [[1, 4]], - ) - b = _make_sub_model( - ["attention_mask", "token_type_ids"], - [[1, 8], [1, 8]], - ["int64", "int64"], - ["b"], - [[1, 4]], - ) - agg = _aggregate_io_config([a, b]) - assert agg["input_names"] == ["input_ids", "attention_mask", "token_type_ids"] + assert results["image-encoder"].actual_task == "image-feature-extraction" + assert results["text-encoder"].actual_task == "feature-extraction" - def test_precision_taken_from_first_sub_model(self) -> None: - a = _make_sub_model(["x"], [[1]], ["float32"], ["y"], [[1]], precision="int8") - b = _make_sub_model(["z"], [[1]], ["float32"], ["w"], [[1]], precision="fp16") - assert _aggregate_io_config([a, b])["precision"] == "int8" + def test_resolved_device_and_ep_per_sub_model(self) -> None: + bench, _ = _composite_benchmark() + results = bench._run_sub_models() + for result in results.values(): + assert result.actual_device == "GPU" + assert result.actual_ep == "OpenVINOExecutionProvider" -class TestPerfBenchmarkComposite: - """PerfBenchmark must transparently handle composite models.""" + def test_compiles_and_runs_every_sub_session(self) -> None: + bench, model = _composite_benchmark() + bench._run_sub_models() + + for sub in model.sub_models.values(): + assert sub._session.compiled is True + # warmup(1) + iterations(3) == 4 run() calls per sub-session. 
+ assert len(sub._session.run_log) == 4 + + def test_each_sub_model_stats_exclude_warmup(self) -> None: + bench, _ = _composite_benchmark() + results = bench._run_sub_models() + + for result in results.values(): + assert len(result.raw_samples_ms) == 3 + + +class TestReportCompositeResults: + """report_composite_results writes a combined per-component JSON report.""" - def _benchmark(self) -> tuple[PerfBenchmark, _FakeComposite]: - config = BenchmarkConfig( + def test_combined_json_nests_each_component(self, tmp_path: Path) -> None: + bench, _ = _composite_benchmark() + results = bench._run_sub_models() + output = tmp_path / "perf.json" + + report_composite_results( + results, + console=Console(), + json_mode=False, + output_path=output, model_id="google/siglip-base-patch16-224", task="zero-shot-image-classification", - device="gpu", - iterations=3, - warmup=1, ) - bench = PerfBenchmark(config) - model = _siglip_like() - bench._model = model # bypass _load_model (no HF download in unit tests) - return bench, model - def test_detects_composite(self) -> None: - bench, _ = self._benchmark() - assert bench._is_composite is True + data = json.loads(output.read_text()) + assert data["model_id"] == "google/siglip-base-patch16-224" + assert data["task"] == "zero-shot-image-classification" + assert data["component_count"] == 2 + assert set(data["components"]) == {"image-encoder", "text-encoder"} + # Each component holds a full BenchmarkResult.to_dict() payload. 
+ img = data["components"]["image-encoder"] + assert img["model_info"]["input_names"] == ["pixel_values"] + assert "latency_ms" in img + + def test_json_mode_emits_combined_payload_to_stdout(self, tmp_path: Path, capsys: Any) -> None: + bench, _ = _composite_benchmark() + results = bench._run_sub_models() + output = tmp_path / "perf.json" + + report_composite_results( + results, + console=Console(stderr=True), + json_mode=True, + output_path=output, + model_id="google/siglip-base-patch16-224", + task="zero-shot-image-classification", + ) - def test_resolved_io_config_is_aggregated_and_cached(self) -> None: - bench, _ = self._benchmark() - io = bench._resolved_io_config() - assert io["input_names"] == ["pixel_values", "input_ids", "attention_mask"] - # Cached: second call returns the same object. - assert bench._resolved_io_config() is io - - def test_compile_compiles_every_sub_session(self) -> None: - bench, model = self._benchmark() - bench._compile_model() - assert all(s._compiled_flag["compiled"] for s in model.sub_models.values()) - - def test_generate_inputs_covers_all_sub_model_inputs(self) -> None: - bench, _ = self._benchmark() - bench._generate_inputs() - assert set(bench._inputs) == {"pixel_values", "input_ids", "attention_mask"} - assert bench._inputs["pixel_values"].shape == (1, 3, 224, 224) - assert bench._inputs["input_ids"].shape == (1, 64) - - def test_resolved_device_ep_task_from_sub_model(self) -> None: - bench, _ = self._benchmark() - assert bench._resolved_device() == "GPU" - assert bench._resolved_ep() == "OpenVINOExecutionProvider" - assert bench._resolved_task() == "zero-shot-image-classification" - - def test_simple_benchmark_times_full_forward(self) -> None: - bench, model = self._benchmark() - bench._generate_inputs() - stats = bench._run_benchmark_simple() - - # warmup(1) + iterations(3) == 4 forward() calls; stats excludes warmup. 
- assert len(model.call_log) == 4 - assert stats.count == 3 - # forward() received the generated inputs as kwargs. - assert set(model.call_log[0]) == {"pixel_values", "input_ids", "attention_mask"} - - def test_probe_replaces_outputs_with_real_forward_result(self) -> None: - # The aggregated view reports the image encoder's raw ONNX outputs; - # probing must replace them with the composite forward()'s outputs. - bench, _ = self._benchmark() - bench._generate_inputs() - assert bench._resolved_io_config()["output_names"] == ["image_embeds", "text_embeds"] - - bench._probe_composite_outputs() - io = bench._resolved_io_config() - assert io["output_names"] == ["logits_per_image", "image_embeds", "text_embeds"] - assert io["output_shapes"] == [[1, 1], [1, 768], [1, 768]] - - def test_collect_results_reports_probed_outputs(self) -> None: - bench, _ = self._benchmark() - bench._generate_inputs() - bench._probe_composite_outputs() - stats = bench._run_benchmark_simple() - result = bench._collect_results(stats) - - assert result.input_names == ["pixel_values", "input_ids", "attention_mask"] - # Real composite outputs, not the deduped sub-model ONNX outputs. 
- assert result.output_names == ["logits_per_image", "image_embeds", "text_embeds"] - assert result.actual_device == "GPU" - assert result.actual_ep == "OpenVINOExecutionProvider" - assert result.actual_task == "zero-shot-image-classification" - - -class TestDescribeOutputs: - """Unit tests for the architecture-agnostic forward()-output describer.""" - - def test_dict_output_named_fields(self) -> None: - out = { - "logits": np.zeros((2, 5), dtype=np.float32), - "embeds": np.zeros((2, 8), dtype=np.float32), - } - names, shapes, types = _describe_outputs(out) - assert names == ["logits", "embeds"] - assert shapes == [[2, 5], [2, 8]] - assert all("float32" in t for t in types) - - def test_skips_none_and_non_array_fields(self) -> None: - out = {"a": np.zeros((1, 3)), "b": None, "c": "not-an-array"} - names, shapes, _ = _describe_outputs(out) - assert names == ["a"] - assert shapes == [[1, 3]] - - def test_sequence_output_positional_names(self) -> None: - names, shapes, _ = _describe_outputs([np.zeros((1, 4)), np.zeros((1, 2))]) - assert names == ["output_0", "output_1"] - assert shapes == [[1, 4], [1, 2]] - - def test_single_tensor_output(self) -> None: - names, shapes, _ = _describe_outputs(np.zeros((3, 3))) - assert names == ["output_0"] - assert shapes == [[3, 3]] + payload = json.loads(capsys.readouterr().out) + assert set(payload["components"]) == {"image-encoder", "text-encoder"} + # File is written regardless of json_mode. 
+ assert output.exists() From 84456b6567303c1c49ed4e3729acd2b256e5e211 Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 15:06:38 +0800 Subject: [PATCH 4/9] use type --- src/winml/modelkit/commands/perf.py | 53 ++++++++++++++++++----------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 614c59a49..2dfd1be2a 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -282,45 +282,62 @@ class PerfBenchmark: def __init__(self, config: BenchmarkConfig) -> None: """Initialize benchmark with configuration.""" self.config = config - self._model: WinMLPreTrainedModel | None = None + self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None self._inputs: dict[str, np.ndarray] | None = None self._io_config: dict[str, Any] | None = None @property def _is_composite(self) -> bool: - """Composite models orchestrate multiple sub-sessions (e.g. CLIP/SigLIP).""" + """Composite models orchestrate multiple sub-sessions (e.g. CLIP/SigLIP). + + Duck-typed on ``sub_models`` rather than ``isinstance(..., WinMLCompositeModel)`` + on purpose: an ``isinstance`` check needs a runtime import of + ``composite_model``, which imports torch and would blow the + ``winml perf --help`` import budget (see tests/cli/test_import_time.py) by + pulling torch in at module load. Keeping ``WinMLCompositeModel`` a + TYPE_CHECKING-only import also lets the unit tests use lightweight + duck-typed fakes instead of constructing real torch-backed composites. + ``sub_models`` is the defining member of the composite base, so it is a + reliable marker. 
+ """ return hasattr(self._model, "sub_models") def _sub_models(self) -> dict[str, WinMLPreTrainedModel]: """Sub-models of a composite model (only valid when ``_is_composite``).""" return cast("WinMLCompositeModel", self._model).sub_models + @property + def _single(self) -> WinMLPreTrainedModel: + """The model under benchmark, narrowed to a single-session model. + + Only valid for non-composite models: composites dispatch to + ``_run_sub_models``, which benchmarks each sub-model through a child + ``PerfBenchmark`` whose ``_model`` is itself single-session. + """ + assert self._model is not None + return cast("WinMLPreTrainedModel", self._model) + def _resolved_io_config(self) -> dict[str, Any]: """I/O config of the (single-session) model being benchmarked.""" if self._io_config is None: - assert self._model is not None - self._io_config = self._model.io_config + self._io_config = self._single.io_config return self._io_config def _compile_model(self) -> None: """Compile the underlying ORT session so device/EP are resolved.""" - assert self._model is not None - self._model._session.compile() + self._single._session.compile() def _resolved_device(self) -> str: """Actual device bound after compile.""" - assert self._model is not None - return self._model.device + return self._single.device def _resolved_ep(self) -> EPName | None: """Primary EP bound after compile.""" - assert self._model is not None - return self._model.ep_name + return self._single.ep_name def _resolved_task(self) -> str | None: """Resolved task; falls back to the requested task.""" - assert self._model is not None - return self._model.task or self.config.task + return self._single.task or self.config.task def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]: """Execute full benchmark pipeline. 
@@ -465,11 +482,10 @@ def _run_benchmark(self) -> PerfStats: def _run_benchmark_simple(self) -> PerfStats: """Execute benchmark without live monitoring.""" - assert self._model is not None assert self._inputs is not None total_iterations = self.config.warmup + self.config.iterations - session = self._model._session + session = self._single._session with session.perf(warmup=self.config.warmup) as stats: _run_simple_loop(session, self._inputs, total_iterations) @@ -487,7 +503,6 @@ def _run_benchmark_monitored(self) -> PerfStats: from ..session.monitor.hw_monitor import HWMonitor from ..session.monitor.vitisai_monitor import VitisAIMonitor - assert self._model is not None assert self._inputs is not None total_iterations = self.config.warmup + self.config.iterations @@ -520,12 +535,12 @@ def _run_benchmark_monitored(self) -> PerfStats: else: ep_monitor = NullEPMonitor() + session = self._single._session with ( - self._model._session.perf(warmup=self.config.warmup) as stats, + session.perf(warmup=self.config.warmup) as stats, hw_monitor as hw, ep_monitor as ep_mon, ): - session = self._model._session inputs = self._inputs def run_iteration() -> None: @@ -1497,9 +1512,7 @@ def perf( # For HF models the ONNX is built internally by PerfBenchmark. 
try: onnx_for_trace = ( - model_path - if is_onnx - else (benchmark._model._onnx_path if benchmark._model else None) + model_path if is_onnx else getattr(benchmark._model, "_onnx_path", None) ) if onnx_for_trace is None: raise AttributeError("benchmark._model not initialized") From dad558af47252e8949b0394a9d7fd4916e2bcd2a Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 15:15:37 +0800 Subject: [PATCH 5/9] fix mypy --- src/winml/modelkit/commands/perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 2dfd1be2a..4b5325960 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -562,7 +562,7 @@ def run_iteration() -> None: if ep_dict: # NullEPMonitor returns {}, real monitors return data self._hw_metrics["ep_proof"] = ep_dict - return cast("PerfStats", stats) + return stats def _collect_results(self, stats: PerfStats) -> BenchmarkResult: """Collect benchmark results from PerfStats.""" From 7d102bd0ea6db0357e52dd363a6d9651dae0cc4d Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 16:18:08 +0800 Subject: [PATCH 6/9] process no compile --- src/winml/modelkit/commands/perf.py | 9 ++- src/winml/modelkit/models/auto.py | 2 + tests/unit/commands/test_perf_module.py | 90 +++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index ba11eae47..55c456ea4 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -629,6 +629,7 @@ def _perf_modules( warmup: int, batch_size: int, no_quantize: bool, + no_compile: bool, output: Path | None, verbose: bool, console: Console, @@ -651,7 +652,8 @@ def _perf_modules( iterations: Number of benchmark iterations. warmup: Number of warmup iterations. batch_size: Batch size for input generation. 
- no_quantize: If True, skip quantization and compilation. + no_quantize: If True, skip quantization during the per-module build. + no_compile: If True, skip the build's compile stage for each module. output: Output JSON path, or None for auto-generated path. verbose: If True, log exceptions at DEBUG level. console: Rich console for output. @@ -745,9 +747,11 @@ def _perf_modules( submodule = parent_model.get_submodule(module_path) - # Skip quant/compile for faster iteration when requested + # Skip quant/compile for faster iteration when requested. Quantization + # and compilation are independent toggles (mirrors the single-model path). if no_quantize: cfg.quant = None + if no_compile: cfg.compile = None with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir: @@ -1394,6 +1398,7 @@ def perf( warmup=warmup, batch_size=batch_size, no_quantize=not quantize, + no_compile=no_compile, output=output, verbose=bool(verbose), console=console, diff --git a/src/winml/modelkit/models/auto.py b/src/winml/modelkit/models/auto.py index a8e4036c3..4fe697938 100644 --- a/src/winml/modelkit/models/auto.py +++ b/src/winml/modelkit/models/auto.py @@ -160,6 +160,7 @@ def from_onnx( use_cache=use_cache, force_rebuild=force_rebuild, skip_build=skip_build, + no_compile=no_compile, session_options=session_options, **kwargs, ) @@ -365,6 +366,7 @@ def from_pretrained( config=config, cache_dir=cache_dir, allow_unsupported_nodes=allow_unsupported_nodes, + no_compile=no_compile, **kwargs, ) diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py index 051bae425..210401ec1 100644 --- a/tests/unit/commands/test_perf_module.py +++ b/tests/unit/commands/test_perf_module.py @@ -218,3 +218,93 @@ def test_device_and_ep_forwarded_through_module_path(self, tmp_path: Path) -> No session_kwargs = mock_session_cls.call_args.kwargs assert session_kwargs["device"] == "npu" assert session_kwargs["ep"] == "qnn" + + +class TestPerfModuleQuantCompileToggles: + 
"""--no-quantize and --compile/--no-compile clear cfg.quant / cfg.compile + independently in the per-module build (mirrors the single-model path).""" + + @staticmethod + def _run(tmp_path: Path, extra_args: list[str]) -> MagicMock: + """Invoke ``perf --module`` with mocked build and return the module cfg. + + The cfg is mutated (quant/compile cleared) before ``build_hf_model``, + so short-circuiting the benchmark via a failing ``session.perf()`` + still lets us inspect the mutation. + """ + fake_cfg = MagicMock() + fake_cfg.loader.model_type = "bert" + fake_cfg.loader.module_path = "encoder.layer.0" + + fake_build_result = MagicMock() + fake_build_result.final_onnx_path = tmp_path / "model.onnx" + + fake_session = MagicMock() + fake_session.perf.side_effect = RuntimeError("test-skip-benchmark") + + fake_loader_cfg = MagicMock() + fake_loader_cfg.task = "fill-mask" + + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + return_value=("cpu", ["cpu"]), + ), + patch( + "winml.modelkit.config.generate_hf_build_config", + return_value=[fake_cfg], + ), + patch( + "winml.modelkit.loader.resolve_loader_config", + return_value=(fake_loader_cfg, MagicMock(), MagicMock()), + ), + patch( + "winml.modelkit.commands.build._instantiate_parent_model", + return_value=MagicMock(), + ), + patch( + "winml.modelkit.build.build_hf_model", + return_value=fake_build_result, + ), + patch( + "winml.modelkit.session.WinMLSession", + return_value=fake_session, + ), + ): + runner = CliRunner() + result = runner.invoke( + main, + [ + "perf", + "-m", + "fake/model", + "--module", + "BertLayer", + "--iterations", + "1", + "--warmup", + "0", + "-o", + str(tmp_path / "out.json"), + *extra_args, + ], + ) + assert result.exit_code == 0, result.output + return fake_cfg + + def test_default_skips_compile_keeps_quant(self, tmp_path: Path) -> None: + # perf defaults to --no-compile and --quantize. 
+ cfg = self._run(tmp_path, []) + assert cfg.compile is None + assert cfg.quant is not None + + def test_compile_flag_preserves_compile(self, tmp_path: Path) -> None: + cfg = self._run(tmp_path, ["--compile"]) + assert cfg.compile is not None + assert cfg.quant is not None + + def test_no_quantize_clears_only_quant(self, tmp_path: Path) -> None: + # --no-quantize must not also clear compile when --compile is set. + cfg = self._run(tmp_path, ["--no-quantize", "--compile"]) + assert cfg.quant is None + assert cfg.compile is not None From 9589466964a96984b4677c6a6d0fb0ea371dfb62 Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 17:06:54 +0800 Subject: [PATCH 7/9] add back --- tests/unit/commands/test_perf_module.py | 90 +++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py index f08f5d3d7..80298c02d 100644 --- a/tests/unit/commands/test_perf_module.py +++ b/tests/unit/commands/test_perf_module.py @@ -314,3 +314,93 @@ def test_running_model_path_in_module_result(self, tmp_path: Path) -> None: report = json.loads(out_path.read_text(encoding="utf-8")) instance = report["instances"][0] assert instance["running_model_path"] == str(running_model_path) + + +class TestPerfModuleQuantCompileToggles: + """--no-quantize and --compile/--no-compile clear cfg.quant / cfg.compile + independently in the per-module build (mirrors the single-model path).""" + + @staticmethod + def _run(tmp_path: Path, extra_args: list[str]) -> MagicMock: + """Invoke ``perf --module`` with mocked build and return the module cfg. + + The cfg is mutated (quant/compile cleared) before ``build_hf_model``, + so short-circuiting the benchmark via a failing ``session.perf()`` + still lets us inspect the mutation. 
+ """ + fake_cfg = MagicMock() + fake_cfg.loader.model_type = "bert" + fake_cfg.loader.module_path = "encoder.layer.0" + + fake_build_result = MagicMock() + fake_build_result.final_onnx_path = tmp_path / "model.onnx" + + fake_session = MagicMock() + fake_session.perf.side_effect = RuntimeError("test-skip-benchmark") + + fake_loader_cfg = MagicMock() + fake_loader_cfg.task = "fill-mask" + + with ( + patch( + "winml.modelkit.sysinfo.resolve_device", + return_value=("cpu", ["cpu"]), + ), + patch( + "winml.modelkit.config.generate_hf_build_config", + return_value=[fake_cfg], + ), + patch( + "winml.modelkit.loader.resolve_loader_config", + return_value=(fake_loader_cfg, MagicMock(), MagicMock()), + ), + patch( + "winml.modelkit.commands.build._instantiate_parent_model", + return_value=MagicMock(), + ), + patch( + "winml.modelkit.build.build_hf_model", + return_value=fake_build_result, + ), + patch( + "winml.modelkit.session.WinMLSession", + return_value=fake_session, + ), + ): + runner = CliRunner() + result = runner.invoke( + main, + [ + "perf", + "-m", + "fake/model", + "--module", + "BertLayer", + "--iterations", + "1", + "--warmup", + "0", + "-o", + str(tmp_path / "out.json"), + *extra_args, + ], + ) + assert result.exit_code == 0, result.output + return fake_cfg + + def test_default_skips_compile_keeps_quant(self, tmp_path: Path) -> None: + # perf defaults to --no-compile and --quantize. + cfg = self._run(tmp_path, []) + assert cfg.compile is None + assert cfg.quant is not None + + def test_compile_flag_preserves_compile(self, tmp_path: Path) -> None: + cfg = self._run(tmp_path, ["--compile"]) + assert cfg.compile is not None + assert cfg.quant is not None + + def test_no_quantize_clears_only_quant(self, tmp_path: Path) -> None: + # --no-quantize must not also clear compile when --compile is set. 
+ cfg = self._run(tmp_path, ["--no-quantize", "--compile"]) + assert cfg.quant is None + assert cfg.compile is not None From 3ad1490a8789c1f798c63659be2054aaff6f09de Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 17:14:51 +0800 Subject: [PATCH 8/9] clean up --- src/winml/modelkit/commands/perf.py | 51 +++++++--------------- tests/unit/commands/test_perf_composite.py | 5 +++ 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 46c16f15c..9fbcc7d1f 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -295,7 +295,6 @@ def __init__(self, config: BenchmarkConfig) -> None: self.config = config self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None self._inputs: dict[str, np.ndarray] | None = None - self._io_config: dict[str, Any] | None = None @property def _is_composite(self) -> bool: @@ -323,33 +322,18 @@ def _single(self) -> WinMLPreTrainedModel: Only valid for non-composite models: composites dispatch to ``_run_sub_models``, which benchmarks each sub-model through a child - ``PerfBenchmark`` whose ``_model`` is itself single-session. + ``PerfBenchmark`` whose ``_model`` is itself single-session. Exposes + ``io_config`` / ``device`` / ``ep_name`` / ``task`` directly (the + session caches ``io_config``), so callers read ``self._single.*`` + rather than going through per-attribute wrappers. 
""" assert self._model is not None return cast("WinMLPreTrainedModel", self._model) - def _resolved_io_config(self) -> dict[str, Any]: - """I/O config of the (single-session) model being benchmarked.""" - if self._io_config is None: - self._io_config = self._single.io_config - return self._io_config - def _compile_model(self) -> None: """Compile the underlying ORT session so device/EP are resolved.""" self._single._session.compile() - def _resolved_device(self) -> str: - """Actual device bound after compile.""" - return self._single.device - - def _resolved_ep(self) -> EPName | None: - """Primary EP bound after compile.""" - return self._single.ep_name - - def _resolved_task(self) -> str | None: - """Resolved task; falls back to the requested task.""" - return self._single.task or self.config.task - def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]: """Execute full benchmark pipeline. @@ -403,11 +387,11 @@ def _run_single(self) -> BenchmarkResult: # Print model info before benchmark starts _print_model_info( - self._resolved_io_config(), - task=self._resolved_task(), + self._single.io_config, + task=self._single.task or self.config.task, req_device=self.config.device, - act_device=self._resolved_device(), - ep_name=self._resolved_ep(), + act_device=self._single.device, + ep_name=self._single.ep_name, ) # [3] Run benchmark @@ -479,10 +463,8 @@ def _load_model(self) -> None: def _generate_inputs(self) -> None: """Generate random inputs based on model io_config.""" - assert self._model is not None - io_config = self._resolved_io_config() self._inputs = generate_random_inputs( - io_config=io_config, + io_config=self._single.io_config, batch_size=self.config.batch_size, ) @@ -529,8 +511,8 @@ def _run_benchmark_monitored(self) -> PerfStats: # GPU when --device gpu is specified, NPU when --device npu, etc. # ep_name lets the monitor resolve the exact LUID via ORT's autoEP # metadata so we follow the adapter the session actually binds to. 
- ep_name = self._resolved_ep() - monitor_device = self._resolved_device() or self.config.device or "auto" + ep_name = self._single.ep_name + monitor_device = self._single.device or self.config.device or "auto" hw_monitor = HWMonitor( poll_interval_ms=_HW_POLL_INTERVAL_MS, device=monitor_device, @@ -578,8 +560,7 @@ def run_iteration() -> None: def _collect_results(self, stats: PerfStats) -> BenchmarkResult: """Collect benchmark results from PerfStats.""" - assert self._model is not None - io_config = self._resolved_io_config() + io_config = self._single.io_config # Calculate throughput mean_latency_sec = stats.mean_ms / 1000.0 @@ -618,10 +599,10 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult: samples_per_sec=samples_per_sec, batches_per_sec=batches_per_sec, # Actual values (resolved after build + compile) - actual_device=self._resolved_device(), - actual_task=self._resolved_task() or "auto-detected", - actual_ep=self._resolved_ep(), - running_model_path=str(self._single().running_model_path), + actual_device=self._single.device, + actual_task=self._single.task or self.config.task or "auto-detected", + actual_ep=self._single.ep_name, + running_model_path=str(self._single.running_model_path), # Hardware monitor metrics (only present when --monitor is used) hw_monitor=getattr(self, "_hw_metrics", None), ) diff --git a/tests/unit/commands/test_perf_composite.py b/tests/unit/commands/test_perf_composite.py index 1e99da930..f19e94840 100644 --- a/tests/unit/commands/test_perf_composite.py +++ b/tests/unit/commands/test_perf_composite.py @@ -42,6 +42,7 @@ def __init__(self, io_config: dict[str, Any], device: str, ep_name: str) -> None self.io_config = io_config self.device = device self.ep_name = ep_name + self.running_model_path = "model.onnx" self.compiled = False self.run_log: list[dict[str, Any]] = [] self._perf_stats: PerfStats | None = None @@ -90,6 +91,10 @@ def device(self) -> str: def ep_name(self) -> str: return self._session.ep_name + 
@property + def running_model_path(self) -> str: + return self._session.running_model_path + class _FakeComposite: """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``).""" From 9ace198d6d8db26bb40feb7dd2316f762dd88328 Mon Sep 17 00:00:00 2001 From: xieofxie Date: Fri, 12 Jun 2026 17:26:16 +0800 Subject: [PATCH 9/9] clean up --- src/winml/modelkit/commands/perf.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index 9fbcc7d1f..622d30f1d 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -33,8 +33,6 @@ if TYPE_CHECKING: - from collections.abc import Callable - from ..models.winml.base import WinMLPreTrainedModel from ..models.winml.composite_model import WinMLCompositeModel from ..session.stats import PerfStats @@ -330,10 +328,6 @@ def _single(self) -> WinMLPreTrainedModel: assert self._model is not None return cast("WinMLPreTrainedModel", self._model) - def _compile_model(self) -> None: - """Compile the underlying ORT session so device/EP are resolved.""" - self._single._session.compile() - def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]: """Execute full benchmark pipeline. 
@@ -383,7 +377,7 @@ def _run_single(self) -> BenchmarkResult: self._generate_inputs() # Compile session early so model.device is resolved for display - self._compile_model() + self._single._session.compile() # Print model info before benchmark starts _print_model_info( @@ -535,13 +529,9 @@ def _run_benchmark_monitored(self) -> PerfStats: hw_monitor as hw, ep_monitor as ep_mon, ): - inputs = self._inputs - - def run_iteration() -> None: - session.run(inputs) - _run_monitored_loop( - run_iteration, + session, + self._inputs, stats, hw, total_iterations=total_iterations, @@ -1122,7 +1112,8 @@ def _print_model_info( def _run_monitored_loop( - run_iteration: Callable[[], Any], + session: Any, + inputs: dict[str, Any], stats: PerfStats, hw: Any, *, @@ -1131,12 +1122,7 @@ def _run_monitored_loop( model_id: str, device: str, ) -> None: - """Run the benchmark iteration loop with live hardware monitoring. - - ``run_iteration`` runs (and times into ``stats``) a single inference. For - single-session models it invokes ``session.run`` inside the session's - perf() context; for composite models it records a full ``forward()`` pass. - """ + """Run the benchmark iteration loop with live hardware monitoring.""" display = LiveMonitorDisplay( total_iterations=total_iterations, warmup=warmup, @@ -1146,7 +1132,7 @@ def _run_monitored_loop( ) with display: for i in range(total_iterations): - run_iteration() + session.run(inputs) latest_latency = stats.all_samples_ms[-1] if stats.all_samples_ms else 0 display.update(