From a0b72d324471e8271f67ed87af1d10f2578d41d9 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Wed, 10 Jun 2026 17:20:23 +0800
Subject: [PATCH 1/9] fix(perf): support composite (dual-encoder) models in
 winml perf

`winml perf` assumed every model exposes a single `io_config`/`_session`,
so composite models (CLIP/SigLIP zero-shot-image-classification) crashed
with `AttributeError: ... has no attribute io_config` during input
generation.

Make `PerfBenchmark` composite-aware:
- `_aggregate_io_config()` unions the sub-models' inputs (their union is
  exactly the composite forward() kwargs) for input generation/display.
- Time the full `forward()` pass via an external PerfStats; single-session
  models keep recording pure-ORT time inside session.perf(). The monitored
  loop is refactored to take a run-iteration callable so both paths share it.
- Device/EP/task are resolved from a representative sub-model.
- `_probe_composite_outputs()` runs one forward() and introspects the result
  so reported outputs are the composite task-level tensors (e.g.
  logits_per_image) rather than a deduped union of sub-model ONNX outputs.

Add tests/unit/commands/test_perf_composite.py covering aggregation, output
describing/probing, input generation, device/EP/task resolution, and the
full-forward timing path.
---
 src/winml/modelkit/commands/perf.py        | 240 ++++++++++++++++++--
 tests/unit/commands/test_perf_composite.py | 249 +++++++++++++++++++++
 2 files changed, 466 insertions(+), 23 deletions(-)
 create mode 100644 tests/unit/commands/test_perf_composite.py

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index b53578d3d..88e2ac07f 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import contextlib
 import json
 import logging
 from dataclasses import dataclass, field
@@ -33,6 +34,8 @@
 
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Iterable
+
     from ..models.winml.base import WinMLPreTrainedModel
     from ..session.stats import PerfStats
 
@@ -258,6 +261,81 @@ def _resolve_shape(
     return tuple(resolved)
 
 
+def _aggregate_io_config(sub_models: Iterable[Any]) -> dict[str, Any]:
+    """Merge a composite model's sub-model io_configs into one unified view.
+
+    Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT
+    session; they orchestrate several sub-models. For benchmarking we present a
+    unified io_config whose inputs are the union of every sub-model's inputs
+    (deduplicated by name, order preserved) -- which is exactly the set of
+    keyword arguments the composite's ``forward()`` consumes. Outputs are
+    likewise unioned for display and result-reporting purposes.
+    """
+    agg: dict[str, Any] = {
+        "input_names": [],
+        "input_shapes": [],
+        "input_types": [],
+        "output_names": [],
+        "output_shapes": [],
+        "output_types": [],
+        "precision": None,
+    }
+    seen_in: set[str] = set()
+    seen_out: set[str] = set()
+    for sub in sub_models:
+        io = sub.io_config
+        for name, shape, dtype in zip(
+            io["input_names"], io["input_shapes"], io["input_types"], strict=True
+        ):
+            if name not in seen_in:
+                seen_in.add(name)
+                agg["input_names"].append(name)
+                agg["input_shapes"].append(shape)
+                agg["input_types"].append(dtype)
+        out_types = io.get("output_types") or [None] * len(io["output_names"])
+        for name, shape, dtype in zip(
+            io["output_names"], io["output_shapes"], out_types, strict=False
+        ):
+            if name not in seen_out:
+                seen_out.add(name)
+                agg["output_names"].append(name)
+                agg["output_shapes"].append(shape)
+                agg["output_types"].append(dtype)
+        if agg["precision"] is None:
+            agg["precision"] = io.get("precision")
+    return agg
+
+
+def _describe_outputs(output: Any) -> tuple[list[str], list[list[int]], list[str | None]]:
+    """Extract ``(names, shapes, dtypes)`` from a model ``forward()`` result.
+
+    Architecture-agnostic: handles HuggingFace ``ModelOutput`` / ``dict``
+    (named fields), plain sequences (positional ``output_N`` names), and a
+    single tensor. ``None`` fields and non-array values are skipped. Used to
+    report a composite model's real task-level outputs (e.g. ``logits``)
+    rather than its sub-models' raw ONNX outputs.
+    """
+    if hasattr(output, "items"):
+        pairs = list(output.items())
+    elif isinstance(output, (list, tuple)):
+        pairs = [(f"output_{i}", value) for i, value in enumerate(output)]
+    else:
+        pairs = [("output_0", output)]
+
+    names: list[str] = []
+    shapes: list[list[int]] = []
+    types: list[str | None] = []
+    for name, value in pairs:
+        shape = getattr(value, "shape", None)
+        if value is None or shape is None:
+            continue
+        names.append(name)
+        shapes.append([int(dim) for dim in shape])
+        dtype = getattr(value, "dtype", None)
+        types.append(str(dtype) if dtype is not None else None)
+    return names, shapes, types
+
+
 # =============================================================================
 # Benchmark Engine
 # =============================================================================
@@ -281,6 +359,77 @@ def __init__(self, config: BenchmarkConfig) -> None:
         self.config = config
         self._model: WinMLPreTrainedModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
+        self._io_config: dict[str, Any] | None = None
+
+    @property
+    def _is_composite(self) -> bool:
+        """Composite models orchestrate multiple sub-sessions (e.g. CLIP/SigLIP)."""
+        return hasattr(self._model, "sub_models")
+
+    def _resolved_io_config(self) -> dict[str, Any]:
+        """Unified io_config (aggregated across sub-models for composites)."""
+        if self._io_config is None:
+            assert self._model is not None
+            if self._is_composite:
+                self._io_config = _aggregate_io_config(self._model.sub_models.values())
+            else:
+                self._io_config = self._model.io_config
+        return self._io_config
+
+    def _compile_model(self) -> None:
+        """Compile the underlying ORT session(s) so device/EP are resolved."""
+        assert self._model is not None
+        if self._is_composite:
+            for sub in self._model.sub_models.values():
+                sub._session.compile()
+        else:
+            self._model._session.compile()
+
+    def _resolved_device(self) -> str:
+        """Actual device bound after compile (representative sub-model for composites)."""
+        assert self._model is not None
+        if self._is_composite:
+            return next(iter(self._model.sub_models.values())).device
+        return self._model.device
+
+    def _resolved_ep(self) -> EPName | None:
+        """Primary EP bound after compile (representative sub-model for composites)."""
+        assert self._model is not None
+        if self._is_composite:
+            return next(iter(self._model.sub_models.values())).ep_name
+        return self._model.ep_name
+
+    def _resolved_task(self) -> str | None:
+        """Resolved task; composites fall back to the requested task."""
+        assert self._model is not None
+        if self._is_composite:
+            return self.config.task
+        return self._model.task or self.config.task
+
+    def _probe_composite_outputs(self) -> None:
+        """Overwrite a composite's reported outputs with its real forward() result.
+
+        Runs one ``forward()`` pass (an extra warmup) and introspects the
+        returned object so the displayed/reported outputs reflect the
+        composite's task-level tensors (e.g. ``logits_per_image``) instead of
+        the deduplicated union of its sub-models' raw ONNX outputs. Falls back
+        to the aggregated view if the probe fails or yields nothing.
+        """
+        assert self._model is not None
+        assert self._inputs is not None
+        try:
+            output = self._model(**self._inputs)
+        except Exception:  # best-effort display only; never fail the run
+            logger.debug("Composite output probe failed; keeping aggregated view", exc_info=True)
+            return
+
+        names, shapes, types = _describe_outputs(output)
+        if not names:
+            return
+        io = self._resolved_io_config()
+        io["output_names"] = names
+        io["output_shapes"] = shapes
+        io["output_types"] = types
 
     def run(self) -> BenchmarkResult:
         """Execute full benchmark pipeline.
@@ -297,16 +446,21 @@ def run(self) -> BenchmarkResult:
         logger.info("Generating benchmark inputs")
         self._generate_inputs()
 
-        # Compile session early so model.device is resolved for display
-        self._model._session.compile()
+        # Compile session(s) early so model.device is resolved for display
+        self._compile_model()
+
+        # Composite forward() returns task-level outputs (e.g. logits) that
+        # don't map to any single sub-model's ONNX outputs; probe the real ones.
+        if self._is_composite:
+            self._probe_composite_outputs()
 
         # Print model info before benchmark starts
         _print_model_info(
-            self._model.io_config,
-            task=self._model.task or self.config.task,
+            self._resolved_io_config(),
+            task=self._resolved_task(),
             req_device=self.config.device,
-            act_device=self._model.device,
-            ep_name=self._model.ep_name,
+            act_device=self._resolved_device(),
+            ep_name=self._resolved_ep(),
         )
 
         # [3] Run benchmark
@@ -378,12 +532,18 @@ def _load_model(self) -> None:
     def _generate_inputs(self) -> None:
         """Generate random inputs based on model io_config."""
         assert self._model is not None
-        io_config = self._model.io_config
+        io_config = self._resolved_io_config()
         self._inputs = generate_random_inputs(
             io_config=io_config,
             batch_size=self.config.batch_size,
         )
 
+    def _composite_run_iteration(self, stats: PerfStats) -> None:
+        """Time one full composite forward() pass (orchestrates all sub-sessions)."""
+        assert self._model is not None
+        assert self._inputs is not None
+        stats.record(lambda: self._model(**self._inputs))
+
     def _run_benchmark(self) -> PerfStats:
         """Execute benchmark iterations with timing."""
         if self.config.monitor:
@@ -394,9 +554,21 @@ def _run_benchmark_simple(self) -> PerfStats:
         """Execute benchmark without live monitoring."""
         assert self._model is not None
         assert self._inputs is not None
-        session = self._model._session
         total_iterations = self.config.warmup + self.config.iterations
 
+        # Composite models have no single ORT session; time the full forward()
+        # pass with an external PerfStats instead of the session's perf() hook.
+        if self._is_composite:
+            from ..session.stats import PerfStats
+
+            stats = PerfStats(warmup=self.config.warmup)
+            for i in range(total_iterations):
+                self._composite_run_iteration(stats)
+                if (i + 1) % max(1, total_iterations // 10) == 0:
+                    logger.debug("Progress: %d/%d", i + 1, total_iterations)
+            return stats
+
+        session = self._model._session
         with session.perf(warmup=self.config.warmup) as stats:
             _run_simple_loop(session, self._inputs, total_iterations)
 
@@ -413,10 +585,10 @@ def _run_benchmark_monitored(self) -> PerfStats:
         from ..session.monitor.ep_monitor import NullEPMonitor
         from ..session.monitor.hw_monitor import HWMonitor
         from ..session.monitor.vitisai_monitor import VitisAIMonitor
+        from ..session.stats import PerfStats
 
         assert self._model is not None
         assert self._inputs is not None
-        session = self._model._session
         total_iterations = self.config.warmup + self.config.iterations
 
         if not HWMonitor.is_available():
@@ -430,31 +602,49 @@ def _run_benchmark_monitored(self) -> PerfStats:
         # GPU when --device gpu is specified, NPU when --device npu, etc.
         # ep_name lets the monitor resolve the exact LUID via ORT's autoEP
         # metadata so we follow the adapter the session actually binds to.
-        monitor_device = self._model.device or self.config.device or "auto"
+        ep_name = self._resolved_ep()
+        monitor_device = self._resolved_device() or self.config.device or "auto"
         hw_monitor = HWMonitor(
             poll_interval_ms=_HW_POLL_INTERVAL_MS,
             device=monitor_device,
-            ep_name=session.ep_name,
+            ep_name=ep_name,
         )
 
         # EP-specific proof-of-execution monitor.
         # When QNN/OpenVINO monitors become real, add entries here.
         _ep_monitors: dict[EPName, Any] = {"VitisAIExecutionProvider": VitisAIMonitor}
-        monitor_cls = _ep_monitors.get(session.ep_name) if session.ep_name else None
+        monitor_cls = _ep_monitors.get(ep_name) if ep_name else None
         ep_monitor: Any
         if monitor_cls and monitor_cls.is_available():
             ep_monitor = monitor_cls()
         else:
             ep_monitor = NullEPMonitor()
 
+        # Composite models time the full forward() pass via an external
+        # PerfStats; single-session models record pure-ORT time inside the
+        # session's perf() context. The run callable abstracts that difference.
+        if self._is_composite:
+            stats_cm: Any = contextlib.nullcontext(PerfStats(warmup=self.config.warmup))
+        else:
+            stats_cm = self._model._session.perf(warmup=self.config.warmup)
+
         with (
-            session.perf(warmup=self.config.warmup) as stats,
+            stats_cm as stats,
             hw_monitor as hw,
             ep_monitor as ep_mon,
         ):
+            if self._is_composite:
+
+                def run_iteration() -> None:
+                    self._composite_run_iteration(stats)
+            else:
+                session = self._model._session
+
+                def run_iteration() -> None:
+                    session.run(self._inputs)
+
             _run_monitored_loop(
-                session,
-                self._inputs,
+                run_iteration,
                 stats,
                 hw,
                 total_iterations=total_iterations,
@@ -474,7 +664,7 @@ def _run_benchmark_monitored(self) -> PerfStats:
     def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
         """Collect benchmark results from PerfStats."""
         assert self._model is not None
-        io_config = self._model.io_config
+        io_config = self._resolved_io_config()
 
         # Calculate throughput
         mean_latency_sec = stats.mean_ms / 1000.0
@@ -512,9 +702,9 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             samples_per_sec=samples_per_sec,
             batches_per_sec=batches_per_sec,
             # Actual values (resolved after build + compile)
-            actual_device=self._model.device,
-            actual_task=self._model.task or self.config.task or "auto-detected",
-            actual_ep=self._model.ep_name,
+            actual_device=self._resolved_device(),
+            actual_task=self._resolved_task() or "auto-detected",
+            actual_ep=self._resolved_ep(),
             # Hardware monitor metrics (only present when --monitor is used)
             hw_monitor=getattr(self, "_hw_metrics", None),
         )
@@ -961,8 +1151,7 @@ def _print_model_info(
 
 
 def _run_monitored_loop(
-    session: Any,
-    inputs: dict[str, Any],
+    run_iteration: Callable[[], Any],
     stats: PerfStats,
     hw: Any,
     *,
@@ -971,7 +1160,12 @@ def _run_monitored_loop(
     model_id: str,
     device: str,
 ) -> None:
-    """Run the benchmark iteration loop with live hardware monitoring."""
+    """Run the benchmark iteration loop with live hardware monitoring.
+
+    ``run_iteration`` runs (and times into ``stats``) a single inference. For
+    single-session models it invokes ``session.run`` inside the session's
+    perf() context; for composite models it records a full ``forward()`` pass.
+    """
     display = LiveMonitorDisplay(
         total_iterations=total_iterations,
         warmup=warmup,
@@ -981,7 +1175,7 @@ def _run_monitored_loop(
     )
     with display:
         for i in range(total_iterations):
-            session.run(inputs)
+            run_iteration()
 
             latest_latency = stats.all_samples_ms[-1] if stats.all_samples_ms else 0
             display.update(
diff --git a/tests/unit/commands/test_perf_composite.py b/tests/unit/commands/test_perf_composite.py
new file mode 100644
index 000000000..42c9defef
--- /dev/null
+++ b/tests/unit/commands/test_perf_composite.py
@@ -0,0 +1,249 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Tests for winml perf support of composite (multi-session) models.
+
+Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ONNX
+session; they orchestrate several sub-models. The perf benchmark must
+aggregate their io_configs and time the full ``forward()`` pass rather
+than reaching for a single ``_session``.
+
+Regression guard: previously ``PerfBenchmark`` assumed every model exposed
+``io_config`` / ``_session`` and raised ``AttributeError`` on composites.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any
+
+import numpy as np
+
+from winml.modelkit.commands.perf import (
+    BenchmarkConfig,
+    PerfBenchmark,
+    _aggregate_io_config,
+    _describe_outputs,
+)
+
+
+def _make_sub_model(
+    input_names: list[str],
+    input_shapes: list[list[int | None]],
+    input_types: list[str],
+    output_names: list[str],
+    output_shapes: list[list[int | None]],
+    *,
+    device: str = "GPU",
+    ep_name: str = "OpenVINOExecutionProvider",
+    precision: str | None = "fp16",
+) -> Any:
+    """Build a minimal stand-in for a WinMLAutoModel sub-component."""
+    io_config = {
+        "input_names": input_names,
+        "input_shapes": input_shapes,
+        "input_types": input_types,
+        "output_names": output_names,
+        "output_shapes": output_shapes,
+        "output_types": ["float32"] * len(output_names),
+        "precision": precision,
+    }
+    compiled: dict[str, bool] = {"compiled": False}
+
+    def _compile() -> None:
+        compiled["compiled"] = True
+
+    return SimpleNamespace(
+        io_config=io_config,
+        device=device,
+        ep_name=ep_name,
+        _session=SimpleNamespace(compile=_compile),
+        _compiled_flag=compiled,
+    )
+
+
+class _FakeComposite:
+    """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``)."""
+
+    def __init__(self, sub_models: dict[str, Any]) -> None:
+        self.sub_models = sub_models
+        self.call_log: list[dict[str, np.ndarray]] = []
+
+    def __call__(self, **kwargs: np.ndarray) -> dict[str, np.ndarray]:
+        self.call_log.append(kwargs)
+        # Mimics a composite's task-level forward() output (e.g. SigLIP):
+        # tensors that exist on no single sub-model's ONNX graph.
+        return {
+            "logits_per_image": np.zeros((1, 1), dtype=np.float32),
+            "image_embeds": np.zeros((1, 768), dtype=np.float32),
+            "text_embeds": np.zeros((1, 768), dtype=np.float32),
+        }
+
+
+def _siglip_like() -> _FakeComposite:
+    image_encoder = _make_sub_model(
+        input_names=["pixel_values"],
+        input_shapes=[[1, 3, 224, 224]],
+        input_types=["float32"],
+        output_names=["image_embeds"],
+        output_shapes=[[1, 768]],
+    )
+    text_encoder = _make_sub_model(
+        input_names=["input_ids", "attention_mask"],
+        input_shapes=[[1, 64], [1, 64]],
+        input_types=["int64", "int64"],
+        output_names=["text_embeds"],
+        output_shapes=[[1, 768]],
+    )
+    return _FakeComposite({"image-encoder": image_encoder, "text-encoder": text_encoder})
+
+
+class TestAggregateIoConfig:
+    """Unit tests for the io_config union helper."""
+
+    def test_union_dedupes_by_name_preserving_order(self) -> None:
+        model = _siglip_like()
+        agg = _aggregate_io_config(model.sub_models.values())
+
+        assert agg["input_names"] == ["pixel_values", "input_ids", "attention_mask"]
+        assert agg["input_shapes"] == [[1, 3, 224, 224], [1, 64], [1, 64]]
+        assert agg["input_types"] == ["float32", "int64", "int64"]
+        assert agg["output_names"] == ["image_embeds", "text_embeds"]
+
+    def test_shared_input_name_is_not_duplicated(self) -> None:
+        # Both encoders consume "attention_mask" -> it must appear once.
+        a = _make_sub_model(
+            ["input_ids", "attention_mask"],
+            [[1, 8], [1, 8]],
+            ["int64", "int64"],
+            ["a"],
+            [[1, 4]],
+        )
+        b = _make_sub_model(
+            ["attention_mask", "token_type_ids"],
+            [[1, 8], [1, 8]],
+            ["int64", "int64"],
+            ["b"],
+            [[1, 4]],
+        )
+        agg = _aggregate_io_config([a, b])
+        assert agg["input_names"] == ["input_ids", "attention_mask", "token_type_ids"]
+
+    def test_precision_taken_from_first_sub_model(self) -> None:
+        a = _make_sub_model(["x"], [[1]], ["float32"], ["y"], [[1]], precision="int8")
+        b = _make_sub_model(["z"], [[1]], ["float32"], ["w"], [[1]], precision="fp16")
+        assert _aggregate_io_config([a, b])["precision"] == "int8"
+
+
+class TestPerfBenchmarkComposite:
+    """PerfBenchmark must transparently handle composite models."""
+
+    def _benchmark(self) -> tuple[PerfBenchmark, _FakeComposite]:
+        config = BenchmarkConfig(
+            model_id="google/siglip-base-patch16-224",
+            task="zero-shot-image-classification",
+            device="gpu",
+            iterations=3,
+            warmup=1,
+        )
+        bench = PerfBenchmark(config)
+        model = _siglip_like()
+        bench._model = model  # bypass _load_model (no HF download in unit tests)
+        return bench, model
+
+    def test_detects_composite(self) -> None:
+        bench, _ = self._benchmark()
+        assert bench._is_composite is True
+
+    def test_resolved_io_config_is_aggregated_and_cached(self) -> None:
+        bench, _ = self._benchmark()
+        io = bench._resolved_io_config()
+        assert io["input_names"] == ["pixel_values", "input_ids", "attention_mask"]
+        # Cached: second call returns the same object.
+        assert bench._resolved_io_config() is io
+
+    def test_compile_compiles_every_sub_session(self) -> None:
+        bench, model = self._benchmark()
+        bench._compile_model()
+        assert all(s._compiled_flag["compiled"] for s in model.sub_models.values())
+
+    def test_generate_inputs_covers_all_sub_model_inputs(self) -> None:
+        bench, _ = self._benchmark()
+        bench._generate_inputs()
+        assert set(bench._inputs) == {"pixel_values", "input_ids", "attention_mask"}
+        assert bench._inputs["pixel_values"].shape == (1, 3, 224, 224)
+        assert bench._inputs["input_ids"].shape == (1, 64)
+
+    def test_resolved_device_ep_task_from_sub_model(self) -> None:
+        bench, _ = self._benchmark()
+        assert bench._resolved_device() == "GPU"
+        assert bench._resolved_ep() == "OpenVINOExecutionProvider"
+        assert bench._resolved_task() == "zero-shot-image-classification"
+
+    def test_simple_benchmark_times_full_forward(self) -> None:
+        bench, model = self._benchmark()
+        bench._generate_inputs()
+        stats = bench._run_benchmark_simple()
+
+        # warmup(1) + iterations(3) == 4 forward() calls; stats excludes warmup.
+        assert len(model.call_log) == 4
+        assert stats.count == 3
+        # forward() received the generated inputs as kwargs.
+        assert set(model.call_log[0]) == {"pixel_values", "input_ids", "attention_mask"}
+
+    def test_probe_replaces_outputs_with_real_forward_result(self) -> None:
+        # The aggregated view reports the image encoder's raw ONNX outputs;
+        # probing must replace them with the composite forward()'s outputs.
+        bench, _ = self._benchmark()
+        bench._generate_inputs()
+        assert bench._resolved_io_config()["output_names"] == ["image_embeds", "text_embeds"]
+
+        bench._probe_composite_outputs()
+        io = bench._resolved_io_config()
+        assert io["output_names"] == ["logits_per_image", "image_embeds", "text_embeds"]
+        assert io["output_shapes"] == [[1, 1], [1, 768], [1, 768]]
+
+    def test_collect_results_reports_probed_outputs(self) -> None:
+        bench, _ = self._benchmark()
+        bench._generate_inputs()
+        bench._probe_composite_outputs()
+        stats = bench._run_benchmark_simple()
+        result = bench._collect_results(stats)
+
+        assert result.input_names == ["pixel_values", "input_ids", "attention_mask"]
+        # Real composite outputs, not the deduped sub-model ONNX outputs.
+        assert result.output_names == ["logits_per_image", "image_embeds", "text_embeds"]
+        assert result.actual_device == "GPU"
+        assert result.actual_ep == "OpenVINOExecutionProvider"
+        assert result.actual_task == "zero-shot-image-classification"
+
+
+class TestDescribeOutputs:
+    """Unit tests for the architecture-agnostic forward()-output describer."""
+
+    def test_dict_output_named_fields(self) -> None:
+        out = {
+            "logits": np.zeros((2, 5), dtype=np.float32),
+            "embeds": np.zeros((2, 8), dtype=np.float32),
+        }
+        names, shapes, types = _describe_outputs(out)
+        assert names == ["logits", "embeds"]
+        assert shapes == [[2, 5], [2, 8]]
+        assert all("float32" in t for t in types)
+
+    def test_skips_none_and_non_array_fields(self) -> None:
+        out = {"a": np.zeros((1, 3)), "b": None, "c": "not-an-array"}
+        names, shapes, _ = _describe_outputs(out)
+        assert names == ["a"]
+        assert shapes == [[1, 3]]
+
+    def test_sequence_output_positional_names(self) -> None:
+        names, shapes, _ = _describe_outputs([np.zeros((1, 4)), np.zeros((1, 2))])
+        assert names == ["output_0", "output_1"]
+        assert shapes == [[1, 4], [1, 2]]
+
+    def test_single_tensor_output(self) -> None:
+        names, shapes, _ = _describe_outputs(np.zeros((3, 3)))
+        assert names == ["output_0"]
+        assert shapes == [[3, 3]]

From 0bd7bb4751c22e751d58a1ca54184411242ed84f Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Thu, 11 Jun 2026 09:48:01 +0800
Subject: [PATCH 2/9] mypy

---
 src/winml/modelkit/commands/perf.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 8e5c5542c..548e27d8b 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -20,7 +20,7 @@
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import click
 import numpy as np
@@ -37,6 +37,7 @@
     from collections.abc import Callable, Iterable
 
     from ..models.winml.base import WinMLPreTrainedModel
+    from ..models.winml.composite_model import WinMLCompositeModel
     from ..session.stats import PerfStats
 
 logger = logging.getLogger(__name__)
@@ -366,12 +367,16 @@ def _is_composite(self) -> bool:
         """Composite models orchestrate multiple sub-sessions (e.g. CLIP/SigLIP)."""
         return hasattr(self._model, "sub_models")
 
+    def _sub_models(self) -> dict[str, WinMLPreTrainedModel]:
+        """Sub-models of a composite model (only valid when ``_is_composite``)."""
+        return cast("WinMLCompositeModel", self._model).sub_models
+
     def _resolved_io_config(self) -> dict[str, Any]:
         """Unified io_config (aggregated across sub-models for composites)."""
         if self._io_config is None:
             assert self._model is not None
             if self._is_composite:
-                self._io_config = _aggregate_io_config(self._model.sub_models.values())
+                self._io_config = _aggregate_io_config(self._sub_models().values())
             else:
                 self._io_config = self._model.io_config
         return self._io_config
@@ -380,7 +385,7 @@ def _compile_model(self) -> None:
         """Compile the underlying ORT session(s) so device/EP are resolved."""
         assert self._model is not None
         if self._is_composite:
-            for sub in self._model.sub_models.values():
+            for sub in self._sub_models().values():
                 sub._session.compile()
         else:
             self._model._session.compile()
@@ -389,14 +394,14 @@ def _resolved_device(self) -> str:
         """Actual device bound after compile (representative sub-model for composites)."""
         assert self._model is not None
         if self._is_composite:
-            return next(iter(self._model.sub_models.values())).device
+            return next(iter(self._sub_models().values())).device
         return self._model.device
 
     def _resolved_ep(self) -> EPName | None:
         """Primary EP bound after compile (representative sub-model for composites)."""
         assert self._model is not None
         if self._is_composite:
-            return next(iter(self._model.sub_models.values())).ep_name
+            return next(iter(self._sub_models().values())).ep_name
         return self._model.ep_name
 
     def _resolved_task(self) -> str | None:
@@ -542,7 +547,9 @@ def _composite_run_iteration(self, stats: PerfStats) -> None:
         """Time one full composite forward() pass (orchestrates all sub-sessions)."""
         assert self._model is not None
         assert self._inputs is not None
-        stats.record(lambda: self._model(**self._inputs))
+        model = self._model
+        inputs = self._inputs
+        stats.record(lambda: model(**inputs))
 
     def _run_benchmark(self) -> PerfStats:
         """Execute benchmark iterations with timing."""
@@ -639,9 +646,10 @@ def run_iteration() -> None:
                     self._composite_run_iteration(stats)
             else:
                 session = self._model._session
+                inputs = self._inputs
 
                 def run_iteration() -> None:
-                    session.run(self._inputs)
+                    session.run(inputs)
 
             _run_monitored_loop(
                 run_iteration,
@@ -659,7 +667,7 @@ def run_iteration() -> None:
             if ep_dict:  # NullEPMonitor returns {}, real monitors return data
                 self._hw_metrics["ep_proof"] = ep_dict
 
-        return stats
+        return cast("PerfStats", stats)
 
     def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
         """Collect benchmark results from PerfStats."""

From 3fc29435d0b56361706926064364b669e5860d6a Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 14:01:58 +0800
Subject: [PATCH 3/9] what's now

---
 src/winml/modelkit/commands/perf.py        | 297 +++++++---------
 tests/unit/commands/test_perf_composite.py | 392 +++++++++++----------
 2 files changed, 331 insertions(+), 358 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 548e27d8b..614c59a49 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -14,7 +14,6 @@
 
 from __future__ import annotations
 
-import contextlib
 import json
 import logging
 from dataclasses import dataclass, field
@@ -34,7 +33,7 @@
 
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Iterable
+    from collections.abc import Callable
 
     from ..models.winml.base import WinMLPreTrainedModel
     from ..models.winml.composite_model import WinMLCompositeModel
@@ -262,81 +261,6 @@ def _resolve_shape(
     return tuple(resolved)
 
 
-def _aggregate_io_config(sub_models: Iterable[Any]) -> dict[str, Any]:
-    """Merge a composite model's sub-model io_configs into one unified view.
-
-    Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT
-    session; they orchestrate several sub-models. For benchmarking we present a
-    unified io_config whose inputs are the union of every sub-model's inputs
-    (deduplicated by name, order preserved) -- which is exactly the set of
-    keyword arguments the composite's ``forward()`` consumes. Outputs are
-    likewise unioned for display and result-reporting purposes.
-    """
-    agg: dict[str, Any] = {
-        "input_names": [],
-        "input_shapes": [],
-        "input_types": [],
-        "output_names": [],
-        "output_shapes": [],
-        "output_types": [],
-        "precision": None,
-    }
-    seen_in: set[str] = set()
-    seen_out: set[str] = set()
-    for sub in sub_models:
-        io = sub.io_config
-        for name, shape, dtype in zip(
-            io["input_names"], io["input_shapes"], io["input_types"], strict=True
-        ):
-            if name not in seen_in:
-                seen_in.add(name)
-                agg["input_names"].append(name)
-                agg["input_shapes"].append(shape)
-                agg["input_types"].append(dtype)
-        out_types = io.get("output_types") or [None] * len(io["output_names"])
-        for name, shape, dtype in zip(
-            io["output_names"], io["output_shapes"], out_types, strict=False
-        ):
-            if name not in seen_out:
-                seen_out.add(name)
-                agg["output_names"].append(name)
-                agg["output_shapes"].append(shape)
-                agg["output_types"].append(dtype)
-        if agg["precision"] is None:
-            agg["precision"] = io.get("precision")
-    return agg
-
-
-def _describe_outputs(output: Any) -> tuple[list[str], list[list[int]], list[str | None]]:
-    """Extract ``(names, shapes, dtypes)`` from a model ``forward()`` result.
-
-    Architecture-agnostic: handles HuggingFace ``ModelOutput`` / ``dict``
-    (named fields), plain sequences (positional ``output_N`` names), and a
-    single tensor. ``None`` fields and non-array values are skipped. Used to
-    report a composite model's real task-level outputs (e.g. ``logits``)
-    rather than its sub-models' raw ONNX outputs.
-    """
-    if hasattr(output, "items"):
-        pairs = list(output.items())
-    elif isinstance(output, (list, tuple)):
-        pairs = [(f"output_{i}", value) for i, value in enumerate(output)]
-    else:
-        pairs = [("output_0", output)]
-
-    names: list[str] = []
-    shapes: list[list[int]] = []
-    types: list[str | None] = []
-    for name, value in pairs:
-        shape = getattr(value, "shape", None)
-        if value is None or shape is None:
-            continue
-        names.append(name)
-        shapes.append([int(dim) for dim in shape])
-        dtype = getattr(value, "dtype", None)
-        types.append(str(dtype) if dtype is not None else None)
-    return names, shapes, types
-
-
 # =============================================================================
 # Benchmark Engine
 # =============================================================================
@@ -372,93 +296,83 @@ def _sub_models(self) -> dict[str, WinMLPreTrainedModel]:
         return cast("WinMLCompositeModel", self._model).sub_models
 
     def _resolved_io_config(self) -> dict[str, Any]:
-        """Unified io_config (aggregated across sub-models for composites)."""
+        """I/O config of the (single-session) model being benchmarked."""
         if self._io_config is None:
             assert self._model is not None
-            if self._is_composite:
-                self._io_config = _aggregate_io_config(self._sub_models().values())
-            else:
-                self._io_config = self._model.io_config
+            self._io_config = self._model.io_config
         return self._io_config
 
     def _compile_model(self) -> None:
-        """Compile the underlying ORT session(s) so device/EP are resolved."""
+        """Compile the underlying ORT session so device/EP are resolved."""
         assert self._model is not None
-        if self._is_composite:
-            for sub in self._sub_models().values():
-                sub._session.compile()
-        else:
-            self._model._session.compile()
+        self._model._session.compile()
 
     def _resolved_device(self) -> str:
-        """Actual device bound after compile (representative sub-model for composites)."""
+        """Actual device bound after compile."""
         assert self._model is not None
-        if self._is_composite:
-            return next(iter(self._sub_models().values())).device
         return self._model.device
 
     def _resolved_ep(self) -> EPName | None:
-        """Primary EP bound after compile (representative sub-model for composites)."""
+        """Primary EP bound after compile."""
         assert self._model is not None
-        if self._is_composite:
-            return next(iter(self._sub_models().values())).ep_name
         return self._model.ep_name
 
     def _resolved_task(self) -> str | None:
-        """Resolved task; composites fall back to the requested task."""
+        """Resolved task; falls back to the requested task."""
         assert self._model is not None
-        if self._is_composite:
-            return self.config.task
         return self._model.task or self.config.task
 
-    def _probe_composite_outputs(self) -> None:
-        """Overwrite a composite's reported outputs with its real forward() result.
+    def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]:
+        """Execute full benchmark pipeline.
 
-        Runs one ``forward()`` pass (an extra warmup) and introspects the
-        returned object so the displayed/reported outputs reflect the
-        composite's task-level tensors (e.g. ``logits_per_image``) instead of
-        the deduplicated union of its sub-models' raw ONNX outputs. Falls back
-        to the aggregated view if the probe fails or yields nothing.
+        Returns:
+            A single ``BenchmarkResult`` for single-session models, or a
+            ``{sub_model_name: BenchmarkResult}`` mapping for composite models
+            (e.g. CLIP/SigLIP dual-encoders). Composite models have no single
+            ORT session, so each sub-model is benchmarked individually rather
+            than timing the aggregate ``forward()`` pass.
         """
+        # [1] Load model
+        logger.info("Loading model: %s", self.config.model_id)
+        self._load_model()
         assert self._model is not None
-        assert self._inputs is not None
-        try:
-            output = self._model(**self._inputs)
-        except Exception:  # best-effort display only; never fail the run
-            logger.debug("Composite output probe failed; keeping aggregated view", exc_info=True)
-            return
 
-        names, shapes, types = _describe_outputs(output)
-        if not names:
-            return
-        io = self._resolved_io_config()
-        io["output_names"] = names
-        io["output_shapes"] = shapes
-        io["output_types"] = types
+        if self._is_composite:
+            return self._run_sub_models()
+        return self._run_single()
 
-    def run(self) -> BenchmarkResult:
-        """Execute full benchmark pipeline.
+    def _run_sub_models(self) -> dict[str, BenchmarkResult]:
+        """Benchmark each sub-model of a composite individually.
+
+        Each sub-model is itself a single-session ``WinMLAutoModel``, so it is
+        benchmarked through the standard single-model pipeline by spawning a
+        child ``PerfBenchmark`` with the already-loaded sub-model. Results are
+        keyed by sub-model name for per-component reporting.
+        """
+        results: dict[str, BenchmarkResult] = {}
+        for name, sub in self._sub_models().items():
+            logger.info("Benchmarking sub-model '%s'", name)
+            Console(stderr=True).print(f"\n[bold]Sub-model:[/bold] {name}")
+            child = PerfBenchmark(self.config)
+            child._model = sub
+            results[name] = child._run_single()
+        return results
+
+    def _run_single(self) -> BenchmarkResult:
+        """Benchmark the loaded single-session model.
 
         Returns:
             BenchmarkResult with timing statistics
         """
-        # [1] Load model
-        logger.info("Loading model: %s", self.config.model_id)
-        self._load_model()
         assert self._model is not None
 
         # [2] Generate inputs
         logger.info("Generating benchmark inputs")
         self._generate_inputs()
 
-        # Compile session(s) early so model.device is resolved for display
+        # Compile session early so model.device is resolved for display
         self._compile_model()
 
-        # Composite forward() returns task-level outputs (e.g. logits) that
-        # don't map to any single sub-model's ONNX outputs; probe the real ones.
-        if self._is_composite:
-            self._probe_composite_outputs()
-
         # Print model info before benchmark starts
         _print_model_info(
             self._resolved_io_config(),
@@ -543,14 +457,6 @@ def _generate_inputs(self) -> None:
             batch_size=self.config.batch_size,
         )
 
-    def _composite_run_iteration(self, stats: PerfStats) -> None:
-        """Time one full composite forward() pass (orchestrates all sub-sessions)."""
-        assert self._model is not None
-        assert self._inputs is not None
-        model = self._model
-        inputs = self._inputs
-        stats.record(lambda: model(**inputs))
-
     def _run_benchmark(self) -> PerfStats:
         """Execute benchmark iterations with timing."""
         if self.config.monitor:
@@ -563,18 +469,6 @@ def _run_benchmark_simple(self) -> PerfStats:
         assert self._inputs is not None
         total_iterations = self.config.warmup + self.config.iterations
 
-        # Composite models have no single ORT session; time the full forward()
-        # pass with an external PerfStats instead of the session's perf() hook.
-        if self._is_composite:
-            from ..session.stats import PerfStats
-
-            stats = PerfStats(warmup=self.config.warmup)
-            for i in range(total_iterations):
-                self._composite_run_iteration(stats)
-                if (i + 1) % max(1, total_iterations // 10) == 0:
-                    logger.debug("Progress: %d/%d", i + 1, total_iterations)
-            return stats
-
         session = self._model._session
         with session.perf(warmup=self.config.warmup) as stats:
             _run_simple_loop(session, self._inputs, total_iterations)
@@ -592,7 +486,6 @@ def _run_benchmark_monitored(self) -> PerfStats:
         from ..session.monitor.ep_monitor import NullEPMonitor
         from ..session.monitor.hw_monitor import HWMonitor
         from ..session.monitor.vitisai_monitor import VitisAIMonitor
-        from ..session.stats import PerfStats
 
         assert self._model is not None
         assert self._inputs is not None
@@ -627,29 +520,16 @@ def _run_benchmark_monitored(self) -> PerfStats:
         else:
             ep_monitor = NullEPMonitor()
 
-        # Composite models time the full forward() pass via an external
-        # PerfStats; single-session models record pure-ORT time inside the
-        # session's perf() context. The run callable abstracts that difference.
-        if self._is_composite:
-            stats_cm: Any = contextlib.nullcontext(PerfStats(warmup=self.config.warmup))
-        else:
-            stats_cm = self._model._session.perf(warmup=self.config.warmup)
-
         with (
-            stats_cm as stats,
+            self._model._session.perf(warmup=self.config.warmup) as stats,
             hw_monitor as hw,
             ep_monitor as ep_mon,
         ):
-            if self._is_composite:
-
-                def run_iteration() -> None:
-                    self._composite_run_iteration(stats)
-            else:
-                session = self._model._session
-                inputs = self._inputs
+            session = self._model._session
+            inputs = self._inputs
 
-                def run_iteration() -> None:
-                    session.run(inputs)
+            def run_iteration() -> None:
+                session.run(inputs)
 
             _run_monitored_loop(
                 run_iteration,
@@ -1084,6 +964,74 @@ def write_json_report(result: BenchmarkResult, output_path: Path) -> None:
         json.dump(result.to_dict(), f, indent=2)
 
 
+def _composite_report_dict(
+    results: dict[str, BenchmarkResult],
+    *,
+    model_id: str,
+    task: str | None,
+) -> dict[str, Any]:
+    """Build the combined JSON report for a composite model's sub-models."""
+    return {
+        "model_id": model_id,
+        "task": task,
+        "component_count": len(results),
+        "components": {name: result.to_dict() for name, result in results.items()},
+    }
+
+
+def report_composite_results(
+    results: dict[str, BenchmarkResult],
+    *,
+    console: Console,
+    json_mode: bool,
+    output_path: Path,
+    model_id: str,
+    task: str | None,
+) -> None:
+    """Display and persist per-sub-model results for a composite model.
+
+    Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT
+    session; each sub-model is benchmarked individually (like ``--module``)
+    and reported as its own summary row rather than timing the aggregate
+    ``forward()`` pass. The combined JSON nests each sub-model's full
+    ``BenchmarkResult.to_dict()`` under ``components``.
+    """
+    combined = _composite_report_dict(results, model_id=model_id, task=task)
+
+    if json_mode:
+        click.echo(json.dumps(combined, indent=2))
+    else:
+        table = Table(title="Per-Sub-Model Perf", show_header=True)
+        table.add_column("Sub-Model", style="cyan")
+        table.add_column("Task")
+        table.add_column("Device")
+        table.add_column("Mean (ms)", justify="right")
+        table.add_column("P90 (ms)", justify="right")
+        table.add_column("Min (ms)", justify="right")
+        table.add_column("Max (ms)", justify="right")
+        for name, result in results.items():
+            device_str = _device_string(
+                result.config.device, result.actual_device, result.actual_ep
+            )
+            table.add_row(
+                name,
+                result.actual_task,
+                device_str,
+                f"{result.mean_ms:.2f}",
+                f"{result.p90_ms:.2f}",
+                f"{result.min_ms:.2f}",
+                f"{result.max_ms:.2f}",
+            )
+        console.print()
+        console.print(table)
+        console.print()
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("w", encoding="utf-8") as f:
+        json.dump(combined, f, indent=2)
+
+
 def generate_output_path(model_id: str, *, module_class: str | None = None) -> Path:
     r"""Generate default output path under the user's cache directory.
 
@@ -1503,6 +1451,21 @@ def perf(
         benchmark = PerfBenchmark(config)
         result = benchmark.run()
 
+        # Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ORT
+        # session; each sub-model is benchmarked individually and reported as
+        # its own row (like --module), not as one aggregate forward() timing.
+        if isinstance(result, dict):
+            report_composite_results(
+                result,
+                console=console,
+                json_mode=json_mode,
+                output_path=output,
+                model_id=hf_model,
+                task=task,
+            )
+            console.print(f"[green]Results saved to:[/green] {output}")
+            return
+
         # Display results
         if json_mode:
             click.echo(json.dumps(result.to_dict(), indent=2))
diff --git a/tests/unit/commands/test_perf_composite.py b/tests/unit/commands/test_perf_composite.py
index 42c9defef..1e99da930 100644
--- a/tests/unit/commands/test_perf_composite.py
+++ b/tests/unit/commands/test_perf_composite.py
@@ -5,9 +5,9 @@
 """Tests for winml perf support of composite (multi-session) models.
 
 Composite models (e.g. CLIP/SigLIP dual-encoders) have no single ONNX
-session; they orchestrate several sub-models. The perf benchmark must
-aggregate their io_configs and time the full ``forward()`` pass rather
-than reaching for a single ``_session``.
+session; they orchestrate several sub-models. ``winml perf`` benchmarks
+each sub-model individually (like ``--module``) and reports one row per
+sub-model rather than timing the aggregate ``forward()`` pass.
 
 Regression guard: previously ``PerfBenchmark`` assumed every model exposed
 ``io_config`` / ``_session`` and raised ``AttributeError`` on composites.
@@ -15,32 +15,99 @@
 
 from __future__ import annotations
 
-from types import SimpleNamespace
-from typing import Any
+import json
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Any
 
-import numpy as np
+from rich.console import Console
 
 from winml.modelkit.commands.perf import (
     BenchmarkConfig,
+    BenchmarkResult,
     PerfBenchmark,
-    _aggregate_io_config,
-    _describe_outputs,
+    report_composite_results,
 )
+from winml.modelkit.session.stats import PerfStats
 
 
-def _make_sub_model(
+if TYPE_CHECKING:
+    from collections.abc import Generator
+    from pathlib import Path
+
+
+class _FakeSession:
+    """Stand-in for a WinMLSession that times runs via a real PerfStats."""
+
+    def __init__(self, io_config: dict[str, Any], device: str, ep_name: str) -> None:
+        self.io_config = io_config
+        self.device = device
+        self.ep_name = ep_name
+        self.compiled = False
+        self.run_log: list[dict[str, Any]] = []
+        self._perf_stats: PerfStats | None = None
+
+    def compile(self) -> None:
+        self.compiled = True
+
+    @contextmanager
+    def perf(self, warmup: int = 0) -> Generator[PerfStats, None, None]:
+        self._perf_stats = PerfStats(warmup=warmup)
+        try:
+            yield self._perf_stats
+        finally:
+            self._perf_stats = None
+
+    def run(self, inputs: dict[str, Any]) -> dict[str, Any]:
+        self.run_log.append(inputs)
+        if self._perf_stats is not None:
+            self._perf_stats.record(lambda: None)
+        return {}
+
+
+class _FakeSubModel:
+    """Stand-in for a single-session WinMLAutoModel sub-component."""
+
+    def __init__(
+        self,
+        io_config: dict[str, Any],
+        task: str,
+        *,
+        device: str = "GPU",
+        ep_name: str = "OpenVINOExecutionProvider",
+    ) -> None:
+        self._session = _FakeSession(io_config, device, ep_name)
+        self.task = task
+
+    @property
+    def io_config(self) -> dict[str, Any]:
+        return self._session.io_config
+
+    @property
+    def device(self) -> str:
+        return self._session.device
+
+    @property
+    def ep_name(self) -> str:
+        return self._session.ep_name
+
+
+class _FakeComposite:
+    """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``)."""
+
+    def __init__(self, sub_models: dict[str, Any]) -> None:
+        self.sub_models = sub_models
+
+
+def _io_config(
     input_names: list[str],
-    input_shapes: list[list[int | None]],
+    input_shapes: list[list[int]],
     input_types: list[str],
     output_names: list[str],
-    output_shapes: list[list[int | None]],
+    output_shapes: list[list[int]],
     *,
-    device: str = "GPU",
-    ep_name: str = "OpenVINOExecutionProvider",
     precision: str | None = "fp16",
-) -> Any:
-    """Build a minimal stand-in for a WinMLAutoModel sub-component."""
-    io_config = {
+) -> dict[str, Any]:
+    return {
         "input_names": input_names,
         "input_shapes": input_shapes,
         "input_types": input_types,
@@ -49,201 +116,144 @@ def _make_sub_model(
         "output_types": ["float32"] * len(output_names),
         "precision": precision,
     }
-    compiled: dict[str, bool] = {"compiled": False}
 
-    def _compile() -> None:
-        compiled["compiled"] = True
 
-    return SimpleNamespace(
-        io_config=io_config,
-        device=device,
-        ep_name=ep_name,
-        _session=SimpleNamespace(compile=_compile),
-        _compiled_flag=compiled,
+def _siglip_like() -> _FakeComposite:
+    image_encoder = _FakeSubModel(
+        _io_config(
+            ["pixel_values"],
+            [[1, 3, 224, 224]],
+            ["float32"],
+            ["image_embeds"],
+            [[1, 768]],
+        ),
+        task="image-feature-extraction",
+    )
+    text_encoder = _FakeSubModel(
+        _io_config(
+            ["input_ids", "attention_mask"],
+            [[1, 64], [1, 64]],
+            ["int64", "int64"],
+            ["text_embeds"],
+            [[1, 768]],
+        ),
+        task="feature-extraction",
     )
+    return _FakeComposite({"image-encoder": image_encoder, "text-encoder": text_encoder})
 
 
-class _FakeComposite:
-    """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``)."""
+def _composite_benchmark() -> tuple[PerfBenchmark, _FakeComposite]:
+    config = BenchmarkConfig(
+        model_id="google/siglip-base-patch16-224",
+        task="zero-shot-image-classification",
+        device="gpu",
+        iterations=3,
+        warmup=1,
+    )
+    bench = PerfBenchmark(config)
+    model = _siglip_like()
+    bench._model = model  # bypass _load_model (no HF download in unit tests)
+    return bench, model
 
-    def __init__(self, sub_models: dict[str, Any]) -> None:
-        self.sub_models = sub_models
-        self.call_log: list[dict[str, np.ndarray]] = []
 
-    def __call__(self, **kwargs: np.ndarray) -> dict[str, np.ndarray]:
-        self.call_log.append(kwargs)
-        # Mimics a composite's task-level forward() output (e.g. SigLIP):
-        # tensors that exist on no single sub-model's ONNX graph.
-        return {
-            "logits_per_image": np.zeros((1, 1), dtype=np.float32),
-            "image_embeds": np.zeros((1, 768), dtype=np.float32),
-            "text_embeds": np.zeros((1, 768), dtype=np.float32),
-        }
+class TestPerfBenchmarkComposite:
+    """PerfBenchmark benchmarks each sub-model of a composite individually."""
 
+    def test_detects_composite(self) -> None:
+        bench, _ = _composite_benchmark()
+        assert bench._is_composite is True
 
-def _siglip_like() -> _FakeComposite:
-    image_encoder = _make_sub_model(
-        input_names=["pixel_values"],
-        input_shapes=[[1, 3, 224, 224]],
-        input_types=["float32"],
-        output_names=["image_embeds"],
-        output_shapes=[[1, 768]],
-    )
-    text_encoder = _make_sub_model(
-        input_names=["input_ids", "attention_mask"],
-        input_shapes=[[1, 64], [1, 64]],
-        input_types=["int64", "int64"],
-        output_names=["text_embeds"],
-        output_shapes=[[1, 768]],
-    )
-    return _FakeComposite({"image-encoder": image_encoder, "text-encoder": text_encoder})
+    def test_run_returns_result_per_sub_model(self) -> None:
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
 
+        assert set(results) == {"image-encoder", "text-encoder"}
+        assert all(isinstance(r, BenchmarkResult) for r in results.values())
 
-class TestAggregateIoConfig:
-    """Unit tests for the io_config union helper."""
+    def test_each_sub_model_reports_its_own_io(self) -> None:
+        # No aggregation: each result carries only its sub-model's inputs.
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
 
-    def test_union_dedupes_by_name_preserving_order(self) -> None:
-        model = _siglip_like()
-        agg = _aggregate_io_config(model.sub_models.values())
+        assert results["image-encoder"].input_names == ["pixel_values"]
+        assert results["text-encoder"].input_names == ["input_ids", "attention_mask"]
+        assert results["image-encoder"].output_names == ["image_embeds"]
+        assert results["text-encoder"].output_names == ["text_embeds"]
 
-        assert agg["input_names"] == ["pixel_values", "input_ids", "attention_mask"]
-        assert agg["input_shapes"] == [[1, 3, 224, 224], [1, 64], [1, 64]]
-        assert agg["input_types"] == ["float32", "int64", "int64"]
-        assert agg["output_names"] == ["image_embeds", "text_embeds"]
+    def test_each_sub_model_reports_its_own_task(self) -> None:
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
 
-    def test_shared_input_name_is_not_duplicated(self) -> None:
-        # Both encoders consume "attention_mask" -> it must appear once.
-        a = _make_sub_model(
-            ["input_ids", "attention_mask"],
-            [[1, 8], [1, 8]],
-            ["int64", "int64"],
-            ["a"],
-            [[1, 4]],
-        )
-        b = _make_sub_model(
-            ["attention_mask", "token_type_ids"],
-            [[1, 8], [1, 8]],
-            ["int64", "int64"],
-            ["b"],
-            [[1, 4]],
-        )
-        agg = _aggregate_io_config([a, b])
-        assert agg["input_names"] == ["input_ids", "attention_mask", "token_type_ids"]
+        assert results["image-encoder"].actual_task == "image-feature-extraction"
+        assert results["text-encoder"].actual_task == "feature-extraction"
 
-    def test_precision_taken_from_first_sub_model(self) -> None:
-        a = _make_sub_model(["x"], [[1]], ["float32"], ["y"], [[1]], precision="int8")
-        b = _make_sub_model(["z"], [[1]], ["float32"], ["w"], [[1]], precision="fp16")
-        assert _aggregate_io_config([a, b])["precision"] == "int8"
+    def test_resolved_device_and_ep_per_sub_model(self) -> None:
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
 
+        for result in results.values():
+            assert result.actual_device == "GPU"
+            assert result.actual_ep == "OpenVINOExecutionProvider"
 
-class TestPerfBenchmarkComposite:
-    """PerfBenchmark must transparently handle composite models."""
+    def test_compiles_and_runs_every_sub_session(self) -> None:
+        bench, model = _composite_benchmark()
+        bench._run_sub_models()
+
+        for sub in model.sub_models.values():
+            assert sub._session.compiled is True
+            # warmup(1) + iterations(3) == 4 run() calls per sub-session.
+            assert len(sub._session.run_log) == 4
+
+    def test_each_sub_model_stats_exclude_warmup(self) -> None:
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
+
+        for result in results.values():
+            assert len(result.raw_samples_ms) == 3
+
+
+class TestReportCompositeResults:
+    """report_composite_results writes a combined per-component JSON report."""
 
-    def _benchmark(self) -> tuple[PerfBenchmark, _FakeComposite]:
-        config = BenchmarkConfig(
+    def test_combined_json_nests_each_component(self, tmp_path: Path) -> None:
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
+        output = tmp_path / "perf.json"
+
+        report_composite_results(
+            results,
+            console=Console(),
+            json_mode=False,
+            output_path=output,
             model_id="google/siglip-base-patch16-224",
             task="zero-shot-image-classification",
-            device="gpu",
-            iterations=3,
-            warmup=1,
         )
-        bench = PerfBenchmark(config)
-        model = _siglip_like()
-        bench._model = model  # bypass _load_model (no HF download in unit tests)
-        return bench, model
 
-    def test_detects_composite(self) -> None:
-        bench, _ = self._benchmark()
-        assert bench._is_composite is True
+        data = json.loads(output.read_text())
+        assert data["model_id"] == "google/siglip-base-patch16-224"
+        assert data["task"] == "zero-shot-image-classification"
+        assert data["component_count"] == 2
+        assert set(data["components"]) == {"image-encoder", "text-encoder"}
+        # Each component holds a full BenchmarkResult.to_dict() payload.
+        img = data["components"]["image-encoder"]
+        assert img["model_info"]["input_names"] == ["pixel_values"]
+        assert "latency_ms" in img
+
+    def test_json_mode_emits_combined_payload_to_stdout(self, tmp_path: Path, capsys: Any) -> None:
+        bench, _ = _composite_benchmark()
+        results = bench._run_sub_models()
+        output = tmp_path / "perf.json"
+
+        report_composite_results(
+            results,
+            console=Console(stderr=True),
+            json_mode=True,
+            output_path=output,
+            model_id="google/siglip-base-patch16-224",
+            task="zero-shot-image-classification",
+        )
 
-    def test_resolved_io_config_is_aggregated_and_cached(self) -> None:
-        bench, _ = self._benchmark()
-        io = bench._resolved_io_config()
-        assert io["input_names"] == ["pixel_values", "input_ids", "attention_mask"]
-        # Cached: second call returns the same object.
-        assert bench._resolved_io_config() is io
-
-    def test_compile_compiles_every_sub_session(self) -> None:
-        bench, model = self._benchmark()
-        bench._compile_model()
-        assert all(s._compiled_flag["compiled"] for s in model.sub_models.values())
-
-    def test_generate_inputs_covers_all_sub_model_inputs(self) -> None:
-        bench, _ = self._benchmark()
-        bench._generate_inputs()
-        assert set(bench._inputs) == {"pixel_values", "input_ids", "attention_mask"}
-        assert bench._inputs["pixel_values"].shape == (1, 3, 224, 224)
-        assert bench._inputs["input_ids"].shape == (1, 64)
-
-    def test_resolved_device_ep_task_from_sub_model(self) -> None:
-        bench, _ = self._benchmark()
-        assert bench._resolved_device() == "GPU"
-        assert bench._resolved_ep() == "OpenVINOExecutionProvider"
-        assert bench._resolved_task() == "zero-shot-image-classification"
-
-    def test_simple_benchmark_times_full_forward(self) -> None:
-        bench, model = self._benchmark()
-        bench._generate_inputs()
-        stats = bench._run_benchmark_simple()
-
-        # warmup(1) + iterations(3) == 4 forward() calls; stats excludes warmup.
-        assert len(model.call_log) == 4
-        assert stats.count == 3
-        # forward() received the generated inputs as kwargs.
-        assert set(model.call_log[0]) == {"pixel_values", "input_ids", "attention_mask"}
-
-    def test_probe_replaces_outputs_with_real_forward_result(self) -> None:
-        # The aggregated view reports the image encoder's raw ONNX outputs;
-        # probing must replace them with the composite forward()'s outputs.
-        bench, _ = self._benchmark()
-        bench._generate_inputs()
-        assert bench._resolved_io_config()["output_names"] == ["image_embeds", "text_embeds"]
-
-        bench._probe_composite_outputs()
-        io = bench._resolved_io_config()
-        assert io["output_names"] == ["logits_per_image", "image_embeds", "text_embeds"]
-        assert io["output_shapes"] == [[1, 1], [1, 768], [1, 768]]
-
-    def test_collect_results_reports_probed_outputs(self) -> None:
-        bench, _ = self._benchmark()
-        bench._generate_inputs()
-        bench._probe_composite_outputs()
-        stats = bench._run_benchmark_simple()
-        result = bench._collect_results(stats)
-
-        assert result.input_names == ["pixel_values", "input_ids", "attention_mask"]
-        # Real composite outputs, not the deduped sub-model ONNX outputs.
-        assert result.output_names == ["logits_per_image", "image_embeds", "text_embeds"]
-        assert result.actual_device == "GPU"
-        assert result.actual_ep == "OpenVINOExecutionProvider"
-        assert result.actual_task == "zero-shot-image-classification"
-
-
-class TestDescribeOutputs:
-    """Unit tests for the architecture-agnostic forward()-output describer."""
-
-    def test_dict_output_named_fields(self) -> None:
-        out = {
-            "logits": np.zeros((2, 5), dtype=np.float32),
-            "embeds": np.zeros((2, 8), dtype=np.float32),
-        }
-        names, shapes, types = _describe_outputs(out)
-        assert names == ["logits", "embeds"]
-        assert shapes == [[2, 5], [2, 8]]
-        assert all("float32" in t for t in types)
-
-    def test_skips_none_and_non_array_fields(self) -> None:
-        out = {"a": np.zeros((1, 3)), "b": None, "c": "not-an-array"}
-        names, shapes, _ = _describe_outputs(out)
-        assert names == ["a"]
-        assert shapes == [[1, 3]]
-
-    def test_sequence_output_positional_names(self) -> None:
-        names, shapes, _ = _describe_outputs([np.zeros((1, 4)), np.zeros((1, 2))])
-        assert names == ["output_0", "output_1"]
-        assert shapes == [[1, 4], [1, 2]]
-
-    def test_single_tensor_output(self) -> None:
-        names, shapes, _ = _describe_outputs(np.zeros((3, 3)))
-        assert names == ["output_0"]
-        assert shapes == [[3, 3]]
+        payload = json.loads(capsys.readouterr().out)
+        assert set(payload["components"]) == {"image-encoder", "text-encoder"}
+        # File is written regardless of json_mode.
+        assert output.exists()

From 84456b6567303c1c49ed4e3729acd2b256e5e211 Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 15:06:38 +0800
Subject: [PATCH 4/9] refactor(perf): type _model as single-or-composite

---
 src/winml/modelkit/commands/perf.py | 53 ++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 614c59a49..2dfd1be2a 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -282,45 +282,62 @@ class PerfBenchmark:
     def __init__(self, config: BenchmarkConfig) -> None:
         """Initialize benchmark with configuration."""
         self.config = config
-        self._model: WinMLPreTrainedModel | None = None
+        self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
         self._io_config: dict[str, Any] | None = None
 
     @property
     def _is_composite(self) -> bool:
-        """Composite models orchestrate multiple sub-sessions (e.g. CLIP/SigLIP)."""
+        """Composite models orchestrate multiple sub-sessions (e.g. CLIP/SigLIP).
+
+        Duck-typed on ``sub_models`` rather than ``isinstance(..., WinMLCompositeModel)``
+        on purpose: an ``isinstance`` check needs a runtime import of
+        ``composite_model``, which imports torch and would blow the
+        ``winml perf --help`` import budget (see tests/cli/test_import_time.py) by
+        pulling torch in at module load. Keeping ``WinMLCompositeModel`` a
+        TYPE_CHECKING-only import also lets the unit tests use lightweight
+        duck-typed fakes instead of constructing real torch-backed composites.
+        ``sub_models`` is the defining member of the composite base, so it is a
+        reliable marker.
+        """
         return hasattr(self._model, "sub_models")
 
     def _sub_models(self) -> dict[str, WinMLPreTrainedModel]:
         """Sub-models of a composite model (only valid when ``_is_composite``)."""
         return cast("WinMLCompositeModel", self._model).sub_models
 
+    @property
+    def _single(self) -> WinMLPreTrainedModel:
+        """The model under benchmark, narrowed to a single-session model.
+
+        Only valid for non-composite models: composites dispatch to
+        ``_run_sub_models``, which benchmarks each sub-model through a child
+        ``PerfBenchmark`` whose ``_model`` is itself single-session.
+        """
+        assert self._model is not None
+        return cast("WinMLPreTrainedModel", self._model)
+
     def _resolved_io_config(self) -> dict[str, Any]:
         """I/O config of the (single-session) model being benchmarked."""
         if self._io_config is None:
-            assert self._model is not None
-            self._io_config = self._model.io_config
+            self._io_config = self._single.io_config
         return self._io_config
 
     def _compile_model(self) -> None:
         """Compile the underlying ORT session so device/EP are resolved."""
-        assert self._model is not None
-        self._model._session.compile()
+        self._single._session.compile()
 
     def _resolved_device(self) -> str:
         """Actual device bound after compile."""
-        assert self._model is not None
-        return self._model.device
+        return self._single.device
 
     def _resolved_ep(self) -> EPName | None:
         """Primary EP bound after compile."""
-        assert self._model is not None
-        return self._model.ep_name
+        return self._single.ep_name
 
     def _resolved_task(self) -> str | None:
         """Resolved task; falls back to the requested task."""
-        assert self._model is not None
-        return self._model.task or self.config.task
+        return self._single.task or self.config.task
 
     def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]:
         """Execute full benchmark pipeline.
@@ -465,11 +482,10 @@ def _run_benchmark(self) -> PerfStats:
 
     def _run_benchmark_simple(self) -> PerfStats:
         """Execute benchmark without live monitoring."""
-        assert self._model is not None
         assert self._inputs is not None
         total_iterations = self.config.warmup + self.config.iterations
 
-        session = self._model._session
+        session = self._single._session
         with session.perf(warmup=self.config.warmup) as stats:
             _run_simple_loop(session, self._inputs, total_iterations)
 
@@ -487,7 +503,6 @@ def _run_benchmark_monitored(self) -> PerfStats:
         from ..session.monitor.hw_monitor import HWMonitor
         from ..session.monitor.vitisai_monitor import VitisAIMonitor
 
-        assert self._model is not None
         assert self._inputs is not None
         total_iterations = self.config.warmup + self.config.iterations
 
@@ -520,12 +535,12 @@ def _run_benchmark_monitored(self) -> PerfStats:
         else:
             ep_monitor = NullEPMonitor()
 
+        session = self._single._session
         with (
-            self._model._session.perf(warmup=self.config.warmup) as stats,
+            session.perf(warmup=self.config.warmup) as stats,
             hw_monitor as hw,
             ep_monitor as ep_mon,
         ):
-            session = self._model._session
             inputs = self._inputs
 
             def run_iteration() -> None:
@@ -1497,9 +1512,7 @@ def perf(
             # For HF models the ONNX is built internally by PerfBenchmark.
             try:
                 onnx_for_trace = (
-                    model_path
-                    if is_onnx
-                    else (benchmark._model._onnx_path if benchmark._model else None)
+                    model_path if is_onnx else getattr(benchmark._model, "_onnx_path", None)
                 )
                 if onnx_for_trace is None:
                     raise AttributeError("benchmark._model not initialized")

From dad558af47252e8949b0394a9d7fd4916e2bcd2a Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 15:15:37 +0800
Subject: [PATCH 5/9] fix(perf): drop PerfStats cast on return to satisfy mypy

---
 src/winml/modelkit/commands/perf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 2dfd1be2a..4b5325960 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -562,7 +562,7 @@ def run_iteration() -> None:
             if ep_dict:  # NullEPMonitor returns {}, real monitors return data
                 self._hw_metrics["ep_proof"] = ep_dict
 
-        return cast("PerfStats", stats)
+        return stats
 
     def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
         """Collect benchmark results from PerfStats."""

From 7d102bd0ea6db0357e52dd363a6d9651dae0cc4d Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 16:18:08 +0800
Subject: [PATCH 6/9] fix(perf): honor no_compile independently of no_quantize

---
 src/winml/modelkit/commands/perf.py     |  9 ++-
 src/winml/modelkit/models/auto.py       |  2 +
 tests/unit/commands/test_perf_module.py | 90 +++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index ba11eae47..55c456ea4 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -629,6 +629,7 @@ def _perf_modules(
     warmup: int,
     batch_size: int,
     no_quantize: bool,
+    no_compile: bool,
     output: Path | None,
     verbose: bool,
     console: Console,
@@ -651,7 +652,8 @@ def _perf_modules(
         iterations: Number of benchmark iterations.
         warmup: Number of warmup iterations.
         batch_size: Batch size for input generation.
-        no_quantize: If True, skip quantization and compilation.
+        no_quantize: If True, skip quantization during the per-module build.
+        no_compile: If True, skip the build's compile stage for each module.
         output: Output JSON path, or None for auto-generated path.
         verbose: If True, log exceptions at DEBUG level.
         console: Rich console for output.
@@ -745,9 +747,11 @@ def _perf_modules(
 
         submodule = parent_model.get_submodule(module_path)
 
-        # Skip quant/compile for faster iteration when requested
+        # Skip quant/compile for faster iteration when requested. Quantization
+        # and compilation are independent toggles (mirrors the single-model path).
         if no_quantize:
             cfg.quant = None
+        if no_compile:
             cfg.compile = None
 
         with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
@@ -1394,6 +1398,7 @@ def perf(
             warmup=warmup,
             batch_size=batch_size,
             no_quantize=not quantize,
+            no_compile=no_compile,
             output=output,
             verbose=bool(verbose),
             console=console,
diff --git a/src/winml/modelkit/models/auto.py b/src/winml/modelkit/models/auto.py
index a8e4036c3..4fe697938 100644
--- a/src/winml/modelkit/models/auto.py
+++ b/src/winml/modelkit/models/auto.py
@@ -160,6 +160,7 @@ def from_onnx(
                 use_cache=use_cache,
                 force_rebuild=force_rebuild,
                 skip_build=skip_build,
+                no_compile=no_compile,
                 session_options=session_options,
                 **kwargs,
             )
@@ -365,6 +366,7 @@ def from_pretrained(
                     config=config,
                     cache_dir=cache_dir,
                     allow_unsupported_nodes=allow_unsupported_nodes,
+                    no_compile=no_compile,
                     **kwargs,
                 )
 
diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py
index 051bae425..210401ec1 100644
--- a/tests/unit/commands/test_perf_module.py
+++ b/tests/unit/commands/test_perf_module.py
@@ -218,3 +218,93 @@ def test_device_and_ep_forwarded_through_module_path(self, tmp_path: Path) -> No
         session_kwargs = mock_session_cls.call_args.kwargs
         assert session_kwargs["device"] == "npu"
         assert session_kwargs["ep"] == "qnn"
+
+
+class TestPerfModuleQuantCompileToggles:
+    """--no-quantize and --compile/--no-compile clear cfg.quant / cfg.compile
+    independently in the per-module build (mirrors the single-model path)."""
+
+    @staticmethod
+    def _run(tmp_path: Path, extra_args: list[str]) -> MagicMock:
+        """Invoke ``perf --module`` with mocked build and return the module cfg.
+
+        The cfg is mutated (quant/compile cleared) before ``build_hf_model``,
+        so short-circuiting the benchmark via a failing ``session.perf()``
+        still lets us inspect the mutation.
+        """
+        fake_cfg = MagicMock()
+        fake_cfg.loader.model_type = "bert"
+        fake_cfg.loader.module_path = "encoder.layer.0"
+
+        fake_build_result = MagicMock()
+        fake_build_result.final_onnx_path = tmp_path / "model.onnx"
+
+        fake_session = MagicMock()
+        fake_session.perf.side_effect = RuntimeError("test-skip-benchmark")
+
+        fake_loader_cfg = MagicMock()
+        fake_loader_cfg.task = "fill-mask"
+
+        with (
+            patch(
+                "winml.modelkit.sysinfo.resolve_device",
+                return_value=("cpu", ["cpu"]),
+            ),
+            patch(
+                "winml.modelkit.config.generate_hf_build_config",
+                return_value=[fake_cfg],
+            ),
+            patch(
+                "winml.modelkit.loader.resolve_loader_config",
+                return_value=(fake_loader_cfg, MagicMock(), MagicMock()),
+            ),
+            patch(
+                "winml.modelkit.commands.build._instantiate_parent_model",
+                return_value=MagicMock(),
+            ),
+            patch(
+                "winml.modelkit.build.build_hf_model",
+                return_value=fake_build_result,
+            ),
+            patch(
+                "winml.modelkit.session.WinMLSession",
+                return_value=fake_session,
+            ),
+        ):
+            runner = CliRunner()
+            result = runner.invoke(
+                main,
+                [
+                    "perf",
+                    "-m",
+                    "fake/model",
+                    "--module",
+                    "BertLayer",
+                    "--iterations",
+                    "1",
+                    "--warmup",
+                    "0",
+                    "-o",
+                    str(tmp_path / "out.json"),
+                    *extra_args,
+                ],
+            )
+        assert result.exit_code == 0, result.output
+        return fake_cfg
+
+    def test_default_skips_compile_keeps_quant(self, tmp_path: Path) -> None:
+        # perf defaults to --no-compile and --quantize.
+        cfg = self._run(tmp_path, [])
+        assert cfg.compile is None
+        assert cfg.quant is not None
+
+    def test_compile_flag_preserves_compile(self, tmp_path: Path) -> None:
+        cfg = self._run(tmp_path, ["--compile"])
+        assert cfg.compile is not None
+        assert cfg.quant is not None
+
+    def test_no_quantize_clears_only_quant(self, tmp_path: Path) -> None:
+        # --no-quantize must not also clear compile when --compile is set.
+        cfg = self._run(tmp_path, ["--no-quantize", "--compile"])
+        assert cfg.quant is None
+        assert cfg.compile is not None

From 9589466964a96984b4677c6a6d0fb0ea371dfb62 Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 17:06:54 +0800
Subject: [PATCH 7/9] test(perf): add back TestPerfModuleQuantCompileToggles

---
 tests/unit/commands/test_perf_module.py | 90 +++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py
index f08f5d3d7..80298c02d 100644
--- a/tests/unit/commands/test_perf_module.py
+++ b/tests/unit/commands/test_perf_module.py
@@ -314,3 +314,93 @@ def test_running_model_path_in_module_result(self, tmp_path: Path) -> None:
         report = json.loads(out_path.read_text(encoding="utf-8"))
         instance = report["instances"][0]
         assert instance["running_model_path"] == str(running_model_path)
+
+
+class TestPerfModuleQuantCompileToggles:
+    """--no-quantize and --compile/--no-compile clear cfg.quant / cfg.compile
+    independently in the per-module build (mirrors the single-model path)."""
+
+    @staticmethod
+    def _run(tmp_path: Path, extra_args: list[str]) -> MagicMock:
+        """Invoke ``perf --module`` with mocked build and return the module cfg.
+
+        The cfg is mutated (quant/compile cleared) before ``build_hf_model``,
+        so short-circuiting the benchmark via a failing ``session.perf()``
+        still lets us inspect the mutation.
+        """
+        fake_cfg = MagicMock()
+        fake_cfg.loader.model_type = "bert"
+        fake_cfg.loader.module_path = "encoder.layer.0"
+
+        fake_build_result = MagicMock()
+        fake_build_result.final_onnx_path = tmp_path / "model.onnx"
+
+        fake_session = MagicMock()
+        fake_session.perf.side_effect = RuntimeError("test-skip-benchmark")
+
+        fake_loader_cfg = MagicMock()
+        fake_loader_cfg.task = "fill-mask"
+
+        with (
+            patch(
+                "winml.modelkit.sysinfo.resolve_device",
+                return_value=("cpu", ["cpu"]),
+            ),
+            patch(
+                "winml.modelkit.config.generate_hf_build_config",
+                return_value=[fake_cfg],
+            ),
+            patch(
+                "winml.modelkit.loader.resolve_loader_config",
+                return_value=(fake_loader_cfg, MagicMock(), MagicMock()),
+            ),
+            patch(
+                "winml.modelkit.commands.build._instantiate_parent_model",
+                return_value=MagicMock(),
+            ),
+            patch(
+                "winml.modelkit.build.build_hf_model",
+                return_value=fake_build_result,
+            ),
+            patch(
+                "winml.modelkit.session.WinMLSession",
+                return_value=fake_session,
+            ),
+        ):
+            runner = CliRunner()
+            result = runner.invoke(
+                main,
+                [
+                    "perf",
+                    "-m",
+                    "fake/model",
+                    "--module",
+                    "BertLayer",
+                    "--iterations",
+                    "1",
+                    "--warmup",
+                    "0",
+                    "-o",
+                    str(tmp_path / "out.json"),
+                    *extra_args,
+                ],
+            )
+        assert result.exit_code == 0, result.output
+        return fake_cfg
+
+    def test_default_skips_compile_keeps_quant(self, tmp_path: Path) -> None:
+        # perf defaults to --no-compile and --quantize.
+        cfg = self._run(tmp_path, [])
+        assert cfg.compile is None
+        assert cfg.quant is not None
+
+    def test_compile_flag_preserves_compile(self, tmp_path: Path) -> None:
+        cfg = self._run(tmp_path, ["--compile"])
+        assert cfg.compile is not None
+        assert cfg.quant is not None
+
+    def test_no_quantize_clears_only_quant(self, tmp_path: Path) -> None:
+        # --no-quantize must not also clear compile when --compile is set.
+        cfg = self._run(tmp_path, ["--no-quantize", "--compile"])
+        assert cfg.quant is None
+        assert cfg.compile is not None

From 3ad1490a8789c1f798c63659be2054aaff6f09de Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 17:14:51 +0800
Subject: [PATCH 8/9] refactor(perf): drop _resolved_* wrappers, use _single

---
 src/winml/modelkit/commands/perf.py        | 51 +++++++---------------
 tests/unit/commands/test_perf_composite.py |  5 +++
 2 files changed, 21 insertions(+), 35 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 46c16f15c..9fbcc7d1f 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -295,7 +295,6 @@ def __init__(self, config: BenchmarkConfig) -> None:
         self.config = config
         self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
-        self._io_config: dict[str, Any] | None = None
 
     @property
     def _is_composite(self) -> bool:
@@ -323,33 +322,18 @@ def _single(self) -> WinMLPreTrainedModel:
 
         Only valid for non-composite models: composites dispatch to
         ``_run_sub_models``, which benchmarks each sub-model through a child
-        ``PerfBenchmark`` whose ``_model`` is itself single-session.
+        ``PerfBenchmark`` whose ``_model`` is itself single-session. Exposes
+        ``io_config`` / ``device`` / ``ep_name`` / ``task`` directly (the
+        session caches ``io_config``), so callers read ``self._single.*``
+        rather than going through per-attribute wrappers.
         """
         assert self._model is not None
         return cast("WinMLPreTrainedModel", self._model)
 
-    def _resolved_io_config(self) -> dict[str, Any]:
-        """I/O config of the (single-session) model being benchmarked."""
-        if self._io_config is None:
-            self._io_config = self._single.io_config
-        return self._io_config
-
     def _compile_model(self) -> None:
         """Compile the underlying ORT session so device/EP are resolved."""
         self._single._session.compile()
 
-    def _resolved_device(self) -> str:
-        """Actual device bound after compile."""
-        return self._single.device
-
-    def _resolved_ep(self) -> EPName | None:
-        """Primary EP bound after compile."""
-        return self._single.ep_name
-
-    def _resolved_task(self) -> str | None:
-        """Resolved task; falls back to the requested task."""
-        return self._single.task or self.config.task
-
     def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]:
         """Execute full benchmark pipeline.
 
@@ -403,11 +387,11 @@ def _run_single(self) -> BenchmarkResult:
 
         # Print model info before benchmark starts
         _print_model_info(
-            self._resolved_io_config(),
-            task=self._resolved_task(),
+            self._single.io_config,
+            task=self._single.task or self.config.task,
             req_device=self.config.device,
-            act_device=self._resolved_device(),
-            ep_name=self._resolved_ep(),
+            act_device=self._single.device,
+            ep_name=self._single.ep_name,
         )
 
         # [3] Run benchmark
@@ -479,10 +463,8 @@ def _load_model(self) -> None:
 
     def _generate_inputs(self) -> None:
         """Generate random inputs based on model io_config."""
-        assert self._model is not None
-        io_config = self._resolved_io_config()
         self._inputs = generate_random_inputs(
-            io_config=io_config,
+            io_config=self._single.io_config,
             batch_size=self.config.batch_size,
         )
 
@@ -529,8 +511,8 @@ def _run_benchmark_monitored(self) -> PerfStats:
         # GPU when --device gpu is specified, NPU when --device npu, etc.
         # ep_name lets the monitor resolve the exact LUID via ORT's autoEP
         # metadata so we follow the adapter the session actually binds to.
-        ep_name = self._resolved_ep()
-        monitor_device = self._resolved_device() or self.config.device or "auto"
+        ep_name = self._single.ep_name
+        monitor_device = self._single.device or self.config.device or "auto"
         hw_monitor = HWMonitor(
             poll_interval_ms=_HW_POLL_INTERVAL_MS,
             device=monitor_device,
@@ -578,8 +560,7 @@ def run_iteration() -> None:
 
     def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
         """Collect benchmark results from PerfStats."""
-        assert self._model is not None
-        io_config = self._resolved_io_config()
+        io_config = self._single.io_config
 
         # Calculate throughput
         mean_latency_sec = stats.mean_ms / 1000.0
@@ -618,10 +599,10 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             samples_per_sec=samples_per_sec,
             batches_per_sec=batches_per_sec,
             # Actual values (resolved after build + compile)
-            actual_device=self._resolved_device(),
-            actual_task=self._resolved_task() or "auto-detected",
-            actual_ep=self._resolved_ep(),
-            running_model_path=str(self._single().running_model_path),
+            actual_device=self._single.device,
+            actual_task=self._single.task or self.config.task or "auto-detected",
+            actual_ep=self._single.ep_name,
+            running_model_path=str(self._single.running_model_path),
             # Hardware monitor metrics (only present when --monitor is used)
             hw_monitor=getattr(self, "_hw_metrics", None),
         )
diff --git a/tests/unit/commands/test_perf_composite.py b/tests/unit/commands/test_perf_composite.py
index 1e99da930..f19e94840 100644
--- a/tests/unit/commands/test_perf_composite.py
+++ b/tests/unit/commands/test_perf_composite.py
@@ -42,6 +42,7 @@ def __init__(self, io_config: dict[str, Any], device: str, ep_name: str) -> None
         self.io_config = io_config
         self.device = device
         self.ep_name = ep_name
+        self.running_model_path = "model.onnx"
         self.compiled = False
         self.run_log: list[dict[str, Any]] = []
         self._perf_stats: PerfStats | None = None
@@ -90,6 +91,10 @@ def device(self) -> str:
     def ep_name(self) -> str:
         return self._session.ep_name
 
+    @property
+    def running_model_path(self) -> str:
+        return self._session.running_model_path
+
 
 class _FakeComposite:
     """Stand-in for a WinMLCompositeModel (duck-typed via ``sub_models``)."""

From 9ace198d6d8db26bb40feb7dd2316f762dd88328 Mon Sep 17 00:00:00 2001
From: xieofxie <xieofxie@126.com>
Date: Fri, 12 Jun 2026 17:26:16 +0800
Subject: [PATCH 9/9] refactor(perf): inline _compile_model and run_iteration

---
 src/winml/modelkit/commands/perf.py | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 9fbcc7d1f..622d30f1d 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -33,8 +33,6 @@
 
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
-
     from ..models.winml.base import WinMLPreTrainedModel
     from ..models.winml.composite_model import WinMLCompositeModel
     from ..session.stats import PerfStats
@@ -330,10 +328,6 @@ def _single(self) -> WinMLPreTrainedModel:
         assert self._model is not None
         return cast("WinMLPreTrainedModel", self._model)
 
-    def _compile_model(self) -> None:
-        """Compile the underlying ORT session so device/EP are resolved."""
-        self._single._session.compile()
-
     def run(self) -> BenchmarkResult | dict[str, BenchmarkResult]:
         """Execute full benchmark pipeline.
 
@@ -383,7 +377,7 @@ def _run_single(self) -> BenchmarkResult:
         self._generate_inputs()
 
         # Compile session early so model.device is resolved for display
-        self._compile_model()
+        self._single._session.compile()
 
         # Print model info before benchmark starts
         _print_model_info(
@@ -535,13 +529,9 @@ def _run_benchmark_monitored(self) -> PerfStats:
             hw_monitor as hw,
             ep_monitor as ep_mon,
         ):
-            inputs = self._inputs
-
-            def run_iteration() -> None:
-                session.run(inputs)
-
             _run_monitored_loop(
-                run_iteration,
+                session,
+                self._inputs,
                 stats,
                 hw,
                 total_iterations=total_iterations,
@@ -1122,7 +1112,8 @@ def _print_model_info(
 
 
 def _run_monitored_loop(
-    run_iteration: Callable[[], Any],
+    session: Any,
+    inputs: dict[str, Any],
     stats: PerfStats,
     hw: Any,
     *,
@@ -1131,12 +1122,7 @@ def _run_monitored_loop(
     model_id: str,
     device: str,
 ) -> None:
-    """Run the benchmark iteration loop with live hardware monitoring.
-
-    ``run_iteration`` runs (and times into ``stats``) a single inference. For
-    single-session models it invokes ``session.run`` inside the session's
-    perf() context; for composite models it records a full ``forward()`` pass.
-    """
+    """Run the benchmark iteration loop with live hardware monitoring."""
     display = LiveMonitorDisplay(
         total_iterations=total_iterations,
         warmup=warmup,
@@ -1146,7 +1132,7 @@ def _run_monitored_loop(
     )
     with display:
         for i in range(total_iterations):
-            run_iteration()
+            session.run(inputs)
 
             latest_latency = stats.all_samples_ms[-1] if stats.all_samples_ms else 0
             display.update(