Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions src/winml/modelkit/commands/perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from rich.console import Console
from rich.table import Table

from ..session.monitor.memory_tracker import MemoryProfile
from ..utils import cli as cli_utils
from ..utils.constants import EPName, EPNameOrAlias
from ..utils.logging import configure_logging
Expand Down Expand Up @@ -82,6 +83,7 @@ class BenchmarkConfig:
skip_build: bool = True
allow_unsupported_nodes: bool = False
monitor: bool = False
memory: bool = True
ep: EPNameOrAlias | None = None
shape_config: dict | None = None

Expand Down Expand Up @@ -129,6 +131,9 @@ class BenchmarkResult:
# Hardware monitor metrics (from HWMonitor.to_dict())
hw_monitor: dict[str, Any] | None = None

# Memory profile (from MemoryTracker)
memory_profile: MemoryProfile | None = None

def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
result = {
Expand Down Expand Up @@ -169,6 +174,8 @@ def to_dict(self) -> dict[str, Any]:
}
if self.hw_monitor:
result["hw_monitor"] = self.hw_monitor
if self.memory_profile:
result["memory"] = self.memory_profile.to_dict()
return result


Expand Down Expand Up @@ -281,25 +288,40 @@ def __init__(self, config: BenchmarkConfig) -> None:
self.config = config
self._model: WinMLPreTrainedModel | None = None
self._inputs: dict[str, np.ndarray] | None = None
self._memory_tracker: Any = None

def run(self) -> BenchmarkResult:
"""Execute full benchmark pipeline.

Returns:
BenchmarkResult with timing statistics
"""
# Initialize memory tracker if enabled
if self.config.memory:
from ..session.monitor.memory_tracker import MemoryTracker

self._memory_tracker = MemoryTracker()
self._memory_tracker.snapshot_baseline()

# [1] Load model
logger.info("Loading model: %s", self.config.model_id)
self._load_model()
assert self._model is not None

if self._memory_tracker:
self._memory_tracker.snapshot_post_load()

# [2] Generate inputs
logger.info("Generating benchmark inputs")
self._generate_inputs()

# Compile session early so model.device is resolved for display
self._model._session.compile()

if self._memory_tracker:
adapter_luid = self._resolve_adapter_luid()
self._memory_tracker.snapshot_post_compile(adapter_luid=adapter_luid)

# Print model info before benchmark starts
_print_model_info(
self._model.io_config,
Expand All @@ -317,6 +339,10 @@ def run(self) -> BenchmarkResult:
)
stats = self._run_benchmark()

if self._memory_tracker:
adapter_luid = self._resolve_adapter_luid()
self._memory_tracker.snapshot_post_inference(adapter_luid=adapter_luid)

# [4] Collect results
logger.info("Collecting results")
return self._collect_results(stats)
Expand Down Expand Up @@ -384,6 +410,40 @@ def _generate_inputs(self) -> None:
batch_size=self.config.batch_size,
)

def _resolve_adapter_luid(self) -> str | None:
    """Look up the adapter LUID to use for device memory queries.

    The target device is ``self._model.device``, falling back to
    ``self.config.device`` when the model's value is falsy. The model's
    ``ep_name`` is forwarded to ``resolve_adapter_luid`` as ``ep_name``.

    Returns:
        The value produced by ``resolve_adapter_luid`` for the selected
        device kind, or ``None`` when the platform is not ``win32``, the
        target device is ``"cpu"``, or the lookup raises.
    """
    import sys

    # Adapter lookup is only attempted on Windows.
    if sys.platform != "win32":
        return None

    assert self._model is not None
    model = self._model
    target = model.device or self.config.device
    provider = model.ep_name

    # No adapter is looked up when running on the CPU.
    if target == "cpu":
        return None

    try:
        from ..sysinfo.pdh_adapters import resolve_adapter_luid

        # An explicit "npu"/"gpu" target is queried on its own; any other
        # value (e.g. "auto") probes the NPU first and falls back to the GPU.
        if target == "npu":
            kinds = ("npu",)
        elif target == "gpu":
            kinds = ("gpu",)
        else:
            kinds = ("npu", "gpu")

        *probes, fallback = kinds
        for kind in probes:
            found = resolve_adapter_luid(kind, ep_name=provider)
            if found:
                return found
        # The last candidate's result is returned as-is, even if falsy.
        return resolve_adapter_luid(fallback, ep_name=provider)
    except Exception:
        # Best effort: a failed lookup is logged at debug level, not raised.
        logger.debug("Could not resolve adapter LUID for memory query", exc_info=True)
        return None

def _run_benchmark(self) -> PerfStats:
"""Execute benchmark iterations with timing."""
if self.config.monitor:
Expand Down Expand Up @@ -517,6 +577,8 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
actual_ep=self._model.ep_name,
# Hardware monitor metrics (only present when --monitor is used)
hw_monitor=getattr(self, "_hw_metrics", None),
# Memory profile (only present when --memory is used)
memory_profile=(self._memory_tracker.profile() if self._memory_tracker else None),
)


Expand Down Expand Up @@ -874,6 +936,15 @@ def display_console_report(result: BenchmarkResult, console: Console) -> None:
f" CPU: {cpu.get('mean_pct', 0):.1f}% avg | Mem: {ram.get('used_mb', 0):.0f} MB"
)

# Memory section (only when --memory is enabled)
if result.memory_profile:
mem = result.memory_profile
inference_ws = mem.post_inference.working_set_mb
inference_dev = mem.post_inference.device_local_mb
dev_str = f" | {inference_dev:.1f} MB (device)" if inference_dev > 0 else ""
console.print()
console.print(f"[bold]Memory:[/bold] {inference_ws:.1f} MB (process){dev_str}")

console.print()


Expand Down Expand Up @@ -1103,6 +1174,12 @@ def _run_simple_loop(
show_default=True,
help="Show live hardware utilization chart for the benchmarked device (NPU, GPU, or CPU)",
)
@click.option(
"--memory/--no-memory",
default=True,
show_default=True,
help="Measure process and device memory at each benchmark phase",
)
@click.option(
"--op-tracing",
"op_tracing",
Expand Down Expand Up @@ -1134,6 +1211,7 @@ def perf(
allow_unsupported_nodes: bool,
module_class: str | None,
monitor: bool,
memory: bool,
op_tracing: str | None,
output_format: cli_utils.OutputFormat,
verbose: int,
Expand Down Expand Up @@ -1272,6 +1350,7 @@ def perf(
skip_build=skip_build,
allow_unsupported_nodes=allow_unsupported_nodes,
monitor=monitor,
memory=memory,
ep=ep,
shape_config=shape_config,
)
Expand Down
Loading
Loading