microsoft · DingmaomaoBJTU · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -26,6 +26,7 @@
 from rich.console import Console
 from rich.table import Table
 
+from ..session.monitor.memory_tracker import MemoryProfile
 from ..utils import cli as cli_utils
 from ..utils.constants import EPName, EPNameOrAlias
 from ..utils.logging import configure_logging
@@ -82,6 +83,7 @@ class BenchmarkConfig:
     skip_build: bool = True
     allow_unsupported_nodes: bool = False
     monitor: bool = False
+    memory: bool = True
     ep: EPNameOrAlias | None = None
     shape_config: dict | None = None
 
@@ -129,6 +131,9 @@ class BenchmarkResult:
     # Hardware monitor metrics (from HWMonitor.to_dict())
     hw_monitor: dict[str, Any] | None = None
 
+    # Memory profile (from MemoryTracker)
+    memory_profile: MemoryProfile | None = None
+
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for JSON serialization."""
         result = {
@@ -169,6 +174,8 @@ def to_dict(self) -> dict[str, Any]:
         }
         if self.hw_monitor:
             result["hw_monitor"] = self.hw_monitor
+        if self.memory_profile:
+            result["memory"] = self.memory_profile.to_dict()
         return result
 
 
@@ -281,25 +288,40 @@ def __init__(self, config: BenchmarkConfig) -> None:
         self.config = config
         self._model: WinMLPreTrainedModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
+        self._memory_tracker: Any = None
 
     def run(self) -> BenchmarkResult:
         """Execute full benchmark pipeline.
 
         Returns:
             BenchmarkResult with timing statistics
         """
+        # Initialize memory tracker if enabled
+        if self.config.memory:
+            from ..session.monitor.memory_tracker import MemoryTracker
+
+            self._memory_tracker = MemoryTracker()
+            self._memory_tracker.snapshot_baseline()
+
         # [1] Load model
         logger.info("Loading model: %s", self.config.model_id)
         self._load_model()
         assert self._model is not None
 
+        if self._memory_tracker:
+            self._memory_tracker.snapshot_post_load()
+
         # [2] Generate inputs
         logger.info("Generating benchmark inputs")
         self._generate_inputs()
 
         # Compile session early so model.device is resolved for display
         self._model._session.compile()
 
+        if self._memory_tracker:
+            adapter_luid = self._resolve_adapter_luid()
+            self._memory_tracker.snapshot_post_compile(adapter_luid=adapter_luid)
+
         # Print model info before benchmark starts
         _print_model_info(
             self._model.io_config,
@@ -317,6 +339,10 @@ def run(self) -> BenchmarkResult:
         )
         stats = self._run_benchmark()
 
+        if self._memory_tracker:
+            adapter_luid = self._resolve_adapter_luid()
+            self._memory_tracker.snapshot_post_inference(adapter_luid=adapter_luid)
+
         # [4] Collect results
         logger.info("Collecting results")
         return self._collect_results(stats)
@@ -384,6 +410,40 @@ def _generate_inputs(self) -> None:
             batch_size=self.config.batch_size,
         )
 
+    def _resolve_adapter_luid(self) -> str | None:
+        """Resolve the adapter LUID used for device memory queries.
+
+        Meant to use the same resolution logic as HWMonitor (device kind +
+        EP name); HWMonitor itself is not shown here, so confirm the two
+        stay in sync when either changes.
+
+        The device kind is read from the loaded model's ``device`` and falls
+        back to ``self.config.device`` when that value is falsy. The execution
+        provider name is read from the model's ``ep_name``.
+
+        Returns:
+            Whatever ``resolve_adapter_luid`` returns for the selected device
+            kind, or ``None`` when the platform is not Windows, when the
+            device is ``"cpu"``, or when the lookup (or its import) raises.
+        """
+        import sys
+
+        # Adapter lookup is only attempted on Windows.
+        if sys.platform != "win32":
+            return None
+
+        # The model must already be loaded; its device / EP drive the lookup.
+        assert self._model is not None
+        device = self._model.device or self.config.device
+        ep_name = self._model.ep_name
+
+        # No adapter is queried for CPU execution.
+        if device == "cpu":
+            return None
+
+        try:
+            # Imported inside the try so that an import failure is handled by
+            # the same best-effort fallback as a failed lookup.
+            from ..sysinfo.pdh_adapters import resolve_adapter_luid
+
+            if device == "npu":
+                return resolve_adapter_luid("npu", ep_name=ep_name)
+            if device == "gpu":
+                return resolve_adapter_luid("gpu", ep_name=ep_name)
+            # "auto" — try NPU first, then GPU
+            # NOTE: any device value other than "cpu"/"npu"/"gpu" also takes
+            # this path, not just "auto".
+            luid = resolve_adapter_luid("npu", ep_name=ep_name)
+            if luid:
+                return luid
+            return resolve_adapter_luid("gpu", ep_name=ep_name)
+        except Exception:
+            # Best-effort: log at debug level with the traceback and report
+            # "no adapter" instead of propagating the error to the caller.
+            logger.debug("Could not resolve adapter LUID for memory query", exc_info=True)
+            return None
+
     def _run_benchmark(self) -> PerfStats:
         """Execute benchmark iterations with timing."""
         if self.config.monitor:
@@ -517,6 +577,8 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             actual_ep=self._model.ep_name,
             # Hardware monitor metrics (only present when --monitor is used)
             hw_monitor=getattr(self, "_hw_metrics", None),
+            # Memory profile (None when --no-memory is passed; --memory is on by default)
+            memory_profile=(self._memory_tracker.profile() if self._memory_tracker else None),
         )
 
 
@@ -874,6 +936,15 @@ def display_console_report(result: BenchmarkResult, console: Console) -> None:
                 f"  CPU: {cpu.get('mean_pct', 0):.1f}% avg  |  Mem: {ram.get('used_mb', 0):.0f} MB"
             )
 
+    # Memory section (only when --memory is enabled)
+    if result.memory_profile:
+        mem = result.memory_profile
+        inference_ws = mem.post_inference.working_set_mb
+        inference_dev = mem.post_inference.device_local_mb
+        dev_str = f" | {inference_dev:.1f} MB (device)" if inference_dev > 0 else ""
+        console.print()
+        console.print(f"[bold]Memory:[/bold]      {inference_ws:.1f} MB (process){dev_str}")
+
     console.print()
 
 
@@ -1103,6 +1174,12 @@ def _run_simple_loop(
     show_default=True,
     help="Show live hardware utilization chart for the benchmarked device (NPU, GPU, or CPU)",
 )
+@click.option(
+    "--memory/--no-memory",
+    default=True,
+    show_default=True,
+    help="Measure process and device memory at each benchmark phase",
+)
 @click.option(
     "--op-tracing",
     "op_tracing",
@@ -1134,6 +1211,7 @@ def perf(
     allow_unsupported_nodes: bool,
     module_class: str | None,
     monitor: bool,
+    memory: bool,
     op_tracing: str | None,
     output_format: cli_utils.OutputFormat,
     verbose: int,
@@ -1272,6 +1350,7 @@ def perf(
         skip_build=skip_build,
         allow_unsupported_nodes=allow_unsupported_nodes,
         monitor=monitor,
+        memory=memory,
         ep=ep,
         shape_config=shape_config,
     )