microsoft · xieofxie · Jun 12, 2026
@@ -23,6 +23,7 @@ $ winml perf [options]
 | `--device` | `-d` | `auto\|cpu\|gpu\|npu` | `auto` | Device to run the benchmark on. `auto` selects the highest-priority available device. |
 | `--precision` | | `TEXT` | `auto` | Precision mode applied during model build: `auto`, `fp32`, `fp16`, `int8`, `int16`, or compound forms such as `w8a16`. |
 | `--ep` | | `TEXT` | — | Force a specific execution provider (e.g., `qnn`, `dml`, `vitisai`, `openvino`, `cpu`). Overrides the device-to-provider mapping. |
+| `--ep-options` | | `KEY=VALUE` (multiple) | — | Runtime EP provider option forwarded to the inference session (e.g., `--ep-options htp_performance_mode=burst`). Repeatable. Applies to both HuggingFace model IDs and ONNX file inputs. Unlike build-time options set via `--config`, these tune the runtime session, not the compiled graph. |
 | `--output` | `-o` | `PATH` | `~/.cache/winml/perf/<slug>/<timestamp>.json` | Output JSON file path for the benchmark report. |
 | `--batch-size` | | `INTEGER` | `1` | Batch size used when generating synthetic input tensors. |
 | `--shape-config` | | `PATH` | — | Path to a JSON file containing shape overrides (e.g., `{"height": 480, "width": 480}`). Ignored for pre-exported ONNX files and in `--module` mode. |
@@ -78,6 +79,14 @@ Benchmark with live hardware monitoring enabled:
 $ winml perf -m microsoft/resnet-50 --device npu --monitor
 ```
 
+Pass runtime EP provider options to tune the session (repeatable):
+
+```bash
+$ winml perf -m model.onnx --device npu \
+    --ep-options htp_performance_mode=burst \
+    --ep-options htp_graph_finalization_optimization_mode=3
+```
+
 Per-module benchmarking to find latency hot-spots across all attention blocks:
 
 ```bash

@@ -84,6 +84,7 @@ class BenchmarkConfig:
     allow_unsupported_nodes: bool = False
     monitor: bool = False
     ep: EPNameOrAlias | None = None
+    ep_options: dict[str, str] | None = None
     shape_config: dict | None = None
 
 
@@ -368,6 +369,7 @@ def _load_model(self) -> None:
             "device": self.config.device,
             "precision": self.config.precision,
             "ep": self.config.ep,
+            "provider_options": self.config.ep_options,
             "use_cache": use_cache,
             "force_rebuild": force_rebuild,
             "shape_config": self.config.shape_config,
@@ -554,6 +556,7 @@ def _perf_modules(
     monitor: bool = False,
     device: str = "auto",
     ep: EPNameOrAlias | None = None,
+    ep_options: dict[str, str] | None = None,
     precision: str = "auto",
     allow_unsupported_nodes: bool = False,
 ) -> None:
@@ -578,6 +581,8 @@ def _perf_modules(
         device: Target device policy ("auto", "cpu", "gpu", "npu").
         ep: Explicit execution provider (e.g., "qnn", "dml"). Overrides
             device-to-provider mapping when set.
+        ep_options: Runtime EP provider options (e.g. QNN
+            ``htp_performance_mode``) forwarded to each per-module session.
         precision: Precision mode passed through to the build stage.
         allow_unsupported_nodes: If True, warn instead of failing the build when
             the analyzer reports unsupported nodes that persist.
@@ -687,6 +692,7 @@ def _perf_modules(
                     str(build_result.final_onnx_path),
                     device=resolved_device,
                     ep=ep,
+                    provider_options=ep_options,
                 )
                 io_cfg = session.io_config
                 inputs = generate_random_inputs(io_cfg, batch_size=batch_size)
@@ -1063,6 +1069,9 @@ def _run_simple_loop(
     required=False,
     optional_message="Overrides device-to-provider mapping.",
 )
+@cli_utils.ep_options_option(
+    optional_message="Applied to both HuggingFace model IDs and ONNX file inputs.",
+)
 @cli_utils.output_option(
     "Output JSON file path. Defaults to "
     "'~/.cache/winml/perf/<model_slug>[/<module_class>]/<timestamp>.json'."
@@ -1144,6 +1153,7 @@ def perf(
     device: str,
     precision: str,
     ep: EPNameOrAlias | None,
+    ep_options: tuple[str, ...],
     output: Path | None,
     batch_size: int,
     shape_config_path: Path | None,
@@ -1185,6 +1195,9 @@ def perf(
         # Text model with explicit task
         winml perf -m bert-base-uncased --task text-classification
 
+        # Pass runtime EP provider options (repeatable)
+        winml perf -m model.onnx --device npu --ep-options htp_performance_mode=burst
+
         # Per-module benchmarking
         winml perf -m bert-base-uncased --module BertAttention
 
@@ -1211,6 +1224,10 @@ def perf(
     verbose, quiet = cli_utils.resolve_verbosity(ctx, verbose, quiet)
     configure_logging(verbosity=verbose, quiet=quiet)
 
+    # Runtime EP provider options (e.g. QNN htp_performance_mode) forwarded to
+    # the inference session for both HF model IDs and ONNX file inputs.
+    ep_provider_options = cli_utils.parse_ep_options(ep_options)
+
     json_mode = output_format == "json"
     console = Console(stderr=True) if json_mode else Console()
 
@@ -1248,6 +1265,7 @@ def perf(
             monitor=monitor,
             device=device.lower(),
             ep=ep,
+            ep_options=ep_provider_options,
             precision=precision.lower(),
             allow_unsupported_nodes=allow_unsupported_nodes,
         )
@@ -1295,6 +1313,7 @@ def perf(
         allow_unsupported_nodes=allow_unsupported_nodes,
         monitor=monitor,
         ep=ep,
+        ep_options=ep_provider_options,
         shape_config=shape_config,
     )
 

@@ -109,6 +109,7 @@ def from_onnx(
         device: str = "auto",
         precision: str = "auto",
         ep: EPNameOrAlias | None = None,
+        provider_options: dict[str, str] | None = None,
         cache_dir: str | Path | None = None,
         use_cache: bool = True,
         force_rebuild: bool = False,
@@ -131,6 +132,8 @@ def from_onnx(
             device: Target device ("auto", "npu", "gpu", "cpu").
             precision: Target precision ("auto", "fp32", "fp16", "int8").
             ep: Explicit execution provider.
+            provider_options: Runtime EP provider options (e.g. QNN
+                ``htp_performance_mode``) forwarded to the inference session.
             cache_dir: Override cache directory.
             use_cache: Whether to use persistent cache.
             force_rebuild: Force rebuild even if cached.
@@ -156,6 +159,7 @@ def from_onnx(
                 device=device,
                 precision=precision,
                 ep=ep,
+                provider_options=provider_options,
                 cache_dir=cache_dir,
                 use_cache=use_cache,
                 force_rebuild=force_rebuild,
@@ -203,6 +207,7 @@ def from_onnx(
                 device=device,
                 session_options=session_options,
                 ep=ep,
+                provider_options=provider_options,
             )
 
         # Resolve output directory
@@ -241,6 +246,7 @@ def from_onnx(
             device=device,
             session_options=session_options,
             ep=ep,
+            provider_options=provider_options,
         )
 
     @classmethod
@@ -252,6 +258,7 @@ def from_pretrained(
         config: WinMLBuildConfig | None = None,
         device: str = "auto",
         precision: str = "auto",
+        provider_options: dict[str, str] | None = None,
         cache_dir: str | Path | None = None,
         use_cache: bool = True,
         force_rebuild: bool = False,
@@ -282,6 +289,8 @@ def from_pretrained(
                 "auto" detects available hardware (NPU > GPU > CPU).
             precision: Target precision ("auto", "fp32", "fp16", "int8", "int16").
                 "auto" selects based on device (npu->int8, gpu->fp16, cpu->fp16).
+            provider_options: Runtime EP provider options (e.g. QNN
+                ``htp_performance_mode``) forwarded to the inference session.
             cache_dir: Directory for caching. If None, uses default cache dir.
             use_cache: If True (default), use persistent cache directory.
                 If False, build in a temp directory and always rebuild.
@@ -323,6 +332,7 @@ def from_pretrained(
                 device=device,
                 precision=precision,
                 ep=kwargs.pop("ep", None),
+                provider_options=provider_options,
                 cache_dir=cache_dir,
                 use_cache=use_cache,
                 force_rebuild=force_rebuild,
@@ -362,6 +372,7 @@ def from_pretrained(
                     trust_remote_code=trust_remote_code,
                     shape_config=shape_config,
                     precision=precision,
+                    provider_options=provider_options,
                     config=config,
                     cache_dir=cache_dir,
                     allow_unsupported_nodes=allow_unsupported_nodes,
@@ -463,6 +474,7 @@ def from_pretrained(
             config=hf_config,  # HF PretrainedConfig for pipeline compatibility
             device=device,  # pass user's original device string; WinMLSession handles "auto"
             ep=resolved_ep,
+            provider_options=provider_options,
         )
         model._build_config = config  # resolved build config (task, quant, compile)
         return model

@@ -68,6 +68,7 @@ def __init__(
         device: str = "auto",
         session_options: Any | None = None,
         ep: EPNameOrAlias | None = None,
+        provider_options: dict[str, str] | None = None,
     ) -> None:
         """Initialize inference model.
 
@@ -78,6 +79,9 @@ def __init__(
             session_options: Factory returning an ORT SessionOptions (e.g., for
                 graph_optimization_level). Called fresh per ORT session.
             ep: Explicit EP short name (e.g., "dml", "qnn"). Forwarded to WinMLSession.
+            provider_options: Runtime EP provider options (e.g. QNN
+                ``htp_performance_mode``). Forwarded to WinMLSession and on to
+                ``add_provider_for_devices``.
         """
         self._onnx_path = Path(onnx_path)
         self.config = config
@@ -92,6 +96,7 @@ def __init__(
             device=device,
             session_options=session_options,
             ep=ep,
+            provider_options=provider_options,
         )
 
     @property

@@ -137,6 +137,7 @@ def __init__(
         ep_config: EPConfig | None = None,
         *,
         ep: EPNameOrAlias | None = None,
+        provider_options: dict[str, str] | None = None,
         session_options: Callable[[], ort.SessionOptions] | None = None,
     ) -> None:
         """Initialize WinMLSession.
@@ -153,6 +154,11 @@ def __init__(
             ep: Explicit EP short name (e.g., "migraphx", "nv_tensorrt_rtx").
                 When set, bypasses policy-based selection and uses
                 add_provider_for_devices to force the specific EP.
+            provider_options: Runtime EP provider options merged on top of any
+                ``ep_config.provider_options`` and forwarded to
+                ``add_provider_for_devices`` (e.g. QNN ``htp_performance_mode``).
+                Unlike ``ep_config``, this does not affect EPContext persistence —
+                it only tunes the runtime session.
             session_options: Factory returning an ``ort.SessionOptions``.
                 Called once per ``_build_session_options`` invocation so each
                 ORT session gets a fresh, un-poisoned options object
@@ -170,7 +176,11 @@ def __init__(
         self._ep = ep
         self._persist_jit = ep_config.enable_ep_context if ep_config else False
         self._embed_context = ep_config.embed_context if ep_config else False
-        self._provider_options = ep_config.provider_options if ep_config else {}
+        self._provider_options = dict(ep_config.provider_options) if ep_config else {}
+        # Runtime provider options (e.g. from --ep-options) merge on top of and
+        # override any build-time options carried by ep_config.
+        if provider_options:
+            self._provider_options.update(provider_options)
 
         self._session_options_factory: Callable[[], ort.SessionOptions] = (
             session_options or ort.SessionOptions

@@ -187,6 +187,72 @@ def ep_option(required: bool = True, optional_message: str | None = None) -> Cal
     )
 
 
+def ep_options_option(optional_message: str | None = None) -> Callable[[F], F]:
+    """Add a repeatable ``--ep-options KEY=VALUE`` option to a Click command.
+
+    Collects runtime EP provider options (e.g. QNN ``htp_performance_mode``)
+    as raw ``KEY=VALUE`` strings; these tune the runtime session, not the
+    compiled graph (unlike build-time options set via ``--config``). The
+    metavar makes ``--help`` show ``KEY=VALUE`` instead of the default ``TEXT``.
+    Use :func:`parse_ep_options` to turn the collected tuple into a dict.
+
+    Args:
+        optional_message: Extra command-specific guidance appended to help text.
+
+    Returns:
+        Decorator function.
+    """
+    help_text = (
+        "Runtime EP provider option as KEY=VALUE (repeatable). Forwarded to the "
+        "inference session's execution provider (e.g. "
+        "--ep-options htp_performance_mode=burst)."
+    )
+    if optional_message:
+        help_text = f"{help_text} {optional_message}"
+
+    return click.option(
+        "--ep-options",
+        "ep_options",
+        multiple=True,
+        metavar="KEY=VALUE",
+        help=help_text,
+    )
+
+
+def parse_ep_options(values: tuple[str, ...]) -> dict[str, str] | None:
+    """Parse ``--ep-options KEY=VALUE`` tuples into a provider-options dict.
+
+    Each item is split on its first ``=`` only, so a value may itself contain
+    ``=``. Surrounding whitespace is stripped from both the key and the value.
+    When the same key is given more than once, the last value wins.
+
+    Args:
+        values: Raw values collected by a ``multiple=True`` Click option.
+
+    Returns:
+        Mapping of option name to value, or ``None`` when nothing was provided
+        (so callers can leave the session default untouched).
+
+    Raises:
+        click.BadParameter: If any value is missing the ``=`` separator or has
+            an empty key.
+    """
+    if not values:
+        return None
+    options: dict[str, str] = {}
+    for item in values:
+        key, separator, value = item.partition("=")
+        if not separator:
+            raise click.BadParameter(
+                f"Invalid EP option format: '{item}'. Use KEY=VALUE.",
+                param_hint="--ep-options",
+            )
+        key = key.strip()
+        if not key:
+            raise click.BadParameter(
+                f"Invalid EP option format: '{item}'. Key cannot be empty.",
+                param_hint="--ep-options",
+            )
+        # Strip the value like the key, so "mode = burst" does not forward
+        # " burst" (with a leading space) to the execution provider.
+        options[key] = value.strip()
+    return options
+
+
 def device_option(
     required: bool = True,
     optional_message: str | None = None,