Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/commands/perf.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ $ winml perf [options]
| `--device` | `-d` | `auto\|cpu\|gpu\|npu` | `auto` | Device to run the benchmark on. `auto` selects the highest-priority available device. |
| `--precision` | | `TEXT` | `auto` | Precision mode applied during model build: `auto`, `fp32`, `fp16`, `int8`, `int16`, or compound forms such as `w8a16`. |
| `--ep` | | `TEXT` | — | Force a specific execution provider (e.g., `qnn`, `dml`, `vitisai`, `openvino`, `cpu`). Overrides the device-to-provider mapping. |
| `--ep-options` | | `KEY=VALUE` (multiple) | — | Runtime EP provider option forwarded to the inference session (e.g., `--ep-options htp_performance_mode=burst`). Repeatable. Applies to both HuggingFace model IDs and ONNX file inputs. Unlike build-time options set via `--config`, these tune the runtime session, not the compiled graph. |
| `--output` | `-o` | `PATH` | `~/.cache/winml/perf/<slug>/<timestamp>.json` | Output JSON file path for the benchmark report. |
| `--batch-size` | | `INTEGER` | `1` | Batch size used when generating synthetic input tensors. |
| `--shape-config` | | `PATH` | — | Path to a JSON file containing shape overrides (e.g., `{"height": 480, "width": 480}`). Ignored for pre-exported ONNX files and in `--module` mode. |
Expand Down Expand Up @@ -78,6 +79,14 @@ Benchmark with live hardware monitoring enabled:
$ winml perf -m microsoft/resnet-50 --device npu --monitor
```

Pass runtime EP provider options to tune the session (repeatable):

```bash
$ winml perf -m model.onnx --device npu \
--ep-options htp_performance_mode=burst \
--ep-options htp_graph_finalization_optimization_mode=3
```

Per-module benchmarking to find latency hot-spots across all attention blocks:

```bash
Expand Down
19 changes: 19 additions & 0 deletions src/winml/modelkit/commands/perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class BenchmarkConfig:
allow_unsupported_nodes: bool = False
monitor: bool = False
ep: EPNameOrAlias | None = None
ep_options: dict[str, str] | None = None
shape_config: dict | None = None


Expand Down Expand Up @@ -368,6 +369,7 @@ def _load_model(self) -> None:
"device": self.config.device,
"precision": self.config.precision,
"ep": self.config.ep,
"provider_options": self.config.ep_options,
"use_cache": use_cache,
"force_rebuild": force_rebuild,
"shape_config": self.config.shape_config,
Expand Down Expand Up @@ -554,6 +556,7 @@ def _perf_modules(
monitor: bool = False,
device: str = "auto",
ep: EPNameOrAlias | None = None,
ep_options: dict[str, str] | None = None,
precision: str = "auto",
allow_unsupported_nodes: bool = False,
) -> None:
Expand All @@ -578,6 +581,8 @@ def _perf_modules(
device: Target device policy ("auto", "cpu", "gpu", "npu").
ep: Explicit execution provider (e.g., "qnn", "dml"). Overrides
device-to-provider mapping when set.
ep_options: Runtime EP provider options (e.g. QNN
``htp_performance_mode``) forwarded to each per-module session.
precision: Precision mode passed through to the build stage.
allow_unsupported_nodes: If True, warn instead of failing the build when
the analyzer reports unsupported nodes that persist.
Expand Down Expand Up @@ -687,6 +692,7 @@ def _perf_modules(
str(build_result.final_onnx_path),
device=resolved_device,
ep=ep,
provider_options=ep_options,
)
io_cfg = session.io_config
inputs = generate_random_inputs(io_cfg, batch_size=batch_size)
Expand Down Expand Up @@ -1063,6 +1069,9 @@ def _run_simple_loop(
required=False,
optional_message="Overrides device-to-provider mapping.",
)
@cli_utils.ep_options_option(
optional_message="Applied to both HuggingFace model IDs and ONNX file inputs.",
)
@cli_utils.output_option(
"Output JSON file path. Defaults to "
"'~/.cache/winml/perf/<model_slug>[/<module_class>]/<timestamp>.json'."
Expand Down Expand Up @@ -1144,6 +1153,7 @@ def perf(
device: str,
precision: str,
ep: EPNameOrAlias | None,
ep_options: tuple[str, ...],
output: Path | None,
batch_size: int,
shape_config_path: Path | None,
Expand Down Expand Up @@ -1185,6 +1195,9 @@ def perf(
# Text model with explicit task
winml perf -m bert-base-uncased --task text-classification

# Pass runtime EP provider options (repeatable)
winml perf -m model.onnx --device npu --ep-options htp_performance_mode=burst

# Per-module benchmarking
winml perf -m bert-base-uncased --module BertAttention

Expand All @@ -1211,6 +1224,10 @@ def perf(
verbose, quiet = cli_utils.resolve_verbosity(ctx, verbose, quiet)
configure_logging(verbosity=verbose, quiet=quiet)

# Runtime EP provider options (e.g. QNN htp_performance_mode) forwarded to
# the inference session for both HF model IDs and ONNX file inputs.
ep_provider_options = cli_utils.parse_ep_options(ep_options)

json_mode = output_format == "json"
console = Console(stderr=True) if json_mode else Console()

Expand Down Expand Up @@ -1248,6 +1265,7 @@ def perf(
monitor=monitor,
device=device.lower(),
ep=ep,
ep_options=ep_provider_options,
precision=precision.lower(),
allow_unsupported_nodes=allow_unsupported_nodes,
)
Expand Down Expand Up @@ -1295,6 +1313,7 @@ def perf(
allow_unsupported_nodes=allow_unsupported_nodes,
monitor=monitor,
ep=ep,
ep_options=ep_provider_options,
shape_config=shape_config,
)

Expand Down
12 changes: 12 additions & 0 deletions src/winml/modelkit/models/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def from_onnx(
device: str = "auto",
precision: str = "auto",
ep: EPNameOrAlias | None = None,
provider_options: dict[str, str] | None = None,
cache_dir: str | Path | None = None,
use_cache: bool = True,
force_rebuild: bool = False,
Expand All @@ -131,6 +132,8 @@ def from_onnx(
device: Target device ("auto", "npu", "gpu", "cpu").
precision: Target precision ("auto", "fp32", "fp16", "int8").
ep: Explicit execution provider.
provider_options: Runtime EP provider options (e.g. QNN
``htp_performance_mode``) forwarded to the inference session.
cache_dir: Override cache directory.
use_cache: Whether to use persistent cache.
force_rebuild: Force rebuild even if cached.
Expand All @@ -156,6 +159,7 @@ def from_onnx(
device=device,
precision=precision,
ep=ep,
provider_options=provider_options,
cache_dir=cache_dir,
use_cache=use_cache,
force_rebuild=force_rebuild,
Expand Down Expand Up @@ -203,6 +207,7 @@ def from_onnx(
device=device,
session_options=session_options,
ep=ep,
provider_options=provider_options,
)

# Resolve output directory
Expand Down Expand Up @@ -241,6 +246,7 @@ def from_onnx(
device=device,
session_options=session_options,
ep=ep,
provider_options=provider_options,
)

@classmethod
Expand All @@ -252,6 +258,7 @@ def from_pretrained(
config: WinMLBuildConfig | None = None,
device: str = "auto",
precision: str = "auto",
provider_options: dict[str, str] | None = None,
cache_dir: str | Path | None = None,
use_cache: bool = True,
force_rebuild: bool = False,
Expand Down Expand Up @@ -282,6 +289,8 @@ def from_pretrained(
"auto" detects available hardware (NPU > GPU > CPU).
precision: Target precision ("auto", "fp32", "fp16", "int8", "int16").
"auto" selects based on device (npu->int8, gpu->fp16, cpu->fp16).
provider_options: Runtime EP provider options (e.g. QNN
``htp_performance_mode``) forwarded to the inference session.
cache_dir: Directory for caching. If None, uses default cache dir.
use_cache: If True (default), use persistent cache directory.
If False, build in a temp directory and always rebuild.
Expand Down Expand Up @@ -323,6 +332,7 @@ def from_pretrained(
device=device,
precision=precision,
ep=kwargs.pop("ep", None),
provider_options=provider_options,
cache_dir=cache_dir,
use_cache=use_cache,
force_rebuild=force_rebuild,
Expand Down Expand Up @@ -362,6 +372,7 @@ def from_pretrained(
trust_remote_code=trust_remote_code,
shape_config=shape_config,
precision=precision,
provider_options=provider_options,
config=config,
cache_dir=cache_dir,
allow_unsupported_nodes=allow_unsupported_nodes,
Expand Down Expand Up @@ -463,6 +474,7 @@ def from_pretrained(
config=hf_config, # HF PretrainedConfig for pipeline compatibility
device=device, # pass user's original device string; WinMLSession handles "auto"
ep=resolved_ep,
provider_options=provider_options,
)
model._build_config = config # resolved build config (task, quant, compile)
return model
Expand Down
5 changes: 5 additions & 0 deletions src/winml/modelkit/models/winml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def __init__(
device: str = "auto",
session_options: Any | None = None,
ep: EPNameOrAlias | None = None,
provider_options: dict[str, str] | None = None,
) -> None:
"""Initialize inference model.

Expand All @@ -78,6 +79,9 @@ def __init__(
session_options: Factory returning an ORT SessionOptions (e.g., for
graph_optimization_level). Called fresh per ORT session.
ep: Explicit EP short name (e.g., "dml", "qnn"). Forwarded to WinMLSession.
provider_options: Runtime EP provider options (e.g. QNN
``htp_performance_mode``). Forwarded to WinMLSession and on to
``add_provider_for_devices``.
"""
self._onnx_path = Path(onnx_path)
self.config = config
Expand All @@ -92,6 +96,7 @@ def __init__(
device=device,
session_options=session_options,
ep=ep,
provider_options=provider_options,
)

@property
Expand Down
12 changes: 11 additions & 1 deletion src/winml/modelkit/session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def __init__(
ep_config: EPConfig | None = None,
*,
ep: EPNameOrAlias | None = None,
provider_options: dict[str, str] | None = None,
session_options: Callable[[], ort.SessionOptions] | None = None,
) -> None:
"""Initialize WinMLSession.
Expand All @@ -153,6 +154,11 @@ def __init__(
ep: Explicit EP short name (e.g., "migraphx", "nv_tensorrt_rtx").
When set, bypasses policy-based selection and uses
add_provider_for_devices to force the specific EP.
provider_options: Runtime EP provider options merged on top of any
``ep_config.provider_options`` and forwarded to
``add_provider_for_devices`` (e.g. QNN ``htp_performance_mode``).
Unlike ``ep_config``, this does not affect EPContext persistence —
it only tunes the runtime session.
session_options: Factory returning an ``ort.SessionOptions``.
Called once per ``_build_session_options`` invocation so each
ORT session gets a fresh, un-poisoned options object
Expand All @@ -170,7 +176,11 @@ def __init__(
self._ep = ep
self._persist_jit = ep_config.enable_ep_context if ep_config else False
self._embed_context = ep_config.embed_context if ep_config else False
self._provider_options = ep_config.provider_options if ep_config else {}
self._provider_options = dict(ep_config.provider_options) if ep_config else {}
# Runtime provider options (e.g. from --ep-options) merge on top of and
# override any build-time options carried by ep_config.
if provider_options:
self._provider_options.update(provider_options)

self._session_options_factory: Callable[[], ort.SessionOptions] = (
session_options or ort.SessionOptions
Expand Down
66 changes: 66 additions & 0 deletions src/winml/modelkit/utils/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,72 @@ def ep_option(required: bool = True, optional_message: str | None = None) -> Cal
)


def ep_options_option(optional_message: str | None = None) -> Callable[[F], F]:
    """Build a Click decorator for a repeatable ``--ep-options KEY=VALUE`` flag.

    The flag gathers runtime execution-provider options (for example QNN
    ``htp_performance_mode``) that are handed to the inference session when
    it is created. These are separate from build-time provider options
    supplied through ``--config``: they tune the runtime session rather than
    the compiled graph.

    The decorated command receives the raw values under the ``ep_options``
    parameter; pass them through :func:`parse_ep_options` to obtain a dict.

    Args:
        optional_message: Command-specific guidance appended (after a single
            space) to the generic help text. Omitted when ``None`` or empty.

    Returns:
        A decorator that attaches the ``--ep-options`` option to a command.
    """
    base_help = (
        "Runtime EP provider option as KEY=VALUE (repeatable). "
        "Forwarded to the inference session's execution provider "
        "(e.g. --ep-options htp_performance_mode=burst)."
    )
    # Only append the extra guidance when the caller actually supplied some.
    full_help = f"{base_help} {optional_message}" if optional_message else base_help

    return click.option("--ep-options", "ep_options", multiple=True, help=full_help)


def parse_ep_options(values: tuple[str, ...]) -> dict[str, str] | None:
"""Parse ``--ep-options KEY=VALUE`` tuples into a provider-options dict.

Args:
values: Raw values collected by a ``multiple=True`` Click option.

Returns:
Mapping of option name to value, or ``None`` when nothing was provided
(so callers can leave the session default untouched).

Raises:
click.BadParameter: If any value is missing the ``=`` separator or has
an empty key.
"""
if not values:
return None
options: dict[str, str] = {}
for item in values:
if "=" not in item:
raise click.BadParameter(
f"Invalid EP option format: '{item}'. Use KEY=VALUE.",
param_hint="--ep-options",
)
key, value = item.split("=", 1)
key = key.strip()
if not key:
raise click.BadParameter(
f"Invalid EP option format: '{item}'. Key cannot be empty.",
param_hint="--ep-options",
)
options[key] = value
return options


def device_option(
required: bool = True,
optional_message: str | None = None,
Expand Down
Loading
Loading