diff --git a/.github/extensions/doc-check/extension.mjs b/.github/extensions/doc-check/extension.mjs new file mode 100644 index 000000000..c82a27583 --- /dev/null +++ b/.github/extensions/doc-check/extension.mjs @@ -0,0 +1,274 @@ +// Extension: doc-check +// Verify docs are self-consistent, cross-referenced correctly, and match source code + +import { joinSession } from "@github/copilot-sdk/extension"; +import { readFile } from "fs/promises"; +import { resolve, join, dirname } from "path"; +import { glob } from "fs/promises"; + +const ROOT = resolve(import.meta.dirname, "../../.."); +const DOCS_DIR = join(ROOT, "docs"); +const SRC_DIR = join(ROOT, "src"); + +async function readTextFile(path) { + try { + return await readFile(path, "utf-8"); + } catch { + return null; + } +} + +async function findFiles(pattern, cwd) { + const results = []; + for await (const entry of glob(pattern, { cwd })) { + results.push(entry); + } + return results; +} + +const session = await joinSession({ + tools: [ + { + name: "doc_check_consistency", + description: + "Audit the docs/ folder for internal consistency issues: broken cross-references between markdown files, nav entries in mkdocs.yml that point to missing files, and markdown files in docs/ that are not listed in nav or exclude_docs. Returns a report of findings.", + parameters: { type: "object", properties: {} }, + skipPermission: true, + handler: async () => { + const findings = []; + + // 1. Read mkdocs.yml nav and check all referenced files exist + const mkdocsContent = await readTextFile(join(ROOT, "mkdocs.yml")); + if (!mkdocsContent) return "Error: mkdocs.yml not found"; + + const navFileRefs = [...mkdocsContent.matchAll(/:\s+(\S+\.md)/g)].map((m) => m[1]); + for (const ref of navFileRefs) { + const fullPath = join(DOCS_DIR, ref); + const content = await readTextFile(fullPath); + if (content === null) { + findings.push(`[NAV_MISSING_FILE] mkdocs.yml references '${ref}' but file does not exist`); + } + } + + // 2. 
Check internal markdown links in all docs files + const mdFiles = await findFiles("**/*.md", DOCS_DIR); + for (const relPath of mdFiles) { + const fullPath = join(DOCS_DIR, relPath); + const content = await readTextFile(fullPath); + if (!content) continue; + + // Find markdown links like [text](../path/to/file.md) or [text](./file.md#anchor) + const linkPattern = /\[([^\]]*)\]\(([^)]+)\)/g; + let match; + while ((match = linkPattern.exec(content)) !== null) { + const target = match[2]; + // Skip external URLs and anchors-only + if (target.startsWith("http") || target.startsWith("#") || target.startsWith("mailto:")) continue; + // Strip anchor + const filePart = target.split("#")[0]; + if (!filePart) continue; + // Resolve relative to current file's directory + const resolvedPath = resolve(dirname(fullPath), filePart); + const exists = await readTextFile(resolvedPath); + if (exists === null) { + findings.push(`[BROKEN_LINK] ${relPath}: link to '${filePart}' resolves to non-existent file`); + } + } + } + + // 3. Check for md files not in nav and not in exclude_docs + const excludeMatch = mkdocsContent.match(/exclude_docs:\s*\|([\s\S]*?)(?=\n\S|\n*$)/); + const excludePatterns = excludeMatch + ? 
excludeMatch[1] + .split("\n") + .map((l) => l.trim()) + .filter(Boolean) + : []; + const navSet = new Set(navFileRefs); + for (const relPath of mdFiles) { + const normalized = relPath.replace(/\\/g, "/"); + if (navSet.has(normalized)) continue; + if (normalized === "index.md") continue; + // Check exclude patterns (simple glob: just filename or path prefix) + const excluded = excludePatterns.some((pat) => { + const cleanPat = pat.replace(/^\//, ""); + return normalized === cleanPat || normalized.startsWith(cleanPat); + }); + if (excluded) continue; + findings.push(`[ORPHAN_FILE] ${normalized} is not in mkdocs.yml nav or exclude_docs`); + } + + if (findings.length === 0) return "✅ No consistency issues found."; + return `Found ${findings.length} issue(s):\n\n${findings.join("\n")}`; + }, + }, + { + name: "doc_check_code_alignment", + description: + "Cross-reference documentation claims against source code. Checks: CLI flag names documented in command pages match actual Click/Typer parameter definitions in src/; EP names and device mappings in docs match source; config schema fields match the WinMLBuildConfig dataclass. 
Returns mismatches.", + parameters: { + type: "object", + properties: { + scope: { + type: "string", + description: "Which aspect to check: 'flags' (CLI flags vs source), 'eps' (EP table vs source), 'config' (config schema fields vs dataclass), or 'all'", + enum: ["flags", "eps", "config", "all"], + }, + }, + }, + skipPermission: true, + handler: async (args) => { + const scope = args.scope || "all"; + const findings = []; + + // Helper: find Python files containing a pattern + async function searchSrc(pattern) { + const pyFiles = await findFiles("**/*.py", SRC_DIR); + const results = []; + for (const f of pyFiles) { + const content = await readTextFile(join(SRC_DIR, f)); + if (content && content.includes(pattern)) { + results.push({ file: f, content }); + } + } + return results; + } + + if (scope === "flags" || scope === "all") { + // Check command docs for flags and verify they exist in source + const cmdFiles = await findFiles("*.md", join(DOCS_DIR, "commands")); + for (const cmdFile of cmdFiles) { + const content = await readTextFile(join(DOCS_DIR, "commands", cmdFile)); + if (!content) continue; + // Extract flags from markdown tables: | `--flag-name` | + const flagPattern = /\|\s*`(--[\w-]+)`/g; + let match; + const docFlags = []; + while ((match = flagPattern.exec(content)) !== null) { + docFlags.push(match[1]); + } + if (docFlags.length === 0) continue; + + // Try to find the command source file + const cmdName = cmdFile.replace(".md", ""); + const srcFiles = await searchSrc(`def ${cmdName}`); + if (srcFiles.length === 0) continue; + + // Check each documented flag exists in source (as click option or argument) + const srcContent = srcFiles.map((s) => s.content).join("\n"); + for (const flag of docFlags) { + const paramName = flag.replace(/^--/, "").replace(/-/g, "_"); + const altName = flag; // --flag-name form + if (!srcContent.includes(paramName) && !srcContent.includes(altName)) { + findings.push(`[FLAG_NOT_IN_SRC] ${cmdFile}: '${flag}' not found in 
source for '${cmdName}' command`); + } + } + } + } + + if (scope === "eps" || scope === "all") { + // Check EP table in docs/concepts/eps-and-devices.md + const epDoc = await readTextFile(join(DOCS_DIR, "concepts", "eps-and-devices.md")); + if (epDoc) { + const epPattern = /\|\s*`(\w+ExecutionProvider)`\s*\|\s*`(\w+)`/g; + let match; + while ((match = epPattern.exec(epDoc)) !== null) { + const epName = match[1]; + const shortName = match[2]; + // Verify EP short name exists somewhere in source + const srcHits = await searchSrc(shortName); + if (srcHits.length === 0) { + findings.push(`[EP_SHORT_NAME_MISSING] EP '${epName}' short name '${shortName}' not found in source`); + } + } + } + } + + if (scope === "config" || scope === "all") { + // Check config schema fields in docs/reference/index.md against source dataclass + const refDoc = await readTextFile(join(DOCS_DIR, "reference", "index.md")); + if (refDoc) { + // Extract field names from table rows: | `field_name` | + const fieldPattern = /\|\s*`([\w.]+)`\s*\|/g; + let match; + const docFields = new Set(); + while ((match = fieldPattern.exec(refDoc)) !== null) { + docFields.add(match[1].split(".").pop()); // Get leaf field name + } + // Find WinMLBuildConfig in source + const configFiles = await searchSrc("WinMLBuildConfig"); + if (configFiles.length > 0) { + const configSrc = configFiles.map((f) => f.content).join("\n"); + // Check each doc field appears in source + for (const field of docFields) { + if (!configSrc.includes(field)) { + findings.push(`[CONFIG_FIELD_MISSING] Field '${field}' documented but not found in WinMLBuildConfig source`); + } + } + } + } + } + + if (findings.length === 0) return "✅ Documentation aligns with source code."; + return `Found ${findings.length} mismatch(es):\n\n${findings.join("\n")}`; + }, + }, + { + name: "doc_check_samples", + description: + "Verify that sample pages (docs/samples/) use correct model IDs, command flags, and pipeline steps that match the current CLI 
capabilities. Checks model IDs are valid HuggingFace references and command examples use documented flags.", + parameters: { type: "object", properties: {} }, + skipPermission: true, + handler: async () => { + const findings = []; + const sampleFiles = await findFiles("*.md", join(DOCS_DIR, "samples")); + + // Load all documented flags from command pages + const cmdFiles = await findFiles("*.md", join(DOCS_DIR, "commands")); + const allFlags = new Map(); // command -> Set of flags + for (const cmdFile of cmdFiles) { + const content = await readTextFile(join(DOCS_DIR, "commands", cmdFile)); + if (!content) continue; + const cmdName = cmdFile.replace(".md", ""); + const flags = new Set(); + const flagPattern = /\|\s*`(--[\w-]+)`/g; + let match; + while ((match = flagPattern.exec(content)) !== null) { + flags.add(match[1]); + } + allFlags.set(cmdName, flags); + } + + for (const sampleFile of sampleFiles) { + const content = await readTextFile(join(DOCS_DIR, "samples", sampleFile)); + if (!content) continue; + + // Check command examples use valid flags + const codeBlocks = content.match(/```bash\n([\s\S]*?)```/g) || []; + for (const block of codeBlocks) { + // Find winml commands + const cmdPattern = /winml\s+(\w+)(.*)/g; + let match; + while ((match = cmdPattern.exec(block)) !== null) { + const cmd = match[1]; + const argsStr = match[2]; + const docFlags = allFlags.get(cmd); + if (!docFlags || docFlags.size === 0) continue; + + // Extract flags used + const usedFlags = argsStr.match(/--[\w-]+/g) || []; + for (const flag of usedFlags) { + if (!docFlags.has(flag)) { + findings.push(`[UNDOCUMENTED_FLAG] ${sampleFile}: 'winml ${cmd} ${flag}' uses flag not in docs/commands/${cmd}.md`); + } + } + } + } + } + + if (findings.length === 0) return "✅ All sample commands use documented flags."; + return `Found ${findings.length} issue(s):\n\n${findings.join("\n")}`; + }, + }, + ], +}); diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 
index 000000000..2b2d9d5e7 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,68 @@
name: Build & Publish Docs

# Triggers: pushes to main that touch the docs, published releases, and manual runs.
on:
  push:
    branches: [main]
    paths: ["docs/**", "mkdocs.yml"]
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      version:
        description: "Version label to deploy (e.g., 0.2). Leave empty to use 'dev'."
        required: false

permissions:
  contents: write
  pages: write

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history, including the branch mike commits to.
          fetch-depth: 0

      - uses: astral-sh/setup-uv@v3
        with:
          python-version: "3.11"

      - run: uv sync --extra dev

      - name: Configure git for mike
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

      # Outputs: `version` (label to deploy) and `alias` ("latest" or empty).
      # Event data is passed through env vars instead of being expanded inside the
      # script, so a crafted tag name or input cannot inject shell commands.
      - name: Determine version
        id: version
        env:
          EVENT_NAME: ${{ github.event_name }}
          RELEASE_TAG: ${{ github.event.release.tag_name }}
          INPUT_VERSION: ${{ github.event.inputs.version }}
        run: |
          if [[ "$EVENT_NAME" == "release" ]]; then
            # Strip 'v' prefix from tag: v0.2.0 → 0.2
            VERSION="${RELEASE_TAG#v}"
            VERSION="${VERSION%.*}"
            echo "version=$VERSION" >> "$GITHUB_OUTPUT"
            echo "alias=latest" >> "$GITHUB_OUTPUT"
          elif [[ -n "$INPUT_VERSION" ]]; then
            echo "version=$INPUT_VERSION" >> "$GITHUB_OUTPUT"
            echo "alias=latest" >> "$GITHUB_OUTPUT"
          else
            echo "version=dev" >> "$GITHUB_OUTPUT"
            echo "alias=" >> "$GITHUB_OUTPUT"
          fi

      - name: Deploy docs with mike
        env:
          VERSION: ${{ steps.version.outputs.version }}
          ALIAS: ${{ steps.version.outputs.alias }}
        run: |
          if [[ -n "$ALIAS" ]]; then
            uv run mike deploy "$VERSION" "$ALIAS" --update-aliases --push
          else
            uv run mike deploy "$VERSION" --push
          fi

      # Only versions deployed with the "latest" alias move the site default.
      - name: Set default version
        if: steps.version.outputs.alias == 'latest'
        run: uv run mike set-default latest --push
diff --git a/.gitignore b/.gitignore index 6d8e97985..2184e9f72 100644 --- a/.gitignore +++ b/.gitignore @@ -264,3 +264,6 @@ specs/ # Runtime check rule artifacts (hosted in external repo)
src/winml/modelkit/analyze/rules/runtime_check_rules/**/*.parquet + +# Generated by mike (docs versioning) +docs/versions.json diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ff4ce5e10..0602c3003 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,7 +14,9 @@ We're always looking for your help to improve the product (bug fixes, new featur See the [README](./README.md#getting-started) for prerequisites and installation instructions. Then set up your development environment: ```bash -uv sync +git clone https://github.com/microsoft/winml-cli.git +cd winml-cli +uv sync --extra dev uv run pre-commit install ``` @@ -24,6 +26,13 @@ This installs all dependencies and enables [pre-commit hooks](https://pre-commit When running WinML CLI from a source tree (`uv run winml ...`), you need to populate the runtime check rule zips locally. See [`src/winml/modelkit/analyze/rules/runtime_check_rules/README.md`](./src/winml/modelkit/analyze/rules/runtime_check_rules/README.md) for setup options (GitHub release for external contributors, `gim-home` script for Microsoft internal, `WINMLCLI_RULES_DIR` override). +For external contributors, download from a GitHub release (PowerShell): + +```powershell +gh release download --repo microsoft/winml-cli --pattern 'rules-v*.zip' --dir . +Expand-Archive -Path .\rules-v*.zip -DestinationPath src\winml\modelkit\analyze\rules\runtime_check_rules -Force +``` + ## Coding conventions and standards ### Python code style diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..040fb3423 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,129 @@ +# Contributing to winml-cli docs + +This folder hosts the source for the [winml-cli](https://github.com/microsoft/winml-cli) documentation site, built with [MkDocs Material](https://squidfunk.github.io/mkdocs-material/). 
+ +## Quick reference + +| Task | Command | +|---|---| +| Install dev deps | `uv sync --extra dev` | +| Live preview | `uv run mkdocs serve` | +| Build for CI | `uv run mkdocs build --strict` | +| Publish (one-shot from laptop) | `uv run mkdocs gh-deploy --force` | +| Publish (CI workflow) | GitHub Actions → "Build & Publish Docs" → Run workflow | + +## What's in here + +``` +docs/ +├── index.md ← landing page +├── getting-started/ ← 3 onboarding pages +├── concepts/ ← 12 conceptual pages in two sub-groups +│ ├── how-it-works.md, graphs-and-ir.md, weight-and-activation.md, +│ │ eps-and-devices.md, quantization.md (Fundamentals) +│ └── primitives-and-pipeline.md, config-and-build.md, load-and-export.md, analyze-and-optimize.md, +│ compile-and-epcontext.md, perf-and-monitoring.md, eval-and-datasets.md (WinML CLI workflows) +├── commands/ ← per-command reference (overview + 12 commands) +├── samples/ ← reference-style walkthroughs +├── tutorials/ ← classroom-style walkthroughs +├── reference/ ← P2 stubs +├── troubleshooting.md ← P2 stub +├── contributing.md ← P2 stub +│ +├── superpowers/ ← specs, plans, review notes (excluded from build) +├── design/ ← internal ADRs and design docs (excluded) +├── naming-convention.md ← internal style guide (excluded) +└── pytest-best-practices.md ← internal style guide (excluded) +``` + +The site config (`mkdocs.yml`) lives at the repo root, not inside `docs/`. The build outputs to `site/` (gitignored). + +## Local development + +### Prerequisites + +Python 3.10+ and [uv](https://github.com/astral-sh/uv). + +### Setup and preview + +```bash +# from the repo root +uv sync --extra dev +uv run mkdocs serve +``` + +Open http://127.0.0.1:8000/ in a browser. The server auto-reloads when you edit any `.md` file under `docs/`. Changes to `mkdocs.yml` (nav, theme, plugins) require a manual server restart. + +### Validate before pushing + +```bash +uv run mkdocs build --strict +``` + +`--strict` must exit 0 with no `WARNING` lines. 
Common causes of strict-mode failures: + +- A new page added without an entry in `nav:` (gives a "not included in nav" warning) +- A nav entry pointing at a file that doesn't exist +- A relative link like `[text](other-page.md)` whose target file is missing +- A markdown anchor like `[link](#section-heading)` that doesn't match any heading slug + +## Publishing + +The site publishes to **GitHub Pages** from the `gh-pages` branch. The repo's `Settings → Pages` source is set to "Deploy from a branch" → `gh-pages` → `/ (root)`. + +### One-shot publish from your laptop + +```bash +uv run mkdocs gh-deploy --force +``` + +This builds the site locally, commits the static HTML to a local `gh-pages` branch, and force-pushes it to `origin/gh-pages`. GitHub Pages picks up the new commit within ~30–60 seconds. Note that the CI workflow publishes versioned docs to the same branch with `mike`; a force-push from `mkdocs gh-deploy` replaces whatever CI has published there. + +### Publish via CI + +The workflow at `.github/workflows/docs.yml` publishes versioned docs to `gh-pages` with `mike`. To run it manually: + +1. `Actions → Build & Publish Docs → Run workflow` +2. Select the branch you want to publish from (typically `main`) + +The workflow also runs automatically: on every push to `main` that touches `docs/**` or `mkdocs.yml` (published as the `dev` version), and on every published release (published under the release's `major.minor` version with the `latest` alias). If you also want changes to `pyproject.toml` or to the workflow file itself to trigger a publish, extend the `paths` filter of the `push` trigger as shown below, leaving the existing `release` trigger and `workflow_dispatch` inputs in place: + +```yaml +on: + push: + branches: [main] + paths: + - 'docs/**' + - 'mkdocs.yml' + - 'pyproject.toml' + - '.github/workflows/docs.yml' + workflow_dispatch: +``` + +## Authoring conventions + +- **Product name**: `winml-cli` (lowercase, hyphenated) throughout user-facing prose. Use `WinML CLI` (or `Windows ML`) only where the broader Microsoft brand is meant. +- **Command name**: the CLI invocation is always `winml <command>`. Never `wmk`. +- **Flag verification**: every flag mentioned in docs must exist in `src/winml/modelkit/commands/<command>.py`. Run `uv run winml <command> --help` to confirm. +- **Source citations**: when documenting source-grounded behavior (e.g., "the default opset is 17"), cite the file path and ideally the symbol name. Avoid line numbers — they drift fast. 
+- **Mermaid diagrams**: use `pymdownx.superfences` syntax (already configured in `mkdocs.yml`). +- **Tabbed code blocks**: use `pymdownx.tabbed` (`=== "Label"` followed by a blank line and 4-space-indented code block). +- **Admonitions**: `!!! note "Title"`, `!!! warning "Title"`, `!!! info "Title"`. +- **No emojis** in pages unless they're part of an external attribution (e.g., a GitHub badge). + +## Excluded paths + +The following are present in `docs/` but **excluded from the published site** via the `exclude_docs:` block in `mkdocs.yml`. They are kept in-repo for contributors: + +- `docs/design/` — internal architecture decision records and design notes +- `docs/superpowers/` — specs, plans, and review notes accumulated during doc development +- `docs/naming-convention.md` — internal naming conventions for code review +- `docs/pytest-best-practices.md` — internal testing style guide + +If you add new internal-only content, either place it under one of these excluded paths or add a new entry to `exclude_docs` in `mkdocs.yml`. + +## See also + +- [MkDocs Material reference](https://squidfunk.github.io/mkdocs-material/reference/) +- [MkDocs Material navigation setup](https://squidfunk.github.io/mkdocs-material/setup/setting-up-navigation/) +- [MkDocs Material color palette](https://squidfunk.github.io/mkdocs-material/setup/changing-the-colors/) diff --git a/docs/assets/optimize-analyze-loop.svg b/docs/assets/optimize-analyze-loop.svg new file mode 100644 index 000000000..85298f599 --- /dev/null +++ b/docs/assets/optimize-analyze-loop.svg @@ -0,0 +1,95 @@ + + + + + + + + + + + + + + + + config.optim + + + + + + + Autoconf loop + + + + Optimize + + + + + + + Analyze + + + + + + + new + flags? + + + + yes + + + + no + + + + Final + Analyze + + + + + + + has + errors? 
+ + + + yes + + + RuntimeError + + + + no + + + + + + + Transform + + + Analysis + + + Decision + + + Error + + + Autoconf loop + diff --git a/docs/commands/analyze.md b/docs/commands/analyze.md new file mode 100644 index 000000000..1f6298f67 --- /dev/null +++ b/docs/commands/analyze.md @@ -0,0 +1,119 @@ +# winml analyze + +> Verify an ONNX model is compatible with a target execution provider before deployment. + +## When to use this + +Use `winml analyze` before running the full build pipeline to confirm that your ONNX model's operators are supported by the intended execution provider and device. It surfaces operator gaps and actionable recommendations early, saving time that would otherwise be spent on a failed compile or quantize run. + +## Synopsis + +```bash +$ winml analyze [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--model` | `-m` | `PATH` | *(required)* | Path to the ONNX model file to analyze. | +| `--ep` | | choice | `auto` | Target execution provider. Accepts full names (e.g., `QNNExecutionProvider`) or short aliases (`qnn`, `openvino`, `vitisai`, `cpu`, `cuda`, `dml`, `nvtensorrtrtx`, `migraphx`). Use `all` for every rule-data-backed EP, or `auto` to infer from local availability. | +| `--device` | | `cpu\|gpu\|npu\|all\|auto` | `auto` | Target device type. `auto` infers from local availability; `all` evaluates all rule-data-backed devices. | +| `--verbose` | `-v` | flag | off | Enable verbose output. | +| `--quiet` | `-q` | flag | off | Suppress non-essential output. | +| `--config` | `-c` | `PATH` | *(none)* | Build configuration file (YAML/JSON). | +| `--output` | | `PATH` | *(none)* | Save the full JSON result to a file in addition to printing the console summary. | +| `--information` / `--no-information` | | flag | enabled | Include detailed per-operator recommendations and remediation hints in the output. Pass `--no-information` for a compact pass/fail summary. 
| +| `--htp-metadata` | | `PATH` | *(none)* | Path to an HTP metadata JSON file (produced by `winml export`). Enriches subgraph pattern extraction by mapping nodes back to their source module hierarchy. Benefits all target EPs. | +| `--run-unknown-op` / `--no-run-unknown-op` | | flag | disabled | For operators not in the rule database, build a minimal ONNX graph and run it on the target EP locally to determine support. Enable when local EP libraries are available. | +| `--save-node` | | `partial\|unsupported` | *(none)* | Save partial or unsupported node subgraphs to disk for further investigation. Can be specified multiple times: `--save-node partial --save-node unsupported`. | +| `--optim-config` | | `PATH` | *(none)* | Save the auto-discovered optimization config (merged across all analyzed EPs) to a JSON file. | + +## How it works + +`winml analyze` loads the ONNX model and runs a static analysis pass via `ONNXStaticAnalyzer`. For each operator (and recognized subgraph pattern), the analyzer consults the target EP's rule database. For operators not in the database, it can optionally probe them locally when `--run-unknown-op` is enabled. The combined answer classifies each node as supported, partial, unsupported, or unknown (see [Analyze and optimize](../concepts/analyze-and-optimize.md) for definitions). + +The analysis always produces a **lint** result — the pass/fail verdict. When `--information` is enabled (the default), it additionally produces an **autoconf** result: a set of fusion-flag suggestions that, if applied in the optimize stage, would resolve partial or unsupported patterns. Pass `--no-information` to skip autoconf and get just the lint verdict. + +### Exit codes + +| Code | Meaning | +|------|---------| +| `0` | All operators are fully supported on the target EP. | +| `1` | At least one operator is unsupported, partially supported, or unknown. | +| `2` | Input or configuration error (bad path, unknown EP, etc.). 
| + +Exit codes make `winml analyze` safe to use as a CI gate with `set -e` or `$?` checks. + +## Examples + +Analyze using auto-detected EP and device: + +```bash +$ winml analyze --model microsoft/resnet-50.onnx +``` + +The output shows a live progress table per EP followed by an `ANALYSIS SUMMARY` section. Each EP line displays support counts in `S/P/U/Unk` format (Supported / Partial / Unsupported / Unknown) with color-coded indicators. + +Check QNN NPU support using the short alias: + +```bash +$ winml analyze --model bert-base-uncased.onnx --ep qnn --device NPU +``` + +Check Intel OpenVINO GPU support and print operator-level recommendations: + +```bash +$ winml analyze --model bert-base-uncased.onnx --ep openvino --device GPU --information +``` + +Save the full JSON result for offline inspection while still printing the console summary: + +```bash +$ winml analyze --model facebook/convnext-tiny-224.onnx --output results.json +``` + +Use HTP metadata for enhanced subgraph pattern extraction: + +```bash +$ winml analyze --model bert-base-uncased.onnx \ + --ep qnn --device NPU \ + --htp-metadata bert-base-uncased_htp_metadata.json +``` + +Run a lint-only pass (no recommendations) for a CI gate: + +```bash +$ winml analyze --model model.onnx --ep qnn --device NPU --no-information +echo "Exit code: $?" # 0 = clean, 1 = issues, 2 = input error +``` + +Dump unsupported subgraphs to disk for debugging: + +```bash +$ winml analyze --model model.onnx --ep qnn \ + --save-node partial --save-node unsupported \ + --output result.json +``` + +Enable local execution for operators not in the rule database: + +```bash +$ winml analyze --model model.onnx --ep qnn --device NPU --run-unknown-op +``` + +## Common pitfalls + +- **Omitting `--ep` uses `auto` (inferred from local availability)** — to analyze every EP regardless of what is installed, pass `--ep all`. Specify `--ep ` when you know your target hardware. 
+- **Exit code 1 is not a hard failure** — it means at least one operator is unsupported, partially supported, or unknown, not that the model cannot run at all. Many EPs automatically fall back to the CPU EP for unsupported nodes; review the recommendations before deciding to restructure the model. +- **`--htp-metadata` is EP-agnostic** — HTP metadata enriches pattern extraction before any EP-specific checks, so it benefits all target EPs equally. You do not need separate metadata files per EP. +- **`--run-unknown-op` is disabled by default** — operators not covered by the rule database are classified as `UNKNOWN` (not unsupported) unless you explicitly pass `--run-unknown-op` to probe them locally. Enable it only when the target EP's libraries are available on the local machine. +- **The model path must point to an existing `.onnx` file** — symbolic HuggingFace model IDs are not accepted; export the model first with `winml export`. + +## See also + +- [Analyze and optimize](../concepts/analyze-and-optimize.md) — conceptual deep dive on classifications, lint vs autoconf, and the analyzer/optimizer loop +- [eps-and-devices.md](../concepts/eps-and-devices.md) — background on ONNX operators and execution providers +- [export.md](export.md) — convert a HuggingFace model to ONNX before analyzing +- [compile.md](compile.md) — compile the model for the target EP after analysis passes +- [sys.md](sys.md) — list EPs available on the current machine diff --git a/docs/commands/build.md b/docs/commands/build.md new file mode 100644 index 000000000..39cd6a8c9 --- /dev/null +++ b/docs/commands/build.md @@ -0,0 +1,117 @@ +# winml build + +> Run the entire winml-cli pipeline (export → optimize → quantize → compile) in one command. + +## When to use this + +Use `winml build` when you want to go from a Hugging Face model ID (or an +existing `.onnx` file) to a deployment-ready artifact in a single invocation, +without manually chaining `winml export`, `winml optimize`, `winml quantize`, +and `winml compile`. 
A build config file — generated by `winml config` — controls every +stage of the pipeline. + +## Synopsis + +```bash +$ winml build [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|---|---|---|---|---| +| `--config` | `-c` | path | `None` | `WinMLBuildConfig` JSON file, generated by `winml config`. If omitted, config is auto-generated from `-m`. | +| `--model` | `-m` | string | `None` | Hugging Face model ID or path to an existing `.onnx` file. | +| `--output-dir` | `-o` | path | `None` | Directory for all build artifacts. Mutually exclusive with `--use-cache`. | +| `--use-cache/--no-use-cache` | | flag | `false` | Store artifacts in the winml-cli global cache (`~/.cache/winml/`). Mutually exclusive with `--output-dir`. | +| `--rebuild/--no-rebuild` | | flag | `false` | Overwrite existing artifacts and re-run the full pipeline. | +| `--quant/--no-quant` | | flag | `true` | Run the quantization stage (use `--no-quant` to skip), overriding the config. | +| `--no-compile` / `--compile` | | flag | `None` | Override compilation. `--compile` forces enable (config must have a compile section). `--no-compile` forces skip. Default: inherit from config. | +| `--optimize/--no-optimize` | | flag | `true` | Run the optimization stage (use `--no-optimize` to skip). | +| `--ep` | | string | `None` | Target execution provider for the analyzer (e.g., `qnn`). Falls back to the compile config EP if not set. | +| `--device` | `-d` | string | `auto` | Target device for the analyzer (e.g., `npu`, `gpu`). Default: `auto` (auto-detect). | +| `--analyze/--no-analyze` | | flag | `true` | Run the analyzer loop during build (use `--no-analyze` to skip). | +| `--max-optim-iterations` | | integer | `None` | Maximum autoconf re-optimization rounds (3 enforced internally when not set). `--no-analyze` implicitly sets this to 0. | +| `--trust-remote-code/--no-trust-remote-code` | | flag | `false` | Allow executing custom code from model repositories. 
Use only with trusted sources. | +| `--allow-unsupported-nodes/--no-allow-unsupported-nodes` | | flag | `false` | Allow unsupported nodes to remain in the graph instead of failing the build. | +| `--help` | `-h` | flag | | Show this message and exit. | + +## How it works + +`winml build` reads a `WinMLBuildConfig` JSON file (from `winml config`) that +encodes device, precision, export, quantization, and compilation settings. +When `-m` is a Hugging Face model ID, the full pipeline runs: export → optimize +→ quantize → compile. When `-m` points to an existing `.onnx` file, the export +stage is skipped and the pipeline starts at optimization. After compilation, an +optional analyzer loop (`--max-optim-iterations`) re-evaluates graph quality +and applies further passes; `--no-analyze` disables it for a deterministic +single-pass build. Individual stages can be suppressed with `--no-quant`, +`--no-compile`, and `--no-optimize` without touching the config file. + +!!! tip "Reproducible CI/CD builds" + The config file is a portable, self-contained pipeline specification. Check it into source control and invoke `winml build -c config.json` in CI to produce identical artifacts without manual flag management. Set `"auto": false` in the config to disable the autoconf discovery loop for fully deterministic output. 
+ +## Examples + +```bash +# Full pipeline: HF model → export → optimize → quantize → compile +winml build -c config.json -m microsoft/resnet-50 -o output/ +``` + +```text +winml build + Config: config.json + Model: microsoft/resnet-50 + Output: output/ + + export done (28.3s) + optimize done (4.1s) + quantize done (6.8s) + compile done (14.2s) + + Build complete in 53.4s + Final artifact: output/resnet50_ctx.onnx +``` + +```bash +# Start from a pre-exported ONNX file (skips export stage) +winml build -c config.json -m resnet50.onnx -o output/ +``` + +```bash +# Export and optimize only — skip quantization and compilation for quick testing +winml build -c config.json -m bert-base-uncased -o output/ \ + --no-quant --no-compile +``` + +```bash +# Force a clean rebuild, overwriting any cached artifacts +winml build -c config.json -m facebook/convnext-tiny-224 -o output/ --rebuild +``` + +```bash +# Use the global cache and cap optimizer iterations for faster turnaround +winml build -c config.json -m microsoft/resnet-50 \ + --use-cache --max-optim-iterations 1 +``` + +## Common pitfalls + +- **Either `--output-dir` or `--use-cache` is required; they are mutually + exclusive.** Omitting both raises an error immediately. +- **`--use-cache` is not supported in module mode.** When the config is a JSON + array (module mode), only `--output-dir` is accepted. +- **The config file must come from `winml config`.** The schema is strict; + unknown keys are rejected. +- **Existing artifacts are reused by default.** Pass `--rebuild` to force a + fresh run after changing the config. 
+ +## See also + +- [winml export](export.md) +- [winml compile](compile.md) +- [Config and build](../concepts/config-and-build.md) +- [How it works](../concepts/how-it-works.md) +- [Config Schema](../reference/index.md) — full field-by-field config reference +- [Output Layout](../reference/output-layout.md) — what each output file contains +- [Supported Models](../reference/supported-models.md) — validated model architectures diff --git a/docs/commands/catalog.md b/docs/commands/catalog.md new file mode 100644 index 000000000..87039b061 --- /dev/null +++ b/docs/commands/catalog.md @@ -0,0 +1,100 @@ +# winml catalog + +> Browse the curated winml-cli catalog of validated models and benchmarks. + +## When to use this + +Use `winml catalog` to discover which HuggingFace models have been validated end-to-end +by the winml-cli team — exported, quantized, compiled, and benchmarked on real Windows +ML devices. It is the starting point when you want a model that is known to work +before investing time in a custom build. + +## Synopsis + +```bash +$ winml catalog [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--model-type` | | string | `null` | Filter the catalog by model architecture (case-insensitive). Examples: `bert`, `roberta`, `vit`. | +| `--task` | `-t` | string | `null` | Filter by HuggingFace task (case-insensitive). Examples: `text-classification`, `image-segmentation`. | +| `--ep/--execution-provider` | | string | `null` | Filter by execution provider (e.g., `qnn`, `dml`). If not specified, shows all EPs. | +| `--device` | `-d` | string | `null` | Filter by target device (e.g., `npu`, `gpu`). If not specified, shows all devices. | +| `--output` | `-o` | path | `null` | Save the displayed results to a JSON file. | +| `--help` | `-h` | flag | — | Show help and exit. | + +> `winml catalog` reads a local catalog bundled with the package — no network access is +> required. 
+ +## How it works + +The catalog is stored in `winml/modelkit/data/hub_models.json` and is loaded +directly from the installed package data without any network call. Each catalog +entry records the model ID, task, architecture type, and model size. Use +`--model-type`, `--task`, `--ep`, or `--device` to narrow the displayed list. +When `--output` is provided, the filtered results are written as indented JSON +to the specified path. + +## Examples + +```bash +# List all validated models in the catalog +$ winml catalog +``` + +```text ++--- winml-cli Catalog | 12 validated model(s) --------------------------+ +| Model Task Model Type | +| microsoft/resnet-50 image-classification resnet | +| bert-base-uncased fill-mask bert | +| ProsusAI/finbert text-classification bert | +| ... | ++---------------------------------------------------------------------------+ +Use --ep or --device to filter by execution provider or target device. +``` + +```bash +# Filter to BERT-family models only +$ winml catalog --model-type bert +``` + +```bash +# Filter by task — show only text-classification models +$ winml catalog --task text-classification +``` + +```bash +# Combine filters — BERT models for text classification +$ winml catalog --model-type bert --task text-classification +``` + +```bash +# Save filtered results to JSON for offline review +$ winml catalog --task image-classification --output results/image_catalog.json +``` + +## Common pitfalls + +- **The catalog reflects a point-in-time snapshot.** Models listed in the catalog + were validated against a specific version of winml-cli, ONNX Runtime, and the + relevant EP driver. Accuracy and latency may differ on your hardware or with + updated drivers. +- **`--output` only saves what was displayed.** Combining a filter with `--output` + saves the filtered list. There is no dedicated flag to dump the entire catalog — + omit all filters and add `--output` to save every entry. 
+- **A model not in the catalog can still be used with winml-cli.** The catalog covers + tested models; `winml inspect` and `winml export` work with any HuggingFace model + that has a supported architecture, whether or not it appears in the catalog. + +## See also + +- [inspect.md](inspect.md) — check loader, exporter, and task detection for any + HuggingFace model ID +- [sys.md](sys.md) — verify your environment and EP availability before building +- [How winml-cli Works](../concepts/how-it-works.md) — pipeline overview from export + to benchmark +- [Quantization & QDQ](../concepts/quantization.md) — understand quantization concepts + and precision options diff --git a/docs/commands/compile.md b/docs/commands/compile.md new file mode 100644 index 000000000..a3d4e8f46 --- /dev/null +++ b/docs/commands/compile.md @@ -0,0 +1,97 @@ +# winml compile + +> Compile an ONNX model to an EP-specific format for fast runtime loading. + +## When to use this + +Use `winml compile` as the final pipeline stage after `winml quantize` to +produce an execution-provider-native artifact (for example, a QNN EPContext +model) that loads faster and avoids online graph compilation at inference time. + +## Synopsis + +```bash +$ winml compile [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|---|---|---|---|---| +| `--model` | `-m` | path | *(required unless `--list`)* | Input ONNX model file. | +| `--output` | `-o` | path | — | Output file path (e.g., `model_compiled.onnx`). Takes precedence over `--output-dir`. | +| `--output-dir` | | path | same dir as input | Directory to write compiled output artifacts. | +| `--device` | `-d` | choice | `auto` | Target device: `auto`, `npu`, `gpu`, or `cpu`. | +| `--ep` | | `TEXT` | — | Force a specific execution provider, overriding device-to-provider mapping. Accepts full names (e.g., `QNNExecutionProvider`) or aliases (`qnn`, `dml`, `openvino`, `vitisai`, `migraphx`, `cpu`, `nvtensorrtrtx`). 
| +| `--validate` / `--no-validate` | | flag | `--validate` | Run a post-compilation validation pass on the target hardware. Enabled by default; pass `--no-validate` to skip when the target hardware or driver is unavailable. | +| `--compiler` | | choice | `ort` | Compiler backend: `ort` (ONNX Runtime) or `qairt` (Qualcomm AI Runtime Tools). | +| `--qnn-sdk-root` | | path | `None` | Path to the QNN SDK root directory. | +| `--embed/--no-embed` | | flag | `false` | Embed the EP context blob inside the ONNX file instead of writing a separate `.bin` file. | +| `--list` | | flag | `false` | List available compiler backends for the selected device and exit without compiling. | +| `--help` | `-h` | flag | | Show this message and exit. | + +## How it works + +`winml compile` resolves the target execution provider from `--device` and +`--ep`, then calls the winml-cli compiler API to hand the ONNX graph to the +EP's offline compilation toolchain. When `--device auto` (the default), the +target EP is determined by auto-detecting available hardware. For NPU targets, +ONNX Runtime's QNN EP generates a binary `.bin` context file (or embeds it +inline with `--embed`) that encodes the hardware-optimized execution plan, +eliminating graph partitioning at load time. An optional post-compilation +validation pass runs a forward pass through the +target EP; skip it with `--no-validate` when the target hardware is absent. + +## Examples + +```bash +# Compile with auto device detection (default compiler) +winml compile -m resnet50_qdq.onnx +``` + +```text +Input: resnet50_qdq.onnx +Device: npu +Provider: qnn +Compiler: ort + +Compiling model... + +Success! 
Model compiled +Output: resnet50_qdq_ctx.onnx +Compile time: 12.40s +Total time: 13.05s +``` + +```bash +# List available compiler backends for NPU before committing to a run +winml compile --list --device npu +``` + +```bash +# Compile a pre-quantized BERT model for NPU with context embedded inline +winml compile -m bert-base-uncased_qdq.onnx --embed +``` + +```bash +# Compile for GPU using the OpenVINO execution provider +winml compile -m microsoft_resnet50.onnx --device gpu --ep openvino +``` + +## Common pitfalls + +- **`--embed` inflates the `.onnx` file significantly.** Embedding the EP + context produces a single portable file but can make it impractical to open or + inspect the ONNX graph with standard tooling. +- **Validation requires the target hardware.** The post-compilation validation + step runs an actual inference pass; on a machine without the NPU driver or the + relevant EP installed, always pass `--no-validate`. +- **`--device auto` auto-detects the best available hardware.** Pass `--device npu`, + `--device gpu`, or `--device cpu` explicitly when targeting specific hardware + regardless of what is auto-detected. + +## See also + +- [winml quantize](quantize.md) +- [winml build](build.md) +- [ONNX and execution providers](../concepts/eps-and-devices.md) diff --git a/docs/commands/config.md b/docs/commands/config.md new file mode 100644 index 000000000..63c5bdf0e --- /dev/null +++ b/docs/commands/config.md @@ -0,0 +1,99 @@ +# winml config + +> Generate a reusable build configuration for a Hugging Face model or ONNX file. + +## When to use this + +Use `winml config` at the start of a new model project to produce a `WinMLBuildConfig` JSON file. The config captures the model identity, task, precision, and per-stage settings in one shareable artifact that you can edit, version-control, and repeatedly pass to `winml build`. Running config first lets you review and adjust pipeline settings before committing to a full build. 
+ +## Synopsis + +```bash +$ winml config [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--model` | `-m` | `TEXT` | *(none)* | HuggingFace model ID (e.g., `microsoft/resnet-50`) or path to an existing `.onnx` file. Optional when `--model-type` or `--model-class` is provided. | +| `--task` | `-t` | `TEXT` | *(auto)* | Override the auto-detected task (e.g., `image-classification`, `text-classification`). When omitted, the first supported task for the model is selected automatically. | +| `--model-class` | | `TEXT` | *(auto)* | Override the auto-detected model class (e.g., `CLIPTextModelWithProjection`). Useful for multi-component models. | +| `--model-type` | | `TEXT` | *(auto)* | Override the auto-detected model type (e.g., `bert`, `resnet`). Can be used without `-m` to generate a config from HuggingFace default settings. | +| `--module` | | `TEXT` | *(none)* | Generate configs for every submodule whose class name matches the given string (e.g., `ResNetConvLayer`). The output is a JSON array instead of a single object. | +| `--config` | `-c` | `PATH` | *(none)* | JSON override file in `WinMLBuildConfig` format. Fields present in this file take precedence over auto-detected values. | +| `--shape-config` | | `PATH` | *(none)* | JSON file with input shape overrides for dummy input generation. Valid keys by modality — text: `sequence_length`; vision: `height`, `width`, `num_channels`; audio: `feature_size`, `nb_max_frames`, `audio_sequence_length`. | +| `--device` | `-d` | `auto\|npu\|gpu\|cpu` | `auto` | Target device. Affects the generated quantization and compilation sub-configs. `auto` leaves those sections unchanged from the kit defaults. | +| `--ep` | | `TEXT` | *(none)* | Force a specific execution provider (`qnn`, `dml`, `migraphx`, `tensorrt`, `vitisai`, `openvino`, `cpu`). Overrides the device-to-provider mapping. 
When used without `--device`, the device is inferred from the EP. | +| `--precision` | `-p` | `TEXT` | `auto` | Target precision: `auto`, `fp32`, `fp16`, `int8`, `int16`, or a mixed format such as `w8a16`. `auto` selects the precision based on the chosen device. | +| `--output` | `-o` | `PATH` | *(stdout)* | Write the generated JSON to this file instead of printing to stdout. | +| `--library` | | `TEXT` | `transformers` | Source library for `TasksManager` task lookup. Defaults to `transformers`; set to `diffusers` or another Optimum-supported library when needed. | +| `--quant/--no-quant` | | flag | `true` | Include quantization in the generated config (use `--no-quant` to omit it and set `quant` to `null`). | +| `--no-compile` / `--compile` | | flag | `--no-compile` (compile excluded by default) | Controls whether compilation is included in the generated config. By default compilation is **excluded** (`compile: null`). Pass `--compile` to include a compile section. | +| `--trust-remote-code/--no-trust-remote-code` | | flag | `false` | Allow execution of custom model code from the HuggingFace repository. Required for some community models. Only enable for repositories you trust. | + +## How it works + +`winml config` queries the HuggingFace `TasksManager` to auto-detect the model's task, class, and ONNX export specification. For known model types it looks up a per-model kit in `MODEL_BUILD_CONFIGS` and uses that as a starting point, layering in your device, precision, and override file on top. When `-m` points to an existing `.onnx` file, the export stage is skipped by setting `export` to `null` in the output. The result is a complete `WinMLBuildConfig` JSON printed to stdout or written to a file, ready to be passed to `winml build`. + +## Examples + +Generate a config for ResNet-50 with all auto-detected settings: + +```bash +$ winml config -m microsoft/resnet-50 +``` + +```text +Generating config for microsoft/resnet-50... 
+Auto-selected task: image-classification (from 'microsoft/resnet-50') +Generated config for task 'image-classification' +{ + "loader": { "task": "image-classification", ... }, + "export": { "opset_version": 17, ... }, + "optim": { ... }, + "quant": null, + "compile": null +} +``` + +Target NPU with int8 quantization and save to a file: + +```bash +$ winml config -m microsoft/resnet-50 --device npu --precision int8 -o resnet_npu.json +``` + +Generate a config for BERT and override the task: + +```bash +$ winml config -m bert-base-uncased --task text-classification -o bert_cls.json +``` + +Generate from a model type alone (no HuggingFace download required at config time): + +```bash +$ winml config --model-type bert --task fill-mask +``` + +Generate a config from an already-exported ONNX file, skipping quantization (compilation is already excluded by default): + +```bash +$ winml config -m facebook/convnext-tiny-224.onnx --no-quant -o convnext_optim_only.json +``` + +## Common pitfalls + +- **At least one of `-m`, `--model-type`, or `--model-class` is required** — calling `winml config` with none of these three flags raises a usage error immediately. +- **`auto` precision does not always map to a lower-bit type** — when `--device` is also `auto`, precision stays at the kit default (usually `fp32`). Explicitly pass `--device npu` or `--device gpu` for `auto` precision to resolve to `int8` or `fp16`. +- **`--module` changes the output shape** — with `--module` the JSON output is an array of configs, not a single object. Scripts that expect a single object will fail to parse this output. +- **`--trust-remote-code` has security implications** — only use this flag with model repositories you own or explicitly trust; it allows arbitrary Python execution from the remote model card. +- **Shape overrides in `--shape-config` are modality-specific** — passing a `sequence_length` key for a vision model has no effect. Check the `--help` description for valid keys per modality. 
+ +## See also + +- [Config and build](../concepts/config-and-build.md) — structure of `WinMLBuildConfig` and how stages interact +- [Config Schema](../reference/index.md) — full field-by-field config reference +- [Supported Models](../reference/supported-models.md) — validated model architectures +- [build.md](build.md) — run the full pipeline using a generated config +- [export.md](export.md) — export a HuggingFace model to ONNX as a standalone step +- [optimize.md](optimize.md) — apply graph optimizations to an existing ONNX file diff --git a/docs/commands/eval.md b/docs/commands/eval.md new file mode 100644 index 000000000..f31a5df28 --- /dev/null +++ b/docs/commands/eval.md @@ -0,0 +1,140 @@ +# winml eval + +> Evaluate ONNX model accuracy on a standard dataset. + +## When to use this + +Use `winml eval` to measure how accurately a model performs on real data — especially after quantization, where comparing the quantized model against the floating-point baseline reveals any accuracy regression introduced by precision reduction. + +## Synopsis + +```bash +$ winml eval [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|---|---|---|---|---| +| `--model` | `-m` | `TEXT` | — | HuggingFace model ID, or path to a local `.onnx` file. Required (unless `--model-id` is provided directly). | +| `--model-id` | | `TEXT` | — | HuggingFace model ID used for preprocessor and config resolution when `-m` points to an `.onnx` file. Required when `-m` is an ONNX file. | +| `--task` | | `TEXT` | auto-detected | Task name (e.g., `image-classification`). Auto-detected from `--model-id` when not provided. Required when `-m` is an ONNX file and the task cannot be inferred. | +| `--precision` | | `TEXT` | `auto` | Precision used when building the model from a HuggingFace ID. One of `auto`, `fp32`, `fp16`, `int8`, `int16`, or a mixed `w{x}a{y}` spec (e.g., `w8a16`). `fp16`/`fp32` skip quantization. 
**Ignored** when `-m` is a pre-built `.onnx` file — the precision is already baked in. | +| `--device` | | choice | `auto` | Target device. Choices: `auto`, `npu`, `gpu`, `cpu`. `auto` selects the best available device. Combined with `--precision`, this drives the build when `-m` is a HuggingFace ID. | +| `--ep` / `--execution-provider` | | `TEXT` | — | Target ONNX Runtime execution provider when finer control than `--device` is needed. Full names (e.g., `QNNExecutionProvider`, `OpenVINOExecutionProvider`, `VitisAIExecutionProvider`) and aliases (`qnn`, `ov`/`openvino`, `vitis`/`vitisai`) are accepted. | +| `--dataset` | | `TEXT` | task default | HuggingFace dataset path (e.g., `imagenet-1k`, `nyu-mll/glue`). If omitted, a default dataset is selected based on the task. | +| `--dataset-name` | | `TEXT` | — | Dataset configuration name for multi-config datasets. | +| `--dataset-revision` | | `TEXT` | — | Git revision (branch, tag, or commit) of the dataset to load. Use `refs/convert/parquet` for HF datasets that are only served via the parquet mirror. | +| `--dataset-script` | | `TEXT` | — | Path to a Python script that builds the evaluation dataset locally. Requires `--trust-remote-code`. | +| `--trust-remote-code / --no-trust-remote-code` | | flag | `false` | Allow executing custom code from model repositories or dataset scripts. Required with `--dataset-script`. Use only with trusted sources. | +| `--samples` | | `INTEGER` | `100` | Number of dataset samples to evaluate. | +| `--split` | | `TEXT` | `validation` | Dataset split to use (e.g., `validation`, `test`, `train`). | +| `--shuffle / --no-shuffle` | | flag | `shuffle` | Shuffle the dataset before sampling. Disable with `--no-shuffle` for reproducible sample ordering. | +| `--streaming / --no-streaming` | | flag | `false` | Stream the dataset from the Hub instead of downloading the full split. Useful for large datasets. 
| +| `--column` | | `TEXT` (multiple) | — | Column mapping as `key=value` pairs (e.g., `--column input_column=image`). Can be specified multiple times. | +| `--label-mapping` | | `PATH` | — | Path to a JSON file mapping dataset label names to the integer class IDs the model emits: `{"label_name": id}`. | +| `--output` | `-o` | `PATH` | — | Output JSON file path for the evaluation results. | +| `--schema` | | flag | `false` | Print the expected dataset schema for the given `--task` and exit. Does not run evaluation. | +| `--mode` | | `onnx\|compare` | `onnx` | Evaluation mode. `onnx` evaluates the ONNX candidate on a dataset. `compare` runs the ONNX candidate and the HuggingFace reference on identical random inputs and reports per-tensor similarity metrics — no dataset required. | + +## How it works + +`winml eval` loads the model and runs the evaluation pipeline via the internal `evaluate` function (supporting both HuggingFace IDs and local ONNX files), then pulls the requested number of samples from a HuggingFace dataset. Each sample is preprocessed using the tokenizer or image processor associated with the model ID, passed through the ONNX Runtime session, and the output is compared against the ground-truth label. Aggregated metrics (accuracy, F1, etc.) are printed to the console and optionally written to a JSON file. When `-m` is an ONNX file, `--model-id` must be provided so the command knows which preprocessor and label vocabulary to use. 
+ +## Examples + +Evaluate a HuggingFace model using the task-default dataset: + +```bash +$ winml eval -m microsoft/resnet-50 +``` + +```text +Task: image-classification +Dataset: timm/mini-imagenet (test, 100 samples) +Device: auto + +Accuracy: 76.00% + +Results saved to: microsoft_resnet-50_eval.json +``` + +Evaluate a pre-exported ONNX file, providing the source model ID for preprocessing: + +```bash +$ winml eval -m model.onnx --model-id microsoft/resnet-50 --dataset timm/mini-imagenet +``` + +Evaluate a BERT model on the MRPC paraphrase task with column remapping: + +```bash +$ winml eval -m Intel/bert-base-uncased-mrpc --dataset nyu-mll/glue --dataset-name mrpc --column input_column=sentence1 --column second_input_column=sentence2 --samples 500 +``` + +Check what dataset columns are expected before running, then remap them to match your dataset: + +```bash +$ winml eval --schema --task text-classification +``` + +```text +Input schema for text-classification models +================================================== + +--column option schema + +Evaluating needs a dataset with the following columns: + input_column + input text (default: text) + label_column + class label (ClassLabel or integer) (default: label) + second_input_column + second text for sentence-pair tasks (optional) (default: None) + +Override any default with --column: + --column input_column= + --column label_column= + --column second_input_column= +``` + +The GLUE SST-2 dataset uses `sentence` instead of the default `text` column, so remap it with a single `--column` override: + +```bash +$ winml eval -m distilbert/distilbert-base-uncased-finetuned-sst-2-english --dataset nyu-mll/glue --dataset-name sst2 --column input_column=sentence --samples 500 +``` + +Evaluate against a custom dataset whose label names differ from the model's class IDs. 
The `--label-mapping` flag points to a JSON file whose **keys are the label name strings as they appear in the dataset** and whose **values are the integer class IDs the model emits**. For example, ResNet-50 outputs ImageNet-1k class IDs (`0`–`999`), so if your custom dataset uses readable strings like `"tabby cat"` or `"golden retriever"`, `labels.json` translates each dataset label to the corresponding ImageNet ID the model predicts: + +```json +{ + "tabby cat": 281, + "Egyptian cat": 285, + "golden retriever": 207 +} +``` + +```bash +$ winml eval -m microsoft/resnet-50 --dataset my-org/my-pets-dataset --label-mapping labels.json -o results/resnet_eval.json +``` + +Evaluate a composite model from pre-exported ONNX files. Some tasks (e.g., `image-to-text`, encoder-decoder, dual-encoder) split the model across multiple ONNX files, one per role. Pass `-m` once per role as `<role>=<path>.onnx` and supply `--model-id` so the preprocessor and tokenizer can be resolved. Run `winml eval --schema --task image-to-text` to see the expected roles for a task: + +```bash +$ winml eval -m encoder=encoder.onnx -m decoder=decoder.onnx --model-id microsoft/trocr-base-printed +``` + +## Common pitfalls + +- **ONNX file without `--model-id` fails.** When `-m` is a `.onnx` path, `--model-id` is mandatory. Without it the command cannot resolve the preprocessor or label vocabulary and will exit with a usage error. +- **The task-default dataset may not match every model.** A default dataset cannot fit every model. Classification and detection models in particular need a dataset whose label space and domain match what the model was trained on — using the default may produce misleadingly low scores, missing-label errors, or a dataset-schema error. Always pass `--dataset` (and `--label-mapping` if needed) when evaluating a model whose label space or domain differs from the task default. 
+- **Gated datasets require Hub credentials.** Some datasets (e.g., `imagenet-1k`) require a HuggingFace account with accepted terms of use. Log in with `huggingface-cli login` before running eval on gated data. +- **`--shuffle` is on by default.** The random 100-sample slice changes between runs unless you pass `--no-shuffle`. Use `--no-shuffle` when comparing two model variants to ensure they see identical samples. +- **`--streaming` skips the local cache.** Streaming mode avoids downloading the full split but prevents random shuffling on large datasets. For reproducible evaluation, download the split once and omit `--streaming`. +- **Column names vary across datasets.** If the evaluator raises a missing-column error, run `winml eval --schema --task <task>` to inspect the expected schema and use `--column` to remap dataset field names to the expected names. + +## See also + +- [winml perf](perf.md) — measure latency and throughput on the same model +- [winml build](build.md) — produce the quantized artifact to evaluate +- [Quantization & QDQ](../concepts/quantization.md) — why accuracy validation after quantization matters +- [ONNX & Execution Providers](../concepts/eps-and-devices.md) — understand the `--device` option diff --git a/docs/commands/export.md b/docs/commands/export.md new file mode 100644 index 000000000..a18bdebf1 --- /dev/null +++ b/docs/commands/export.md @@ -0,0 +1,113 @@ +# winml export + +> Convert a PyTorch / Hugging Face model to ONNX, preserving module hierarchy. + +## When to use this + +Use `winml export` when you have a Hugging Face model ID or a local PyTorch +checkpoint and need an ONNX file as the first step of the optimization +pipeline. This is the entry point before `winml quantize` or `winml compile`. 
+ +## Synopsis + +```bash +$ winml export [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|---|---|---|---|---| +| `--model` | `-m` | string | *(required)* | Hugging Face model name or local path (e.g., `prajjwal1/bert-tiny`). | +| `--output` | `-o` | path | *(required)* | Output ONNX file path (e.g., `model.onnx`). | +| `--with-report/--no-with-report` | | flag | `false` | Generate full export reports: Markdown, JSON, and a console tree. | +| `--hierarchy/--no-hierarchy` | | flag | `true` | Preserve `hierarchy_tag` metadata in ONNX nodes (use `--no-hierarchy` for a clean ONNX file). | +| `--dynamo/--no-dynamo` | | flag | `false` | Enable PyTorch 2.9+ dynamo export for richer node metadata. (Experimental — currently logs a warning.) | +| `--torch-module` | | string | `None` | Comma-separated list of `torch.nn` module types to include in hierarchy (e.g., `LayerNorm,Embedding`). (Experimental — currently logs a warning.) | +| `--input-specs` | | path | `None` | JSON file with explicit input tensor specifications. Auto-generated when omitted. | +| `--task` | `-t` | string | `None` | Override auto-detected Hugging Face task (e.g., `image-feature-extraction`). | +| `--export-config` | | path | `None` | JSON file with ONNX export parameters such as `opset_version` and `do_constant_folding`. | +| `--shape-config` | | path | `None` | JSON object mapping symbolic dimension names to concrete sizes (e.g., `{"sequence_length": 2048}`). Ignored when `--input-specs` is provided. | +| `--trust-remote-code/--no-trust-remote-code` | | flag | `false` | Allow executing custom code from model repositories during export. Use only with trusted sources. | +| `--allow-unsupported-nodes/--no-allow-unsupported-nodes` | | flag | `false` | Allow unsupported nodes to remain in the exported graph instead of failing export. | +| `--help` | `-h` | flag | | Show this message and exit. 
| + +## How it works + +`winml export` loads the model via Hugging Face `transformers`, then runs the +eight-step Hierarchy-preserving Tags Protocol (HTP): model preparation, input +generation, module-hierarchy tracing, TorchScript ONNX export, node-tagger +creation, per-node tagging, tag injection into ONNX `metadata_props`, and +optional report generation. The hierarchy metadata allows downstream tools to +reason about operators grouped by their originating module rather than flat +graph position. When `--no-hierarchy` is specified, hierarchy steps are bypassed +and a bare ONNX file is written, useful for third-party tools that do not +understand custom metadata. + +## Examples + +```bash +# Minimal export: Hugging Face model ID to ONNX file +winml export -m microsoft/resnet-50 -o resnet50.onnx +``` + +```text +Model: microsoft/resnet-50 +Output: resnet50.onnx + +Starting HTP export... + Detected task: image-classification + +Success! Model exported to: resnet50.onnx +``` + +```bash +# Export with verbose output and full Markdown + JSON reports +winml export -m facebook/convnext-tiny-224 -o convnext.onnx -v --with-report +``` + +```bash +# Export a BERT model, overriding input shapes for longer sequences +winml export -m bert-base-uncased -o bert.onnx \ + --shape-config shape.json +# shape.json: {"sequence_length": 512} +``` + +```bash +# Export with a hand-crafted input-spec file (skips auto-detection) +winml export -m bert-base-uncased -o bert.onnx --input-specs inputs.json +``` + +```bash +# Produce clean ONNX without hierarchy metadata (for third-party optimizers) +winml export -m microsoft/resnet-50 -o resnet50_clean.onnx --no-hierarchy +``` + +## See also + +- [winml optimize](optimize.md) — the next pipeline stage after export +- [Supported Models](../reference/supported-models.md) — full list of validated architectures +- [Load and export concept](../concepts/load-and-export.md) — details on the export process + +## Common pitfalls + +- **Task detection 
 fails on unusual model IDs.** If auto-detection picks the + wrong task (or fails entirely), pass `-t` with the correct task string, for + example `-t image-feature-extraction`. +- **`--shape-config` is silently ignored when `--input-specs` is set.** + `--input-specs` takes full priority; remove it if you only want to override + individual dimensions. +- **`--dynamo` and `--torch-module` are experimental.** Both flags emit a + warning and have no effect in the current release. Do not rely on them in + automated pipelines yet. +- **Output directory must be writable.** The command creates parent directories + automatically, but will fail with a permission error on read-only paths. +- **Model weights are downloaded to the Hugging Face cache.** Set `HF_HOME` or + `HF_HUB_CACHE` to control the download location. + +## See also + +- [winml quantize](quantize.md) +- [winml compile](compile.md) +- [winml build](build.md) +- [Load and export concept](../concepts/load-and-export.md) diff --git a/docs/commands/inspect.md b/docs/commands/inspect.md new file mode 100644 index 000000000..df0f18d85 --- /dev/null +++ b/docs/commands/inspect.md @@ -0,0 +1,109 @@ +# winml inspect + +> Inspect a model's tasks, classes, and hierarchy before committing to an export. + +## When to use this + +Use `winml inspect` to understand how winml-cli will treat a HuggingFace model before +running `winml export` or `winml build`. It answers questions like "which task will be +auto-detected?", "which HF model class will be loaded?", and "does this model have a +supported exporter?" without downloading weights or writing any files. + +## Synopsis + +```bash +$ winml inspect -m <model-id> [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--model` | `-m` | string | **required** | HuggingFace model ID (e.g. `openai/clip-vit-base-patch32`). Required unless `--list-tasks` or `--help` is used. 
| +| `--format` | `-f` | `table` \| `json` | `table` | Output format. `table` renders rich panels; `json` emits a machine-readable object. | +| `--task` | `-t` | string | `null` | Override the auto-detected task (e.g. `image-classification`, `feature-extraction`). | +| `--hierarchy/--no-hierarchy` | `-H` | flag | `false` | Print the PyTorch module tree. Instantiates the model with random weights — no weight download required. | +| `--verbose` | `-v` | flag | `false` | Show full configuration details. | +| `--list-tasks` | | flag | `false` | List all known tasks and exit. Does not require `--model`. | +| `--model-type` | | string | `null` | Override model type (e.g. `bert`, `resnet`). Can be used without `--model`. | +| `--model-class` | | string | `null` | Override model class (e.g. `BertForMaskedLM`). Can be used without `--model`. | +| `--help` | `-h` | flag | — | Show help and exit. | + +> `winml inspect` does not accept `--device`, `--ep`, `--precision`, or `--output`. +> It is a read-only discovery command that does not produce any artifacts. + +## How it works + +`winml inspect` calls into the winml-cli registry to resolve the model ID against the +known loader and exporter configurations. It fetches only the model's `config.json` +from HuggingFace Hub (no weights), uses the architecture field to look up the matching +HF model class and WinML inference class, and then renders the result. When +`--hierarchy` is supplied, the model is instantiated locally with random weights using +`AutoModel.from_config()`, and a forward-pass trace records the full PyTorch module +tree. Because no real weights are downloaded, hierarchy inspection is fast even for +large models. 
+ +## Examples + +```bash +# Basic inspection — check task detection and loader/exporter classes +$ winml inspect -m microsoft/resnet-50 +``` + +```text ++--------------------------- microsoft/resnet-50 ---------------------------+ +| Task image-classification | +| Model Class ResNetForImageClassification | +| Exporter OptimumExporter | +| WinML Class WinMLImageClassificationModel | +| Status Supported | ++---------------------------------------------------------------------------+ +``` + +```bash +# JSON output — useful for scripting or CI pre-flight checks +$ winml inspect -m bert-base-uncased --format json +``` + +```bash +# Override task when auto-detection picks the wrong one +$ winml inspect -m bert-base-uncased --task feature-extraction +``` + +```bash +# Print the full PyTorch module hierarchy (no weight download) +$ winml inspect -m openai/clip-vit-base-patch32 --hierarchy +``` + +```bash +# Combine verbose logging with hierarchy for deep diagnostics +$ winml inspect -m facebook/convnext-tiny-224 -v -H +``` + +## Common pitfalls + +- **`--model` is required for model inspection.** The flag is marked required for model-specific lookups; omitting it returns an error. The only exception is `--list-tasks`, which lists all known tasks and exits without needing a model. +- **Hierarchy requires a locally installable model config.** If the model config + references a custom architecture not in the local `transformers` installation, + `--hierarchy` will fail with an import error. Update `transformers` or omit the flag. +- **Task override affects all output.** Passing `--task` changes which exporter and + WinML class are reported, not just the task field. If the override is incompatible + with the model architecture, the status will show as unsupported. +- **`--format json` is silent on unsupported models.** When the model is not found in + the winml-cli registry, the command raises a `ClickException`. Wrap the call in + `winml inspect ... 
&& ...` or check the exit code when scripting. +- **No weight download does not mean no network access.** The `config.json` is always + fetched from HuggingFace Hub. Set `HF_HUB_OFFLINE=1` if you need fully offline + inspection of a locally cached model. + +## See also + +- [catalog.md](catalog.md) — browse the curated catalog and check accuracy verdicts before + inspecting +- [Supported Models](../reference/supported-models.md) — full list of validated model architectures +- [Load and export concept](../concepts/load-and-export.md) — how `winml.hierarchy.tag` + metadata is written and what you can do with the module tree +- [How winml-cli Works](../concepts/how-it-works.md) — pipeline overview showing where + inspect fits before export +- [ONNX & Execution Providers](../concepts/eps-and-devices.md) — background on loaders, + exporters, and EP-specific configurations diff --git a/docs/commands/optimize.md b/docs/commands/optimize.md new file mode 100644 index 000000000..877f18731 --- /dev/null +++ b/docs/commands/optimize.md @@ -0,0 +1,101 @@ +# winml optimize + +> Apply graph optimizations and fusions to an ONNX model to reduce node count and improve inference speed. + +## When to use this + +Use `winml optimize` after exporting an ONNX model and before quantization or compilation. Graph fusions reduce operator count, improve memory locality, and can make downstream quantization more accurate by presenting cleaner subgraphs to the calibration pass. It is also useful as a standalone step when you want to optimize a pre-exported ONNX file without running the full build pipeline. + +## Synopsis + +```bash +$ winml optimize [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--model` | `-m` | `PATH` | *(required unless listing)* | Input ONNX model file. Not required when `--list-capabilities` or `--list-rewrites` is used. 
| +| `--output` | `-o` | `PATH` | `{input}_opt.onnx` | Output path for the optimized model. Defaults to the input filename with `_opt` inserted before the extension. | +| `--config` | `-c` | `PATH` | *(none)* | YAML or JSON configuration file. Fields in the file override capability defaults; CLI flags override the file. | +| `--verbose` | `-v` | flag | off | Enable verbose output. | +| `--list-capabilities` | `-l` | flag | off | Print all registered optimization capabilities grouped by category and exit. Add `--verbose` for descriptions and ORT names. | +| `--list-rewrites` | | flag | off | Print all available pattern-rewrite families with their source-to-target mappings and exit. | +| *(dynamic)* | | flag | *(per capability)* | Each registered capability generates a `--enable-<capability>` / `--disable-<capability>` pair. Run `--list-capabilities` to see the full current list. Examples: `--enable-gelu-fusion`, `--disable-constant-folding`. Pattern-rewrite flags follow the form `--enable-<source>-<target>`; run `--list-rewrites` to discover all names. | + +### Configuration precedence + +When multiple sources are provided, settings are resolved in this order (highest wins): + +1. Explicit CLI flags (`--enable-X` / `--disable-X`) +2. Config file (`-c`) +3. Capability defaults + +## How it works + +`winml optimize` loads the ONNX model, builds a final capability configuration by merging capability defaults, an optional config file, and any explicit CLI flags, then runs all enabled passes through the `Optimizer`. Each capability maps to a named optimization or fusion pipe in the `winml.modelkit.optim` registry. The capability flags are auto-generated at startup from that registry — adding a new optimization to the registry automatically makes it available as a CLI flag without any change to this command's source. After optimization, the command prints the before-and-after node count and percentage reduction so you can quantify the effect. 
+ +## Examples + +Optimize a model with all capability defaults: + +```bash +$ winml optimize -m microsoft/resnet-50.onnx +``` + +```text +Input: microsoft/resnet-50.onnx +Output: microsoft/resnet-50_opt.onnx + +Loading model... +Running optimizer... +Saving optimized model... + +Success! Model optimized: microsoft/resnet-50_opt.onnx +Nodes: 312 -> 289 (7.4% reduction) +``` + +Enable specific fusions for a BERT model: + +```bash +$ winml optimize -m bert-base-uncased.onnx \ + --enable-layer-norm-fusion \ + --enable-attention-fusion \ + -o bert_layernorm_attn.onnx +``` + +Use a config file to set capabilities and save the result for downstream compilation: + +```bash +$ winml optimize -m facebook/convnext-tiny-224.onnx \ + -c optimize_config.yaml \ + -o convnext_opt.onnx +``` + +List all available optimization capabilities: + +```bash +$ winml optimize --list-capabilities +``` + +Discover pattern-rewrite families and their flag names: + +```bash +$ winml optimize --list-rewrites +``` + +## Common pitfalls + +- **`--model` is required for actual optimization** — it can be omitted only when using `--list-capabilities` or `--list-rewrites`. Missing `--model` in any other case raises a usage error. +- **Config file and CLI flags interact via precedence** — a `--disable-X` CLI flag always wins over a config file value that enables the same capability, but omitting the flag leaves the config file value in effect. To turn off a capability set by a config file, pass the explicit `--disable-X` flag. +- **Config file validation errors abort the run** — if the config file contains keys that fail capability validation or dependency checks, the command prints all errors and exits with code 1 without touching the model. Fix the config before retrying. +- **The dynamic flag list changes between releases** — new capabilities are added as the optimizer registry grows. Always use `--list-capabilities` to confirm the current set of flags rather than relying on a cached list. 
+- **Output path default may overwrite a sibling file** — if you run optimize twice on the same input without specifying `-o`, the second run silently overwrites `{input}_opt.onnx`. Specify an explicit output path in scripts. + +## See also + +- [how-it-works.md](../concepts/how-it-works.md) — where optimization fits in the full winml-cli pipeline +- [export.md](export.md) — produce an ONNX file to optimize from a HuggingFace model +- [quantize.md](quantize.md) — quantize the optimized model for lower-precision inference +- [config.md](config.md) — generate a `WinMLBuildConfig` that includes optimization settings diff --git a/docs/commands/overview.md b/docs/commands/overview.md new file mode 100644 index 000000000..473a5bb73 --- /dev/null +++ b/docs/commands/overview.md @@ -0,0 +1,71 @@ +# Commands + +winml-cli exposes a CLI named `winml` with 12 subcommands covering the full +journey from model discovery to a deployment-ready artifact. Every subcommand +shares a consistent invocation style — `winml <command> [flags]` — and the +same global flags are available on the root `winml` group. + +The commands group by user intent. **Discover** (`sys`, `inspect`, `catalog`, +`analyze`) helps you understand your hardware and model before writing any +artifacts. **Configure** (`config`, `optimize`) produces a reusable build +configuration and tunes the ONNX graph. **Build** (`export`, `quantize`, +`compile`, `build`) runs the pipeline stages that produce deployment artifacts. +**Measure** (`perf`, `eval`) benchmarks and validates the result. + +The typical workflow follows that order: run `winml sys` to confirm hardware +and EPs, then `winml inspect` or `winml catalog` to verify model support. Use +`winml config` to generate a build configuration, then `winml build` to execute +the full pipeline — or chain `export` → `analyze` → `optimize` → `quantize` → `compile` +individually for finer control. Close with `winml perf` and `winml eval` to +measure speed and accuracy. 
+ +## Command map + +| Command | Group | Purpose | +|---|---|---| +| [`sys`](sys.md) | Discover | Inspect your machine — devices, EPs, and runtime versions at a glance. | +| [`inspect`](inspect.md) | Discover | Inspect a model's tasks, classes, and hierarchy before committing to an export. | +| [`catalog`](catalog.md) | Discover | Browse the curated winml-cli catalog of validated models and benchmarks. | +| [`config`](config.md) | Configure | Generate a reusable build configuration for a Hugging Face model or ONNX file. | +| [`export`](export.md) | Build | Convert a PyTorch / Hugging Face model to ONNX, preserving module hierarchy. | +| [`analyze`](analyze.md) | Discover | Verify an ONNX model is compatible with a target execution provider before deployment. | +| [`optimize`](optimize.md) | Configure | Apply graph optimizations and fusions to an ONNX model to reduce node count and improve inference speed. | +| [`quantize`](quantize.md) | Build | Quantize an ONNX model with QDQ insertion and calibration-based scaling. | +| [`compile`](compile.md) | Build | Compile an ONNX model to an EP-specific format for fast runtime loading. | +| [`build`](build.md) | Build | Run the entire winml-cli pipeline (export → optimize → quantize → compile) in one command. | +| [`perf`](perf.md) | Measure | Benchmark an ONNX model's latency and throughput on a target device. | +| [`eval`](eval.md) | Measure | Evaluate ONNX model accuracy on a standard dataset. 
| + +## Choosing a command + +- **I want to see what hardware and EPs I have** → `winml sys` +- **I want to know if my model is supported** → `winml inspect` +- **I want to browse validated models with known benchmarks** → `winml catalog` +- **I want to verify EP operator compatibility before compiling** → `winml analyze` +- **I want to convert a Hugging Face model to ONNX** → `winml export` +- **I want to run the whole pipeline in one go** → `winml build` +- **I want to benchmark latency and throughput** → `winml perf` +- **I want to measure model accuracy** → `winml eval` + +## Global flags + +`-v` / `--verbose`, `-q` / `--quiet`, `--version`, and `-h` / +`--help` live on the root `winml` group only. Subcommands access them through +`ctx.obj` and do not redefine them. See +`src/winml/modelkit/cli.py` for the canonical contract. + +## Shared flags + +Several flags share semantics across the commands that accept them: +`-m` / `--model`, `-d` / `--device`, `--ep`, `-o` / `--output`, +`-t` / `--task`, and `--precision`. Defaults and accepted values can +differ per command (e.g., `-p` is a short form for `--precision` only on +`config` and `quantize`); check the **Flags** section of each command page +rather than assuming they transfer. + +## See also + +- [How winml-cli Works](../concepts/how-it-works.md) — end-to-end pipeline overview +- [Config and build](../concepts/config-and-build.md) — structure of `WinMLBuildConfig` and how stages interact +- [ONNX & Execution Providers](../concepts/eps-and-devices.md) — background on EPs and how `--device` / `--ep` interact +- [winml build](build.md) — the single command that runs the entire pipeline diff --git a/docs/commands/perf.md b/docs/commands/perf.md new file mode 100644 index 000000000..962488996 --- /dev/null +++ b/docs/commands/perf.md @@ -0,0 +1,99 @@ +# winml perf + +> Benchmark an ONNX model's latency and throughput on a target device. 
+ +## When to use this + +Use `winml perf` when you want a quantitative latency and throughput baseline for a model on a specific device, or when you need to compare the performance impact of different precision settings, execution providers, or batch sizes. + +## Synopsis + +```bash +$ winml perf [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|---|---|---|---|---| +| `--model` | `-m` | `TEXT` | — | HuggingFace model ID or path to a local `.onnx` file. Required. | +| `--task` | | `TEXT` | auto-detected | Explicit task override (e.g., `image-classification`). Inferred from the model if omitted. | +| `--iterations` | | `INTEGER` | `100` | Number of timed inference iterations used to compute statistics. | +| `--warmup` | | `INTEGER` | `10` | Number of warm-up iterations run before timing begins; excluded from statistics. | +| `--device` | `-d` | `auto\|cpu\|gpu\|npu` | `auto` | Device to run the benchmark on. `auto` selects the highest-priority available device. | +| `--precision` | | `TEXT` | `auto` | Precision mode applied during model build: `auto`, `fp32`, `fp16`, `int8`, `int16`, or compound forms such as `w8a16`. | +| `--ep` | | `TEXT` | — | Force a specific execution provider (e.g., `qnn`, `dml`, `vitisai`, `openvino`, `cpu`). Overrides the device-to-provider mapping. | +| `--output` | `-o` | `PATH` | `~/.cache/winml/perf/<model>/<timestamp>.json` | Output JSON file path for the benchmark report. | +| `--batch-size` | | `INTEGER` | `1` | Batch size used when generating synthetic input tensors. | +| `--shape-config` | | `PATH` | — | Path to a JSON file containing shape overrides (e.g., `{"height": 480, "width": 480}`). Ignored for pre-exported ONNX files and in `--module` mode. | +| `--quantize/--no-quantize` | | flag | `true` | Run quantization during model build (use `--no-quantize` to skip it). Useful for measuring the fp32 baseline. | +| `--rebuild/--no-rebuild` | | flag | `false` | Force model rebuild even if a cached artifact already exists. 
| +| `--ignore-cache/--no-ignore-cache` | | flag | `false` | Build from scratch in a temporary folder and discard the artifact after benchmarking. Implies `--rebuild`. | +| `--module` | | `TEXT` | — | PyTorch module class name for per-module benchmarking (e.g., `BertAttention`). Builds and times each matching instance separately. See [Load and export](../concepts/load-and-export.md). | +| `--monitor/--no-monitor` | | flag | `false` | Show a live NPU/CPU utilization chart while the benchmark runs and include hardware metrics in the JSON report. | + +## How it works + +`winml perf` loads the model through `WinMLAutoModel` — accepting both HuggingFace IDs and local ONNX files — then generates random input tensors from the model's I/O configuration. It runs the specified number of warm-up iterations (excluded from statistics) followed by the timed iterations, collecting per-sample latency. The final report includes mean, min, max, P50, P90, P95, P99, standard deviation, and throughput in samples per second. When `--monitor` is active, a hardware polling loop runs in parallel and records NPU / GPU utilization, CPU usage, and device memory alongside the timing data. 
+ +## Examples + +Basic benchmark on the best available device: + +```bash +$ winml perf -m microsoft/resnet-50 +``` + +```text +Device: npu +Precision: auto +Task: image-classification +Iterations: 100 (+ 10 warmup) +Batch Size: 1 + +Latency (ms) + Avg P50 P90 P95 P99 Min Max Std + 2.14 2.11 2.38 2.51 2.79 1.97 3.04 0.12 + +Throughput: 467.29 samples/sec + +Results saved to: ~/.cache/winml/perf/microsoft_resnet-50/2026-05-27T120000.json +``` + +Benchmark a pre-exported ONNX file on CPU with more iterations: + +```bash +$ winml perf -m model.onnx --device cpu --iterations 500 +``` + +Benchmark a text model with an explicit task, targeting the NPU: + +```bash +$ winml perf -m bert-base-uncased --task text-classification --device npu --precision w8a16 +``` + +Benchmark with live hardware monitoring enabled: + +```bash +$ winml perf -m microsoft/resnet-50 --device npu --monitor +``` + +Per-module benchmarking to find latency hot-spots across all attention blocks: + +```bash +$ winml perf -m bert-base-uncased --module BertAttention --iterations 200 +``` + +## Common pitfalls + +- **Warm-up too low on NPU.** The first several inferences on an NPU EP can be significantly slower due to kernel compilation and caching. The default of 10 warm-up iterations is usually enough for vision models, but transformer models with many operators may need `--warmup 30` or higher to reach steady-state latency. +- **`--shape-config` is ignored in two cases.** It has no effect on pre-exported ONNX files (shapes are baked into the graph) and is ignored in `--module` mode. The command prints a warning in both situations. +- **Random inputs do not represent real data distributions.** Latency numbers are accurate, but memory access patterns may differ from production because the generated tensors are uniform random values. For memory-bandwidth-sensitive models this can understate real-world latency. 
+- **Cross-device comparison.** To compare performance across devices, run `winml perf` separately with different `--device` values and compare the resulting JSON reports. + +## See also + +- [winml eval](eval.md) — measure accuracy after benchmarking +- [winml build](build.md) — build the quantized artifact that `perf` benchmarks +- [Load and export concept](../concepts/load-and-export.md) — how `--module` per-instance benchmarking works +- [ONNX & Execution Providers](../concepts/eps-and-devices.md) — understand `--device` vs `--ep` diff --git a/docs/commands/quantize.md b/docs/commands/quantize.md new file mode 100644 index 000000000..51128a046 --- /dev/null +++ b/docs/commands/quantize.md @@ -0,0 +1,119 @@ +# winml quantize + +> Quantize an ONNX model with QDQ insertion and calibration-based scaling. + +## When to use this + +Use `winml quantize` after `winml export` to insert +QuantizeLinear/DequantizeLinear (QDQ) node pairs into an ONNX graph. The +resulting model is ready for `winml compile` targeting an NPU or other +quantization-aware execution provider. + +## Synopsis + +```bash +$ winml quantize [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|---|---|---|---|---| +| `--model` | `-m` | path | *(required)* | Input ONNX model file. | +| `--output` | `-o` | path | `{input}_qdq.onnx` | Output path for the quantized model. | +| `--task` | | string | — | Task name (e.g., `image-classification`, `text-classification`) used to select a task-appropriate calibration dataset. Pair with `--model-name` so the dataset is preprocessed exactly the way the model expects. Without `--task`, calibration falls back to synthetic random data. | +| `--model-name` | | string | — | HuggingFace model ID (e.g., `microsoft/resnet-50`) used to load the matching preprocessor/tokenizer for calibration. Only used when `--task` is provided. | +| `--precision` | `-p` | string | `None` | Precision shorthand: `int8`, `int16`, or mixed-precision like `w8a16`. 
Overridden by explicit `--weight-type` / `--activation-type`. | +| `--samples` | | integer | `10` | Number of calibration samples used to compute quantization ranges. | +| `--method` | | choice | `minmax` | Calibration algorithm: `minmax`, `entropy`, or `percentile`. | +| `--weight-type` | | choice | — | Per-tensor type for weights: `uint8`, `int8`, `uint16`, or `int16`. Overrides `--precision`. When unset, defaults to `uint8` (or the type implied by `--precision`). | +| `--activation-type` | | choice | — | Per-tensor type for activations: `uint8`, `int8`, `uint16`, or `int16`. Overrides `--precision`. When unset, defaults to `uint8` (or the type implied by `--precision`). | +| `--per-channel/--no-per-channel` | | flag | `false` | Apply per-channel (rather than per-tensor) quantization to weight tensors. | +| `--symmetric/--no-symmetric` | | flag | `false` | Use symmetric quantization (zero-point fixed at 0). | +| `--help` | `-h` | flag | | Show this message and exit. | + +## How it works + +`winml quantize` applies static post-training quantization (PTQ) using the +ONNX Runtime quantization API. Calibration passes collect activation range +statistics, which are used to compute scale and zero-point values baked into +`QuantizeLinear` / `DequantizeLinear` node pairs around each eligible operator. +The `--method` flag controls range estimation: `minmax` uses global observed +extremes, `entropy` minimizes KL-divergence, and `percentile` clips outliers. +Precision can be set at a coarse level with `--precision` or tuned per tensor +type with `--weight-type` and `--activation-type`; explicit type flags always +override `--precision`. + +Calibration data is selected from `--task` and `--model-name`. For a supported +task, a built-in default calibration dataset is loaded and preprocessed through +the model's own tokenizer or image processor, so the calibration tensors match +what the model will see at inference time. 
For an unsupported task — or when +`--task` is omitted entirely — calibration falls back to synthetic random data +synthesized from the ONNX input specification. Random-data calibration is fast +and always works, but the resulting scales are typically less accurate than +dataset-driven calibration, so always provide `--task` and `--model-name` when +the model task is supported. + +## Examples + +```bash +# Minimal quantization: defaults (10 samples, uint8 weights and activations) +winml quantize -m resnet50.onnx +``` + +```text +Input: resnet50.onnx +Output: resnet50_qdq.onnx +Weight type: uint8 +Activation type: uint8 +Samples: 10 +Method: minmax + +Running quantization... + +Success! Model quantized +Output: resnet50_qdq.onnx +QDQ nodes inserted: 53 +Total time: 4.31s +``` + +```bash +# Task-aware calibration: real samples preprocessed through the model's own image processor +winml quantize -m resnet50.onnx --task image-classification --model-name microsoft/resnet-50 --samples 128 +``` + +```bash +# int8 precision shorthand (equivalent to --weight-type int8 --activation-type int8) +winml quantize -m resnet50.onnx -p int8 +``` + +```bash +# Mixed-precision: int8 weights, uint16 activations with entropy calibration +winml quantize -m bert-base-uncased.onnx --weight-type int8 --activation-type uint16 --method entropy --samples 64 +``` + +```bash +# Per-channel symmetric quantization to a specific output path +winml quantize -m facebook_convnext.onnx -o facebook_convnext_qdq.onnx --per-channel --symmetric --samples 32 +``` + +```bash +# int16 precision (suitable for models sensitive to int8 accuracy loss) +winml quantize -m bert-base-uncased.onnx --precision int16 +``` + +## Common pitfalls + +- **Calibration uses synthetic random data by default.** Without `--task` and `--model-name`, scales and zero-points are computed from random tensors synthesized from the ONNX input specification — the model never sees realistic activations, so accuracy after quantization can 
degrade noticeably. Always pass `--task` and `--model-name` for supported tasks (e.g., `--task image-classification --model-name microsoft/resnet-50`) so calibration runs on real samples preprocessed through the model's own tokenizer or image processor. +- **`--weight-type` / `--activation-type` silently override `--precision`.** If you pass both, the explicit type flags win. Omit `--precision` when setting types explicitly to avoid confusion. +- **Low sample counts can hurt accuracy.** The default of 10 samples is sufficient for quick testing, but production models typically need 64–256 representative samples for good calibration. +- **`--per-channel` increases model size.** Per-channel quantization stores a separate scale and zero-point per output channel; this can noticeably inflate the model file size compared to per-tensor mode. +- **Output defaults to `{stem}_qdq.onnx` in the same directory as input.** Always pass `-o` when writing to a specific location to avoid accidentally overwriting or cluttering the source directory. +- **Quantizing an already-quantized model (one containing QDQ nodes) is unsupported and will produce incorrect results.** Use `winml compile --no-quant` instead if the model already contains QDQ nodes. + +## See also + +- [winml export](export.md) +- [winml compile](compile.md) +- [winml build](build.md) +- [Quantization concepts](../concepts/quantization.md) diff --git a/docs/commands/sys.md b/docs/commands/sys.md new file mode 100644 index 000000000..1ec4dacc7 --- /dev/null +++ b/docs/commands/sys.md @@ -0,0 +1,115 @@ +# winml sys + +> Inspect your machine — devices, EPs, and runtime versions at a glance. + +## When to use this + +Run `winml sys` before starting any export or build workflow to confirm that the +required ML libraries are installed and that the target hardware is visible. It is +also the first command to run when diagnosing an unexpected export failure. 
+ +## Synopsis + +```bash +$ winml sys [options] +``` + +## Flags + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--format` | `-f` | `text` \| `json` \| `compact` | `text` | Output format. `text` renders rich tables, `json` emits machine-readable JSON, `compact` prints a single-line summary. | +| `--list-device` | — | flag | `false` | List available compute devices (NPU, GPU, CPU) in priority order instead of showing the full system report. | +| `--list-ep` | — | flag | `false` | List available ONNX Runtime execution providers instead of showing the full system report. Can be combined with `--list-device`. | +| `--verbose` | `-v` | flag | `false` | Surface additional diagnostic sections: backend availability and Export Readiness. | +| `--help` | `-h` | flag | — | Show help and exit. | + +> `winml sys` takes no `--model`, `--device`, `--ep`, `--task`, or `--precision` +> arguments. It describes the host environment, not a specific model. + +## How it works + +`winml sys` queries Python's `platform` and `importlib.metadata` modules to report +library versions, then probes PyTorch for CUDA availability and GPU device names. +Backend availability checks use the installed runtime environment, while device +enumeration queries hardware directly in NPU > GPU > CPU priority order, and EP +enumeration merges the WinML EP registry with ONNX Runtime's +`get_available_providers()`. When +`--format json` is used the full report — including devices and EPs — is emitted as +a single JSON object, making it easy to capture in CI pipelines. 
+ +## Examples + +```bash +# Full human-readable system report +$ winml sys +``` + +```text ++------------------------------------+ +| winml-cli System Information | ++------------------------------------+ + +Environment + Python Version 3.11.9 + Python Executable C:\...\python.exe + OS Windows 11 + Machine AMD64 + +ML Libraries + Library Version Status + torch 2.4.0 OK + transformers 4.44.0 OK + onnx 1.16.1 OK + ... + +Available Devices (priority order) + #1 NPU Qualcomm(R) Hexagon NPU + #2 GPU Qualcomm(R) Adreno GPU + #3 CPU Snapdragon(R) X Elite + +Available Execution Providers + QNNExecutionProvider -> NPU/GPU + DmlExecutionProvider -> GPU + CPUExecutionProvider -> CPU +``` + +```bash +# Compact one-liner — useful for CI logs +$ winml sys --format compact +``` + +```bash +# Machine-readable JSON — pipe to jq or save for later comparison +$ winml sys --format json > env.json +``` + +```bash +# Only list devices — skip everything else +$ winml sys --list-device +``` + +```bash +# List EPs as JSON — useful for scripting EP selection +$ winml sys --list-ep --format json +``` + +## Common pitfalls + +- **`--list-device` and `--list-ep` suppress the full report.** When either flag is + present, only the requested section is printed. Omit both flags to see the + complete system report. +- **`--format compact` omits device and EP tables.** The compact format is designed + for single-line log entries and does not include device or EP details. Use `text` + or `json` when you need the full picture. +- **CUDA shown as unavailable on a machine with a GPU.** PyTorch must be installed + with CUDA support (`torch+cuXXX`). A CPU-only torch wheel will always report + `cuda_available: false`. 
+ +## See also + +- [ONNX & Execution Providers](../concepts/eps-and-devices.md) — background on EPs and + how `--device` / `--ep` flags interact +- [inspect.md](inspect.md) — inspect a specific HuggingFace model's compatibility +- [catalog.md](catalog.md) — browse the curated catalog of validated models +- [How winml-cli Works](../concepts/how-it-works.md) — end-to-end pipeline overview diff --git a/docs/concepts/analyze-and-optimize.md b/docs/concepts/analyze-and-optimize.md new file mode 100644 index 000000000..b72b2b9d9 --- /dev/null +++ b/docs/concepts/analyze-and-optimize.md @@ -0,0 +1,174 @@ +# Analyze and optimize + +Not every ONNX graph runs efficiently on every execution provider. An operator that compiles cleanly on CPU may be unsupported on an NPU, and a correct graph may still leave performance on the table because adjacent operations were not fused. winml-cli separates the concern into two commands — `winml analyze` and `winml optimize` — that together form a graph-quality loop driven automatically by `winml build`. + +## What analyze does + +`winml analyze` performs static analysis on an ONNX graph to answer one question: **will this model run end-to-end on my target execution provider, and if not, what needs to change?** + +Unlike profiling, static analysis does not require executing the full model on the target device. It inspects each operator (and recognized subgraph pattern) against a rule database of known EP capabilities, classifies every node, and emits actionable recommendations. The same analyzer also drives the autoconf feedback loop inside `winml build`, so understanding how it works is useful even when you never invoke `winml analyze` directly. + +Specify a target EP with `--ep` (e.g., `--ep qnn` or `--ep openvino`) and a device with `--device` (CPU, GPU, or NPU). The default `--ep auto` infers from locally available EPs; pass `--ep all` to evaluate every rule-data-backed EP regardless of local availability. 
Results print to the console by default; add `--output results.json` to save the report as JSON for scripting or archiving. + +### How operators are classified + +For each operator (and matched subgraph pattern) the analyzer follows a two-step process: + +1. **Rule-database lookup** — does the target EP claim to support this pattern? +2. **Local probe (fallback)** — if the pattern is absent from the rule database and `--run-unknown-op` is enabled, the analyzer builds a minimal ONNX graph for the op and runs it on the target EP locally to determine support (see [Local op execution](#local-op-execution) below). + +The combined answer is recorded as a `SupportLevel`: + +| Level | Compile on target EP | Runs (possibly via CPU fallback) | CLI label | Exit code contribution | +| ------------- | -------------------- | -------------------------------- | ------------------ | ---------------------- | +| `SUPPORTED` | yes | yes | `Fully Supported` | 0 | +| `PARTIAL` | no | yes | `Partial Support` | 1 (warning) | +| `UNSUPPORTED` | no | no | `Not Supported` | 1 (error) | +| `UNKNOWN` | n/a | n/a | `Unknown Support` | 1 | + +A `PARTIAL` classification means the operator cannot be dispatched to the requested EP but the ONNX Runtime can still execute the model by falling back to CPU. This is technically a working model, but the latency and power-efficiency goals of NPU deployment are not met. `UNSUPPORTED` means even the CPU fallback path fails, so the model will not run at all. `UNKNOWN` appears only when the analyzer lacks both rule-database data and the ability to test locally. + +### Two key outputs: lint and autoconf + +Every analysis produces a **lint** result; the default (full) mode additionally produces an **autoconf** result. Understanding these two outputs separately is the easiest way to understand what `winml analyze` is for and how to consume it. + +**Lint** is the analyzer's verdict on the model as it stands today. 
It classifies every operator and recognized pattern against the target EP and rolls the classifications up into: + +- `errors` — count of `UNSUPPORTED` patterns. **The model will not run.** +- `warnings` — count of `PARTIAL` patterns. The model runs, but these nodes fall back to CPU. +- `passed` — `True` iff `errors == 0 and warnings == 0`. + +Lint always runs. It is deterministic and sufficient for a yes/no CI gate — the CLI's exit code is derived from it. + +**Autoconf** is the analyzer's _suggestion_ for how to fix the current model. It lists the fusion flags which, if enabled in the optimize stage, would convert one or more `PARTIAL`/`UNSUPPORTED` patterns into `SUPPORTED` ones. + +Autoconf is what powers the build pipeline's [re-optimization loop](#the-analyzeroptimizer-loop): when the analyzer says "`gelu_fusion` would resolve these warnings", the build re-runs optimize with that flag and re-analyzes — until no further suggestions remain or the iteration limit is hit. Autoconf is _advisory_; nothing else in the system flips fusion flags automatically. + +### Analysis modes + +`winml analyze` can run in two modes which differ only in whether autoconf is computed: + +| Mode | How to enable | Output | When to use | +| ------------------ | ---------------------------------------------------------- | ---------------------------------------------------------------------- | ----------------------------------------------- | +| **Lint-only** | `--no-information` (CLI) or `autoconf=False` (Python) | Lint only. `optimization_config` is `None`. | CI gate; pass/fail only | +| **Full** (default) | `--information` (CLI, default) or `autoconf=True` (Python) | Lint **plus** autoconf and recommendations | Local debugging; build pipeline's autoconf loop | + +The only difference between the two modes is whether autoconf and the human-readable recommendations are computed. Skipping them gives a faster, leaner run. The lint result is identical either way. 
+ +### Three classes of finding + +Every analysis emits findings in three buckets. Each bucket maps to a different remediation pattern. + +**Errors (`UNSUPPORTED` patterns)** block deployment. Either the operator does not exist on the target EP at all, or it does not handle the specific input shape/dtype the model uses. Typical remediations: + +- Rewrite the model to use an equivalent pattern the EP does support. +- Lower the opset version of the offending op if the EP supports an older opset. +- Insert pre/post-processing to massage shapes into a supported configuration. + +Each error pattern includes a recommendation that identifies the current pattern and the target pattern the EP does support, so the optimizer (or a manual rewrite) can apply the fix. + +**Warnings (`PARTIAL` patterns)** mean the model will run, but the target EP cannot dispatch this pattern. Inference falls back to the CPU EP, breaking the deployment goal (e.g., NPU offload) without breaking correctness. Warnings are usually fusion opportunities — the analyzer recognized a sub-pattern that, if fused, would become a single EP-native op. The fix is to enable the relevant fusion flag in the optimize stage — this is exactly what the autoconf loop does automatically. + +**Info (`Information` items)** are lower-priority insights: a hint that an alternative pattern exists, a QDQ-equivalent that could be used after quantization, or a description of why a node was classified as it was. Info entries never affect exit code. + +### Local op execution + +The static rule database does not cover every operator and every shape/dtype combination. When `--run-unknown-op` is enabled and the analyzer encounters a pattern not present in the database, it builds a tiny ONNX graph containing just that op (with the model's actual input metadata) and runs it on the target EP locally. The compile/run result becomes the classification. Without `--run-unknown-op` (the default), such patterns are classified as `UNKNOWN`. 
+ +Leave `--run-unknown-op` disabled when: + +- The local machine does not have the target EP available (e.g., analyzing a QNN model from a non-Snapdragon machine). +- You want bit-for-bit reproducible analysis across machines. Local execution can produce different results depending on driver versions. + +### Save-node: debugging unsupported subgraphs + +When a pattern is unsupported and the recommendation does not immediately tell you what is wrong, use `--save-node` to dump the offending subgraph to disk as a self-contained, runnable `.onnx` file. You can then open it in [Netron](https://netron.app/), re-analyze it in isolation, or attach it to a bug report as a minimal reproducer. See the [analyze command reference](../commands/analyze.md) for usage examples. + +### HTP metadata enhancement + +When a model is exported with hierarchy-preserving tags (HTP), the export produces a sidecar `_htp_metadata.json` that maps each ONNX node back to its source module (e.g., `encoder.layer.0.attention.self.GELUActivation`). Passing this file via `--htp-metadata` lets the `PatternExtractor` use the module hierarchy to match subgraph patterns more accurately than operator-level heuristics alone. + +HTP metadata is consumed at the pattern extraction stage — before any EP-specific runtime checking — so the enriched patterns benefit all target EPs equally (QNN, OpenVINO, VitisAI, etc.). Without HTP metadata, the analyzer falls back to attribute-based tag matching and then the general-purpose `PatternMatcher`; with it, the analyzer can correctly identify fused patterns (GELU, LayerNorm, Attention) that are difficult to detect from the raw operator graph. See the [analyze command reference](../commands/analyze.md) for usage examples. + +### What runs internally + +The analyzer is composed of five stages that run in order. 
You normally do not need to think about them, but they are worth knowing when reading recommendations or extending the analyzer: + +| Stage | Job | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ONNXLoader` | Load the ONNX file (or `ModelProto`), record metadata. | +| `PatternExtractor` | Walk the graph, match operator and subgraph patterns from the rule catalog. Optionally consume HTP metadata. | +| `RuntimeChecker` | For each pattern, consult the rule database; if no rule applies, run the op locally (when allowed). | +| `InformationEngine` | Turn classifications into human-readable `Information` items; also runs model validators (constant folding, dynamic input, pattern matching, QDQ validation, shape inference). | +| `OutputAggregator` | Assemble the final `AnalysisOutput` (the JSON you get from `--output`). | + +The model validators run regardless of whether there are runtime check results — they are model-level sanity checks (e.g., is shape inference complete? are QDQ pairs well-formed?) and can surface issues even when every operator looks fine in isolation. + +## What optimize does + +`winml optimize` rewrites the ONNX graph by applying fusions and structural simplifications. 
Internally the optimizer runs four pipes in sequence: + +| Pipe | What it does | +| ----------------- | ---------------------------------------------------------------------------- | +| **ORTGraphPipe** | ORT C++ graph optimizer (level 2): fusions, eliminations, layout transforms | +| **RewritePipe** | JSON-driven pattern matcher that replaces subgraph patterns with equivalent alternatives | +| **ORTFusionPipe** | ORT Python transformer optimizer: attention, LayerNorm, and RMSNorm fusions | +| **SurgeryPipe** | Post-optimization model surgery (constant clamping, NaN guard removal) | + +Every optimization is a named **capability** toggled via `--enable-` and `--disable-` flags. Run `--list-capabilities` to see all registered optimizations and their defaults. The optimizer currently ships 57 static capabilities across 13 categories: + +| Category | Capabilities | Examples | +| ------------ | :----------: | ----------------------------------------------- | +| GELU | 5 | gelu-fusion, fast-gelu-fusion, quick-gelu-fusion | +| LayerNorm | 6 | layer-norm-fusion, skip-layer-norm-fusion, fuse-rmsnorm | +| MatMul | 6 | matmul-add-fusion, matmul-activation-fusion | +| Conv | 4 | conv-bn-fusion, conv-activation-fusion | +| Layout | 4 | nhwc-transformer, transpose-optimizer | +| GEMM | 3 | gemm-activation-fusion, gemm-transpose-fusion | +| Elimination | 3 | slice-elimination, expand-elimination | +| Graph | 3 | constant-folding, double-qdq-pairs-remover | +| Activation | 2 | bias-softmax-fusion, bias-dropout-fusion | +| Attention | 1 | attention-fusion | +| Misc | 4 | pad-fusion, gather-to-slice-fusion | +| Rewrite | 14 | attention-expandedattention, matmuladd-conv2d4d, layernormalization-singlelayernorm | +| Surgery | 2 | clamp-constant-values, remove-isnan-in-attention-mask | + +This granularity matters when a specific fusion breaks a downstream step or when you need an exact optimization profile for a given EP. 
Some capabilities declare dependencies (e.g., `bias-gelu-fusion` requires `gelu-fusion`); the optimizer resolves these automatically when you enable a flag. + +**Pattern rewrites** are a complementary mechanism: instead of folding nodes, rewrites replace one subgraph pattern with a structurally equivalent alternative. Rules are defined in JSON files (`default.json` for general rewrites, `qnn.json` for QNN-specific rewrites). The optimizer currently ships 5 rewrite groups containing 12 individual rules — for example, four GELU source variants can each be rewritten to a single `Gelu` op, and a MatMul+Add pattern can be rewritten to a GEMM or to a Conv2D for Qualcomm NPU targets. Run `--list-rewrites` to discover available families and their flag names. Flags follow the form `--enable-<source>-<target>` (for example, `--enable-matmuladd-conv2d4d`). + +Commit a specific combination of flags to a `--config` file for reproducible builds. + +## The analyzer/optimizer loop + +A single optimize pass may create fusion opportunities that were not present before, and a freshly fused graph may surface new operator compatibility issues. This is why `winml build` runs analyze and optimize in an alternating loop rather than once each. + +The flow inside `winml build` (implemented in `run_optimize_analyze_loop`) is: + +![Optimize-analyze loop](../assets/optimize-analyze-loop.svg) + +The initial optimize pass applies the flags from `config.optim`. The analyzer then inspects the result; if autoconf discovers fusion flags that were not yet enabled, the optimizer re-runs with those flags and the analyzer re-checks. This repeats up to `--max-optim-iterations` rounds (default: three). The loop exits early when autoconf suggests no further changes. After the loop, a final analysis validates the result — if unsupported patterns still exist, the build raises a `RuntimeError`. + +Use `--no-analyze` to skip the loop and run a single optimization pass — useful for deterministic rebuilds from a fixed ONNX checkpoint where the graph is already known good. 
+ +## When to use which entry point + +| You want to... | Use | +| --------------------------------------------- | ------------------------------------------------- | +| Gate a CI pipeline on EP compatibility | `winml analyze` (CLI) — exit code is the contract | +| Embed analysis in a build script or notebook | `analyze_onnx(model, ep=...)` (flat Python API) | +| Post-process the full result programmatically | `ONNXStaticAnalyzer().analyze(...)` (class API) | +| Analyze an in-memory `ModelProto` | `ONNXStaticAnalyzer().analyze_from_proto(...)` | +| Optimize with full control over fusions | `winml optimize` (CLI) with `--enable-` / `--disable-` flags | +| Reproducible build from a config file | `winml build -c config.json` (pipeline wrapper) | + +The CLI and the flat Python API are sufficient for the vast majority of cases. The class-based API is only needed when you want to call `is_fully_supported(ep)`, `get_unsupported_operators(ep)`, or `get_optimization_opportunities(ep)` on the full result. + +## See also + +- [Compile and EPContext](compile-and-epcontext.md) +- [Primitives and pipeline](primitives-and-pipeline.md) +- [How winml-cli works](how-it-works.md) — where the analyzer sits in the build pipeline +- [EPs and devices](eps-and-devices.md) — background on EPs and operator support +- [analyze command](../commands/analyze.md) +- [optimize command](../commands/optimize.md) diff --git a/docs/concepts/compile-and-epcontext.md b/docs/concepts/compile-and-epcontext.md new file mode 100644 index 000000000..f5dda50b5 --- /dev/null +++ b/docs/concepts/compile-and-epcontext.md @@ -0,0 +1,42 @@ +# Compile and EPContext + +When you run `winml compile`, you are not simply copying an ONNX file to a new location. You are asking an execution provider (EP) to transform the model into a form it can load and run directly, without repeating that transformation at every startup. 
Understanding what the compiler produces — and why — helps you decide when to compile, what output format to choose, and how to balance file size against runtime performance. + +Compilation is an offline, one-time step. The artifact it creates is what you ship with your application and what `winml-cli` uses for benchmarking and evaluation. + +## What compilation produces + +For EPs that are fully integrated into ONNX Runtime — CPU, DirectML, and similar providers — the compile step writes a new `.onnx` file that the runtime loads directly. The ONNX graph has been prepared and, in some cases, partitioned so that the EP's session initializer has less work to do when the application starts. + +For EPs that support ahead-of-time compilation (e.g. `--ep qnn` for Qualcomm NPUs and `--ep vitisai` for AMD NPUs), the compiler goes further. It takes the ONNX graph and produces a binary artifact — the **EP context blob** — that encodes the fully compiled, hardware-ready version of the network. This blob is then associated with the ONNX model file. On subsequent loads, the EP reads the blob rather than re-compiling the graph, which makes session creation dramatically faster. + +The default compiler backend is `ort` (ONNX Runtime). + +## Embedded vs external EPContext + +For QNN compilation, `winml-cli` gives you a choice of where the EP context blob lives. By default the blob is written as a sidecar `.bin` file alongside the `.onnx`. Passing `--embed` instead inlines the blob directly into the ONNX file. + +**External (default):** The `.onnx` is small and human-inspectable; the heavy binary data lives in a separate file. You must keep the two files together — the ONNX stores a relative path back to the `.bin`. This layout is preferable for version control and for scenarios where you want to inspect or diff the model graph. + +**Embedded (`--embed`):** Everything ships in a single `.onnx` file. Deployment is simpler because there is only one artifact to track. 
The trade-off is file size: the `.onnx` grows by the full size of the compiled context, and the file is no longer human-readable in the usual sense. Choose embedded when your deployment tooling expects a single model file, or when you want to minimize the chance of the sidecar being misplaced. + +## Why pre-compile + +The first time an ONNX Runtime session is created for a model on a hardware EP, the runtime must partition the graph, allocate buffers, and JIT-compile the operators. On an NPU this process can take several seconds. For applications with tight startup budgets — on-device inference in a UI flow, for example — that cold-start cost is often unacceptable. + +A model produced by `winml compile` has already paid that cost. The EP context blob is the result of compilation, not its input. When the application loads the compiled model the EP reads the pre-built binary and the session is ready almost immediately. Shipping a compiled model is therefore the standard pattern for production deployments on QNN hardware. + +If you are iterating on quantization settings or ONNX graphs and want to check whether the model compiles at all, pass an already-quantized (QDQ) model directly — `winml compile` compiles whatever ONNX file you supply and does not have a separate quantization pass to skip. + +## Skipping validation + +By default `winml compile` runs a validation pass after compilation finishes — it loads the compiled model into an inference session, feeds it dummy inputs (all-ones tensors), and checks that the outputs do not contain NaN or Inf values. This catches basic compilation failures early (e.g., the EP rejecting the graph or producing garbage outputs). + +The `--no-validate` flag skips that pass. It is useful during rapid iteration when you only want to confirm that compilation succeeds without the overhead of a trial inference run. 
+ +## See also + +- [EPs and devices](eps-and-devices.md) — execution provider selection and `--ep` / `--device` flags +- [Analyze and optimize](analyze-and-optimize.md) — graph-level analysis before compilation +- [compile command reference](../commands/compile.md) +- [build command reference](../commands/build.md) diff --git a/docs/concepts/config-and-build.md b/docs/concepts/config-and-build.md new file mode 100644 index 000000000..4e53eb3bb --- /dev/null +++ b/docs/concepts/config-and-build.md @@ -0,0 +1,162 @@ +# Config and build + +`winml config` and `winml build` are a producer/consumer pair. `winml config` +inspects a Hugging Face model (or an existing ONNX file), auto-detects the task, +model class, and I/O specifications, and writes a `WinMLBuildConfig` JSON file. +`winml build` reads that file and runs the full pipeline — export, optimize, +quantize, compile — producing a Windows ML-ready ONNX artifact. + +Keeping these two responsibilities separate is intentional. The config file is a +stable, human-readable description of exactly what the build will do. You can +generate it once, review or edit it, commit it to source control, and replay the +same build at any time without re-running model introspection. CI pipelines and +team workflows both benefit from treating the config file as a versioned artifact +rather than a transient intermediate. + +## Generating a config + +`winml config` produces a `WinMLBuildConfig` JSON with sensible defaults for the +detected model type. At minimum, provide a model identifier: + +```bash +winml config -m microsoft/resnet-50 -o resnet50.json +``` + +Several flags shape what ends up in the config: + +- `--task` overrides the auto-detected Hugging Face task when detection is + ambiguous or when you want a specific variant (for example, `text-classification` + vs `feature-extraction`). +- `--no-quant` sets the `quant` section to `null`, so the quantize stage is omitted + when `winml build` consumes the config. 
Use this for GPU workflows where float16 + is preferred over QDQ quantization. +- `--no-compile` sets the `compile` section to `null`, producing a portable ONNX + that the runtime compiles on first load instead of embedding a pre-compiled + binary. +- `--trust-remote-code` allows model repositories that ship custom modeling code — + required for some community models that define non-standard architectures outside + the standard `transformers` library. + +If `-o` is omitted, the config is printed to stdout, which is convenient for +piping or quick inspection. The generated JSON is plain text and can be edited +directly before being passed to `winml build`. + +## What's in a config + +A `WinMLBuildConfig` is a dataclass defined in +`src/winml/modelkit/config/build.py`. It holds five nested sub-configs for the +pipeline stages, plus an evaluation config and an auto flag: + +| Field | Type | Purpose | +|---|---|---| +| `loader` | `WinMLLoaderConfig` | Task, model type, and model class used to load the Hugging Face model. | +| `export` | `WinMLExportConfig` | Input/output tensor specs, opset version, dynamic axes (`null` for pre-exported ONNX). | +| `optim` | `WinMLOptimizationConfig` | Graph fusion flags (GeLU, LayerNorm, MatMul+Add). | +| `quant` | `WinMLQuantizationConfig` | Precision types (`weight_type`, `activation_type`), calibration samples and method (`null` to skip). | +| `compile` | `WinMLCompileConfig` | Target EP provider, EPContext options, compiler backend (`null` to skip). | +| `eval` | `WinMLEvaluationConfig \| null` | Evaluation settings run after the build (`null` to skip). | +| `auto` | `bool` | When `true` (default), auto-fills missing fields from model introspection. | + +Setting `quant` or `compile` to `null` tells the pipeline to skip that stage +entirely, equivalent to passing `--no-quant` or `--no-compile` on the command +line. 
+ +A generated config looks similar to: + +```json +{ + "loader": { + "task": "image-classification" + }, + "export": { + "opset_version": 17, + "batch_size": 1 + }, + "optim": { + "gelu_fusion": false, + "layer_norm_fusion": false, + "matmul_add_fusion": false + }, + "quant": { + "mode": "qdq", + "weight_type": "uint8", + "activation_type": "uint8", + "samples": 10 + }, + "compile": { + "execution_provider": "qnn", + "enable_ep_context": true + } +} +``` + +The file is plain JSON. You can hand-edit any field before passing it to +`winml build` — adjust the calibration sample count, change the compile +provider, or remove a fusion flag. + +## Consuming a config + +Pass the config file to `winml build` with either an output directory or the +global cache flag: + +```bash +# Write artifacts to a local directory +winml build -c resnet50.json -m microsoft/resnet-50 --output-dir output/ + +# Write to the global cache (~/.cache/winml/) +winml build -c resnet50.json -m microsoft/resnet-50 --use-cache +``` + +`--output-dir` and `--use-cache` are mutually exclusive; you must supply one of +the two when running `winml build` (enforced at runtime, not parse time). Within the output directory, `winml build` writes one ONNX file per +completed stage so that intermediate artifacts are available for inspection, and +it writes a copy of the resolved config so the full build parameters are recorded +alongside the outputs. + +## Overrides at run time + +CLI flags passed directly to `winml build` override the corresponding config +sections for that run only, without modifying the JSON file on disk. 
This makes +it straightforward to experiment with a variation without creating a new config: + +```bash +# Skip quantization and compilation for this run only +winml build -c resnet50.json -m microsoft/resnet-50 --output-dir output/ --no-quant --no-compile + +# Skip optimization (for a pre-quantized input ONNX) +winml build -c resnet50.json -m model_qdq.onnx --output-dir output/ --no-optimize +``` + +`--no-quant`, `--no-compile`, and `--no-optimize` each suppress the corresponding +stage regardless of what the config file specifies. Because the config file is +unchanged, re-running without the override flag reverts to the full pipeline +described in the config. + +## Why version a config + +Storing the `WinMLBuildConfig` JSON in source control brings three concrete +benefits: + +1. **Reproducibility.** A config file pins every build decision — task, precision, + quantization method, calibration sample count, target EP, fusion flags — in a + single file. Running `winml build -c config.json` six months later produces the + same artifact as it does today, regardless of how the tool's defaults evolve. + +2. **CI integration.** A CI job can run `winml build -c config.json -m <model> + --output-dir artifacts/` with no human intervention. Because all settings live + in the config file, the CI script requires no per-model flag knowledge, and + updating build parameters is a pull request to the config file, not a change to + the pipeline script. + +3. **Team sharing.** Handing a colleague a config file is enough for them to + reproduce the exact build on their machine. There is no need to document the + sequence of primitive commands, precision arguments, or calibration settings + separately — the file is the documentation. 
+ +## See also + +- [Primitives and pipeline](primitives-and-pipeline.md) — when to use `winml build` + vs individual primitive commands +- [Config Schema](../reference/index.md) — full field-by-field config reference +- [winml config command reference](../commands/config.md) +- [winml build command reference](../commands/build.md) diff --git a/docs/concepts/eps-and-devices.md b/docs/concepts/eps-and-devices.md new file mode 100644 index 000000000..216b4cbba --- /dev/null +++ b/docs/concepts/eps-and-devices.md @@ -0,0 +1,72 @@ +# EP and Device + +An **Execution Provider (EP)** is a pluggable backend in ONNX Runtime that claims and runs a subset of graph nodes on a specific hardware target. When ONNX Runtime loads a model it partitions the graph among the registered EPs: operators that an EP claims are dispatched to it, and the remainder fall back to the CPU EP. This design lets a single [ONNX](graphs-and-ir.md) model exploit an NPU, GPU, or CPU without any change to the graph itself. + +A **device** is the hardware category that an EP targets — one of `npu`, `gpu`, or `cpu`. winml-cli exposes both levels of control: the high-level `--device` flag selects a hardware category, while the low-level `--ep` flag pins a specific ONNX Runtime provider name. In most workflows you set `--device` and let winml-cli resolve the best available EP; you reach for `--ep` when you need to compare or force a specific provider. + +## EPs winml-cli supports + +The table below lists every Execution Provider that winml-cli has explicit support for. EP names are the canonical ONNX Runtime strings accepted by `--ep`. You can also use the short **alias** (case-insensitive) anywhere the full name is accepted. 
+ +| EP | Alias | Device | Hardware | When to use | +|----|-------|--------|----------|-------------| +| `QNNExecutionProvider` | `qnn` | npu / gpu | Qualcomm NPU (Hexagon DSP) / Qualcomm GPU (Adreno) | Snapdragon-based Copilot+ PCs; best latency and power efficiency on Qualcomm silicon | +| `VitisAIExecutionProvider` | `vitisai` | npu | AMD NPU (XDNA) | AMD Ryzen AI platforms; targets the AMD AI Engine via the Vitis AI stack | +| `OpenVINOExecutionProvider` | `openvino` | npu / gpu / cpu | Intel CPU / GPU / NPU | Intel Core Ultra platforms; flexible device targeting across all three Intel compute types | +| `DmlExecutionProvider` | `dml` | gpu | GPU (DirectML) | Any DirectX 12 GPU on Windows; broad compatibility across AMD, Intel, and NVIDIA discrete/integrated graphics | +| `NvTensorRTRTXExecutionProvider` | `nv_tensorrt_rtx` | gpu | NVIDIA GPU (TensorRT RTX) | NVIDIA RTX GPUs; maximum throughput via TensorRT graph optimization | +| `MIGraphXExecutionProvider` | `migraphx` | gpu | AMD GPU (MIGraphX) | AMD discrete GPUs; hardware-accelerated inference via the MIGraphX graph engine | +| `CPUExecutionProvider` | `cpu` | cpu | CPU | Universal fallback; always available regardless of hardware | + +To see which EPs are available on the current machine, run: + +```bash +winml sys --list-ep +``` + +## Device vs. EP on the CLI + +winml-cli exposes two overlapping flags for targeting hardware. Understanding their relationship prevents confusion when using `winml analyze`, `winml compile`, or `winml build`. + +**`--device` (high-level)** + +Accepts one of four values: `auto`, `cpu`, `gpu`, or `npu`. When set to `auto` (the default), winml-cli inspects the machine and selects the highest-priority device class that has a compatible EP available, in the order NPU > GPU > CPU. Setting an explicit value such as `--device npu` requests a device category without naming the EP. 
+ +For `winml analyze`, `--device` also accepts `all` — this evaluates the model against every device that has rule data, producing a side-by-side compatibility report. + +```bash +# Let winml-cli pick the best available device +winml analyze --model model.onnx --device auto + +# Target the NPU device class +winml analyze --model model.onnx --device npu + +# Analyze against all devices at once (analyze only) +winml analyze --model model.onnx --device all +``` + +**`--ep` (low-level override)** + +Accepts a valid EP name or alias (for example `qnn`, `vitisai`, `dml`, `openvino`), or `auto` to let winml-cli resolve the EP from the device. When `--ep` is provided with a specific value it takes precedence over `--device` and bypasses device-class resolution entirely. Use `--ep` when you need to pin a specific provider — for instance to compare `QNNExecutionProvider` against `DmlExecutionProvider` on the same machine. + +For `winml analyze`, `--ep` also accepts `all` — this evaluates the model against every registered EP simultaneously. + +```bash +# Force Qualcomm QNN regardless of device selection +winml analyze --model model.onnx --ep QNNExecutionProvider --device npu + +# Use the short alias; winml-cli normalizes it to the full name +winml analyze --model model.onnx --ep qnn + +# Analyze against all EPs at once (analyze only) +winml analyze --model model.onnx --ep all +``` + +The `--ep` flag accepts a free-form string and is not restricted to the choices listed above. This allows forward compatibility with EP names that winml-cli does not yet enumerate. 
+ +## See also + +- [Graphs and IR](graphs-and-ir.md) — ONNX graph format, operator sets, and the IR that EPs consume +- [Weight and Activation](weight-and-activation.md) — tensor roles relevant to EP compatibility +- [winml sys](../commands/sys.md) — list available devices and EPs on the current machine +- [winml analyze](../commands/analyze.md) — check ONNX operator compatibility against a specific EP diff --git a/docs/concepts/eval-and-datasets.md b/docs/concepts/eval-and-datasets.md new file mode 100644 index 000000000..bc7f662ec --- /dev/null +++ b/docs/concepts/eval-and-datasets.md @@ -0,0 +1,70 @@ +# Eval and datasets + +`winml eval` answers one question: does this model produce correct results? It measures +accuracy — how well outputs match ground truth — rather than latency or throughput. You +give it a model, point it at a labeled dataset, and get back a JSON report of metric +scores. Everything else in the pipeline (compilation, quantization, device selection) is +about making the model *fast*; eval is about knowing whether it is still *right*. + +The dataset is the source of truth. Eval iterates over dataset rows, runs each sample +through the model, and compares the prediction to the label recorded in the dataset. This +means the dataset must have both input features and ground-truth labels, and the columns +carrying those values must be wired to the model's inputs and outputs. winml-cli handles +standard tasks automatically, but the column-mapping flags let you override the defaults +for non-standard datasets. + +## What eval reports + +The metric reported depends on the task. Classification tasks produce accuracy (top-1 and +optionally top-5). Object detection tasks produce mean average precision (mAP). The exact +set of metrics is printed to stdout and saved to the file specified by `--output`. The +`--output` flag accepts any `.json` path; if omitted, results are printed but not persisted. 
+Use `--schema` to print the expected dataset schema for a given task without running eval, +which is useful when you are preparing a custom dataset. + +## Picking a dataset + +`--dataset` takes a Hugging Face dataset path — for example `imagenet-1k` or `glue`. If +you omit it, winml-cli selects a default dataset based on the detected task. For datasets +that have multiple configurations, `--dataset-name` picks the specific config (e.g. +`--dataset-name mrpc` when using the `glue` dataset). + +By default eval runs on the `validation` split; `--split` overrides this. Full validation +sets can be large. During development, `--samples 200` caps the run to 200 rows so you get +quick feedback. For very large datasets that you prefer not to download fully, `--streaming` +fetches rows on demand instead of materialising the whole dataset locally. `--shuffle` +(on by default) randomises sampling order so a capped run is representative rather than +biased toward the first rows. + +## Column mapping + +winml-cli must know which dataset column feeds which model input and which column holds +the ground-truth label. For well-known task/dataset combinations this mapping is built in. +When it is not, use `--column key=value` to declare it. The `key` is the name the task +pipeline expects (e.g. `input_column`) and `value` is the actual column name in the +dataset (e.g. `image`). You can repeat `--column` as many times as needed. + +When the integer label IDs in the dataset do not match the class indices the model was +trained against, `--label-mapping` accepts a JSON file of the form `{"class_name": id}` +that translates between the two spaces. This is common with models fine-tuned on a +relabelled subset of a public dataset. + +## Why eval after quantization + +Quantization is a lossy transformation. Converting weights from float32 to int8, or +activations to a narrow range, introduces rounding error that accumulates differently +across architectures and calibration data. 
The impact on accuracy cannot be predicted +analytically; it must be measured. Running `winml eval` before and after quantization +gives you a concrete accuracy delta. A drop within your acceptable threshold confirms the +quantized model is ready; a larger drop means you should revisit calibration settings or +switch to a less aggressive quantization scheme. + +Make this a habit: quantize, then eval. Comparing two `--output` JSON files is a reliable, +reproducible record that the trade-off between performance and accuracy was explicitly +checked. See [Quantization](quantization.md) for the full quantization workflow. + +## See also + +- [Quantization](quantization.md) — calibrate and quantize a model, then verify with eval +- [Perf and monitoring](perf-and-monitoring.md) — measure latency and throughput after accuracy is confirmed +- [`winml eval` command reference](../commands/eval.md) — all flags with examples diff --git a/docs/concepts/graphs-and-ir.md b/docs/concepts/graphs-and-ir.md new file mode 100644 index 000000000..af5d787e9 --- /dev/null +++ b/docs/concepts/graphs-and-ir.md @@ -0,0 +1,59 @@ +# Graph and IR + +A `.onnx` file is, at rest, a binary-serialized Protocol Buffer. Open it in any hex editor and you will find the familiar `ONNX` magic bytes followed by a dense encoding of every number the model has ever learned, plus the structural description of how those numbers are combined to produce a prediction. The file is self-contained: weights and computation recipe live together, making the artifact portable without any accompanying framework installation. + +That computation recipe is a **graph** — a directed acyclic structure of operators wired together by named data edges. The graph is what the ONNX Intermediate Representation (IR) actually defines. When winml-cli loads or transforms a model, every operation works against this graph structure, not against framework-specific objects. 
+ +## What is in a .onnx file + +An ONNX `ModelProto` wraps a single `GraphProto`. Inside the graph you will find: + +- **Inputs** — typed, named entry points that accept runtime tensors (e.g., `pixel_values: float32[1, 3, 224, 224]`). +- **Outputs** — typed, named exit points that carry the model's predictions back to the caller. +- **Nodes** — individual operators (Conv, MatMul, Softmax, …) that transform tensors. Each node names its inputs and outputs using the same string identifiers used throughout the graph. +- **Initializers** — constant tensors embedded in the file. Learned weights, biases, and lookup tables are stored here; they are treated as graph inputs that are always pre-supplied. +- **Metadata** — key–value string properties attached at the model level. winml-cli uses this area to store information such as `winml.io.inputs` (serialized tensor specs) and `winml.hierarchy.tag` attributes on individual nodes. + +## Graphs as IR + +ONNX functions as an Intermediate Representation: a portable, framework-neutral description of a computation that can be loaded by any conforming runtime. Unlike a Python object graph or a compiled binary, the ONNX IR makes data flow completely explicit. Every node declares the exact names of its input and output edges; those names form a namespace shared across the whole graph, so any consumer can trace a tensor from the model inputs through every transformation to the final output. + +This explicit wiring unlocks two capabilities that winml-cli relies on heavily. First, **shape inference** can propagate concrete or symbolic dimensions through the graph without running it — a prerequisite for correct quantization and for generating input specs automatically. Second, **EP-targeted compilation** can partition the graph by examining which nodes an Execution Provider supports, fuse eligible sub-graphs into accelerated kernels, and serialize the result back into a valid ONNX file using the `EPContext` convention. 
Neither of these would be tractable on an opaque binary or a dynamic execution trace. + +Because the IR is static — describing the full computation at load time rather than at call time — winml-cli can inspect, validate, and transform a model without a GPU, a framework, or sample data. + +## Opsets and versioning + +Every operator in ONNX belongs to a **domain**, and every domain advances through numbered **opset versions**. An opset is a snapshot of the operator catalog: it defines which operators exist, what their inputs and outputs mean, and how edge cases are handled. When a model declares `opset_import { domain: "" version: 17 }`, it is saying "all unnamed-domain operators in this file must be interpreted according to the rules published in opset 17." + +winml-cli defaults to **opset 17** when exporting a PyTorch model to ONNX. This is the value of `opset_version: int = 17` in `WinMLExportConfig` (`src/winml/modelkit/export/config.py`, line 75). Opset 17 introduced a native `LayerNormalization` operator (a native `GroupNormalization` operator followed in opset 18), eliminating the multi-node decomposition that earlier opsets required for layer normalisation, which is why it is the recommended baseline for modern transformer and vision architectures. + +Higher opsets unlock additional operators and fix known edge-case behavior, but not every Execution Provider supports the latest opset. QNN, for instance, may lag behind the ONNX standard by one or two versions.
If you need to target an older EP, pass a custom export configuration: + +```bash +# Write a config override +echo '{"opset_version": 16}' > export_cfg.json + +# Export with the override +winml export -m prajjwal1/bert-tiny -o bert.onnx --export-config export_cfg.json +``` + +You can also check the opset a saved model declares: + +```bash +winml inspect -m bert.onnx +``` + +```text +Opset: ai.onnx == 17 +``` + +When winml-cli's optimization and quantization pipelines transform a model, they preserve the declared opset unless explicitly instructed otherwise, so the model you receive after `winml quantize` will carry the same opset version as the model you supplied. + +## See also + +- [EP and Device](eps-and-devices.md) +- [Weight and Activation](weight-and-activation.md) +- [Datatype and Quantization](quantization.md) +- [winml inspect command](../commands/inspect.md) +- [winml export command](../commands/export.md) diff --git a/docs/concepts/how-it-works.md b/docs/concepts/how-it-works.md new file mode 100644 index 000000000..7a1d10b75 --- /dev/null +++ b/docs/concepts/how-it-works.md @@ -0,0 +1,143 @@ +# How winml-cli Works + +winml-cli is a toolkit for converting PyTorch and Hugging Face models into ONNX artifacts +that are optimized and compiled for Windows ML execution providers (EPs). Starting from a +model identifier or a pre-exported ONNX file, winml-cli runs a staged pipeline — export, +optimize, quantize, compile — and produces a final `model.onnx` ready for inference via +a Windows ML session. + +Each stage is independently controllable. Quantization and compilation are optional and +can be bypassed with a flag or by leaving the corresponding section of the build +configuration empty. The same pipeline API that powers `winml build` is also the +programmatic entry point for `WinMLAutoModel.from_pretrained()`. 
+ +## The Pipeline at a Glance + +![winml-cli workflow](../assets/workflow-only.svg) + +The stages run in order, and each one writes an intermediate ONNX file to the output +directory. All intermediate artifacts are preserved so you can inspect any stage's output +or feed a pre-processed file into a later stage directly. + +## Pipeline Stages + +### Export — `winml export` + +`winml export` loads a Hugging Face model (pretrained or random-weight), traces it with +torch.export or an Optimum-based exporter, and writes a portable, device-agnostic ONNX +file. The output at this stage is a plain ONNX graph with float32 weights and no +EP-specific nodes. + +### Analyze — `winml analyze` + +`winml analyze` performs static compatibility analysis on an ONNX graph against a target +execution provider. It classifies every node as Supported, Partial, Unsupported, or +Unknown — without running the model on the device. Use it before building to check if +your model (or an intermediate artifact from any pipeline stage) will run cleanly on the +target EP: + +```bash +winml analyze -m model.onnx --ep qnn --device npu +``` + +Add `--optim-config optim.json` to output auto-discovered optimization recommendations +that can be fed directly into `winml optimize`. The same analyzer also drives the +autoconf feedback loop inside `winml build`. + +### Optimize — `winml optimize` + +`winml optimize` runs graph-level transformations on the exported ONNX: operator fusion +(attention, layer norm, GeLU), constant folding, and graph pruning. The optimize stage +also contains an autoconf loop: a static analyzer inspects the graph for nodes that the +target EP cannot dispatch natively, and re-runs optimization with adjusted fusion flags +until no further improvements are found (up to a configurable iteration limit). 
+ +### Quantize — `winml quantize` + +`winml quantize` inserts Quantize-Dequantize (QDQ) nodes into the optimized graph to +reduce weights and activations to lower-precision types (for example, int8 weights with +uint8 activations). Calibration data is used to compute quantization parameters per +tensor. If the input model already contains QDQ nodes, this stage is skipped +automatically. + +### Compile — `winml compile` + +`winml compile` invokes an EP-specific compiler (for example, the QNN compiler for NPU +targets) to embed a pre-compiled binary cache inside the ONNX graph as an EPContext node. +At inference time, the EP loads the cached binary directly, bypassing per-session +compilation. Compilation is optional; omitting it produces a portable ONNX that is +compiled on first load by the runtime. + +### Perf and Eval — `winml perf` / `winml eval` + +After the model is built, `winml perf` benchmarks inference latency and throughput using +a Windows ML session, and `winml eval` runs task-specific accuracy evaluation. Neither +command modifies the model; they consume the final `model.onnx` produced by the pipeline. + +## `winml build` as the One-Shot Wrapper + +Running each stage individually is useful when iterating on a specific step, but the +normal workflow is `winml build`, which orchestrates the full pipeline in a single +command: + +```bash +winml build -m microsoft/resnet-50 -o output/ +``` + +The `-c config.json` flag is optional. If omitted, `winml build` auto-generates a +default config internally. To customize pipeline settings, generate a config first +with `winml config` and then pass it: + +```bash +winml config -m microsoft/resnet-50 -o config.json +winml build -c config.json -m microsoft/resnet-50 -o output/ +``` + +`winml build` auto-detects whether the input is a Hugging Face model ID or an existing +ONNX file and calls the appropriate internal API (`build_hf_model` or `build_onnx_model`). 
+When given an ONNX file directly, the export stage is skipped and the pipeline starts at +optimize. + +Individual stages can be bypassed from the command line without editing the config file: + +```bash +# Skip quantization and compilation +winml build -m bert-base-uncased -o output/ --no-quant --no-compile + +# Skip optimization (for pre-quantized input) +winml build -m model_qdq.onnx -o output/ --no-optimize +``` + +## Configuration: `WinMLBuildConfig` vs CLI Flags + +Pipeline behavior is primarily governed by a `WinMLBuildConfig` JSON file generated by +`winml config`. The config is a hierarchical structure with one section per stage: + +```text +WinMLBuildConfig +├── loader — model type, task, input constraints +├── export — input tensor specs, opset, backend +├── optim — fusion flags, optimization level +├── quant — precision, calibration settings (null = skip stage) +├── compile — target EP, device (null = skip stage) +└── eval — evaluation settings +``` + +Setting `quant` or `compile` to `null` in the JSON file is equivalent to passing +`--no-quant` or `--no-compile` on the command line; both result in the corresponding +stage being skipped. CLI flags override the config at runtime without modifying the file, +which is convenient for one-off experiments. + +The config file is written (or updated) to the output directory after the optimize stage +completes, capturing any autoconf-adjusted fusion flags so the build is reproducible. +This persisted `winml_build_config.json` is a self-contained pipeline specification that +you can check into version control and run in CI/CD (`winml build -c winml_build_config.json -m <model> -o output/`) for repeatable, unattended builds across environments. + +For the full field-by-field schema, see [Reference — Config Schema](../reference/index.md).
+ +## See Also + +- [winml build](../commands/build.md) — full reference for the build command +- [winml export](../commands/export.md) — export command reference +- [ONNX and Execution Providers](eps-and-devices.md) — background on EPs and the ONNX runtime +- [Config and build](config-and-build.md) — detailed field-by-field config documentation diff --git a/docs/concepts/load-and-export.md b/docs/concepts/load-and-export.md new file mode 100644 index 000000000..b195adcd6 --- /dev/null +++ b/docs/concepts/load-and-export.md @@ -0,0 +1,132 @@ +# Load and export + +The first stage of the winml-cli pipeline is the most deterministic: bring a model into memory and convert it to ONNX. Everything that follows — optimization, quantization, compilation — operates on that ONNX artifact. A well-exported graph with accurate metadata travels cleanly through the rest of the pipeline without requiring patching or re-export. + +Loading is an internal operation: the loader module resolves model provenance, selects the right HuggingFace model class, and prepares the weights for tracing. The `winml export` command is the surface users interact with directly. + +## Loading a model + +When you point winml-cli at a model identifier, the internal loader resolves it in one of two ways. If the identifier looks like a HuggingFace Hub path (e.g., `prajjwal1/bert-tiny`), the loader downloads the model weights and configuration to the standard HuggingFace cache at `~/.cache/huggingface`. Subsequent runs are served from that cache without re-downloading. If the identifier is a path to a local PyTorch checkpoint directory, the loader reads it directly without network access. + +In both cases the loader auto-detects the task — image classification, text feature extraction, and so on — and selects a corresponding HuggingFace model class. The result is a PyTorch model object ready for tracing. 
+ +Before committing to a full export you can verify that the loader resolved everything correctly with `winml inspect`. It prints the detected task, the HuggingFace model class, the export configuration, and the WinML inference class — all without downloading weights. Add `--hierarchy` to reconstruct the PyTorch module tree from random-weight tracing. + +Some community models host custom Python code in their repositories. The loader refuses to execute it by default. Pass `--trust-remote-code` to `winml config` when generating a build configuration for such a model. + +## Exporting to ONNX + +`winml export` converts the loaded model to ONNX. The conversion uses TorchScript tracing by default, which follows actual execution paths and tends to produce compact, inference-oriented graphs. **Note:** a `--dynamo` flag is reserved for the PyTorch 2.x dynamo exporter but is **not yet functional** in the current release — passing it logs a warning and the flag is ignored. + +By default the exporter runs an eight-step process that includes hierarchy tracing and tag injection. The result is an ONNX file enriched with structural metadata that powers downstream features such as per-module benchmarking, inspector views, and optimizer scoping. + +### Hierarchy tagging in detail + +During export the HTP (Hierarchy-preserving Tags Protocol) exporter attaches two pieces of information to every ONNX graph node via `node.metadata_props`: + +| Key | Value | Example | +|-----|-------|---------| +| `winml.hierarchy.tag` | Full module path the node originated from | `/BertModel/BertEncoder/BertLayer.0/BertAttention` | +| `winml.hierarchy.depth` | Number of path segments (integer as string) | `4` | + +#### How tags are built + +The exporter registers PyTorch forward hooks on each module. When a module executes, a pre-hook pushes its class name onto a tag stack; the post-hook pops it.
This produces hierarchical paths that mirror the PyTorch module tree: + +```mermaid +flowchart LR + A[Register hooks] --> B[Run forward pass] + B --> C[Pre-hook pushes tag] + C --> D[Child modules execute] + D --> E[Post-hook pops tag] + E --> F[Tag stack → path] +``` + +Only modules that are actually executed during tracing receive tags — unused modules are excluded. For example, `prajjwal1/bert-tiny` has 48 registered modules but only 18 are reached during a forward pass. + +#### Concrete example: BERT-tiny + +Running `winml export -m prajjwal1/bert-tiny -o model.onnx -v` produces the following hierarchy tree (18 traced modules, 132 ONNX nodes, 100 % coverage): + +``` +BertModel (132 nodes) +├── BertEmbeddings: embeddings (7 nodes) +├── BertEncoder: encoder (106 nodes) +│ ├── BertLayer: encoder.layer.0 (53 nodes) +│ │ ├── BertAttention: encoder.layer.0.attention (39 nodes) +│ │ │ ├── BertSelfOutput: encoder.layer.0.attention.output (4 nodes) +│ │ │ └── BertSdpaSelfAttention: encoder.layer.0.attention.self (35 nodes) +│ │ ├── BertIntermediate: encoder.layer.0.intermediate (10 nodes) +│ │ │ └── GELUActivation: encoder.layer.0.intermediate.intermediate_act_fn (8 nodes) +│ │ └── BertOutput: encoder.layer.0.output (4 nodes) +│ └── BertLayer: encoder.layer.1 (53 nodes) +│ └── ... (same structure) +└── BertPooler: pooler (0 nodes) +``` + +Each ONNX node gets its tag from the module it belongs to. 
Here are a few examples from the actual exported model: + +| ONNX node name | Assigned tag | +|---------------|--------------| +| `/embeddings/word_embeddings/Gather` | `/BertModel/BertEmbeddings` | +| `/encoder/layer.0/attention/self/query/MatMul` | `/BertModel/BertEncoder/BertLayer.0/BertAttention/BertSdpaSelfAttention` | +| `/encoder/layer.0/intermediate/intermediate_act_fn/Mul` | `/BertModel/BertEncoder/BertLayer.0/BertIntermediate/GELUActivation` | +| `/Unsqueeze` (no scope) | `/BertModel` (root fallback) | + +#### Node-to-module mapping + +After the ONNX graph is produced by `torch.onnx.export`, a 4-priority system assigns each ONNX node to the closest matching module: + +1. **Direct match** (61 %) — the node's scope name maps exactly to a traced module. +2. **Parent match** (24 %) — walk up the scope hierarchy until a traced module is found. +3. **Operation fallback** (optional, off by default) — find the most similar scope by common prefix. +4. **Root fallback** (14 %) — unmatched nodes receive the model root tag (e.g. `/BertModel`). + +This guarantees 100 % tag coverage: every node in the graph carries a non-empty tag. + +### Graph-level metadata + +Beyond per-node tags, the exporter also writes model-level metadata properties: + +| Key | Content | +|-----|---------| +| `winml.io.inputs` | JSON array of `InputTensorSpec` — name, shape, dtype, and optional `value_range` | +| `winml.io.outputs` | JSON array of `OutputTensorSpec` — name, shape, dtype | + +These I/O specs enable tools like `winml perf` to generate correct dummy inputs for benchmarking and `winml inspect` to display tensor shapes without loading the model into a runtime. 
+ +### Sidecar metadata file + +Alongside the `.onnx` file, the exporter writes a `*_htp_metadata.json` sidecar containing: + +- **`nodes`** — complete mapping of every ONNX node name → hierarchy tag +- **`modules`** — traced module information (class name, tag, execution order) +- **`statistics`** — export time, node counts, coverage percentage +- **`outputs`** — I/O tensor specifications + +Use `--with-report` to additionally generate a human-readable markdown report (`*_htp_export_report.md`). + +### Features that depend on tags + +- **`winml inspect --hierarchy`** — traces the model with random weights and displays the resulting module tree in the terminal. This is a lightweight preview of what tags will look like after a full export. +- **`winml perf --module <ClassName>`** — isolates a submodule (e.g. `BertAttention`) and benchmarks it independently. + +### Disabling tags + +If you need a clean, standard-compliant ONNX without custom metadata — to hand off to a third-party tool, for example — pass `--no-hierarchy`. (The old `--clean-onnx` spelling remains as a deprecated hidden alias.) The graph behaviour is unchanged, but hierarchy-dependent features will not work against that file. + +## Where it goes wrong + +Most export failures fall into three categories. + +**Task mismatch.** The loader auto-detects task from the model card and configuration, but some models are registered under multiple tasks or have ambiguous metadata. If the wrong task is selected the exporter generates incorrect dummy inputs and the trace fails or produces wrong output shapes. Override it explicitly with `--task`, for example `--task image-feature-extraction`. + +**Shape issues.** Transformer models often have symbolic sequence-length dimensions; vision models may expect a fixed spatial resolution. If the default dummy inputs do not match what the model accepts, shape inference will fail or produce dynamic shapes that downstream tools cannot handle.
Provide a `--shape-config` JSON file with explicit overrides, or use `--input-specs` to supply a fully specified input manifest. + +**Custom modules.** Some models contain `torch.nn.Module` subclasses the tracer cannot automatically decompose. A `--torch-module` option (comma-separated class names) is intended to include them as distinct hierarchy nodes rather than inlining them — most often needed for custom normalization or attention implementations defined in the model repository. **Note:** the `--torch-module` flag is reserved for module-targeted export but is **not yet functional** in the current release — passing it logs a warning and the flag is ignored. + +## See also + +- [Graph and IR](graphs-and-ir.md) +- [inspect command](../commands/inspect.md) +- [export command](../commands/export.md) diff --git a/docs/concepts/perf-and-monitoring.md b/docs/concepts/perf-and-monitoring.md new file mode 100644 index 000000000..235018e0f --- /dev/null +++ b/docs/concepts/perf-and-monitoring.md @@ -0,0 +1,180 @@ +# Perf and monitoring + +Knowing that a model produces correct outputs is necessary but not sufficient for a production deployment. You also need to know how fast it runs, how consistently it runs, and where the time goes when it does not run fast enough. `winml perf` is the primary tool in `winml-cli` for answering those questions. It synthesises end-to-end latency numbers and live hardware utilisation into a single benchmarking workflow. + +Because `winml perf` accepts both HuggingFace model IDs and local `.onnx` files, you can benchmark at any stage of the development cycle — from a freshly exported float model through to a compiled, quantized production artifact. + +## What perf measures + +At its core, `winml perf` runs a configurable number of inference iterations and reports latency statistics. 
Here is a real example benchmarking `bert-tiny` on CPU: + +``` +$ winml perf -m bert-tiny.onnx --device cpu --iterations 50 --warmup 5 + +Device: cpu / CPUExecutionProvider +Task: auto (auto-detected) +Model Precision: fp32 +Inputs: input_ids [1, 512] int32 + attention_mask [1, 512] int32 + token_type_ids [1, 512] int32 +Outputs: last_hidden_state [1, 512, 128] +``` + +Output latency table: + +| Avg | P50 | P90 | P95 | P99 | Min | Max | Std | +|-----|-----|-----|-----|-----|-----|-----|-----| +| 5.53 | 5.40 | 6.55 | 6.87 | 7.65 | 4.89 | 7.65 | 0.58 | + +``` +Warmup: 14.14 ms avg (first 5 iterations) +Throughput: 180.72 samples/sec +``` + +Key parameters: + +| Flag | Purpose | Default | +|------|---------|---------| +| `--iterations` | Number of benchmark iterations | 100 | +| `--warmup` | Warmup iterations excluded from statistics | 10 | +| `--batch-size` | Batch size for input generation | 1 | +| `-d, --device` | Target device: `auto`, `cpu`, `gpu`, `npu` | `auto` | +| `--ep` | Specific execution provider (e.g. 
`qnn`, `dml`, `openvino`) | auto-resolved from device | +| `--precision` | Precision mode: `auto`, `fp32`, `fp16`, `int8`, `int16`, or `w{x}a{y}` | `auto` | +| `--quantize/--no-quantize` | Include quantization during model build | `--quantize` | +| `--skip-build/--no-skip-build` | Skip the build pipeline for ONNX inputs | `--skip-build` | + +### Output format + +Add `-f json` to emit structured JSON to stdout, suitable for CI pipelines or automated comparisons: + +```json +{ + "benchmark_info": { + "model_id": "bert-tiny.onnx", + "task": "auto-detected", + "device": "cpu", + "ep": "CPUExecutionProvider", + "precision": "auto", + "iterations": 50, + "warmup": 5, + "batch_size": 1, + "timestamp": "2026-06-11T03:27:24+00:00" + }, + "model_info": { + "input_names": ["input_ids", "attention_mask", "token_type_ids"], + "input_shapes": [[1, 512], [1, 512], [1, 512]], + "input_types": ["int32", "int32", "int32"], + "output_names": ["last_hidden_state"], + "output_shapes": [[1, 512, 128]] + }, + "latency_ms": { + "mean": 5.53, "p50": 5.40, "p90": 6.55, + "p95": 6.87, "p99": 7.65, "min": 4.89, "max": 7.65, + "std": 0.58, "warmup_mean": 14.14 + }, + "throughput": { "samples_per_sec": 180.72, "batches_per_sec": 180.72 }, + "raw_samples_ms": [5.12, 5.40, ...] +} +``` + +Results are also saved automatically to `~/.cache/winml/perf/<model>/<run>.json` for later comparison. Override the path with `--output`. + +## Live monitoring + +Latency numbers alone do not tell you whether the hardware is actually being used. A slow NPU inference could mean the model is running on the NPU and hitting a memory bottleneck, or it could mean the EP silently fell back to CPU and is not using the NPU at all. + +The `--monitor` flag adds a live terminal chart (powered by plotext + Rich Live) that streams hardware utilisation for whichever device is being benchmarked. The chart updates once per iteration so you can see whether utilisation is sustained, bursty, or absent.
This is particularly useful when commissioning a new model on QNN or DirectML hardware, where EP fallback can be hard to detect from latency numbers alone. If the chart stays near zero while the benchmark runs, it is a strong signal that the model may not be executing on the expected device — investigate further with EP-specific tools. + +``` +winml perf -m model.onnx --device npu --monitor +``` + +Display updates are not included in the timed inference call, but monitoring may introduce small system overhead from background PDH polling. + +## Memory and resource metrics + +When `--monitor` is active, hardware metrics are sampled throughout the benchmark and reported at the end. These metrics help answer questions like "how much device memory does this model need?" and "is the model memory-bound?". + +The metrics collected depend on the target device: + +| Metric | CPU | GPU | NPU | +|--------|:---:|:---:|:---:| +| CPU utilisation (mean/peak %) | ✓ | ✓ | ✓ | +| RAM (used MB, peak MB) | ✓ | ✓ | ✓ | +| Device utilisation (mean/peak %) | — | ✓ | ✓ | +| Device memory local (peak MB) | — | ✓ | ✓ | +| Device memory shared (peak MB) | — | ✓ | ✓ | +| Engine running time (ns) | — | ✓ | ✓ | + +- **CPU**: Only system-level metrics (CPU %, RAM) are shown in terminal output. In JSON, `device_memory` and `running_time_ns` are still present but will be zero. +- **GPU**: Reports GPU engine utilisation plus dedicated VRAM (`local_peak_mb`) and shared system memory (`shared_peak_mb`) allocated by the GPU driver. +- **NPU**: Same structure as GPU. NPU adapters register as Windows GPU Engine devices, so utilisation and memory are read via the same PDH counters. `local_peak_mb` represents dedicated adapter memory; `shared_peak_mb` is system memory shared with the NPU. 
+ +### Terminal output + +CPU device: + +``` +Hardware (during benchmark) + CPU: 8.3% avg | Mem: 644 MB +``` + +NPU or GPU device: + +``` +Hardware (during benchmark) + NPU: 87.3% avg, 100.0% peak | CPU: 12.1% avg | Mem: 1842 MB + Device Mem: 245/0 MB (local/shared) +``` + +### JSON structure + +In JSON output (`-f json`), these metrics appear under the `hw_monitor` key: + +```json +"hw_monitor": { + "monitor": "HWMonitor", + "device_kind": null, + "adapter_luid": null, + "cpu": { "mean_pct": 15.8, "peak_pct": 16.71, "sample_count": 2 }, + "ram": { "used_mb": 640.21, "peak_mb": 640.21 }, + "device_memory": { "local_peak_mb": 0.0, "shared_peak_mb": 0.0 }, + "running_time_ns": 0 +} +``` + +When a hardware accelerator is active, `device_kind` will be `"npu"` or `"gpu"`, and an additional key (e.g. `"npu"`) appears with device utilisation: + +```json +"hw_monitor": { + "monitor": "HWMonitor", + "device_kind": "npu", + "adapter_luid": "0x0000abcd12340000", + "cpu": { "mean_pct": 12.1, "peak_pct": 34.5, "sample_count": 50 }, + "ram": { "used_mb": 1842.0, "peak_mb": 1910.0 }, + "device_memory": { "local_peak_mb": 245.0, "shared_peak_mb": 0.0 }, + "npu": { "mean_pct": 87.3, "peak_pct": 100.0, "sample_count": 50 }, + "running_time_ns": 4820000000 +} +``` + +This makes it straightforward to track memory consumption across model revisions or compare devices programmatically. + +## Per-module benchmarking + +Large Transformer-family models contain many repeated module instances — attention blocks, feed-forward layers, encoder stages. When you want to understand the cost of one type of block rather than the full network, `--module <ClassName>` isolates and benchmarks matching modules from the HuggingFace model hierarchy. + +``` +winml perf -m bert-base-uncased --module BertAttention +``` + +This builds and benchmarks each `BertAttention` instance separately and reports per-instance statistics. The `--module` argument must be a **class name** (e.g.
`BertAttention`), not a dotted module path (e.g. not `encoder.layer.0.attention`). + +Internally, `--module` uses `torchinfo` to discover all submodule instances matching the given class name in the HuggingFace model. For each match it generates a separate build config, exports an isolated ONNX file, and benchmarks it independently. This requires a HuggingFace model ID (not a local `.onnx` file) because it needs access to the PyTorch module tree. + +## See also + +- [Load and export](load-and-export.md) — how the module-tree metadata that `--module` targets gets written +- [Eval and datasets](eval-and-datasets.md) — accuracy measurement to pair with performance numbers +- [perf command reference](../commands/perf.md) diff --git a/docs/concepts/primitives-and-pipeline.md b/docs/concepts/primitives-and-pipeline.md new file mode 100644 index 000000000..f9d820d3a --- /dev/null +++ b/docs/concepts/primitives-and-pipeline.md @@ -0,0 +1,109 @@ +# Primitives and pipeline + +winml-cli exposes two ways to turn a Hugging Face model or ONNX file into a +Windows ML-ready artifact. You can invoke each stage of the pipeline as an +individual primitive command — `winml export`, `winml analyze`, `winml optimize`, +`winml quantize`, `winml compile`, `winml perf`, `winml eval` — running one step +at a time with full control over inputs and outputs. Alternatively, `winml build` +wraps all of those stages into a single command driven by a `WinMLBuildConfig` +JSON file. + +Understanding when to reach for a primitive versus the pipeline wrapper is the +central workflow decision in winml-cli. Both paths produce the same artifacts; +the difference is in repeatability, convenience, and how much you need to inspect +or vary individual stages. + +## The primitive commands + +Each primitive command corresponds to one stage of the pipeline described in +[How winml-cli works](how-it-works.md). 
They run in order, each producing an ONNX +file that the next stage consumes: + +- **`winml export`** — loads a Hugging Face model, traces it with PyTorch and the + Optimum exporter, and writes a portable float32 ONNX file with no EP-specific + nodes. +- **`winml analyze`** — runs compatibility and runtime checks on the exported ONNX + graph, detecting unsupported operators, QDQ issues, and device-specific + constraints before further pipeline stages. +- **`winml optimize`** — applies graph transformations (operator fusion, constant + folding, graph pruning) and runs an autoconf loop to maximize EP-compatible + coverage. +- **`winml quantize`** — inserts QDQ nodes using calibration data, reducing weight + and activation types to lower precision (for example, int8) for efficient + inference. +- **`winml compile`** — invokes an EP-specific compiler (for example, QNN for NPU + targets) to embed a pre-compiled binary cache in the ONNX graph as an EPContext + node. +- **`winml perf`** — benchmarks latency and throughput against a Windows ML + session; does not modify the model. +- **`winml eval`** — evaluates task-specific accuracy on a dataset; does not + modify the model. + +You can enter the pipeline at any stage. If you already have an optimized ONNX +file, pass it directly to `winml quantize` without re-exporting. Each command +writes its output to a path you specify, so all intermediate artifacts are +preserved for inspection. + +## The pipeline wrapper + +`winml build` orchestrates all of the above stages in order from a single +`WinMLBuildConfig` JSON file: + +```bash +winml build -c config.json -m microsoft/resnet-50 -o output/ +``` + +The config file tells `winml build` which stages to run and how to configure them. +Setting the `quant` or `compile` section to `null` in the JSON skips that stage; +passing `--no-quant`, `--no-compile`, or `--no-optimize` on the command line +achieves the same effect at runtime without editing the file. 
+ +When the model argument points to an existing ONNX file instead of a Hugging Face +ID, `winml build` detects this and skips the export stage, running +analyze → optimize → quantize → compile directly. This mirrors how each primitive +command handles the same case. + +`winml build` also accepts `--use-cache` in place of `-o`/`--output-dir`, routing +artifacts to the winml-cli global cache at `~/.cache/winml/` instead of a local +directory. Use `--rebuild` to force a clean re-run even when cached artifacts +already exist. + +## When to choose which + +**Use primitive commands when:** + +- You are learning the pipeline and want to observe each stage's output in + isolation. +- You are debugging a specific stage — for example, inspecting the optimized graph + before quantization, or testing a quantized model before compiling it. +- You need a one-off variation that does not warrant a versioned config, such as + trying a different opset or a different calibration sample count. +- You are integrating winml-cli output into a larger script that already manages + intermediate files. + +**Use `winml build` when:** + +- You are targeting production or CI: a single config file captures the full + pipeline reproducibly and can be committed alongside the code that uses the + model. +- You want to share the exact build recipe with a teammate or reproduce it later + without reconstructing the sequence of primitive flags. +- You need the autoconf loop to propagate optimization decisions across stages, + which only `winml build` coordinates end-to-end. +- You want stage-skipping to be declarative (`quant: null` in the config) rather + than remembered flag-by-flag across invocations. + +The two approaches are not exclusive. 
A common pattern is to prototype with +primitives — iterating on `winml optimize` and `winml quantize` individually to +tune fusion flags and calibration — and then encode the final settings into a +`WinMLBuildConfig` for repeatable production builds via `winml build`. + +## See also + +- [How winml-cli works](how-it-works.md) — pipeline stage order and internal + architecture +- [Config and build](config-and-build.md) — generating and versioning a + `WinMLBuildConfig` +- [winml build command reference](../commands/build.md) +- [Hugging Face Model to NPU tutorial](../tutorials/npu-convnext.md) — worked example + using primitive commands end-to-end diff --git a/docs/concepts/quantization.md b/docs/concepts/quantization.md new file mode 100644 index 000000000..4b5dc88de --- /dev/null +++ b/docs/concepts/quantization.md @@ -0,0 +1,65 @@ +# Datatype and Quantization + +Every ONNX tensor carries data in a specific numeric type — `float32`, `float16`, `int8`, `int16` — and every winml-cli pipeline makes deliberate choices about which type to use where. This page covers both halves of that decision: the **datatype family** winml-cli understands, and the **quantization** workflow that converts a model from one datatype to another to shrink it and run it faster on integer-native hardware. + +Quantization is the headline use of datatypes in winml-cli. By replacing `float32` weights and activations with `int8` or mixed precisions, you typically get a 2–4× smaller model artifact and a 2–8× latency speedup on NPU hardware. The trade-off is a potential reduction in model accuracy, the degree of which depends on the precision chosen and the sensitivity of the model. + +## Datatypes + +winml-cli exposes a precision shorthand on the `--precision` flag that encodes the weight/activation dtype pair as a single string. The table below lists every precision from `_NAMED_PRECISIONS` in `config/precision.py`, together with the resolved quantization types. 
Float precisions (`fp32`, `fp16`) carry no quantization types because weights and activations remain in floating point throughout. + +| Precision | Weight dtype | Activation dtype | Notes | +|-----------|-------------|-----------------|-------| +| `auto` | device-dependent | device-dependent | Resolves to `w8a16` (NPU), `fp16` (GPU/CPU) at runtime | +| `fp32` | float32 | float32 | No quantization; baseline accuracy | +| `fp16` | float16 | float16 | Half-precision float; no QDQ nodes inserted | +| `int8` | uint8 | uint8 | Static quantization; valid for QNN EP | +| `int16` | int16 | uint16 | Higher-accuracy quantization; larger model than int8 | +| `w8a8` | uint8 | uint8 | Equivalent to `int8`; explicit mixed-precision notation | +| `w8a16` | uint8 | uint16 | Mixed: compact weights, wider activations for accuracy | +| `w4a16` | n/a | n/a | **Not supported.** Rejected at validation — `is_quantized_precision("w4a16")` returns `False` because 4-bit weight types are absent from `_BITS_TO_WEIGHT_TYPE` in `precision.py`. The string is not a recognized precision. | + +The `--weight-type` and `--activation-type` flags on `winml quantize` accept `uint8`, `int8`, `uint16`, or `int16` and override whatever the `--precision` shorthand would have resolved. This is useful when you need an unsigned weight type for QNN compatibility but a signed activation type for a specific operator constraint. See [Weight and Activation](weight-and-activation.md) for why the two need separate flags in the first place. + +## How quantization works in winml-cli + +winml-cli applies quantization by inserting **QDQ** (Quantize/Dequantize) nodes into the ONNX graph. The resulting file is a standard ONNX model that any ONNX Runtime execution provider can consume and optimize for its target hardware — the EP reads the QDQ pattern and fuses adjacent operations into true integer kernels. 
+ +### Calibration + +Static quantization — the kind winml-cli applies — requires a calibration pass before inserting QDQ nodes. During calibration, a small set of representative inputs runs through the original floating-point model so that winml-cli can observe the actual range of values each tensor takes at runtime. Those observed ranges are then used to choose the scale and zero-point constants baked into the QDQ nodes. + +The `--samples` flag controls how many calibration inputs are used (default: `10`). More samples generally produce better range estimates but take longer. The `--method` flag selects the algorithm used to summarize the observed ranges: + +- `minmax` (default) — uses the absolute minimum and maximum observed values. Fast and predictable; can be sensitive to outliers. +- `entropy` — minimizes the KL-divergence between the original and quantized distribution. Often yields better accuracy on models with heavy-tailed activation distributions. +- `percentile` — clips a small fraction of extreme values before computing the range. A practical middle ground when outliers are present but entropy calibration is slow. + +Example using entropy calibration with more samples: + +```bash +winml quantize -m model.onnx --precision int8 --samples 128 --method entropy +``` + +### The QDQ pattern + +The QDQ pattern is the standard ONNX representation for static quantization. winml-cli wraps the inputs and outputs of quantizable operators with pairs of `QuantizeLinear` and `DequantizeLinear` nodes. At the graph level the model still operates in floating-point; the QDQ nodes encode the scale and zero-point metadata that a runtime needs to fuse adjacent operations into true integer kernels. + +When the model runs under ONNX Runtime, the execution provider — whether CPU, DirectML, or a dedicated NPU EP — reads those QDQ patterns and performs its own graph fusion. 
This means the EP is free to apply hardware-specific optimizations without winml-cli needing to know anything about the target device's internal ISA or operator library. The QDQ model produced by `winml quantize` is a single portable artifact that can be deployed to any EP that supports integer execution. + +## When quantization is lossy + +Not all precision choices carry equal accuracy risk: + +- `fp16` is usually lossless in practice. Rounding errors relative to `fp32` are small enough that most models show no measurable accuracy difference. +- `int8` and `int16` are inherently lossy. Compressing a 32-bit float into 8 or 16 bits discards information, and the magnitude of accuracy degradation depends on how well the calibration data represents the deployment distribution. +- Compound precisions like `w8a16` reduce the risk compared to full `int8` by preserving more precision in activations, but they are still lossy relative to `fp32`. + +Always validate accuracy after quantizing an integer-precision model. Run `winml eval` on a representative dataset and compare the metrics against the original floating-point baseline before shipping the quantized artifact. + +## See also + +- [Weight and Activation](weight-and-activation.md) +- [EP and Device](eps-and-devices.md) +- [quantize command reference](../commands/quantize.md) +- [eval command reference](../commands/eval.md) diff --git a/docs/concepts/weight-and-activation.md b/docs/concepts/weight-and-activation.md new file mode 100644 index 000000000..5a3139912 --- /dev/null +++ b/docs/concepts/weight-and-activation.md @@ -0,0 +1,32 @@ +# Weight and Activation + +Every neural network model stores two kinds of numeric tensors that matter for deployment: **weights**, the static parameters baked in at training time, and **activations**, the intermediate values that flow through the graph at every inference call. 
Understanding the distinction is the key to reading winml-cli's precision flags, deciding when quantization is safe, and knowing why a model that runs fine on one execution provider may stall or degrade on another. + +## Weights are static + +Weights are the trained parameters of the model: convolution kernels, linear projection matrices, attention weights, embedding tables, bias vectors. They are fixed at the moment the model is exported and stay constant for every inference call. Because they are static, their quantization parameters — the scale and zero-point used to compress them from fp32 to int8 — can be computed once, offline, using calibration data. `winml quantize` does exactly that: it observes the weight distributions in your exported ONNX and bakes the per-tensor scale/zero-point into the QDQ nodes that wrap the weights. + +In ONNX terms, weights are stored as **initializers** inside the graph. The runtime treats them as graph inputs that are always pre-supplied; you do not pass weights to a session at inference time, the way you pass an image tensor or a text prompt. + +## Activations are dynamic + +Activations are the intermediate results that flow through the graph during inference: the output of every matrix multiply, every layer norm, every attention softmax. Unlike weights, activations are regenerated on every forward pass and depend entirely on the input data. winml-cli cannot pre-compute their quantization parameters offline — instead, calibration runs a small set of representative inputs through the model and observes the actual ranges each activation tensor takes. Those observed ranges become the scale/zero-point baked into QDQ nodes around each activation. + +This is why calibration data matters. If the calibration set fails to represent the inputs you will see in production, the per-activation ranges will be wrong and the quantized model will lose more accuracy than necessary on real traffic. 
+ +## Why they need separate flags + +The `--weight-type` and `--activation-type` flags on `winml quantize` exist because the optimal bit-width for weights is not necessarily the optimal bit-width for activations: + +- **Wider activation types** (int16 vs int8) reduce accuracy loss at the cost of more memory bandwidth. Useful when activations have heavy-tailed distributions that quantize poorly at 8 bits. +- **Narrower weight types** compress the static footprint more aggressively. Useful when the model is memory-bound and accuracy headroom exists. +- **Execution providers diverge** along this boundary too. QNN on NPU pairs uint8 weights with uint8 or uint16 activations. DirectML on GPU can run float16 throughout. The CPU EP accepts almost any combination. + +The compound precision shorthand `w8a16` (8-bit weights, 16-bit activations) reflects this asymmetry directly: weights and activations get different bit-widths in one config string. For the full precision family and how each maps to weight/activation dtypes, see [Datatype and Quantization](quantization.md). + +## See also + +- [Datatype and Quantization](quantization.md) +- [EP and Device](eps-and-devices.md) +- [quantize command](../commands/quantize.md) +- [Graph and IR](graphs-and-ir.md) diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..21e159916 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,32 @@ +# Contributing + +For the full contributing guide — development setup, coding conventions, testing, PR checklist, and CLA — see [`CONTRIBUTING.md`](https://github.com/microsoft/winml-cli/blob/main/CONTRIBUTING.md) in the repository root. + +## Quick Reference + +```bash +# Clone and set up +git clone https://github.com/microsoft/winml-cli.git +cd winml-cli +uv sync --extra dev +uv run pre-commit install + +# Download runtime check rules (required for `winml analyze`) +gh release download --repo microsoft/winml-cli --pattern 'rules-v*.zip' --dir . 
+Expand-Archive -Path .\rules-v*.zip -DestinationPath src\winml\modelkit\analyze\rules\runtime_check_rules -Force + +# Run tests +uv run pytest tests/ -m "not e2e and not npu and not gpu" + +# Lint and format +uv run ruff check src/ tests/ --fix +uv run ruff format src/ tests/ + +# Docs preview +uv run mkdocs serve +``` + +## See also + +- [Installation](getting-started/installation.md) — user-facing setup +- [Commands](commands/overview.md) — CLI reference diff --git a/docs/getting-started/agent-skill.md b/docs/getting-started/agent-skill.md new file mode 100644 index 000000000..6d09c222d --- /dev/null +++ b/docs/getting-started/agent-skill.md @@ -0,0 +1,74 @@ +# Agent Skill + +winml-cli ships a **Copilot Skill** (`use-winml-cli`) that lets AI coding agents +drive the entire model-building pipeline on your behalf. When a coding agent has +this skill attached, it can inspect models, generate configs, run builds, and +interpret results — without you having to remember exact flags or stage ordering. + +--- + +## What the skill provides + +The skill teaches the agent: + +| Capability | What the agent learns | +|---|---| +| **Pipeline shape** | The stage order (`inspect → export → analyze → optimize → quantize → compile → perf`) and when to enter mid-pipeline | +| **Flag discovery** | Always run `winml --help` before quoting a command — never fabricate flags | +| **Output mapping** | Which command's `-o` produces the artifact the user actually needs | +| **Scope awareness** | Which model architectures are supported (classic DL) vs. out-of-scope (LLMs, diffusion) | +| **Hardware detection** | Use `winml sys --list-ep` to confirm what's available before targeting an EP | +| **Two paths** | When to use primitives (debugging, exploring) vs. 
config + build (production, CI) | + +--- + +## How to use it + +### With GitHub Copilot Coding Agent + +To make the [Copilot Coding Agent](https://docs.github.com/en/copilot/how-tos/copilot-on-github/use-copilot-agents/overview) +(the cloud agent that creates PRs) follow the skill's guidance, reference it in +`.github/copilot-instructions.md`. The Coding Agent reads that file automatically +when working on this repository. + +### With other AI agents + +For agents that support custom instructions (e.g., Copilot Extensions, Claude, +ChatGPT with file uploads, or custom MCP tool servers), attach the skill file +as context: + +``` +skills/use-winml-cli/SKILL.md +``` + +You can copy the file contents into your agent's system prompt, upload it as a +reference document, or include it in a `.github/copilot-instructions.md` for +VS Code Copilot Chat. The skill uses standard markdown with YAML front-matter — +any agent that accepts text context can benefit from it. + +--- + +## Skill location + +``` +winml-cli/ +└── skills/ + └── use-winml-cli/ + └── SKILL.md ← the skill definition +``` + +--- + +## Example agent interaction + +``` +User: Can I run ConvNeXt on my Snapdragon X Elite NPU? + +Agent (with skill): +1. Runs `winml sys --list-ep` → confirms QNNExecutionProvider is registered +2. Runs `winml inspect -m microsoft/convnext-tiny-224` → confirms supported +3. Runs `winml config --onnx ... -d npu -o config.json` +4. Runs `winml build -c config.json -m microsoft/convnext-tiny-224 -o output/` +5. Runs `winml perf -m output/model.onnx -d npu --monitor` +6. 
Reports latency + NPU utilization to user +``` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 000000000..68c35f494 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,74 @@ +# Installation + +## Prerequisites + +| Component | Details | +|---|---| +| Windows | Windows 11 24H2 or later (required for NPU support) | +| Hardware | Device with CPU, GPU, or NPU | +| Python | 3.11 | +| Package manager | [`uv`](https://github.com/astral-sh/uv) | +| Version control | `git` | + +!!! note "No NPU?" + You can follow most of these docs without NPU hardware. All winml-cli commands accept `--device auto` and fall back to CPU or DirectML automatically. The tutorials document explicit CPU fallback paths. + +## Install + +```bash +uv python install 3.11 +uv pip install winml-cli +``` + +`uv python install 3.11` downloads and pins the exact Python version the project requires. `uv pip install winml-cli` installs the latest release from PyPI into a managed environment. No separate venv activation is needed. + +!!! tip "Install from source (for development)" + If you want to contribute or run the latest unreleased code: + + ```bash + git clone https://github.com/microsoft/winml-cli.git + cd winml-cli + uv sync + ``` + +## Verify + +```bash +winml sys +``` + +Expected output (abbreviated): + +```text ++------------------------------------+ +| winml-cli System Information | ++------------------------------------+ + +Environment + Python Version 3.11.x + OS Windows 11 + Machine AMD64 + +ML Libraries + Library Version Status + torch 2.x.x OK + onnx 1.x.x OK + +Available Devices (priority order) + #1 NPU ... + #2 GPU ... + #3 CPU ... + +Available Execution Providers + QNNExecutionProvider -> NPU + DmlExecutionProvider -> GPU + CPUExecutionProvider -> CPU +``` + +This command enumerates available compute devices and execution providers on your machine. 
If an expected device or execution provider is missing, `winml sys` is the right place to diagnose it. See [winml sys](../commands/sys.md) for the full flag reference and troubleshooting tips. + +## Next steps + +- **[Quickstart](quickstart.md)** — export your first model in 5 minutes. +- **[End-to-End Tour](quickstart.md)** — full pipeline targeting whatever hardware you have (NPU / GPU / CPU). +- **[How winml-cli Works](../concepts/how-it-works.md)** — the mental model. diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md new file mode 100644 index 000000000..4a8e5ec8f --- /dev/null +++ b/docs/getting-started/quickstart.md @@ -0,0 +1,85 @@ +# Quickstart + +This guide walks you through verifying your install, inspecting a model from Hugging Face, running a full build pipeline to produce an optimized ONNX, and benchmarking the model on your device. Estimated time: 5 minutes. + +## Verify the install + +Run the following command to enumerate available devices and execution providers +on your machine: + +```bash +uv run winml sys --list-device --list-ep +``` + +`--list-device` and `--list-ep` print only the hardware and EP inventory. If the command exits without error, your winml-cli install is +ready. See [`winml sys`](../commands/sys.md) for the full flag reference. + +## Inspect the model + +Before downloading any models, confirm that winml-cli recognises the model: + +```bash +uv run winml inspect -m microsoft/resnet-50 +``` + +```text ++--------------------------- microsoft/resnet-50 ---------------------------+ +| Task image-classification | +| Model Class ResNetForImageClassification | +| Exporter OptimumExporter | +| WinML Class WinMLImageClassificationModel | +| Status Supported | ++---------------------------------------------------------------------------+ +``` + +!!! 
note "What just happened" + `winml inspect` read only the model's `config.json` from Hugging Face Hub — + no weights downloaded — and confirmed that `microsoft/resnet-50` maps to a + supported task, a known model class, and a compatible ONNX exporter. + +!!! tip + Always inspect before build to catch unsupported architectures early. + +## Build the model + +```bash +uv run winml build -m microsoft/resnet-50 -o resnet_out/ --no-quant +``` + +`winml build` runs all pipeline steps in sequence — export, optimize, quantize (when an NPU is detected on your device), and compile (disabled by default). You can start a model build without a config file, or provide one to configure each step in the sequence (see [`winml config`](../commands/config.md) to customize). +All intermediate artifacts land in `resnet_out/`, so you can reuse any stage independently. + +After a successful build, you will find the following outputs in `resnet_out/`: + +- **A standard ONNX file for each completed stage** — load, inspect, or pass any of these to a downstream tool independently. +- **`analyze_result.json`** — detailed model compatibility insights for each Windows ML EP, including supported, partially supported, and unsupported operators, detected optimization patterns, and recommended optimization workflows. +- **A declarative `winml_build_config` file** — automatically generated after the build step to capture the full workflow end-to-end. + +!!! tip "CI/CD integration" + The declarative `winml_build_config` makes it easy to integrate the model build workflow into CI/CD pipelines — the same file drives reproducible, portable build workflows across environments. + +!!! note "--no-quant" + `--no-quant` tells the pipeline to skip the quantize stage. Quantization is a valuable step for NPU targets, but skipping it here lets the output model run on any device. + +!!! note "Why compile is disabled by default" + Compilation embeds a pre-compiled binary optimized for your specific device. 
Skip this step to keep the ONNX output portable — it will run on any device using just-in-time (JIT) compilation. + +## Benchmark the model + +```bash +uv run winml perf -m resnet_out/model.onnx --device auto --iterations 50 --monitor +``` + +`--device auto` lets the CLI resolve the best available device on your machine — NPU first, then GPU, then CPU. + +## What's next + +- **[How winml-cli Works](../concepts/how-it-works.md)** — understand what each command does under the hood. +- **[BERT sample](../samples/bert-config-build.md)** — see the config + build + perf workflow in detail with a representative model. + +## See also + +- [`winml build`](../commands/build.md) +- [`winml inspect`](../commands/inspect.md) +- [`winml perf`](../commands/perf.md) +- [`winml sys`](../commands/sys.md) diff --git a/docs/getting-started/ui-quickstart.md b/docs/getting-started/ui-quickstart.md new file mode 100644 index 000000000..e8ac68d6c --- /dev/null +++ b/docs/getting-started/ui-quickstart.md @@ -0,0 +1,13 @@ +# Try Windows ML CLI with a UI + +If you prefer a graphical interface, you can use the **Foundry Toolkit** extension for VS Code to run Windows ML CLI model conversion without typing commands. + +## Quick reference + +1. **Install [Visual Studio Code](https://code.visualstudio.com/)** +2. **Install the Foundry Toolkit extension** — search for `Foundry Toolkit` in the VS Code Extensions view +3. **Open the Model Conversion tool** — in the Foundry Toolkit panel, select **Model Conversion** +4. **Choose your model** — pick a model from Hugging Face, provide a local path, or select from the built-in model catalog filtered by Windows ML CLI +5. **Run the build** — the extension invokes Windows ML CLI and streams the output to the VS Code terminal + +For a full walkthrough, see [Build with Windows ML CLI (Preview)](https://code.visualstudio.com/docs/intelligentapps/modelconversion#_build-with-windows-ml-cli-preview) in the VS Code documentation. 
diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..6f639b662 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,36 @@ +# winml-cli + +Windows ML CLI is a command line tool for building portable, performant, and high-quality AI models for Windows ML. It takes you from a source model — whether from Hugging Face or your own pipeline — to a hardware-optimized artifact in a reproducible workflow. + +Purpose-built for Windows hardware diversity, the CLI handles conversion, graph optimization, and compilation across AMD, Intel, NVIDIA, and Qualcomm targets. The CLI fits naturally into CI/CD pipelines so teams can validate and ship models easily. + +## What you can do + +- **Build once, run across hardware.** Compose your own workflow from primitive commands (`export`, `analyze`, `optimize`, `quantize`, `compile`), or use an auto-generated config with `winml build` — both produce portable models that run across hardware. +- **Drill into the details.** Get deep insights into operator compatibility, shape mismatches, graph optimizations, and EP-aware tuning at any stage of the pipeline. +- **AI-ready.** CLI-driven tools with built-in skills that work well with mainstream coding agents. + +## What you get out of the box + +- **All Windows ML EPs supported.** Every [supported execution provider](concepts/eps-and-devices.md#eps-winml-cli-supports) is available behind the same commands. +- **Curated model catalog.** A [verified set of models](reference/supported-models.md) that run across all Windows ML EPs — a reliable starting point. +- **Bring your own ONNX.** Not only for converting from PyTorch — bring an [existing ONNX model](tutorials/build-from-onnx.md) to get operator-compatibility insights and optimize it based on the analysis. + +## Where to start + +- **[Installation](getting-started/installation.md)** — get the `winml` CLI running locally. +- **[Quickstart](getting-started/quickstart.md)** — export a Hugging Face model in five minutes. 
+ +## Learn the model + +- **[How winml-cli Works](concepts/how-it-works.md)** — the pipeline from a PyTorch model to an EP-compiled artifact. +- **[Commands](commands/overview.md)** — reference for all 12 `winml` subcommands. +- **[Samples](samples/bert-config-build.md)** — walkthroughs for BERT and CLIP. + +## Repository access + +To request access to the Windows ML CLI repository, visit [aka.ms/winml-cli](https://aka.ms/winml-cli). + +## License + +MIT. See [LICENSE](https://github.com/microsoft/winml-cli/blob/main/LICENSE.txt). diff --git a/docs/reference/index.md b/docs/reference/index.md new file mode 100644 index 000000000..3c57085b3 --- /dev/null +++ b/docs/reference/index.md @@ -0,0 +1,204 @@ +# Reference — Config Schema + +This page documents the full schema for `WinMLBuildConfig`, the JSON configuration +file that drives the winml-cli pipeline. Generate a config with +`winml config`, then pass it to any command with `-c config.json`. + +The config is accepted by **all pipeline commands** — not just `winml build`. For +example, `winml export -c config.json`, `winml quantize -c config.json`, and +`winml compile -c config.json` each read the relevant section of the same config +file. This lets you use a single config as the source of truth across all stages. + +## Top-Level Structure + +```json +{ + "loader": { ... }, + "export": { ... }, + "optim": { ... }, + "quant": { ... }, + "compile": { ... }, + "eval": { ... }, + "auto": true +} +``` + +Setting `quant` or `compile` to `null` skips that pipeline stage entirely. +Setting `auto` to `true` (default) lets winml-cli auto-configure downstream +stages based on the target device and precision. + +--- + +## `loader` — Model Loading + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `task` | `str \| null` | `null` | HuggingFace task (e.g., `image-classification`). Auto-detected if omitted. 
| +| `model_class` | `str \| null` | `null` | Override model class (e.g., `AutoModelForCTC`). | +| `model_type` | `str \| null` | `null` | HuggingFace model type (e.g., `bert`, `resnet`). | +| `module_path` | `str \| null` | `null` | Dotted path to a submodule for targeted export. | +| `user_script` | `str \| null` | `null` | Path to custom model class script. | +| `trust_remote_code` | `bool` | `false` | Trust remote code from HuggingFace. | + +--- + +## `export` — ONNX Export + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `opset_version` | `int` | `17` | ONNX opset version. | +| `batch_size` | `int` | `1` | Static batch size. Use 1 for QNN compatibility. | +| `input_tensors` | `list[InputTensorSpec] \| null` | `null` | Input tensor specifications. Auto-inferred if omitted. | +| `output_tensors` | `list[OutputTensorSpec] \| null` | `null` | Output tensor specifications. | +| `dynamic_axes` | `dict \| null` | `null` | Dynamic axes mapping. ⚠️ Breaks MatMulAddFusion on QNN. | +| `export_params` | `bool` | `true` | Include model parameters in ONNX. | +| `do_constant_folding` | `bool` | `true` | Fold constants during export. | +| `verbose` | `bool` | `false` | Verbose export logging. | +| `dynamo` | `bool` | `false` | Use PyTorch 2.x Dynamo exporter. | +| `enable_hierarchy_tags` | `bool` | `true` | Add module hierarchy tags to ONNX nodes. | +| `clean_onnx` | `bool` | `false` | Strip hierarchy tags after export. | +| `hierarchy_tag_format` | `"full" \| "module_only"` | `"full"` | Tag detail level. | + +**InputTensorSpec:** + +| Field | Type | Description | +|-------|------|-------------| +| `name` | `str \| null` | Tensor name (e.g., `pixel_values`). | +| `dtype` | `str \| null` | Data type (e.g., `float32`, `int64`). | +| `shape` | `list[int] \| null` | Tensor shape (e.g., `[1, 3, 224, 224]`). | +| `value_range` | `[float, float] \| null` | Min/max for dummy tensor generation. 
| + +--- + +## `optim` — Graph Optimization + +A dictionary of boolean fusion flags. All default to `false` unless auto-configured. + +| Field | Type | Description | +|-------|------|-------------| +| `gelu_fusion` | `bool` | Fuse GeLU activation patterns. | +| `layer_norm_fusion` | `bool` | Fuse LayerNorm patterns. | +| `matmul_add_fusion` | `bool` | Fuse MatMul + Add (enables BiasGelu). | + +Additional fusion flags can be added as key-value pairs. + +--- + +## `quant` — Quantization + +Set to `null` to skip quantization. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `mode` | `"qdq" \| "static" \| "dynamic"` | `"qdq"` | Quantization mode. | +| `weight_type` | `"uint8" \| "int8" \| "uint16" \| "int16"` | `"uint8"` | Weight data type. | +| `activation_type` | `"uint8" \| "int8" \| "uint16" \| "int16"` | `"uint8"` | Activation data type. | +| `calibration_method` | `"minmax" \| "entropy" \| "percentile"` | `"minmax"` | Scale computation method. | +| `samples` | `int` | `10` | Number of calibration samples. | +| `per_channel` | `bool` | `false` | Per-channel quantization. | +| `symmetric` | `bool` | `false` | Symmetric quantization. | +| `task` | `str \| null` | `null` | Task for dataset-aware calibration. | +| `model_name` | `str \| null` | `null` | Model ID for calibration dataset resolution. | +| `dataset_name` | `str \| null` | `null` | Override calibration dataset. | +| `distribution` | `str` | `"uniform"` | Random distribution for dummy data. | +| `seed` | `int \| null` | `null` | Random seed for reproducibility. | +| `calibration_load_path` | `str \| null` | `null` | Load pre-computed calibration scales. | +| `calibration_save_path` | `str \| null` | `null` | Save calibration scales. | +| `op_types_to_quantize` | `list[str] \| null` | `null` | Operator types to quantize (all if null). | +| `nodes_to_exclude` | `list[str] \| null` | `null` | Node names to skip. 
| + +--- + +## `compile` — EP Compilation + +Set to `null` to skip compilation. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `ep_config.provider` | `str` | `"qnn"` | EP alias: `qnn`, `cpu`, `dml`, `openvino`, `tensorrt`, `vitisai`, `migraphx`. | +| `ep_config.device` | `str` | `"auto"` | Target device: `npu`, `gpu`, `cpu`, `auto`. | +| `ep_config.enable_ep_context` | `bool` | `true` | Generate EPContext model. | +| `ep_config.embed_context` | `bool` | `false` | Embed binary in ONNX (true) or external .bin (false). | +| `ep_config.compiler` | `str` | `"ort"` | Compiler backend: `ort` or `qairt`. | +| `ep_config.provider_options` | `dict` | `{}` | EP-specific options. | +| `ep_config.qnn_sdk_root` | `str \| null` | `null` | QNN SDK path for QAIRT compiler backend. | +| `validate` | `bool` | `true` | Validate compiled model. | +| `verbose` | `bool` | `false` | Verbose compilation logging. | + +--- + +## `eval` — Evaluation + +Set to `null` (default) to skip evaluation. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `model_id` | `str \| null` | `null` | HuggingFace model ID for config resolution. | +| `model_path` | `str \| dict[str, str] \| null` | `null` | Path to .onnx file, or a `{role: path}` dict for composite models. | +| `task` | `str \| null` | `null` | Task type. | +| `device` | `str` | `"auto"` | Inference device. | +| `precision` | `str` | `"auto"` | Precision (`fp32`, `fp16`, `w8a16`, etc.). | +| `ep` | `str \| null` | `null` | EP override. | +| `dataset.path` | `str \| null` | `null` | HuggingFace dataset path. | +| `dataset.name` | `str \| null` | `null` | Dataset config name. | +| `dataset.split` | `str` | `"validation"` | Dataset split. | +| `dataset.samples` | `int` | `100` | Evaluation sample count. | +| `dataset.shuffle` | `bool` | `true` | Shuffle before sampling. | +| `dataset.seed` | `int` | `42` | Random seed. 
| +| `output_path` | `str \| null` | `null` | Path for JSON results output. | + +--- + +## Example: Full Config + +```json +{ + "loader": { + "task": "image-classification", + "model_type": "resnet" + }, + "export": { + "opset_version": 17, + "batch_size": 1 + }, + "optim": { + "gelu_fusion": true, + "layer_norm_fusion": true, + "matmul_add_fusion": true + }, + "quant": { + "mode": "qdq", + "weight_type": "uint8", + "activation_type": "uint8", + "samples": 10, + "calibration_method": "minmax" + }, + "compile": { + "ep_config": { + "provider": "qnn", + "device": "npu", + "enable_ep_context": true, + "embed_context": false + }, + "validate": true + }, + "auto": true +} +``` + +### The `auto` field + +The top-level `"auto"` field (default: `true`) controls whether the build pipeline runs the **autoconf loop** — an iterative analyze → discover → re-optimize cycle that automatically detects which additional graph optimizations the model needs for the target EP. + +| Value | Behavior | +|-------|----------| +| `true` (default) | After initial optimization, the analyzer inspects the graph for unsupported or sub-optimal nodes and proposes additional optimization flags. The pipeline re-optimizes using the discovered flags and repeats (up to `--max-optim-iterations`, default 3). The final optimization result depends on what the analyzer discovers at runtime, so **outputs may vary** if the model or EP support changes between runs. | +| `false` | The pipeline applies only the explicit `optim` flags from the config — no autoconf discovery, no re-optimization loop. Builds are **fully deterministic** given the same config and input model. Use this for reproducible CI builds or when you have already tuned the optimization flags manually. | + +When `auto` is `true` and the autoconf loop discovers additional flags, the final persisted config (written to the output directory) includes the merged result so you can inspect what was discovered. 
+ +## See also + +- [winml config](../commands/config.md) — generate a config interactively +- [winml build](../commands/build.md) — run the pipeline with a config +- [Config and build](../concepts/config-and-build.md) — conceptual overview diff --git a/docs/reference/output-layout.md b/docs/reference/output-layout.md new file mode 100644 index 000000000..d2aee17cd --- /dev/null +++ b/docs/reference/output-layout.md @@ -0,0 +1,237 @@ +# Output Layout + +When you run `winml build`, the tool writes all artifacts to the output +directory. This page documents what each file is and which ones you need +for deployment. + +--- + +## Directory Structure + +After a full pipeline run (export → optimize → quantize → compile): + +```text +output/ +├── model.onnx ← FINAL artifact (deploy this) +├── model.onnx.data ← External weights (if model ≥ 100 MiB) +├── winml_build_config.json ← Persisted build config +├── analyze_result.json ← Static analysis (EP compatibility) +├── build_manifest.json ← Build provenance (Python API only) +├── export_htp_metadata.json ← HTP export metadata (hierarchy info) +├── export.onnx ← Intermediate: raw ONNX export +├── export.onnx.data +├── optimized.onnx ← Intermediate: after graph optimization +├── optimized.onnx.data +├── quantized.onnx ← Intermediate: after QDQ insertion +├── quantized.onnx.data +├── compiled.onnx ← Intermediate: after EP compilation +└── compiled.onnx.data +``` + +--- + +## File Categories + +### Final Artifacts (Keep for Deployment) + +| File | Purpose | +|------|---------| +| `model.onnx` | The deployment-ready model. Always present. | +| `model.onnx.data` | External weight data (only if model ≥ 100 MiB). Must stay alongside `model.onnx`. | +| `winml_build_config.json` | The complete pipeline config used for this build (includes auto-discovered optimization flags). 
This file is a **reproducible pipeline specification** — check it into version control or feed it directly to `winml build -c` in a CI/CD pipeline to guarantee identical model processing across machines and runs (set `"auto": false` for fully deterministic builds). | +| `analyze_result.json` | Static analysis output: EP compatibility, operator classification, detected patterns. | +| `build_manifest.json` | Build provenance with stage timings. Only generated via the Python API (`build_hf_model`/`build_onnx_model`). | +| `export_htp_metadata.json` | HTP export metadata: module hierarchy, tracing info, tagging coverage. | + +### Intermediate Files (Can Delete After Build) + +| File | Stage | Contents | +|------|-------|----------| +| `export.onnx` | Export | Raw PyTorch → ONNX conversion (float32) | +| `optimized.onnx` | Optimize | Graph with fused operators, shape inference applied | +| `quantized.onnx` | Quantize | QDQ nodes inserted, calibrated scales | +| `compiled.onnx` | Compile | EPContext binary embedded or sidecar | + +Each intermediate has a corresponding `.onnx.data` file if the model is at least +100 MiB. + +--- + +## What Gets Written at Each Stage + +### Export only (`winml export`) + +```text +output/ +├── export.onnx +└── export.onnx.data (if ≥ 100 MiB) +``` + +### Optimize only (`winml optimize`) + +```text +output/ +├── optimized.onnx +└── optimized.onnx.data +``` + +### Full build (`winml build`) + +Each stage writes its intermediate file, and `model.onnx` is a copy of the last +successful stage output. If you skip quantization (`--no-quant`), the final +model is a copy of `optimized.onnx`. If you skip compilation too, it's still +a copy of `optimized.onnx`. + +--- + +## External Data + +Models of **100 MiB** or larger store weights in a separate `.onnx.data` file. +Both files must be kept together — the `.onnx` file contains a reference to the +data file by name. 
+ +| Model Size | Files | +|-----------|-------| +| < 100 MiB | `model.onnx` only (weights embedded) | +| ≥ 100 MiB | `model.onnx` + `model.onnx.data` | + +!!! warning + If you move `model.onnx`, always move `model.onnx.data` alongside it. + The ONNX file references the data file by relative path. + +--- + +## Analyzer Result + +`analyze_result.json` contains the static analysis output from the build pipeline's +analyze stage. It reports EP compatibility and operator classification: + +```json +{ + "analysis_timestamp": "2026-06-04T19:45:17.496169", + "metadata": { + "model_path": "iter.onnx", + "opset_version": 17, + "producer_name": "pytorch", + "producer_version": "2.12.0", + "total_operators": 122, + "operator_counts": { + "Conv": 53, + "Relu": 49, + "MaxPool": 1, + "Add": 16, + "GlobalAveragePool": 1, + "Flatten": 1, + "Gemm": 1 + }, + "unique_operator_types": 7, + "detected_pattern_count": {} + }, + "results": [ + { + "ihv_type": "Microsoft", + "ep_type": "CPUExecutionProvider", + "device_type": "cpu", + "runtime_support": false, + "has_errors": false, + "has_warnings": false, + "classification": { + "supported": [], + "partial": [], + "unsupported": [], + "unknown": [ + "OP/ai.onnx/Conv", + "OP/ai.onnx/Relu", + "OP/ai.onnx/MaxPool", + "OP/ai.onnx/Add", + "OP/ai.onnx/GlobalAveragePool", + "OP/ai.onnx/Flatten", + "OP/ai.onnx/Gemm" + ] + }, + "information": [] + } + ] +} +``` + +Key fields: + +| Field | Description | +|-------|-------------| +| `metadata.total_operators` | Total ONNX operator nodes in the model graph | +| `metadata.operator_counts` | Frequency of each operator type | +| `metadata.detected_pattern_count` | Fused subgraph patterns (GeLU, LayerNorm, etc.) | +| `results[].ihv_type` | Hardware vendor (`"Microsoft"`, `"QC"`, `"Intel"`, etc.) 
| +| `results[].runtime_support` | `true` if the EP can run all operators | +| `results[].classification` | Operators grouped by support level: `supported`, `partial`, `unsupported`, `unknown` | +| `results[].has_errors` | `true` if unsupported ops exist (model won't run on that EP) | + +--- + +## Build Manifest + +`build_manifest.json` records provenance for every build run through the Python API (`build_hf_model`/`build_onnx_model`): + +```json +{ + "schema_version": 1, + "model_id": "microsoft/resnet-50", + "task": "image-classification", + "cache_key": "a1b2c3d4e5f6", + "config_hash": "f7e8d9c0b1a2", + "timestamp": "2026-01-15T10:30:00.000000+00:00", + "elapsed_seconds": 45.1, + "final_artifact": "model.onnx", + "analyze_iterations": 2, + "analyze_unsupported_node_count": 0, + "analyze_details": { "lint": {}, "autoconf": {} }, + "stages": [ + { + "name": "export", + "status": "completed", + "filename": "export.onnx", + "elapsed_seconds": 12.5 + }, + { + "name": "optimize", + "status": "completed", + "filename": "optimized.onnx", + "elapsed_seconds": 8.2 + }, + { + "name": "quantize", + "status": "completed", + "filename": "quantized.onnx", + "elapsed_seconds": 15.3, + "nodes_quantized": 150, + "nodes_skipped": 12 + }, + { + "name": "compile", + "status": "completed", + "filename": "compiled.onnx", + "elapsed_seconds": 9.1 + } + ] +} +``` + +--- + +## Rebuild Behavior + +- If `model.onnx` already exists and `rebuild=False` (default), the build is + skipped entirely. +- Pass `--rebuild` (CLI) or `force_rebuild=True` (Python API) to force a fresh + build. +- On rebuild, all old `.onnx` and `.onnx.data` files are deleted before the + pipeline runs. 
+ +--- + +## See also + +- [winml build](../commands/build.md) — build command reference +- [Reference — Config Schema](index.md) — config file format +- [How winml-cli Works](../concepts/how-it-works.md) — pipeline stages explained diff --git a/docs/reference/python-api.md b/docs/reference/python-api.md new file mode 100644 index 000000000..8b76acb95 --- /dev/null +++ b/docs/reference/python-api.md @@ -0,0 +1,258 @@ +# Python API + +winml-cli can be used as a Python library for programmatic model building and +inference. This page documents the public API surface. + +--- + +## Quick Example + +```python +from winml.modelkit import WinMLAutoModel + +# Build and load in one call +model = WinMLAutoModel.from_pretrained("microsoft/resnet-50", device="npu") +output = model(pixel_values=images) + +# From a local ONNX file +model = WinMLAutoModel.from_onnx("model.onnx", task="image-classification") +``` + +--- + +## `WinMLAutoModel` + +Factory class for automatic model building and loading. Not instantiable directly — +use the class methods. + +### `from_pretrained()` + +Build and load a model from a HuggingFace ID or local path. Runs the full +pipeline: config → export → optimize → quantize → compile → load. + +```python +WinMLAutoModel.from_pretrained( + model_id_or_path: str | Path, + *, + task: str | None = None, + config: WinMLBuildConfig | None = None, + device: str = "auto", + precision: str = "auto", + cache_dir: str | Path | None = None, + use_cache: bool = True, + force_rebuild: bool = False, + trust_remote_code: bool = False, + shape_config: dict | None = None, + no_compile: bool = False, +) -> WinMLPreTrainedModel +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model_id_or_path` | `str \| Path` | required | HuggingFace model ID or path to local model. | +| `task` | `str \| None` | `None` | Task name. Auto-detected if omitted. | +| `config` | `WinMLBuildConfig \| None` | `None` | Custom build config. 
Auto-generated if omitted. | +| `device` | `str` | `"auto"` | Target device: `"auto"`, `"npu"`, `"gpu"`, `"cpu"`. | +| `precision` | `str` | `"auto"` | Precision: `"auto"`, `"fp32"`, `"fp16"`, `"w8a8"`, etc. | +| `cache_dir` | `str \| Path \| None` | `None` | Cache directory for built artifacts. | +| `use_cache` | `bool` | `True` | Reuse cached build if available. | +| `force_rebuild` | `bool` | `False` | Force rebuild even if cache exists. | +| `trust_remote_code` | `bool` | `False` | Trust remote code from HuggingFace. | +| `no_compile` | `bool` | `False` | Skip the compilation stage. | + +**Returns:** A task-specific `WinMLPreTrainedModel` subclass. + +--- + +### `from_onnx()` + +Build from a pre-exported ONNX file. Runs: optimize → quantize → compile → load. + +```python +WinMLAutoModel.from_onnx( + onnx_path: str | Path | dict[str, str | Path], + *, + task: str | None = None, + config: WinMLBuildConfig | None = None, + device: str = "auto", + precision: str = "auto", + ep: str | None = None, + cache_dir: str | Path | None = None, + use_cache: bool = True, + force_rebuild: bool = False, + skip_build: bool = False, + session_options: Any | None = None, + hf_config: PretrainedConfig | None = None, + **kwargs: Any, +) -> WinMLPreTrainedModel | WinMLCompositeModel +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `onnx_path` | `str \| Path \| dict` | required | ONNX file path, or dict of submodel paths for composite models. | +| `skip_build` | `bool` | `False` | Load ONNX directly without running optimize/quantize/compile. | +| `hf_config` | `PretrainedConfig \| None` | `None` | Required for composite models (dict inputs). | + +--- + +### `supported_tasks()` + +```python +WinMLAutoModel.supported_tasks() -> list[str] +``` + +Returns all task strings with dedicated inference classes (16 tasks). + +--- + +## Build Pipeline Functions + +Lower-level functions for fine-grained control over the pipeline. 
+ +### `build_hf_model()` + +```python +from winml.modelkit.build import build_hf_model + +result = build_hf_model( + config: WinMLBuildConfig, + output_dir: Path, + *, + model_id: str | None = None, + pytorch_model: nn.Module | None = None, + rebuild: bool = False, + trust_remote_code: bool = False, + random_init: bool = False, + cache_key: str | None = None, + ep: str | None = None, + device: str | None = None, + **kwargs: Any, +) -> BuildResult +``` + +Runs the full pipeline (export → optimize → analyze → quantize → compile) and +writes all artifacts to `output_dir`. + +### `build_onnx_model()` + +```python +from winml.modelkit.build import build_onnx_model + +result = build_onnx_model( + onnx_path: Path | str, + *, + config: WinMLBuildConfig, + output_dir: Path | str, + rebuild: bool = False, + ep: str | None = None, + device: str | None = None, + **kwargs: Any, +) -> BuildResult +``` + +Builds from an existing ONNX file (skips export). + +### `BuildResult` + +```python +@dataclass +class BuildResult: + output_dir: Path # Directory containing all artifacts + final_onnx_path: Path # Path to final model.onnx + config_path: Path # Path to winml_build_config.json + stages_completed: list[str] # e.g., ["export", "optimize", "quantize"] + stages_skipped: list[str] + stage_timings: dict[str, float] # Per-stage seconds + elapsed: float # Total build time (seconds) + reused: bool # True if cache hit, no build ran + manifest_path: Path | None # Path to build_manifest.json +``` + +--- + +## Config Generation + +### `generate_build_config()` + +```python +from winml.modelkit.config import generate_build_config + +config = generate_build_config( + model_id: str | None = None, + *, + task: str | None = None, + model_class: str | None = None, + model_type: str | None = None, + module: str | None = None, + override: WinMLBuildConfig | None = None, + shape_config: dict | None = None, + library_name: str = "transformers", + device: str = "auto", + precision: str = "auto", + 
trust_remote_code: bool = False, + ep: str | None = None, + onnx_path: str | Path | None = None, +) -> WinMLBuildConfig | list[WinMLBuildConfig] +``` + +Auto-generates a complete build config by probing the model's `config.json` +(does not download weights). Equivalent to what `winml config` produces. +Returns a list when `module` is specified (one config per submodule). + +--- + +## Inference Model Classes + +All inference models inherit from `WinMLPreTrainedModel` and are HuggingFace +pipeline-compatible. + +### `WinMLPreTrainedModel` (Base) + +```python +class WinMLPreTrainedModel: + def __call__(self, **kwargs) -> Any: ... + def perf(self, warmup: int = 0) -> ContextManager: ... + + @property + def device(self) -> str: ... + @property + def ep_name(self) -> str | None: ... + @property + def io_config(self) -> dict: ... + @property + def task(self) -> str | None: ... +``` + +### Task-Specific Classes + +| Class | Task | +|-------|------| +| `WinMLModelForImageClassification` | `image-classification` | +| `WinMLModelForSequenceClassification` | `text-classification` | +| `WinMLModelForImageSegmentation` | `image-segmentation` | +| `WinMLModelForSemanticSegmentation` | `semantic-segmentation` | +| `WinMLModelForObjectDetection` | `object-detection` | +| `WinMLModelForFeatureExtraction` | `feature-extraction` | +| `WinMLModelForQuestionAnswering` | `question-answering` | +| `WinMLModelForZeroShotImageClassification` | `zero-shot-image-classification` | +| `WinMLModelForGenericTask` | fallback (raw outputs) | + +### Performance Tracking + +```python +model = WinMLAutoModel.from_pretrained("microsoft/resnet-50", device="npu") + +with model.perf(warmup=5) as stats: + for img in test_images: + model(pixel_values=img) + +print(f"P99 latency: {stats.p99_ms:.2f} ms") +``` + +--- + +## See also + +- [Reference — Config Schema](index.md) — full config field reference +- [winml build](../commands/build.md) — CLI equivalent +- [How winml-cli 
Works](../concepts/how-it-works.md) — pipeline overview diff --git a/docs/reference/supported-models.md b/docs/reference/supported-models.md new file mode 100644 index 000000000..28d0ca9ae --- /dev/null +++ b/docs/reference/supported-models.md @@ -0,0 +1,233 @@ +# Supported Models + +winml-cli supports a wide range of model architectures and tasks. This page +lists what's validated and how to discover model support. + +--- + +## Discovery Commands + +```bash +# Browse the curated catalog (57 validated models) +uv run winml catalog + +# Filter by task +uv run winml catalog -t image-classification + +# Check if a specific model is supported +uv run winml inspect -m microsoft/resnet-50 + +# List all known tasks +uv run winml inspect --list-tasks +``` + +--- + +## Supported Tasks + +winml-cli recognizes **35 task types** across vision, NLP, audio, and multimodal domains. Of these, 16 have dedicated inference classes; the remainder are supported via the generic task fallback. + +### Vision + +| Task | Example Models | +|------|----------------| +| `image-classification` | ResNet, ConvNeXt, ViT, Swin | +| `image-segmentation` | Segformer, Mask2Former | +| `semantic-segmentation` | Segformer | +| `object-detection` | DETR, YOLOS, Table-Transformer | +| `depth-estimation` | Depth Anything, ZoeDepth | +| `image-feature-extraction` | DINOv2, ViT | +| `zero-shot-image-classification` | CLIP, SigLIP | + +### NLP + +| Task | Example Models | +|------|----------------| +| `text-classification` | BERT, RoBERTa, XLM-RoBERTa | +| `token-classification` | BERT, RoBERTa (NER) | +| `question-answering` | BERT, RoBERTa | +| `fill-mask` | BERT, RoBERTa | +| `feature-extraction` | BGE, BERT, all-MiniLM | +| `text-generation` | Qwen3 (composite) | +| `text2text-generation` | T5, BART, Marian | + +### Audio + +| Task | Example Models | +|------|----------------| +| `automatic-speech-recognition` | Whisper | +| `audio-classification` | Wav2Vec2 | + +### Multimodal + +| Task | Example Models 
| +|------|----------------| +| `zero-shot-image-classification` | CLIP (text + vision) | +| `image-to-text` | VisionEncoderDecoder | +| `visual-question-answering` | BLIP | + +--- + +## Validated Model Catalog + +The following models have been validated end-to-end with EP compatibility +testing. Use `winml catalog` to browse the full list interactively. + +### Image Classification + +| Model | Architecture | +|-------|-------------| +| `AdamCodd/vit-base-nsfw-detector` | ViT | +| `Falconsai/nsfw_image_detection` | ViT | +| `amunchet/rorshark-vit-base` | ViT | +| `apple/mobilevit-small` | MobileViT | +| `dima806/fairface_age_image_detection` | ViT | +| `google/vit-base-patch16-224` | ViT | +| `microsoft/resnet-18` | ResNet | +| `rizvandwiki/gender-classification` | ViT | + +### Image Feature Extraction + +| Model | Architecture | +|-------|-------------| +| `facebook/dino-vitb16` | ViT | +| `facebook/dino-vits16` | ViT | +| `facebook/dinov2-base` | DINOv2 | +| `facebook/dinov2-large` | DINOv2 | +| `facebook/dinov2-small` | DINOv2 | +| `google/vit-base-patch16-224-in21k` | ViT | +| `microsoft/rad-dino` | DINOv2 | + +### Feature Extraction (Text) + +| Model | Architecture | +|-------|-------------| +| `laion/CLIP-ViT-B-32-laion2B-s34B-b79K` | CLIP | +| `openai/clip-vit-base-patch16` | CLIP | +| `openai/clip-vit-base-patch32` | CLIP | +| `sentence-transformers/all-MiniLM-L6-v2` | BERT | +| `sentence-transformers/all-mpnet-base-v2` | MPNet | +| `sentence-transformers/multi-qa-mpnet-base-dot-v1` | MPNet | + +### Sentence Similarity + +| Model | Architecture | +|-------|-------------| +| `BAAI/bge-large-en-v1.5` | BERT | +| `BAAI/bge-small-en-v1.5` | BERT | +| `sentence-transformers/all-MiniLM-L6-v2` | BERT | +| `sentence-transformers/all-mpnet-base-v2` | MPNet | +| `sentence-transformers/multi-qa-mpnet-base-dot-v1` | MPNet | +| `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | BERT | +| `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` | 
XLM-RoBERTa | + +### Fill-Mask + +| Model | Architecture | +|-------|-------------| +| `FacebookAI/roberta-base` | RoBERTa | +| `FacebookAI/xlm-roberta-base` | XLM-RoBERTa | +| `distilbert/distilbert-base-uncased` | DistilBERT | +| `google-bert/bert-base-multilingual-cased` | BERT | +| `google-bert/bert-base-multilingual-uncased` | BERT | +| `google-bert/bert-base-uncased` | BERT | +| `sentence-transformers/all-mpnet-base-v2` | MPNet | +| `sentence-transformers/multi-qa-mpnet-base-dot-v1` | MPNet | + +### Text Classification + +| Model | Architecture | +|-------|-------------| +| `cardiffnlp/twitter-roberta-base-sentiment-latest` | RoBERTa | +| `cross-encoder/ms-marco-MiniLM-L4-v2` | BERT | +| `cross-encoder/ms-marco-MiniLM-L6-v2` | BERT | +| `distilbert/distilbert-base-uncased-finetuned-sst-2-english` | DistilBERT | + +### Token Classification + +| Model | Architecture | +|-------|-------------| +| `Isotonic/distilbert_finetuned_ai4privacy_v2` | DistilBERT | +| `Jean-Baptiste/camembert-ner-with-dates` | CamemBERT | +| `kredor/punctuate-all` | XLM-RoBERTa | +| `w11wo/indonesian-roberta-base-posp-tagger` | RoBERTa | + +### Question Answering + +| Model | Architecture | +|-------|-------------| +| `ahotrod/electra_large_discriminator_squad2_512` | Electra | +| `deepset/bert-large-uncased-whole-word-masking-squad2` | BERT | +| `deepset/roberta-base-squad2` | RoBERTa | +| `deepset/tinyroberta-squad2` | RoBERTa | +| `distilbert/distilbert-base-cased-distilled-squad` | DistilBERT | +| `distilbert/distilbert-base-uncased-distilled-squad` | DistilBERT | +| `monologg/koelectra-small-v2-distilled-korquad-384` | Electra | + +### Zero-Shot Classification + +| Model | Architecture | +|-------|-------------| +| `lxyuan/distilbert-base-multilingual-cased-sentiments-student` | DistilBERT | + +### Zero-Shot Image Classification + +| Model | Architecture | +|-------|-------------| +| `laion/CLIP-ViT-B-32-laion2B-s34B-b79K` | CLIP | + +### Object Detection + +| Model | Architecture | 
+|-------|-------------| +| `hustvl/yolos-small` | YOLOS | +| `valentinafeve/yolos-fashionpedia` | YOLOS | + +### Depth Estimation + +| Model | Architecture | +|-------|-------------| +| `Intel/dpt-hybrid-midas` | DPT | + +--- + +## Execution Provider Compatibility + +Each validated model is tested against available EPs: + +| EP | Alias | Devices | Notes | +|----|-------|---------|-------| +| NvTensorRTRTXExecutionProvider | `nvtensorrtrtx`, `nv_tensorrt_rtx` | GPU | NVIDIA TensorRT-RTX; NVIDIA GPU with TensorRT runtime | +| CUDAExecutionProvider | `cuda` | GPU | NVIDIA CUDA; any CUDA-capable GPU | +| MIGraphXExecutionProvider | `migraphx` | GPU | AMD ROCm MIGraphX | +| QNNExecutionProvider | `qnn` | NPU, GPU | Qualcomm Snapdragon; bundled in ORT | +| OpenVINOExecutionProvider | `openvino` | NPU, GPU, CPU | Intel hardware | +| DmlExecutionProvider | `dml` | GPU | DirectML; any DirectX 12 GPU | +| CPUExecutionProvider | `cpu` | CPU | Always available | +| VitisAIExecutionProvider | `vitisai` | NPU | AMD/Xilinx | + +--- + +## Adding Unsupported Models + +If your model architecture isn't in the catalog, winml-cli may still support it +through auto-detection: + +```bash +# Try inspecting first +uv run winml inspect -m your-org/your-model + +# If "Status: Supported", proceed normally +uv run winml build -m your-org/your-model -d auto -o output/ +``` + +For truly custom architectures, use `--trust-remote-code` to allow execution of +model code from the Hugging Face Hub. 
+ +--- + +## See also + +- [winml catalog](../commands/catalog.md) — browse validated models interactively +- [winml inspect](../commands/inspect.md) — check model compatibility +- [EP and Device](../concepts/eps-and-devices.md) — execution provider details diff --git a/docs/samples/bert-config-build.md b/docs/samples/bert-config-build.md new file mode 100644 index 000000000..e3b25c6e3 --- /dev/null +++ b/docs/samples/bert-config-build.md @@ -0,0 +1,134 @@ +# BERT — Config + Build + Perf + +BERT (`bert-base-uncased`) is a canonical text model that exercises every stage of the winml-cli pipeline: it has multiple input tensors, benefits from graph fusion (GeLU, LayerNorm, MatMul+Add), and produces quantizable activations that run well on NPU. That combination makes it a useful reference point for teams deploying transformer encoders on Windows. + +This sample walks through the production-style workflow: generate a reusable `WinMLBuildConfig` JSON file with `winml config`, run the full export → optimize → quantize → compile pipeline in one shot with `winml build`, and measure the result with `winml perf`. If you want to understand each pipeline stage individually before running the all-in-one command, read the [Hugging Face Model to NPU tutorial](../tutorials/npu-convnext.md) first. + +## Prerequisites + +- winml-cli installed and `winml` on your PATH. +- A target device (NPU or GPU recommended; CPU also works). + +## Step 1: Generate a build config + +```bash +winml config -m bert-base-uncased -t text-classification -o bert_config.json +``` + +This writes a `WinMLBuildConfig` JSON file to `bert_config.json`. The file captures every pipeline setting in a single artifact that you can version-control and share. A representative excerpt looks like this: + +```json +{ + "loader": { + "task": "text-classification", + "model_class": "AutoModelForSequenceClassification", + "model_type": "bert" + }, + "export": { + "opset_version": 17, + "batch_size": 1 + ... 
// truncated: input_tensors, output_tensors + }, + "optim": { + "clamp_constant_values": true + }, + "quant": { + "mode": "qdq", + "weight_type": "uint8", + "activation_type": "uint16", + "samples": 10, + "calibration_method": "minmax", + "task": "text-classification", + "model_name": "bert-base-uncased" + ... // truncated: per_channel, symmetric, distribution, ... + }, + "compile": null +} +``` + +!!! note + The five top-level keys — `loader`, `export`, `optim`, `quant`, and `compile` — map directly to the five pipeline stages. Setting `quant` or `compile` to `null` skips that stage entirely. See [Config and build](../concepts/config-and-build.md) for a field-by-field description of every option. + +## Step 2: Run the build + +```bash +winml build -c bert_config.json -m bert-base-uncased --output-dir bert_out/ +``` + +winml-cli reads the config, downloads the model weights once, and runs the pipeline in sequence. Terminal output shows each stage as it completes: + +```text +winml build + Config: bert_config.json + Model: bert-base-uncased + Output: bert_out/ + + export done (42.1s) + optimize done (6.3s) + quantize done (18.7s) + compile done (21.4s) + + Build complete in 88.5s + Final artifact: bert_out/model.onnx +``` + +!!! note + After the optimize stage, winml-cli runs an analyzer loop that inspects the graph for nodes the target EP cannot dispatch natively and re-runs optimization with adjusted fusion flags. The loop repeats up to `--max-optim-iterations` times (default: 3). Pass `--no-optimize` to skip this stage entirely when starting from a pre-optimized ONNX file. See [How winml-cli Works](../concepts/how-it-works.md) for a full description of the autoconf loop. 
+ +## Step 3: Benchmark + +```bash +winml perf -m bert_out/model.onnx --iterations 50 +``` + +After a short warm-up, `winml perf` reports latency percentiles and throughput: + +```text +Device: npu +Task: text-classification +Iterations: 50 (+ 10 warmup) +Batch Size: 1 + +Latency (ms) + Avg P50 P90 P95 P99 Min Max Std + 4.83 4.79 5.12 5.31 5.68 4.51 6.04 0.21 + +Throughput: 206.99 samples/sec + +Results saved to: model_perf.json +``` + +## Customizing the config + +The JSON file is plain text and can be edited before running `winml build`. Two common adjustments: + +**Change precision.** To target fp16 instead of the default uint8 QDQ quantization, regenerate the config with an explicit precision flag: + +```bash +winml config -m bert-base-uncased -t text-classification --precision fp16 -o bert_config.json +``` + +Alternatively, edit `bert_config.json` directly: set `quant.weight_type` and `quant.activation_type` to `"int8"` or `"uint16"`, or set `quant` to `null` to skip quantization entirely. + +**Disable a stage at build time.** You can suppress a stage for a single run without touching the config file using the `--no-quant` flags: + +```bash +winml build -c bert_config.json -m bert-base-uncased --output-dir bert_out/ --no-quant +``` + +This is useful for measuring the fp32 baseline before committing to a quantized build. The `quant` section in `bert_config.json` is unchanged; the flag only affects this invocation. See [Config and build](../concepts/config-and-build.md) for the full list of configurable fields. + +## What you learned + +- `winml config` generates a complete, version-controllable `WinMLBuildConfig` JSON from a HuggingFace model ID in one command. +- `winml build` orchestrates the full export → optimize → quantize → compile pipeline from a single config file and model ID. +- The autoconf loop inside the optimize stage adjusts graph fusion flags automatically to maximize EP compatibility. 
+- `winml perf` gives a latency and throughput baseline on the built artifact in seconds. + + +## See also + +- [winml config](../commands/config.md) +- [winml build](../commands/build.md) +- [winml perf](../commands/perf.md) +- [Config and build](../concepts/config-and-build.md) diff --git a/docs/samples/clip-composite.md b/docs/samples/clip-composite.md new file mode 100644 index 000000000..4f09d833a --- /dev/null +++ b/docs/samples/clip-composite.md @@ -0,0 +1,161 @@ +# CLIP — Composite Models + +CLIP (`openai/clip-vit-base-patch32`) is a dual-encoder vision-language model: one tower encodes images, the other encodes text, and both project into a shared embedding space. winml-cli treats it as a **composite model** — a model that is split into multiple ONNX sub-models that run together at inference time. For CLIP, the two sub-models are: + +| Sub-model | Role | Input shape | Output (projected) | +|-----------|------|-------------|--------------------| +| `image-encoder` | Encodes images into embeddings | `pixel_values` `[1, 3, 224, 224]` | `image_embeds` `[1, 512]` | +| `text-encoder` | Encodes text labels into embeddings | `input_ids` `[1, 77]` | `text_embeds` `[1, 512]` | + +Zero-shot classification is achieved by embedding the image and the candidate text labels, then ranking the labels by the cosine similarity between their embeddings. Splitting the towers into two ONNX graphs lets each encoder have fully static shapes (required for efficient NPU compilation) and lets you build, cache, and benchmark them independently. + +## Prerequisites + +- winml-cli installed and `winml` on your PATH. +- A network connection to download CLIP weights from HuggingFace on first run. 
+ +## Overall workflow + +The composite model architecture for CLIP: + +```mermaid +graph LR + A[winml config] -->|"(clip, zero-shot-image-classification)"| B[Composite Registry] + B --> C[image-encoder config] + B --> D[text-encoder config] + C --> E[winml build → image-encoder.onnx] + D --> F[winml build → text-encoder.onnx] + E --> G[WinMLAutoModel] + F --> G + G -->|logits_per_image| H[Classification scores] +``` + +## Step 1: Generate build configs + +```bash +winml config -m openai/clip-vit-base-patch32 --task zero-shot-image-classification -o clip.json +``` + +Because `(clip, zero-shot-image-classification)` is registered as a composite model, this command produces **two** config files — one per sub-model: + +- `clip_image-encoder.json` — export config using `image-feature-extraction` task +- `clip_text-encoder.json` — export config using `feature-extraction` task + +Each config includes CLIP-specific optimizations (GELU fusion, LayerNorm fusion, MatMul+Add fusion, and clamp constant values). + +## Step 2: Build each sub-model + +Build both sub-models individually using their config files: + +```bash +# Build the image encoder +winml build -c clip_image-encoder.json -m openai/clip-vit-base-patch32 -o output/image-encoder + +# Build the text encoder +winml build -c clip_text-encoder.json -m openai/clip-vit-base-patch32 -o output/text-encoder +``` + +Each `winml build` runs the full pipeline: export → optimize → quantize → compile. The output directories contain the final ONNX files ready for inference. 
+ +To target a specific execution provider (e.g., QNN for NPU): + +```bash +winml build -c clip_image-encoder.json -m openai/clip-vit-base-patch32 -o output/image-encoder --ep qnn +winml build -c clip_text-encoder.json -m openai/clip-vit-base-patch32 -o output/text-encoder --ep qnn +``` + +## Step 3: Benchmark each sub-model + +```bash +winml perf output/image-encoder -d npu +winml perf output/text-encoder -d npu +``` + +This lets you identify whether the image or text encoder is the bottleneck on your target hardware. + +## Step 4: Run inference (Python API) + +There are two ways to get a ready-to-run model. Both return the same `WinMLModelForZeroShotImageClassification` — a single object that orchestrates the two encoders and combines their projected embeddings into similarity scores — so the inference code afterward is identical. + +**Option 1 — Load the ONNX files built in Step 2** (skips re-export/optimization). Pass a dict mapping each component name to its built `model.onnx`, plus the HF config so the composite registry can resolve `(clip, zero-shot-image-classification)`: + +```python +from transformers import AutoConfig + +from winml.modelkit.models import WinMLAutoModel + +model = WinMLAutoModel.from_onnx( + { + "image-encoder": "output/image-encoder/model.onnx", + "text-encoder": "output/text-encoder/model.onnx", + }, + task="zero-shot-image-classification", + hf_config=AutoConfig.from_pretrained("openai/clip-vit-base-patch32"), + skip_build=True, +) +``` + +**Option 2 — Build both encoders from the HuggingFace model in one call.** `WinMLAutoModel.from_pretrained` detects the composite task and runs the full pipeline for each sub-model: + +```python +from winml.modelkit.models import WinMLAutoModel + +model = WinMLAutoModel.from_pretrained( + "openai/clip-vit-base-patch32", + task="zero-shot-image-classification", +) +``` + +Either way, run inference the same way — prepare an image plus candidate labels with the HF processor, then call the model: + 
+```python +from PIL import Image +from transformers import CLIPProcessor + +processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") +image = Image.open("cat.jpg") +labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"] +inputs = processor(text=labels, images=image, return_tensors="pt", padding=True) + +# Run both encoders and combine into per-label similarity scores +outputs = model(**inputs) +probs = outputs.logits_per_image.softmax(dim=-1) +for label, p in zip(labels, probs[0].tolist()): + print(f"{label}: {p:.4f}") +``` + +The text encoder's fixed sequence length (77) is handled for you — the processor's tokens are padded or truncated to match the ONNX graph before each run. + +### Customizing shape config per sub-model + +Each encoder takes its own `shape_config`, passed through `sub_model_kwargs`. The image encoder accepts vision keys (`height`, `width`); the text encoder accepts text keys (`sequence_length`): + +```python +model = WinMLAutoModel.from_pretrained( + "openai/clip-vit-base-patch32", + task="zero-shot-image-classification", + sub_model_kwargs={ + "image-encoder": {"shape_config": {"height": 224, "width": 224}}, + "text-encoder": {"shape_config": {"sequence_length": 77}}, + }, +) +``` + +## Other composite models + +The same composite model pattern is used for: + +- **SigLIP** (`google/siglip-base-patch16-224`) — dual-encoder zero-shot image classification; shares the same composite wrapper as CLIP +- **T5** (`google-t5/t5-small`) — encoder + decoder for translation/summarization +- **BART** (`facebook/bart-large-cnn`) — encoder + decoder for summarization and table-question-answering (TAPEX) +- **Marian** (`Helsinki-NLP/opus-mt-en-de`) — encoder + decoder for translation +- **Qwen3** (`Qwen/Qwen3-0.6B`) — prefill + generation decoders for text generation +- **BLIP** (`Salesforce/blip-image-captioning-base`) — vision encoder + text decoder for image-to-text captioning +- **Vision-encoder-decoder** 
(`microsoft/trocr-base-handwritten`) — vision encoder + text decoder for image-to-text (TrOCR, Donut) + +## See also + +- [BERT — Config + Build + Perf](bert-config-build.md) — single-model workflow +- [Hugging Face Model to NPU](../tutorials/npu-convnext.md) — step-by-step pipeline +- [Supported Models](../reference/supported-models.md) — full list of validated architectures +- [Config and build](../concepts/config-and-build.md) — concept overview diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 000000000..3f386a8da --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,3 @@ +.md-header__button.md-logo { + display: none; +} diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 000000000..fbe602375 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,149 @@ +# Troubleshooting + +Common issues and solutions when working with winml-cli. + +--- + +## Compile + +### Cannot enable compilation: no compile section + +```text +UsageError: Cannot enable compilation: no compile section found in the config file +``` + +**Cause:** Compilation is **off by default** in `winml build`. You passed `--compile` to explicitly enable it, but the config JSON has no `"compile"` section (it's `null`). This happens when the config was generated without a device target that supports EPContext (e.g., `--device cpu` or `--device auto` on a machine without NPU). + +**Solution:** Regenerate the config targeting a device that supports compilation (NPU or GPU with an EP that produces EPContext): + +```bash +uv run winml config -m -d npu --compile -o output/ +``` + +!!! note + By default `winml build` skips the compile stage unless `--compile` is passed or the config contains a non-null `"compile"` section. To include compilation in the generated config, specify a device that maps to an EPContext-capable EP (e.g., `-d npu`). 
+ +--- + +### Already a compiled EPContext model + +```text +ClickException: model_ctx.onnx is already a compiled EPContext model and cannot be re-compiled +``` + +**Cause:** You're trying to compile a model that is already an EPContext artifact (the `_ctx.onnx` output). + +**Solution:** Run compilation on the original (pre-compiled) ONNX file instead: + +```bash +uv run winml compile -m model.onnx -d npu -o output/ +``` + +--- + +### Provider does not support EPContext compilation + +```text +ClickException: Provider 'DmlExecutionProvider' does not support EPContext compilation +``` + +**Cause:** Not all EPs produce EPContext format. DML and CPU do not support pre-compilation. + +**Solution:** EPContext is supported by QNN, OpenVINO, TensorRT, and Vitis AI. For DML/CPU, skip the compile step — the runtime compiles on first load automatically: + +```bash +uv run winml build -c config.json -m model -o output/ --no-compile +``` + +--- + +## Analyze + +### Unsupported nodes persist after analysis + +```text +RuntimeError: Unsupported nodes persist after analysis +``` + +**Cause:** The model contains operators that the selected EP cannot dispatch natively. + +**Solution:** Run `winml analyze` with `--optim-config` to identify problematic operators and get recommended graph optimizations: + +```bash +# Analyze and output optimization recommendations +uv run winml analyze -m model.onnx --ep qnn --optim-config optim_config.json +``` + +This produces `optim_config.json` with the auto-discovered optimization flags. 
Apply them with `winml optimize`, then re-analyze: + +```bash +# Apply recommended optimizations +uv run winml optimize -m model.onnx -o model_optimized.onnx -c optim_config.json + +# Re-analyze to check if unsupported nodes are resolved +uv run winml analyze -m model_optimized.onnx --ep qnn +``` + +If unsupported nodes still remain after optimization, consider: + +- **Manually modifying problematic nodes** — use tools like `onnx-graphsurgeon` to replace or remove operators the EP cannot handle +- **Using a different EP** (`--ep dml` or `--ep cpu`) that supports the operators in question +- **Checking if a newer opset version** resolves the compatibility gap (re-export with `--opset-version 18`) + +--- + +### Many "unknown" results from constant nodes + +When `winml analyze` reports a large number of nodes as "unknown", the model likely hasn't been normalized — it contains raw constant-folding subgraphs, missing shape annotations, or redundant initializer nodes that the analyzer cannot classify. + +**Solution:** Run `winml optimize` with no optimization flags to normalize the model (constant folding, shape inference, dead-node elimination), then re-analyze: + +```bash +# Normalize only (no fusion flags) +uv run winml optimize -m model.onnx -o model_normalized.onnx + +# Re-analyze — constant nodes are now folded, shapes are inferred +uv run winml analyze -m model_normalized.onnx --ep qnn +``` + +This baseline pass collapses constant subgraphs into initializers and propagates tensor shapes throughout the graph, giving the analyzer enough information to classify nodes correctly. + +--- + +## Build / Cache + +### Disk full / out of space + +Build artifacts (exported ONNX, optimized graphs, quantized models, compiled EPContext files) are cached under: + +``` +C:\Users\\.cache\winml +``` + +This directory can grow significantly after multiple builds with large models. 
If you encounter disk-full errors or want to reclaim space, it is safe to delete the entire folder: + +```powershell +Remove-Item -Recurse -Force "$env:USERPROFILE\.cache\winml" +``` + +The next `winml build` will re-create the cache as needed. Use `--rebuild` to force a full rebuild without relying on cached intermediates. + +--- + +## General Tips + +| Tip | Command | +|-----|---------| +| **Diagnose environment** | `uv run winml sys` | +| **Check EP compatibility** | `uv run winml analyze -m model.onnx --ep ` | +| **Verbose output** | Add `-v` or `--verbose` to any command | +| **Skip a pipeline stage** | `--no-quant`, `--no-compile`, `--no-optimize` | +| **Force rebuild (ignore cache)** | `uv run winml build -c config.json -m -o output/ --rebuild` | +| **Regenerate config** | `uv run winml config -m -d -o dir/` | +| **Free disk space** | Delete `C:\Users\\.cache\winml` | + +## See also + +- [winml sys](commands/sys.md) — system diagnostics +- [winml analyze](commands/analyze.md) — EP compatibility analysis +- [EP and Device](concepts/eps-and-devices.md) — execution provider reference diff --git a/docs/tutorials/build-from-onnx.md b/docs/tutorials/build-from-onnx.md new file mode 100644 index 000000000..7dbe6fc42 --- /dev/null +++ b/docs/tutorials/build-from-onnx.md @@ -0,0 +1,267 @@ +# Bring Your Own ONNX Model + +This tutorial walks you through the complete workflow for optimizing, analyzing, and deploying an ONNX model you already have — whether you exported it yourself (`torch.onnx.export`, ONNX Runtime tools), received it from a teammate, or downloaded it from the ONNX Model Zoo. + +Unlike the [Hugging Face Model to NPU](npu-convnext.md) tutorial which starts from a HuggingFace model ID, this tutorial assumes you already have a `.onnx` file on disk and want to make it run faster on your target hardware. + +The tutorial is split into two sections. 
Section A walks through the analyze → optimize → re-analyze loop using primitive commands, teaching you how the optimization feedback cycle works. Section B shows how `winml build` automates that same loop in a single command, optionally targeting NPU with quantization. + +--- + +## Prerequisites + +- **Windows 11 24H2** — required for NPU stack support +- **Python 3.11** and **uv** installed (`pip install uv` or follow [astral.sh/uv](https://astral.sh/uv)) +- **winml-cli** installed — see [Installation](../getting-started/installation.md) +- **An ONNX model file** — this tutorial uses `my_model.onnx` as a placeholder; substitute your own file + +> No NPU? Set `--device cpu` wherever you see `--device npu`. Every other flag stays the same. + +--- + +## Section A — Primitive commands + +Working through the primitive commands one at a time reveals how the analyze–optimize feedback cycle works. Each command accepts the output of the previous step as input, and every intermediate artifact is available for inspection. + +### Step 1: Analyze the original model + +Before any optimization, run the static analyzer to understand your model's EP compatibility and get optimization recommendations: + +```bash +uv run winml analyze --model my_model.onnx --optim-config optim_config.json +``` + +The analyzer classifies every operator in the graph as **supported**, **partial**, **unsupported**, or **unknown** for each available EP. It also detects fusible subgraph patterns and writes the recommended optimization flags to `optim_config.json`. 
+ +To target a specific EP: + +```bash +uv run winml analyze --model my_model.onnx --ep qnn --device npu --optim-config optim_config.json +``` + +The output shows per-EP compatibility results: + +```text +══════════════════════════════════════════════════════════════════════════ + ANALYSIS SUMMARY +══════════════════════════════════════════════════════════════════════════ + QNNExecutionProvider (NPU): 122/0/0/0 + Ready to deploy +``` + +If the analyzer detects fusible patterns (GeLU, LayerNorm, etc.), they will appear in the output and the `optim_config.json` will contain the recommended fusion settings. If no patterns are detected (as with simple architectures like ResNet), the config will be empty `{}`. + +!!! note "What we just did" + The analyzer performs static analysis — no runtime or hardware required. It tells you two things: (1) can the model run on your target EP at all, and (2) are there graph patterns that the optimizer can fuse to improve performance. The `--optim-config` flag outputs a JSON file with the exact optimization settings the optimizer needs. S/P/U/Unk = Supported/Partial/Unsupported/Unknown. + +--- + +### Step 2: Optimize the graph + +Pass the analyzer's output config directly to the optimizer: + +```bash +uv run winml optimize -m my_model.onnx -c optim_config.json -o my_model_optimized.onnx +``` + +The optimizer applies the fusions specified in the config and reports how many nodes it reduced: + +```text +Input: my_model.onnx +Output: my_model_optimized.onnx + +Success! Model optimized: my_model_optimized.onnx +Nodes: 122 -> 122 (0.0% reduction) +``` + +!!! tip + The node reduction depends on your model's architecture. Simple models like ResNet (only Conv, Relu, Add) have no fusible patterns. Transformer-based models (BERT, ViT) typically see 10–30% node reduction from GeLU, LayerNorm, and Attention fusions. + +!!! 
note "What we just did" + Graph optimization fuses multi-node patterns (like the 5-node GeLU/Erf sequence) into single high-level operators that EPs can execute more efficiently. The optimizer is purely a graph transformation — it doesn't change the model's numerical behavior or require calibration data. Running it before quantization is important: calibration should be performed on the already-fused topology, not the verbose original graph. + +--- + +### Step 3: Re-analyze the optimized model + +Run the analyzer again on the optimized output to confirm that the fusions resolved and no new issues appeared: + +```bash +uv run winml analyze --model my_model_optimized.onnx --ep qnn --device npu +``` + +If the original analysis found fusible patterns that were optimized away, this run should show zero detected patterns and the same or better EP compatibility score. + +!!! note "What we just did" + The analyze → optimize → re-analyze cycle is the fundamental feedback loop in winml-cli. In Section B you'll see that `winml build` automates this loop — it calls the analyzer, applies recommendations, re-analyzes, and repeats until convergence (typically 1–3 iterations). Doing it manually here teaches you what the automation is actually doing under the hood. + +--- + +### Step 4 (optional): Quantize + +Insert QDQ (Quantize-Dequantize) nodes into the optimized graph using static calibration: + +```bash +uv run winml quantize -m my_model_optimized.onnx -o my_model_int8.onnx --precision int8 --samples 32 +``` + +The quantizer generates 32 random calibration samples, runs them through the model to collect activation statistics, and uses those statistics to set the quantization scale and zero-point for each tensor. + +!!! note "What we just did" + `--precision int8` sets both weights and activations to 8-bit integers, which is the precision most NPU compilers expect. 
The output model still contains standard `QuantizeLinear` and `DequantizeLinear` ONNX nodes, so it is portable and can run on any ONNX Runtime backend. See [Concepts → Quantization and QDQ](../concepts/quantization.md) for calibration methods and per-channel options. + +--- + +### Step 5 (optional): Compile for the target EP + +Compilation converts the portable quantized ONNX into an EP-specific binary format that the execution provider can load directly, skipping JIT compilation at inference time: + +=== "Qualcomm NPU" + + ```bash + uv run winml compile -m my_model_int8.onnx --device npu --ep qnn + ``` + +=== "Intel NPU" + + ```bash + uv run winml compile -m my_model_int8.onnx --device npu --ep openvino + ``` + +=== "AMD NPU" + + ```bash + uv run winml compile -m my_model_int8.onnx --device npu --ep vitisai + ``` + +=== "CPU" + + ```bash + uv run winml compile -m my_model_int8.onnx --device cpu + ``` + +!!! note "What we just did" + Compilation embeds EP context — the compiled binary — inside or alongside the ONNX file using the `EPContext` node convention. At inference time the runtime loads the pre-compiled binary directly rather than re-compiling from the ONNX graph. See [Concepts → Compile and EPContext](../concepts/compile-and-epcontext.md) for details. + +--- + +### Step 6: Benchmark + +Measure the performance of your model: + +=== "Optimized (CPU)" + + ```bash + uv run winml perf -m my_model_optimized.onnx --device cpu --warmup 5 --iterations 50 + ``` + +=== "Compiled (NPU)" + + ```bash + uv run winml perf -m my_model_int8_npu_ctx.onnx --device npu --iterations 50 --monitor + ``` + +!!! note "What we just did" + `winml perf` generates random inputs matching the model's I/O spec, runs warmup iterations (excluded from statistics), then the benchmark iterations, and reports full latency percentiles alongside throughput. The `--monitor` flag activates live hardware utilization polling. 
See [Concepts → Perf and monitoring](../concepts/perf-and-monitoring.md) for details. + +--- + +## Section B — One-shot with `winml build` + +Once you understand the analyze → optimize → re-analyze loop (which you now do), you can let `winml build` handle everything in one command. When you pass a `.onnx` file, winml-cli auto-detects it and skips the export stage — running the optimization loop, quantization, and compilation automatically. + +```bash +uv run winml build -m my_model.onnx -o output/ --device npu --precision int8 +``` + +!!! tip "Config file is optional" + The `-c config.json` flag is optional. Without it, `winml build` auto-generates an internal config from the flags you pass (like `--device` and `--precision`). If you need a reusable config, generate one with [`winml config`](../commands/config.md): + + ```bash + uv run winml config --onnx my_model.onnx -d npu --precision int8 -o config.json + uv run winml build -m my_model.onnx -c config.json -o output/ + ``` + +The pipeline runs: **analyze → optimize → (re-analyze → re-optimize if needed) → quantize → compile → model.onnx**. 
The output directory looks like: + +```text +output/ +├── model.onnx ← FINAL: deploy this +├── my_model.onnx ← Copy of your input +├── my_model_optimized.onnx ← After optimization loop converged +├── my_model_quantized.onnx ← After INT8 quantization +├── my_model_compiled.onnx ← After EP compilation +├── winml_build_config.json ← Config used (including auto-detected options) +└── analyze_result.json ← Analysis from optimize stage +``` + +You can selectively skip stages using the override flags: + +- `--no-optimize` — skip graph optimization (rarely needed; useful if you have a pre-optimized ONNX) +- `--no-quant` — skip quantization (produces a floating-point compiled model) +- `--no-compile` — skip compilation (produces a quantized but not device-locked ONNX) + +For example, to produce an optimized model without quantization or compilation: + +```bash +uv run winml build -m my_model.onnx -o output/ --device cpu +``` + +!!! note "What we just did" + `winml build` is the production workflow. It guarantees that stages run in the correct order, passes intermediate artifacts through the pipeline automatically, and records which stages completed or were skipped in the result summary. 
+ +Once the build completes, benchmark the final artifact: + +```bash +uv run winml perf -m output/model.onnx --device npu --iterations 50 --monitor +``` + +--- + +## Using the Python API + +```python +from winml.modelkit import WinMLAutoModel + +# Load from a pre-built ONNX (skips the build pipeline) +model = WinMLAutoModel.from_onnx( + "output/model.onnx", + task="image-classification", # set your task + skip_build=True, +) + +output = model(pixel_values=your_input_tensor) +``` + +Or trigger the full build programmatically: + +```python +from winml.modelkit.build import build_onnx_model +from winml.modelkit.config import generate_build_config + +config = generate_build_config(onnx_path="my_model.onnx", device="npu", precision="int8") +result = build_onnx_model("my_model.onnx", config=config, output_dir="output/") +print(f"Final model: {result.final_onnx_path}") +``` + +--- + +## Troubleshooting + +| Problem | Solution | +|---------|----------| +| "ONNX file not found" | Use an absolute path or ensure the file is in the current directory | +| Analyzer reports unsupported ops | Check if an optimization fusion resolves them; if not, the model needs modification for that EP | +| Optimization loop doesn't converge | The default max is 3 iterations; if patterns persist, they may not be fusible — use `--no-quant --no-compile` and inspect | +| Quantization accuracy regression | Try `--precision int16`, `--per-channel`, or increase `--samples` for better calibration | +| EP compilation fails | Check the selected EP, model compatibility, and target device availability | +| Model too large for memory | Use `--no-compile` and compile on the target device | + +--- + +## Where to go next + +- [Hugging Face Model to NPU](npu-convnext.md) — the same pipeline starting from HuggingFace (includes export stage) +- [Output Layout](../reference/output-layout.md) — what each output file contains and the `analyze_result.json` schema +- [Concepts → Analyze and 
optimize](../concepts/analyze-and-optimize.md) — how the convergence loop works internally +- [Build Config Schema](../reference/index.md) — customize quantization, compilation, and optimization settings diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md new file mode 100644 index 000000000..cae936b5d --- /dev/null +++ b/docs/tutorials/index.md @@ -0,0 +1,12 @@ +# Tutorials + +Tutorials are linear, prescriptive, end-to-end walkthroughs that guide you through building something concrete with `winml-cli`. Each tutorial moves in one direction—start to finish—so you can follow along without making decisions. If you need to understand the reasoning behind a feature, see the Concepts section (the why and when). If you need a quick reference for a specific command, see Commands (the what). Tutorials sit alongside Samples, which are reference-style demos that compare multiple approaches side by side rather than walking through a single path. + +## Available tutorials + +| Tutorial | What you'll build | Hardware | +|---|---|---| +| [Hugging Face Model to NPU](npu-convnext.md) | A quantized ConvNeXt image classifier compiled for Snapdragon NPU (with CPU/DirectML fallback) | Copilot+PC NPU primary; CPU works as fallback | +| [Bring Your Own ONNX Model](build-from-onnx.md) | Optimize and deploy an ONNX file you already have, using the analyze → optimize → re-analyze feedback loop | Any (CPU, NPU, GPU) | + +More tutorials are coming, covering additional model families, execution providers, and deployment scenarios. Check back as the `winml-cli` documentation expands. diff --git a/docs/tutorials/npu-convnext.md b/docs/tutorials/npu-convnext.md new file mode 100644 index 000000000..4c037cf3a --- /dev/null +++ b/docs/tutorials/npu-convnext.md @@ -0,0 +1,287 @@ +# Hugging Face Model to NPU + +!!! 
info "Pick the right ConvNeXt page" + Two pages use ConvNeXt as their vehicle: + + - **This tutorial** — the canonical deep-dive: full pipeline with both QNN and OpenVINO NPU backends, plus the `winml build` one-shot. Start here if you want to ship to NPU. + - **[Quickstart](../getting-started/quickstart.md)** — the short Getting Started introduction. Start here for a 15-minute taste. + +This tutorial walks you through the complete journey from a pretrained Hugging Face model — `facebook/convnext-tiny-224` — to a quantized, compiled artifact running on an NPU. By the end you will have benchmarked the model on your device and measured real inference latency. Nothing is skipped, and every command produces a file you can inspect or reuse. + +The primary hardware target is a Copilot+PC with a Snapdragon X-class NPU (40+ TOPS). If you do not have an NPU, every step works on CPU or DirectML as a fallback — the only thing that changes is the `--device` and `--ep` flags on the compile and perf commands. Those variations are shown explicitly in the tabbed blocks below. + +The tutorial is split into two sections. Section A runs through eight primitive commands — one per pipeline stage — so you understand what each stage does, what artifact it produces, and why it matters. Section B shows you that `winml build` runs the same pipeline in a single command once you have a config file. Most production workflows live in Section B; Section A is how you learn to trust it. + +--- + +## Prerequisites + +- **Windows 11 24H2** — required for NPU stack support +- **Copilot+PC with NPU** — 40+ TOPS recommended; CPU and DirectML work as fallback throughout +- **Python 3.11** and **uv** installed (`pip install uv` or follow [astral.sh/uv](https://astral.sh/uv)) +- **winml-cli** installed — see [Installation](../getting-started/installation.md) + +> No NPU? Set `--device cpu` wherever you see `--device npu` and drop `--monitor` from perf commands. Every other flag stays the same. 
+ +--- + +## Section A — Primitive commands + +Working through the primitive commands one at a time is the best way to understand what the `winml build` wrapper does under the hood. Each step accepts the output of the previous step as its input, so the chain is explicit and every intermediate artifact is available for inspection. + +### Step 1: Inspect the model + +Before downloading any weights, confirm that winml-cli knows how to handle `facebook/convnext-tiny-224`. + +```bash +uv run winml inspect -m facebook/convnext-tiny-224 +``` + +You should see output similar to the following: + +```text +Model facebook/convnext-tiny-224 +Task image-classification +Model class ConvNextForImageClassification +Exporter optimum/onnx +Input pixel_values: float32 [1, 3, 224, 224] +Output logits: float32 [1, 1000] +Support status supported +``` + +!!! note "What we just did" + `winml inspect` queries the Hugging Face model card and winml-cli's internal registry without downloading weights. It confirms three things: the auto-detected task (`image-classification`), the model class that will be used for loading, and the exporter that will handle the ONNX conversion. If this command fails, stop here — something about the model is unsupported and proceeding would waste time. A successful inspect is the green light for every stage that follows. + +--- + +### Step 2: Generate a build config + +Generate a `WinMLBuildConfig` JSON file for the model. For the primitive workflow this file is optional — you can drive each stage entirely through CLI flags — but generating it now gives you a versioned record of every auto-detected setting, and you can optionally pass it to `winml build` with `-c` in Section B. + +```bash +uv run winml config -m facebook/convnext-tiny-224 --device npu --precision int8 -o convnext_config.json +``` + +Open `convnext_config.json` to see what was auto-detected: the task, I/O tensor shapes, quantization parameters, and the compile target. 
The `--device npu --precision int8` flags tell the config generator to pre-populate the quantization and compile sections for NPU deployment rather than leaving them at defaults. + +!!! note "What we just did" + `winml config` auto-resolves every setting that would otherwise require you to look up flags manually. The resulting JSON is the single source of truth for a reproducible build. You can commit it to version control, share it with teammates, edit a single field to try a different precision, and replay the exact same build on any machine. See [Concepts → Config and build](../concepts/config-and-build.md) for a deeper look at the config schema and how the stages interact. + +--- + +### Step 3: Export to ONNX + +Download the pretrained weights and convert the PyTorch model to ONNX format. + +```bash +uv run winml export -m facebook/convnext-tiny-224 -o convnext.onnx +``` + +This runs an eight-stage export pipeline whose stages include model preparation, input generation, hierarchy building, ONNX conversion, node tagging, tag injection, and metadata generation. The result is a standards-compliant ONNX file with winml-cli's Hierarchy-preserving Tags Protocol (HTP) metadata embedded in node `metadata_props`. That metadata is what lets downstream tools make architecture-aware optimization decisions without hardcoded model knowledge. + +!!! note "What we just did" + The default export embeds hierarchy tags — a tree of source module names mapped onto ONNX nodes — so that the optimizer and analyzer can reason about the graph in terms of the original model structure rather than flat node lists. If you need a clean ONNX without that metadata (for compatibility with other tools), add `--no-hierarchy`. See [Concepts → Load and export](../concepts/load-and-export.md) for what hierarchy preservation adds and when it matters. 
+ +--- + +### Step 4: Analyze for EP compatibility + +Before spending time on optimization and quantization, check that the model's operators are supported by your target execution provider. + +```bash +uv run winml analyze -m convnext.onnx --ep qnn --device npu +``` + +The analyzer performs static analysis — no runtime required — and classifies every operator in the graph as **supported**, **partial**, or **unsupported** for the target EP. It reports a coverage summary, flags any operators that may fall back to CPU, and exits with code 0 for full support or 1 for partial support. + +For CPU fallback, run: + +```bash +uv run winml analyze -m convnext.onnx --ep cpu --device cpu +``` + +!!! note "What we just did" + Knowing your operator coverage before you quantize or compile saves you from discovering EP incompatibilities at the very last step of a long pipeline. ConvNeXt's operators (Conv, GELU, LayerNorm, Add) have broad support across QNN and OpenVINO, so this command should exit 0. If it exits 1, the output tells you which operators are problematic and includes recommendations for resolving them — typically by enabling a graph rewrite in the optimizer that fuses the unsupported pattern into a supported one. See [Concepts → Analyze and optimize](../concepts/analyze-and-optimize.md) for details on the analyzer's recommendation engine. + +--- + +### Step 5: Optimize the graph + +Apply graph-level optimizations: operator fusion, constant folding, shape inference, and EP-specific graph rewrites. + +```bash +uv run winml optimize -m convnext.onnx -o convnext_optim.onnx +``` + +The optimizer reports how many nodes it reduced. A typical ConvNeXt-tiny optimization fuses several element-wise sequences and removes redundant reshape operations, cutting the node count noticeably without changing model semantics. If you want to apply a specific preset suited to the Snapdragon NPU, add `--preset qnn-compatible` to disable fusions that QNN does not benefit from. + +!!! 
note "What we just did" + Graph optimization is a separate stage from quantization so that you can inspect the intermediate graph, compare node counts, and selectively enable or disable individual fusion passes using the `--enable-*` / `--disable-*` flags. Run `uv run winml optimize --list-capabilities` to see every registered optimization flag and its default state. Optimization always happens on the floating-point graph; quantization is applied after so that calibration statistics are computed on the already-fused topology. + +--- + +### Step 6: Quantize + +Insert QDQ (Quantize-Dequantize) nodes into the optimized graph using static calibration. This reduces model size and speeds up inference on hardware with integer execution units, which includes Snapdragon NPUs and Intel NPUs. + +```bash +uv run winml quantize -m convnext_optim.onnx -o convnext_int8.onnx --precision int8 --samples 32 +``` + +The quantizer generates 32 random calibration samples, runs them through the model to collect activation statistics, and uses those statistics (with the default `minmax` method) to set the quantization scale and zero-point for each tensor. Thirty-two samples is sufficient for a vision model with fixed-size inputs like ConvNeXt. For models with variable-length inputs or complex activation distributions, increase `--samples` to 64 or 128. + +!!! note "What we just did" + `--precision int8` sets both weights and activations to 8-bit integers, which is the precision most NPU compilers expect. The output model still contains standard `QuantizeLinear` and `DequantizeLinear` ONNX nodes, so it is portable and can run on any ONNX Runtime backend — you do not need special tooling to inspect it. See [Concepts → Quantization and QDQ](../concepts/quantization.md) for a detailed explanation of the QDQ node pattern, calibration methods, and how to choose between per-tensor and per-channel quantization. 
+ +--- + +### Step 7: Compile for the target EP + +Compilation converts the portable quantized ONNX into an EP-specific binary format that the execution provider can load directly, skipping JIT compilation at inference time. This is the step that produces a device-locked artifact tied to the selected EP. + +The examples below use the default compiler backend (`--compiler ort`), which uses ONNX Runtime's built-in EP context compiler: + +=== "Qualcomm NPU" + + ```bash + uv run winml compile -m convnext_int8.onnx --device npu --ep qnn + ``` + +=== "Intel NPU" + + ```bash + uv run winml compile -m convnext_int8.onnx --device npu --ep openvino + ``` + +=== "AMD NPU" + + ```bash + uv run winml compile -m convnext_int8.onnx --device npu --ep vitisai + ``` + +=== "CPU" + + ```bash + uv run winml compile -m convnext_int8.onnx --device cpu + ``` + +The compiled output file appears in the same directory as the input model. The file name follows the pattern `convnext_int8_npu_ctx.onnx` (using the resolved device string `npu`, not the EP name) and an accompanying `.bin` context binary is written alongside it (unless `--embed` is passed, which embeds the binary inside the ONNX file). CPU builds do not produce a new artifact — the compile step validates EP compatibility but writes no output file; use `convnext_int8.onnx` directly for CPU inference. + +!!! note "What we just did" + Compilation embeds EP context — the compiled binary — inside or alongside the ONNX file using the `EPContext` node convention. At inference time the runtime loads the pre-compiled binary directly rather than re-compiling from the ONNX graph, eliminating the 15–60 second JIT penalty on first load. The default `--compiler ort` backend bundles compilation within ONNX Runtime itself. See [Concepts → Compile and EPContext](../concepts/compile-and-epcontext.md) for the full picture of what gets embedded and how the context is consumed at runtime. 
+ +--- + +### Step 8: Benchmark + +Measure inference latency and throughput with the `--monitor` flag to see live NPU utilization alongside the timing numbers. + +=== "QNN NPU" + + ```bash + uv run winml perf -m convnext_int8_npu_ctx.onnx --device npu --iterations 50 --monitor + ``` + +=== "OpenVINO NPU" + + ```bash + uv run winml perf -m convnext_int8_npu_ctx.onnx --device npu --ep openvino --iterations 50 --monitor + ``` + +=== "CPU" + + ```bash + uv run winml perf -m convnext_int8.onnx --device cpu --iterations 50 + ``` + +A representative run on a Snapdragon X Elite NPU produces output like the following: + +```text +Device: npu +Task: image-classification +Iterations: 50 (+ 10 warmup) +Batch Size: 1 + +Latency (ms) + Avg P50 P90 P95 P99 Min Max Std + 2.14 2.11 2.31 2.38 2.59 1.98 2.71 0.14 + +Throughput: 467.29 samples/sec + +Hardware (during benchmark) + NPU: 72.4% avg, 89.1% peak | CPU: 3.2% avg + Sys Mem: 1842 MB | Device Mem: 48/12 MB (local/shared) +``` + +The CPU fallback (same model, `--device cpu`) will typically show latencies 8–15x higher and near-zero NPU utilization. The contrast between those two runs is the best proof that your NPU path is actually being used. + +!!! note "What we just did" + `winml perf` generates random inputs matching the model's I/O spec, runs the configured number of warmup iterations (excluded from statistics), then the benchmark iterations, and reports full latency percentiles alongside throughput. The `--monitor` flag activates live hardware utilization polling at 200 ms intervals, displaying an in-terminal chart and attaching the hardware metrics to the JSON report saved alongside the console output. See [Concepts → Perf and monitoring](../concepts/perf-and-monitoring.md) for how to interpret the utilization numbers and what `hw_monitor` fields look like in the JSON report. 
+ +--- + +### Step 9 (optional): Evaluate accuracy + +After quantization it is good practice to verify that INT8 accuracy is close to the FP32 baseline. The `winml eval` command runs the model against a held-out dataset slice and reports task-relevant metrics. + +```bash +uv run winml eval -m convnext_int8.onnx --model-id facebook/convnext-tiny-224 --dataset imagenet-1k --split validation --samples 100 --device npu +``` + +The `--model-id` flag is required when passing an ONNX file, because the evaluator needs it to locate the preprocessor and label mappings. The command downloads 100 shuffled validation samples, runs inference, and reports top-1 and top-5 accuracy. A well-quantized ConvNeXt-tiny should lose less than 0.5 percentage points of top-1 accuracy compared to the floating-point checkpoint. + +!!! note "What we just did" + Accuracy evaluation gives you a principled stopping criterion for quantization decisions. If the accuracy drop is larger than acceptable, return to Step 6 and try `--precision int16` or per-channel quantization (`--per-channel`) instead of the default per-tensor int8. See [Concepts → Eval and datasets](../concepts/eval-and-datasets.md) for the full list of supported datasets, tasks, and column mapping options. + +--- + +## Section B — One-shot with `winml build` + +Once you understand what each primitive stage does (which you now do), you can collapse the entire pipeline into a single command. `winml build` orchestrates export, optimize, quantize, and compile in sequence. + +```bash +uv run winml build -m facebook/convnext-tiny-224 -o convnext_out/ --device npu --precision int8 +``` + +!!! tip "Config file is optional" + The `-c config.json` flag is optional. Without it, `winml build` auto-generates an internal config from the flags you pass (like `--device` and `--precision`). If you need a reusable config, generate one with [`winml config`](../commands/config.md). 
+ +The command downloads the pretrained weights, runs all four pipeline stages, and writes every intermediate and final artifact into `convnext_out/`. The stage timing is printed as each stage completes, and the final line tells you the path of the compiled model. + +You can selectively skip stages using the override flags: + +- `--no-optimize` — skip graph optimization (rarely needed; useful if you have a pre-optimized ONNX) +- `--no-quant` — skip quantization (produces a floating-point compiled model) +- `--no-compile` — skip compilation (produces a quantized but not device-locked ONNX) + +For example, to produce an optimized and quantized model without the compile step: + +```bash +uv run winml build -m facebook/convnext-tiny-224 -o convnext_out/ --device npu --precision int8 --no-compile +``` + +!!! note "What we just did" + `winml build` is the production workflow. It guarantees that stages run in the correct order, passes intermediate artifacts through the pipeline automatically, and records which stages completed or were skipped in the result summary. + +Once the build completes, benchmark the final artifact from `convnext_out/`: + +```bash +uv run winml perf -m convnext_out/model.onnx --device npu --iterations 50 --monitor +``` + +The latency and throughput should be close to what you saw in Step 8 (benchmark numbers vary slightly from run to run), which indicates that the `winml build` pipeline produces a model equivalent to the one from the manual primitive chain. 
+ +--- + +## Where to go next + +- [Concepts → How winml-cli works](../concepts/how-it-works.md) — the full mental model for the pipeline +- [Concepts → Compile and EPContext](../concepts/compile-and-epcontext.md) — understanding the compiled artifact format +- [Commands → Overview](../commands/overview.md) — quick reference for every flag on every command + +## See also + +- [Concepts → Quantization and QDQ](../concepts/quantization.md) +- [Concepts → Analyze and optimize](../concepts/analyze-and-optimize.md) +- [Concepts → Perf and monitoring](../concepts/perf-and-monitoring.md) +- [Concepts → Eval and datasets](../concepts/eval-and-datasets.md) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..c6c21debe --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,137 @@ +site_name: Windows ML CLI +site_description: A CLI toolkit to build portable, performant, and high-quality models for Windows ML. +site_url: https://microsoft.github.io/winml-cli/ +repo_url: https://github.com/microsoft/winml-cli +repo_name: microsoft/winml-cli +edit_uri: edit/main/docs/ + +docs_dir: docs + +exclude_docs: | + /design/ + /naming-convention.md + /pytest-best-practices.md + +extra: + version: + provider: mike + default: latest + +extra_css: + - stylesheets/extra.css + +theme: + name: material + features: + - navigation.instant + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.top + - content.code.copy + - content.action.edit + - toc.follow + - search.suggest + - search.highlight + palette: + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode + +plugins: + - search + - mike: + version_selector: true + css_dir: css + javascript_dir: js + +markdown_extensions: + - admonition + - 
attr_list + - md_in_html + - tables + - toc: + permalink: true + - pymdownx.details + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.tabbed: + alternate_style: true + - pymdownx.tasklist: + custom_checkbox: true + +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Quickstart: getting-started/quickstart.md + - UI Quickstart: getting-started/ui-quickstart.md + - Use with AI Agent: getting-started/agent-skill.md + - Concepts: + - Fundamentals: + - How winml-cli works: concepts/how-it-works.md + - Graph and IR: concepts/graphs-and-ir.md + - Weight and Activation: concepts/weight-and-activation.md + - Datatype and Quantization: concepts/quantization.md + - EP and Device: concepts/eps-and-devices.md + - WinML CLI: + - Load and export: concepts/load-and-export.md + - Primitives and pipeline: concepts/primitives-and-pipeline.md + - Analyze and optimize: concepts/analyze-and-optimize.md + - Compile and EPContext: concepts/compile-and-epcontext.md + - Perf and monitoring: concepts/perf-and-monitoring.md + - Eval and datasets: concepts/eval-and-datasets.md + - Config and build: concepts/config-and-build.md + - Commands: + - Overview: commands/overview.md + - Discover: + - sys: commands/sys.md + - inspect: commands/inspect.md + - catalog: commands/catalog.md + - Configure: + - config: commands/config.md + - Build: + - export: commands/export.md + - analyze: commands/analyze.md + - optimize: commands/optimize.md + - quantize: commands/quantize.md + - compile: commands/compile.md + - build: commands/build.md + - Measure: + - perf: commands/perf.md + - eval: commands/eval.md + - Samples: + - BERT — Config + Build + Perf: samples/bert-config-build.md + - CLIP — Composite Models: 
samples/clip-composite.md + - Tutorials: + - Overview: tutorials/index.md + - Hugging Face Model to NPU: tutorials/npu-convnext.md + - Bring Your Own ONNX Model: tutorials/build-from-onnx.md + - Reference: + - Config Schema: reference/index.md + - Output Layout: reference/output-layout.md + - Supported Models: reference/supported-models.md + - Python API: reference/python-api.md + - Troubleshooting: troubleshooting.md + - Contributing: contributing.md + - Privacy: Privacy.md diff --git a/pyproject.toml b/pyproject.toml index 64a80427b..a5cfc74df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,8 +75,11 @@ optional-dependencies.dev = [ "jupyter>=1.1.1", "markdown-it-py>=3", "matplotlib>=3.10", + "mkdocs-jupyter>=0.25", + "mkdocs-material>=9.5", "mypy>=1.18", "nbconvert>=7.16", + "pymdown-extensions>=10.7", "pytest>=8.4", "pytest-cov>=7", "pytest-timeout>=2.3",