diff --git a/scripts/e2e_eval/README.md b/scripts/e2e_eval/README.md index 8d018d3b6..fa70813f2 100644 --- a/scripts/e2e_eval/README.md +++ b/scripts/e2e_eval/README.md @@ -91,6 +91,105 @@ uv run python scripts/e2e_eval/run_eval.py --retry-failed | `--verbose` | off | Print stderr for failed models | | `--continue` | off | Skip models with existing results | | `--retry-failed [TYPE ...]` | — | Re-run failed models (implies `--continue`) | +| `--build-only` | off | Build with `--no-compile`, writing each stage's ONNX (no EP needed). Loops the EP matrix when `--ep`/`--device` omitted | + +#### `--build-only` — Generate per-stage models (no EP required) + +`--build-only` runs config + build with `--no-compile`, writing each stage's ONNX — +`export.onnx`, `optimized.onnx`, `quantized.onnx`. Because compile is skipped, this +needs **no execution-provider hardware** and runs on any CPU machine. Perf and accuracy +phases are skipped. + +When `--ep`/`--device` are **omitted**, every model is built once per EP in the +build-only matrix, each into a `<ep>_<device>/` subdir: + +| Label | EP | Device | +|---|---|---| +| `qnn_npu` | qnn | npu | +| `qnn_gpu` | qnn | gpu | +| `ov_cpu` | openvino | cpu | +| `ov_npu` | openvino | npu | +| `ov_gpu` | openvino | gpu | +| `mlas_cpu` | cpu (MLAS) | cpu | +| `dml_gpu` | dml | gpu | +| `vitisai_npu` | vitisai | npu | + +Precision per combo follows the eval policy: NPU defaults to `w8a16`, CPU/GPU omit the +flag (winml auto), and native-quant EPs (VitisAI) are built unquantized (`--no-quant`). +When `--ep` or `--device` is pinned, a single build is written directly into +`<output_dir>/models/<model>/`. 
+ +```bash +# Build all EP-matrix variants for P0 models (8 builds per model) +uv run python scripts/e2e_eval/run_eval.py --build-only --priority P0 + +# Pin a single EP/device (no matrix; writes directly to model dir) +uv run python scripts/e2e_eval/run_eval.py --build-only --hf-model microsoft/resnet-50 --ep qnn --device npu +``` + +Composite models (multiple sub-components) are built into per-component subdirectories +under each EP subdir. + +**Export dedup**: the `export.onnx` stage is EP/device-independent, so it is identical +across all matrix combos. It is stored once under `<model_dir>/_shared/export.onnx` +and removed from each `<ep>_<device>/` subdir, keeping only one copy on disk. + +#### Streaming upload to the Azure Artifacts feed (`--upload`) + +Running the full matrix over many models fills the local disk fast. `--upload` +publishes each model's artifacts to the **`Modelkit`** Azure Artifacts feed +(Universal Package) as soon as its combos are built, then deletes the local copy — +so peak disk stays at roughly one model's matrix. + +- **Auth**: uses `az login` (Entra ID) — no PAT. The script verifies the + `azure-devops` az extension is installed (auto-adds it) and that you're logged in; + if not, it aborts (so disk isn't silently filled). +- **Package**: one package `winml-cli-models`, **one version per model**, named + `0.0.0-<run-stamp>-<model-slug>` where the run-stamp is a date (default today, + `YYYYMMDD`). e.g. `0.0.0-20260609-microsoft-resnet-50-image-classification` + (the `0.0.0-` core keeps it valid SemVer 2.0; the stamp+slug are the + pre-release segment). The shared run-stamp prefix groups a batch together. +- A `build_only_uploads.json` manifest (version → run-stamp → combos → status) is + written in the output dir; it drives `--continue`. 
+ +```bash +# Build the matrix and stream each model to the feed, deleting locals +uv run python scripts/e2e_eval/run_eval.py --build-only --upload --priority P0 + +# Resume an interrupted batch: same run-stamp + --continue skips models already +# uploaded (per the manifest) without rebuilding them. +uv run python scripts/e2e_eval/run_eval.py --build-only --upload --continue \ + --run-stamp 20260609 --priority P0 + +# --upload-skip-existing: if the feed already has a version (e.g. manifest lost), +# treat the publish conflict as done and delete the local copy. +uv run python scripts/e2e_eval/run_eval.py --build-only --upload --upload-skip-existing + +# Upload but keep local copies (debug) +uv run python scripts/e2e_eval/run_eval.py --build-only --upload --keep-local +``` + +Download a specific model's specific file later with `--file-filter`: + +```bash +az artifacts universal download \ + --organization https://dev.azure.com/microsoft --project windows.ai.toolkit \ + --scope project --feed Modelkit --name winml-cli-models \ + --version 0.0.0-20260609-microsoft-resnet-50-image-classification \ + --path ./out --file-filter 'qnn_npu/quantized.onnx' +``` + +| Upload flag | Default | Description | +|---|---|---| +| `--upload` | off | Publish each model dir to the feed, then delete it locally | +| `--run-stamp` | today (`YYYYMMDD`) | Version prefix; pass the same stamp + `--continue` to resume | +| `--continue` | off | Skip models already uploaded for this run-stamp (no rebuild) | +| `--feed` | `Modelkit` | Azure Artifacts feed name | +| `--feed-org` | `https://dev.azure.com/microsoft` | Azure DevOps org URL | +| `--feed-project` | `windows.ai.toolkit` | Project for the project-scoped feed | +| `--package-name` | `winml-cli-models` | Universal Package name | +| `--keep-local` | off | Upload but do not delete the local dir | +| `--upload-skip-existing` | off | Treat an existing feed version as done (feed-based resume) | ### `generate_report.py` — Regenerate Reports diff 
--git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 6eefb924b..bb38dc5bd 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -30,6 +30,7 @@ import argparse import contextlib +import hashlib import json import logging import os @@ -484,6 +485,7 @@ def _run_build( timeout: int, model_dir: Path, ep: str | None = None, + build_only: bool = False, ) -> dict: """Run winml config + winml build for one model. Returns build result dict. @@ -492,6 +494,11 @@ def _run_build( Single models produce one config; composite models (e.g., T5 translation) produce one per sub-component (suffixed names). Both go through the same build loop — single model is just the list-of-1 case. + + When ``build_only`` is set, each build writes its artifacts to ``model_dir`` + via ``-o`` (preserving the intermediate export/optimized/quantized ONNX) and + skips compile (``--no-compile``) — no execution provider is required. + Otherwise the build populates the global cache (``--use-cache``). """ config_path = model_dir / "build_config.json" model_dir.mkdir(parents=True, exist_ok=True) @@ -564,10 +571,16 @@ def _run_build( str(sub_cfg), "-m", entry.hf_id, - "--use-cache", - "--device", - device, ] + if build_only: + # Write artifacts to disk and skip compile (no EP required). + # Composite components get a subdir to avoid name collisions. + build_out = model_dir / label if label else model_dir + build_out.mkdir(parents=True, exist_ok=True) + build_args += ["-o", str(build_out), "--no-compile"] + else: + build_args += ["--use-cache"] + build_args += ["--device", device] if ep: build_args += ["--ep", ep] # Mirror the --no-quant passed to winml config above so the build @@ -587,6 +600,15 @@ def _run_build( "proc": build_proc, } + if build_only: + # In build-only mode the artifacts go to ``-o `` (no + # cache, no compile). There is no "Final artifact:" marker to + # parse and no downstream consumer of the path -- exit-code 0 is + # the success signal. 
Record build_out so the per-component + # bookkeeping (len(onnx_paths) == len(sub_configs)) stays valid. + onnx_paths[label] = str(build_out) + continue + task_hint = _extract_task_from_config(sub_cfg) or entry.task path = _extract_onnx_path(build_proc, entry.hf_id, task_hint) if path: @@ -658,6 +680,498 @@ def _find_cached_model(hf_id: str, build_proc: dict, task: str | None = None) -> return str(model_files[0]) if model_files else None +# --------------------------------------------------------------------------- +# Build-only phase (export + optimize + quantize, no compile / no EP hardware) +# --------------------------------------------------------------------------- + +# EP matrix generated by --build-only when neither --ep nor --device is pinned. +# This is the eval test matrix (deliberately broader than the canonical +# get_ep_device_map): qnn on npu+gpu, OpenVINO on cpu+npu+gpu, MLAS (native CPU +# EP), DirectML on gpu, and VitisAI on npu. Each combo is built into its own +# /