From 356394cd4b161da7a9b73cbc423317d858e980eb Mon Sep 17 00:00:00 2001 From: Yue Sun Date: Fri, 5 Jun 2026 16:44:52 +0800 Subject: [PATCH 1/5] feat(e2e_eval): add --build-only mode with per-EP build matrix Add a --build-only mode to run_eval.py that runs config + build with --no-compile, writing each pipeline stage's ONNX (export/optimize/quantize) without requiring execution-provider hardware. Perf and accuracy are skipped. When --ep/--device are omitted, every model is built once per EP in the build-only matrix (qnn npu/gpu, openvino cpu/npu/gpu, mlas, dml, vitisai) into <model_dir>/<ep>_<device>/ subdirs. When either is pinned, a single build is written directly into the model dir. Precision per combo reuses the existing _resolve_precision policy (NPU w8a16, CPU/GPU auto, native-quant EPs unquantized). Reuses the existing _run_build via a build_only flag (-o --no-compile instead of --use-cache). --- scripts/e2e_eval/README.md | 38 ++++++++++ scripts/e2e_eval/run_eval.py | 137 ++++++++++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 3 deletions(-) diff --git a/scripts/e2e_eval/README.md b/scripts/e2e_eval/README.md index 8d018d3b6..c75cfec24 100644 --- a/scripts/e2e_eval/README.md +++ b/scripts/e2e_eval/README.md @@ -91,6 +91,44 @@ uv run python scripts/e2e_eval/run_eval.py --retry-failed | `--verbose` | off | Print stderr for failed models | | `--continue` | off | Skip models with existing results | | `--retry-failed [TYPE ...]` | — | Re-run failed models (implies `--continue`) | +| `--build-only` | off | Build with `--no-compile`, writing each stage's ONNX (no EP needed). Loops the EP matrix when `--ep`/`--device` omitted | + +#### `--build-only` — Generate per-stage models (no EP required) + +`--build-only` runs config + build with `--no-compile`, writing each stage's ONNX — +`export.onnx`, `optimized.onnx`, `quantized.onnx`. Because compile is skipped, this +needs **no execution-provider hardware** and runs on any CPU machine. Perf and accuracy +phases are skipped. 
+ +When `--ep`/`--device` are **omitted**, every model is built once per EP in the +build-only matrix, each into a `<ep>_<device>/` subdir: + +| Label | EP | Device | +|---|---|---| +| `qnn_npu` | qnn | npu | +| `qnn_gpu` | qnn | gpu | +| `ov_cpu` | openvino | cpu | +| `ov_npu` | openvino | npu | +| `ov_gpu` | openvino | gpu | +| `mlas_cpu` | cpu (MLAS) | cpu | +| `dml_gpu` | dml | gpu | +| `vitisai_npu` | vitisai | npu | + +Precision per combo follows the eval policy: NPU defaults to `w8a16`, CPU/GPU omit the +flag (winml auto), and native-quant EPs (VitisAI) are built unquantized (`--no-quant`). +When `--ep` or `--device` is pinned, a single build is written directly into +`<output_dir>/models/<model>/`. + +```bash +# Build all EP-matrix variants for P0 models (8 builds per model) +uv run python scripts/e2e_eval/run_eval.py --build-only --priority P0 + +# Pin a single EP/device (no matrix; writes directly to model dir) +uv run python scripts/e2e_eval/run_eval.py --build-only --hf-model microsoft/resnet-50 --ep qnn --device npu +``` + +Composite models (multiple sub-components) are built into per-component subdirectories +under each EP subdir. ### `generate_report.py` — Regenerate Reports diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 6eefb924b..c3cf6d6fb 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -484,6 +484,7 @@ def _run_build( timeout: int, model_dir: Path, ep: str | None = None, + build_only: bool = False, ) -> dict: """Run winml config + winml build for one model. Returns build result dict. @@ -492,6 +493,11 @@ def _run_build( Single models produce one config; composite models (e.g., T5 translation) produce one per sub-component (suffixed names). Both go through the same build loop — single model is just the list-of-1 case. 
+ + When ``build_only`` is set, each build writes its artifacts to ``model_dir`` + via ``-o`` (preserving the intermediate export/optimized/quantized ONNX) and + skips compile (``--no-compile``) — no execution provider is required. + Otherwise the build populates the global cache (``--use-cache``). """ config_path = model_dir / "build_config.json" model_dir.mkdir(parents=True, exist_ok=True) @@ -564,10 +570,16 @@ def _run_build( str(sub_cfg), "-m", entry.hf_id, - "--use-cache", - "--device", - device, ] + if build_only: + # Write artifacts to disk and skip compile (no EP required). + # Composite components get a subdir to avoid name collisions. + build_out = model_dir / label if label else model_dir + build_out.mkdir(parents=True, exist_ok=True) + build_args += ["-o", str(build_out), "--no-compile"] + else: + build_args += ["--use-cache"] + build_args += ["--device", device] if ep: build_args += ["--ep", ep] # Mirror the --no-quant passed to winml config above so the build @@ -658,6 +670,107 @@ def _find_cached_model(hf_id: str, build_proc: dict, task: str | None = None) -> return str(model_files[0]) if model_files else None +# --------------------------------------------------------------------------- +# Build-only phase (export + optimize + quantize, no compile / no EP hardware) +# --------------------------------------------------------------------------- + +# EP matrix generated by --build-only when neither --ep nor --device is pinned. +# This is the eval test matrix (deliberately broader than the canonical +# get_ep_device_map): qnn on npu+gpu, OpenVINO on cpu+npu+gpu, MLAS (native CPU +# EP), DirectML on gpu, and VitisAI on npu. Each combo is built into its own +# /