From 356394cd4b161da7a9b73cbc423317d858e980eb Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Fri, 5 Jun 2026 16:44:52 +0800
Subject: [PATCH 1/5] feat(e2e_eval): add --build-only mode with per-EP build
 matrix

Add a --build-only mode to run_eval.py that runs config + build with
--no-compile, writing each pipeline stage's ONNX (export/optimize/quantize)
without requiring execution-provider hardware. Perf and accuracy are skipped.

When --ep/--device are omitted, every model is built once per EP in the
build-only matrix (qnn npu/gpu, openvino cpu/npu/gpu, mlas, dml, vitisai)
into <model_dir>/<ep>_<device>/ subdirs. When either is pinned, a single
build is written directly into the model dir. Precision per combo reuses
the existing _resolve_precision policy (NPU w8a16, CPU/GPU auto, native-quant
EPs unquantized).

Reuses the existing _run_build via a build_only flag (-o <dir> --no-compile
instead of --use-cache).
---
 scripts/e2e_eval/README.md   |  38 ++++++++++
 scripts/e2e_eval/run_eval.py | 137 ++++++++++++++++++++++++++++++++++-
 2 files changed, 172 insertions(+), 3 deletions(-)
diff --git a/scripts/e2e_eval/README.md b/scripts/e2e_eval/README.md
index 8d018d3b6..c75cfec24 100644
--- a/scripts/e2e_eval/README.md
+++ b/scripts/e2e_eval/README.md
@@ -91,6 +91,44 @@ uv run python scripts/e2e_eval/run_eval.py --retry-failed
 | `--verbose` | off | Print stderr for failed models |
 | `--continue` | off | Skip models with existing results |
 | `--retry-failed [TYPE ...]` | — | Re-run failed models (implies `--continue`) |
+| `--build-only` | off | Build with `--no-compile`, writing each stage's ONNX (no EP needed). Loops over the EP matrix when `--ep`/`--device` are omitted |
+
+#### `--build-only` — Generate per-stage models (no EP required)
+
+`--build-only` runs config + build with `--no-compile`, writing each stage's ONNX —
+`export.onnx`, `optimized.onnx`, `quantized.onnx`. Because compile is skipped, this
+needs **no execution-provider hardware** and runs on any CPU machine. Perf and accuracy
+phases are skipped.
+
+When `--ep`/`--device` are **omitted**, every model is built once per EP/device combo in
+the build-only matrix, each into a subdir named after its label (e.g. `ov_cpu/`, not `openvino_cpu/`):
+
+| Label | EP | Device |
+|---|---|---|
+| `qnn_npu` | qnn | npu |
+| `qnn_gpu` | qnn | gpu |
+| `ov_cpu` | openvino | cpu |
+| `ov_npu` | openvino | npu |
+| `ov_gpu` | openvino | gpu |
+| `mlas_cpu` | cpu (MLAS) | cpu |
+| `dml_gpu` | dml | gpu |
+| `vitisai_npu` | vitisai | npu |
+
+Precision per combo follows the eval policy: NPU defaults to `w8a16`, CPU/GPU omit the
+flag (winml auto), and native-quant EPs (VitisAI) are built unquantized (`--no-quant`).
+When `--ep` or `--device` is pinned, a single build is written directly into
+`<output-dir>/models/<slug>/`.
+
+```bash
+# Build all EP-matrix variants for P0 models (8 builds per model)
+uv run python scripts/e2e_eval/run_eval.py --build-only --priority P0
+
+# Pin a single EP/device (no matrix; writes directly to model dir)
+uv run python scripts/e2e_eval/run_eval.py --build-only --hf-model microsoft/resnet-50 --ep qnn --device npu
+```
+
+Composite models (multiple sub-components) are built into per-component subdirectories
+under each EP subdir.
 
 ### `generate_report.py` — Regenerate Reports
 
diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 6eefb924b..c3cf6d6fb 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -484,6 +484,7 @@ def _run_build(
     timeout: int,
     model_dir: Path,
     ep: str | None = None,
+    build_only: bool = False,
 ) -> dict:
     """Run winml config + winml build for one model. Returns build result dict.
 
@@ -492,6 +493,11 @@ def _run_build(
     Single models produce one config; composite models (e.g., T5 translation)
     produce one per sub-component (suffixed names). Both go through the same
     build loop — single model is just the list-of-1 case.
+
+    When ``build_only`` is set, each build writes its artifacts to ``model_dir``
+    via ``-o`` (preserving the intermediate export/optimized/quantized ONNX) and
+    skips compile (``--no-compile``) — no execution provider is required.
+    Otherwise the build populates the global cache (``--use-cache``).
     """
     config_path = model_dir / "build_config.json"
     model_dir.mkdir(parents=True, exist_ok=True)
@@ -564,10 +570,16 @@ def _run_build(
             str(sub_cfg),
             "-m",
             entry.hf_id,
-            "--use-cache",
-            "--device",
-            device,
         ]
+        if build_only:
+            # Write artifacts to disk and skip compile (no EP required).
+            # Composite components get a subdir to avoid name collisions.
+            build_out = model_dir / label if label else model_dir
+            build_out.mkdir(parents=True, exist_ok=True)
+            build_args += ["-o", str(build_out), "--no-compile"]
+        else:
+            build_args += ["--use-cache"]
+        build_args += ["--device", device]
         if ep:
             build_args += ["--ep", ep]
         # Mirror the --no-quant passed to winml config above so the build
@@ -658,6 +670,107 @@ def _find_cached_model(hf_id: str, build_proc: dict, task: str | None = None) ->
     return str(model_files[0]) if model_files else None
 
 
+# ---------------------------------------------------------------------------
+# Build-only phase (export + optimize + quantize, no compile / no EP hardware)
+# ---------------------------------------------------------------------------
+
+# EP matrix generated by --build-only when neither --ep nor --device is pinned.
+# This is the eval test matrix (deliberately broader than the canonical
+# get_ep_device_map): qnn on npu+gpu, OpenVINO on cpu+npu+gpu, MLAS (native CPU
+# EP), DirectML on gpu, and VitisAI on npu. Each combo is built into its own
+# <model_dir>/<label>/ subdir.
+_BUILD_ONLY_EP_MATRIX: tuple[tuple[str, str, str], ...] = (
+    ("qnn_npu", "qnn", "npu"),
+    ("qnn_gpu", "qnn", "gpu"),
+    ("ov_cpu", "openvino", "cpu"),
+    ("ov_npu", "openvino", "npu"),
+    ("ov_gpu", "openvino", "gpu"),
+    ("mlas_cpu", "cpu", "cpu"),
+    ("dml_gpu", "dml", "gpu"),
+    ("vitisai_npu", "vitisai", "npu"),
+)
+
+
+def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None:
+    """Build each model to disk with --no-compile (no execution provider needed).
+
+    ``winml build -o <dir> --no-compile`` writes the per-stage artifacts
+    (export.onnx, optimized.onnx, quantized.onnx). Perf and accuracy are skipped.
+
+    When neither --ep nor --device is pinned, every model is built once per EP in
+    :data:`_BUILD_ONLY_EP_MATRIX`, each into a ``<model_dir>/<ep>_<device>/``
+    subdir. When --ep or --device is pinned, a single build is written directly
+    into ``<model_dir>``. Precision per combo follows the same policy as the eval
+    path (NPU defaults to w8a16; CPU/GPU omit the flag; native-quant EPs skip).
+    """
+    output_dir = args.output_dir or Path(f"eval_results/{date.today().isoformat()}")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    save_environment_info(output_dir / "environment.json")
+
+    use_matrix = args.ep is None and args.device == "auto"
+    # Single combo uses an empty label (no subdir); matrix uses (label, ep, device).
+    combos = list(_BUILD_ONLY_EP_MATRIX) if use_matrix else [("", args.ep, args.device)]
+
+    safe_print(f"Build-only: {len(entries)} models -> {output_dir}")
+    if use_matrix:
+        safe_print(
+            f"EP matrix ({len(combos)}): {', '.join(c[0] for c in combos)} | "
+            f"Timeout: {args.timeout}s | Compile skipped (no EP required)"
+        )
+    else:
+        safe_print(
+            f"Device: {args.device} | EP: {args.ep or 'auto'} | "
+            f"Timeout: {args.timeout}s | Compile skipped (no EP required)"
+        )
+
+    total_builds = len(entries) * len(combos)
+    succeeded = 0
+    interrupted = False
+
+    for i, entry in enumerate(entries, 1):
+        label = f"{entry.hf_id} / {entry.task}" if entry.task else entry.hf_id
+        model_dir = model_result_dir(output_dir, entry.hf_id, entry.task)
+        safe_print(f"\n[{i}/{len(entries)}] {label}  ({entry.priority}, {entry.group})")
+
+        for combo_label, ep, device in combos:
+            build_dir = model_dir / combo_label if combo_label else model_dir
+            tag = f"  [{combo_label}]" if combo_label else ""
+            precision = _resolve_precision(device, entry.precision, ep=ep)
+            try:
+                build = _run_build(
+                    entry,
+                    device,
+                    precision,
+                    args.timeout,
+                    build_dir,
+                    ep=ep,
+                    build_only=True,
+                )
+            except KeyboardInterrupt:
+                safe_print("\n\n[Ctrl+C] Interrupted.")
+                interrupted = True
+                break
+
+            if build["success"]:
+                succeeded += 1
+                safe_print(f"  [OK]{tag} artifacts -> {build_dir}")
+            else:
+                safe_print(f"  [FAIL @ {build['stage']}]{tag}")
+                if args.verbose:
+                    proc = build.get("proc") or {}
+                    combined = (proc.get("stdout", "") + proc.get("stderr", "")).strip()
+                    for line in combined.splitlines()[-12:]:
+                        safe_print(f"    {line}")
+
+            if args.clean_cache:
+                _clear_disk_caches()
+
+        if interrupted:
+            break
+
+    safe_print(f"\nBuild-only complete: {succeeded}/{total_builds} builds -> {output_dir}")
+
+
 # ---------------------------------------------------------------------------
 # Perf phase
 # ---------------------------------------------------------------------------
@@ -1213,6 +1326,18 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--group", help="Filter by group")
     parser.add_argument("--device", default="auto", help="Target device (default: auto)")
     parser.add_argument("--ep", default=None, help="Execution provider (e.g. qnn, dml, ov)")
+    parser.add_argument(
+        "--build-only",
+        dest="build_only",
+        action="store_true",
+        help=(
+            "Build-only mode: run config + build with --no-compile and write each "
+            "stage's ONNX (export/optimize/quantize) to the output dir. No execution "
+            "provider required; perf/accuracy are skipped. When --ep/--device are "
+            "omitted, builds once per EP in the build-only matrix "
+            "(qnn/openvino/mlas/dml/vitisai) into <model_dir>/<ep>_<device>/ subdirs."
+        ),
+    )
     parser.add_argument(
         "--timeout", type=int, default=600, help="Per-subprocess timeout in seconds (default: 600)"
     )
@@ -1376,6 +1501,12 @@ def main() -> None:
         safe_print(f"Wrote {len(model_list)} models to {args.list_json}")
         sys.exit(0)
 
+    # Build-only mode: generate export+optimize+quantize artifacts only (no EP).
+    # Loops the EP matrix unless --ep/--device pinned. Skips perf/accuracy.
+    if args.build_only:
+        _run_build_only(entries, args)
+        return
+
     # 2. Setup output directory
     output_dir = args.output_dir or Path(f"eval_results/{date.today().isoformat()}")
     output_dir.mkdir(parents=True, exist_ok=True)

From 57e27de604ef4042e5182239270387e62ca257a5 Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Mon, 8 Jun 2026 15:25:50 +0800
Subject: [PATCH 2/5] fix(e2e_eval): make --build-only work for cross-EP
 cross-host builds

Two bugs surfaced when running `run_eval.py --build-only` against the EP matrix on a CPU-only host:

1. Every combo for the 'no native EP' subset (mlas/dml/openvino) was reported as `[FAIL @ complete]` even though export/optimize/quantize/model.onnx all landed correctly. `_run_build` was funnelling build-only results through `_extract_onnx_path`, which scans stdout for a `Final artifact:` marker that `winml build --no-compile` never prints, and falls back to the global cache which build-only doesn't populate (`-o <dir>` writes elsewhere). In build-only mode there is no downstream consumer of the path, so trust exit-code 0 directly and record `build_out` to keep the per-component bookkeeping balanced.

2. QNN/VitisAI combos failed at the optimize stage with `Requested EP 'qnn' is not available on this system`. `_run_optimize_stage` calls `resolve_device(device, ep=ep)` purely to pick the right `has_rule_data_for_ep` key for the progress bar, but that helper raises when the EP isn't installed locally -- even when the rest of the pipeline (export + optimize + quantize) runs on CPU and the EP is only needed at compile time. Soft-fail the lookup *only when* `config.compile is None` (i.e. `--no-compile` or a config that explicitly opts out); otherwise re-raise so configs that will compile still fail fast here instead of deep inside the compile stage.

Also moves `--clean-cache` from per-combo to per-model in `_run_build_only`: combos for the same model share the same HF download, so clearing between combos forced N redundant re-downloads of the same weights.
---
 scripts/e2e_eval/run_eval.py         | 22 ++++++++++++++++------
 src/winml/modelkit/commands/build.py | 22 ++++++++++++++++++----
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index c3cf6d6fb..07783139d 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -599,6 +599,15 @@ def _run_build(
                 "proc": build_proc,
             }
 
+        if build_only:
+            # In build-only mode the artifacts go to ``-o <build_out>`` (no
+            # cache, no compile). There is no "Final artifact:" marker to
+            # parse and no downstream consumer of the path -- exit-code 0 is
+            # the success signal. Record build_out so the per-component
+            # bookkeeping (len(onnx_paths) == len(sub_configs)) stays valid.
+            onnx_paths[label] = str(build_out)
+            continue
+
         task_hint = _extract_task_from_config(sub_cfg) or entry.task
         path = _extract_onnx_path(build_proc, entry.hf_id, task_hint)
         if path:
@@ -762,12 +771,15 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
                     for line in combined.splitlines()[-12:]:
                         safe_print(f"    {line}")
 
-            if args.clean_cache:
-                _clear_disk_caches()
-
         if interrupted:
             break
 
+        # Clean caches once per model (after all EP combos finish), not per
+        # combo: combos share the same HF download, so clearing between
+        # combos forces redundant re-downloads of the same weights.
+        if args.clean_cache:
+            _clear_disk_caches()
+
     safe_print(f"\nBuild-only complete: {succeeded}/{total_builds} builds -> {output_dir}")
 
 
@@ -922,9 +934,7 @@ def _build_dataset(ds_config: dict, timeout: int) -> None:
         return
 
     script_path = Path(build_script)
-    cache_dir = Path(
-        ds_config.get("dataset", EVAL_DATASETS_CACHE / script_path.stem)
-    ).expanduser()
+    cache_dir = Path(ds_config.get("dataset", EVAL_DATASETS_CACHE / script_path.stem)).expanduser()
 
     if (cache_dir / "dataset_info.json").exists():
         safe_print(f"    dataset: cached ({cache_dir})")
diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py
index 7ba2850c1..89b874d9e 100644
--- a/src/winml/modelkit/commands/build.py
+++ b/src/winml/modelkit/commands/build.py
@@ -637,9 +637,7 @@ def _patch_device(cfg: WinMLBuildConfig) -> None:
         # scratch state when the user passes the wrong file or a
         # hand-edited config (#P1 UX).
         _configs_to_validate: list[WinMLBuildConfig] = (
-            config_or_configs
-            if isinstance(config_or_configs, list)
-            else [config_or_configs]
+            config_or_configs if isinstance(config_or_configs, list) else [config_or_configs]
         )
         try:
             for _cfg in _configs_to_validate:
@@ -1014,10 +1012,26 @@ def _on_iteration_start(iteration: int, max_iter: int) -> None:
 
         # Resolve "auto" to a concrete device once so that has_rule_data_for_ep
         # doesn't search for non-existent "*_AUTO_*.parquet" files.
+        #
+        # ``_resolved_device`` is only used as a cosmetic key for
+        # ``has_rule_data_for_ep`` (decides whether to show a per-EP progress
+        # bar). When the build will *not* compile (``config.compile is None`` —
+        # either from ``--no-compile`` or a config that explicitly opts out)
+        # the EP doesn't need to be installed on this host: we're only
+        # exporting + optimizing + quantizing, all of which run on CPU. In
+        # that case fall back to the requested device string so cross-EP
+        # builds (e.g. emitting a QNN/VitisAI artifact on a CPU box) work.
+        # When compile *will* run, re-raise so the missing EP fails fast
+        # here instead of deep inside the compile stage.
         from ..analyze.utils.ep_utils import has_rule_data_for_ep
         from ..sysinfo import resolve_device as _resolve_device
 
-        _resolved_device, _ = _resolve_device(device=device or "auto", ep=ep)
+        try:
+            _resolved_device, _ = _resolve_device(device=device or "auto", ep=ep)
+        except ValueError:
+            if config.compile is not None:
+                raise
+            _resolved_device = device or ""
 
         def _on_ep_start(ep_name: EPName, operator_counts: dict) -> None:
             nonlocal _current_ep

From f84b10e719aeea96626c85604c0f271db9068372 Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Tue, 9 Jun 2026 15:53:39 +0800
Subject: [PATCH 3/5] feat(e2e_eval): build-only export dedup + stream-upload
 to Azure Artifacts feed

Running --build-only over the 8-EP matrix for many models fills local disk.
Two additions keep disk bounded:

1. Export dedup: the export.onnx stage is EP/device-independent, so every
   combo produces an identical export. After each combo builds, its export is
   hash-compared against a per-model canonical: the first is moved to
   <model_dir>/_shared/, later identical ones are deleted. One export copy on
   disk instead of 8.

2. --upload: after a model's combos are built, publish the model dir to the
   Modelkit Azure Artifacts feed as a Universal Package version, then delete it
   locally. Auth via az login (no PAT); the azure-devops extension is ensured
   and login verified up front (aborts otherwise so disk isn't silently filled).
   Version is 0.0.0-<run-stamp>-<model-slug> (valid SemVer 2.0; date stamp
   groups a batch). --continue + --run-stamp resume an interrupted batch from
   the build_only_uploads.json manifest without rebuilding uploaded models;
   --keep-local, --upload-skip-existing, and feed/package args round it out.
---
 scripts/e2e_eval/README.md   |  61 +++++++
 scripts/e2e_eval/run_eval.py | 312 ++++++++++++++++++++++++++++++++++-
 2 files changed, 367 insertions(+), 6 deletions(-)

diff --git a/scripts/e2e_eval/README.md b/scripts/e2e_eval/README.md
index c75cfec24..fa70813f2 100644
--- a/scripts/e2e_eval/README.md
+++ b/scripts/e2e_eval/README.md
@@ -130,6 +130,67 @@ uv run python scripts/e2e_eval/run_eval.py --build-only --hf-model microsoft/res
 Composite models (multiple sub-components) are built into per-component subdirectories
 under each EP subdir.
 
+**Export dedup**: the `export.onnx` stage is EP/device-independent, so it is expected to be
+identical across all matrix combos. The first combo's export is moved to `<model_dir>/_shared/`;
+each later export is hash-compared and removed if identical (a differing one is kept in place with a warning).
+
+#### Streaming upload to the Azure Artifacts feed (`--upload`)
+
+Running the full matrix over many models fills the local disk fast. `--upload`
+publishes each model's artifacts to the **`Modelkit`** Azure Artifacts feed
+(Universal Package) as soon as its combos are built, then deletes the local copy —
+so peak disk stays at roughly one model's matrix.
+
+- **Auth**: uses `az login` (Entra ID) — no PAT. The script verifies the
+  `azure-devops` az extension is installed (auto-adds it) and that you're logged in;
+  if not, it aborts (so disk isn't silently filled).
+- **Package**: one package `winml-cli-models`, **one version per model**, named
+  `0.0.0-<run-stamp>-<model-slug>` where the run-stamp is a date (default today,
+  `YYYYMMDD`). e.g. `0.0.0-20260609-microsoft-resnet-50-image-classification`
+  (the `0.0.0-` core keeps it valid SemVer 2.0; the stamp+slug are the
+  pre-release segment). The shared run-stamp prefix groups a batch together.
+- A `build_only_uploads.json` manifest (version → run-stamp → combos → status) is
+  written in the output dir; it drives `--continue`.
+
+```bash
+# Build the matrix and stream each model to the feed, deleting locals
+uv run python scripts/e2e_eval/run_eval.py --build-only --upload --priority P0
+
+# Resume an interrupted batch: same run-stamp + --continue skips models already
+# uploaded (per the manifest) without rebuilding them.
+uv run python scripts/e2e_eval/run_eval.py --build-only --upload --continue \
+  --run-stamp 20260609 --priority P0
+
+# --upload-skip-existing: if the feed already has a version (e.g. manifest lost),
+# treat the publish conflict as done and delete the local copy.
+uv run python scripts/e2e_eval/run_eval.py --build-only --upload --upload-skip-existing
+
+# Upload but keep local copies (debug)
+uv run python scripts/e2e_eval/run_eval.py --build-only --upload --keep-local
+```
+
+To download a single file from a model's package later, use `--file-filter`:
+
+```bash
+az artifacts universal download \
+  --organization https://dev.azure.com/microsoft --project windows.ai.toolkit \
+  --scope project --feed Modelkit --name winml-cli-models \
+  --version 0.0.0-20260609-microsoft-resnet-50-image-classification \
+  --path ./out --file-filter 'qnn_npu/quantized.onnx'
+```
+
+| Upload flag | Default | Description |
+|---|---|---|
+| `--upload` | off | Publish each model dir to the feed, then delete it locally |
+| `--run-stamp` | today (`YYYYMMDD`) | Version prefix; pass the same stamp + `--continue` to resume |
+| `--continue` | off | Skip models already uploaded for this run-stamp (no rebuild) |
+| `--feed` | `Modelkit` | Azure Artifacts feed name |
+| `--feed-org` | `https://dev.azure.com/microsoft` | Azure DevOps org URL |
+| `--feed-project` | `windows.ai.toolkit` | Project for the project-scoped feed |
+| `--package-name` | `winml-cli-models` | Universal Package name |
+| `--keep-local` | off | Upload but do not delete the local dir |
+| `--upload-skip-existing` | off | Treat an existing feed version as done (feed-based resume) |
+
 ### `generate_report.py` — Regenerate Reports
 
 Re-reads cached `result.json` files and regenerates reports using the latest
diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 07783139d..e2d5a0397 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -30,6 +30,7 @@
 
 import argparse
 import contextlib
+import hashlib
 import json
 import logging
 import os
@@ -700,6 +701,162 @@ def _find_cached_model(hf_id: str, build_proc: dict, task: str | None = None) ->
 )
 
 
+# ---------------------------------------------------------------------------
+# Build-only: export dedup
+# ---------------------------------------------------------------------------
+
+
+def _hash_files(paths: list[Path]) -> str:
+    """SHA-256 over a set of files (name + streamed content), order-independent."""
+    h = hashlib.sha256()
+    for p in sorted(paths, key=lambda x: x.name):
+        h.update(p.name.encode("utf-8"))
+        try:
+            with p.open("rb") as fh:
+                for chunk in iter(lambda: fh.read(1 << 20), b""):
+                    h.update(chunk)
+        except OSError:
+            h.update(b"<unreadable>")
+    return h.hexdigest()
+
+
+def _dedup_export(build_dir: Path, shared_dir: Path, canonical_hash: str | None, label: str) -> str | None:
+    """Deduplicate this combo's export.onnx(+sidecar) against a per-model canonical.
+
+    The export stage is EP/device-independent, so every combo produces an
+    identical ``export.onnx``. The first one is moved into ``shared_dir``
+    (``_shared/``); later identical ones are deleted to keep one copy on disk.
+
+    Returns the (possibly newly-set) canonical hash.
+    """
+    export_files = sorted(build_dir.glob("export.onnx*"))
+    if not export_files:
+        return canonical_hash  # composite/no top-level export — leave untouched
+    h = _hash_files(export_files)
+    if canonical_hash is None:
+        shared_dir.mkdir(parents=True, exist_ok=True)
+        for f in export_files:
+            shutil.move(str(f), str(shared_dir / f.name))
+        safe_print(f"  [dedup] export -> {shared_dir.name}/ (canonical, {len(export_files)} file(s))")
+        return h
+    if h == canonical_hash:
+        for f in export_files:
+            f.unlink(missing_ok=True)
+        safe_print(f"  [dedup] {label}: export identical -> removed (using {shared_dir.name}/)")
+        return canonical_hash
+    safe_print(f"  [dedup] WARNING {label}: export differs from canonical — keeping in place")
+    return canonical_hash
+
+
+# ---------------------------------------------------------------------------
+# Build-only: Azure Artifacts feed upload (Universal Packages, az CLI, no PAT)
+# ---------------------------------------------------------------------------
+
+
+def _run_az(args: list[str], timeout: int = 600) -> dict:
+    """Run an `az ...` command, returning the same dict shape as _run_subprocess."""
+    az = shutil.which("az")
+    if az is None:
+        return {
+            "stdout": "",
+            "stderr": "az CLI not found on PATH",
+            "exit_code": 127,
+            "elapsed": 0.0,
+            "timeout": False,
+            "command": "az " + " ".join(args),
+        }
+    # Pass the az path (incl. az.cmd on Windows) directly in list form.
+    # subprocess handles .cmd resolution and arg quoting correctly; wrapping in
+    # `cmd /c` breaks when the az path has spaces (C:\Program Files\...) and an
+    # arg also contains spaces (e.g. --description), because cmd.exe then mangles
+    # the quotes and tries to run 'C:\Program'.
+    return _run_subprocess([az, *args], timeout)
+
+
+def _ensure_feed_ready(timeout: int = 180) -> str | None:
+    """Verify az + azure-devops extension + login. Returns an error string or None."""
+    if shutil.which("az") is None:
+        return "az CLI not found. Install Azure CLI (https://aka.ms/azcli)."
+    ext = _run_az(["extension", "show", "--name", "azure-devops"], timeout)
+    if ext["exit_code"] != 0:
+        safe_print("  [upload] Installing 'azure-devops' az extension...")
+        add = _run_az(["extension", "add", "--name", "azure-devops"], timeout)
+        if add["exit_code"] != 0:
+            return f"Failed to install 'azure-devops' az extension: {add['stderr'][:300]}"
+    acct = _run_az(["account", "show"], timeout)
+    if acct["exit_code"] != 0:
+        return "Not logged in to Azure. Run 'az login' (PAT not required), then retry."
+    return None
+
+
+def _slugify_version(text: str) -> str:
+    """Lowercase + collapse non-[0-9a-z] runs to single dashes (semver prerelease-safe)."""
+    s = re.sub(r"[^0-9a-z]+", "-", text.lower())
+    return re.sub(r"-{2,}", "-", s).strip("-")
+
+
+def _feed_version_for(entry: ModelEntry, run_stamp: str) -> str:
+    """Per-model Universal Package version: ``0.0.0-<run-stamp>-<model-slug>``.
+
+    Universal Packages require a valid lowercase SemVer 2.0 version, so the
+    ``major.minor.patch`` core is fixed at ``0.0.0`` and the batch stamp + model
+    identity live in the pre-release segment. The run stamp (a date like
+    ``20260609``) groups a batch under a common prefix and lets an interrupted
+    batch resume by re-using the same stamp (see ``--run-stamp`` / ``--continue``).
+    """
+    parts = [run_stamp, entry.hf_id]
+    if entry.task:
+        parts.append(entry.task)
+    return "0.0.0-" + _slugify_version("-".join(parts))
+
+
+def _is_publish_conflict(proc: dict) -> bool:
+    """True if a publish failed because the version already exists in the feed."""
+    blob = (proc.get("stdout", "") + proc.get("stderr", "")).lower()
+    return any(s in blob for s in ("already exist", "conflict", "409"))
+
+
+def _upload_model_dir(args: argparse.Namespace, model_dir: Path, version: str, timeout: int) -> dict:
+    """Publish a model dir to the feed as a Universal Package version."""
+    return _run_az(
+        [
+            "artifacts",
+            "universal",
+            "publish",
+            "--organization",
+            args.feed_org,
+            "--project",
+            args.feed_project,
+            "--scope",
+            "project",
+            "--feed",
+            args.feed,
+            "--name",
+            args.package_name,
+            "--version",
+            version,
+            "--path",
+            str(model_dir),
+            "--description",
+            f"build-only artifacts: {version}",
+        ],
+        timeout,
+    )
+
+
+def _load_upload_manifest(path: Path) -> dict:
+    """Load the build_only_uploads.json manifest, or {} on any error."""
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        return {}
+
+
+def _write_upload_manifest(path: Path, manifest: dict) -> None:
+    """Persist the upload manifest (model version -> details)."""
+    path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+
+
 def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None:
     """Build each model to disk with --no-compile (no execution provider needed).
 
@@ -708,9 +865,14 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
 
     When neither --ep nor --device is pinned, every model is built once per EP in
     :data:`_BUILD_ONLY_EP_MATRIX`, each into a ``<model_dir>/<ep>_<device>/``
-    subdir. When --ep or --device is pinned, a single build is written directly
-    into ``<model_dir>``. Precision per combo follows the same policy as the eval
-    path (NPU defaults to w8a16; CPU/GPU omit the flag; native-quant EPs skip).
+    subdir. The identical ``export.onnx`` is deduped into ``<model_dir>/_shared/``.
+    When --ep or --device is pinned, a single build is written directly into
+    ``<model_dir>`` (no dedup). Precision per combo follows the same policy as the
+    eval path (NPU defaults to w8a16; CPU/GPU omit the flag; native-quant EPs skip).
+
+    With ``--upload``, each model's dir is published to the Azure Artifacts feed as
+    a Universal Package version (``<package-name>@0.0.0-<run-stamp>-<model-slug>``)
+    and then deleted locally to bound disk usage.
     """
     output_dir = args.output_dir or Path(f"eval_results/{date.today().isoformat()}")
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -720,6 +882,25 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
     # Single combo uses an empty label (no subdir); matrix uses (label, ep, device).
     combos = list(_BUILD_ONLY_EP_MATRIX) if use_matrix else [("", args.ep, args.device)]
 
+    # Verify feed prerequisites once, up front. --upload exists to keep disk
+    # bounded, so a broken az setup must abort (not silently fall back to
+    # keeping everything local, which would fill the disk it was meant to save).
+    manifest: dict = {}
+    manifest_path = output_dir / "build_only_uploads.json"
+    run_stamp = _slugify_version(args.run_stamp or date.today().strftime("%Y%m%d"))
+    if args.upload:
+        err = _ensure_feed_ready()
+        if err is not None:
+            safe_print(f"[upload] Cannot upload: {err}")
+            sys.exit(2)
+        manifest = _load_upload_manifest(manifest_path)
+        safe_print(
+            f"[upload] Feed ready: {args.feed_org} / {args.feed_project} / "
+            f"feed={args.feed} / package={args.package_name} | run-stamp={run_stamp}"
+        )
+        if args.continue_run:
+            safe_print("[upload] Continue: skipping models already uploaded for this run-stamp")
+
     safe_print(f"Build-only: {len(entries)} models -> {output_dir}")
     if use_matrix:
         safe_print(
@@ -734,11 +915,26 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
 
     total_builds = len(entries) * len(combos)
     succeeded = 0
+    uploaded = 0
     interrupted = False
 
     for i, entry in enumerate(entries, 1):
         label = f"{entry.hf_id} / {entry.task}" if entry.task else entry.hf_id
         model_dir = model_result_dir(output_dir, entry.hf_id, entry.task)
+        shared_dir = model_dir / "_shared"
+        canonical_hash: str | None = None
+        built_labels: list[str] = []
+        version = _feed_version_for(entry, run_stamp)
+
+        # Resume: skip models already uploaded for this run-stamp (manifest is
+        # the source of truth; failed/unrecorded models are (re)built). Skipping
+        # here avoids the expensive 8-EP rebuild, not just the upload.
+        if args.upload and args.continue_run:
+            prev = manifest.get(version)
+            if prev and prev.get("status") in ("uploaded", "exists-skipped"):
+                safe_print(f"\n[{i}/{len(entries)}] {label}  (SKIP - {prev['status']}: {version})")
+                continue
+
         safe_print(f"\n[{i}/{len(entries)}] {label}  ({entry.priority}, {entry.group})")
 
         for combo_label, ep, device in combos:
@@ -762,7 +958,13 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
 
             if build["success"]:
                 succeeded += 1
+                built_labels.append(combo_label or "(pinned)")
                 safe_print(f"  [OK]{tag} artifacts -> {build_dir}")
+                # Dedup the EP-independent export into _shared/ (matrix only).
+                if use_matrix:
+                    canonical_hash = _dedup_export(
+                        build_dir, shared_dir, canonical_hash, combo_label
+                    )
             else:
                 safe_print(f"  [FAIL @ {build['stage']}]{tag}")
                 if args.verbose:
@@ -774,13 +976,54 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
         if interrupted:
             break
 
+        # Upload the whole model dir, then delete locally to bound disk usage.
+        if args.upload and built_labels:
+            safe_print(f"  [upload] {args.package_name}@{version} ...")
+            up = _upload_model_dir(args, model_dir, version, args.timeout)
+            ok = up["exit_code"] == 0
+            conflict = (not ok) and _is_publish_conflict(up)
+            status: str
+            if ok:
+                status = "uploaded"
+                uploaded += 1
+                safe_print(f"  [upload OK] {args.package_name}@{version}")
+            elif conflict and args.upload_skip_existing:
+                status = "exists-skipped"
+                safe_print(f"  [upload SKIP] version exists: {version}")
+            else:
+                status = "failed"
+                safe_print(f"  [upload FAIL] {args.package_name}@{version} (kept local)")
+                if args.verbose:
+                    for line in (up.get("stderr", "")).strip().splitlines()[-12:]:
+                        safe_print(f"    {line}")
+
+            manifest[version] = {
+                "hf_id": entry.hf_id,
+                "task": entry.task,
+                "package": args.package_name,
+                "run_stamp": run_stamp,
+                "combos": built_labels,
+                "status": status,
+                "uploaded_at": _utc_now(),
+            }
+            _write_upload_manifest(manifest_path, manifest)
+
+            # Delete local copy only when the artifacts are safely in the feed.
+            if status in ("uploaded", "exists-skipped") and not args.keep_local:
+                try:
+                    shutil.rmtree(model_dir)
+                    safe_print(f"  [upload] Removed local: {model_dir}")
+                except OSError as exc:
+                    safe_print(f"  [upload] Warning: could not remove {model_dir}: {exc}")
+
         # Clean caches once per model (after all EP combos finish), not per
         # combo: combos share the same HF download, so clearing between
         # combos forces redundant re-downloads of the same weights.
         if args.clean_cache:
             _clear_disk_caches()
 
-    safe_print(f"\nBuild-only complete: {succeeded}/{total_builds} builds -> {output_dir}")
+    tail = f" | uploaded {uploaded} models" if args.upload else ""
+    safe_print(f"\nBuild-only complete: {succeeded}/{total_builds} builds -> {output_dir}{tail}")
 
 
 # ---------------------------------------------------------------------------
@@ -1236,7 +1479,7 @@ def save_environment_info(path: Path) -> None:
     # `winml sys --format json` captures hardware details (devices, EPs,
     # backends) that the lightweight package-version probes above miss.
     try:
-        result = subprocess.run(  # noqa: S603
+        result = subprocess.run(
             [sys.executable, "-m", "winml", "sys", "--format", "json"],
             capture_output=True,
             text=True,
@@ -1345,7 +1588,64 @@ def parse_args() -> argparse.Namespace:
             "stage's ONNX (export/optimize/quantize) to the output dir. No execution "
             "provider required; perf/accuracy are skipped. When --ep/--device are "
             "omitted, builds once per EP in the build-only matrix "
-            "(qnn/openvino/mlas/dml/vitisai) into <model_dir>/<ep>_<device>/ subdirs."
+            "(qnn/openvino/mlas/dml/vitisai) into <model_dir>/<ep>_<device>/ subdirs. "
+            "Identical export.onnx is deduped into <model_dir>/_shared/."
+        ),
+    )
+    # --- Build-only feed upload (Azure Artifacts Universal Packages) ---
+    parser.add_argument(
+        "--upload",
+        dest="upload",
+        action="store_true",
+        help=(
+            "Build-only only: after each model's combos are built, publish the "
+            "model dir to an Azure Artifacts feed (Universal Package) and delete "
+            "it locally to bound disk usage. Auth via 'az login' (no PAT)."
+        ),
+    )
+    parser.add_argument(
+        "--feed",
+        default="Modelkit",
+        help="Azure Artifacts feed name for --upload (default: Modelkit)",
+    )
+    parser.add_argument(
+        "--feed-org",
+        default="https://dev.azure.com/microsoft",
+        help="Azure DevOps org URL for --upload (default: https://dev.azure.com/microsoft)",
+    )
+    parser.add_argument(
+        "--feed-project",
+        default="windows.ai.toolkit",
+        help="Azure DevOps project for the project-scoped feed (default: windows.ai.toolkit)",
+    )
+    parser.add_argument(
+        "--package-name",
+        default="winml-cli-models",
+        help="Universal Package name for --upload (default: winml-cli-models)",
+    )
+    parser.add_argument(
+        "--run-stamp",
+        dest="run_stamp",
+        default=None,
+        help=(
+            "Batch stamp used as the feed version prefix (<stamp>-<model-slug>). "
+            "Defaults to today's date (YYYYMMDD). Pass the SAME stamp with "
+            "--continue to resume an interrupted batch."
+        ),
+    )
+    parser.add_argument(
+        "--keep-local",
+        dest="keep_local",
+        action="store_true",
+        help="With --upload, do NOT delete the local model dir after a successful upload.",
+    )
+    parser.add_argument(
+        "--upload-skip-existing",
+        dest="upload_skip_existing",
+        action="store_true",
+        help=(
+            "With --upload, skip building+uploading a model whose package version "
+            "already exists in the feed (resume support)."
         ),
     )
     parser.add_argument(

From d275e63fe28ab32be685b116dd225dbaf6170666 Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Wed, 10 Jun 2026 12:03:21 +0800
Subject: [PATCH 4/5] fix(e2e_eval): make build-only --continue query the feed
 for uploaded versions

The --continue skip logic only consulted the local build_only_uploads.json manifest, which is written after each successful upload. A fresh --output-dir (e.g. a gitignored temp dir) starts empty, so models already published to the Azure Artifacts feed under the same run-stamp were rebuilt and re-uploaded instead of being skipped.

Seed the in-memory manifest from the feed at startup: query the feed REST API for versions matching 0.0.0-<run-stamp>-* and mark them as uploaded so the existing skip check honors them. The feed is now authoritative for what's published, regardless of local state. Querying is best-effort -- a failure falls back to local-manifest-only behavior.

Use two ampersand-free REST GETs (list packages -> resolve UPack package GUID -> list versions) because az resolves to az.cmd and cmd.exe splits query strings on '&', dropping every parameter after the first.
---
 scripts/e2e_eval/run_eval.py | 109 ++++++++++++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 1 deletion(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index e2d5a0397..941c5e410 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -752,6 +752,12 @@ def _dedup_export(build_dir: Path, shared_dir: Path, canonical_hash: str | None,
 # Build-only: Azure Artifacts feed upload (Universal Packages, az CLI, no PAT)
 # ---------------------------------------------------------------------------
 
+# Azure DevOps AAD application ID. Used as the token audience (``--resource``)
+# when querying the feed REST API with ``az rest``. Constant across all orgs and
+# not a secret (it is the public first-party app id for Azure DevOps).
+_AZURE_DEVOPS_RESOURCE = "499b84ac-1321-427f-aa17-267ca6975798"
+_FEED_API_VERSION = "7.1-preview.1"
+
 
 def _run_az(args: list[str], timeout: int = 600) -> dict:
     """Run an `az ...` command, returning the same dict shape as _run_subprocess."""
@@ -857,6 +863,85 @@ def _write_upload_manifest(path: Path, manifest: dict) -> None:
     path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
 
 
+def _feed_org_name(feed_org: str) -> str:
+    """Extract the Azure DevOps org name from a feed-org URL.
+
+    ``https://dev.azure.com/microsoft`` -> ``microsoft``
+    ``https://microsoft.visualstudio.com[/DefaultCollection]`` -> ``microsoft``
+    """
+    host_and_path = feed_org.rstrip("/").split("://", 1)[-1]
+    host, _, path = host_and_path.partition("/")
+    # Legacy hosts carry the org in the subdomain; a trailing path is not the org.
+    if host.lower().endswith(".visualstudio.com"):
+        return host.split(".", 1)[0]
+    # Otherwise (dev.azure.com/<org>[/...]) the org is the first path segment.
+    return path.split("/")[0] if path else host
+
+
+def _fetch_feed_versions(
+    args: argparse.Namespace, run_stamp: str, timeout: int = 180
+) -> set[str] | None:
+    """Return package versions already published to the feed for ``run_stamp``.
+
+    The local manifest is only written after a successful upload, so a fresh
+    ``--output-dir`` starts empty even when the feed already holds versions from
+    a previous run. Querying the feed makes ``--continue`` authoritative: a model
+    is skipped if its version exists on the feed, regardless of local state.
+
+    Two ``&``-free REST GETs are used because ``az`` resolves to ``az.cmd``, which
+    runs through cmd.exe and would split a query string on ``&`` (dropping every
+    parameter after the first):
+        1. list packages, find the UPack package by name -> package GUID,
+        2. list that package's versions.
+
+    Returns the set of lowercased versions matching the ``0.0.0-<run_stamp>-``
+    prefix, an empty set if the feed is reachable but has no such versions yet,
+    or ``None`` if the feed could not be queried (caller falls back to the local
+    manifest only).
+    """
+    org = _feed_org_name(args.feed_org)
+    base = (
+        f"https://feeds.dev.azure.com/{org}/{args.feed_project}"
+        f"/_apis/packaging/feeds/{args.feed}"
+    )
+
+    def _get_json(url: str) -> dict | None:  # GET url via az rest; None on any failure
+        res = _run_az(
+            ["rest", "--method", "get", "--resource", _AZURE_DEVOPS_RESOURCE, "--url", url],
+            timeout,
+        )
+        if res["exit_code"] != 0:
+            return None  # az rest exited non-zero
+        try:
+            return json.loads(res["stdout"])
+        except (json.JSONDecodeError, TypeError):
+            return None  # stdout was not parseable JSON
+
+    listing = _get_json(f"{base}/packages?api-version={_FEED_API_VERSION}")
+    if listing is None:
+        return None  # package listing failed -> feed not queryable
+    pkg_id: str | None = None
+    for pkg in listing.get("value", []):  # NOTE(review): no paging; confirm one response suffices
+        name_matches = (pkg.get("name") or "").lower() == args.package_name.lower()
+        is_upack = (pkg.get("protocolType") or "").lower() == "upack"
+        if name_matches and is_upack:
+            pkg_id = pkg.get("id")
+            break  # first name + protocol match wins
+    if pkg_id is None:
+        return set()  # feed reachable, package not published yet
+
+    versions_doc = _get_json(f"{base}/packages/{pkg_id}/versions?api-version={_FEED_API_VERSION}")
+    if versions_doc is None:
+        return None  # versions listing failed -> feed not queryable
+    prefix = f"0.0.0-{run_stamp}-"  # NOTE(review): assumes run_stamp is lowercase; confirm
+    published: set[str] = set()
+    for entry in versions_doc.get("value", []):
+        version = (entry.get("version") or "").lower()
+        if version.startswith(prefix):
+            published.add(version)
+    return published
+
+
 def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None:
     """Build each model to disk with --no-compile (no execution provider needed).
 
@@ -900,6 +985,24 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
         )
         if args.continue_run:
             safe_print("[upload] Continue: skipping models already uploaded for this run-stamp")
+            # Seed the manifest from the feed: a fresh --output-dir has no local
+            # manifest, so the feed is the source of truth for what's published.
+            # Best-effort — a query failure falls back to local-manifest-only.
+            feed_versions = _fetch_feed_versions(args, run_stamp)
+            if feed_versions is None:
+                safe_print(
+                    "[upload] Continue: could not query feed; relying on local manifest only"
+                )
+            else:
+                seeded = 0
+                for feed_version in feed_versions:
+                    if feed_version not in manifest:
+                        manifest[feed_version] = {"status": "uploaded", "source": "feed"}
+                        seeded += 1
+                safe_print(
+                    f"[upload] Continue: feed has {len(feed_versions)} version(s) for "
+                    f"run-stamp {run_stamp}; {seeded} not in local manifest -> will skip"
+                )
 
     safe_print(f"Build-only: {len(entries)} models -> {output_dir}")
     if use_matrix:
@@ -932,7 +1035,11 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
         if args.upload and args.continue_run:
             prev = manifest.get(version)
             if prev and prev.get("status") in ("uploaded", "exists-skipped"):
-                safe_print(f"\n[{i}/{len(entries)}] {label}  (SKIP - {prev['status']}: {version})")
+                origin = " via feed" if prev.get("source") == "feed" else ""
+                safe_print(
+                    f"\n[{i}/{len(entries)}] {label}  "
+                    f"(SKIP - {prev['status']}{origin}: {version})"
+                )
                 continue
 
         safe_print(f"\n[{i}/{len(entries)}] {label}  ({entry.priority}, {entry.group})")

From dc087a5b036299ac406a283dde99057bc94f0817 Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Wed, 10 Jun 2026 12:20:42 +0800
Subject: [PATCH 5/5] fix(e2e_eval): address PR review feedback on build-only
 upload path

- _hash_files: stop hashing unreadable files to a fixed sentinel; propagate
  OSError and have _dedup_export keep the export in place instead of risking
  deletion of an artifact never verified identical.
- _is_publish_conflict: narrow detection to specific version-exists / HTTP 409
  markers (drop bare 'conflict'/'409') so an unrelated message can't trigger
  exists-skipped and rmtree the local model dir.
- build.py _run_optimize_stage: narrow the no-compile EP fallback to only
  swallow EP-not-available ValueErrors; re-raise malformed device/EP names.
- Warn when --continue is used with --build-only but without --upload (no
  local-disk resume exists, so everything is rebuilt).
- Document that the pinned-EP auto-device path delegates precision to winml
  config's auto-detection.
- Fix misleading --upload-skip-existing help: it does not skip the build.
---
 scripts/e2e_eval/run_eval.py         | 65 +++++++++++++++++++++++-----
 src/winml/modelkit/commands/build.py | 11 ++++-
 2 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 941c5e410..bb38dc5bd 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -707,16 +707,20 @@ def _find_cached_model(hf_id: str, build_proc: dict, task: str | None = None) ->
 
 
 def _hash_files(paths: list[Path]) -> str:
-    """SHA-256 over a set of files (name + streamed content), order-independent."""
+    """SHA-256 over a set of files (name + streamed content), order-independent.
+
+    Raises:
+        OSError: if any file cannot be read. The caller must decide how to
+            handle this (e.g. skip dedup) instead of hashing two unreadable
+            files to the same value and deleting an artifact that was never
+            verified to be identical.
+    """
     h = hashlib.sha256()
     for p in sorted(paths, key=lambda x: x.name):
         h.update(p.name.encode("utf-8"))
-        try:
-            with p.open("rb") as fh:
-                for chunk in iter(lambda: fh.read(1 << 20), b""):
-                    h.update(chunk)
-        except OSError:
-            h.update(b"<unreadable>")
+        with p.open("rb") as fh:
+            for chunk in iter(lambda: fh.read(1 << 20), b""):
+                h.update(chunk)
     return h.hexdigest()
 
 
@@ -732,7 +736,13 @@ def _dedup_export(build_dir: Path, shared_dir: Path, canonical_hash: str | None,
     export_files = sorted(build_dir.glob("export.onnx*"))
     if not export_files:
         return canonical_hash  # composite/no top-level export — leave untouched
-    h = _hash_files(export_files)
+    try:
+        h = _hash_files(export_files)
+    except OSError as exc:
+        # Never dedup on an unverified hash: an export we cannot read is kept in
+        # place rather than risk deleting it as a false duplicate.
+        safe_print(f"  [dedup] WARNING {label}: cannot hash export ({exc}) — keeping in place")
+        return canonical_hash
     if canonical_hash is None:
         shared_dir.mkdir(parents=True, exist_ok=True)
         for f in export_files:
@@ -817,9 +827,25 @@ def _feed_version_for(entry: ModelEntry, run_stamp: str) -> str:
 
 
 def _is_publish_conflict(proc: dict) -> bool:
-    """True if a publish failed because the version already exists in the feed."""
+    """True if a publish failed because the version already exists in the feed.
+
+    Only specific version-exists / HTTP 409 markers are matched. A broad
+    substring like ``"conflict"`` or a bare ``"409"`` is avoided on purpose: a
+    false positive is treated as ``exists-skipped`` and deletes the local model
+    dir, so an unrelated message mentioning those words would be a data-loss
+    path.
+    """
     blob = (proc.get("stdout", "") + proc.get("stderr", "")).lower()
-    return any(s in blob for s in ("already exist", "conflict", "409"))
+    return any(
+        marker in blob
+        for marker in (
+            "already exist",
+            "packageversionexists",
+            "status code: 409",
+            "statuscode=409",
+            "httpstatuscode: 409",
+        )
+    )
 
 
 def _upload_model_dir(args: argparse.Namespace, model_dir: Path, version: str, timeout: int) -> dict:
@@ -963,8 +989,21 @@ def _run_build_only(entries: list[ModelEntry], args: argparse.Namespace) -> None
     output_dir.mkdir(parents=True, exist_ok=True)
     save_environment_info(output_dir / "environment.json")
 
+    # Resume in build-only mode is upload-driven (manifest + feed). Without
+    # --upload there is no local-disk resume, so --continue rebuilds everything;
+    # warn rather than silently no-op.
+    if args.continue_run and not args.upload:
+        safe_print(
+            "[warn] --continue has no effect in --build-only without --upload: "
+            "no local-disk resume exists, so all models will be rebuilt."
+        )
+
     use_matrix = args.ep is None and args.device == "auto"
     # Single combo uses an empty label (no subdir); matrix uses (label, ep, device).
+    # In the pinned single-combo path the device may be "auto": precision is then
+    # delegated to winml config's own auto-detection (the same omit-the-flag policy
+    # _resolve_precision applies to CPU/GPU), instead of forcing w8a16 the way the
+    # matrix does for its explicit *_npu combos.
     combos = list(_BUILD_ONLY_EP_MATRIX) if use_matrix else [("", args.ep, args.device)]
 
     # Verify feed prerequisites once, up front. --upload exists to keep disk
@@ -1751,8 +1790,10 @@ def parse_args() -> argparse.Namespace:
         dest="upload_skip_existing",
         action="store_true",
         help=(
-            "With --upload, skip building+uploading a model whose package version "
-            "already exists in the feed (resume support)."
+            "With --upload: treat a 'version already exists' publish conflict as "
+            "success (and delete the local copy) instead of a failure. This does "
+            "NOT skip the build. To skip rebuilding already-uploaded models "
+            "entirely, use --continue."
         ),
     )
     parser.add_argument(
diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py
index 85eef2f14..c3913102d 100644
--- a/src/winml/modelkit/commands/build.py
+++ b/src/winml/modelkit/commands/build.py
@@ -1040,8 +1040,15 @@ def _on_iteration_start(iteration: int, max_iter: int) -> None:
 
         try:
             _resolved_device, _ = _resolve_device(device=device or "auto", ep=ep)
-        except ValueError:
-            if config.compile is not None:
+        except ValueError as exc:
+            # Only an EP-not-available failure is expected here (target EP not
+            # installed on this host); resolve_device phrases those with
+            # "...not available..." / "...no compatible EP is available...". A
+            # malformed device/EP name raises "Unknown ..." and is a real bug,
+            # so re-raise it. Also re-raise whenever the build will compile, so
+            # a missing EP fails fast instead of deep in the compile stage.
+            ep_unavailable = "available" in str(exc).lower()
+            if config.compile is not None or not ep_unavailable:
                 raise
             _resolved_device = device or ""