From 3620e5cf5761cf5bf79942e7424c6c2c2203b7a5 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 10:23:22 +0800
Subject: [PATCH 01/10] compile: multi-model shared-EP-context compilation with
 two backends

Add `winml compile -m A -m B ...` to compile multiple ONNX models that share
a single EP context (weight sharing), selectable between two backends:
ort.ModelCompiler (default) and ort.InferenceSession (--use-inference-session).

- compiler: Compiler gains n_total_models / use_inference_session / n_compiled_models
  and a reused shared SessionOptions; add compile_multiple_onnx().
- CompileStage: plugged path picks the backend, reuses the shared options, and sets
  ep.share_ep_contexts / ep.stop_share_ep_contexts across models. The default
  single-model path is moved unchanged into _compile_default().
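- CLI: repeatable -m, --use-inference-session, --output-dir required for multi-model
  (reject -o), and report every model's result. --config may now supply
  compile.provider_options and compile.use_inference_session. A successful compile
  that writes no output file is now reported as a warning instead of an error.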
- Tests: e2e shared-weight test parametrized over both backends (inference_session
  output is run + np.allclose-checked against CPU); unit tests for the output-dir rules.
---
 src/winml/modelkit/commands/compile.py        | 129 ++++++++---
 src/winml/modelkit/compiler/__init__.py       |  17 +-
 src/winml/modelkit/compiler/compiler.py       |  69 +++++-
 src/winml/modelkit/compiler/context.py        |  12 +
 src/winml/modelkit/compiler/stages/compile.py | 210 +++++++++++++-----
 tests/e2e/test_compile_e2e.py                 | 144 ++++++++++++
 tests/unit/compiler/test_compile_command.py   |  82 +++++++
 7 files changed, 571 insertions(+), 92 deletions(-)

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index 1387d7907..176a1a621 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -45,8 +45,10 @@
     "--model",
     "-m",
     required=False,
+    multiple=True,
     type=click.Path(exists=True, path_type=Path),
-    help="Input ONNX model file (required unless --list)",
+    help="Input ONNX model file. Repeat -m to compile multiple models with a shared "
+    "EP context (weight sharing). Required unless --list.",
 )
 @cli_utils.output_option("Output file path (e.g., model_compiled.onnx)")
 @click.option(
@@ -90,6 +92,13 @@
     show_default=True,
     help="Embed EP context in ONNX file (default: external .bin file)",
 )
+@click.option(
+    "--use-inference-session",
+    is_flag=True,
+    default=False,
+    help="Compile via ort.InferenceSession (ep.context_enable) instead of the default "
+    "ort.ModelCompiler backend.",
+)
 @click.option(
     "--list",
     "list_compilers_flag",
@@ -102,7 +111,7 @@
 @click.pass_context
 def compile(
     ctx: click.Context,
-    model: Path | None,
+    model: tuple[Path, ...],
     output: Path | None,
     output_dir: Path | None,
     device: str,
@@ -113,6 +122,7 @@ def compile(
     compiler: str,
     qnn_sdk_root: Path | None,
     embed: bool,
+    use_inference_session: bool,
     list_compilers_flag: bool,
     config_file: Path | None,
 ) -> None:
@@ -140,15 +150,24 @@ def compile(
 
     # Apply build config defaults (CLI explicit options take precedence).
     # Read raw JSON so missing keys are distinguishable from dataclass defaults.
+    config_provider_options: dict[str, str] = {}
     if config_file is not None:
         _, raw_cfg = cli_utils.load_build_config(config_file)
         cc = raw_cfg.get("compile") or {}
+        # EP provider options (e.g. QNN htp_arch/soc_model/vtcm_mb) for the compile session.
+        if "provider_options" in cc:
+            config_provider_options = dict(cc["provider_options"])
         if not cli_utils.is_cli_provided(ctx, "ep") and "execution_provider" in cc:
             ep = cc["execution_provider"]
         if not cli_utils.is_cli_provided(ctx, "compiler") and "compiler" in cc:
             compiler = cc["compiler"]
         if not cli_utils.is_cli_provided(ctx, "embed") and "embed_context" in cc:
             embed = cc["embed_context"]
+        if (
+            not cli_utils.is_cli_provided(ctx, "use_inference_session")
+            and "use_inference_session" in cc
+        ):
+            use_inference_session = cc["use_inference_session"]
         if not cli_utils.is_cli_provided(ctx, "validate") and "validate" in cc:
             validate = cc["validate"]
         # Config-file verbosity fallback. CLI flags always win: only honor the
@@ -176,18 +195,29 @@ def compile(
         click.echo(list_compilers(provider))
         return
 
-    # Validate model is provided when not listing
-    if model is None:
+    # Validate model(s) provided when not listing
+    if not model:
         raise click.UsageError("Missing option '--model' / '-m'.")
-
-    if is_compiled_onnx(model):
-        raise click.ClickException(
-            f"{model} is already a compiled EPContext model and cannot be re-compiled. "
-            "Run 'winml compile' on the original ONNX model."
+    models = list(model)
+
+    for m in models:
+        if is_compiled_onnx(m):
+            raise click.ClickException(
+                f"{m} is already a compiled EPContext model and cannot be re-compiled. "
+                "Run 'winml compile' on the original ONNX model."
+            )
+
+    # Multiple models share one EP context and are written by filename into a
+    # directory, so a single -o/--output file path is ambiguous: require --output-dir
+    # (and forbid -o/--output).
+    if len(models) > 1 and (output is not None or output_dir is None):
+        raise click.UsageError(
+            "Multiple --model inputs are written by filename into a directory; "
+            "pass --output-dir (and not -o/--output)."
         )
 
     # Import compiler (late import to speed up CLI)
-    from ..compiler import WinMLCompileConfig, compile_onnx
+    from ..compiler import WinMLCompileConfig, compile_multiple_onnx, compile_onnx
 
     # Resolve EP from device + ep flags
     provider = _resolve_compile_provider(resolved_device, ep)
@@ -207,14 +237,23 @@ def compile(
     config.ep_config.compiler = compiler
     config.ep_config.qnn_sdk_root = qnn_sdk_root
     config.ep_config.embed_context = embed
+    # EP provider options supplied via --config (compile.provider_options).
+    if config_provider_options:
+        config.ep_config.provider_options.update(config_provider_options)
 
     # Show info
-    console.print(f"[bold blue]Input:[/bold blue] {model}")
+    console.print(f"[bold blue]Input:[/bold blue] {', '.join(str(m) for m in models)}")
     console.print(f"[bold blue]Device:[/bold blue] {resolved_device}")
     if ep:
         console.print(f"[bold blue]EP:[/bold blue] {ep}")
     console.print(f"[bold blue]Provider:[/bold blue] {provider}")
     console.print(f"[bold blue]Compiler:[/bold blue] {compiler}")
+    console.print(
+        f"[bold blue]Backend:[/bold blue] "
+        f"{'inference_session' if use_inference_session else 'model_compiler'}"
+    )
+    if len(models) > 1:
+        console.print(f"[bold blue]Shared EP context:[/bold blue] yes ({len(models)} models)")
     if qnn_sdk_root:
         console.print(f"[bold blue]SDK root:[/bold blue] {qnn_sdk_root}")
     # Resolve output path: -o (file) takes precedence over --output-dir
@@ -225,31 +264,51 @@ def compile(
         console.print(f"[bold blue]Output dir:[/bold blue] {output_dir}")
 
     try:
-        console.print("\n[bold]Compiling model...[/bold]")
-        result = compile_onnx(model, output_path=resolved_output, config=config)
-
-        if result.success:
-            if config.ep_config.enable_ep_context and not result.output_path:
-                console.print(
-                    "\n[bold yellow]Warning:[/bold yellow] Compilation finished "
-                    "but no output file was written to the output directory."
-                )
-                raise click.ClickException(
-                    "No output file produced. Check EP context support for "
-                    f"provider '{config.ep_config.provider}'."
-                )
-            console.print("\n[bold green]Success![/bold green] Model compiled")
-            if result.output_path:
-                console.print(f"[dim]Output: {result.output_path}[/dim]")
-            if result.compile_time:
-                console.print(f"[dim]Compile time: {result.compile_time:.2f}s[/dim]")
-            if result.total_time:
-                console.print(f"[dim]Total time: {result.total_time:.2f}s[/dim]")
+        console.print("\n[bold]Compiling model(s)...[/bold]")
+        if len(models) == 1 and not use_inference_session:
+            # Default path: single model via ort.ModelCompiler (staged pipeline).
+            results = [compile_onnx(models[0], output_path=resolved_output, config=config)]
         else:
-            console.print("\n[bold red]Compilation failed:[/bold red]")
-            for error in result.errors:
-                console.print(f"  {error}")
-            raise click.ClickException("Compilation failed")
+            # Multi-model (shared EP context) and/or inference-session backend.
+            # Multiple models require --output-dir (enforced above), so resolved_output
+            # is that directory; a single inference_session model may instead use -o,
+            # whose parent directory the compile stage resolves.
+            results = compile_multiple_onnx(
+                models, resolved_output, config, use_inference_session=use_inference_session
+            )
+
+        # Report every model's result (not just the first failure).
+        multi = len(results) > 1
+        failures = 0
+        for model_path, result in zip(models, results, strict=True):
+            label = f" — {model_path.name}" if multi else ""
+            if result.success:
+                if config.ep_config.enable_ep_context and not result.output_path:
+                    # Compiled but no artifact landed: a warning, not a failure.
+                    console.print(
+                        "\n[bold yellow]Warning:[/bold yellow] Compilation finished but "
+                        f"no output file was written to the output directory.{label}"
+                    )
+                    continue
+                console.print(f"\n[bold green]Success![/bold green] Model compiled{label}")
+                if result.output_path:
+                    console.print(f"[dim]Output: {result.output_path}[/dim]")
+                if result.compile_time:
+                    console.print(f"[dim]Compile time: {result.compile_time:.2f}s[/dim]")
+                if result.total_time:
+                    console.print(f"[dim]Total time: {result.total_time:.2f}s[/dim]")
+            else:
+                failures += 1
+                console.print(f"\n[bold red]Compilation failed:[/bold red]{label}")
+                for error in result.errors:
+                    console.print(f"  {error}")
+
+        if failures:
+            raise click.ClickException(
+                f"Compilation failed for {failures} of {len(results)} model(s)."
+                if multi
+                else "Compilation failed"
+            )
 
     except click.ClickException:
         raise
diff --git a/src/winml/modelkit/compiler/__init__.py b/src/winml/modelkit/compiler/__init__.py
index 99c0eb42d..bd49317e5 100644
--- a/src/winml/modelkit/compiler/__init__.py
+++ b/src/winml/modelkit/compiler/__init__.py
@@ -41,7 +41,7 @@
 # (mypy, CodeQL) visibility into what ``__all__`` actually exports without
 # triggering the heavy imports at runtime.
 if TYPE_CHECKING:
-    from .compiler import Compiler, compile_onnx, list_compilers
+    from .compiler import Compiler, compile_multiple_onnx, compile_onnx, list_compilers
     from .stages.compile import CompileStage
     from .stages.optimize import OptimizeStage
     from .stages.qformat import QFormatConvertStage
@@ -49,11 +49,19 @@
 
 def __getattr__(name: str) -> Any:
     """Lazy-load heavy symbols that pull in session/torch to speed up import."""
-    if name in {"Compiler", "compile_onnx", "list_compilers"}:
-        from .compiler import Compiler, compile_onnx, list_compilers
+    if name in {"Compiler", "compile_multiple_onnx", "compile_onnx", "list_compilers"}:
+        from .compiler import (
+            Compiler,
+            compile_multiple_onnx,
+            compile_onnx,
+            list_compilers,
+        )
 
         globals().update(
-            Compiler=Compiler, compile_onnx=compile_onnx, list_compilers=list_compilers
+            Compiler=Compiler,
+            compile_multiple_onnx=compile_multiple_onnx,
+            compile_onnx=compile_onnx,
+            list_compilers=list_compilers,
         )
         return globals()[name]
 
@@ -84,6 +92,7 @@ def __getattr__(name: str) -> Any:
     "QFormatConvertStage",
     "WinMLCompileConfig",
     "clear_transforms",
+    "compile_multiple_onnx",
     "compile_onnx",
     "get_transforms_for_ep",
     "list_compilers",
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index 6267daca6..bec5d7b6b 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -53,6 +53,28 @@ class Compiler:
     # Registered stages (in execution order)
     _stages: list[type[BaseStage]] | None = None
 
+    def __init__(
+        self,
+        n_total_models: int = 1,
+        use_inference_session: bool = False,
+    ) -> None:
+        """Create a compiler and reset its multi-model bookkeeping.
+
+        Args:
+            n_total_models: Total number of models compiled by this instance. When
+                >1, the models share a single EP context (weight sharing) and the
+                same shared ``SessionOptions`` is reused across every ``compile``.
+            use_inference_session: Select the ``ort.InferenceSession``
+                (``ep.context_enable``) backend instead of the default
+                ``ort.ModelCompiler``.
+        """
+        self.n_total_models = n_total_models
+        self.use_inference_session = use_inference_session
+        # Despite its name this holds the shared SessionOptions (not a session): set by
+        # CompileStage on the first model, kept here so it survives between compile() calls.
+        self.inference_session: object | None = None
+        self.n_compiled_models = 0  # bumped by compile() once its stages have run
+
     @classmethod
     def _get_stages(cls) -> list[type[BaseStage]]:
         """Lazy initialization of stages."""
@@ -103,12 +125,18 @@ def compile(
         work_dir = Path(temp_dir.name)
 
         try:
-            # Create context from config
+            # Create context from config. Multi-model / weight-sharing state is
+            # threaded through so CompileStage can pick the backend, reuse the shared
+            # SessionOptions, and detect the last (stop_share) model.
             context = CompileContext(
                 model_path=model_path,
                 config=config.to_dict(),
                 work_dir=work_dir,
                 verbose=config.verbose,
+                n_compiled_models=self.n_compiled_models,
+                n_total_models=self.n_total_models,
+                use_inference_session=self.use_inference_session,
+                inference_session=self.inference_session,
             )
 
             if output_path is not None:
@@ -129,6 +157,11 @@ def compile(
                 else:
                     context.log(f"Skipping stage: {stage_cls.name}")
 
+            # Carry the shared SessionOptions (created/reused by CompileStage) forward
+            # so the next model in a shared-context run reuses the same EP + group.
+            self.inference_session = context.inference_session
+            self.n_compiled_models += 1
+
             # Build result
             total_time = time.time() - start_time
             result = self._build_result(context, total_time)
@@ -199,3 +232,37 @@ def compile_onnx(
     """
     compiler = Compiler()
     return compiler.compile(model_path=model_path, output_path=output_path, config=config)
+
+
+def compile_multiple_onnx(
+    model_paths: list[str | Path],
+    output_path: str | Path | None = None,
+    config: WinMLCompileConfig | None = None,
+    use_inference_session: bool = False,
+) -> list[CompileResult]:
+    """Compile a batch of ONNX models through one shared :class:`Compiler`.
+
+    The compiler is created with ``n_total_models=len(model_paths)`` and every model
+    is compiled in input order, so with more than one model they reuse a single
+    shared ``SessionOptions`` (weight sharing across the compiled EPContext models).
+    ``ort.ModelCompiler`` is the backend unless ``use_inference_session`` is set.
+
+    Args:
+        model_paths: Input ONNX model paths.
+        output_path: Output directory (or file) for the compiled models.
+        config: Compilation configuration. ``None`` skips compilation (passthrough).
+        use_inference_session: Use the ``ort.InferenceSession`` backend.
+
+    Returns:
+        A list holding one :class:`CompileResult` per input model, in input order.
+    """
+    total = len(model_paths)
+    shared = Compiler(n_total_models=total, use_inference_session=use_inference_session)
+
+    # Order matters: the shared context accumulates model by model and the final
+    # compile is the one that flushes it, so walk the inputs strictly in sequence.
+    results: list[CompileResult] = []
+    for model_path in model_paths:
+        result = shared.compile(model_path=model_path, output_path=output_path, config=config)
+        results.append(result)
+    return results
diff --git a/src/winml/modelkit/compiler/context.py b/src/winml/modelkit/compiler/context.py
index 18fb22679..572c5bebe 100644
--- a/src/winml/modelkit/compiler/context.py
+++ b/src/winml/modelkit/compiler/context.py
@@ -44,6 +44,18 @@ class CompileContext:
     # Session (set during compile)
     session: ort.InferenceSession | None = None
 
+    # Multi-model / shared-EP-context compilation state (driven by Compiler).
+    # n_compiled_models: how many models the Compiler has already compiled (0-based
+    #   index of the current model).
+    # n_total_models: total models in this compile run (>1 enables weight sharing).
+    # use_inference_session: pick the InferenceSession backend over ort.ModelCompiler.
+    # inference_session: the shared ort.SessionOptions created on the first model and
+    #   reused for the rest (the EP is added once and the share group lives on it).
+    n_compiled_models: int = 0
+    n_total_models: int = 1
+    use_inference_session: bool = False
+    inference_session: ort.SessionOptions | None = None
+
     # Output paths
     output_path: Path | None = None
     context_binary_path: Path | None = None
diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index 94b476956..2374e0708 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -45,62 +45,25 @@ def should_run(cls, context: CompileContext) -> bool:
         return True
 
     def process(self, context: CompileContext) -> CompileContext:
-        """Execute compilation."""
+        """Execute compilation.
+
+        Two compile paths, selected from the multi-model state on the context:
+
+        * Default (single model, ``ort.ModelCompiler``): the existing
+          ``WinMLSession``-driven path — unchanged.
+        * Plugged (``use_inference_session`` and/or ``n_total_models > 1``): a
+          backend chosen between ``ort.ModelCompiler`` and ``ort.InferenceSession``,
+          reusing one shared ``SessionOptions`` so multiple models share a single
+          EP context (weight sharing).
+        """
         context.log("Starting compile stage")
         start_time = time.time()
 
         try:
-            # Resolve session class from compiler config
-            compiler = context.config.get("compiler", "ort")
-            session_cls = COMPILER_SESSION_MAPPING[compiler]
-
-            # Determine final output directory (default: same as input model)
-            output_dir = self._get_output_dir(context)
-            context.log(f"Output directory: {output_dir}")
-
-            # Ensure model is saved to disk (may be in work_dir if modified)
-            model_path = self._ensure_model_file(context)
-            context.log(f"Model path: {model_path}")
-
-            ep_config = WinMLCompileConfig.from_dict(context.config).ep_config
-            # Derive the target device from the runtime session so the compile
-            # stage stays aligned with the actual EPContext filename produced by
-            # WinMLSession instead of carrying device metadata in provider_options.
-            device = context.config.get("device", "auto")
-            explicit_ep = normalize_ep_name(ep_config.provider)
-            session_cls_name = getattr(session_cls, "__name__", session_cls.__class__.__name__)
-            context.log(f"Creating {session_cls_name} for device: {device}")
-            winml_session = session_cls(
-                onnx_path=model_path,
-                device=device,
-                ep_config=ep_config,
-                ep=explicit_ep,
-            )
-            winml_session.compile()
-
-            # Get the underlying session for validation and info collection
-            session = winml_session._session
-            context.session = session
-
-            resolved_device = getattr(winml_session, "_device", device)
-            if isinstance(resolved_device, str) and resolved_device:
-                device = resolved_device.lower()
-
-            # Log actual providers used
-            if session is not None:
-                actual_providers = session.get_providers()
-                context.log(f"Actual providers: {actual_providers}")
-
-                # Validate if requested
-                if context.validate:
-                    self._validate_model(session, context)
-
-                # Collect model info
-                self._collect_model_info(session, context)
-
-            # Find and relocate EPContext files to output directory
-            if ep_config.enable_ep_context:
-                self._finalize_output(context, model_path, output_dir, device=device)
+            if context.use_inference_session or context.n_total_models > 1:
+                self._compile_plugged(context)
+            else:
+                self._compile_default(context)
 
         except Exception as e:
             context.add_error(f"Compilation failed: {e}")
@@ -113,6 +76,149 @@ def process(self, context: CompileContext) -> CompileContext:
 
         return context
 
+    def _compile_default(self, context: CompileContext) -> None:
+        """Single-model compile via the session class mapped from ``config["compiler"]``."""
+        # Look up the session class for the configured compiler ("ort" when unset).
+        compiler = context.config.get("compiler", "ort")
+        session_cls = COMPILER_SESSION_MAPPING[compiler]
+
+        # Determine final output directory (default: same as input model)
+        output_dir = self._get_output_dir(context)
+        context.log(f"Output directory: {output_dir}")
+
+        # Ensure model is saved to disk (may be in work_dir if modified)
+        model_path = self._ensure_model_file(context)
+        context.log(f"Model path: {model_path}")
+
+        ep_config = WinMLCompileConfig.from_dict(context.config).ep_config
+        # Derive the target device from the runtime session so the compile
+        # stage stays aligned with the actual EPContext filename produced by
+        # WinMLSession instead of carrying device metadata in provider_options.
+        device = context.config.get("device", "auto")
+        explicit_ep = normalize_ep_name(ep_config.provider)
+        session_cls_name = getattr(session_cls, "__name__", session_cls.__class__.__name__)
+        context.log(f"Creating {session_cls_name} for device: {device}")
+        winml_session = session_cls(
+            onnx_path=model_path,
+            device=device,
+            ep_config=ep_config,
+            ep=explicit_ep,
+        )
+        winml_session.compile()
+
+        # Get the underlying (private) session for validation and info collection
+        session = winml_session._session
+        context.session = session
+        # Prefer the device recorded on the session (``_device``), lower-cased, when set.
+        resolved_device = getattr(winml_session, "_device", device)
+        if isinstance(resolved_device, str) and resolved_device:
+            device = resolved_device.lower()
+
+        # Providers, validation and model info all need a session; skip them without one.
+        if session is not None:
+            actual_providers = session.get_providers()
+            context.log(f"Actual providers: {actual_providers}")
+
+            # Validate if requested
+            if context.validate:
+                self._validate_model(session, context)
+
+            # Collect model info
+            self._collect_model_info(session, context)
+
+        # Find and relocate EPContext files to output directory
+        if ep_config.enable_ep_context:
+            self._finalize_output(context, model_path, output_dir, device=device)
+
+    def _compile_plugged(self, context: CompileContext) -> None:
+        """Multi-model / inference-session compile with a shared EP context.
+
+        The shared ``SessionOptions`` (``context.inference_session``) is created on
+        the first model — the EP is added once and, for a multi-model run, the
+        ``ep.share_ep_contexts`` group is opened on it — then reused for every model.
+        ``ep.stop_share_ep_contexts`` is added before the final model so the shared
+        weights binary is flushed.
+
+        Raises:
+            RuntimeError: If the execution provider cannot be added to the options.
+        """
+        import onnxruntime as ort
+
+        from ...sysinfo.device import resolve_device, resolve_eps
+        from ...utils.constants import DEVICE_TO_DEVICE_TYPE
+        from ...winml import add_ep_for_device, register_execution_providers
+
+        ep_config = WinMLCompileConfig.from_dict(context.config).ep_config
+        multi = context.n_total_models > 1
+        is_last = context.n_compiled_models >= context.n_total_models - 1
+        use_is = context.use_inference_session
+
+        output_dir = self._get_output_dir(context)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        model_path = self._ensure_model_file(context)
+        ctx_path = output_dir / f"{context.model_path.stem}_ctx.onnx"
+        # Remove a stale artifact so the exists() check below reflects this compile only.
+        ctx_path.unlink(missing_ok=True)
+        backend = "inference_session" if use_is else "model_compiler"
+        context.log(
+            f"[{backend}] compiling {model_path.name} "
+            f"({context.n_compiled_models + 1}/{context.n_total_models}) -> {ctx_path.name}"
+        )
+
+        # Build the shared SessionOptions once; reuse it for subsequent models.
+        sess_options = context.inference_session
+        if sess_options is None:
+            register_execution_providers(ort=True)
+            resolved_device, _ = resolve_device(context.config.get("device", "auto"))
+            ep = normalize_ep_name(ep_config.provider) or resolve_eps(resolved_device)[0]
+            device_type = DEVICE_TO_DEVICE_TYPE.get(resolved_device.upper())
+
+            sess_options = ort.SessionOptions()
+            if use_is:
+                embed_mode = "1" if ep_config.embed_context else "0"
+                sess_options.add_session_config_entry("ep.context_enable", "1")
+                sess_options.add_session_config_entry("ep.context_embed_mode", embed_mode)
+            if multi:
+                sess_options.add_session_config_entry("ep.share_ep_contexts", "1")
+            if not add_ep_for_device(
+                sess_options, ep, device_type, dict(ep_config.provider_options)
+            ):
+                raise RuntimeError(f"Could not add {ep} for device type {device_type}")
+            context.inference_session = sess_options  # captured by Compiler for reuse
+
+        # Last model in a shared run flushes the shared context.
+        if multi and is_last:
+            sess_options.add_session_config_entry("ep.stop_share_ep_contexts", "1")
+
+        if use_is:
+            # InferenceSession backend: building the session compiles to ep.context_file_path.
+            sess_options.add_session_config_entry("ep.context_file_path", str(ctx_path))
+            session = ort.InferenceSession(str(model_path), sess_options=sess_options)
+            context.session = session
+            if session.get_providers():
+                context.log(f"Actual providers: {session.get_providers()}")
+            # Validation is opt-in; model info is always collected (as in the default path).
+            if context.validate:
+                self._validate_model(session, context)
+            self._collect_model_info(session, context)
+        else:
+            # ModelCompiler backend: writes the EPContext file directly; no session is created.
+            ort.ModelCompiler(
+                sess_options,
+                str(model_path),
+                embed_compiled_data_into_model=ep_config.embed_context,
+            ).compile_to_file(str(ctx_path))
+
+        if ctx_path.exists():
+            context.output_path = ctx_path
+            # Sort so the chosen context binary is deterministic across filesystems.
+            candidates = sorted(output_dir.glob(f"{ctx_path.stem}*.bin"))
+            bins = [f for f in candidates if not f.name.endswith("_schematic.bin")]
+            if bins:
+                context.context_binary_path = bins[0]
+        else:
+            context.add_warning(f"No EPContext produced for {model_path.name}")
+
     def _get_output_dir(self, context: CompileContext) -> Path:
         """Determine the output directory for compiled model.
 
diff --git a/tests/e2e/test_compile_e2e.py b/tests/e2e/test_compile_e2e.py
index 58157518d..c4192a73f 100644
--- a/tests/e2e/test_compile_e2e.py
+++ b/tests/e2e/test_compile_e2e.py
@@ -35,9 +35,11 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+import numpy as np
 import onnx
 import pytest
 from click.testing import CliRunner
+from onnx import TensorProto, helper
 
 from tests.e2e.require_ep import require_ep, require_not_ep
 from winml.modelkit.commands.compile import compile as compile_cmd
@@ -164,6 +166,9 @@ def assert_by_run_inference(
     device: str,
     ep: str,
     sample_input: dict,
+    reference_model: Path | None = None,
+    rtol: float = 1e-2,
+    atol: float = 1e-2,
 ) -> None:
     """Bind ``ep`` + ``device`` and run one inference call on the compiled artifact.
 
@@ -171,13 +176,34 @@ def assert_by_run_inference(
     EP), this asserts the artifact specifically loads and runs on the
     requested ``(device, ep)`` pair. Catches the case where the compile
     succeeded against a different EP/device than the user asked for.
+
+    When ``reference_model`` is given, the original (pre-compile) model is run on
+    the CPU EP with the same input and the compiled output is checked against it
+    with :func:`numpy.allclose` — a correctness check that the compiled graph still
+    computes the same result, not just that it runs.
     """
+    import onnxruntime as ort
+
     from winml.modelkit.session import WinMLSession
 
     session = WinMLSession(out_path, device=device, ep=ep)
     outputs = session.run(sample_input)
     assert outputs, "Inference produced no outputs"
 
+    if reference_model is not None:
+        ref_sess = ort.InferenceSession(str(reference_model), providers=["CPUExecutionProvider"])
+        ref_names = [o.name for o in ref_sess.get_outputs()]
+        ref_outputs = ref_sess.run(None, sample_input)
+        for name, ref in zip(ref_names, ref_outputs, strict=True):
+            assert name in outputs, f"Compiled model missing output {name!r}"
+            got = np.asarray(outputs[name], dtype=np.float32)
+            ref = np.asarray(ref, dtype=np.float32)
+            assert np.allclose(got, ref, rtol=rtol, atol=atol), (
+                f"Compiled output {name!r} differs from CPU reference "
+                f"(max abs diff {np.max(np.abs(got - ref)):.4g}, "
+                f"rtol={rtol}, atol={atol})"
+            )
+
 
 def _find_qairt_sdk_root() -> Path | None:
     """Locate an installed QAIRT SDK on this host, or None."""
@@ -227,6 +253,7 @@ def test_help_lists_every_option(self) -> None:
             "--compiler",
             "--qnn-sdk-root",
             "--embed",
+            "--use-inference-session",
             "--list",
         ):
             assert opt in result.output, f"--help missing {opt}"
@@ -860,3 +887,120 @@ def test_bad_input_no_ep_covers_device(simple_matmul_onnx: Path) -> None:
         src_hash,
         simple_matmul_onnx,
     )
+
+
+# ===========================================================================
+# Compile backend (ort.ModelCompiler vs ort.InferenceSession) + multi-model
+# shared EP context (qnn-only)
+# ===========================================================================
+
+
+@pytest.fixture
+def shared_weight_models(tmp_path: Path) -> tuple[Path, Path]:
+    """Two MatMul models sharing the SAME weight but with different input shapes.
+
+    Mirrors the prefill/decode (ctx/iter) pattern that QNN weight sharing targets:
+    one ``[K, K]`` weight ``B`` reused across both graphs while the leading sequence
+    dimension differs (4 vs 1). Returns ``(seq4_model, seq1_model)``.
+    """
+    np.random.seed(7)
+    k = 4
+    b_values = np.random.randn(k, k).astype(np.float32)
+
+    def _build(seq: int, name: str) -> Path:
+        a = helper.make_tensor_value_info("A", TensorProto.FLOAT, [1, seq, k])
+        c = helper.make_tensor_value_info("C", TensorProto.FLOAT, [1, seq, k])
+        b = helper.make_tensor("B", TensorProto.FLOAT, [k, k], b_values.flatten().tolist())
+        node = helper.make_node("MatMul", ["A", "B"], ["C"], name="matmul")
+        graph = helper.make_graph([node], "shared_matmul", [a], [c], [b])
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = 7
+        onnx.checker.check_model(model)
+        path = tmp_path / name
+        onnx.save(model, str(path))
+        return path
+
+    return _build(4, "shared_seq4.onnx"), _build(1, "shared_seq1.onnx")
+
+
+def _sample_for(model_path: Path) -> dict[str, np.ndarray]:
+    """Random input matching a ``shared_weight_models`` graph's declared shape."""
+    dims = onnx.load(str(model_path)).graph.input[0].type.tensor_type.shape.dim
+    shape = [d.dim_value for d in dims]
+    return {"A": np.random.randn(*shape).astype(np.float32)}
+
+
+@pytest.mark.e2e
+def test_default_backend_uses_model_compiler(
+    simple_matmul_onnx: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """By default (no ``--use-inference-session``), a single-model compile is driven
+    by ``ort.ModelCompiler``."""
+    require_ep("qnn")
+    import onnxruntime as ort
+
+    real = ort.ModelCompiler
+    calls: list[int] = []
+
+    def _spy(*args: object, **kwargs: object) -> object:
+        calls.append(1)
+        return real(*args, **kwargs)
+
+    monkeypatch.setattr(ort, "ModelCompiler", _spy)
+
+    out = tmp_path / "default.onnx"
+    result = _invoke("-m", str(simple_matmul_onnx), "--ep", "qnn", "-o", str(out))
+    assert result.exit_code == 0, result.output
+    assert calls, "Default single-model compile should use ort.ModelCompiler"
+    assert is_compiled_onnx(out)
+
+
+@pytest.mark.e2e
+@pytest.mark.parametrize(
+    "use_inference_session",
+    [False, True],
+    ids=["model_compiler", "inference_session"],
+)
+def test_multi_model_shared_weights(
+    use_inference_session: bool,
+    shared_weight_models: tuple[Path, Path],
+    tmp_path: Path,
+) -> None:
+    """Multiple models with shared weights compile to a single shared EP context in
+    BOTH backends (``ort.ModelCompiler`` default, ``ort.InferenceSession`` opt-in).
+
+    Both backends are smoke-checked (files + one shared weights bin). The
+    inference_session output is additionally run on QNN and compared against
+    a CPU reference; the model_compiler output is smoke-only (not loaded).
+    """
+    require_ep("qnn")
+    m_seq4, m_seq1 = shared_weight_models
+    out_dir = tmp_path / "out"
+    out_dir.mkdir()
+
+    cmd = ["-m", str(m_seq4), "-m", str(m_seq1), "--ep", "qnn", "--output-dir", str(out_dir)]
+    if use_inference_session:
+        cmd.append("--use-inference-session")
+    result = _invoke(*cmd)
+    assert result.exit_code == 0, result.output
+    assert "Success! Model compiled" in result.output, result.output
+    expected_backend = "inference_session" if use_inference_session else "model_compiler"
+    assert expected_backend in result.output.lower(), result.output
+
+    # Both compiled wrappers exist + exactly one shared weights bin (weight sharing).
+    ctx4 = out_dir / f"{m_seq4.stem}_ctx.onnx"
+    ctx1 = out_dir / f"{m_seq1.stem}_ctx.onnx"
+    assert ctx4.is_file() and is_compiled_onnx(ctx4), f"missing/invalid {ctx4}"
+    assert ctx1.is_file() and is_compiled_onnx(ctx1), f"missing/invalid {ctx1}"
+    bins = [p for p in out_dir.glob("*.bin") if not p.name.endswith("_schematic.bin")]
+    assert len(bins) == 1, f"Expected one shared weights bin, got {[b.name for b in bins]}"
+
+    # inference_session output is runnable; model_compiler output is smoke-only (no load).
+    # Run on QNN and np.allclose-check against the original model on CPU.
+    if use_inference_session:
+        assert_by_run_inference(
+            ctx4, device="npu", ep="qnn", sample_input=_sample_for(m_seq4), reference_model=m_seq4
+        )
+        assert_by_run_inference(
+            ctx1, device="npu", ep="qnn", sample_input=_sample_for(m_seq1), reference_model=m_seq1
+        )
diff --git a/tests/unit/compiler/test_compile_command.py b/tests/unit/compiler/test_compile_command.py
index f4d3a1de0..5b49da530 100644
--- a/tests/unit/compiler/test_compile_command.py
+++ b/tests/unit/compiler/test_compile_command.py
@@ -341,6 +341,88 @@ def test_compile_device_propagates_to_provider_options(
         assert config.ep_config.provider_options.get("device_type") == "NPU"
         assert config.ep_config.device == "npu"
 
+    def test_multiple_models_reject_output_file(self, runner: CliRunner, tmp_path: Path) -> None:
+        """Multiple -m inputs with -o/--output (a file) are rejected: use --output-dir.
+
+        Several models share one EP context and are written by filename into a
+        directory, so a single output file path is ambiguous.
+        """
+        m1 = tmp_path / "m1.onnx"
+        m2 = tmp_path / "m2.onnx"
+        self._create_simple_onnx(m1)
+        self._create_simple_onnx(m2)
+        out_file = tmp_path / "out.onnx"
+
+        result = runner.invoke(main, ["compile", "-m", str(m1), "-m", str(m2), "-o", str(out_file)])
+
+        assert result.exit_code != 0
+        assert "output-dir" in result.output.lower(), result.output
+
+    def test_multiple_models_require_output_dir(self, runner: CliRunner, tmp_path: Path) -> None:
+        """Multiple -m inputs with neither -o nor --output-dir are rejected.
+
+        --output-dir is mandatory for multi-model compiles (the compiled models are
+        written by filename into that directory).
+        """
+        m1 = tmp_path / "m1.onnx"
+        m2 = tmp_path / "m2.onnx"
+        self._create_simple_onnx(m1)
+        self._create_simple_onnx(m2)
+
+        result = runner.invoke(main, ["compile", "-m", str(m1), "-m", str(m2)])
+
+        assert result.exit_code != 0
+        assert "output-dir" in result.output.lower(), result.output
+
+    @patch("winml.modelkit.compiler.compile_multiple_onnx")
+    def test_multiple_models_with_output_dir_calls_compile_multiple(
+        self,
+        mock_compile_multiple: MagicMock,
+        runner: CliRunner,
+        tmp_path: Path,
+    ) -> None:
+        """Multiple -m inputs with --output-dir compile via compile_multiple_onnx."""
+        m1 = tmp_path / "m1.onnx"
+        m2 = tmp_path / "m2.onnx"
+        self._create_simple_onnx(m1)
+        self._create_simple_onnx(m2)
+        out_dir = tmp_path / "out"
+
+        mock_result = MagicMock()
+        mock_result.success = True
+        mock_result.output_path = out_dir / "m2_ctx.onnx"
+        mock_result.compile_time = 1.0
+        mock_result.total_time = 1.5
+        mock_compile_multiple.return_value = [mock_result, mock_result]
+
+        result = runner.invoke(
+            main,
+            [
+                "compile",
+                "-m",
+                str(m1),
+                "-m",
+                str(m2),
+                "--device",
+                "npu",
+                "--ep",
+                "qnn",
+                "--output-dir",
+                str(out_dir),
+            ],
+        )
+
+        assert result.exit_code == 0, result.output
+        assert mock_compile_multiple.called
+        call_args = mock_compile_multiple.call_args
+        # First positional arg is the ordered list of input models.
+        passed_models = call_args.args[0]
+        assert [str(m) for m in passed_models] == [str(m1), str(m2)]
+        # Second positional arg is the output target — the --output-dir directory.
+        assert call_args.args[1] == out_dir
+        # use_inference_session defaults to False (model_compiler backend).
+        assert call_args.kwargs["use_inference_session"] is False
+
     def _create_simple_onnx(self, path: Path) -> None:
         """Create a simple ONNX model for testing."""
         import onnx

From 4305e83eaeeb13512a3aacbe26d6df006b8c32f3 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 11:11:37 +0800
Subject: [PATCH 02/10] compile: dedup same-named multi-model outputs;
 output-folder API; mypy fix

- compile_multiple_onnx takes an output folder and disambiguates same-named
  inputs by suffixing the later one(s) (<stem>_ctx.onnx, <stem>_1_ctx.onnx) with a
  warning, instead of raising.
- CompileStage honors an explicit <name>_ctx.onnx output path (used for the
  de-duplicated names); rename _compile_default -> _compile_model_compiler and
  _compile_plugged -> _compile_inference_session.
- Fix mypy invariance error: compile_multiple_onnx takes Sequence[str | Path].
- Add unit tests for the duplicate-name suffixing.
---
 src/winml/modelkit/commands/compile.py        |  7 +-
 src/winml/modelkit/compiler/compiler.py       | 47 ++++++++---
 src/winml/modelkit/compiler/stages/compile.py | 28 ++++---
 tests/unit/compiler/test_compile_multiple.py  | 84 +++++++++++++++++++
 4 files changed, 140 insertions(+), 26 deletions(-)
 create mode 100644 tests/unit/compiler/test_compile_multiple.py

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index 176a1a621..6026d3c88 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -270,11 +270,10 @@ def compile(
             results = [compile_onnx(models[0], output_path=resolved_output, config=config)]
         else:
             # Multi-model (shared EP context) and/or inference-session backend.
-            # Multiple models require --output-dir (enforced above), so resolved_output
-            # is that directory; a single inference_session model may instead use -o,
-            # whose parent directory the compile stage resolves.
+            # compile_multiple_onnx writes each model as <stem>_ctx.onnx into a folder;
+            # multiple models require --output-dir (enforced above).
             results = compile_multiple_onnx(
-                models, resolved_output, config, use_inference_session=use_inference_session
+                models, output_dir, config, use_inference_session=use_inference_session
             )
 
         # Report every model's result (not just the first failure).
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index bec5d7b6b..a19a68fb1 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import logging
 import tempfile
 import time
 from pathlib import Path
@@ -15,7 +16,12 @@
 from .result import CompileResult
 
 
+logger = logging.getLogger(__name__)
+
+
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from ..utils.constants import EPName
     from .configs import WinMLCompileConfig
     from .stages.base import BaseStage
@@ -235,8 +241,8 @@ def compile_onnx(
 
 
 def compile_multiple_onnx(
-    model_paths: list[str | Path],
-    output_path: str | Path | None = None,
+    model_paths: Sequence[str | Path],
+    output_dir: str | Path | None = None,
     config: WinMLCompileConfig | None = None,
     use_inference_session: bool = False,
 ) -> list[CompileResult]:
@@ -248,21 +254,40 @@ def compile_multiple_onnx(
     default, or ``ort.InferenceSession`` when ``use_inference_session`` is set.
 
     Args:
-        model_paths: Input ONNX model paths.
-        output_path: Output directory (or file) for the compiled models.
+        model_paths: Input ONNX model paths. Each compiles to ``<stem>_ctx.onnx`` in
+            ``output_dir``; inputs that share a filename stem are disambiguated by
+            appending an integer suffix to the later one(s) (with a warning), e.g.
+            ``model_ctx.onnx`` then ``model_1_ctx.onnx``.
+        output_dir: Output directory for the compiled models.
         config: Compilation configuration. ``None`` skips compilation (passthrough).
         use_inference_session: Use the InferenceSession backend.
 
     Returns:
         One :class:`CompileResult` per input model, in order.
     """
+    paths = [Path(mp) for mp in model_paths]
+    out_dir = Path(output_dir) if output_dir is not None else None
+
     compiler = Compiler(
-        n_total_models=len(model_paths),
+        n_total_models=len(paths),
         use_inference_session=use_inference_session,
     )
-    # Compiled in order (the comprehension evaluates left-to-right) so the shared
-    # context accumulates across models and the last one flushes it.
-    return [
-        compiler.compile(model_path=mp, output_path=output_path, config=config)
-        for mp in model_paths
-    ]
+    # Compiled in order so the shared context accumulates and the last model flushes it.
+    # Outputs are keyed by filename stem in a single folder, so disambiguate same-named
+    # inputs by suffixing the later one(s) instead of overwriting.
+    results: list[CompileResult] = []
+    seen_stems: dict[str, int] = {}
+    for p in paths:
+        count = seen_stems.get(p.stem, 0)
+        seen_stems[p.stem] = count + 1
+        out_stem = p.stem if count == 0 else f"{p.stem}_{count}"
+        if count > 0:
+            logger.warning(
+                "Input model name %r repeats; writing its compiled output as "
+                "'%s_ctx.onnx' to avoid overwriting the earlier one.",
+                p.name,
+                out_stem,
+            )
+        out_path = out_dir / f"{out_stem}_ctx.onnx" if out_dir is not None else None
+        results.append(compiler.compile(model_path=p, output_path=out_path, config=config))
+    return results
diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index 2374e0708..3b6efd7cc 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -49,21 +49,21 @@ def process(self, context: CompileContext) -> CompileContext:
 
         Two compile paths, selected from the multi-model state on the context:
 
-        * Default (single model, ``ort.ModelCompiler``): the existing
-          ``WinMLSession``-driven path — unchanged.
-        * Plugged (``use_inference_session`` and/or ``n_total_models > 1``): a
-          backend chosen between ``ort.ModelCompiler`` and ``ort.InferenceSession``,
-          reusing one shared ``SessionOptions`` so multiple models share a single
-          EP context (weight sharing).
+        * ``_compile_model_compiler`` (single model, default): the existing
+          ``WinMLSession`` / ``ort.ModelCompiler`` path — unchanged.
+        * ``_compile_inference_session`` (``use_inference_session`` and/or
+          ``n_total_models > 1``): reuses one shared ``SessionOptions`` so multiple
+          models share a single EP context (weight sharing); the backend is
+          ``ort.InferenceSession`` when requested, else ``ort.ModelCompiler``.
         """
         context.log("Starting compile stage")
         start_time = time.time()
 
         try:
             if context.use_inference_session or context.n_total_models > 1:
-                self._compile_plugged(context)
+                self._compile_inference_session(context)
             else:
-                self._compile_default(context)
+                self._compile_model_compiler(context)
 
         except Exception as e:
             context.add_error(f"Compilation failed: {e}")
@@ -76,7 +76,7 @@ def process(self, context: CompileContext) -> CompileContext:
 
         return context
 
-    def _compile_default(self, context: CompileContext) -> None:
+    def _compile_model_compiler(self, context: CompileContext) -> None:
         """Single-model compile via ``WinMLSession`` (``ort.ModelCompiler``)."""
         # Resolve session class from compiler config
         compiler = context.config.get("compiler", "ort")
@@ -130,7 +130,7 @@ def _compile_default(self, context: CompileContext) -> None:
         if ep_config.enable_ep_context:
             self._finalize_output(context, model_path, output_dir, device=device)
 
-    def _compile_plugged(self, context: CompileContext) -> None:
+    def _compile_inference_session(self, context: CompileContext) -> None:
         """Multi-model / inference-session compile with a shared EP context.
 
         The shared ``SessionOptions`` (``context.inference_session``) is created on
@@ -153,7 +153,13 @@ def _compile_plugged(self, context: CompileContext) -> None:
         output_dir = self._get_output_dir(context)
         output_dir.mkdir(parents=True, exist_ok=True)
         model_path = self._ensure_model_file(context)
-        ctx_path = output_dir / f"{context.model_path.stem}_ctx.onnx"
+        # Honor an explicit output filename (e.g. the de-duplicated <stem>_ctx.onnx
+        # that compile_multiple_onnx assigns); otherwise derive it from the model stem.
+        user_output = context.config.get("output_path")
+        if user_output and Path(user_output).suffix == ".onnx":
+            ctx_path = Path(user_output)
+        else:
+            ctx_path = output_dir / f"{context.model_path.stem}_ctx.onnx"
         backend = "inference_session" if use_is else "model_compiler"
         context.log(
             f"[{backend}] compiling {model_path.name} "
diff --git a/tests/unit/compiler/test_compile_multiple.py b/tests/unit/compiler/test_compile_multiple.py
new file mode 100644
index 000000000..70368269b
--- /dev/null
+++ b/tests/unit/compiler/test_compile_multiple.py
@@ -0,0 +1,84 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Unit tests for ``compile_multiple_onnx`` output-name handling.
+
+``Compiler`` is mocked so these exercise the per-model output naming / de-dup
+logic only — no real compilation or EP runtime is needed.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING
+from unittest.mock import MagicMock, patch
+
+from winml.modelkit.compiler import compile_multiple_onnx
+
+
+if TYPE_CHECKING:
+    import pytest
+
+
+def _output_names(mock_compiler_cls: MagicMock) -> list[str]:
+    """Filenames passed as ``output_path`` to each ``Compiler.compile`` call, in order."""
+    calls = mock_compiler_cls.return_value.compile.call_args_list
+    return [Path(c.kwargs["output_path"]).name for c in calls]
+
+
+class TestCompileMultipleNaming:
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_duplicate_names_suffixed_with_warning(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Two inputs with the same filename: the later one gets an ``_1`` suffix and warns."""
+        m1 = tmp_path / "a" / "model.onnx"
+        m2 = tmp_path / "b" / "model.onnx"
+        out_dir = tmp_path / "out"
+        mock_compiler_cls.return_value.compile.return_value = MagicMock(success=True)
+
+        with caplog.at_level(logging.WARNING):
+            results = compile_multiple_onnx([m1, m2], out_dir)
+
+        assert len(results) == 2
+        names = _output_names(mock_compiler_cls)
+        assert names == ["model_ctx.onnx", "model_1_ctx.onnx"]
+        # Both land in the requested output directory.
+        for c in mock_compiler_cls.return_value.compile.call_args_list:
+            assert Path(c.kwargs["output_path"]).parent == out_dir
+        assert "repeats" in caplog.text
+
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_triple_duplicate_names_increment(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Three same-named inputs get incrementing suffixes: (none), _1, _2."""
+        models = [tmp_path / d / "m.onnx" for d in ("a", "b", "c")]
+        mock_compiler_cls.return_value.compile.return_value = MagicMock(success=True)
+
+        with caplog.at_level(logging.WARNING):
+            compile_multiple_onnx(models, tmp_path / "out")
+
+        assert _output_names(mock_compiler_cls) == [
+            "m_ctx.onnx",
+            "m_1_ctx.onnx",
+            "m_2_ctx.onnx",
+        ]
+        assert caplog.text.count("repeats") == 2
+
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_unique_names_no_suffix_no_warning(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Distinct filenames keep their stems and emit no warning."""
+        m1 = tmp_path / "a.onnx"
+        m2 = tmp_path / "b.onnx"
+        mock_compiler_cls.return_value.compile.return_value = MagicMock(success=True)
+
+        with caplog.at_level(logging.WARNING):
+            compile_multiple_onnx([m1, m2], tmp_path / "out")
+
+        assert _output_names(mock_compiler_cls) == ["a_ctx.onnx", "b_ctx.onnx"]
+        assert "repeats" not in caplog.text

From eebb01195ef638fda2add998aa5d1ce393a6d357 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 14:48:50 +0800
Subject: [PATCH 03/10] compile: rename CompileStage paths; always collect
 model info

- Rename _compile_model_compiler -> _compile_single_model_compiler and
  _compile_inference_session -> _compile_multiple.
- In _compile_multiple, collect model I/O info regardless of --no-validate
  (only _validate_model stays gated on context.validate).
---
 src/winml/modelkit/compiler/stages/compile.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index 3b6efd7cc..66f94116d 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -49,9 +49,9 @@ def process(self, context: CompileContext) -> CompileContext:
 
         Two compile paths, selected from the multi-model state on the context:
 
-        * ``_compile_model_compiler`` (single model, default): the existing
+        * ``_compile_single_model_compiler`` (single model, default): the existing
           ``WinMLSession`` / ``ort.ModelCompiler`` path — unchanged.
-        * ``_compile_inference_session`` (``use_inference_session`` and/or
+        * ``_compile_multiple`` (``use_inference_session`` and/or
           ``n_total_models > 1``): reuses one shared ``SessionOptions`` so multiple
           models share a single EP context (weight sharing); the backend is
           ``ort.InferenceSession`` when requested, else ``ort.ModelCompiler``.
@@ -61,9 +61,9 @@ def process(self, context: CompileContext) -> CompileContext:
 
         try:
             if context.use_inference_session or context.n_total_models > 1:
-                self._compile_inference_session(context)
+                self._compile_multiple(context)
             else:
-                self._compile_model_compiler(context)
+                self._compile_single_model_compiler(context)
 
         except Exception as e:
             context.add_error(f"Compilation failed: {e}")
@@ -76,7 +76,7 @@ def process(self, context: CompileContext) -> CompileContext:
 
         return context
 
-    def _compile_model_compiler(self, context: CompileContext) -> None:
+    def _compile_single_model_compiler(self, context: CompileContext) -> None:
         """Single-model compile via ``WinMLSession`` (``ort.ModelCompiler``)."""
         # Resolve session class from compiler config
         compiler = context.config.get("compiler", "ort")
@@ -130,7 +130,7 @@ def _compile_model_compiler(self, context: CompileContext) -> None:
         if ep_config.enable_ep_context:
             self._finalize_output(context, model_path, output_dir, device=device)
 
-    def _compile_inference_session(self, context: CompileContext) -> None:
+    def _compile_multiple(self, context: CompileContext) -> None:
         """Multi-model / inference-session compile with a shared EP context.
 
         The shared ``SessionOptions`` (``context.inference_session``) is created on
@@ -203,7 +203,8 @@ def _compile_inference_session(self, context: CompileContext) -> None:
             # Models compiled this way are loadable; validate (run) when requested.
             if context.validate:
                 self._validate_model(session, context)
-                self._collect_model_info(session, context)
+            # Collect I/O info regardless of validation.
+            self._collect_model_info(session, context)
         else:
             # ModelCompiler backend: compile straight to the EPContext file. No
             # session is created here (smoke path — outputs are checked, not loaded).

From 9c9e25e09729f0c98eeb4ceb3ee08babdc48515d Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 15:25:30 +0800
Subject: [PATCH 04/10] compile: carry use_inference_session on
 WinMLCompileConfig

- Add WinMLCompileConfig.use_inference_session (default False) + to_dict/from_dict.
- winml compile sets config.use_inference_session from the merged CLI flag /
  config-file value (CLI flag overrides); compilation reads it from the config.
- compile_multiple_onnx drops its use_inference_session parameter and reads the
  backend from config.use_inference_session.
- Tests: assert the CLI flag is applied onto the config used for compilation.
---
 src/winml/modelkit/commands/compile.py      |  7 +--
 src/winml/modelkit/compiler/compiler.py     |  9 ++--
 src/winml/modelkit/compiler/configs.py      |  6 +++
 tests/unit/compiler/test_compile_command.py | 47 ++++++++++++++++++++-
 4 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index 6026d3c88..c70f1dfe6 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -232,6 +232,9 @@ def compile(
 
     config.validate = validate
     config.verbose = bool(verbose)
+    # CLI flag (merged with any config-file value above) overrides the config; the
+    # actual compilation reads use_inference_session from the config.
+    config.use_inference_session = use_inference_session
 
     # Set compiler options
     config.ep_config.compiler = compiler
@@ -272,9 +275,7 @@ def compile(
             # Multi-model (shared EP context) and/or inference-session backend.
             # compile_multiple_onnx writes each model as <stem>_ctx.onnx into a folder;
             # multiple models require --output-dir (enforced above).
-            results = compile_multiple_onnx(
-                models, output_dir, config, use_inference_session=use_inference_session
-            )
+            results = compile_multiple_onnx(models, output_dir, config)
 
         # Report every model's result (not just the first failure).
         multi = len(results) > 1
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index a19a68fb1..60bd00f68 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -244,14 +244,14 @@ def compile_multiple_onnx(
     model_paths: Sequence[str | Path],
     output_dir: str | Path | None = None,
     config: WinMLCompileConfig | None = None,
-    use_inference_session: bool = False,
 ) -> list[CompileResult]:
     """Compile one or more ONNX models, sharing a single EP context when >1.
 
     A single :class:`Compiler` (``n_total_models=len(model_paths)``) compiles every
     model in sequence, reusing one shared ``SessionOptions`` so the weights are shared
-    across the compiled EPContext models. The backend is ``ort.ModelCompiler`` by
-    default, or ``ort.InferenceSession`` when ``use_inference_session`` is set.
+    across the compiled EPContext models. The backend is taken from
+    ``config.use_inference_session``: ``ort.ModelCompiler`` (default) or
+    ``ort.InferenceSession`` when set.
 
     Args:
         model_paths: Input ONNX model paths. Each compiles to ``<stem>_ctx.onnx`` in
@@ -260,7 +260,6 @@ def compile_multiple_onnx(
             ``model_ctx.onnx`` then ``model_1_ctx.onnx``.
         output_dir: Output directory for the compiled models.
         config: Compilation configuration. ``None`` skips compilation (passthrough).
-        use_inference_session: Use the InferenceSession backend.
 
     Returns:
         One :class:`CompileResult` per input model, in order.
@@ -270,7 +269,7 @@ def compile_multiple_onnx(
 
     compiler = Compiler(
         n_total_models=len(paths),
-        use_inference_session=use_inference_session,
+        use_inference_session=bool(config and config.use_inference_session),
     )
     # Compiled in order so the shared context accumulates and the last model flushes it.
     # Outputs are keyed by filename stem in a single folder, so disambiguate same-named
diff --git a/src/winml/modelkit/compiler/configs.py b/src/winml/modelkit/compiler/configs.py
index 2059c9528..ebb30d3e7 100644
--- a/src/winml/modelkit/compiler/configs.py
+++ b/src/winml/modelkit/compiler/configs.py
@@ -68,6 +68,8 @@ class WinMLCompileConfig:
         ep_config: Execution provider settings
         validate: Validate compiled model
         verbose: Enable verbose logging
+        use_inference_session: Compile via ort.InferenceSession (ep.context_enable)
+            instead of the default ort.ModelCompiler backend
 
     Examples:
         # Default: QNN compilation
@@ -87,6 +89,8 @@ class WinMLCompileConfig:
     # Behavior
     validate: bool = True
     verbose: bool = False
+    # Compile backend: False -> ort.ModelCompiler (default), True -> ort.InferenceSession.
+    use_inference_session: bool = False
 
     @property
     def device(self) -> str:
@@ -246,6 +250,7 @@ def to_dict(self) -> dict[str, Any]:
             ),
             "device": self.ep_config.device,
             "validate": self.validate,
+            "use_inference_session": self.use_inference_session,
         }
 
     @classmethod
@@ -265,4 +270,5 @@ def from_dict(cls, data: dict[str, Any]) -> WinMLCompileConfig:
             ep_config=ep_config,
             validate=data.get("validate", True),
             verbose=data.get("verbose", False),
+            use_inference_session=data.get("use_inference_session", False),
         )
diff --git a/tests/unit/compiler/test_compile_command.py b/tests/unit/compiler/test_compile_command.py
index 5b49da530..2626cd1d9 100644
--- a/tests/unit/compiler/test_compile_command.py
+++ b/tests/unit/compiler/test_compile_command.py
@@ -420,8 +420,51 @@ def test_multiple_models_with_output_dir_calls_compile_multiple(
         assert [str(m) for m in passed_models] == [str(m1), str(m2)]
         # Second positional arg is the output target — the --output-dir directory.
         assert call_args.args[1] == out_dir
-        # use_inference_session defaults to False (model_compiler backend).
-        assert call_args.kwargs["use_inference_session"] is False
+        # Backend is carried on the config; defaults to False (model_compiler).
+        assert call_args.args[2].use_inference_session is False
+
+    @patch("winml.modelkit.compiler.compile_multiple_onnx")
+    def test_use_inference_session_flag_overrides_config(
+        self,
+        mock_compile_multiple: MagicMock,
+        runner: CliRunner,
+        tmp_path: Path,
+    ) -> None:
+        """--use-inference-session sets config.use_inference_session=True for the compile."""
+        m1 = tmp_path / "m1.onnx"
+        m2 = tmp_path / "m2.onnx"
+        self._create_simple_onnx(m1)
+        self._create_simple_onnx(m2)
+        out_dir = tmp_path / "out"
+
+        mock_result = MagicMock()
+        mock_result.success = True
+        mock_result.output_path = out_dir / "m2_ctx.onnx"
+        mock_result.compile_time = 1.0
+        mock_result.total_time = 1.5
+        mock_compile_multiple.return_value = [mock_result, mock_result]
+
+        result = runner.invoke(
+            main,
+            [
+                "compile",
+                "-m",
+                str(m1),
+                "-m",
+                str(m2),
+                "--device",
+                "npu",
+                "--ep",
+                "qnn",
+                "--output-dir",
+                str(out_dir),
+                "--use-inference-session",
+            ],
+        )
+
+        assert result.exit_code == 0, result.output
+        # The CLI flag is applied onto the config that drives compilation.
+        assert mock_compile_multiple.call_args.args[2].use_inference_session is True
 
     def _create_simple_onnx(self, path: Path) -> None:
         """Create a simple ONNX model for testing."""

From 2b5150bf2234f859d4913172bbbc8b556d64a412 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 16:11:16 +0800
Subject: [PATCH 05/10] compile: select InferenceSession backend via --compiler
 ort_inference_session

- Replace the --use-inference-session flag with a new --compiler choice
  "ort_inference_session" (added to EP_COMPILER_MAPPING for QNN and the default
  EP so `winml compile --list` shows it).
- Drop the use_inference_session member from Compiler and WinMLCompileConfig;
  CompileContext.use_inference_session is now a property
  (config["compiler"] == "ort_inference_session").
- _compile_single_model_compiler raises ValueError if given "ort_inference_session";
  that choice is routed to the inference-session path (_compile_multiple) instead.
- Update docstrings and tests for the new compiler choice.
---
 src/winml/modelkit/commands/compile.py        | 30 ++++---------------
 src/winml/modelkit/compiler/compiler.py       | 26 +++++++---------
 src/winml/modelkit/compiler/configs.py        |  9 ++----
 src/winml/modelkit/compiler/context.py        | 10 +++++--
 src/winml/modelkit/compiler/stages/compile.py |  8 ++++-
 tests/e2e/test_compile_e2e.py                 | 14 +++++----
 tests/unit/compiler/test_compile_command.py   | 15 +++++-----
 7 files changed, 49 insertions(+), 63 deletions(-)

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index c70f1dfe6..83c3a3210 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -76,9 +76,10 @@
 )
 @click.option(
     "--compiler",
-    type=click.Choice(["ort", "qairt"]),
+    type=click.Choice(["ort", "ort_inference_session", "qairt"]),
     default="ort",
-    help="Compiler backend (default: ort)",
+    help="Compiler backend (default: ort). 'ort_inference_session' compiles via "
+    "ort.InferenceSession (ep.context_enable) — required for shared-context multi-model.",
 )
 @click.option(
     "--qnn-sdk-root",
@@ -92,13 +93,6 @@
     show_default=True,
     help="Embed EP context in ONNX file (default: external .bin file)",
 )
-@click.option(
-    "--use-inference-session",
-    is_flag=True,
-    default=False,
-    help="Compile via ort.InferenceSession (ep.context_enable) instead of the default "
-    "ort.ModelCompiler backend.",
-)
 @click.option(
     "--list",
     "list_compilers_flag",
@@ -122,7 +116,6 @@ def compile(
     compiler: str,
     qnn_sdk_root: Path | None,
     embed: bool,
-    use_inference_session: bool,
     list_compilers_flag: bool,
     config_file: Path | None,
 ) -> None:
@@ -163,11 +156,6 @@ def compile(
             compiler = cc["compiler"]
         if not cli_utils.is_cli_provided(ctx, "embed") and "embed_context" in cc:
             embed = cc["embed_context"]
-        if (
-            not cli_utils.is_cli_provided(ctx, "use_inference_session")
-            and "use_inference_session" in cc
-        ):
-            use_inference_session = cc["use_inference_session"]
         if not cli_utils.is_cli_provided(ctx, "validate") and "validate" in cc:
             validate = cc["validate"]
         # Config-file verbosity fallback. CLI flags always win: only honor the
@@ -232,11 +220,9 @@ def compile(
 
     config.validate = validate
     config.verbose = bool(verbose)
-    # CLI flag (merged with any config-file value above) overrides the config; the
-    # actual compilation reads use_inference_session from the config.
-    config.use_inference_session = use_inference_session
 
-    # Set compiler options
+    # Set compiler options. The compiler choice selects the backend:
+    # "ort_inference_session" -> ort.InferenceSession, else ort.ModelCompiler / qairt.
     config.ep_config.compiler = compiler
     config.ep_config.qnn_sdk_root = qnn_sdk_root
     config.ep_config.embed_context = embed
@@ -251,10 +237,6 @@ def compile(
         console.print(f"[bold blue]EP:[/bold blue] {ep}")
     console.print(f"[bold blue]Provider:[/bold blue] {provider}")
     console.print(f"[bold blue]Compiler:[/bold blue] {compiler}")
-    console.print(
-        f"[bold blue]Backend:[/bold blue] "
-        f"{'inference_session' if use_inference_session else 'model_compiler'}"
-    )
     if len(models) > 1:
         console.print(f"[bold blue]Shared EP context:[/bold blue] yes ({len(models)} models)")
     if qnn_sdk_root:
@@ -268,7 +250,7 @@ def compile(
 
     try:
         console.print("\n[bold]Compiling model(s)...[/bold]")
-        if len(models) == 1 and not use_inference_session:
+        if len(models) == 1 and compiler != "ort_inference_session":
             # Default path: single model via ort.ModelCompiler (staged pipeline).
             results = [compile_onnx(models[0], output_path=resolved_output, config=config)]
         else:
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index 60bd00f68..80bafd3cb 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -29,8 +29,8 @@
 
 # EP → available compilers. Keys are canonical EPName (or None for the default).
 EP_COMPILER_MAPPING: dict[EPName | None, list[str]] = {
-    "QNNExecutionProvider": ["ort", "qairt"],
-    None: ["ort"],
+    "QNNExecutionProvider": ["ort", "ort_inference_session", "qairt"],
+    None: ["ort", "ort_inference_session"],
 }
 
 
@@ -59,23 +59,19 @@ class Compiler:
     # Registered stages (in execution order)
     _stages: list[type[BaseStage]] | None = None
 
-    def __init__(
-        self,
-        n_total_models: int = 1,
-        use_inference_session: bool = False,
-    ) -> None:
+    def __init__(self, n_total_models: int = 1) -> None:
         """Create a compiler.
 
         Args:
             n_total_models: Total number of models compiled by this instance. When
                 >1, the models share a single EP context (weight sharing) and the
                 same shared ``SessionOptions`` is reused across every ``compile``.
-            use_inference_session: Select the ``ort.InferenceSession``
-                (``ep.context_enable``) backend instead of the default
-                ``ort.ModelCompiler``.
+
+        The compile backend (ort.ModelCompiler vs ort.InferenceSession) is taken from
+        the config's ``compiler`` setting ("ort_inference_session" selects the
+        InferenceSession backend), surfaced via ``CompileContext.use_inference_session``.
         """
         self.n_total_models = n_total_models
-        self.use_inference_session = use_inference_session
         # The shared SessionOptions: created by CompileStage on the first model and
         # reused for the rest (kept here so it survives between compile() calls).
         self.inference_session: object | None = None
@@ -141,7 +137,6 @@ def compile(
                 verbose=config.verbose,
                 n_compiled_models=self.n_compiled_models,
                 n_total_models=self.n_total_models,
-                use_inference_session=self.use_inference_session,
                 inference_session=self.inference_session,
             )
 
@@ -267,10 +262,9 @@ def compile_multiple_onnx(
     paths = [Path(mp) for mp in model_paths]
     out_dir = Path(output_dir) if output_dir is not None else None
 
-    compiler = Compiler(
-        n_total_models=len(paths),
-        use_inference_session=bool(config and config.use_inference_session),
-    )
+    # Backend is taken from config.ep_config.compiler ("ort_inference_session" selects
+    # the InferenceSession backend), surfaced via CompileContext.use_inference_session.
+    compiler = Compiler(n_total_models=len(paths))
     # Compiled in order so the shared context accumulates and the last model flushes it.
     # Outputs are keyed by filename stem in a single folder, so disambiguate same-named
     # inputs by suffixing the later one(s) instead of overwriting.
diff --git a/src/winml/modelkit/compiler/configs.py b/src/winml/modelkit/compiler/configs.py
index ebb30d3e7..b9a816a70 100644
--- a/src/winml/modelkit/compiler/configs.py
+++ b/src/winml/modelkit/compiler/configs.py
@@ -37,7 +37,8 @@ class EPConfig:
         provider_options: EP-specific options as key=value dict
         enable_ep_context: Generate EPContext model with pre-compiled graph
         embed_context: Embed context in ONNX (True) or external .bin file (False)
-        compiler: Compiler backend ("ort" or "qairt")
+        compiler: Compiler backend ("ort", "ort_inference_session", or "qairt").
+            "ort_inference_session" selects the ort.InferenceSession backend.
         qnn_sdk_root: Path to QAIRT SDK root (required when compiler is "qairt")
         device: Target device ("npu", "gpu", "cpu", "auto")
     """
@@ -68,8 +69,6 @@ class WinMLCompileConfig:
         ep_config: Execution provider settings
         validate: Validate compiled model
         verbose: Enable verbose logging
-        use_inference_session: Compile via ort.InferenceSession (ep.context_enable)
-            instead of the default ort.ModelCompiler backend
 
     Examples:
         # Default: QNN compilation
@@ -89,8 +88,6 @@ class WinMLCompileConfig:
     # Behavior
     validate: bool = True
     verbose: bool = False
-    # Compile backend: False -> ort.ModelCompiler (default), True -> ort.InferenceSession.
-    use_inference_session: bool = False
 
     @property
     def device(self) -> str:
@@ -250,7 +247,6 @@ def to_dict(self) -> dict[str, Any]:
             ),
             "device": self.ep_config.device,
             "validate": self.validate,
-            "use_inference_session": self.use_inference_session,
         }
 
     @classmethod
@@ -270,5 +266,4 @@ def from_dict(cls, data: dict[str, Any]) -> WinMLCompileConfig:
             ep_config=ep_config,
             validate=data.get("validate", True),
             verbose=data.get("verbose", False),
-            use_inference_session=data.get("use_inference_session", False),
         )
diff --git a/src/winml/modelkit/compiler/context.py b/src/winml/modelkit/compiler/context.py
index 572c5bebe..6d8559091 100644
--- a/src/winml/modelkit/compiler/context.py
+++ b/src/winml/modelkit/compiler/context.py
@@ -48,12 +48,10 @@ class CompileContext:
     # n_compiled_models: how many models the Compiler has already compiled (0-based
     #   index of the current model).
     # n_total_models: total models in this compile run (>1 enables weight sharing).
-    # use_inference_session: pick the InferenceSession backend over ort.ModelCompiler.
     # inference_session: the shared ort.SessionOptions created on the first model and
     #   reused for the rest (the EP is added once and the share group lives on it).
     n_compiled_models: int = 0
     n_total_models: int = 1
-    use_inference_session: bool = False
     inference_session: ort.SessionOptions | None = None
 
     # Output paths
@@ -103,6 +101,14 @@ def execution_provider(self) -> EPAlias:
         """Get target execution provider."""
         return cast("EPAlias", self.config.get("execution_provider", "qnn"))
 
+    @property
+    def use_inference_session(self) -> bool:
+        """Whether to use the ort.InferenceSession backend (vs ort.ModelCompiler).
+
+        True iff the configured compiler is ``"ort_inference_session"``.
+        """
+        return self.config.get("compiler") == "ort_inference_session"
+
     @property
     def enable_ep_context(self) -> bool:
         """Whether to generate EPContext model."""
diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index 66f94116d..12fa273cb 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -78,8 +78,14 @@ def process(self, context: CompileContext) -> CompileContext:
 
     def _compile_single_model_compiler(self, context: CompileContext) -> None:
         """Single-model compile via ``WinMLSession`` (``ort.ModelCompiler``)."""
-        # Resolve session class from compiler config
+        # Resolve session class from compiler config. "ort_inference_session" must not
+        # reach here — it routes to _compile_multiple via context.use_inference_session.
         compiler = context.config.get("compiler", "ort")
+        if compiler == "ort_inference_session":
+            raise ValueError(
+                "'ort_inference_session' is handled by the inference-session path, "
+                "not the single-model ModelCompiler path."
+            )
         session_cls = COMPILER_SESSION_MAPPING[compiler]
 
         # Determine final output directory (default: same as input model)
diff --git a/tests/e2e/test_compile_e2e.py b/tests/e2e/test_compile_e2e.py
index c4192a73f..9fd56e368 100644
--- a/tests/e2e/test_compile_e2e.py
+++ b/tests/e2e/test_compile_e2e.py
@@ -253,7 +253,6 @@ def test_help_lists_every_option(self) -> None:
             "--compiler",
             "--qnn-sdk-root",
             "--embed",
-            "--use-inference-session",
             "--list",
         ):
             assert opt in result.output, f"--help missing {opt}"
@@ -934,8 +933,8 @@ def _sample_for(model_path: Path) -> dict[str, np.ndarray]:
 def test_default_backend_uses_model_compiler(
     simple_matmul_onnx: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:
-    """By default (no ``--use-inference-session``), a single-model compile is driven
-    by ``ort.ModelCompiler``."""
+    """By default (``--compiler ort``), a single-model compile is driven by
+    ``ort.ModelCompiler``."""
     require_ep("qnn")
     import onnxruntime as ort
 
@@ -980,12 +979,15 @@ def test_multi_model_shared_weights(
 
     cmd = ["-m", str(m_seq4), "-m", str(m_seq1), "--ep", "qnn", "--output-dir", str(out_dir)]
     if use_inference_session:
-        cmd.append("--use-inference-session")
+        cmd += ["--compiler", "ort_inference_session"]
     result = _invoke(*cmd)
     assert result.exit_code == 0, result.output
     assert "Success! Model compiled" in result.output, result.output
-    expected_backend = "inference_session" if use_inference_session else "model_compiler"
-    assert expected_backend in result.output.lower(), result.output
+    # The InferenceSession backend is selected via --compiler ort_inference_session.
+    if use_inference_session:
+        assert "ort_inference_session" in result.output
+    else:
+        assert "ort_inference_session" not in result.output
 
     # Both compiled wrappers exist + exactly one shared weights bin (weight sharing).
     ctx4 = out_dir / f"{m_seq4.stem}_ctx.onnx"
diff --git a/tests/unit/compiler/test_compile_command.py b/tests/unit/compiler/test_compile_command.py
index 2626cd1d9..1ae61fafc 100644
--- a/tests/unit/compiler/test_compile_command.py
+++ b/tests/unit/compiler/test_compile_command.py
@@ -420,17 +420,17 @@ def test_multiple_models_with_output_dir_calls_compile_multiple(
         assert [str(m) for m in passed_models] == [str(m1), str(m2)]
         # Second positional arg is the output target — the --output-dir directory.
         assert call_args.args[1] == out_dir
-        # Backend is carried on the config; defaults to False (model_compiler).
-        assert call_args.args[2].use_inference_session is False
+        # Backend is carried on the config's compiler; defaults to "ort" (ModelCompiler).
+        assert call_args.args[2].ep_config.compiler == "ort"
 
     @patch("winml.modelkit.compiler.compile_multiple_onnx")
-    def test_use_inference_session_flag_overrides_config(
+    def test_inference_session_compiler_sets_config(
         self,
         mock_compile_multiple: MagicMock,
         runner: CliRunner,
         tmp_path: Path,
     ) -> None:
-        """--use-inference-session sets config.use_inference_session=True for the compile."""
+        """--compiler ort_inference_session is carried on the config used for compilation."""
         m1 = tmp_path / "m1.onnx"
         m2 = tmp_path / "m2.onnx"
         self._create_simple_onnx(m1)
@@ -458,13 +458,14 @@ def test_use_inference_session_flag_overrides_config(
                 "qnn",
                 "--output-dir",
                 str(out_dir),
-                "--use-inference-session",
+                "--compiler",
+                "ort_inference_session",
             ],
         )
 
         assert result.exit_code == 0, result.output
-        # The CLI flag is applied onto the config that drives compilation.
-        assert mock_compile_multiple.call_args.args[2].use_inference_session is True
+        # The compiler choice is applied onto the config that drives compilation.
+        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == "ort_inference_session"
 
     def _create_simple_onnx(self, path: Path) -> None:
         """Create a simple ONNX model for testing."""

From 90349809c32dc9c52160bb148f0cc67b5d52531e Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 16:37:46 +0800
Subject: [PATCH 06/10] compile: output_path arg with file/dir rules; rename
 shared_session_options

- compile_multiple_onnx: rename output_dir -> output_path. A single model may pass a
  file or a directory (a path with a suffix, e.g. ".onnx", is treated as a file);
  multiple models must pass a directory (asserted, since outputs share one folder).
  Resolve each model's output accordingly; the CLI passes the resolved
  -o/--output-dir path.
- Rename Compiler.inference_session / CompileContext.inference_session ->
  shared_session_options and tighten the annotation to ort.SessionOptions | None
  (they hold SessionOptions, not an InferenceSession).
- Add unit tests for the output_path file/dir rules.
---
 src/winml/modelkit/commands/compile.py        |  6 +-
 src/winml/modelkit/compiler/compiler.py       | 54 ++++++++++++------
 src/winml/modelkit/compiler/context.py        |  6 +-
 src/winml/modelkit/compiler/stages/compile.py |  6 +-
 tests/unit/compiler/test_compile_multiple.py  | 55 +++++++++++++++++--
 5 files changed, 97 insertions(+), 30 deletions(-)

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index 83c3a3210..97aba8fd7 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -255,9 +255,9 @@ def compile(
             results = [compile_onnx(models[0], output_path=resolved_output, config=config)]
         else:
             # Multi-model (shared EP context) and/or inference-session backend.
-            # compile_multiple_onnx writes each model as <stem>_ctx.onnx into a folder;
-            # multiple models require --output-dir (enforced above).
-            results = compile_multiple_onnx(models, output_dir, config)
+            # Multiple models require --output-dir (a directory, enforced above); a
+            # single inference_session model may use -o (a file) or --output-dir.
+            results = compile_multiple_onnx(models, resolved_output, config)
 
         # Report every model's result (not just the first failure).
         multi = len(results) > 1
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index 80bafd3cb..af026e17b 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -22,6 +22,8 @@
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
+    import onnxruntime as ort
+
     from ..utils.constants import EPName
     from .configs import WinMLCompileConfig
     from .stages.base import BaseStage
@@ -74,7 +76,7 @@ def __init__(self, n_total_models: int = 1) -> None:
         self.n_total_models = n_total_models
         # The shared SessionOptions: created by CompileStage on the first model and
         # reused for the rest (kept here so it survives between compile() calls).
-        self.inference_session: object | None = None
+        self.shared_session_options: ort.SessionOptions | None = None
         self.n_compiled_models = 0
 
     @classmethod
@@ -137,7 +139,7 @@ def compile(
                 verbose=config.verbose,
                 n_compiled_models=self.n_compiled_models,
                 n_total_models=self.n_total_models,
-                inference_session=self.inference_session,
+                shared_session_options=self.shared_session_options,
             )
 
             if output_path is not None:
@@ -160,7 +162,7 @@ def compile(
 
             # Carry the shared SessionOptions (created/reused by CompileStage) forward
             # so the next model in a shared-context run reuses the same EP + group.
-            self.inference_session = context.inference_session
+            self.shared_session_options = context.shared_session_options
             self.n_compiled_models += 1
 
             # Build result
@@ -237,7 +239,7 @@ def compile_onnx(
 
 def compile_multiple_onnx(
     model_paths: Sequence[str | Path],
-    output_dir: str | Path | None = None,
+    output_path: str | Path | None = None,
     config: WinMLCompileConfig | None = None,
 ) -> list[CompileResult]:
     """Compile one or more ONNX models, sharing a single EP context when >1.
@@ -245,29 +247,43 @@ def compile_multiple_onnx(
     A single :class:`Compiler` (``n_total_models=len(model_paths)``) compiles every
     model in sequence, reusing one shared ``SessionOptions`` so the weights are shared
     across the compiled EPContext models. The backend is taken from
-    ``config.use_inference_session``: ``ort.ModelCompiler`` (default) or
-    ``ort.InferenceSession`` when set.
+    ``config.ep_config.compiler``: ``ort.ModelCompiler`` (default) or
+    ``ort.InferenceSession`` when it is ``"ort_inference_session"``.
 
     Args:
-        model_paths: Input ONNX model paths. Each compiles to ``<stem>_ctx.onnx`` in
-            ``output_dir``; inputs that share a filename stem are disambiguated by
-            appending an integer suffix to the later one(s) (with a warning), e.g.
-            ``model_ctx.onnx`` then ``model_1_ctx.onnx``.
-        output_dir: Output directory for the compiled models.
+        model_paths: Input ONNX model paths.
+        output_path: Where to write the compiled model(s).
+
+            * With a **single** model it may be a **file** path (the exact
+              ``*_ctx.onnx``) or a **directory** (``<stem>_ctx.onnx`` is written into
+              it); ``None`` writes next to the input.
+            * With **multiple** models it **must be a directory** — each model is
+              written as ``<stem>_ctx.onnx`` there, with same-named inputs disambiguated
+              by an integer suffix on the later one(s) (with a warning), e.g.
+              ``model_ctx.onnx`` then ``model_1_ctx.onnx``.
         config: Compilation configuration. ``None`` skips compilation (passthrough).
 
     Returns:
         One :class:`CompileResult` per input model, in order.
     """
     paths = [Path(mp) for mp in model_paths]
-    out_dir = Path(output_dir) if output_dir is not None else None
+    out = Path(output_path) if output_path is not None else None
+    # A path with a suffix (e.g. ".onnx") is a file; otherwise it's a directory.
+    out_is_file = out is not None and bool(out.suffix)
+
+    if len(paths) > 1:
+        out_is_dir = out is not None and not out_is_file
+        assert out_is_dir, (
+            "output_path must be a directory when compiling multiple models "
+            f"(shared EP context), got {output_path!r}"
+        )
 
     # Backend is taken from config.ep_config.compiler ("ort_inference_session" selects
     # the InferenceSession backend), surfaced via CompileContext.use_inference_session.
     compiler = Compiler(n_total_models=len(paths))
     # Compiled in order so the shared context accumulates and the last model flushes it.
-    # Outputs are keyed by filename stem in a single folder, so disambiguate same-named
-    # inputs by suffixing the later one(s) instead of overwriting.
+    # When writing into a directory, outputs are keyed by filename stem, so disambiguate
+    # same-named inputs by suffixing the later one(s) instead of overwriting.
     results: list[CompileResult] = []
     seen_stems: dict[str, int] = {}
     for p in paths:
@@ -281,6 +297,12 @@ def compile_multiple_onnx(
                 p.name,
                 out_stem,
             )
-        out_path = out_dir / f"{out_stem}_ctx.onnx" if out_dir is not None else None
-        results.append(compiler.compile(model_path=p, output_path=out_path, config=config))
+        if out is None:
+            resolved = None
+        elif out_is_file:
+            # Single-model file path: write exactly there.
+            resolved = out
+        else:
+            resolved = out / f"{out_stem}_ctx.onnx"
+        results.append(compiler.compile(model_path=p, output_path=resolved, config=config))
     return results
diff --git a/src/winml/modelkit/compiler/context.py b/src/winml/modelkit/compiler/context.py
index 6d8559091..aca40c6b3 100644
--- a/src/winml/modelkit/compiler/context.py
+++ b/src/winml/modelkit/compiler/context.py
@@ -48,11 +48,11 @@ class CompileContext:
     # n_compiled_models: how many models the Compiler has already compiled (0-based
     #   index of the current model).
     # n_total_models: total models in this compile run (>1 enables weight sharing).
-    # inference_session: the shared ort.SessionOptions created on the first model and
-    #   reused for the rest (the EP is added once and the share group lives on it).
+    # shared_session_options: the shared ort.SessionOptions created on the first model
+    #   and reused for the rest (the EP is added once and the share group lives on it).
     n_compiled_models: int = 0
     n_total_models: int = 1
-    inference_session: ort.SessionOptions | None = None
+    shared_session_options: ort.SessionOptions | None = None
 
     # Output paths
     output_path: Path | None = None
diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index 12fa273cb..af5d58446 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -139,7 +139,7 @@ def _compile_single_model_compiler(self, context: CompileContext) -> None:
     def _compile_multiple(self, context: CompileContext) -> None:
         """Multi-model / inference-session compile with a shared EP context.
 
-        The shared ``SessionOptions`` (``context.inference_session``) is created on
+        The shared ``SessionOptions`` (``context.shared_session_options``) is created on
         the first model — the EP is added once and, for a multi-model run, the
         ``ep.share_ep_contexts`` group is opened on it — then reused for every model.
         ``ep.stop_share_ep_contexts`` is added before the final model so the shared
@@ -173,7 +173,7 @@ def _compile_multiple(self, context: CompileContext) -> None:
         )
 
         # Build the shared SessionOptions once; reuse it for subsequent models.
-        sess_options = context.inference_session
+        sess_options = context.shared_session_options
         if sess_options is None:
             register_execution_providers(ort=True)
             resolved_device, _ = resolve_device(context.config.get("device", "auto"))
@@ -192,7 +192,7 @@ def _compile_multiple(self, context: CompileContext) -> None:
                 sess_options, ep, device_type, dict(ep_config.provider_options)
             ):
                 raise RuntimeError(f"Could not add {ep} for device type {device_type}")
-            context.inference_session = sess_options  # captured by Compiler for reuse
+            context.shared_session_options = sess_options  # captured by Compiler for reuse
 
         # Last model in a shared run flushes the shared context.
         if multi and is_last:
diff --git a/tests/unit/compiler/test_compile_multiple.py b/tests/unit/compiler/test_compile_multiple.py
index 70368269b..7f5a9959d 100644
--- a/tests/unit/compiler/test_compile_multiple.py
+++ b/tests/unit/compiler/test_compile_multiple.py
@@ -12,14 +12,11 @@
 
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING
 from unittest.mock import MagicMock, patch
 
-from winml.modelkit.compiler import compile_multiple_onnx
-
+import pytest
 
-if TYPE_CHECKING:
-    import pytest
+from winml.modelkit.compiler import compile_multiple_onnx
 
 
 def _output_names(mock_compiler_cls: MagicMock) -> list[str]:
@@ -82,3 +79,51 @@ def test_unique_names_no_suffix_no_warning(
 
         assert _output_names(mock_compiler_cls) == ["a_ctx.onnx", "b_ctx.onnx"]
         assert "repeats" not in caplog.text
+
+
+def _single_output(mock_compiler_cls: MagicMock) -> Path | None:
+    """The ``output_path`` passed to the single ``Compiler.compile`` call."""
+    out = mock_compiler_cls.return_value.compile.call_args.kwargs["output_path"]
+    return Path(out) if out is not None else None
+
+
+class TestCompileMultipleOutputPath:
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_multiple_models_require_directory(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path
+    ) -> None:
+        """Multiple models with a file output_path (has a suffix) is rejected."""
+        m1 = tmp_path / "a" / "m.onnx"
+        m2 = tmp_path / "b" / "m.onnx"
+        with pytest.raises(AssertionError, match="must be a directory"):
+            compile_multiple_onnx([m1, m2], tmp_path / "out.onnx")
+
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_multiple_models_reject_none_output(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path
+    ) -> None:
+        """Multiple models with no output_path is rejected (would break shared context)."""
+        m1 = tmp_path / "a" / "m.onnx"
+        m2 = tmp_path / "b" / "m.onnx"
+        with pytest.raises(AssertionError, match="must be a directory"):
+            compile_multiple_onnx([m1, m2], None)
+
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_single_model_file_output_path(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path
+    ) -> None:
+        """A single model accepts a file output_path and writes exactly there."""
+        mock_compiler_cls.return_value.compile.return_value = MagicMock(success=True)
+        out_file = tmp_path / "custom_name.onnx"
+        compile_multiple_onnx([tmp_path / "model.onnx"], out_file)
+        assert _single_output(mock_compiler_cls) == out_file
+
+    @patch("winml.modelkit.compiler.compiler.Compiler")
+    def test_single_model_dir_output_path(
+        self, mock_compiler_cls: MagicMock, tmp_path: Path
+    ) -> None:
+        """A single model with a directory output_path writes <stem>_ctx.onnx into it."""
+        mock_compiler_cls.return_value.compile.return_value = MagicMock(success=True)
+        out_dir = tmp_path / "out"
+        compile_multiple_onnx([tmp_path / "model.onnx"], out_dir)
+        assert _single_output(mock_compiler_cls) == out_dir / "model_ctx.onnx"

From 3d37b4da92e8246669af846b6f6ffebef7d4f3cf Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Thu, 11 Jun 2026 17:33:46 +0800
Subject: [PATCH 07/10] compile: rename compiler choice ort_inference_session
 -> ort_jit

Rename the InferenceSession-backend --compiler choice (and its references in
EP_COMPILER_MAPPING, the CompileContext.use_inference_session property, the
single-model guard, docstrings, and tests) from "ort_inference_session" to "ort_jit".
---
 src/winml/modelkit/commands/compile.py        |  8 ++++----
 src/winml/modelkit/compiler/compiler.py       | 10 +++++-----
 src/winml/modelkit/compiler/configs.py        |  4 ++--
 src/winml/modelkit/compiler/context.py        |  4 ++--
 src/winml/modelkit/compiler/stages/compile.py |  6 +++---
 tests/e2e/test_compile_e2e.py                 |  8 ++++----
 tests/unit/compiler/test_compile_command.py   |  8 ++++----
 7 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index 97aba8fd7..018864d76 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -76,9 +76,9 @@
 )
 @click.option(
     "--compiler",
-    type=click.Choice(["ort", "ort_inference_session", "qairt"]),
+    type=click.Choice(["ort", "ort_jit", "qairt"]),
     default="ort",
-    help="Compiler backend (default: ort). 'ort_inference_session' compiles via "
+    help="Compiler backend (default: ort). 'ort_jit' compiles via "
     "ort.InferenceSession (ep.context_enable) — required for shared-context multi-model.",
 )
 @click.option(
@@ -222,7 +222,7 @@ def compile(
     config.verbose = bool(verbose)
 
     # Set compiler options. The compiler choice selects the backend:
-    # "ort_inference_session" -> ort.InferenceSession, else ort.ModelCompiler / qairt.
+    # "ort_jit" -> ort.InferenceSession, else ort.ModelCompiler / qairt.
     config.ep_config.compiler = compiler
     config.ep_config.qnn_sdk_root = qnn_sdk_root
     config.ep_config.embed_context = embed
@@ -250,7 +250,7 @@ def compile(
 
     try:
         console.print("\n[bold]Compiling model(s)...[/bold]")
-        if len(models) == 1 and compiler != "ort_inference_session":
+        if len(models) == 1 and compiler != "ort_jit":
             # Default path: single model via ort.ModelCompiler (staged pipeline).
             results = [compile_onnx(models[0], output_path=resolved_output, config=config)]
         else:
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index af026e17b..495b301f2 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -31,8 +31,8 @@
 
 # EP → available compilers. Keys are canonical EPName (or None for the default).
 EP_COMPILER_MAPPING: dict[EPName | None, list[str]] = {
-    "QNNExecutionProvider": ["ort", "ort_inference_session", "qairt"],
-    None: ["ort", "ort_inference_session"],
+    "QNNExecutionProvider": ["ort", "ort_jit", "qairt"],
+    None: ["ort", "ort_jit"],
 }
 
 
@@ -70,7 +70,7 @@ def __init__(self, n_total_models: int = 1) -> None:
                 same shared ``SessionOptions`` is reused across every ``compile``.
 
         The compile backend (ort.ModelCompiler vs ort.InferenceSession) is taken from
-        the config's ``compiler`` setting ("ort_inference_session" selects the
+        the config's ``compiler`` setting ("ort_jit" selects the
         InferenceSession backend), surfaced via ``CompileContext.use_inference_session``.
         """
         self.n_total_models = n_total_models
@@ -248,7 +248,7 @@ def compile_multiple_onnx(
     model in sequence, reusing one shared ``SessionOptions`` so the weights are shared
     across the compiled EPContext models. The backend is taken from
     ``config.ep_config.compiler``: ``ort.ModelCompiler`` (default) or
-    ``ort.InferenceSession`` when it is ``"ort_inference_session"``.
+    ``ort.InferenceSession`` when it is ``"ort_jit"``.
 
     Args:
         model_paths: Input ONNX model paths.
@@ -278,7 +278,7 @@ def compile_multiple_onnx(
             f"(shared EP context), got {output_path!r}"
         )
 
-    # Backend is taken from config.ep_config.compiler ("ort_inference_session" selects
+    # Backend is taken from config.ep_config.compiler ("ort_jit" selects
     # the InferenceSession backend), surfaced via CompileContext.use_inference_session.
     compiler = Compiler(n_total_models=len(paths))
     # Compiled in order so the shared context accumulates and the last model flushes it.
diff --git a/src/winml/modelkit/compiler/configs.py b/src/winml/modelkit/compiler/configs.py
index b9a816a70..a52e3ee1f 100644
--- a/src/winml/modelkit/compiler/configs.py
+++ b/src/winml/modelkit/compiler/configs.py
@@ -37,8 +37,8 @@ class EPConfig:
         provider_options: EP-specific options as key=value dict
         enable_ep_context: Generate EPContext model with pre-compiled graph
         embed_context: Embed context in ONNX (True) or external .bin file (False)
-        compiler: Compiler backend ("ort", "ort_inference_session", or "qairt").
-            "ort_inference_session" selects the ort.InferenceSession backend.
+        compiler: Compiler backend ("ort", "ort_jit", or "qairt").
+            "ort_jit" selects the ort.InferenceSession backend.
         qnn_sdk_root: Path to QAIRT SDK root (required when compiler is "qairt")
         device: Target device ("npu", "gpu", "cpu", "auto")
     """
diff --git a/src/winml/modelkit/compiler/context.py b/src/winml/modelkit/compiler/context.py
index aca40c6b3..0017490cb 100644
--- a/src/winml/modelkit/compiler/context.py
+++ b/src/winml/modelkit/compiler/context.py
@@ -105,9 +105,9 @@ def execution_provider(self) -> EPAlias:
     def use_inference_session(self) -> bool:
         """Whether to use the ort.InferenceSession backend (vs ort.ModelCompiler).
 
-        True iff the configured compiler is ``"ort_inference_session"``.
+        True iff the configured compiler is ``"ort_jit"``.
         """
-        return self.config.get("compiler") == "ort_inference_session"
+        return self.config.get("compiler") == "ort_jit"
 
     @property
     def enable_ep_context(self) -> bool:
diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index af5d58446..70d94823f 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -78,12 +78,12 @@ def process(self, context: CompileContext) -> CompileContext:
 
     def _compile_single_model_compiler(self, context: CompileContext) -> None:
         """Single-model compile via ``WinMLSession`` (``ort.ModelCompiler``)."""
-        # Resolve session class from compiler config. "ort_inference_session" must not
+        # Resolve session class from compiler config. "ort_jit" must not
         # reach here — it routes to _compile_multiple via context.use_inference_session.
         compiler = context.config.get("compiler", "ort")
-        if compiler == "ort_inference_session":
+        if compiler == "ort_jit":
             raise ValueError(
-                "'ort_inference_session' is handled by the inference-session path, "
+                "'ort_jit' is handled by the inference-session path, "
                 "not the single-model ModelCompiler path."
             )
         session_cls = COMPILER_SESSION_MAPPING[compiler]
diff --git a/tests/e2e/test_compile_e2e.py b/tests/e2e/test_compile_e2e.py
index 9fd56e368..b1111221b 100644
--- a/tests/e2e/test_compile_e2e.py
+++ b/tests/e2e/test_compile_e2e.py
@@ -979,15 +979,15 @@ def test_multi_model_shared_weights(
 
     cmd = ["-m", str(m_seq4), "-m", str(m_seq1), "--ep", "qnn", "--output-dir", str(out_dir)]
     if use_inference_session:
-        cmd += ["--compiler", "ort_inference_session"]
+        cmd += ["--compiler", "ort_jit"]
     result = _invoke(*cmd)
     assert result.exit_code == 0, result.output
     assert "Success! Model compiled" in result.output, result.output
-    # The InferenceSession backend is selected via --compiler ort_inference_session.
+    # The InferenceSession backend is selected via --compiler ort_jit.
     if use_inference_session:
-        assert "ort_inference_session" in result.output
+        assert "ort_jit" in result.output
     else:
-        assert "ort_inference_session" not in result.output
+        assert "ort_jit" not in result.output
 
     # Both compiled wrappers exist + exactly one shared weights bin (weight sharing).
     ctx4 = out_dir / f"{m_seq4.stem}_ctx.onnx"
diff --git a/tests/unit/compiler/test_compile_command.py b/tests/unit/compiler/test_compile_command.py
index 1ae61fafc..6e70a388a 100644
--- a/tests/unit/compiler/test_compile_command.py
+++ b/tests/unit/compiler/test_compile_command.py
@@ -424,13 +424,13 @@ def test_multiple_models_with_output_dir_calls_compile_multiple(
         assert call_args.args[2].ep_config.compiler == "ort"
 
     @patch("winml.modelkit.compiler.compile_multiple_onnx")
-    def test_inference_session_compiler_sets_config(
+    def test_ort_jit_compiler_sets_config(
         self,
         mock_compile_multiple: MagicMock,
         runner: CliRunner,
         tmp_path: Path,
     ) -> None:
-        """--compiler ort_inference_session is carried on the config used for compilation."""
+        """--compiler ort_jit is carried on the config used for compilation."""
         m1 = tmp_path / "m1.onnx"
         m2 = tmp_path / "m2.onnx"
         self._create_simple_onnx(m1)
@@ -459,13 +459,13 @@ def test_inference_session_compiler_sets_config(
                 "--output-dir",
                 str(out_dir),
                 "--compiler",
-                "ort_inference_session",
+                "ort_jit",
             ],
         )
 
         assert result.exit_code == 0, result.output
         # The compiler choice is applied onto the config that drives compilation.
-        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == "ort_inference_session"
+        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == "ort_jit"
 
     def _create_simple_onnx(self, path: Path) -> None:
         """Create a simple ONNX model for testing."""

From f9a540674b631d6760b12804f562e824c60afedd Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Fri, 12 Jun 2026 10:52:48 +0800
Subject: [PATCH 08/10] compile: raise ValueError instead of assert for
 multi-model output_path

assert is stripped under python -O / PYTHONOPTIMIZE=1, so the multi-model
output_path invariant would silently disappear in optimized builds. Raise
ValueError instead, and update the unit tests to expect it.
---
 src/winml/modelkit/compiler/compiler.py      | 5 ++---
 tests/unit/compiler/test_compile_multiple.py | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index 495b301f2..2fad0261b 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -271,9 +271,8 @@ def compile_multiple_onnx(
     # A path with a suffix (e.g. ".onnx") is a file; otherwise it's a directory.
     out_is_file = out is not None and bool(out.suffix)
 
-    if len(paths) > 1:
-        out_is_dir = out is not None and not out_is_file
-        assert out_is_dir, (
+    if len(paths) > 1 and (out is None or out_is_file):
+        raise ValueError(
             "output_path must be a directory when compiling multiple models "
             f"(shared EP context), got {output_path!r}"
         )
diff --git a/tests/unit/compiler/test_compile_multiple.py b/tests/unit/compiler/test_compile_multiple.py
index 7f5a9959d..3b31829b7 100644
--- a/tests/unit/compiler/test_compile_multiple.py
+++ b/tests/unit/compiler/test_compile_multiple.py
@@ -95,7 +95,7 @@ def test_multiple_models_require_directory(
         """Multiple models with a file output_path (has a suffix) is rejected."""
         m1 = tmp_path / "a" / "m.onnx"
         m2 = tmp_path / "b" / "m.onnx"
-        with pytest.raises(AssertionError, match="must be a directory"):
+        with pytest.raises(ValueError, match="must be a directory"):
             compile_multiple_onnx([m1, m2], tmp_path / "out.onnx")
 
     @patch("winml.modelkit.compiler.compiler.Compiler")
@@ -105,7 +105,7 @@ def test_multiple_models_reject_none_output(
         """Multiple models with no output_path is rejected (would break shared context)."""
         m1 = tmp_path / "a" / "m.onnx"
         m2 = tmp_path / "b" / "m.onnx"
-        with pytest.raises(AssertionError, match="must be a directory"):
+        with pytest.raises(ValueError, match="must be a directory"):
             compile_multiple_onnx([m1, m2], None)
 
     @patch("winml.modelkit.compiler.compiler.Compiler")

From a0fdcfaf3aff419fbcf66348a013c0268bbd6def Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Fri, 12 Jun 2026 11:14:19 +0800
Subject: [PATCH 09/10] compile: rename ort_jit -> ort_session; add
 CompilerName type

- Rename the InferenceSession-backend --compiler choice from "ort_jit" to
  "ort_session": "JIT" names a technique, whereas the choice should name the
  object that performs the compile (ort.InferenceSession).
- Define CompilerName = Literal["ort", "ort_session", "qairt"] and COMPILER_NAMES
  in utils/constants.py (mirroring EPName); use the type for EPConfig.compiler, the
  CLI --compiler param, and EP_COMPILER_MAPPING, and derive the CLI choice list from
  COMPILER_NAMES.
- Add ORT_SESSION_COMPILER constant and use it for the backend-branching comparisons
  instead of the magic string.
---
 src/winml/modelkit/commands/compile.py        | 14 +++++++-------
 src/winml/modelkit/compiler/compiler.py       | 14 +++++++-------
 src/winml/modelkit/compiler/configs.py        |  8 ++++----
 src/winml/modelkit/compiler/context.py        |  6 ++++--
 src/winml/modelkit/compiler/stages/compile.py |  8 ++++----
 src/winml/modelkit/utils/constants.py         | 15 +++++++++++++++
 tests/e2e/test_compile_e2e.py                 |  8 ++++----
 tests/unit/compiler/test_compile_command.py   |  8 ++++----
 8 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py
index 018864d76..895d6a4d8 100644
--- a/src/winml/modelkit/commands/compile.py
+++ b/src/winml/modelkit/commands/compile.py
@@ -28,11 +28,11 @@
 from ..onnx import is_compiled_onnx
 from ..sysinfo import resolve_device, resolve_eps
 from ..utils import cli as cli_utils
-from ..utils.constants import normalize_ep_name
+from ..utils.constants import COMPILER_NAMES, ORT_SESSION_COMPILER, normalize_ep_name
 
 
 if TYPE_CHECKING:
-    from ..utils.constants import EPName, EPNameOrAlias
+    from ..utils.constants import CompilerName, EPName, EPNameOrAlias
 from ..utils.logging import configure_logging
 
 
@@ -76,9 +76,9 @@
 )
 @click.option(
     "--compiler",
-    type=click.Choice(["ort", "ort_jit", "qairt"]),
+    type=click.Choice(list(COMPILER_NAMES)),
     default="ort",
-    help="Compiler backend (default: ort). 'ort_jit' compiles via "
+    help="Compiler backend (default: ort). 'ort_session' compiles via "
     "ort.InferenceSession (ep.context_enable) — required for shared-context multi-model.",
 )
 @click.option(
@@ -113,7 +113,7 @@ def compile(
     validate: bool,
     verbose: int,
     quiet: bool,
-    compiler: str,
+    compiler: CompilerName,
     qnn_sdk_root: Path | None,
     embed: bool,
     list_compilers_flag: bool,
@@ -222,7 +222,7 @@ def compile(
     config.verbose = bool(verbose)
 
     # Set compiler options. The compiler choice selects the backend:
-    # "ort_jit" -> ort.InferenceSession, else ort.ModelCompiler / qairt.
+    # "ort_session" -> ort.InferenceSession, else ort.ModelCompiler / qairt.
     config.ep_config.compiler = compiler
     config.ep_config.qnn_sdk_root = qnn_sdk_root
     config.ep_config.embed_context = embed
@@ -250,7 +250,7 @@ def compile(
 
     try:
         console.print("\n[bold]Compiling model(s)...[/bold]")
-        if len(models) == 1 and compiler != "ort_jit":
+        if len(models) == 1 and compiler != ORT_SESSION_COMPILER:
             # Default path: single model via ort.ModelCompiler (staged pipeline).
             results = [compile_onnx(models[0], output_path=resolved_output, config=config)]
         else:
diff --git a/src/winml/modelkit/compiler/compiler.py b/src/winml/modelkit/compiler/compiler.py
index 2fad0261b..97c924328 100644
--- a/src/winml/modelkit/compiler/compiler.py
+++ b/src/winml/modelkit/compiler/compiler.py
@@ -24,15 +24,15 @@
 
     import onnxruntime as ort
 
-    from ..utils.constants import EPName
+    from ..utils.constants import CompilerName, EPName
     from .configs import WinMLCompileConfig
     from .stages.base import BaseStage
 
 
 # EP → available compilers. Keys are canonical EPName (or None for the default).
-EP_COMPILER_MAPPING: dict[EPName | None, list[str]] = {
-    "QNNExecutionProvider": ["ort", "ort_jit", "qairt"],
-    None: ["ort", "ort_jit"],
+EP_COMPILER_MAPPING: dict[EPName | None, list[CompilerName]] = {
+    "QNNExecutionProvider": ["ort", "ort_session", "qairt"],
+    None: ["ort", "ort_session"],
 }
 
 
@@ -70,7 +70,7 @@ def __init__(self, n_total_models: int = 1) -> None:
                 same shared ``SessionOptions`` is reused across every ``compile``.
 
         The compile backend (ort.ModelCompiler vs ort.InferenceSession) is taken from
-        the config's ``compiler`` setting ("ort_jit" selects the
+        the config's ``compiler`` setting ("ort_session" selects the
         InferenceSession backend), surfaced via ``CompileContext.use_inference_session``.
         """
         self.n_total_models = n_total_models
@@ -248,7 +248,7 @@ def compile_multiple_onnx(
     model in sequence, reusing one shared ``SessionOptions`` so the weights are shared
     across the compiled EPContext models. The backend is taken from
     ``config.ep_config.compiler``: ``ort.ModelCompiler`` (default) or
-    ``ort.InferenceSession`` when it is ``"ort_jit"``.
+    ``ort.InferenceSession`` when it is ``"ort_session"``.
 
     Args:
         model_paths: Input ONNX model paths.
@@ -277,7 +277,7 @@ def compile_multiple_onnx(
             f"(shared EP context), got {output_path!r}"
         )
 
-    # Backend is taken from config.ep_config.compiler ("ort_jit" selects
+    # Backend is taken from config.ep_config.compiler ("ort_session" selects
     # the InferenceSession backend), surfaced via CompileContext.use_inference_session.
     compiler = Compiler(n_total_models=len(paths))
     # Compiled in order so the shared context accumulates and the last model flushes it.
diff --git a/src/winml/modelkit/compiler/configs.py b/src/winml/modelkit/compiler/configs.py
index a52e3ee1f..50f2c71a5 100644
--- a/src/winml/modelkit/compiler/configs.py
+++ b/src/winml/modelkit/compiler/configs.py
@@ -17,7 +17,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-from ..utils.constants import EPAlias, EPName
+from ..utils.constants import CompilerName, EPAlias, EPName
 
 
 if TYPE_CHECKING:
@@ -37,8 +37,8 @@ class EPConfig:
         provider_options: EP-specific options as key=value dict
         enable_ep_context: Generate EPContext model with pre-compiled graph
         embed_context: Embed context in ONNX (True) or external .bin file (False)
-        compiler: Compiler backend ("ort", "ort_jit", or "qairt").
-            "ort_jit" selects the ort.InferenceSession backend.
+        compiler: Compiler backend ("ort", "ort_session", or "qairt").
+            "ort_session" selects the ort.InferenceSession backend.
         qnn_sdk_root: Path to QAIRT SDK root (required when compiler is "qairt")
         device: Target device ("npu", "gpu", "cpu", "auto")
     """
@@ -47,7 +47,7 @@ class EPConfig:
     provider_options: dict[str, str] = field(default_factory=dict)
     enable_ep_context: bool = True
     embed_context: bool = False
-    compiler: str = "ort"
+    compiler: CompilerName = "ort"
     qnn_sdk_root: Path | None = None
     device: str = "auto"
 
diff --git a/src/winml/modelkit/compiler/context.py b/src/winml/modelkit/compiler/context.py
index 0017490cb..fda351c50 100644
--- a/src/winml/modelkit/compiler/context.py
+++ b/src/winml/modelkit/compiler/context.py
@@ -14,6 +14,8 @@
 import onnx
 import onnxruntime as ort
 
+from ..utils.constants import ORT_SESSION_COMPILER
+
 
 if TYPE_CHECKING:
     from ..utils.constants import EPAlias
@@ -105,9 +107,9 @@ def execution_provider(self) -> EPAlias:
     def use_inference_session(self) -> bool:
         """Whether to use the ort.InferenceSession backend (vs ort.ModelCompiler).
 
-        True iff the configured compiler is ``"ort_jit"``.
+        True iff the configured compiler is ``"ort_session"``.
         """
-        return self.config.get("compiler") == "ort_jit"
+        return self.config.get("compiler") == ORT_SESSION_COMPILER
 
     @property
     def enable_ep_context(self) -> bool:
diff --git a/src/winml/modelkit/compiler/stages/compile.py b/src/winml/modelkit/compiler/stages/compile.py
index 70d94823f..4bc1c28c4 100644
--- a/src/winml/modelkit/compiler/stages/compile.py
+++ b/src/winml/modelkit/compiler/stages/compile.py
@@ -16,7 +16,7 @@
 
 from ...onnx import load_onnx, save_onnx
 from ...session import WinMLQairtSession, WinMLSession
-from ...utils.constants import normalize_ep_name
+from ...utils.constants import ORT_SESSION_COMPILER, normalize_ep_name
 from ..configs import WinMLCompileConfig
 from .base import BaseStage
 
@@ -78,12 +78,12 @@ def process(self, context: CompileContext) -> CompileContext:
 
     def _compile_single_model_compiler(self, context: CompileContext) -> None:
         """Single-model compile via ``WinMLSession`` (``ort.ModelCompiler``)."""
-        # Resolve session class from compiler config. "ort_jit" must not
+        # Resolve session class from compiler config. "ort_session" must not
         # reach here — it routes to _compile_multiple via context.use_inference_session.
         compiler = context.config.get("compiler", "ort")
-        if compiler == "ort_jit":
+        if compiler == ORT_SESSION_COMPILER:
             raise ValueError(
-                "'ort_jit' is handled by the inference-session path, "
+                f"{ORT_SESSION_COMPILER!r} is handled by the inference-session path, "
                 "not the single-model ModelCompiler path."
             )
         session_cls = COMPILER_SESSION_MAPPING[compiler]
diff --git a/src/winml/modelkit/utils/constants.py b/src/winml/modelkit/utils/constants.py
index b62e9e4c1..7b45d153a 100644
--- a/src/winml/modelkit/utils/constants.py
+++ b/src/winml/modelkit/utils/constants.py
@@ -49,6 +49,21 @@
 EPNameOrAlias: TypeAlias = EPName | EPAlias
 
 
+# Compile backends selectable via ``--compiler`` (see commands/compile.py):
+#   "ort"          -> ort.ModelCompiler (default)
+#   "ort_session"  -> ort.InferenceSession (ep.context_enable)
+#   "qairt"        -> QAIRT SDK compiler
+CompilerName = Literal["ort", "ort_session", "qairt"]
+
+# The ``--compiler`` choice that selects the ort.InferenceSession backend (the others
+# go through ort.ModelCompiler / the QAIRT SDK). Referenced wherever the backend is
+# branched on, so the magic string lives in exactly one place.
+ORT_SESSION_COMPILER: CompilerName = "ort_session"
+
+# Runtime-iterable form of ``CompilerName`` (e.g. for the CLI choice list).
+COMPILER_NAMES: tuple[CompilerName, ...] = get_args(CompilerName)
+
+
 # Supported execution providers — derived from the ``EPName`` Literal above so
 # that ``utils.constants`` stays leaf-level (no import dependency on sysinfo).
 # Membership parity with ``sysinfo.device._EP_DEVICE_MAP`` is enforced by
diff --git a/tests/e2e/test_compile_e2e.py b/tests/e2e/test_compile_e2e.py
index b1111221b..0da1c3864 100644
--- a/tests/e2e/test_compile_e2e.py
+++ b/tests/e2e/test_compile_e2e.py
@@ -979,15 +979,15 @@ def test_multi_model_shared_weights(
 
     cmd = ["-m", str(m_seq4), "-m", str(m_seq1), "--ep", "qnn", "--output-dir", str(out_dir)]
     if use_inference_session:
-        cmd += ["--compiler", "ort_jit"]
+        cmd += ["--compiler", "ort_session"]
     result = _invoke(*cmd)
     assert result.exit_code == 0, result.output
     assert "Success! Model compiled" in result.output, result.output
-    # The InferenceSession backend is selected via --compiler ort_jit.
+    # The InferenceSession backend is selected via --compiler ort_session.
     if use_inference_session:
-        assert "ort_jit" in result.output
+        assert "ort_session" in result.output
     else:
-        assert "ort_jit" not in result.output
+        assert "ort_session" not in result.output
 
     # Both compiled wrappers exist + exactly one shared weights bin (weight sharing).
     ctx4 = out_dir / f"{m_seq4.stem}_ctx.onnx"
diff --git a/tests/unit/compiler/test_compile_command.py b/tests/unit/compiler/test_compile_command.py
index 6e70a388a..910465ddf 100644
--- a/tests/unit/compiler/test_compile_command.py
+++ b/tests/unit/compiler/test_compile_command.py
@@ -424,13 +424,13 @@ def test_multiple_models_with_output_dir_calls_compile_multiple(
         assert call_args.args[2].ep_config.compiler == "ort"
 
     @patch("winml.modelkit.compiler.compile_multiple_onnx")
-    def test_ort_jit_compiler_sets_config(
+    def test_ort_session_compiler_sets_config(
         self,
         mock_compile_multiple: MagicMock,
         runner: CliRunner,
         tmp_path: Path,
     ) -> None:
-        """--compiler ort_jit is carried on the config used for compilation."""
+        """--compiler ort_session is carried on the config used for compilation."""
         m1 = tmp_path / "m1.onnx"
         m2 = tmp_path / "m2.onnx"
         self._create_simple_onnx(m1)
@@ -459,13 +459,13 @@ def test_ort_jit_compiler_sets_config(
                 "--output-dir",
                 str(out_dir),
                 "--compiler",
-                "ort_jit",
+                "ort_session",
             ],
         )
 
         assert result.exit_code == 0, result.output
         # The compiler choice is applied onto the config that drives compilation.
-        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == "ort_jit"
+        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == "ort_session"
 
     def _create_simple_onnx(self, path: Path) -> None:
         """Create a simple ONNX model for testing."""

From e07844d6e10978d7e5f6a2a6cc48f64122fa3ad6 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Fri, 12 Jun 2026 11:22:40 +0800
Subject: [PATCH 10/10] test(compile): use ORT_SESSION_COMPILER constant
 instead of "ort_session" literal

Reference the constants.py constant in the compile CLI tests' --compiler arg and
backend assertions, matching the production comparison sites.
---
 tests/e2e/test_compile_e2e.py               | 8 ++++----
 tests/unit/compiler/test_compile_command.py | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/test_compile_e2e.py b/tests/e2e/test_compile_e2e.py
index 0da1c3864..d26615ff4 100644
--- a/tests/e2e/test_compile_e2e.py
+++ b/tests/e2e/test_compile_e2e.py
@@ -45,7 +45,7 @@
 from winml.modelkit.commands.compile import compile as compile_cmd
 from winml.modelkit.onnx import is_compiled_onnx
 from winml.modelkit.utils import normalize_ep_name
-from winml.modelkit.utils.constants import EP_SUPPORTED_DEVICES
+from winml.modelkit.utils.constants import EP_SUPPORTED_DEVICES, ORT_SESSION_COMPILER
 
 
 if TYPE_CHECKING:
@@ -979,15 +979,15 @@ def test_multi_model_shared_weights(
 
     cmd = ["-m", str(m_seq4), "-m", str(m_seq1), "--ep", "qnn", "--output-dir", str(out_dir)]
     if use_inference_session:
-        cmd += ["--compiler", "ort_session"]
+        cmd += ["--compiler", ORT_SESSION_COMPILER]
     result = _invoke(*cmd)
     assert result.exit_code == 0, result.output
     assert "Success! Model compiled" in result.output, result.output
     # The InferenceSession backend is selected via --compiler ort_session.
     if use_inference_session:
-        assert "ort_session" in result.output
+        assert ORT_SESSION_COMPILER in result.output
     else:
-        assert "ort_session" not in result.output
+        assert ORT_SESSION_COMPILER not in result.output
 
     # Both compiled wrappers exist + exactly one shared weights bin (weight sharing).
     ctx4 = out_dir / f"{m_seq4.stem}_ctx.onnx"
diff --git a/tests/unit/compiler/test_compile_command.py b/tests/unit/compiler/test_compile_command.py
index 910465ddf..9aa1547d5 100644
--- a/tests/unit/compiler/test_compile_command.py
+++ b/tests/unit/compiler/test_compile_command.py
@@ -20,6 +20,7 @@
 from click.testing import CliRunner
 
 from winml.modelkit.cli import main
+from winml.modelkit.utils.constants import ORT_SESSION_COMPILER
 
 
 @pytest.fixture
@@ -459,13 +460,13 @@ def test_ort_session_compiler_sets_config(
                 "--output-dir",
                 str(out_dir),
                 "--compiler",
-                "ort_session",
+                ORT_SESSION_COMPILER,
             ],
         )
 
         assert result.exit_code == 0, result.output
         # The compiler choice is applied onto the config that drives compilation.
-        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == "ort_session"
+        assert mock_compile_multiple.call_args.args[2].ep_config.compiler == ORT_SESSION_COMPILER
 
     def _create_simple_onnx(self, path: Path) -> None:
         """Create a simple ONNX model for testing."""