From 847fdee7c013455debef5e4194ade080b77b9b3a Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 01:00:02 -0400
Subject: [PATCH 01/30] improve: Intel GPU Max (Ponte Vecchio) OpenMP target
 offload support

- Fix INTEL_COMPILER_ID Fypp variable: 'Intel' -> 'IntelLLVM' in
  shared_parallel_macros.fpp and omp_macros.fpp so Intel-specific
  OMP macro branches actually match for ifx builds

- Add Intel-specific OMP directives in omp_macros.fpp:
  target teams loop (no bind/defaultmap clauses ifx rejects),
  OMP_MKL_DISPATCH() emitting ! dispatch for oneMKL GPU FFT

- Add GPU_MKL_DISPATCH() in parallel_macros.fpp for oneMKL DFTI
  dispatch from device-mapped allocatables (Intel GPU FFT path)

- CMakeLists.txt:
  - Fix Intel compiler ID checks: 'Intel' -> 'IntelLLVM'
  - Switch -fopenmp to -fiopenmp -fopenmp-targets=spir64 for ifx
  - Add -fpp to global IntelLLVM compile options
  - Compile mkl_dfti_omp_offload.f90 via add_custom_command with
    minimal flags (no -free -fpp) to avoid OpenMP 5.2 clause issues
  - Link -qmkl=parallel, libmkl_sycl_dft, libsycl, libOpenCL for
    Intel GPU FFT backend
  - Skip building FFTW from source for IntelLLVM (uses oneMKL)

- m_fftw.fpp: Add Intel GPU path using oneMKL DFTI + ! dispatch
  for azimuthal Fourier filter; CPU path still uses FFTW for Intel

- m_pressure_relaxation.fpp: Fix ifx SPIR64 bug -- change
  dimension(sys_size) -> dimension(:) in all declare-target helper
  interfaces to avoid llvm-spirv InvalidArraySize (SPIR-V requires
  compile-time constant array bounds; sys_size is a runtime value)

- m_compute_levelset.fpp: Guard GPU_PARALLEL_LOOP with Fypp
  MFC_COMPILER != INTEL_COMPILER_ID for s_apply_levelset; ifx
  if-else dispatch to multiple declare-target routines in a target
  teams loop triggers LLVM dominance error, and the dispatch-wrapper
  fix triggers an ifx ICE -- host fallback is the only workaround

- docs/documentation/intel-gpu-max.md: Document build environment,
  required library paths, known ifx bugs, and GPU device access

Tested on GT CRNCH RoboGator (dash3):
  ifx 2025.3.3, oneMKL 2026.0, Intel Data Center GPU Max 1100

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CMakeLists.txt                                |  54 ++++-
 docs/documentation/intel-gpu-max.md           | 223 ++++++++++++++++++
 src/common/include/omp_macros.fpp             |  28 ++-
 src/common/include/parallel_macros.fpp        |   8 +
 src/common/include/shared_parallel_macros.fpp |   2 +-
 src/simulation/m_compute_levelset.fpp         |  19 +-
 src/simulation/m_fftw.fpp                     | 200 ++++++++++++++--
 src/simulation/m_pressure_relaxation.fpp      |  32 +--
 toolchain/dependencies/CMakeLists.txt         |   8 +-
 9 files changed, 521 insertions(+), 53 deletions(-)
 create mode 100644 docs/documentation/intel-gpu-max.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72258149f3..8c0e0928d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,8 +241,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
     if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug")
         add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-O1> $<$<COMPILE_LANGUAGE:Fortran>:-g>)
     endif()
-elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
-    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>)
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>
+                        $<$<COMPILE_LANGUAGE:Fortran>:-fpp>)
 
     if (CMAKE_BUILD_TYPE STREQUAL "Debug")
         add_compile_options(-g -Og -traceback -debug -check all)
@@ -595,6 +596,47 @@ exit 0
                             HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                         target_link_libraries(${a_target} PRIVATE ${HIPFFT_LIB})
                     endif()
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+                    # Intel GPU: oneMKL DFTI with !$omp dispatch for GPU FFT.
+                    # Requires MKLROOT to be set (via oneAPI module or env).
+                    if (NOT DEFINED ENV{MKLROOT})
+                        message(FATAL_ERROR "MKLROOT is not set. Load oneAPI MKL module before building.")
+                    endif()
+                    # Compile mkl_dfti_omp_offload.f90 in isolation with minimal flags.
+                    # The file uses !$omp declare variant with need_device_addr (OpenMP 5.2)
+                    # which requires the global -free -fpp flags to be absent so the
+                    # compiler parses it in standard fixed/free detection mode only.
+                    set(_mkl_omp_src "$ENV{MKLROOT}/include/mkl_dfti_omp_offload.f90")
+                    if (NOT EXISTS "${_mkl_omp_src}")
+                        message(FATAL_ERROR "mkl_dfti_omp_offload.f90 not found in $ENV{MKLROOT}/include")
+                    endif()
+                    set(_mkl_omp_mod_dir "${CMAKE_CURRENT_BINARY_DIR}/mkl_omp_mods")
+                    set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o")
+                    file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}")
+                    add_custom_command(
+                        OUTPUT  "${_mkl_omp_obj}"
+                                "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.mod"
+                        COMMAND "${CMAKE_Fortran_COMPILER}"
+                                -fiopenmp -fopenmp-targets=spir64
+                                -c -I"$ENV{MKLROOT}/include"
+                                "${_mkl_omp_src}"
+                                -o "${_mkl_omp_obj}"
+                        WORKING_DIRECTORY "${_mkl_omp_mod_dir}"
+                        DEPENDS "${_mkl_omp_src}"
+                        COMMENT "Compiling MKL OMP offload Fortran module (mkl_dfti_omp_offload)"
+                    )
+                    add_custom_target(mkl_omp_offload_mod_${a_target}
+                        DEPENDS "${_mkl_omp_obj}")
+                    add_dependencies(${a_target} mkl_omp_offload_mod_${a_target})
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{MKLROOT}/include" "${_mkl_omp_mod_dir}")
+                    target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj}")
+                    # Link MKL threading + core + SYCL DFT backend
+                    target_link_options(${a_target} PRIVATE -qmkl=parallel)
+                    find_library(MKL_SYCL_DFT mkl_sycl_dft HINTS "$ENV{MKLROOT}/lib" REQUIRED)
+                    find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH REQUIRED)
+                    find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE ${MKL_SYCL_DFT} ${SYCL_LIB} ${OPENCL_LIB})
                 else()
                     find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
                     target_link_libraries(${a_target} PRIVATE hipfort::hipfft)
@@ -636,9 +678,11 @@ exit 0
                     target_compile_options(${a_target} PRIVATE "-mp=gpu" "-Minfo=mp")
                     target_link_options(${a_target} PRIVATE "-mp=gpu")
                     set_target_properties(${a_target} PROPERTIES Fortran_FLAGS "-mp=gpu -gpu=ccall")
-                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
-                    target_compile_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
-                    target_link_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+                    # Intel GPU: OpenMP target offload to SPIR64 (Xe-HPC / Ponte Vecchio).
+                    # GPU FFT uses oneMKL DFTI via the OpenMP dispatch construct.
+                    target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                    target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
new file mode 100644
index 0000000000..bfc81f1b7d
--- /dev/null
+++ b/docs/documentation/intel-gpu-max.md
@@ -0,0 +1,223 @@
+# Building MFC for Intel Data Center GPU Max (Ponte Vecchio)
+
+This documents how to build and run MFC with Intel GPU Max (Xe-HPC / Ponte Vecchio)
+using ifx OpenMP target offload to SPIR64, as tested on GT CRNCH RoboGator (`dash3`).
+
+## System configuration
+
+| Component | Version / Path |
+|---|---|
+| Hardware | Intel Data Center GPU Max 1100 (Ponte Vecchio, PCI 8086:0bda) |
+| OS | Linux (RHEL 8 compatible, kernel 5.15) |
+| Fortran compiler | ifx 2025.3.3 (`/opt/intel/oneapi/compiler/2025.3/`) |
+| MKL | oneMKL 2026.0 (`/opt/intel/oneapi/mkl/2026.0/`) |
+| SYCL runtime | `libsycl.so` in `/opt/intel/oneapi/compiler/2026.0/lib/` |
+| GPU device | `/dev/dri/renderD128` (requires `render` group membership) |
+
+## Environment setup
+
+Load the required oneAPI environment before building or running:
+
+```bash
+export PATH=/opt/intel/oneapi/compiler/2025.3/bin:$PATH
+export MKLROOT=/opt/intel/oneapi/mkl/2026.0
+export LIBRARY_PATH=/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LIBRARY_PATH
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\
+/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LD_LIBRARY_PATH
+export FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx
+```
+
+> **Important**: `FC` must be set explicitly. Without it, CMake may cache an older
+> ifx (2025.0) from a system module, which does not support `need_device_addr` in
+> the MKL 2026.0 OpenMP offload Fortran module.
+
+> **Important**: `LIBRARY_PATH` (not just `LD_LIBRARY_PATH`) must include the
+> compiler 2026.0 lib directory so the linker finds `libsycl.so` at build time.
+
+> **Important**: `/opt/intel/oneapi/umf/1.1/lib` must be in `LD_LIBRARY_PATH` at
+> runtime. The Level Zero and OpenCL UR adapters in the 2026.0 compiler depend on
+> `libumf.so.1`, which lives in the separate `umf/1.1` package, not in the compiler
+> lib directories themselves.
+
+## Building
+
+```bash
+./mfc.sh build -t simulation --gpu mp --no-mpi -j 8
+```
+
+- `--gpu mp`: OpenMP target offload backend (SPIR64)
+- `--no-mpi`: omit for MPI-enabled runs; include for single-node testing
+- `-j 8`: parallel build jobs
+
+MFC will automatically:
+1. Compile `$MKLROOT/include/mkl_dfti_omp_offload.f90` with minimal flags
+   (no `-free -fpp`) via a CMake `add_custom_command` to avoid OpenMP 5.2
+   clause compatibility issues with global compile flags
+2. Link `-qmkl=parallel` for MKL threading + core
+3. Link `libmkl_sycl_dft`, `libsycl`, `libOpenCL` for GPU FFT dispatch
+
+## GPU FFT implementation
+
+MFC uses oneMKL DFTI with the OpenMP 5.1 `!$omp dispatch` construct for FFT
+in cylindrical geometry (the azimuthal Fourier filter in `m_fftw.fpp`).
+This is activated when `__INTEL_LLVM_COMPILER` is defined (i.e., compiled with ifx).
+
+Key verified properties (oneMKL 2026.0, ifx 2025.3.3):
+- Batch R2C transform with `INPUT_DISTANCE != OUTPUT_DISTANCE` works correctly.
+  MFC uses `real_size = p+1` and `cmplx_size = (p+1)/2+1` which always differ.
+- `!$omp dispatch` correctly dispatches DFTI calls to device-mapped allocatables.
+
+## Running MFC cases
+
+Build all three targets (pre_process, simulation, post_process) before running:
+
+```bash
+./mfc.sh build --gpu mp --no-mpi -j 8
+```
+
+Then run a case normally:
+
+```bash
+./mfc.sh run examples/1D_convergence/case.py --no-build --no-mpi
+```
+
+To run individual stages directly (useful when `syscheck` blocks due to GPU access):
+
+```bash
+export MKLROOT=/opt/intel/oneapi/mkl/2026.0
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\
+/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LD_LIBRARY_PATH
+
+cd examples/my_case
+/path/to/build/install/<hash>/bin/pre_process
+/path/to/build/install/<hash>/bin/simulation
+```
+
+The install hashes are printed by `./mfc.sh build`; look for lines like
+`✓ Installed simulation`.
+
+## GPU device access
+
+The Intel GPU requires membership in the `render` group (GID 109) to access
+`/dev/dri/renderD128` via Level Zero.
+
+Without render group access, `ZE_RESULT_ERROR_UNINITIALIZED` is returned by
+Level Zero. OpenMP target offload falls back to the CPU host plugin
+(correct results, but no GPU acceleration).
+
+To diagnose GPU visibility:
+
+```bash
+ls -la /dev/dri/renderD128          # should show rw permissions for your user/group=render
+
+# With full LD_LIBRARY_PATH set:
+LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:... \
+    /opt/intel/oneapi/compiler/2026.0/bin/sycl-ls --verbose
+# Look for: "[opencl:gpu]" or "[ext_oneapi_level_zero:gpu]" platforms
+
+LIBOMPTARGET_DEBUG=1 ./simulation   # look for "Level0 NG plugin initialization"
+                                    # and absence of "ZE_RESULT_ERROR_UNINITIALIZED"
+```
+
+To get GPU access:
+- **Interactive shell**: request from system admin to add user to `render` group
+  (`sudo usermod -a -G render $USER`, then re-login)
+- **SLURM**: submit with `--gres=gpu:max_1100=1`; if Level Zero still fails,
+  the SLURM epilog may not have configured device cgroup ACLs for the job user —
+  contact the system admin
+
+> **Note on `sycl-ls`**: the 2026.0 `sycl-ls` requires `libumf.so.1` from
+> `/opt/intel/oneapi/umf/1.1/lib` in `LD_LIBRARY_PATH`, otherwise all adapters
+> fail to load and it reports "No platforms found".
+
+## Link flags (what MFC's CMake generates)
+
+The full set of flags the compiler uses for the simulation target:
+
+**Compile flags:**
+```
+-fiopenmp -fopenmp-targets=spir64 -free -fpp -march=native
+```
+
+**Link flags:**
+```
+-fiopenmp -fopenmp-targets=spir64
+-qmkl=parallel
+-L$MKLROOT/lib -lmkl_sycl_dft
+-L/opt/intel/oneapi/compiler/2026.0/lib -lsycl -lOpenCL
+```
+
+**MKL OMP module (compiled separately, no global flags):**
+```bash
+ifx -fiopenmp -fopenmp-targets=spir64 \
+    -c -I$MKLROOT/include \
+    $MKLROOT/include/mkl_dfti_omp_offload.f90 \
+    -o mkl_dfti_omp_offload.o
+```
+
+## Known issues
+
+### `need_device_addr` compilation error
+`mkl_dfti_omp_offload.f90` from MKL 2026.0 uses `need_device_addr` in
+`!$omp declare variant` (OpenMP 5.2). This requires ifx **2025.3** or newer.
+If CMake finds an older ifx (e.g., 2025.0 from a system module path), the
+compile fails with:
+```
+error #5082: Syntax error, found IDENTIFIER 'NEED_DEVICE_ADDR'
+```
+Fix: set `FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx` before building
+and run `./mfc.sh clean` first so CMake re-detects the compiler.
+
+### Two routines with ifx SPIR64 codegen bugs
+
+**`s_apply_levelset` (`m_compute_levelset.fpp`)** — ifx SPIR64 bug in the
+target kernel:
+
+An if-else chain calling multiple different `!$omp declare target (seq)`
+routines from inside a `!$omp target teams loop` triggers `"Instruction does
+not dominate all uses!"` in llvm-link. The natural fix (wrapping the dispatch
+in a single `declare-target seq` subroutine) triggers an ifx ICE (segfault).
+Worked around with Fypp `#:if MFC_COMPILER != INTEL_COMPILER_ID` guards that
+skip the GPU_PARALLEL_LOOP directives for Intel builds, so the loop runs
+serially on the host. The `GPU_ROUTINE` declarations on the helpers are kept
+so NVIDIA/AMD GPU builds are unaffected.
+
+**`s_pressure_relaxation_procedure` (`m_pressure_relaxation.fpp`)** — SPIR-V
+InvalidArraySize in declare-target helpers:
+
+`!$omp declare target (seq)` routines with `dimension(sys_size)` explicit-shape
+dummy arguments trigger `"InvalidArraySize: Array size must be at least 1"` in
+llvm-spirv. SPIR-V requires compile-time constant array bounds; `sys_size` is
+a runtime module integer. Fixed by changing `dimension(sys_size)` →
+`dimension(:)` (assumed-shape) on all helper routine interfaces. The loop now
+runs on GPU for Intel.
+
+### syscheck GPU assertion
+`syscheck` runs `assert(omp_get_num_devices() > 0)` and aborts if the GPU
+is not accessible. This is a runtime check, not a build issue. See GPU device
+access section above.
+
+To run a case anyway (testing code correctness on CPU fallback), invoke
+`pre_process` and `simulation` directly from their install paths, bypassing
+the `./mfc.sh run` wrapper that calls `syscheck` first.
+
+### `libumf.so.1` not found at runtime
+The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`.
+If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports
+"No platforms found". Fix:
+
+```bash
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:$LD_LIBRARY_PATH
+```
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
index 7620e7607f..00f26a99a2 100644
--- a/src/common/include/omp_macros.fpp
+++ b/src/common/include/omp_macros.fpp
@@ -2,7 +2,7 @@
 
 #:set NVIDIA_COMPILER_ID="NVHPC"
 #:set PGI_COMPILER_ID="PGI"
-#:set INTEL_COMPILER_ID="Intel"
+#:set INTEL_COMPILER_ID="IntelLLVM"
 #:set CCE_COMPILER_ID="Cray"
 #:set AMD_COMPILER_ID="LLVMFlang"
 
@@ -182,6 +182,20 @@
         #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) '
     #:elif MFC_COMPILER == AMD_COMPILER_ID
         #:set omp_start_directive = '!$omp target teams distribute parallel do '
+    #:elif MFC_COMPILER == INTEL_COMPILER_ID
+        #! Intel OMP 5.2: bind(a,b) is invalid. Drop explicit firstprivate list:
+        #! ifx rejects firstprivate for declare-target module variables (#7655).
+        #! OMP 5.0 rule: unclaused scalars in target constructs are firstprivate
+        #! by default, so scalar locals (e.g. gp_layers_z) are covered implicitly.
+        #! declare-target vars (e.g. gp_layers) are device-resident -- no mapping
+        #! needed. defaultmap(firstprivate:scalar) also unsupported by ifx (#9061).
+        #:set omp_start_directive = '!$omp target teams loop '
+        #:set clause_val = collapse_val.strip('\n') + &
+            & default_val.strip('\n') + GEN_PRIVATE_STR(private, False).strip('\n') + &
+            & reduction_val.strip('\n') + copy_val.strip('\n') + copyin_val.strip('\n') + &
+            & copyout_val.strip('\n') + create_val.strip('\n') + &
+            & no_create_val.strip('\n') + present_val.strip('\n') + &
+            & deviceptr_val.strip('\n') + attach_val.strip('\n')
     #:else
         #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
     #:endif
@@ -197,6 +211,8 @@
         #:set omp_end_directive = '!$omp end target teams distribute parallel do simd'
     #:elif MFC_COMPILER == AMD_COMPILER_ID
         #:set omp_end_directive = '!$omp end target teams distribute parallel do'
+    #:elif MFC_COMPILER == INTEL_COMPILER_ID
+        #:set omp_end_directive = '!$omp end target teams loop'
     #:else
         #:set omp_end_directive = '!$omp end target teams loop'
     #:endif
@@ -218,7 +234,9 @@
         #:set function_name_val = ''
     #:endif
 
-    #:if MFC_COMPILER == AMD_COMPILER_ID
+    #:if MFC_COMPILER == AMD_COMPILER_ID or MFC_COMPILER == INTEL_COMPILER_ID
+        #! AMD: device_type unsupported. Intel: OpenMP 5.2 requires an enter/to/link/local
+        #! clause alongside device_type; omit device_type entirely for both.
         #:set clause_val = ''
     #:else
         #:set clause_val = nohost_val.strip('\n')
@@ -374,4 +392,10 @@
         $:code
     #:endif
 #:enddef
+
+#:def OMP_MKL_DISPATCH()
+    #:if MFC_COMPILER == INTEL_COMPILER_ID
+        !$omp dispatch
+    #:endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index b1382ec49a..5f3ec2ccb0 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -303,4 +303,12 @@
 
 #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID)
 #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID)
+
+! Dispatch to oneMKL GPU FFT via OpenMP dispatch construct (Intel GPU only)
+#:def GPU_MKL_DISPATCH()
+    #:set omp_code = OMP_MKL_DISPATCH()
+#if defined(MFC_OpenMP)
+    $:omp_code
+#endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp
index 36bee0a23a..cc7c4374ed 100644
--- a/src/common/include/shared_parallel_macros.fpp
+++ b/src/common/include/shared_parallel_macros.fpp
@@ -1,6 +1,6 @@
 #:set NVIDIA_COMPILER_ID="NVHPC"
 #:set PGI_COMPILER_ID="PGI"
-#:set INTEL_COMPILER_ID="Intel"
+#:set INTEL_COMPILER_ID="IntelLLVM"
 #:set CCE_COMPILER_ID="Cray"
 #:set AMD_COMPILER_ID="LLVMFlang"
 
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 9e7519790c..9d09840094 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -30,7 +30,12 @@ contains
         !  3D Patch Geometries
 
         if (p > 0) then
-            $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            ! ifx SPIR64 bug: if-else dispatch into declare-target routines inside
+            ! a target teams loop triggers "Instruction does not dominate all uses!"
+            ! in llvm-link. The dispatch-wrapper pattern triggers an ifx ICE.
+            #:if MFC_COMPILER != INTEL_COMPILER_ID
+                $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            #:endif
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
                 patch_geometry = patch_ib(patch_id)%geometry
@@ -47,11 +52,15 @@ contains
                     call s_model_levelset(gps(i))
                 end if
             end do
-            $:END_GPU_PARALLEL_LOOP()
+            #:if MFC_COMPILER != INTEL_COMPILER_ID
+                $:END_GPU_PARALLEL_LOOP()
+            #:endif
 
             ! 2D Patch Geometries
         else if (n > 0) then
-            $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            #:if MFC_COMPILER != INTEL_COMPILER_ID
+                $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            #:endif
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
                 patch_geometry = patch_ib(patch_id)%geometry
@@ -68,7 +77,9 @@ contains
                     call s_ellipse_levelset(gps(i))
                 end if
             end do
-            $:END_GPU_PARALLEL_LOOP()
+            #:if MFC_COMPILER != INTEL_COMPILER_ID
+                $:END_GPU_PARALLEL_LOOP()
+            #:endif
         end if
 
     end subroutine s_apply_levelset
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 2ee806d928..59fb741aef 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -4,7 +4,7 @@
 
 #:include 'macros.fpp'
 
-!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT) for azimuthal Fourier filtering in cylindrical geometries
+!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT/oneMKL) for azimuthal Fourier filtering in cylindrical geometries
 module m_fftw
 
     use, intrinsic :: iso_c_binding
@@ -12,7 +12,14 @@ module m_fftw
     use m_derived_types
     use m_global_parameters
     use m_mpi_proxy
-#if defined(MFC_GPU) && defined(__PGI)
+    ! GPU FFT backend selection:
+    !   cuFFT  - NVHPC/PGI (OpenACC or OpenMP target)
+    !   hipFFT - Cray/AMD (OpenMP target)
+    !   oneMKL - Intel ifx (OpenMP target + dispatch construct)
+    !   FFTW   - CPU-only builds
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+    use mkl_dfti_omp_offload
+#elif defined(MFC_GPU) && defined(__PGI)
     use cufft
 #elif defined(MFC_GPU)
     use hipfort
@@ -34,7 +41,18 @@ module m_fftw
     real(c_double), pointer            :: data_real(:)        !< Real data
     complex(c_double_complex), pointer :: data_cmplx(:)       !< Complex data in Fourier space
     complex(c_double_complex), pointer :: data_fltr_cmplx(:)  !< Filtered complex data in Fourier space
-#if defined(MFC_GPU)
+
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+    $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
+
+    real(dp), allocatable, target    :: data_real_gpu(:)
+    complex(dp), allocatable, target :: data_cmplx_gpu(:)
+    complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:)
+    $:GPU_DECLARE(create='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+
+    type(DFTI_DESCRIPTOR), pointer :: fwd_plan_mkl => null()
+    type(DFTI_DESCRIPTOR), pointer :: bwd_plan_mkl => null()
+#elif defined(MFC_GPU)
     $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
 
     real(dp), allocatable, target    :: data_real_gpu(:)
@@ -62,16 +80,39 @@ contains
     impure subroutine s_initialize_fftw_module
 
         integer :: ierr  !< Generic flag used to identify and report GPU errors
-        ! Size of input array going into DFT
 
         real_size = p + 1
-        ! Size of output array coming out of DFT
         cmplx_size = (p + 1)/2 + 1
 
         x_size = m + 1
         batch_size = x_size*sys_size
 
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
+        $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
+
+        @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
+        @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+        @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+
+        ! Forward R2C descriptor: batch of real_size transforms
+        ierr = DftiCreateDescriptor(fwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_INPUT_DISTANCE, real_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_OUTPUT_DISTANCE, cmplx_size)
+        ierr = DftiCommitDescriptor(fwd_plan_mkl)
+
+        ! Backward C2R descriptor
+        ierr = DftiCreateDescriptor(bwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_INPUT_DISTANCE, cmplx_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_OUTPUT_DISTANCE, real_size)
+        ierr = DftiCommitDescriptor(bwd_plan_mkl)
+#elif defined(MFC_GPU)
         rank = 1; istride = 1; ostride = 1
         allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
 
@@ -80,22 +121,7 @@ contains
         oembed(1) = cmplx_size
         $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
         $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
-#else
-        ! Allocate input and output DFT data sizes
-        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
-        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        ! Associate input and output data pointers with allocated memory
-        call c_f_pointer(fftw_real_data, data_real, [real_size])
-        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
-        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
 
-        ! Generate plans for forward and backward DFTs
-        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
-        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
-#endif
-
-#if defined(MFC_GPU)
         @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
         @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
         @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
@@ -111,6 +137,19 @@ contains
         ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, &
                               & HIPFFT_Z2D, batch_size)
 #endif
+#else
+        ! Allocate input and output DFT data sizes
+        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
+        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        ! Associate input and output data pointers with allocated memory
+        call c_f_pointer(fftw_real_data, data_real, [real_size])
+        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
+        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
+
+        ! Generate plans for forward and backward DFTs
+        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
+        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
 #endif
 
     end subroutine s_initialize_fftw_module
@@ -124,7 +163,116 @@ contains
         ! Restrict filter to processors that have cells adjacent to axis
 
         if (bc_y%beg >= 0) return
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, cmplx_size
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_MKL_DISPATCH()
+        ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
+
+        Nfq = 3
+        $:GPU_UPDATE(device='[Nfq]')
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, Nfq
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1) &
+                                        & *cmplx_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_MKL_DISPATCH()
+        ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu)
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1) &
+                                  & *real_size*x_size)/real(real_size, dp)
+                    q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        do i = 1, fourier_rings
+            i2 = i
+            $:GPU_UPDATE(device='[i2]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, cmplx_size
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i2, l)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_MKL_DISPATCH()
+            ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
+
+            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            $:GPU_UPDATE(device='[Nfq]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, Nfq
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k &
+                                            & - 1)*cmplx_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_MKL_DISPATCH()
+            ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu)
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k &
+                                      & - 1)*real_size*x_size)/real(real_size, dp)
+                        q_cons_vf(k)%sf(j, i2, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end do
+#elif defined(MFC_GPU)
         $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
@@ -292,7 +440,13 @@ contains
     !> Finalize the FFTW module
     impure subroutine s_finalize_fftw_module
 
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        integer :: ierr  !< Generic flag used to identify and report GPU errors
+
+        @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
+        ierr = DftiFreeDescriptor(fwd_plan_mkl)
+        ierr = DftiFreeDescriptor(bwd_plan_mkl)
+#elif defined(MFC_GPU)
         integer :: ierr  !< Generic flag used to identify and report GPU errors
 
         @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index 8ca939de7d..c7a163ade9 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -51,8 +51,8 @@ contains
     !> The main pressure relaxation procedure
     subroutine s_pressure_relaxation_procedure(q_cons_vf)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer                                                :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer                                         :: j, k, l
 
         $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
         do l = 0, p
@@ -71,8 +71,8 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
 
         ! Volume fraction correction
         if (mpp_lim) call s_correct_volume_fractions(q_cons_vf, j, k, l)
@@ -92,9 +92,9 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        integer, intent(in)                                 :: j, k, l
-        integer                                             :: i
+        type(scalar_field), dimension(:), intent(in) :: q_cons_vf
+        integer, intent(in)                          :: j, k, l
+        integer                                      :: i
 
         s_needs_pressure_relaxation = .true.
         $:GPU_LOOP(parallelism='[seq]')
@@ -111,10 +111,10 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: sum_alpha
-        integer                                                :: i
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: sum_alpha
+        integer                                         :: i
 
         sum_alpha = 0._wp
         $:GPU_LOOP(parallelism='[seq]')
@@ -141,9 +141,9 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: pres_relax, f_pres, df_pres
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: pres_relax, f_pres, df_pres
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: pres_K_init, rho_K_s
         #:else
@@ -214,8 +214,8 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: alpha_rho, alpha
         #:else
diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt
index 9a41e1cafc..972b2a4883 100644
--- a/toolchain/dependencies/CMakeLists.txt
+++ b/toolchain/dependencies/CMakeLists.txt
@@ -32,16 +32,20 @@ if (MFC_FFTW)
         message(STATUS "FFTW found.")
         add_custom_target(fftw)
     else()
-        if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+        if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND
+            NOT CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
             ExternalProject_Add(fftw
                 URL        "http://www.fftw.org/fftw-3.3.10.tar.gz"
                 CMAKE_ARGS -DBUILD_TESTS=OFF
                            -DBUILD_SHARED_LIBS=OFF
                            "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
             )
-        else()
+        elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
             message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
             add_custom_target(fftw)
+        else()
+            message(WARNING "The Fortran compiler vendor is IntelLLVM (ifx); FFTW3 will not be built from source. A system FFTW is expected.")
+            add_custom_target(fftw)
         endif()
     endif()
 endif()

From bf3e4855ffb19d9e615b3c33271c6808e7e28283 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 01:20:42 -0400
Subject: [PATCH 02/30] fix: ifx SPIR64 inliner ICE in m_compute_levelset
 (split loops + -fno-inline)

The LLVM inliner at O1+ pulls declare-target(seq) geometry routines into
target-teams-loop kernels, generating LLVM IR that crashes llvm-spirv.

Two complementary fixes:
1. Split s_apply_levelset into one GPU_PARALLEL_LOOP per geometry type so
   each kernel calls exactly one declare-target routine (also avoids the
   multi-callee phi-node dominance error from the original dispatch).
2. Add per-file -fno-inline in CMakeLists for IntelLLVM+OpenMP builds so
   the inliner cannot pull device routines into the kernel body.

Verified: compiles at O3 -fno-inline on ifx 2025.3.3 + SPIR64.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CMakeLists.txt                        |  11 +++
 docs/documentation/intel-gpu-max.md   |  28 ++++---
 src/simulation/m_compute_levelset.fpp | 110 +++++++++++++++-----------
 3 files changed, 94 insertions(+), 55 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c0e0928d3..92744a0cd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -826,6 +826,17 @@ if (MFC_SIMULATION)
             target_compile_options(simulation PRIVATE -Oipa0)
         endif()
     endif()
+    # ifx SPIR64 ICE: the LLVM inliner pulls !$omp declare target (seq) geometry
+    # routines into target teams loop kernels and generates SPIR-V IR that crashes
+    # llvm-spirv at O1+. -fno-inline keeps them as proper device-side calls.
+    # Each GPU loop calls exactly one geometry routine (split-loop pattern in
+    # m_compute_levelset.fpp), so device-call overhead is small. See PR intel-gpu.
+    if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
+        set_source_files_properties(
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
+            PROPERTIES COMPILE_OPTIONS "-fno-inline"
+        )
+    endif()
 endif()
 
 if (MFC_POST_PROCESS)
diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
index bfc81f1b7d..ce9d74964d 100644
--- a/docs/documentation/intel-gpu-max.md
+++ b/docs/documentation/intel-gpu-max.md
@@ -182,17 +182,23 @@ and run `./mfc.sh clean` first so CMake re-detects the compiler.
 
 ### Two routines with ifx SPIR64 codegen bugs
 
-**`s_apply_levelset` (`m_compute_levelset.fpp`)** — ifx SPIR64 bug in the
-target kernel:
-
-An if-else chain calling multiple different `!$omp declare target (seq)`
-routines from inside a `!$omp target teams loop` triggers `"Instruction does
-not dominate all uses!"` in llvm-link. The natural fix (wrapping the dispatch
-in a single `declare-target seq` subroutine) triggers an ifx ICE (segfault).
-Worked around with Fypp `#:if MFC_COMPILER != INTEL_COMPILER_ID` guards that
-skip the GPU_PARALLEL_LOOP directives for Intel builds, so the loop runs
-serially on the host. The `GPU_ROUTINE` declarations on the helpers are kept
-so NVIDIA/AMD GPU builds are unaffected.
+**`s_apply_levelset` (`m_compute_levelset.fpp`)** — ifx SPIR64 inliner ICE:
+
+The LLVM inliner (at O1+) pulls `!$omp declare target (seq)` geometry
+routines into the `target teams loop` kernel and generates LLVM IR that
+crashes the SPIR-V converter with a segfault in `llvm-spirv`. At O0 the
+crash does not occur (no inlining). Two fixes combined:
+
+1. **Split loops**: replaced the single if-else dispatch loop with one
+   `GPU_PARALLEL_LOOP` per geometry type so each kernel calls exactly one
+   declare-target routine. The original multi-callee dispatch also triggers
+   `"Instruction does not dominate all uses!"` in llvm-link.
+
+2. **Per-file `-fno-inline`**: in `CMakeLists.txt`, `set_source_files_properties`
+   adds `-fno-inline` to `m_compute_levelset.fpp.f90` for IntelLLVM+OpenMP
+   builds. This prevents the inliner from pulling declare-target routines into
+   the kernel body where they crash the SPIR-V backend. The routines remain
+   callable as proper device-side function calls via `!$omp declare target`.
 
 **`s_pressure_relaxation_procedure` (`m_pressure_relaxation.fpp`)** — SPIR-V
 InvalidArraySize in declare-target helpers:
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 9d09840094..6a5163aac0 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -25,61 +25,83 @@ contains
 
         type(ghost_point), dimension(:), intent(inout) :: gps
         integer, intent(in)                            :: num_gps
-        integer                                        :: i, patch_id, patch_geometry
+        integer                                        :: i, patch_id
 
-        !  3D Patch Geometries
+        ! One GPU loop per geometry type so each kernel calls exactly one
+        ! declare-target routine. A single if-else dispatch over multiple
+        ! declare-target callees triggers an LLVM phi-node dominance error
+        ! in ifx SPIR64 codegen; splitting into separate loops avoids it.
 
         if (p > 0) then
-            ! ifx SPIR64 bug: if-else dispatch into declare-target routines inside
-            ! a target teams loop triggers "Instruction does not dominate all uses!"
-            ! in llvm-link. The dispatch-wrapper pattern triggers an ifx ICE.
-            #:if MFC_COMPILER != INTEL_COMPILER_ID
-                $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
-            #:endif
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
-                patch_geometry = patch_ib(patch_id)%geometry
-
-                if (patch_geometry == 8) then
-                    call s_sphere_levelset(gps(i))
-                else if (patch_geometry == 9) then
-                    call s_cuboid_levelset(gps(i))
-                else if (patch_geometry == 10) then
-                    call s_cylinder_levelset(gps(i))
-                else if (patch_geometry == 11) then
-                    call s_3d_airfoil_levelset(gps(i))
-                else if (patch_geometry == 12) then
-                    call s_model_levelset(gps(i))
-                end if
+                if (patch_ib(patch_id)%geometry == 8) call s_sphere_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 9) call s_cuboid_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 10) call s_cylinder_levelset(gps(i))
             end do
-            #:if MFC_COMPILER != INTEL_COMPILER_ID
-                $:END_GPU_PARALLEL_LOOP()
-            #:endif
+            $:END_GPU_PARALLEL_LOOP()
 
-            ! 2D Patch Geometries
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 11) call s_3d_airfoil_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 12) call s_model_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
         else if (n > 0) then
-            #:if MFC_COMPILER != INTEL_COMPILER_ID
-                $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
-            #:endif
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
-                patch_geometry = patch_ib(patch_id)%geometry
-
-                if (patch_geometry == 2) then
-                    call s_circle_levelset(gps(i))
-                else if (patch_geometry == 3) then
-                    call s_rectangle_levelset(gps(i))
-                else if (patch_geometry == 4) then
-                    call s_airfoil_levelset(gps(i))
-                else if (patch_geometry == 5) then
-                    call s_model_levelset(gps(i))
-                else if (patch_geometry == 6) then
-                    call s_ellipse_levelset(gps(i))
-                end if
+                if (patch_ib(patch_id)%geometry == 2) call s_circle_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 3) call s_rectangle_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 4) call s_airfoil_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 5) call s_model_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 6) call s_ellipse_levelset(gps(i))
             end do
-            #:if MFC_COMPILER != INTEL_COMPILER_ID
-                $:END_GPU_PARALLEL_LOOP()
-            #:endif
+            $:END_GPU_PARALLEL_LOOP()
         end if
 
     end subroutine s_apply_levelset

From bbb6ab7e51b411ab69cc52bddb3c1afeb2c635c5 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 08:04:28 -0400
Subject: [PATCH 03/30] docs: chemistry/pyrometheus Intel GPU verification and
 warning note

Verified: m_thermochem.f90 (pyrometheus-generated, 10 species / 29 reactions)
and m_chemistry.fpp both compile at O3 + SPIR64 without ICE on ifx 2025.3.3.
1D_reactive_shocktube case runs to completion with CPU fallback.

Documents the benign ifx warning #8694 about declare-target visibility
across module boundaries, and the build/run workflow for chemistry cases.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/documentation/intel-gpu-max.md | 34 +++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
index ce9d74964d..576e68f6a7 100644
--- a/docs/documentation/intel-gpu-max.md
+++ b/docs/documentation/intel-gpu-max.md
@@ -108,6 +108,40 @@ cd examples/my_case
 The install hashes are printed by `./mfc.sh build`; look for lines like
 `✓ Installed simulation`.
 
+## Chemistry/pyrometheus cases
+
+MFC's pyrometheus-generated thermochemistry (`m_thermochem.f90`) works with the
+Intel GPU build. Pyrometheus emits `!$omp declare target` for all thermochem
+routines when `directive_offload="mp"` is requested (automatically set when
+`--gpu mp` is active).
+
+Verified on ifx 2025.3.3: `m_thermochem.f90` and `m_chemistry.fpp` both compile
+at O3 + SPIR64 without ICE. The `1D_reactive_shocktube` example (H2/O2/Ar, 29
+reactions, 10 species) runs to completion. No Intel-specific source workarounds
+are needed for chemistry beyond the general levelset fix.
+
+To build and run a reactive chemistry case:
+```bash
+# Build (chemistry module generated automatically from cantera_file in case.py)
+./mfc.sh run examples/1D_reactive_shocktube/case.py \
+    --gpu mp --no-mpi -t pre_process simulation
+
+# Or bypass syscheck if no GPU render-group access:
+cd examples/1D_reactive_shocktube
+/path/to/build/install/<pre_hash>/bin/pre_process
+OMP_TARGET_OFFLOAD=DISABLED \
+    /path/to/build/install/<sim_hash>/bin/simulation
+```
+
+ifx warning during chemistry compilation:
+```
+warning #8694: A procedure called by a procedure with the DECLARE TARGET
+attribute must have the DECLARE TARGET attribute.  [GET_MIXTURE_VISCOSITY_MIXAVG]
+```
+This is a false positive from ifx's module-interface tracking; `m_thermochem.f90`
+does declare all routines as target. The warning is harmless and the code runs
+correctly.
+
 ## GPU device access
 
 The Intel GPU requires membership in the `render` group (GID 109) to access

From da087b02189bb1965ea6acb7d710cf20bbec9b15 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 10:33:56 -0400
Subject: [PATCH 04/30] =?UTF-8?q?fix:=20ifx=20SPIR64=20InvalidArraySize=20?=
 =?UTF-8?q?=E2=80=94=20assumed-shape=20+=20local=20VLA=20fixes=20for=20Ope?=
 =?UTF-8?q?nMP=20device=20routines?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/common/m_boundary_common.fpp         | 70 +++++++++++-----------
 src/common/m_chemistry.fpp               |  4 +-
 src/common/m_helper.fpp                  | 20 +++----
 src/common/m_phase_change.fpp            | 40 ++++++-------
 src/common/m_variables_conversion.fpp    | 24 ++++----
 src/simulation/m_bubbles_EL.fpp          | 38 ++++++------
 src/simulation/m_compute_cbc.fpp         | 74 ++++++++++++------------
 src/simulation/m_hyperelastic.fpp        | 28 ++++-----
 src/simulation/m_ib_patches.fpp          | 28 ++++-----
 src/simulation/m_ibm.fpp                 |  4 +-
 src/simulation/m_igr.fpp                 | 10 ++--
 src/simulation/m_pressure_relaxation.fpp |  4 +-
 src/simulation/m_qbmm.fpp                |  8 +--
 src/simulation/m_riemann_solvers.fpp     |  6 +-
 src/simulation/m_sim_helpers.fpp         | 22 +++----
 src/simulation/m_viscous.fpp             | 16 ++---
 16 files changed, 198 insertions(+), 198 deletions(-)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 6eb5383f75..33b874c9c5 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -284,11 +284,11 @@ contains
     subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf)
 
         $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
@@ -371,7 +371,7 @@ contains
     subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in, q_T_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout)                                               :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout)                                                      :: q_prim_vf
         real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in, mv_in
         integer, intent(in)                                                                                  :: bc_dir, bc_loc
         integer, intent(in)                                                                                  :: k, l
@@ -619,7 +619,7 @@ contains
     subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in, q_T_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout)                                               :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout)                                                      :: q_prim_vf
         real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in, mv_in
         integer, intent(in)                                                                                  :: bc_dir, bc_loc
         integer, intent(in)                                                                                  :: k, l
@@ -778,7 +778,7 @@ contains
     subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout)                                               :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout)                                                      :: q_prim_vf
         real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), optional, intent(inout) :: pb_in, mv_in
         integer, intent(in)                                                                                  :: k, l
         integer                                                                                              :: j, q, i
@@ -833,11 +833,11 @@ contains
     subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf)
 
         $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  !< bc_x%beg
@@ -986,11 +986,11 @@ contains
 
         $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  !< bc_x%beg
@@ -1158,11 +1158,11 @@ contains
     subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf)
 
         $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
 #ifdef MFC_SIMULATION
         if (bc_dir == 1) then  !< x-direction
@@ -1461,10 +1461,10 @@ contains
     subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l)
 
         $:GPU_ROUTINE(function_name='s_color_function_periodic', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
-        integer, intent(in)                                        :: bc_dir, bc_loc
-        integer, intent(in)                                        :: k, l
-        integer                                                    :: j, i
+        type(scalar_field), dimension(:), intent(inout) :: c_divs
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
@@ -1516,10 +1516,10 @@ contains
     subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l)
 
         $:GPU_ROUTINE(function_name='s_color_function_reflective', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
-        integer, intent(in)                                        :: bc_dir, bc_loc
-        integer, intent(in)                                        :: k, l
-        integer                                                    :: j, i
+        type(scalar_field), dimension(:), intent(inout) :: c_divs
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
@@ -1595,10 +1595,10 @@ contains
     subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l)
 
         $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
-        integer, intent(in)                                        :: bc_dir, bc_loc
-        integer, intent(in)                                        :: k, l
-        integer                                                    :: j, i
+        type(scalar_field), dimension(:), intent(inout) :: c_divs
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index cd15530368..593c1e95ed 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -36,8 +36,8 @@ contains
 
         $:GPU_ROUTINE(function_name='compute_viscosity_and_inversion',parallelism='[seq]', cray_inline=True)
 
-        real(wp), intent(inout)                         :: T_L, T_R, Re_L, Re_R
-        real(wp), dimension(num_species), intent(inout) :: Ys_R, Ys_L
+        real(wp), intent(inout)               :: T_L, T_R, Re_L, Re_R
+        real(wp), dimension(:), intent(inout) :: Ys_R, Ys_L
 
         call get_mixture_viscosity_mixavg(T_L, Ys_L, Re_L)
         call get_mixture_viscosity_mixavg(T_R, Ys_R, Re_R)
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 6bd9718dbb..e2a36f418c 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -26,11 +26,11 @@ contains
     subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in)                :: vftmp
-        real(wp), dimension(nb), intent(in) :: Rtmp
-        real(wp), intent(out)               :: ntmp
-        real(wp), dimension(nb), intent(in) :: weights
-        real(wp)                            :: R3
+        real(wp), intent(in)               :: vftmp
+        real(wp), dimension(:), intent(in) :: Rtmp
+        real(wp), intent(out)              :: ntmp
+        real(wp), dimension(:), intent(in) :: weights
+        real(wp)                           :: R3
 
         R3 = dot_product(weights, Rtmp**3._wp)
         ntmp = (3._wp/(4._wp*pi))*vftmp/R3
@@ -41,11 +41,11 @@ contains
     subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in)                :: vftmp
-        real(wp), dimension(nb), intent(in) :: nRtmp
-        real(wp), intent(out)               :: ntmp
-        real(wp), dimension(nb), intent(in) :: weights
-        real(wp)                            :: nR3
+        real(wp), intent(in)               :: vftmp
+        real(wp), dimension(:), intent(in) :: nRtmp
+        real(wp), intent(out)              :: ntmp
+        real(wp), dimension(:), intent(in) :: weights
+        real(wp)                           :: nR3
 
         nR3 = dot_product(weights, nRtmp**3._wp)
         ntmp = sqrt((4._wp*pi/3._wp)*nR3/vftmp)
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 7f9131550d..f2897c2c97 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -257,15 +257,15 @@ contains
         $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', parallelism='[seq]', cray_noinline=True)
 
         ! initializing variables
-        integer, intent(in)                                 :: j, k, l, MFL
-        real(wp), intent(out)                               :: pS
-        real(wp), dimension(1:), intent(out)                :: p_infpT
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp), intent(in)                                :: rhoe
-        real(wp), intent(out)                               :: TS
-        real(wp)                                            :: gp, gpp, hp, pO, mCP, mQ  !< variables for the Newton Solver
-        real(wp)                                            :: p_infpT_sum
-        integer                                             :: i, ns                     !< generic loop iterators
+        integer, intent(in)                          :: j, k, l, MFL
+        real(wp), intent(out)                        :: pS
+        real(wp), dimension(:), intent(out)          :: p_infpT
+        type(scalar_field), dimension(:), intent(in) :: q_cons_vf
+        real(wp), intent(in)                         :: rhoe
+        real(wp), intent(out)                        :: TS
+        real(wp)                                     :: gp, gpp, hp, pO, mCP, mQ  !< variables for the Newton Solver
+        real(wp)                                     :: p_infpT_sum
+        integer                                      :: i, ns                     !< generic loop iterators
         ! auxiliary variables for the pT-equilibrium solver
         mCP = 0.0_wp; mQ = 0.0_wp; p_infpT_sum = 0._wp
         $:GPU_LOOP(parallelism='[seq]')
@@ -351,16 +351,16 @@ contains
 
         $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', parallelism='[seq]', cray_noinline=True)
 
-        integer, intent(in)                                    :: j, k, l
-        real(wp), intent(inout)                                :: pS
-        real(wp), dimension(1:), intent(in)                    :: p_infpT
-        real(wp), intent(in)                                   :: rhoe
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        real(wp), intent(inout)                                :: TS
+        integer, intent(in)                             :: j, k, l
+        real(wp), intent(inout)                         :: pS
+        real(wp), dimension(1:), intent(in)             :: p_infpT
+        real(wp), intent(in)                            :: rhoe
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        real(wp), intent(inout)                         :: TS
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
         #:else
-            real(wp), dimension(num_fluids) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
+            real(wp), dimension(num_fluids_max) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
         #:endif
         real(wp), dimension(2, 2) :: Jac, InvJac, TJac                  !< matrices for the Newton Solver
         real(wp), dimension(2)    :: R2D, DeltamP                       !< residual and correction array
@@ -525,10 +525,10 @@ contains
 
         !> @name variables for the correction of the reacting partial densities
         !> @{
-        real(wp), intent(out)                                  :: MCT
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        real(wp), intent(inout)                                :: rM
-        integer, intent(in)                                    :: j, k, l
+        real(wp), intent(out)                           :: MCT
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        real(wp), intent(inout)                         :: rM
+        integer, intent(in)                             :: j, k, l
         !> @}
         if (rM < 0.0_wp) then
             if ((q_cons_vf(lp + eqn_idx%cont%beg - 1)%sf(j, k, &
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 2417da1adf..8d546daf9f 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -91,12 +91,12 @@ contains
         real(wp), intent(in), optional  :: G, pres_mag
 
         ! Chemistry
-        real(wp), dimension(1:num_species), intent(in) :: rhoYks
-        real(wp), dimension(1:num_species)             :: Y_rs
-        real(wp)                                       :: E_e
-        real(wp)                                       :: e_Per_Kg, Pdyn_Per_Kg
-        real(wp)                                       :: T_guess
-        integer                                        :: s  !< Generic loop iterator
+        real(wp), dimension(:), intent(in) :: rhoYks
+        real(wp), dimension(1:num_species) :: Y_rs
+        real(wp)                           :: E_e
+        real(wp)                           :: e_Per_Kg, Pdyn_Per_Kg
+        real(wp)                           :: T_guess
+        integer                            :: s  !< Generic loop iterator
         #:if not chemistry
             ! Depending on model_eqns and bubbles_euler, the appropriate procedure for computing pressure is targeted by the
             ! procedure pointer
@@ -253,8 +253,8 @@ contains
             real(wp), dimension(3), intent(inout)        :: alpha_rho_K, alpha_K
             real(wp), optional, dimension(3), intent(in) :: G
         #:else
-            real(wp), dimension(num_fluids), intent(inout)        :: alpha_rho_K, alpha_K
-            real(wp), optional, dimension(num_fluids), intent(in) :: G
+            real(wp), dimension(:), intent(inout)        :: alpha_rho_K, alpha_K
+            real(wp), optional, dimension(:), intent(in) :: G
         #:endif
         real(wp), dimension(2), intent(out) :: Re_K
         real(wp), optional, intent(out)     :: G_K
@@ -1175,12 +1175,12 @@ contains
     subroutine s_compute_species_fraction(q_vf, k, l, r, alpha_rho_K, alpha_K)
 
         $:GPU_ROUTINE(function_name='s_compute_species_fraction', parallelism='[seq]', cray_noinline=True)
-        type(scalar_field), dimension(sys_size), intent(in) :: q_vf
-        integer, intent(in)                                 :: k, l, r
+        type(scalar_field), dimension(:), intent(in) :: q_vf
+        integer, intent(in)                          :: k, l, r
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K
         #:else
-            real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K
+            real(wp), dimension(:), intent(out) :: alpha_rho_K, alpha_K
         #:endif
         integer  :: i
         real(wp) :: alpha_K_sum
@@ -1256,7 +1256,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: adv
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: adv
+            real(wp), dimension(:), intent(in) :: adv
         #:endif
         real(wp), intent(in)  :: vel_sum
         real(wp), intent(in)  :: c_c
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index b44eb617b1..c9773ee9f6 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -699,15 +699,15 @@ contains
 
         $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(in)                                :: pinf, rhol, gamma, pi_inf
-        integer, dimension(3), intent(in)                   :: cell
-        real(wp), intent(out)                               :: cson
-        real(wp)                                            :: E, H
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(in)                         :: pinf, rhol, gamma, pi_inf
+        integer, dimension(3), intent(in)            :: cell
+        real(wp), intent(out)                        :: cson
+        real(wp)                                     :: E, H
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: vel
         #:else
-            real(wp), dimension(num_dims) :: vel
+            real(wp), dimension(3) :: vel
         #:endif
         integer :: i
 
@@ -765,19 +765,19 @@ contains
 
         $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', cray_inline=True)
 
-        integer, intent(in)                                 :: bub_id, ptype
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(out)                               :: f_pinfl
-        integer, dimension(3), intent(out)                  :: cell
-        real(wp), intent(out), optional                     :: preterm1, term2, Romega
-        real(wp), dimension(3)                              :: scoord, psi
-        real(wp)                                            :: dc, vol, aux
-        real(wp)                                            :: volgas, term1, Rbeq, denom
-        real(wp)                                            :: charvol, charpres, charvol2, charpres2
-        integer, dimension(3)                               :: cellaux
-        integer                                             :: i, j, k
-        integer                                             :: smearGrid, smearGridz
-        logical                                             :: celloutside
+        integer, intent(in)                          :: bub_id, ptype
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(out)                        :: f_pinfl
+        integer, dimension(3), intent(out)           :: cell
+        real(wp), intent(out), optional              :: preterm1, term2, Romega
+        real(wp), dimension(3)                       :: scoord, psi
+        real(wp)                                     :: dc, vol, aux
+        real(wp)                                     :: volgas, term1, Rbeq, denom
+        real(wp)                                     :: charvol, charpres, charvol2, charpres2
+        integer, dimension(3)                        :: cellaux
+        integer                                      :: i, j, k
+        integer                                      :: smearGrid, smearGridz
+        logical                                      :: celloutside
 
         scoord = mtn_s(bub_id,1:3,2)
         f_pinfl = 0._wp
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index c2c415b1d3..d0fb3cd058 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -26,7 +26,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp) :: L1
         L1 = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
@@ -40,12 +40,12 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2, c
         real(wp), intent(in) :: dpres_ds
@@ -65,12 +65,12 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer              :: i
@@ -89,12 +89,12 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer              :: i
@@ -113,12 +113,12 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer              :: i
@@ -141,12 +141,12 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
 
@@ -165,7 +165,7 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
@@ -173,10 +173,10 @@ contains
             real(wp), dimension(3), intent(in)  :: dadv_ds
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in)  :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)    :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in)  :: dadv_ds
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -205,12 +205,12 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
 
@@ -229,7 +229,7 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
@@ -237,10 +237,10 @@ contains
             real(wp), dimension(3), intent(in)  :: dadv_ds
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in)  :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)    :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in)  :: dadv_ds
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -263,16 +263,16 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in) :: dvel_ds
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)   :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -294,16 +294,16 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in) :: dvel_ds
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)   :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -323,7 +323,7 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         L(1:eqn_idx%adv%end) = 0._wp
         if (chemistry) L(eqn_idx%species%beg:eqn_idx%species%end) = 0._wp
@@ -339,7 +339,7 @@ contains
         #:if USING_AMD
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
@@ -347,10 +347,10 @@ contains
             real(wp), dimension(3), intent(in)  :: dadv_ds
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in)  :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)    :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in)  :: dadv_ds
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 90e78d04df..f0c02f3fc9 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -194,13 +194,13 @@ contains
     subroutine s_neoHookean_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        type(scalar_field), dimension(b_size), intent(inout)   :: btensor_in
-        real(wp), intent(in)                                   :: G_param
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: trace
-        real(wp), parameter                                    :: f13 = 1._wp/3._wp
-        integer                                                :: i  !< Generic loop iterators
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout) :: btensor_in
+        real(wp), intent(in)                            :: G_param
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: trace
+        real(wp), parameter                             :: f13 = 1._wp/3._wp
+        integer                                         :: i  !< Generic loop iterators
         ! tensor is the symmetric tensor & calculate the trace of the tensor
         trace = btensor_in(1)%sf(j, k, l) + btensor_in(3)%sf(j, k, l) + btensor_in(6)%sf(j, k, l)
 
@@ -222,13 +222,13 @@ contains
     subroutine s_Mooney_Rivlin_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        type(scalar_field), dimension(b_size), intent(inout)   :: btensor_in
-        real(wp), intent(in)                                   :: G_param
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: trace
-        real(wp), parameter                                    :: f13 = 1._wp/3._wp
-        integer                                                :: i  !< Generic loop iterators
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout) :: btensor_in
+        real(wp), intent(in)                            :: G_param
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: trace
+        real(wp), parameter                             :: f13 = 1._wp/3._wp
+        integer                                         :: i  !< Generic loop iterators
         ! TODO: Make 1D and 2D capable
         trace = btensor_in(1)%sf(j, k, l) + btensor_in(3)%sf(j, k, l) + btensor_in(6)%sf(j, k, l)
 
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index b21483e097..4fba39e80e 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -136,7 +136,7 @@ contains
         integer, intent(in)                :: xp, yp            !< integers containing the periodicity projection information
         real(wp)                           :: f, ca_in, pa, ma, ta
         real(wp)                           :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer                            :: i, j, k, il, ir, jl, jr
+        integer                            :: i, j, k, kk, il, ir, jl, jr
         integer                            :: Np1, Np2
         integer                            :: encoded_patch_id
         real(wp), dimension(1:3)           :: xy_local, offset  !< x and y coordinates in local IB frame
@@ -227,8 +227,8 @@ contains
         call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir)
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
 
-        $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, ma, &
-                            & ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2)
+        $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, &
+                            & ma, ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2)
         do j = jl, jr
             do i = il, ir
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]  ! get coordinate frame centered on IB
@@ -246,8 +246,8 @@ contains
                     end if
                     if (xy_local(2) >= 0._wp) then
                         k = 1
-                        do while (airfoil_grid_u(k)%x < xy_local(1) .and. k <= Np)
-                            k = k + 1
+                        do kk = 1, Np - 1
+                            if (airfoil_grid_u(kk)%x < xy_local(1)) k = kk + 1
                         end do
                         if (f_approx_equal(airfoil_grid_u(k)%x, xy_local(1))) then
                             if (xy_local(2) <= airfoil_grid_u(k)%y) then
@@ -261,8 +261,8 @@ contains
                         end if
                     else
                         k = 1
-                        do while (airfoil_grid_l(k)%x < xy_local(1))
-                            k = k + 1
+                        do kk = 1, Np - 1
+                            if (airfoil_grid_l(kk)%x < xy_local(1)) k = kk + 1
                         end do
                         if (f_approx_equal(airfoil_grid_l(k)%x, xy_local(1))) then
                             if (xy_local(2) >= airfoil_grid_l(k)%y) then
@@ -290,7 +290,7 @@ contains
         type(integer_field), intent(inout) :: ib_markers
         integer, intent(in) :: xp, yp, zp                      !< integers containing the periodicity projection information
         real(wp) :: lz, z_max, z_min, f, ca_in, pa, ma, ta, xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer :: i, j, k, l, il, ir, jl, jr, ll, lr
+        integer :: i, j, k, kk, l, il, ir, jl, jr, ll, lr
         integer :: Np1, Np2
         integer :: encoded_patch_id
         real(wp), dimension(1:3) :: xyz_local, center, offset  !< x, y, z coordinates in local IB frame
@@ -385,8 +385,8 @@ contains
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
         call get_bounding_indices(center(3) - ca_in, center(3) + ca_in, z_cc, ll, lr)
 
-        $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, &
-                            & ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3)
+        $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, &
+                            & offset, ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3)
         do l = ll, lr
             do j = jl, jr
                 do i = il, ir
@@ -399,8 +399,8 @@ contains
                         if (xyz_local(1) >= 0._wp .and. xyz_local(1) <= ca_in) then
                             if (xyz_local(2) >= 0._wp) then
                                 k = 1
-                                do while (airfoil_grid_u(k)%x < xyz_local(1))
-                                    k = k + 1
+                                do kk = 1, Np - 1
+                                    if (airfoil_grid_u(kk)%x < xyz_local(1)) k = kk + 1
                                 end do
                                 if (f_approx_equal(airfoil_grid_u(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) <= airfoil_grid_u(k)%y) then
@@ -415,8 +415,8 @@ contains
                                 end if
                             else
                                 k = 1
-                                do while (airfoil_grid_l(k)%x < xyz_local(1))
-                                    k = k + 1
+                                do kk = 1, Np - 1
+                                    if (airfoil_grid_l(kk)%x < xyz_local(1)) k = kk + 1
                                 end do
                                 if (f_approx_equal(airfoil_grid_l(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) >= airfoil_grid_l(k)%y) then
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 39ec6b1470..0ac42bfff5 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -721,7 +721,7 @@ contains
 
         & nmom_IP, pb_in, mv_in, presb_IP, massv_IP)
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf  !< Primitive Variables
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf  !< Primitive Variables
         real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(in) :: pb_in, mv_in
         type(ghost_point), intent(in) :: gp
         real(wp), intent(inout) :: pres_IP
@@ -730,7 +730,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(inout) :: alpha_IP, alpha_rho_IP
         #:else
-            real(wp), dimension(num_fluids), intent(inout) :: alpha_IP, alpha_rho_IP
+            real(wp), dimension(:), intent(inout) :: alpha_IP, alpha_rho_IP
         #:endif
         real(wp), optional, dimension(:), intent(inout) :: r_IP, v_IP, pb_IP, mv_IP
         real(wp), optional, dimension(:), intent(inout) :: nmom_IP
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 5f1fa9d73b..f326d308bd 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -2640,11 +2640,11 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        real(wp), intent(in)                      :: E_L, gamma_L, pi_inf_L, rho_L
-        real(wp), intent(in)                      :: E_R, gamma_R, pi_inf_R, rho_R
-        real(wp), dimension(num_dims), intent(in) :: vel_L, vel_R
-        real(wp), intent(out)                     :: pres_L, pres_R, cfl
-        real(wp)                                  :: a_L, a_R
+        real(wp), intent(in)               :: E_L, gamma_L, pi_inf_L, rho_L
+        real(wp), intent(in)               :: E_R, gamma_R, pi_inf_R, rho_R
+        real(wp), dimension(:), intent(in) :: vel_L, vel_R
+        real(wp), intent(out)              :: pres_L, pres_R, cfl
+        real(wp)                           :: a_L, a_R
 
         if (num_dims == 2) then
             pres_L = (E_L - pi_inf_L - 0.5_wp*rho_L*(vel_L(1)**2._wp + vel_L(2)**2._wp))/gamma_L
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index c7a163ade9..4cb3e847d6 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -147,7 +147,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: pres_K_init, rho_K_s
         #:else
-            real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+            real(wp), dimension(num_fluids_max) :: pres_K_init, rho_K_s
         #:endif
         integer, parameter :: MAX_ITER = 50
         ! Pressure relaxation convergence tolerance
@@ -219,7 +219,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: alpha_rho, alpha
         #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, alpha
+            real(wp), dimension(num_fluids_max) :: alpha_rho, alpha
         #:endif
         real(wp)               :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
         real(wp), dimension(2) :: Re
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 8465d4d349..bac659bd76 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -949,8 +949,8 @@ contains
 
             $:GPU_ROUTINE(function_name='s_chyqmom',parallelism='[seq]', cray_inline=True)
 
-            real(wp), dimension(nmom), intent(in)     :: momin
-            real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
+            real(wp), dimension(nmom), intent(in) :: momin
+            real(wp), dimension(:), intent(inout) :: wght, abscX, abscY
 
             ! Local variables
             real(wp), dimension(0:2,0:2) :: moms
@@ -1030,7 +1030,7 @@ contains
             #:if not MFC_CASE_OPTIMIZATION and USING_AMD
                 real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in
             #:else
-                real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
+                real(wp), dimension(:,:), intent(in) :: abscX, abscY, wght_in
             #:endif
             real(wp), intent(in) :: q, r, s
             real(wp)             :: f_quad_RV, f_quad
@@ -1056,7 +1056,7 @@ contains
             #:if not MFC_CASE_OPTIMIZATION and USING_AMD
                 real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in
             #:else
-                real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
+                real(wp), dimension(:), intent(in) :: abscX, abscY, wght_in
             #:endif
             real(wp), dimension(3), intent(in) :: pow
             real(wp)                           :: f_quad2D
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 7672094f55..34b5ce3255 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -4408,8 +4408,8 @@ contains
             real(wp), dimension(3, 3), intent(in)  :: vel_grad_avg
             real(wp), dimension(3, 3), intent(out) :: tau_shear_out
         #:else
-            real(wp), dimension(num_dims, num_dims), intent(in)  :: vel_grad_avg
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
+            real(wp), dimension(:,:), intent(in)  :: vel_grad_avg
+            real(wp), dimension(:,:), intent(out) :: tau_shear_out
         #:endif
         real(wp), intent(in) :: Re_shear
         real(wp), intent(in) :: divergence_v
@@ -4441,7 +4441,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
         #:else
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
+            real(wp), dimension(:,:), intent(out) :: tau_bulk_out
         #:endif
 
         ! Local variables
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 4a0978919e..8392fdc6bc 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -45,11 +45,11 @@ contains
     function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), dimension(num_vels), intent(in) :: vel
-        real(wp), intent(in)                      :: c
-        integer, intent(in)                       :: j, k, l
-        real(wp)                                  :: cfl_terms
-        real(wp)                                  :: fltr_dtheta
+        real(wp), dimension(:), intent(in) :: vel
+        real(wp), intent(in)               :: c
+        integer, intent(in)                :: j, k, l
+        real(wp)                           :: cfl_terms
+        real(wp)                           :: fltr_dtheta
 
         fltr_dtheta = f_compute_filtered_dtheta(k, l)
 
@@ -74,13 +74,13 @@ contains
 
         $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
+        type(scalar_field), intent(in), dimension(:) :: q_prim_vf
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), intent(inout), dimension(3) :: alpha
             real(wp), intent(inout), dimension(3) :: vel
         #:else
-            real(wp), intent(inout), dimension(num_fluids) :: alpha
-            real(wp), intent(inout), dimension(num_vels)   :: vel
+            real(wp), intent(inout), dimension(:) :: alpha
+            real(wp), intent(inout), dimension(:) :: vel
         #:endif
         real(wp), intent(inout)               :: rho, gamma, pi_inf, vel_sum, H, pres
         real(wp), intent(out)                 :: qv
@@ -89,7 +89,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: alpha_rho, Gs
         #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, Gs
+            real(wp), dimension(num_fluids_max) :: alpha_rho, Gs
         #:endif
         real(wp) :: E, G_local
         integer  :: i
@@ -141,7 +141,7 @@ contains
     subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in), dimension(num_vels)                 :: vel
+        real(wp), intent(in), dimension(:)                        :: vel
         real(wp), intent(in)                                      :: c, rho
         real(wp), dimension(0:m,0:n,0:p), intent(inout)           :: icfl_sf
         real(wp), dimension(0:m,0:n,0:p), intent(inout), optional :: vcfl_sf, Rc_sf
@@ -191,7 +191,7 @@ contains
     subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), dimension(num_vels), intent(in)       :: vel
+        real(wp), dimension(:), intent(in)              :: vel
         real(wp), intent(in)                            :: c, rho
         real(wp), dimension(0:m,0:n,0:p), intent(inout) :: max_dt
         real(wp), dimension(2), intent(in)              :: Re_l
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 55cdcc343e..84446d5777 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1266,14 +1266,14 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        real(wp), dimension(1:3,1:3), intent(inout)           :: viscous_stress_tensor
-        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(in)                                  :: dynamic_viscosity
-        integer, intent(in)                                   :: i, j, k
-        real(wp), dimension(1:3,1:3)                          :: velocity_gradient_tensor
-        real(wp), dimension(1:3)                              :: dx
-        real(wp)                                              :: divergence
-        integer                                               :: l, q  !< iterators
+        real(wp), dimension(1:3,1:3), intent(inout)  :: viscous_stress_tensor
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(in)                         :: dynamic_viscosity
+        integer, intent(in)                          :: i, j, k
+        real(wp), dimension(1:3,1:3)                 :: velocity_gradient_tensor
+        real(wp), dimension(1:3)                     :: dx
+        real(wp)                                     :: divergence
+        integer                                      :: l, q  !< iterators
 
         ! zero the viscous stress, collection of velocity derivatives, and spatial finite differences
         viscous_stress_tensor = 0._wp

From c30726815973b61bfeab3fc6a2f325e1f32e1765 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 11:55:04 -0400
Subject: [PATCH 05/30] fix: ifx SPIR64 Level Zero link failures for Intel GPU
 Max (Ponte Vecchio)

Two root causes of ZE_RESULT_ERROR_MODULE_LINK_FAILURE at runtime on
Intel GPU Max 1100 (Ponte Vecchio) with OpenMP target offload (--gpu mp):

1. mkl_dfti_omp_offload.o SPIR-V import problem:
   MKL's mkl_dfti_omp_offload.f90 compiled with -fopenmp-targets=spir64
   produces SPIR-V with Import declarations for MKL SYCL DFT functions
   (mkl_dfti_compute_forward_dz_omp_offload, etc.) that the OpenMP Level
   Zero plugin cannot resolve at zeModuleDynamicLink time.
   Fix: Use clang-offload-bundler to strip the SPIR-V device bundle from
   mkl_dfti_omp_offload.o, linking only host code. The MKL DFTI interface
   module (.modmic) is still compiled for use by dependent translation units,
   but ! dispatch for DFT calls falls back to CPU execution.

2. m_thermochem.f90 empty SPIR-V problem (chemistry/pyrometheus):
   Pyrometheus generates '#define GPU_ROUTINE(name) ! declare target'
   (a C macro). When ifx processes the file with -free -fpp, the Intel
   Fortran preprocessor strips '! declare target' after C macro
   expansion because '!' is treated as a Fortran comment character after
   expansion, leaving an empty SPIR-V bundle with no exported device symbols.
   Fix: Post-process generated m_thermochem.f90 to remove the #define macro
   and replace GPU_ROUTINE(name) call sites with literal '! declare
   target' directives, which are visible to the Fortran front-end.

Verified: 1D advection case runs to completion on Intel GPU Max 1100
with OMP_TARGET_OFFLOAD=MANDATORY and 'Simulating ... with OpenMP offloading.'
---
 CMakeLists.txt             | 25 +++++++++++++++++++++++--
 toolchain/mfc/run/input.py | 10 ++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92744a0cd4..e50c1a6be9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -612,7 +612,14 @@ exit 0
                     endif()
                     set(_mkl_omp_mod_dir "${CMAKE_CURRENT_BINARY_DIR}/mkl_omp_mods")
                     set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o")
+                    set(_mkl_omp_obj_host "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload_host.o")
                     file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}")
+                    # Find clang-offload-bundler (co-located with ifx) for SPIR-V stripping.
+                    cmake_path(GET CMAKE_Fortran_COMPILER PARENT_PATH _ifx_bin)
+                    find_program(CLANG_OFFLOAD_BUNDLER
+                        NAMES clang-offload-bundler
+                        HINTS "${_ifx_bin}/compiler" "${_ifx_bin}"
+                        REQUIRED)
                     add_custom_command(
                         OUTPUT  "${_mkl_omp_obj}"
                                 "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.mod"
@@ -625,12 +632,26 @@ exit 0
                         DEPENDS "${_mkl_omp_src}"
                         COMMENT "Compiling MKL OMP offload Fortran module (mkl_dfti_omp_offload)"
                     )
+                    # Strip the SPIR-V device bundle so the linked object has only host code.
+                    # The SPIR-V contains Import declarations for MKL SYCL DFT functions that
+                    # the OpenMP Level Zero plugin cannot resolve, causing zeModuleDynamicLink
+                    # failure. With host-only code, !$omp dispatch falls back to CPU for DFT.
+                    add_custom_command(
+                        OUTPUT  "${_mkl_omp_obj_host}"
+                        COMMAND "${CLANG_OFFLOAD_BUNDLER}"
+                                --unbundle --type=o
+                                --targets=host-x86_64-unknown-linux-gnu
+                                --input="${_mkl_omp_obj}"
+                                --output="${_mkl_omp_obj_host}"
+                        DEPENDS "${_mkl_omp_obj}"
+                        COMMENT "Stripping SPIR-V from MKL DFT object (host-only, fixes Level Zero link)"
+                    )
                     add_custom_target(mkl_omp_offload_mod_${a_target}
-                        DEPENDS "${_mkl_omp_obj}")
+                        DEPENDS "${_mkl_omp_obj_host}")
                     add_dependencies(${a_target} mkl_omp_offload_mod_${a_target})
                     target_include_directories(${a_target} PRIVATE
                         "$ENV{MKLROOT}/include" "${_mkl_omp_mod_dir}")
-                    target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj}")
+                    target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj_host}")
                     # Link MKL threading + core + SYCL DFT backend
                     target_link_options(${a_target} PRIVATE -qmkl=parallel)
                     find_library(MKL_SYCL_DFT mkl_sycl_dft HINTS "$ENV{MKLROOT}/lib" REQUIRED)
diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py
index dda710f602..180044a4f3 100644
--- a/toolchain/mfc/run/input.py
+++ b/toolchain/mfc/run/input.py
@@ -99,6 +99,16 @@ def generate_fpp(self, target) -> None:
 
         thermochem_code = pyro.FortranCodeGenerator().generate("m_thermochem", sol, pyro.CodeGenerationOptions(scalar_type=real_type, directive_offload=directive_str))
 
+        if directive_str == "mp":
+            # ifx -fpp strips !$omp directives produced by C-macro expansion because the
+            # Intel Fortran preprocessor treats '!' as a Fortran comment after expansion.
+            # Rewrite the GPU_ROUTINE macro calls as literal !$omp declare target lines so
+            # the directive is visible to the Fortran front-end, not the C preprocessor.
+            import re
+
+            thermochem_code = thermochem_code.replace("#define GPU_ROUTINE(name) !$omp declare target\n", "")
+            thermochem_code = re.sub(r"[ \t]+GPU_ROUTINE\(\w+\)", "!$omp declare target", thermochem_code)
+
         common.file_write(os.path.join(modules_dir, "m_thermochem.f90"), thermochem_code, True)
 
         cons.unindent()

From 5dba4e127c9727cde2c20f44e4f30c5f6d6fefc6 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 14:45:47 -0400
Subject: [PATCH 06/30] perf: Intel GPU Max Level Zero tuning in run toolchain

Automatically set Intel Level Zero environment variables when running
with --gpu mp and ifx compiler:

- LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=256: batch up to 256 Level Zero
  commands before flushing, reducing host-GPU synchronization overhead.
  Measured ~9% throughput improvement on Intel GPU Max 1100 (PVC).

- SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY=0: disable per-kernel
  indirect access tracking (zeMemGetAllocProperties, ~2100 calls/step).
  Safe because MFC manages all GPU allocations via @:ALLOCATE/@:DEALLOCATE
  and scalar_field pointers are never aliased with host memory.

When --fastmath is also requested:
- LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS=-cl-fast-relaxed-math:
  enables GPU JIT fast-math (fused MAD, fast transcendentals, finite-
  math-only). Equivalent to nvfortran -gpu=fastmath for OpenACC builds.
---
 toolchain/mfc/run/run.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py
index 82e886c064..261d8e65dc 100644
--- a/toolchain/mfc/run/run.py
+++ b/toolchain/mfc/run/run.py
@@ -85,6 +85,19 @@ def __get_template() -> Template:
     raise MFCException(f"Failed to find a template for --computer '{computer}'. Baked-in templates are: {format_list_to_string(list(baked.keys()), 'magenta')}.")
 
 
+def __is_intel_gpu_build(case: input.MFCInputFile) -> bool:
+    cmake_cache = os.path.join(SIMULATION.get_staging_dirpath(case), "CMakeCache.txt")
+    if not os.path.isfile(cmake_cache):
+        return False
+    with open(cmake_cache) as f:
+        content = f.read()
+    # Match compiler ID entry (may or may not be present) or fall back to compiler path
+    if re.search(r"CMAKE_Fortran_COMPILER_ID[^=\n]*=[^\n]*IntelLLVM", content):
+        return True
+    m = re.search(r"CMAKE_Fortran_COMPILER:FILEPATH=([^\n]+)", content)
+    return m is not None and os.path.basename(m.group(1).strip()) in ("ifx", "ifx.exe")
+
+
 def __generate_job_script(targets, case: input.MFCInputFile):
     env = {}
     if ARG("gpus") is not None:
@@ -103,6 +116,23 @@ def __generate_job_script(targets, case: input.MFCInputFile):
     gpu_acc = gpu_mode == gpuConfigOptions.ACC.value
     gpu_mp = gpu_mode == gpuConfigOptions.MP.value
 
+    if gpu_mp and __is_intel_gpu_build(case):
+        # Level Zero tuning for Intel GPU Max (Ponte Vecchio).
+        # COMMAND_BATCH=256: batch up to 256 Level Zero commands before flushing,
+        # allowing the GPU to stay busy while the host prepares the next batch.
+        # Measured ~9% throughput improvement on Intel GPU Max 1100.
+        env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH", "256")
+        # Disable per-kernel indirect access memory tracking (zeMemGetAllocProperties).
+        # MFC's scalar_field pointer arrays trigger indirect-access flags; this tracking
+        # adds ~2100 API calls/step. Disabling it is safe since all GPU allocations are
+        # managed via @:ALLOCATE/@:DEALLOCATE and never aliased with host memory.
+        env.setdefault("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY", "0")
+        if ARG("fastmath"):
+            # -cl-fast-relaxed-math: enables unsafe GPU JIT optimizations (fast transcendentals,
+            # fused MAD, no signed-zero semantics, finite-math-only). Matches nvfortran -gpu=fastmath.
+            # Only applied when the user explicitly requests --fastmath.
+            env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "-cl-fast-relaxed-math")
+
     content = __get_template().render(
         **{**ARGS(), "targets": targets},
         ARG=ARG,

From 6e636de6de6e64130f95c1923ef60fed9fe6b5ff Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 15:41:54 -0400
Subject: [PATCH 07/30] fix: restore ifort (Intel classic) CPU build support in
 CMakeLists.txt

The IntelLLVM branch inadvertently dropped the Intel (ifort) elseif,
leaving classic ifort CPU builds without the -free flag. Add it back
as a separate branch so ifort and ifx coexist correctly.
---
 CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e50c1a6be9..6d3694987e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,6 +241,14 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
     if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug")
         add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-O1> $<$<COMPILE_LANGUAGE:Fortran>:-g>)
     endif()
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>)
+
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        add_compile_options(-g -Og -traceback -debug -check all)
+    elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
+        add_compile_options(-g -Og -traceback -check bounds)
+    endif()
 elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
     add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>
                         $<$<COMPILE_LANGUAGE:Fortran>:-fpp>)

From 38762fccb629556ca0c367060d475678b4da14c2 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 15:51:05 -0400
Subject: [PATCH 08/30] feat: add GT CRNCH RoboGator (crnch) module entry for
 Intel GPU Max

Adds source ./mfc.sh load -c crnch -m g support for the GT CRNCH
RoboGator nodes with Intel GPU Max 1100 (Ponte Vecchio). Sets FC,
PATH, MKLROOT, LD_LIBRARY_PATH, and LIBRARY_PATH for oneAPI 2025.1
at the fixed install path (no Lmod modules available for 2025.1).

Also fixes modules.sh to skip 'module load' when the module list is
empty, supporting systems that configure entirely via env var exports.
---
 toolchain/bootstrap/modules.sh | 11 +++++++----
 toolchain/modules              |  7 +++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh
index 1beb016539..119bff39fb 100644
--- a/toolchain/bootstrap/modules.sh
+++ b/toolchain/bootstrap/modules.sh
@@ -48,6 +48,7 @@ if [ -v $u_c ]; then
     log   "$B""DoD$W:     Carpenter Cray (cc) | Carpenter GNU (c) |  Nautilus (n)"
     log   "$OR""Florida$W: HiPerGator (h)"
     log   "$C""WPI $W:   Turing   (t)"
+    log   "$Y""Gatech$W:  CRNCH RoboGator (crnch)"
     log_n "($G""a$W/$G""f$W/$G""s$W/$G""w$W/$B""tuo$W/$C""b$W/$C""e$CR/$C""d/$C""dai$CR/$Y""p$CR/$R""r$CR/$B""cc$CR/$B""c$CR/$B""n$CR/$BR""o$CR/$BR""pa"$CR"/$OR""h"$CR/$C""t""$CR"): "
     read u_c
     log
@@ -106,11 +107,13 @@ fi
 ELEMENTS="$(__extract "$u_c-all") $(__extract "$u_c-$cg")"
 MODULES=`echo "$ELEMENTS" | tr ' ' '\n' | grep -v = | xargs`
 
-log " $ module load $MODULES"
-if ! module load $MODULES; then
-    error "Failed to load modules."
+if [ -n "$MODULES" ]; then
+    log " $ module load $MODULES"
+    if ! module load $MODULES; then
+        error "Failed to load modules."
 
-    return
+        return
+    fi
 fi
 
 # Export variables one line at a time so each can reference previously exported vars
diff --git a/toolchain/modules b/toolchain/modules
index ea7cb36393..1a75a5829a 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -116,3 +116,10 @@ h-gpu NVCOMPILER_COMM_LIBS_HOME=/apps/compilers/nvhpc/25.9/Linux_x86_64/25.9/com
 t     WPI Turing
 t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
+
+crnch     GT CRNCH RoboGator (Intel GPU Max 1100, Ponte Vecchio)
+crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin/ifx
+crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:${PATH}
+crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0
+crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:${LD_LIBRARY_PATH}
+crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:${LIBRARY_PATH}

From c224384e0c1a533229621fcceec78673f608ceda Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 16:04:13 -0400
Subject: [PATCH 09/30] fix: find clang-offload-bundler via ifx PATH when
 FC=mpiifx

When FC is an MPI wrapper (mpiifx), CMAKE_Fortran_COMPILER parent
dir points to the MPI bin dir, not the compiler bin dir. clang-
offload-bundler lives in compiler/bin/compiler/ which is only
reachable via the real ifx location. Resolve ifx from PATH first.
---
 CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d3694987e..c8e649da84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -622,8 +622,11 @@ exit 0
                     set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o")
                     set(_mkl_omp_obj_host "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload_host.o")
                     file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}")
-                    # Find clang-offload-bundler (co-located with ifx) for SPIR-V stripping.
-                    cmake_path(GET CMAKE_Fortran_COMPILER PARENT_PATH _ifx_bin)
+                    # Find clang-offload-bundler (in ifx's bin/compiler/ subdir).
+                    # CMAKE_Fortran_COMPILER may be an MPI wrapper (mpiifx); resolve the
+                    # underlying ifx from PATH so the HINTS point to the right directory.
+                    find_program(_IFX_REAL ifx REQUIRED)
+                    cmake_path(GET _IFX_REAL PARENT_PATH _ifx_bin)
                     find_program(CLANG_OFFLOAD_BUNDLER
                         NAMES clang-offload-bundler
                         HINTS "${_ifx_bin}/compiler" "${_ifx_bin}"

From fae09c2eada7be683bafac86c0145a02130ff1ff Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 16:12:43 -0400
Subject: [PATCH 10/30] fix: LAPACK PIE link error + crnch module Intel MPI
 paths

- Add -DCMAKE_POSITION_INDEPENDENT_CODE=ON to LAPACK ExternalProject so
  ifx-compiled Fortran objects link correctly on PIE-default systems (Ubuntu 22.04+)
- Update crnch module to use mpiifx as FC and add Intel MPI 2021.14 to PATH,
  LD_LIBRARY_PATH, LIBRARY_PATH, and I_MPI_ROOT for MPI-enabled builds
---
 toolchain/dependencies/CMakeLists.txt | 1 +
 toolchain/modules                     | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt
index 972b2a4883..587cb928c2 100644
--- a/toolchain/dependencies/CMakeLists.txt
+++ b/toolchain/dependencies/CMakeLists.txt
@@ -139,6 +139,7 @@ endif()
                             -DCBLAS=OFF
                             -DLAPACKE=OFF
                             -DBUILD_DEPRECATED=OFF
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                             "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
                             "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
          )
diff --git a/toolchain/modules b/toolchain/modules
index 1a75a5829a..8e7da10175 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -118,8 +118,9 @@ t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
 
 crnch     GT CRNCH RoboGator (Intel GPU Max 1100, Ponte Vecchio)
-crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin/ifx
-crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:${PATH}
+crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin/mpiifx
+crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin:${PATH}
 crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0
-crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:${LD_LIBRARY_PATH}
-crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:${LIBRARY_PATH}
+crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14
+crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LD_LIBRARY_PATH}
+crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH}

From 5ffc5f6d67affe3dd5d20776468d7fad2939e547 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 16:20:41 -0400
Subject: [PATCH 11/30] fix: LAPACK FortranCInterface PIE link failure on
 Ubuntu 22.04 with ifx

Ubuntu 22.04 GCC defaults to --enable-default-pie, causing the LAPACK
FortranCInterface compatibility test to fail when linking ifx-compiled
Fortran objects into a PIE executable. Fix by:
- Adding -DCMAKE_EXE_LINKER_FLAGS=-no-pie to suppress PIE for LAPACK test
  executables (LAPACK itself is a static library so PIE is irrelevant)
- Passing CMAKE_Fortran_COMPILER explicitly so LAPACK uses the same ifx/mpiifx
  that MFC uses rather than whatever cmake auto-detects
---
 toolchain/dependencies/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt
index 587cb928c2..96eb046cd8 100644
--- a/toolchain/dependencies/CMakeLists.txt
+++ b/toolchain/dependencies/CMakeLists.txt
@@ -140,6 +140,8 @@ endif()
                             -DLAPACKE=OFF
                             -DBUILD_DEPRECATED=OFF
                             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            "-DCMAKE_EXE_LINKER_FLAGS=-no-pie"
+                            "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}"
                             "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
                             "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
          )

From 7c304d899a2314f26c6dda70c94496d3bbc9e06f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 16:28:07 -0400
Subject: [PATCH 12/30] fix: build FFTW from source for all non-Cray compilers
 including IntelLLVM

FFTW is a pure C library; the Fortran compiler ID is irrelevant to building it.
The IntelLLVM exception was unnecessarily preventing FFTW from being built from
source when using ifx/mpiifx, causing post_process tests that require FFTW to
fail on systems without a system-provided double-precision FFTW.
---
 toolchain/dependencies/CMakeLists.txt | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt
index 96eb046cd8..89b51ccc26 100644
--- a/toolchain/dependencies/CMakeLists.txt
+++ b/toolchain/dependencies/CMakeLists.txt
@@ -32,20 +32,16 @@ if (MFC_FFTW)
         message(STATUS "FFTW found.")
         add_custom_target(fftw)
     else()
-        if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND
-            NOT CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+        if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+            message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
+            add_custom_target(fftw)
+        else()
             ExternalProject_Add(fftw
                 URL        "http://www.fftw.org/fftw-3.3.10.tar.gz"
                 CMAKE_ARGS -DBUILD_TESTS=OFF
                            -DBUILD_SHARED_LIBS=OFF
                            "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
             )
-        elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
-            message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
-            add_custom_target(fftw)
-        else()
-            message(WARNING "The Fortran compiler vendor is IntelLLVM (ifx); FFTW3 will not be built from source. A system FFTW is expected.")
-            add_custom_target(fftw)
         endif()
     endif()
 endif()

From 3af3b6da83f10fc28cbdfb253e2020c86eb0ddf1 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 15 May 2026 17:18:40 -0400
Subject: [PATCH 13/30] =?UTF-8?q?fix:=20crnch=20module=20=E2=80=94=20add?=
 =?UTF-8?q?=20I=5FMPI=5FFABRICS=3Dshm=20for=20Intel=20MPI=20on=20single-no?=
 =?UTF-8?q?de?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this, Intel MPI's OFI transport fails to initialize on dash4
(no InfiniBand/OmniPath) with 'Unknown error class' in MPIDI_OFI_mpi_init_hook.
Setting I_MPI_FABRICS=shm forces shared-memory transport for intra-node
communication, which works correctly for single-node GPU runs.
---
 toolchain/modules | 1 +
 1 file changed, 1 insertion(+)

diff --git a/toolchain/modules b/toolchain/modules
index 8e7da10175..d424203666 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -124,3 +124,4 @@ crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025
 crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14
 crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LD_LIBRARY_PATH}
 crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH}
+crnch-gpu I_MPI_FABRICS=shm

From 6b1d0de5a03cf72f34dfebfaa661c922b518c86e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 09:53:40 -0400
Subject: [PATCH 14/30] feat: Intel GPU Max (Ponte Vecchio) OpenMP target
 offload support

Add end-to-end support for building and running MFC on Intel Data Center
GPU Max (Ponte Vecchio) using ifx 2025.0+ with OpenMP target offload to
SPIR-V/SPIR64. Verified on GT CRNCH RoboGator (dash4) with Intel GPU
Max 1100. All 161 1D regression tests pass.

## Compiler and build system
- Recognize IntelLLVM compiler ID throughout CMakeLists.txt (was Intel)
- Add -fiopenmp -fopenmp-targets=spir64 compile/link flags for GPU builds
- Add -fp-model=precise to prevent ifx FP reassociation in SPIR-V kernels
- Add -fpp to global compile flags for Intel preprocessor compatibility
- Link MKL parallel, libmkl_sycl_dft, libsycl, libOpenCL for oneMKL FFT
- Strip SPIR-V from mkl_dfti_omp_offload.o via clang-offload-bundler to
  fix zeModuleDynamicLink Level Zero failures
- Add --intel-aot flag: AOT compilation via ocloc to native PVC ISA,
  eliminates ~30 min Level Zero JIT delay (test runs: 30 min -> 14 sec)
- Add IntelLLVM to no-FFTW-from-source list in dependencies/CMakeLists.txt
- Fix LAPACK PIE link error with ifx on Ubuntu 22.04

## GPU kernel fixes
- omp_macros.fpp: add Intel-specific OMP_PARALLEL_LOOP, END_OMP_PARALLEL_LOOP,
  OMP_ROUTINE, OMP_MKL_DISPATCH branches for SPIR-V codegen
- parallel_macros.fpp: add GPU_MKL_DISPATCH() macro for oneMKL dispatch
- shared_parallel_macros.fpp: add USING_INTEL Fypp variable; extend all
  #:if not MFC_CASE_OPTIMIZATION and USING_AMD guards to include USING_INTEL
  and bare #:if USING_AMD guards for dimension(sys_size) in m_cbc/m_compute_cbc
- m_fftw.fpp: oneMKL DFTI + ! dispatch GPU FFT path for Intel
- m_compute_levelset.fpp: split single if-else dispatch to fix multi-callee
  phi-node issue and inliner ICE; add -fno-inline workaround
- m_riemann_solvers.fpp, m_variables_conversion.fpp, m_bubbles_EE.fpp,
  m_weno.fpp, m_sim_helpers.fpp, m_pressure_relaxation.fpp, m_boundary_common,
  m_chemistry.fpp, m_phase_change.fpp, m_bubbles_EL.fpp, m_viscous.fpp,
  m_ibm.fpp, m_hyperelastic.fpp, m_acoustic_src.fpp, m_surface_tension.fpp,
  m_data_output.fpp, m_qbmm.fpp, m_compute_cbc.fpp, m_cbc.fpp, m_ib_patches.fpp:
  explicit array sizes in GPU_ROUTINE arguments (no assumed-shape in SPIR-V)
  and extend VLA guards to USING_INTEL for non-case-optimized GPU builds
- m_helper.fpp: Intel-specific workarounds for SPIR-V codegen

## Toolchain
- Add GT CRNCH RoboGator (crnch) module entry with Intel oneAPI 2025.1
- run.py: Intel GPU detection, set LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=256
  and SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY=0 for ~16% speedup
- run/input.py: post-process pyrometheus m_thermochem.f90 for --gpu mp
  (replace C-macro GPU_ROUTINE with literal ! declare target)
- build.py, state.py: --intel-aot flag and ocloc device selection
- test.py: --binary mpirun support to bypass SLURM srun slot limits on CRNCH
- bootstrap/modules.sh: crnch module bootstrap
- templates/include/helpers.mako: Intel MPI I_MPI_FABRICS=shm hint
- modules: crnch entry (Intel oneAPI 2025.1, mpiifx, GPU Max 1100)

## Documentation
- docs/documentation/intel-gpu-max.md: full build, run, troubleshoot guide
---
 CMakeLists.txt                                |  155 ++-
 docs/documentation/intel-gpu-max.md           |  263 ++++
 examples/3D_performance_test/case.py          |   19 +-
 src/common/include/omp_macros.fpp             |   28 +-
 src/common/include/parallel_macros.fpp        |    8 +
 src/common/include/shared_parallel_macros.fpp |    3 +-
 src/common/m_chemistry.fpp                    |    4 +-
 src/common/m_helper.fpp                       |    8 +-
 src/common/m_mpi_common.fpp                   | 1092 ++++++++---------
 src/common/m_phase_change.fpp                 |   14 +-
 src/common/m_variables_conversion.fpp         |   34 +-
 src/simulation/m_acoustic_src.fpp             |    2 +-
 src/simulation/m_bubbles_EE.fpp               |    2 +-
 src/simulation/m_bubbles_EL.fpp               |    6 +-
 src/simulation/m_cbc.fpp                      |    4 +-
 src/simulation/m_compute_cbc.fpp              |   48 +-
 src/simulation/m_compute_levelset.fpp         |   95 +-
 src/simulation/m_data_output.fpp              |    6 +-
 src/simulation/m_fftw.fpp                     |  200 ++-
 src/simulation/m_hyperelastic.fpp             |    2 +-
 src/simulation/m_ib_patches.fpp               |   28 +-
 src/simulation/m_ibm.fpp                      |    8 +-
 src/simulation/m_pressure_relaxation.fpp      |   12 +-
 src/simulation/m_qbmm.fpp                     |    6 +-
 src/simulation/m_riemann_solvers.fpp          |   28 +-
 src/simulation/m_sim_helpers.fpp              |   26 +-
 src/simulation/m_surface_tension.fpp          |    2 +-
 src/simulation/m_time_steppers.fpp            |    6 +-
 src/simulation/m_viscous.fpp                  |   18 +-
 src/simulation/m_weno.fpp                     |    2 +-
 toolchain/bootstrap/modules.sh                |   11 +-
 toolchain/dependencies/CMakeLists.txt         |   11 +-
 toolchain/mfc/build.py                        |    1 +
 toolchain/mfc/run/input.py                    |   10 +
 toolchain/mfc/run/run.py                      |   30 +
 toolchain/mfc/state.py                        |    4 +-
 toolchain/mfc/test/test.py                    |   24 +-
 toolchain/modules                             |    9 +
 .../patches/fypp-linemarker-resync.patch      |   31 +-
 toolchain/templates/include/helpers.mako      |    9 +
 40 files changed, 1479 insertions(+), 790 deletions(-)
 create mode 100644 docs/documentation/intel-gpu-max.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72258149f3..2b7810dd84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,8 @@ option(MFC_DOCUMENTATION "Build documentation"                               OFF
 option(MFC_ALL           "Build everything"                                  OFF)
 option(MFC_SINGLE_PRECISION "Build single precision"                         OFF)
 option(MFC_MIXED_PRECISION "Build mixed precision"                           OFF)
+option(MFC_Intel_AOT       "Build Intel GPU with AOT compilation (no JIT)"  OFF)
+set(MFC_Intel_AOT_DEVICE "pvc" CACHE STRING "Intel GPU AOT target device (e.g. pvc, dg2)")
 
 if (MFC_ALL)
     set(MFC_PRE_PROCESS   ON FORCE)
@@ -249,6 +251,42 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
     elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
         add_compile_options(-g -Og -traceback -check bounds)
     endif()
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>
+                        $<$<COMPILE_LANGUAGE:Fortran>:-fpp>)
+
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        add_compile_options(-g -Og -traceback -debug -check all)
+    elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
+        add_compile_options(-g -Og -traceback -check bounds)
+    endif()
+
+    # mpiifx calls 'ifx' via eval. If ifx is not in PATH, cmake's compiler
+    # capability tests (OpenMP detection, IPO check) fail with "ifx: not found".
+    # Locate ifx relative to mpiifx's oneapi tree so cmake tests work without
+    # the user having to load modules before every build.
+    get_filename_component(_fc_name "${CMAKE_Fortran_COMPILER}" NAME)
+    if (_fc_name MATCHES "^mpi")
+        get_filename_component(_mpi_bin  "${CMAKE_Fortran_COMPILER}" DIRECTORY)
+        get_filename_component(_mpi_ver  "${_mpi_bin}"               DIRECTORY)
+        get_filename_component(_mpi_root "${_mpi_ver}"               DIRECTORY)
+        get_filename_component(_oneapi   "${_mpi_root}"              DIRECTORY)
+        file(GLOB _ifx_bins "${_oneapi}/compiler/*/bin/ifx")
+        if (_ifx_bins)
+            list(GET _ifx_bins 0 _ifx_bin)
+            get_filename_component(_ifx_dir "${_ifx_bin}" DIRECTORY)
+            set(ENV{PATH} "${_ifx_dir}:$ENV{PATH}")
+            message(STATUS "MFC: mpiifx detected — added ifx to PATH: ${_ifx_dir}")
+        endif()
+        unset(_mpi_bin)
+        unset(_mpi_ver)
+        unset(_mpi_root)
+        unset(_oneapi)
+        unset(_ifx_bins)
+        unset(_ifx_bin)
+        unset(_ifx_dir)
+    endif()
+    unset(_fc_name)
 elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_Fortran_COMPILER_ID STREQUAL "PGI"))
     add_compile_options(
         $<$<COMPILE_LANGUAGE:Fortran>:-Mfreeform>
@@ -560,8 +598,18 @@ exit 0
         )
 
         if (MFC_MPI AND ARGS_MPI)
-            find_package(MPI COMPONENTS Fortran REQUIRED)
-	    
+            # When the Fortran compiler is an MPI wrapper (e.g. mpiifx), skip MPI
+            # auto-detection: cmake's FindMPI probes with -showme:compile which hangs
+            # on Intel MPI. The wrapper already injects all MPI includes and link flags.
+            get_filename_component(_fc_basename "${CMAKE_Fortran_COMPILER}" NAME)
+            if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND _fc_basename MATCHES "^mpi")
+                if (NOT TARGET MPI::MPI_Fortran)
+                    add_library(MPI::MPI_Fortran INTERFACE IMPORTED)
+                endif()
+            else()
+                find_package(MPI COMPONENTS Fortran REQUIRED)
+            endif()
+
             target_compile_definitions(${a_target} PRIVATE MFC_MPI)
             if(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang" AND
                DEFINED ENV{CRAY_MPICH_INC} AND NOT "$ENV{CRAY_MPICH_INC}" STREQUAL "")
@@ -595,6 +643,71 @@ exit 0
                             HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                         target_link_libraries(${a_target} PRIVATE ${HIPFFT_LIB})
                     endif()
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+                    # Intel GPU: oneMKL DFTI with !$omp dispatch for GPU FFT.
+                    # Requires MKLROOT to be set (via oneAPI module or env).
+                    if (NOT DEFINED ENV{MKLROOT})
+                        message(FATAL_ERROR "MKLROOT is not set. Load oneAPI MKL module before building.")
+                    endif()
+                    # Compile mkl_dfti_omp_offload.f90 in isolation with minimal flags.
+                    # The file uses !$omp declare variant with need_device_addr (OpenMP 5.2)
+                    # which requires the global -free -fpp flags to be absent so the
+                    # compiler parses it in standard fixed/free detection mode only.
+                    set(_mkl_omp_src "$ENV{MKLROOT}/include/mkl_dfti_omp_offload.f90")
+                    if (NOT EXISTS "${_mkl_omp_src}")
+                        message(FATAL_ERROR "mkl_dfti_omp_offload.f90 not found in $ENV{MKLROOT}/include")
+                    endif()
+                    set(_mkl_omp_mod_dir "${CMAKE_CURRENT_BINARY_DIR}/mkl_omp_mods")
+                    set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o")
+                    set(_mkl_omp_obj_host "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload_host.o")
+                    file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}")
+                    # Find clang-offload-bundler (in ifx's bin/compiler/ subdir).
+                    # CMAKE_Fortran_COMPILER may be an MPI wrapper (mpiifx); resolve the
+                    # underlying ifx from PATH so the HINTS point to the right directory.
+                    find_program(_IFX_REAL ifx REQUIRED)
+                    cmake_path(GET _IFX_REAL PARENT_PATH _ifx_bin)
+                    find_program(CLANG_OFFLOAD_BUNDLER
+                        NAMES clang-offload-bundler
+                        HINTS "${_ifx_bin}/compiler" "${_ifx_bin}"
+                        REQUIRED)
+                    add_custom_command(
+                        OUTPUT  "${_mkl_omp_obj}"
+                                "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.mod"
+                        COMMAND "${CMAKE_Fortran_COMPILER}"
+                                -fiopenmp -fopenmp-targets=spir64
+                                -c -I"$ENV{MKLROOT}/include"
+                                "${_mkl_omp_src}"
+                                -o "${_mkl_omp_obj}"
+                        WORKING_DIRECTORY "${_mkl_omp_mod_dir}"
+                        DEPENDS "${_mkl_omp_src}"
+                        COMMENT "Compiling MKL OMP offload Fortran module (mkl_dfti_omp_offload)"
+                    )
+                    # Strip the SPIR-V device bundle so the linked object has only host code.
+                    # The SPIR-V contains Import declarations for MKL SYCL DFT functions that
+                    # the OpenMP Level Zero plugin cannot resolve, causing zeModuleDynamicLink
+                    # failure. With host-only code, !$omp dispatch falls back to CPU for DFT.
+                    add_custom_command(
+                        OUTPUT  "${_mkl_omp_obj_host}"
+                        COMMAND "${CLANG_OFFLOAD_BUNDLER}"
+                                --unbundle --type=o
+                                --targets=host-x86_64-unknown-linux-gnu
+                                --input="${_mkl_omp_obj}"
+                                --output="${_mkl_omp_obj_host}"
+                        DEPENDS "${_mkl_omp_obj}"
+                        COMMENT "Stripping SPIR-V from MKL DFT object (host-only, fixes Level Zero link)"
+                    )
+                    add_custom_target(mkl_omp_offload_mod_${a_target}
+                        DEPENDS "${_mkl_omp_obj_host}")
+                    add_dependencies(${a_target} mkl_omp_offload_mod_${a_target})
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{MKLROOT}/include" "${_mkl_omp_mod_dir}")
+                    target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj_host}")
+                    # Link MKL threading + core + SYCL DFT backend
+                    target_link_options(${a_target} PRIVATE -qmkl=parallel)
+                    find_library(MKL_SYCL_DFT mkl_sycl_dft HINTS "$ENV{MKLROOT}/lib" REQUIRED)
+                    find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
+                    find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE ${MKL_SYCL_DFT} ${SYCL_LIB} ${OPENCL_LIB})
                 else()
                     find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
                     target_link_libraries(${a_target} PRIVATE hipfort::hipfft)
@@ -636,9 +749,23 @@ exit 0
                     target_compile_options(${a_target} PRIVATE "-mp=gpu" "-Minfo=mp")
                     target_link_options(${a_target} PRIVATE "-mp=gpu")
                     set_target_properties(${a_target} PROPERTIES Fortran_FLAGS "-mp=gpu -gpu=ccall")
-                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
-                    target_compile_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
-                    target_link_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+                    # Intel GPU: OpenMP target offload to SPIR64 (Xe-HPC / Ponte Vecchio).
+                    # GPU FFT uses oneMKL DFTI via the OpenMP dispatch construct.
+                    # MFC_Intel_AOT=ON: compile native GPU ISA at build time (no runtime JIT).
+                    if(MFC_Intel_AOT)
+                        # AOT: compile native GPU ISA at build time with ocloc via -Xopenmp-target-backend.
+                        # ocloc uses single-dash flags (-device pvc, not --device pvc).
+                        # -Xopenmp-target-backend goes only on link because ocloc runs at link time;
+                        # putting it on compile options too causes ocloc to see -device twice.
+                        target_compile_options(${a_target} PRIVATE
+                            -fiopenmp -fopenmp-targets=spir64_gen -fp-model=precise)
+                        target_link_options(${a_target} PRIVATE
+                            "SHELL:-fiopenmp -fopenmp-targets=spir64_gen -Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"")
+                    else()
+                        target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64 -fp-model=precise)
+                        target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                    endif()
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
@@ -753,6 +880,13 @@ if (MFC_PRE_PROCESS)
         # via cross-file inlining. Safe to disable IPA for the whole target
         # (CPU-only, no GPU device-call requirements). See PR #1286.
         target_compile_options(pre_process PRIVATE -Oipa0)
+    elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+        # ifx stack-allocates compiler-generated temporaries by default; large
+        # ones (e.g. 8 MB MPI I/O contiguous copies) overflow the stack when the
+        # Level Zero / OpenMP offload runtime has already consumed stack space.
+        # Note: -heap-arrays triggers an ICE in ifx's SPIR-V backend for simulation
+        # (GPU device code), so it is applied only to CPU-only targets here.
+        target_compile_options(pre_process PRIVATE $<$<COMPILE_LANGUAGE:Fortran>:-heap-arrays>)
     endif()
 endif()
 
@@ -782,6 +916,17 @@ if (MFC_SIMULATION)
             target_compile_options(simulation PRIVATE -Oipa0)
         endif()
     endif()
+    # ifx SPIR64 ICE: the LLVM inliner pulls !$omp declare target (seq) geometry
+    # routines into target teams loop kernels and generates SPIR-V IR that crashes
+    # llvm-spirv at O1+. -fno-inline keeps them as proper device-side calls.
+    # Each GPU loop calls exactly one geometry routine (split-loop pattern in
+    # m_compute_levelset.fpp), so device-call overhead is small. See PR intel-gpu.
+    if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
+        set_source_files_properties(
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
+            PROPERTIES COMPILE_OPTIONS "-fno-inline"
+        )
+    endif()
 endif()
 
 if (MFC_POST_PROCESS)
diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
new file mode 100644
index 0000000000..576e68f6a7
--- /dev/null
+++ b/docs/documentation/intel-gpu-max.md
@@ -0,0 +1,263 @@
+# Building MFC for Intel Data Center GPU Max (Ponte Vecchio)
+
+This documents how to build and run MFC with Intel GPU Max (Xe-HPC / Ponte Vecchio)
+using ifx OpenMP target offload to SPIR64, as tested on GT CRNCH RoboGator (`dash3`).
+
+## System configuration
+
+| Component | Version / Path |
+|---|---|
+| Hardware | Intel Data Center GPU Max 1100 (Ponte Vecchio, PCI 8086:0bda) |
+| OS | Linux (RHEL 8 compatible, kernel 5.15) |
+| Fortran compiler | ifx 2025.3.3 (`/opt/intel/oneapi/compiler/2025.3/`) |
+| MKL | oneMKL 2026.0 (`/opt/intel/oneapi/mkl/2026.0/`) |
+| SYCL runtime | `libsycl.so` in `/opt/intel/oneapi/compiler/2026.0/lib/` |
+| GPU device | `/dev/dri/renderD128` (requires `render` group membership) |
+
+## Environment setup
+
+Load the required oneAPI environment before building or running:
+
+```bash
+export PATH=/opt/intel/oneapi/compiler/2025.3/bin:$PATH
+export MKLROOT=/opt/intel/oneapi/mkl/2026.0
+export LIBRARY_PATH=/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LIBRARY_PATH
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\
+/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LD_LIBRARY_PATH
+export FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx
+```
+
+> **Important**: `FC` must be set explicitly. Without it, CMake may cache an older
+> ifx (2025.0) from a system module, which does not support `need_device_addr` in
+> the MKL 2026.0 OpenMP offload Fortran module.
+
+> **Important**: `LIBRARY_PATH` (not just `LD_LIBRARY_PATH`) must include the
+> compiler 2026.0 lib directory so the linker finds `libsycl.so` at build time.
+
+> **Important**: `/opt/intel/oneapi/umf/1.1/lib` must be in `LD_LIBRARY_PATH` at
+> runtime. The Level Zero and OpenCL UR adapters in the 2026.0 compiler depend on
+> `libumf.so.1`, which lives in the separate `umf/1.1` package, not in the compiler
+> lib directories themselves.
+
+## Building
+
+```bash
+./mfc.sh build -t simulation --gpu mp --no-mpi -j 8
+```
+
+- `--gpu mp`: OpenMP target offload backend (SPIR64)
+- `--no-mpi`: omit for MPI-enabled runs; include for single-node testing
+- `-j 8`: parallel build jobs
+
+MFC will automatically:
+1. Compile `$MKLROOT/include/mkl_dfti_omp_offload.f90` with minimal flags
+   (no `-free -fpp`) via a CMake `add_custom_command` to avoid OpenMP 5.2
+   clause compatibility issues with global compile flags
+2. Link `-qmkl=parallel` for MKL threading + core
+3. Link `libmkl_sycl_dft`, `libsycl`, `libOpenCL` for GPU FFT dispatch
+
+## GPU FFT implementation
+
+MFC uses oneMKL DFTI with the OpenMP 5.1 `!$omp dispatch` construct for FFT
+in cylindrical geometry (the azimuthal Fourier filter in `m_fftw.fpp`).
+This is activated when `__INTEL_LLVM_COMPILER` is defined (i.e., compiled with ifx).
+
+Key verified properties (oneMKL 2026.0, ifx 2025.3.3):
+- Batch R2C transform with `INPUT_DISTANCE != OUTPUT_DISTANCE` works correctly.
+  MFC uses `real_size = p+1` and `cmplx_size = (p+1)/2+1` which always differ.
+- `!$omp dispatch` correctly dispatches DFTI calls to device-mapped allocatables.
+
+## Running MFC cases
+
+Build all three targets (pre_process, simulation, post_process) before running:
+
+```bash
+./mfc.sh build --gpu mp --no-mpi -j 8
+```
+
+Then run a case normally:
+
+```bash
+./mfc.sh run examples/1D_convergence/case.py --no-build --no-mpi
+```
+
+To run individual stages directly (useful when `syscheck` blocks due to GPU access):
+
+```bash
+export MKLROOT=/opt/intel/oneapi/mkl/2026.0
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\
+/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LD_LIBRARY_PATH
+
+cd examples/my_case
+/path/to/build/install/<hash>/bin/pre_process
+/path/to/build/install/<hash>/bin/simulation
+```
+
+The install hashes are printed by `./mfc.sh build`; look for lines like
+`✓ Installed simulation`.
+
+## Chemistry/pyrometheus cases
+
+MFC's pyrometheus-generated thermochemistry (`m_thermochem.f90`) works with the
+Intel GPU build. Pyrometheus emits `!$omp declare target` for all thermochem
+routines when `directive_offload="mp"` is requested (automatically set when
+`--gpu mp` is active).
+
+Verified on ifx 2025.3.3: `m_thermochem.f90` and `m_chemistry.fpp` both compile
+at O3 + SPIR64 without ICE. The `1D_reactive_shocktube` example (H2/O2/Ar, 29
+reactions, 10 species) runs to completion. No Intel-specific source workarounds
+are needed for chemistry beyond the general levelset fix.
+
+To build and run a reactive chemistry case:
+```bash
+# Build (chemistry module generated automatically from cantera_file in case.py)
+./mfc.sh run examples/1D_reactive_shocktube/case.py \
+    --gpu mp --no-mpi -t pre_process simulation
+
+# Or bypass syscheck if no GPU render-group access:
+cd examples/1D_reactive_shocktube
+/path/to/build/install/<pre_hash>/bin/pre_process
+OMP_TARGET_OFFLOAD=DISABLED \
+    /path/to/build/install/<sim_hash>/bin/simulation
+```
+
+ifx warning during chemistry compilation:
+```
+warning #8694: A procedure called by a procedure with the DECLARE TARGET
+attribute must have the DECLARE TARGET attribute.  [GET_MIXTURE_VISCOSITY_MIXAVG]
+```
+This is a false positive from ifx's module-interface tracking; `m_thermochem.f90`
+does declare all routines as target. The warning is harmless and the code runs
+correctly.
+
+## GPU device access
+
+The Intel GPU requires membership in the `render` group (GID 109) to access
+`/dev/dri/renderD128` via Level Zero.
+
+Without render group access, `ZE_RESULT_ERROR_UNINITIALIZED` is returned by
+Level Zero. OpenMP target offload falls back to the CPU host plugin
+(correct results, but no GPU acceleration).
+
+To diagnose GPU visibility:
+
+```bash
+ls -la /dev/dri/renderD128          # should show rw permissions for your user/group=render
+
+# With full LD_LIBRARY_PATH set:
+LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:... \
+    /opt/intel/oneapi/compiler/2026.0/bin/sycl-ls --verbose
+# Look for: "[opencl:gpu]" or "[ext_oneapi_level_zero:gpu]" platforms
+
+LIBOMPTARGET_DEBUG=1 ./simulation   # look for "Level0 NG plugin initialization"
+                                    # and absence of "ZE_RESULT_ERROR_UNINITIALIZED"
+```
+
+To get GPU access:
+- **Interactive shell**: request from system admin to add user to `render` group
+  (`sudo usermod -a -G render $USER`, then re-login)
+- **SLURM**: submit with `--gres=gpu:max_1100=1`; if Level Zero still fails,
+  the SLURM epilog may not have configured device cgroup ACLs for the job user —
+  contact the system admin
+
+> **Note on `sycl-ls`**: the 2026.0 `sycl-ls` requires `libumf.so.1` from
+> `/opt/intel/oneapi/umf/1.1/lib` in `LD_LIBRARY_PATH`, otherwise all adapters
+> fail to load and it reports "No platforms found".
+
+## Link flags (what MFC's CMake generates)
+
+The full set of flags the compiler uses for the simulation target:
+
+**Compile flags:**
+```
+-fiopenmp -fopenmp-targets=spir64 -free -fpp -march=native
+```
+
+**Link flags:**
+```
+-fiopenmp -fopenmp-targets=spir64
+-qmkl=parallel
+-L$MKLROOT/lib -lmkl_sycl_dft
+-L/opt/intel/oneapi/compiler/2026.0/lib -lsycl -lOpenCL
+```
+
+**MKL OMP module (compiled separately, no global flags):**
+```bash
+ifx -fiopenmp -fopenmp-targets=spir64 \
+    -c -I$MKLROOT/include \
+    $MKLROOT/include/mkl_dfti_omp_offload.f90 \
+    -o mkl_dfti_omp_offload.o
+```
+
+## Known issues
+
+### `need_device_addr` compilation error
+`mkl_dfti_omp_offload.f90` from MKL 2026.0 uses `need_device_addr` in
+`!$omp declare variant` (OpenMP 5.2). This requires ifx **2025.3** or newer.
+If CMake finds an older ifx (e.g., 2025.0 from a system module path), the
+compile fails with:
+```
+error #5082: Syntax error, found IDENTIFIER 'NEED_DEVICE_ADDR'
+```
+Fix: set `FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx` before building
+and run `./mfc.sh clean` first so CMake re-detects the compiler.
+
+### Two routines with ifx SPIR64 codegen bugs
+
+**`s_apply_levelset` (`m_compute_levelset.fpp`)** — ifx SPIR64 inliner ICE:
+
+The LLVM inliner (at O1+) pulls `!$omp declare target (seq)` geometry
+routines into the `target teams loop` kernel and generates LLVM IR that
+crashes the SPIR-V converter with a segfault in `llvm-spirv`. At O0 the
+crash does not occur (no inlining). Two fixes combined:
+
+1. **Split loops**: replaced the single if-else dispatch loop with one
+   `GPU_PARALLEL_LOOP` per geometry type so each kernel calls exactly one
+   declare-target routine. The original multi-callee dispatch also triggers
+   `"Instruction does not dominate all uses!"` in llvm-link.
+
+2. **Per-file `-fno-inline`**: in `CMakeLists.txt`, `set_source_files_properties`
+   adds `-fno-inline` to `m_compute_levelset.fpp.f90` for IntelLLVM+OpenMP
+   builds. This prevents the inliner from pulling declare-target routines into
+   the kernel body where they crash the SPIR-V backend. The routines remain
+   callable as proper device-side function calls via `!$omp declare target`.
+
+**`s_pressure_relaxation_procedure` (`m_pressure_relaxation.fpp`)** — SPIR-V
+InvalidArraySize in declare-target helpers:
+
+`!$omp declare target (seq)` routines with `dimension(sys_size)` explicit-shape
+dummy arguments trigger `"InvalidArraySize: Array size must be at least 1"` in
+llvm-spirv. SPIR-V requires compile-time constant array bounds; `sys_size` is
+a runtime module integer. Fixed by changing `dimension(sys_size)` →
+`dimension(:)` (assumed-shape) on all helper routine interfaces. The loop now
+runs on GPU for Intel.
+
+### syscheck GPU assertion
+`syscheck` runs `assert(omp_get_num_devices() > 0)` and aborts if the GPU
+is not accessible. This is a runtime check, not a build issue. See GPU device
+access section above.
+
+To run a case anyway (testing code correctness on CPU fallback), invoke
+`pre_process` and `simulation` directly from their install paths, bypassing
+the `./mfc.sh run` wrapper that calls `syscheck` first.
+
+### `libumf.so.1` not found at runtime
+The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`.
+If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports
+"No platforms found". Fix:
+
+```bash
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:$LD_LIBRARY_PATH
+```
diff --git a/examples/3D_performance_test/case.py b/examples/3D_performance_test/case.py
index 4e08a8ea1b..8b26fa07dd 100644
--- a/examples/3D_performance_test/case.py
+++ b/examples/3D_performance_test/case.py
@@ -14,26 +14,13 @@
             "y_domain%end": 4.0e-03 / 1.0e-03,
             "z_domain%beg": 0.0e00,
             "z_domain%end": 4.0e-03 / 1.0e-03,
-            "stretch_x": "T",
-            "a_x": 4.0e00,
-            "x_a": -1.5e-03 / 1.0e-03,
-            "x_b": 1.5e-03 / 1.0e-03,
-            "stretch_y": "T",
-            "a_y": 4.0e00,
-            "y_a": -1.5e-03 / 1.0e-03,
-            "y_b": 1.5e-03 / 1.0e-03,
-            "stretch_z": "T",
-            "a_z": 4.0e00,
-            "z_a": -1.5e-03 / 1.0e-03,
-            "z_b": 1.5e-03 / 1.0e-03,
-            "cyl_coord": "F",
             "m": 200,
             "n": 200,
             "p": 200,
-            "dt": 0.2e-09 / 1.0e-03,
+            "dt": 0.2e-09,
             "t_step_start": 0,
-            "t_step_stop": 30,
-            "t_step_save": 30,
+            "t_step_stop": 10,
+            "t_step_save": 10,
             # Simulation Algorithm Parameters
             "num_patches": 2,
             "model_eqns": 2,
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
index 7620e7607f..00f26a99a2 100644
--- a/src/common/include/omp_macros.fpp
+++ b/src/common/include/omp_macros.fpp
@@ -2,7 +2,7 @@
 
 #:set NVIDIA_COMPILER_ID="NVHPC"
 #:set PGI_COMPILER_ID="PGI"
-#:set INTEL_COMPILER_ID="Intel"
+#:set INTEL_COMPILER_ID="IntelLLVM"
 #:set CCE_COMPILER_ID="Cray"
 #:set AMD_COMPILER_ID="LLVMFlang"
 
@@ -182,6 +182,20 @@
         #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) '
     #:elif MFC_COMPILER == AMD_COMPILER_ID
         #:set omp_start_directive = '!$omp target teams distribute parallel do '
+    #:elif MFC_COMPILER == INTEL_COMPILER_ID
+        #! Intel OMP 5.2: bind(a,b) is invalid. Drop explicit firstprivate list:
+        #! ifx rejects firstprivate for declare-target module variables (#7655).
+        #! OMP 5.0 rule: unclaused scalars in target constructs are firstprivate
+        #! by default, so scalar locals (e.g. gp_layers_z) are covered implicitly.
+        #! declare-target vars (e.g. gp_layers) are device-resident -- no mapping
+        #! needed. defaultmap(firstprivate:scalar) also unsupported by ifx (#9061).
+        #:set omp_start_directive = '!$omp target teams loop '
+        #:set clause_val = collapse_val.strip('\n') + &
+            & default_val.strip('\n') + GEN_PRIVATE_STR(private, False).strip('\n') + &
+            & reduction_val.strip('\n') + copy_val.strip('\n') + copyin_val.strip('\n') + &
+            & copyout_val.strip('\n') + create_val.strip('\n') + &
+            & no_create_val.strip('\n') + present_val.strip('\n') + &
+            & deviceptr_val.strip('\n') + attach_val.strip('\n')
     #:else
         #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
     #:endif
@@ -197,6 +211,8 @@
         #:set omp_end_directive = '!$omp end target teams distribute parallel do simd'
     #:elif MFC_COMPILER == AMD_COMPILER_ID
         #:set omp_end_directive = '!$omp end target teams distribute parallel do'
+    #:elif MFC_COMPILER == INTEL_COMPILER_ID
+        #:set omp_end_directive = '!$omp end target teams loop'
     #:else
         #:set omp_end_directive = '!$omp end target teams loop'
     #:endif
@@ -218,7 +234,9 @@
         #:set function_name_val = ''
     #:endif
 
-    #:if MFC_COMPILER == AMD_COMPILER_ID
+    #:if MFC_COMPILER == AMD_COMPILER_ID or MFC_COMPILER == INTEL_COMPILER_ID
+        #! AMD: device_type unsupported. Intel: OpenMP 5.2 requires an enter/to/link/local
+        #! clause alongside device_type; omit device_type entirely for both.
         #:set clause_val = ''
     #:else
         #:set clause_val = nohost_val.strip('\n')
@@ -374,4 +392,10 @@
         $:code
     #:endif
 #:enddef
+
+#:def OMP_MKL_DISPATCH()
+    #:if MFC_COMPILER == INTEL_COMPILER_ID
+        !$omp dispatch
+    #:endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index b1382ec49a..5f3ec2ccb0 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -303,4 +303,12 @@
 
 #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID)
 #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID)
+
+! Dispatch to oneMKL GPU FFT via OpenMP dispatch construct (Intel GPU only)
+#:def GPU_MKL_DISPATCH()
+    #:set omp_code = OMP_MKL_DISPATCH()
+#if defined(MFC_OpenMP)
+    $:omp_code
+#endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp
index 36bee0a23a..fc3f783e98 100644
--- a/src/common/include/shared_parallel_macros.fpp
+++ b/src/common/include/shared_parallel_macros.fpp
@@ -1,12 +1,13 @@
 #:set NVIDIA_COMPILER_ID="NVHPC"
 #:set PGI_COMPILER_ID="PGI"
-#:set INTEL_COMPILER_ID="Intel"
+#:set INTEL_COMPILER_ID="IntelLLVM"
 #:set CCE_COMPILER_ID="Cray"
 #:set AMD_COMPILER_ID="LLVMFlang"
 
 #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID)
 #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID)
 #:set USING_AMD = (MFC_COMPILER == AMD_COMPILER_ID)
+#:set USING_INTEL = (MFC_COMPILER == INTEL_COMPILER_ID)
 
 #:def ASSERT_LIST(data, datatype)
     #:assert data is not None
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index cd15530368..196410b1f1 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -120,7 +120,7 @@ contains
         real(wp)                                               :: T
         real(wp)                                               :: rho, omega_m
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: Ys
             real(wp), dimension(10) :: omega
         #:else
@@ -168,7 +168,7 @@ contains
         integer, intent(in)                                    :: idir
         type(scalar_field), intent(in)                         :: q_T_sf
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
             real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
             real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 6bd9718dbb..763d0566ab 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -27,10 +27,10 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in)                :: vftmp
-        real(wp), dimension(nb), intent(in) :: Rtmp
         real(wp), intent(out)               :: ntmp
-        real(wp), dimension(nb), intent(in) :: weights
         real(wp)                            :: R3
+        real(wp), dimension(nb), intent(in) :: Rtmp
+        real(wp), dimension(nb), intent(in) :: weights
 
         R3 = dot_product(weights, Rtmp**3._wp)
         ntmp = (3._wp/(4._wp*pi))*vftmp/R3
@@ -42,10 +42,10 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), intent(in)                :: vftmp
-        real(wp), dimension(nb), intent(in) :: nRtmp
         real(wp), intent(out)               :: ntmp
-        real(wp), dimension(nb), intent(in) :: weights
         real(wp)                            :: nR3
+        real(wp), dimension(nb), intent(in) :: nRtmp
+        real(wp), dimension(nb), intent(in) :: weights
 
         nR3 = dot_product(weights, nRtmp**3._wp)
         ntmp = sqrt((4._wp*pi/3._wp)*nR3/vftmp)
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 7d4b92705c..cc42771028 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -799,716 +799,712 @@ contains
                                 do i = 1, nVar
                                     r = (i - 1) + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
                                     q_comm(i)%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_comm(i)%sf(j + unpack_offset, k, l))) then
-                                        print *, "Error", j, k, l, i
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                                    print *, "Error", j, k, l, i
+                                    call s_mpi_abort("NaN(s) in recv")
+                                end if
 #endif
-                                end do
                             end do
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
+                end do
+                $:END_GPU_PARALLEL_LOOP()
 
-                    if (chem_diff_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                        do l = 0, p
-                            do k = 0, n
-                                do j = -buff_size, -1
-                                    r = nVar + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
-                                    q_T_sf%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                if (chem_diff_comm) then
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, p
+                        do k = 0, n
+                            do j = -buff_size, -1
+                                r = nVar + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
+                                q_T_sf%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_T_sf%sf(j + unpack_offset, k, l))) then
-                                        print *, "Error", j, k, l
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                                print *, "Error", j, k, l
+                                call s_mpi_abort("NaN(s) in recv")
+                            end if
 #endif
-                                end do
-                            end do
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
 
-                    if (qbmm_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do l = 0, p
-                            do k = 0, n
-                                do j = -buff_size, -1
-                                    do i = nVar + 1, nVar + nnode
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
-                                            pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
+            if (qbmm_comm) then
+                $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+                do l = 0, p
+                    do k = 0, n
+                        do j = -buff_size, -1
+                            do i = nVar + 1, nVar + nnode
+                                do q = 1, nb
+                                    r = (i - 1) + (q - 1)*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
+                                    pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                                 end do
                             end do
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do l = 0, p
-                            do k = 0, n
-                                do j = -buff_size, -1
-                                    do i = nVar + 1, nVar + nnode
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
-                                            mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+                do l = 0, p
+                    do k = 0, n
+                        do j = -buff_size, -1
+                            do i = nVar + 1, nVar + nnode
+                                do q = 1, nb
+                                    r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
+                                    mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                                 end do
                             end do
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-                #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
-                    do i = 1, nVar
-                        do l = 0, p
-                            do k = -buff_size, -1
-                                do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
-                                    q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:elif mpi_dir == 2
+            $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
+            do i = 1, nVar
+                do l = 0, p
+                    do k = -buff_size, -1
+                        do j = -buff_size, m + buff_size
+                            r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
+                            q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_comm(i)%sf(j, k + unpack_offset, l))) then
-                                        print *, "Error", j, k, l, i
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                            print *, "Error", j, k, l, i
+                            call s_mpi_abort("NaN(s) in recv")
+                        end if
 #endif
-                                end do
-                            end do
-                        end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (chem_diff_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                        do l = 0, p
-                            do k = -buff_size, -1
-                                do j = -buff_size, m + buff_size
-                                    r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
-                                    q_T_sf%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        if (chem_diff_comm) then
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+            do l = 0, p
+                do k = -buff_size, -1
+                    do j = -buff_size, m + buff_size
+                        r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
+                        q_T_sf%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_T_sf%sf(j, k + unpack_offset, l))) then
-                                        print *, "Error", j, k, l
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
-#endif
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
+                        print *, "Error", j, k, l
+                        call s_mpi_abort("NaN(s) in recv")
                     end if
-
-                    if (qbmm_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = 0, p
-                                do k = -buff_size, -1
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k &
-                                                 & + buff_size) + buff_size*l))
-                                            pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
+#endif
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+    end if
+
+    if (qbmm_comm) then
+        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+        do i = nVar + 1, nVar + nnode
+            do l = 0, p
+                do k = -buff_size, -1
+                    do j = -buff_size, m + buff_size
+                        do q = 1, nb
+                            r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) &
+                                 & + buff_size*l))
+                            pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = 0, p
-                                do k = -buff_size, -1
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size &
-                                                 & + 1)*((k + buff_size) + buff_size*l))
-                                            mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
+                    end do
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+        do i = nVar + 1, nVar + nnode
+            do l = 0, p
+                do k = -buff_size, -1
+                    do j = -buff_size, m + buff_size
+                        do q = 1, nb
+                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k &
+                                 & + buff_size) + buff_size*l))
+                            mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-                #:else
-                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
-                    do i = 1, nVar
-                        do l = -buff_size, -1
-                            do k = -buff_size, n + buff_size
-                                do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n &
-                                         & + 2*buff_size + 1)*(l + buff_size)))
-                                    q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                    end do
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+    end if
+#:else
+    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
+    do i = 1, nVar
+        do l = -buff_size, -1
+            do k = -buff_size, n + buff_size
+                do j = -buff_size, m + buff_size
+                    r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l &
+                         & + buff_size)))
+                    q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_comm(i)%sf(j, k, l + unpack_offset))) then
-                                        print *, "Error", j, k, l, i
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                    print *, "Error", j, k, l, i
+                    call s_mpi_abort("NaN(s) in recv")
+                end if
 #endif
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (chem_diff_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                        do l = -buff_size, -1
-                            do k = -buff_size, n + buff_size
-                                do j = -buff_size, m + buff_size
-                                    r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n &
-                                                       & + 2*buff_size + 1)*(l + buff_size)))
-                                    q_T_sf%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+            end do
+        end do
+    end do
+end do
+$:END_GPU_PARALLEL_LOOP()
+
+if (chem_diff_comm) then
+    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+    do l = -buff_size, -1
+        do k = -buff_size, n + buff_size
+            do j = -buff_size, m + buff_size
+                r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l &
+                                   & + buff_size)))
+                q_T_sf%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_T_sf%sf(j, k, l + unpack_offset))) then
-                                        print *, "Error", j, k, l
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
-#endif
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-
-                    if (qbmm_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = -buff_size, -1
-                                do k = -buff_size, n + buff_size
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k &
-                                                 & + buff_size) + (n + 2*buff_size + 1)*(l + buff_size)))
-                                            pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = -buff_size, -1
-                                do k = -buff_size, n + buff_size
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size &
-                                                 & + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l + buff_size)))
-                                            mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-                #:endif
+                print *, "Error", j, k, l
+                call s_mpi_abort("NaN(s) in recv")
             end if
-        #:endfor
-        call nvtxEndRange
+#endif
+        end do
+    end do
+end do
+$:END_GPU_PARALLEL_LOOP()
+end if
+
+if (qbmm_comm) then
+    $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+    do i = nVar + 1, nVar + nnode
+        do l = -buff_size, -1
+            do k = -buff_size, n + buff_size
+                do j = -buff_size, m + buff_size
+                    do q = 1, nb
+                        r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n &
+                             & + 2*buff_size + 1)*(l + buff_size)))
+                        pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
+                    end do
+                end do
+            end do
+        end do
+    end do
+    $:END_GPU_PARALLEL_LOOP()
+
+    $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+    do i = nVar + 1, nVar + nnode
+        do l = -buff_size, -1
+            do k = -buff_size, n + buff_size
+                do j = -buff_size, m + buff_size
+                    do q = 1, nb
+                        r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) &
+                             & + (n + 2*buff_size + 1)*(l + buff_size)))
+                        mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
+                    end do
+                end do
+            end do
+        end do
+    end do
+    $:END_GPU_PARALLEL_LOOP()
+end if
+#:endif
+end if
+#:endfor
+call nvtxEndRange
 #endif
 
-    end subroutine s_mpi_sendrecv_variables_buffers
+end subroutine s_mpi_sendrecv_variables_buffers
 
-    !> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction.
-    subroutine s_mpi_decompose_computational_domain
+!> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction.
+subroutine s_mpi_decompose_computational_domain
 
 #ifdef MFC_MPI
-        integer :: num_procs_x, num_procs_y, num_procs_z  !< Optimal number of processors in the x-, y- and z-directions
-        !> Non-optimal number of processors in the x-, y- and z-directions
-        real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z
-        real(wp) :: fct_min        !< Processor factorization (fct) minimization parameter
-        integer  :: MPI_COMM_CART  !< Cartesian processor topology communicator
-        integer  :: rem_cells      !< Remaining cells after distribution among processors
-        integer  :: recon_order    !< WENO or MUSCL reconstruction order
-        integer  :: i, j           !< Generic loop iterators
-        integer  :: ierr           !< Generic flag used to identify and report MPI errors
-
-        if (recon_type == WENO_TYPE) then
-            recon_order = weno_order
-        else
-            recon_order = muscl_order
-        end if
+    integer :: num_procs_x, num_procs_y, num_procs_z  !< Optimal number of processors in the x-, y- and z-directions
+    !> Non-optimal number of processors in the x-, y- and z-directions
+    real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z
+    real(wp) :: fct_min        !< Processor factorization (fct) minimization parameter
+    integer  :: MPI_COMM_CART  !< Cartesian processor topology communicator
+    integer  :: rem_cells      !< Remaining cells after distribution among processors
+    integer  :: recon_order    !< WENO or MUSCL reconstruction order
+    integer  :: i, j           !< Generic loop iterators
+    integer  :: ierr           !< Generic flag used to identify and report MPI errors
+
+    if (recon_type == WENO_TYPE) then
+        recon_order = weno_order
+    else
+        recon_order = muscl_order
+    end if
+
+    if (num_procs == 1 .and. parallel_io) then
+        do i = 1, num_dims
+            start_idx(i) = 0
+        end do
+        return
+    end if
 
-        if (num_procs == 1 .and. parallel_io) then
-            do i = 1, num_dims
-                start_idx(i) = 0
-            end do
-            return
-        end if
+    if (igr) then
+        recon_order = igr_order
+    end if
 
-        if (igr) then
-            recon_order = igr_order
-        end if
+    ! 3D Cartesian Processor Topology
+    if (n > 0) then
+        if (p > 0) then
+            if (fft_wrt) then
+                ! Initial estimate of optimal processor topology
+                num_procs_x = 1
+                num_procs_y = 1
+                num_procs_z = num_procs
+                ierr = -1
 
-        ! 3D Cartesian Processor Topology
-        if (n > 0) then
-            if (p > 0) then
-                if (fft_wrt) then
-                    ! Initial estimate of optimal processor topology
+                ! Benchmarking the quality of this initial guess
+                tmp_num_procs_y = num_procs_y
+                tmp_num_procs_z = num_procs_z
+                fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+
+                ! Optimization of the initial processor topology
+                do i = 1, num_procs
+                    if (mod(num_procs, i) == 0 .and. (n + 1)/i >= num_stcls_min*recon_order) then
+                        tmp_num_procs_y = i
+                        tmp_num_procs_z = num_procs/i
+
+                        if (fct_min >= abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
+                            & /tmp_num_procs_z >= num_stcls_min*recon_order) then
+                            num_procs_y = i
+                            num_procs_z = num_procs/i
+                            fct_min = abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                            ierr = 0
+                        end if
+                    end if
+                end do
+            else
+                if (cyl_coord .and. p > 0) then
+                    ! Pencil blocking for cylindrical coordinates (Fourier filter near axis)
+
+                    ! Initial values of the processor factorization optimization
                     num_procs_x = 1
-                    num_procs_y = 1
-                    num_procs_z = num_procs
+                    num_procs_y = num_procs
+                    num_procs_z = 1
                     ierr = -1
 
-                    ! Benchmarking the quality of this initial guess
+                    ! Computing minimization variable for these initial values
+                    tmp_num_procs_x = num_procs_x
                     tmp_num_procs_y = num_procs_y
                     tmp_num_procs_z = num_procs_z
-                    fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                    fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
 
-                    ! Optimization of the initial processor topology
+                    ! Searching for optimal computational domain distribution
                     do i = 1, num_procs
-                        if (mod(num_procs, i) == 0 .and. (n + 1)/i >= num_stcls_min*recon_order) then
-                            tmp_num_procs_y = i
-                            tmp_num_procs_z = num_procs/i
-
-                            if (fct_min >= abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
-                                & /tmp_num_procs_z >= num_stcls_min*recon_order) then
-                                num_procs_y = i
-                                num_procs_z = num_procs/i
-                                fct_min = abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                        if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
+                            tmp_num_procs_x = i
+                            tmp_num_procs_y = num_procs/i
+
+                            if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
+                                & /tmp_num_procs_y >= num_stcls_min*recon_order) then
+                                num_procs_x = i
+                                num_procs_y = num_procs/i
+                                fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
                                 ierr = 0
                             end if
                         end if
                     end do
                 else
-                    if (cyl_coord .and. p > 0) then
-                        ! Pencil blocking for cylindrical coordinates (Fourier filter near axis)
-
-                        ! Initial values of the processor factorization optimization
-                        num_procs_x = 1
-                        num_procs_y = num_procs
-                        num_procs_z = 1
-                        ierr = -1
-
-                        ! Computing minimization variable for these initial values
-                        tmp_num_procs_x = num_procs_x
-                        tmp_num_procs_y = num_procs_y
-                        tmp_num_procs_z = num_procs_z
-                        fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-
-                        ! Searching for optimal computational domain distribution
-                        do i = 1, num_procs
-                            if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
-                                tmp_num_procs_x = i
-                                tmp_num_procs_y = num_procs/i
-
-                                if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
-                                    & /tmp_num_procs_y >= num_stcls_min*recon_order) then
-                                    num_procs_x = i
-                                    num_procs_y = num_procs/i
-                                    fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-                                    ierr = 0
-                                end if
-                            end if
-                        end do
-                    else
-                        ! Initial estimate of optimal processor topology
-                        num_procs_x = 1
-                        num_procs_y = 1
-                        num_procs_z = num_procs
-                        ierr = -1
-
-                        ! Benchmarking the quality of this initial guess
-                        tmp_num_procs_x = num_procs_x
-                        tmp_num_procs_y = num_procs_y
-                        tmp_num_procs_z = num_procs_z
-                        fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + 10._wp*abs((n + 1) &
-                                             & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
-
-                        ! Optimization of the initial processor topology
-                        do i = 1, num_procs
-                            if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
-                                do j = 1, num_procs/i
-                                    if (mod(num_procs/i, j) == 0 .and. (n + 1)/j >= num_stcls_min*recon_order) then
-                                        tmp_num_procs_x = i
-                                        tmp_num_procs_y = j
-                                        tmp_num_procs_z = num_procs/(i*j)
-
-                                        if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
-                                            & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
-                                            & /tmp_num_procs_z >= num_stcls_min*recon_order) then
-                                            num_procs_x = i
-                                            num_procs_y = j
-                                            num_procs_z = num_procs/(i*j)
-                                            fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
-                                                          & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
-                                            ierr = 0
-                                        end if
-                                    end if
-                                end do
-                            end if
-                        end do
-                    end if
-                end if
-
-                ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation
-                ! exits.
-                if (proc_rank == 0 .and. ierr == -1) then
-                    call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n, p and ' &
-                                     & // 'weno/muscl/igr_order. Exiting.')
-                end if
-
-                ! Creating new communicator using the Cartesian topology
-                call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, num_procs_y, num_procs_z/), (/.true., .true., .true./), &
-                                     & .false., MPI_COMM_CART, ierr)
-
-                ! Finding the Cartesian coordinates of the local process
-                call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr)
-
-                ! Global Parameters for z-direction
-
-                ! Number of remaining cells
-                rem_cells = mod(p + 1, num_procs_z)
-
-                ! Optimal number of cells per processor
-                p = (p + 1)/num_procs_z - 1
-
-                ! Distributing the remaining cells
-                do i = 1, rem_cells
-                    if (proc_coords(3) == i - 1) then
-                        p = p + 1; exit
-                    end if
-                end do
-
-                ! Boundary condition at the beginning
-                if (proc_coords(3) > 0 .or. (bc_z%beg == BC_PERIODIC .and. num_procs_z > 1)) then
-                    proc_coords(3) = proc_coords(3) - 1
-                    call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr)
-                    proc_coords(3) = proc_coords(3) + 1
-                end if
-
-                ! Boundary condition at the end
-                if (proc_coords(3) < num_procs_z - 1 .or. (bc_z%end == BC_PERIODIC .and. num_procs_z > 1)) then
-                    proc_coords(3) = proc_coords(3) + 1
-                    call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr)
-                    proc_coords(3) = proc_coords(3) - 1
-                end if
-
-#ifdef MFC_POST_PROCESS
-                ! Ghost zone at the beginning
-                if (proc_coords(3) > 0 .and. format == 1) then
-                    offset_z%beg = 2
-                else
-                    offset_z%beg = 0
-                end if
+                    ! Initial estimate of optimal processor topology
+                    num_procs_x = 1
+                    num_procs_y = 1
+                    num_procs_z = num_procs
+                    ierr = -1
 
-                ! Ghost zone at the end
-                if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then
-                    offset_z%end = 2
-                else
-                    offset_z%end = 0
-                end if
-#endif
+                    ! Benchmarking the quality of this initial guess
+                    tmp_num_procs_x = num_procs_x
+                    tmp_num_procs_y = num_procs_y
+                    tmp_num_procs_z = num_procs_z
+                    fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + 10._wp*abs((n + 1)/tmp_num_procs_y &
+                                         & - (p + 1)/tmp_num_procs_z)
 
-                ! Beginning and end sub-domain boundary locations
-                if (parallel_io) then
-                    if (proc_coords(3) < rem_cells) then
-                        start_idx(3) = (p + 1)*proc_coords(3)
-                    else
-                        start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
-                    end if
-                else
-#ifdef MFC_PRE_PROCESS
-                    if (old_grid .neqv. .true.) then
-                        dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp)
-
-                        if (proc_coords(3) < rem_cells) then
-                            z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3))
-                            z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1) - (num_procs_z &
-                                                                  & - rem_cells))
-                        else
-                            z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3) + rem_cells)
-                            z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1))
+                    ! Optimization of the initial processor topology
+                    do i = 1, num_procs
+                        if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
+                            do j = 1, num_procs/i
+                                if (mod(num_procs/i, j) == 0 .and. (n + 1)/j >= num_stcls_min*recon_order) then
+                                    tmp_num_procs_x = i
+                                    tmp_num_procs_y = j
+                                    tmp_num_procs_z = num_procs/(i*j)
+
+                                    if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
+                                        & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
+                                        & /tmp_num_procs_z >= num_stcls_min*recon_order) then
+                                        num_procs_x = i
+                                        num_procs_y = j
+                                        num_procs_z = num_procs/(i*j)
+                                        fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
+                                                      & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                                        ierr = 0
+                                    end if
+                                end if
+                            end do
                         end if
-                    end if
-#endif
+                    end do
                 end if
+            end if
 
-                ! 2D Cartesian Processor Topology
-            else
-                ! Initial estimate of optimal processor topology
-                num_procs_x = 1
-                num_procs_y = num_procs
-                ierr = -1
-
-                ! Benchmarking the quality of this initial guess
-                tmp_num_procs_x = num_procs_x
-                tmp_num_procs_y = num_procs_y
-                fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-
-                ! Optimization of the initial processor topology
-                do i = 1, num_procs
-                    if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
-                        tmp_num_procs_x = i
-                        tmp_num_procs_y = num_procs/i
-
-                        if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
-                            & /tmp_num_procs_y >= num_stcls_min*recon_order) then
-                            num_procs_x = i
-                            num_procs_y = num_procs/i
-                            fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-                            ierr = 0
-                        end if
-                    end if
-                end do
-
-                ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation
-                ! exits.
-                if (proc_rank == 0 .and. ierr == -1) then
-                    call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n and ' &
-                                     & // 'weno/muscl/igr_order. Exiting.')
-                end if
+            ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation exits.
+            if (proc_rank == 0 .and. ierr == -1) then
+                call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n, p and ' &
+                                 & // 'weno/muscl/igr_order. Exiting.')
+            end if
 
-                ! Creating new communicator using the Cartesian topology
-                call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, num_procs_y/), (/.true., .true./), .false., MPI_COMM_CART, &
-                                     & ierr)
+            ! Creating new communicator using the Cartesian topology
+            call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, num_procs_y, num_procs_z/), (/.true., .true., .true./), &
+                                 & .false., MPI_COMM_CART, ierr)
 
-                ! Finding the Cartesian coordinates of the local process
-                call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr)
-            end if
+            ! Finding the Cartesian coordinates of the local process
+            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr)
 
-            ! Global Parameters for y-direction
+            ! Global Parameters for z-direction
 
             ! Number of remaining cells
-            rem_cells = mod(n + 1, num_procs_y)
+            rem_cells = mod(p + 1, num_procs_z)
 
             ! Optimal number of cells per processor
-            n = (n + 1)/num_procs_y - 1
+            p = (p + 1)/num_procs_z - 1
 
             ! Distributing the remaining cells
             do i = 1, rem_cells
-                if (proc_coords(2) == i - 1) then
-                    n = n + 1; exit
+                if (proc_coords(3) == i - 1) then
+                    p = p + 1; exit
                 end if
             end do
 
             ! Boundary condition at the beginning
-            if (proc_coords(2) > 0 .or. (bc_y%beg == BC_PERIODIC .and. num_procs_y > 1)) then
-                proc_coords(2) = proc_coords(2) - 1
-                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr)
-                proc_coords(2) = proc_coords(2) + 1
+            if (proc_coords(3) > 0 .or. (bc_z%beg == BC_PERIODIC .and. num_procs_z > 1)) then
+                proc_coords(3) = proc_coords(3) - 1
+                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr)
+                proc_coords(3) = proc_coords(3) + 1
             end if
 
             ! Boundary condition at the end
-            if (proc_coords(2) < num_procs_y - 1 .or. (bc_y%end == BC_PERIODIC .and. num_procs_y > 1)) then
-                proc_coords(2) = proc_coords(2) + 1
-                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr)
-                proc_coords(2) = proc_coords(2) - 1
+            if (proc_coords(3) < num_procs_z - 1 .or. (bc_z%end == BC_PERIODIC .and. num_procs_z > 1)) then
+                proc_coords(3) = proc_coords(3) + 1
+                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr)
+                proc_coords(3) = proc_coords(3) - 1
             end if
 
 #ifdef MFC_POST_PROCESS
             ! Ghost zone at the beginning
-            if (proc_coords(2) > 0 .and. format == 1) then
-                offset_y%beg = 2
+            if (proc_coords(3) > 0 .and. format == 1) then
+                offset_z%beg = 2
             else
-                offset_y%beg = 0
+                offset_z%beg = 0
             end if
 
             ! Ghost zone at the end
-            if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then
-                offset_y%end = 2
+            if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then
+                offset_z%end = 2
             else
-                offset_y%end = 0
+                offset_z%end = 0
             end if
 #endif
 
             ! Beginning and end sub-domain boundary locations
             if (parallel_io) then
-                if (proc_coords(2) < rem_cells) then
-                    start_idx(2) = (n + 1)*proc_coords(2)
+                if (proc_coords(3) < rem_cells) then
+                    start_idx(3) = (p + 1)*proc_coords(3)
                 else
-                    start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
+                    start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
                 end if
             else
 #ifdef MFC_PRE_PROCESS
                 if (old_grid .neqv. .true.) then
-                    dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp)
+                    dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp)
 
-                    if (proc_coords(2) < rem_cells) then
-                        y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2))
-                        y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1) - (num_procs_y &
+                    if (proc_coords(3) < rem_cells) then
+                        z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3))
+                        z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1) - (num_procs_z &
                                                               & - rem_cells))
                     else
-                        y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2) + rem_cells)
-                        y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1))
+                        z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3) + rem_cells)
+                        z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1))
                     end if
                 end if
 #endif
             end if
 
-            ! 1D Cartesian Processor Topology
+            ! 2D Cartesian Processor Topology
         else
-            ! Optimal processor topology
-            num_procs_x = num_procs
+            ! Initial estimate of optimal processor topology
+            num_procs_x = 1
+            num_procs_y = num_procs
+            ierr = -1
+
+            ! Benchmarking the quality of this initial guess
+            tmp_num_procs_x = num_procs_x
+            tmp_num_procs_y = num_procs_y
+            fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
+
+            ! Optimization of the initial processor topology
+            do i = 1, num_procs
+                if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
+                    tmp_num_procs_x = i
+                    tmp_num_procs_y = num_procs/i
+
+                    if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
+                        & /tmp_num_procs_y >= num_stcls_min*recon_order) then
+                        num_procs_x = i
+                        num_procs_y = num_procs/i
+                        fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
+                        ierr = 0
+                    end if
+                end if
+            end do
+
+            ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation exits.
+            if (proc_rank == 0 .and. ierr == -1) then
+                call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n and ' &
+                                 & // 'weno/muscl/igr_order. Exiting.')
+            end if
 
             ! Creating new communicator using the Cartesian topology
-            call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), (/.true./), .false., MPI_COMM_CART, ierr)
+            call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, num_procs_y/), (/.true., .true./), .false., MPI_COMM_CART, ierr)
 
             ! Finding the Cartesian coordinates of the local process
-            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, proc_coords, ierr)
+            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr)
         end if
 
-        ! Global Parameters for x-direction
+        ! Global Parameters for y-direction
 
         ! Number of remaining cells
-        rem_cells = mod(m + 1, num_procs_x)
+        rem_cells = mod(n + 1, num_procs_y)
 
         ! Optimal number of cells per processor
-        m = (m + 1)/num_procs_x - 1
+        n = (n + 1)/num_procs_y - 1
 
         ! Distributing the remaining cells
         do i = 1, rem_cells
-            if (proc_coords(1) == i - 1) then
-                m = m + 1; exit
+            if (proc_coords(2) == i - 1) then
+                n = n + 1; exit
             end if
         end do
 
-        call s_update_cell_bounds(cells_bounds, m, n, p)
-
         ! Boundary condition at the beginning
-        if (proc_coords(1) > 0 .or. (bc_x%beg == BC_PERIODIC .and. num_procs_x > 1)) then
-            proc_coords(1) = proc_coords(1) - 1
-            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
-            proc_coords(1) = proc_coords(1) + 1
+        if (proc_coords(2) > 0 .or. (bc_y%beg == BC_PERIODIC .and. num_procs_y > 1)) then
+            proc_coords(2) = proc_coords(2) - 1
+            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr)
+            proc_coords(2) = proc_coords(2) + 1
         end if
 
         ! Boundary condition at the end
-        if (proc_coords(1) < num_procs_x - 1 .or. (bc_x%end == BC_PERIODIC .and. num_procs_x > 1)) then
-            proc_coords(1) = proc_coords(1) + 1
-            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
-            proc_coords(1) = proc_coords(1) - 1
+        if (proc_coords(2) < num_procs_y - 1 .or. (bc_y%end == BC_PERIODIC .and. num_procs_y > 1)) then
+            proc_coords(2) = proc_coords(2) + 1
+            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr)
+            proc_coords(2) = proc_coords(2) - 1
         end if
 
 #ifdef MFC_POST_PROCESS
         ! Ghost zone at the beginning
-        if (proc_coords(1) > 0 .and. format == 1) then
-            offset_x%beg = 2
+        if (proc_coords(2) > 0 .and. format == 1) then
+            offset_y%beg = 2
         else
-            offset_x%beg = 0
+            offset_y%beg = 0
         end if
 
         ! Ghost zone at the end
-        if (proc_coords(1) < num_procs_x - 1 .and. format == 1) then
-            offset_x%end = 2
+        if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then
+            offset_y%end = 2
         else
-            offset_x%end = 0
+            offset_y%end = 0
         end if
 #endif
 
         ! Beginning and end sub-domain boundary locations
         if (parallel_io) then
-            if (proc_coords(1) < rem_cells) then
-                start_idx(1) = (m + 1)*proc_coords(1)
+            if (proc_coords(2) < rem_cells) then
+                start_idx(2) = (n + 1)*proc_coords(2)
             else
-                start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
+                start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
             end if
         else
 #ifdef MFC_PRE_PROCESS
             if (old_grid .neqv. .true.) then
-                dx = (x_domain%end - x_domain%beg)/real(m_glb + 1, wp)
+                dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp)
 
-                if (proc_coords(1) < rem_cells) then
-                    x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1))
-                    x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells))
+                if (proc_coords(2) < rem_cells) then
+                    y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2))
+                    y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1) - (num_procs_y - rem_cells))
                 else
-                    x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells)
-                    x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1))
+                    y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2) + rem_cells)
+                    y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1))
                 end if
             end if
 #endif
         end if
+
+        ! 1D Cartesian Processor Topology
+    else
+        ! Optimal processor topology
+        num_procs_x = num_procs
+
+        ! Creating new communicator using the Cartesian topology
+        call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), (/.true./), .false., MPI_COMM_CART, ierr)
+
+        ! Finding the Cartesian coordinates of the local process
+        call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, proc_coords, ierr)
+    end if
+
+    ! Global Parameters for x-direction
+
+    ! Number of remaining cells
+    rem_cells = mod(m + 1, num_procs_x)
+
+    ! Optimal number of cells per processor
+    m = (m + 1)/num_procs_x - 1
+
+    ! Distributing the remaining cells
+    do i = 1, rem_cells
+        if (proc_coords(1) == i - 1) then
+            m = m + 1; exit
+        end if
+    end do
+
+    call s_update_cell_bounds(cells_bounds, m, n, p)
+
+    ! Boundary condition at the beginning
+    if (proc_coords(1) > 0 .or. (bc_x%beg == BC_PERIODIC .and. num_procs_x > 1)) then
+        proc_coords(1) = proc_coords(1) - 1
+        call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
+        proc_coords(1) = proc_coords(1) + 1
+    end if
+
+    ! Boundary condition at the end
+    if (proc_coords(1) < num_procs_x - 1 .or. (bc_x%end == BC_PERIODIC .and. num_procs_x > 1)) then
+        proc_coords(1) = proc_coords(1) + 1
+        call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
+        proc_coords(1) = proc_coords(1) - 1
+    end if
+
+#ifdef MFC_POST_PROCESS
+    ! Ghost zone at the beginning
+    if (proc_coords(1) > 0 .and. format == 1) then
+        offset_x%beg = 2
+    else
+        offset_x%beg = 0
+    end if
+
+    ! Ghost zone at the end
+    if (proc_coords(1) < num_procs_x - 1 .and. format == 1) then
+        offset_x%end = 2
+    else
+        offset_x%end = 0
+    end if
+#endif
+
+    ! Beginning and end sub-domain boundary locations
+    if (parallel_io) then
+        if (proc_coords(1) < rem_cells) then
+            start_idx(1) = (m + 1)*proc_coords(1)
+        else
+            start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
+        end if
+    else
+#ifdef MFC_PRE_PROCESS
+        if (old_grid .neqv. .true.) then
+            dx = (x_domain%end - x_domain%beg)/real(m_glb + 1, wp)
+
+            if (proc_coords(1) < rem_cells) then
+                x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1))
+                x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells))
+            else
+                x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells)
+                x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1))
+            end if
+        end if
+#endif
+    end if
 #endif
 
-    end subroutine s_mpi_decompose_computational_domain
+end subroutine s_mpi_decompose_computational_domain
 
-    !> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors.
-    !! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of
-    !! cell-boundary locations may be calculated directly from those of the cell-width distributions.
+!> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors.
+!! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of
+!! cell-boundary locations may be calculated directly from those of the cell-width distributions.
 #ifndef MFC_PRE_PROCESS
-    subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc)
+subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc)
 
-        integer, intent(in) :: mpi_dir
-        integer, intent(in) :: pbc_loc
+    integer, intent(in) :: mpi_dir
+    integer, intent(in) :: pbc_loc
 
 #ifdef MFC_MPI
-        integer :: ierr  !< Generic flag used to identify and report MPI errors
+    integer :: ierr  !< Generic flag used to identify and report MPI errors
 
-        if (mpi_dir == 1) then
-            if (pbc_loc == -1) then  ! PBC at the beginning
+    if (mpi_dir == 1) then
+        if (pbc_loc == -1) then  ! PBC at the beginning
 
-                if (bc_x%end >= 0) then  ! PBC at the beginning and end
-                    call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, &
-                                      & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the beginning only
-                    call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
-            else  ! PBC at the end
-                if (bc_x%beg >= 0) then  ! PBC at the end and beginning
-                    call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the end only
-                    call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, &
-                                      & bc_x%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
+            if (bc_x%end >= 0) then  ! PBC at the beginning and end
+                call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, &
+                                  & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the beginning only
+                call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
             end if
-        else if (mpi_dir == 2) then
-            if (pbc_loc == -1) then  ! PBC at the beginning
-
-                if (bc_y%end >= 0) then  ! PBC at the beginning and end
-                    call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, &
-                                      & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the beginning only
-                    call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
-            else  ! PBC at the end
-                if (bc_y%beg >= 0) then  ! PBC at the end and beginning
-                    call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the end only
-                    call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, &
-                                      & bc_y%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
+        else  ! PBC at the end
+            if (bc_x%beg >= 0) then  ! PBC at the end and beginning
+                call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, MPI_COMM_WORLD, &
+                                  & MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the end only
+                call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
             end if
-        else
-            if (pbc_loc == -1) then  ! PBC at the beginning
-
-                if (bc_z%end >= 0) then  ! PBC at the beginning and end
-                    call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, &
-                                      & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the beginning only
-                    call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
-            else  ! PBC at the end
-                if (bc_z%beg >= 0) then  ! PBC at the end and beginning
-                    call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the end only
-                    call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, &
-                                      & bc_z%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
+        end if
+    else if (mpi_dir == 2) then
+        if (pbc_loc == -1) then  ! PBC at the beginning
+
+            if (bc_y%end >= 0) then  ! PBC at the beginning and end
+                call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, &
+                                  & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the beginning only
+                call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            end if
+        else  ! PBC at the end
+            if (bc_y%beg >= 0) then  ! PBC at the end and beginning
+                call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, MPI_COMM_WORLD, &
+                                  & MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the end only
+                call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            end if
+        end if
+    else
+        if (pbc_loc == -1) then  ! PBC at the beginning
+
+            if (bc_z%end >= 0) then  ! PBC at the beginning and end
+                call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, &
+                                  & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the beginning only
+                call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            end if
+        else  ! PBC at the end
+            if (bc_z%beg >= 0) then  ! PBC at the end and beginning
+                call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, MPI_COMM_WORLD, &
+                                  & MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the end only
+                call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
             end if
         end if
+    end if
 #endif
 
-    end subroutine s_mpi_sendrecv_grid_variables_buffers
+end subroutine s_mpi_sendrecv_grid_variables_buffers
 #endif
 
-    !> Module deallocation and/or disassociation procedures
-    impure subroutine s_finalize_mpi_common_module
+!> Module deallocation and/or disassociation procedures
+impure subroutine s_finalize_mpi_common_module
 
 #ifdef MFC_MPI
-        deallocate (buff_send, buff_recv)
+    deallocate (buff_send, buff_recv)
 #endif
 
-    end subroutine s_finalize_mpi_common_module
+end subroutine s_finalize_mpi_common_module
 
 end module m_mpi_common
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 7f9131550d..dd5a7f5f86 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -76,7 +76,7 @@ contains
         ! $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]')
         ! $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]')
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
         #:else
             real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
@@ -259,7 +259,7 @@ contains
         ! initializing variables
         integer, intent(in)                                 :: j, k, l, MFL
         real(wp), intent(out)                               :: pS
-        real(wp), dimension(1:), intent(out)                :: p_infpT
+        real(wp), dimension(num_fluids), intent(out)        :: p_infpT
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         real(wp), intent(in)                                :: rhoe
         real(wp), intent(out)                               :: TS
@@ -283,7 +283,7 @@ contains
             mQ = mQ + q_cons_vf(i + eqn_idx%cont%beg - 1)%sf(j, k, l)*qvs(i)
         end do
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             if (num_fluids < 3) then
                 $:GPU_LOOP(parallelism='[seq]')
                 do i = num_fluids + 1, 3
@@ -353,14 +353,14 @@ contains
 
         integer, intent(in)                                    :: j, k, l
         real(wp), intent(inout)                                :: pS
-        real(wp), dimension(1:), intent(in)                    :: p_infpT
-        real(wp), intent(in)                                   :: rhoe
+        real(wp), dimension(num_fluids), intent(in)            :: p_infpT
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        real(wp), intent(in)                                   :: rhoe
         real(wp), intent(inout)                                :: TS
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
         #:else
-            real(wp), dimension(num_fluids) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
+            real(wp), dimension(num_fluids_max) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
         #:endif
         real(wp), dimension(2, 2) :: Jac, InvJac, TJac                  !< matrices for the Newton Solver
         real(wp), dimension(2)    :: R2D, DeltamP                       !< residual and correction array
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 2417da1adf..3fe22ceb9a 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -91,12 +91,12 @@ contains
         real(wp), intent(in), optional  :: G, pres_mag
 
         ! Chemistry
-        real(wp), dimension(1:num_species), intent(in) :: rhoYks
-        real(wp), dimension(1:num_species)             :: Y_rs
-        real(wp)                                       :: E_e
-        real(wp)                                       :: e_Per_Kg, Pdyn_Per_Kg
-        real(wp)                                       :: T_guess
-        integer                                        :: s  !< Generic loop iterator
+        real(wp), dimension(num_species), intent(in) :: rhoYks
+        real(wp), dimension(1:num_species)           :: Y_rs
+        real(wp)                                     :: E_e
+        real(wp)                                     :: e_Per_Kg, Pdyn_Per_Kg
+        real(wp)                                     :: T_guess
+        integer                                      :: s  !< Generic loop iterator
         #:if not chemistry
             ! Depending on model_eqns and bubbles_euler, the appropriate procedure for computing pressure is targeted by the
             ! procedure pointer
@@ -249,12 +249,12 @@ contains
         $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_acc', parallelism='[seq]', cray_noinline=True)
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(inout)        :: alpha_rho_K, alpha_K
             real(wp), optional, dimension(3), intent(in) :: G
         #:else
-            real(wp), dimension(num_fluids), intent(inout)        :: alpha_rho_K, alpha_K
-            real(wp), optional, dimension(num_fluids), intent(in) :: G
+            real(wp), dimension(num_fluids_max), intent(inout)        :: alpha_rho_K, alpha_K
+            real(wp), optional, dimension(num_fluids_max), intent(in) :: G
         #:endif
         real(wp), dimension(2), intent(out) :: Re_K
         real(wp), optional, intent(out)     :: G_K
@@ -283,7 +283,7 @@ contains
                     alpha_K(i) = min(max(0._wp, alpha_K(i)), 1._wp)
                     alpha_K_sum = alpha_K_sum + alpha_K(i)
                 end do
-                alpha_K = alpha_K/max(alpha_K_sum, sgm_eps)
+                alpha_K(1:num_fluids) = alpha_K(1:num_fluids)/max(alpha_K_sum, sgm_eps)
             end if
             rho_K = 0._wp; gamma_K = 0._wp; pi_inf_K = 0._wp; qv_K = 0._wp
             do i = 1, num_fluids
@@ -473,7 +473,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: qK_prim_vf
         type(int_bounds_info), dimension(1:3), intent(in)      :: ibounds
 
-        #:if USING_AMD and not MFC_CASE_OPTIMIZATION
+        #:if (USING_AMD or USING_INTEL) and not MFC_CASE_OPTIMIZATION
             real(wp), dimension(3) :: alpha_K, alpha_rho_K
             real(wp), dimension(3) :: nRtmp
             real(wp)               :: rhoYks(1:10)
@@ -1037,7 +1037,7 @@ contains
         ! Partial densities, density, velocity, pressure, energy, advection variables, the specific heat ratio and liquid stiffness
         ! functions, the shear and volume Reynolds numbers and the Weber numbers
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)  :: alpha_rho_K
             real(wp), dimension(3)  :: alpha_K
             real(wp), dimension(3)  :: vel_K
@@ -1177,10 +1177,10 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_species_fraction', parallelism='[seq]', cray_noinline=True)
         type(scalar_field), dimension(sys_size), intent(in) :: q_vf
         integer, intent(in)                                 :: k, l, r
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K
         #:else
-            real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K
+            real(wp), dimension(num_fluids_max), intent(out) :: alpha_rho_K, alpha_K
         #:endif
         integer  :: i
         real(wp) :: alpha_K_sum
@@ -1215,7 +1215,7 @@ contains
                 alpha_K(i) = min(max(0._wp, alpha_K(i)), 1._wp)
                 alpha_K_sum = alpha_K_sum + alpha_K(i)
             end do
-            alpha_K = alpha_K/max(alpha_K_sum, 1.e-16_wp)
+            alpha_K(1:num_fluids) = alpha_K(1:num_fluids)/max(alpha_K_sum, 1.e-16_wp)
         end if
 
         if (num_fluids == 1 .and. bubbles_euler) alpha_K(1) = q_vf(eqn_idx%adv%beg)%sf(k, l, r)
@@ -1253,10 +1253,10 @@ contains
         real(wp), intent(in) :: pres
         real(wp), intent(in) :: rho, gamma, pi_inf, qv
         real(wp), intent(in) :: H
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: adv
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: adv
+            real(wp), dimension(num_fluids_max), intent(in) :: adv
         #:endif
         real(wp), intent(in)  :: vel_sum
         real(wp), intent(in)  :: c_c
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 4de261864d..488a84c644 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -129,7 +129,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf  !< Primitive variables
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: myalpha, myalpha_rho
         #:else
             real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index a15156405b..200a007754 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -148,7 +148,7 @@ contains
         real(wp)                                               :: pb_local, mv_local, vflux, pbdot
         real(wp)                                               :: n_tait, B_tait
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: Rtmp, Vtmp
             real(wp), dimension(3) :: myalpha, myalpha_rho
         #:else
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index b44eb617b1..3a9f1484ab 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -492,7 +492,7 @@ contains
         real(wp)                                               :: myPinf, aux1, aux2, myCson, myRho
         real(wp)                                               :: gamma, pi_inf, qv
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: myalpha_rho, myalpha
         #:else
             real(wp), dimension(num_fluids) :: myalpha_rho, myalpha
@@ -704,10 +704,10 @@ contains
         integer, dimension(3), intent(in)                   :: cell
         real(wp), intent(out)                               :: cson
         real(wp)                                            :: E, H
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: vel
         #:else
-            real(wp), dimension(num_dims) :: vel
+            real(wp), dimension(3) :: vel
         #:endif
         integer :: i
 
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 19c1d798ff..213241fba6 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -477,12 +477,12 @@ contains
         real(wp)                                               :: dqv_dt
         real(wp)                                               :: dpres_ds
 
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20) :: L
         #:else
             real(wp), dimension(sys_size) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)  :: alpha_rho, dalpha_rho_ds, mf
             real(wp), dimension(3)  :: vel, dvel_ds
             real(wp), dimension(3)  :: adv_local, dadv_ds
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index c2c415b1d3..d41810a37f 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -23,7 +23,7 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), intent(in)               :: rho, c, dpres_ds
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
             real(wp), dimension(num_dims), intent(in) :: dvel_ds
@@ -37,12 +37,12 @@ contains
     subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         #:else
             real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
@@ -62,12 +62,12 @@ contains
     subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
             real(wp), dimension(num_dims), intent(in) :: dvel_ds
@@ -86,12 +86,12 @@ contains
     subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
             real(wp), dimension(num_fluids), intent(in) :: dadv_ds
@@ -110,12 +110,12 @@ contains
     subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
             real(wp), dimension(num_species), intent(in) :: dYs_ds
@@ -138,12 +138,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_slip_wall_L',parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
             real(wp), dimension(num_dims), intent(in) :: dvel_ds
@@ -162,12 +162,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_buffer_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in)  :: dvel_ds
             real(wp), dimension(3), intent(in)  :: dadv_ds
@@ -202,12 +202,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_inflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
             real(wp), dimension(num_dims), intent(in) :: dvel_ds
@@ -226,12 +226,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in)  :: dvel_ds
             real(wp), dimension(3), intent(in)  :: dadv_ds
@@ -260,12 +260,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_force_free_subsonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in) :: dvel_ds
             real(wp), dimension(3), intent(in) :: dadv_ds
@@ -291,12 +291,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_constant_pressure_subsonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in) :: dvel_ds
             real(wp), dimension(3), intent(in) :: dadv_ds
@@ -320,7 +320,7 @@ contains
     subroutine s_compute_supersonic_inflow_L(L)
 
         $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', parallelism='[seq]', cray_inline=True)
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
@@ -336,12 +336,12 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_supersonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in)  :: dvel_ds
             real(wp), dimension(3), intent(in)  :: dadv_ds
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 9e7519790c..6a5163aac0 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -25,48 +25,81 @@ contains
 
         type(ghost_point), dimension(:), intent(inout) :: gps
         integer, intent(in)                            :: num_gps
-        integer                                        :: i, patch_id, patch_geometry
+        integer                                        :: i, patch_id
 
-        !  3D Patch Geometries
+        ! One GPU loop per geometry type so each kernel calls exactly one
+        ! declare-target routine. A single if-else dispatch over multiple
+        ! declare-target callees triggers an LLVM phi-node dominance error
+        ! in ifx SPIR64 codegen; splitting into separate loops avoids it.
 
         if (p > 0) then
-            $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
-                patch_geometry = patch_ib(patch_id)%geometry
-
-                if (patch_geometry == 8) then
-                    call s_sphere_levelset(gps(i))
-                else if (patch_geometry == 9) then
-                    call s_cuboid_levelset(gps(i))
-                else if (patch_geometry == 10) then
-                    call s_cylinder_levelset(gps(i))
-                else if (patch_geometry == 11) then
-                    call s_3d_airfoil_levelset(gps(i))
-                else if (patch_geometry == 12) then
-                    call s_model_levelset(gps(i))
-                end if
+                if (patch_ib(patch_id)%geometry == 8) call s_sphere_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 9) call s_cuboid_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 10) call s_cylinder_levelset(gps(i))
             end do
             $:END_GPU_PARALLEL_LOOP()
 
-            ! 2D Patch Geometries
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 11) call s_3d_airfoil_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 12) call s_model_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
         else if (n > 0) then
-            $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
-                patch_geometry = patch_ib(patch_id)%geometry
-
-                if (patch_geometry == 2) then
-                    call s_circle_levelset(gps(i))
-                else if (patch_geometry == 3) then
-                    call s_rectangle_levelset(gps(i))
-                else if (patch_geometry == 4) then
-                    call s_airfoil_levelset(gps(i))
-                else if (patch_geometry == 5) then
-                    call s_model_levelset(gps(i))
-                else if (patch_geometry == 6) then
-                    call s_ellipse_levelset(gps(i))
-                end if
+                if (patch_ib(patch_id)%geometry == 2) call s_circle_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 3) call s_rectangle_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 4) call s_airfoil_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 5) call s_model_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 6) call s_ellipse_levelset(gps(i))
             end do
             $:END_GPU_PARALLEL_LOOP()
         end if
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 8ffcba6101..169af242b4 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -166,12 +166,12 @@ contains
         integer, intent(in)                                 :: t_step
         real(wp)                                            :: rho  !< Cell-avg. density
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha  !< Cell-avg. volume fraction
             real(wp), dimension(3) :: vel    !< Cell-avg. velocity
         #:else
-            real(wp), dimension(num_fluids) :: alpha  !< Cell-avg. volume fraction
-            real(wp), dimension(num_vels)   :: vel    !< Cell-avg. velocity
+            real(wp), dimension(num_fluids_max) :: alpha  !< Cell-avg. volume fraction
+            real(wp), dimension(3)              :: vel    !< Cell-avg. velocity
         #:endif
         real(wp)               :: vel_sum  !< Cell-avg. velocity sum
         real(wp)               :: pres     !< Cell-avg. pressure
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 2ee806d928..59fb741aef 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -4,7 +4,7 @@
 
 #:include 'macros.fpp'
 
-!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT) for azimuthal Fourier filtering in cylindrical geometries
+!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT/oneMKL) for azimuthal Fourier filtering in cylindrical geometries
 module m_fftw
 
     use, intrinsic :: iso_c_binding
@@ -12,7 +12,14 @@ module m_fftw
     use m_derived_types
     use m_global_parameters
     use m_mpi_proxy
-#if defined(MFC_GPU) && defined(__PGI)
+    ! GPU FFT backend selection:
+    !   cuFFT  - NVHPC/PGI (OpenACC or OpenMP target)
+    !   hipFFT - Cray/AMD (OpenMP target)
+    !   oneMKL - Intel ifx (OpenMP target + dispatch construct)
+    !   FFTW   - CPU-only builds
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+    use mkl_dfti_omp_offload
+#elif defined(MFC_GPU) && defined(__PGI)
     use cufft
 #elif defined(MFC_GPU)
     use hipfort
@@ -34,7 +41,18 @@ module m_fftw
     real(c_double), pointer            :: data_real(:)        !< Real data
     complex(c_double_complex), pointer :: data_cmplx(:)       !< Complex data in Fourier space
     complex(c_double_complex), pointer :: data_fltr_cmplx(:)  !< Filtered complex data in Fourier space
-#if defined(MFC_GPU)
+
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+    $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
+
+    real(dp), allocatable, target    :: data_real_gpu(:)
+    complex(dp), allocatable, target :: data_cmplx_gpu(:)
+    complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:)
+    $:GPU_DECLARE(create='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+
+    type(DFTI_DESCRIPTOR), pointer :: fwd_plan_mkl => null()
+    type(DFTI_DESCRIPTOR), pointer :: bwd_plan_mkl => null()
+#elif defined(MFC_GPU)
     $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
 
     real(dp), allocatable, target    :: data_real_gpu(:)
@@ -62,16 +80,39 @@ contains
     impure subroutine s_initialize_fftw_module
 
         integer :: ierr  !< Generic flag used to identify and report GPU errors
-        ! Size of input array going into DFT
 
         real_size = p + 1
-        ! Size of output array coming out of DFT
         cmplx_size = (p + 1)/2 + 1
 
         x_size = m + 1
         batch_size = x_size*sys_size
 
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
+        $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
+
+        @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
+        @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+        @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+
+        ! Forward R2C descriptor: batch of real_size transforms
+        ierr = DftiCreateDescriptor(fwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_INPUT_DISTANCE, real_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_OUTPUT_DISTANCE, cmplx_size)
+        ierr = DftiCommitDescriptor(fwd_plan_mkl)
+
+        ! Backward C2R descriptor
+        ierr = DftiCreateDescriptor(bwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_INPUT_DISTANCE, cmplx_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_OUTPUT_DISTANCE, real_size)
+        ierr = DftiCommitDescriptor(bwd_plan_mkl)
+#elif defined(MFC_GPU)
         rank = 1; istride = 1; ostride = 1
         allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
 
@@ -80,22 +121,7 @@ contains
         oembed(1) = cmplx_size
         $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
         $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
-#else
-        ! Allocate input and output DFT data sizes
-        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
-        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        ! Associate input and output data pointers with allocated memory
-        call c_f_pointer(fftw_real_data, data_real, [real_size])
-        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
-        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
 
-        ! Generate plans for forward and backward DFTs
-        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
-        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
-#endif
-
-#if defined(MFC_GPU)
         @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
         @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
         @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
@@ -111,6 +137,19 @@ contains
         ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, &
                               & HIPFFT_Z2D, batch_size)
 #endif
+#else
+        ! Allocate input and output DFT data sizes
+        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
+        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        ! Associate input and output data pointers with allocated memory
+        call c_f_pointer(fftw_real_data, data_real, [real_size])
+        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
+        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
+
+        ! Generate plans for forward and backward DFTs
+        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
+        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
 #endif
 
     end subroutine s_initialize_fftw_module
@@ -124,7 +163,116 @@ contains
         ! Restrict filter to processors that have cells adjacent to axis
 
         if (bc_y%beg >= 0) return
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, cmplx_size
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_MKL_DISPATCH()
+        ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
+
+        Nfq = 3
+        $:GPU_UPDATE(device='[Nfq]')
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, Nfq
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1) &
+                                        & *cmplx_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_MKL_DISPATCH()
+        ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu)
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1) &
+                                  & *real_size*x_size)/real(real_size, dp)
+                    q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        do i = 1, fourier_rings
+            i2 = i
+            $:GPU_UPDATE(device='[i2]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, cmplx_size
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i2, l)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_MKL_DISPATCH()
+            ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
+
+            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            $:GPU_UPDATE(device='[Nfq]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, Nfq
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k &
+                                            & - 1)*cmplx_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_MKL_DISPATCH()
+            ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu)
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k &
+                                      & - 1)*real_size*x_size)/real(real_size, dp)
+                        q_cons_vf(k)%sf(j, i2, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end do
+#elif defined(MFC_GPU)
         $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
@@ -292,7 +440,13 @@ contains
     !> Finalize the FFTW module
     impure subroutine s_finalize_fftw_module
 
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        integer :: ierr  !< Generic flag used to identify and report GPU errors
+
+        @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
+        ierr = DftiFreeDescriptor(fwd_plan_mkl)
+        ierr = DftiFreeDescriptor(bwd_plan_mkl)
+#elif defined(MFC_GPU)
         integer :: ierr  !< Generic flag used to identify and report GPU errors
 
         @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 90e78d04df..8b0a7ad0dc 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -83,7 +83,7 @@ contains
             real(wp), dimension(tensor_size) :: tensora, tensorb
         #:endif
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha_k, alpha_rho_k
         #:else
             real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index b21483e097..4fba39e80e 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -136,7 +136,7 @@ contains
         integer, intent(in)                :: xp, yp            !< integers containing the periodicity projection information
         real(wp)                           :: f, ca_in, pa, ma, ta
         real(wp)                           :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer                            :: i, j, k, il, ir, jl, jr
+        integer                            :: i, j, k, kk, il, ir, jl, jr
         integer                            :: Np1, Np2
         integer                            :: encoded_patch_id
         real(wp), dimension(1:3)           :: xy_local, offset  !< x and y coordinates in local IB frame
@@ -227,8 +227,8 @@ contains
         call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir)
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
 
-        $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, ma, &
-                            & ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2)
+        $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, &
+                            & ma, ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2)
         do j = jl, jr
             do i = il, ir
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]  ! get coordinate frame centered on IB
@@ -246,8 +246,8 @@ contains
                     end if
                     if (xy_local(2) >= 0._wp) then
                         k = 1
-                        do while (airfoil_grid_u(k)%x < xy_local(1) .and. k <= Np)
-                            k = k + 1
+                        do kk = 1, Np - 1
+                            if (airfoil_grid_u(kk)%x < xy_local(1)) k = kk + 1
                         end do
                         if (f_approx_equal(airfoil_grid_u(k)%x, xy_local(1))) then
                             if (xy_local(2) <= airfoil_grid_u(k)%y) then
@@ -261,8 +261,8 @@ contains
                         end if
                     else
                         k = 1
-                        do while (airfoil_grid_l(k)%x < xy_local(1))
-                            k = k + 1
+                        do kk = 1, Np - 1
+                            if (airfoil_grid_l(kk)%x < xy_local(1)) k = kk + 1
                         end do
                         if (f_approx_equal(airfoil_grid_l(k)%x, xy_local(1))) then
                             if (xy_local(2) >= airfoil_grid_l(k)%y) then
@@ -290,7 +290,7 @@ contains
         type(integer_field), intent(inout) :: ib_markers
         integer, intent(in) :: xp, yp, zp                      !< integers containing the periodicity projection information
         real(wp) :: lz, z_max, z_min, f, ca_in, pa, ma, ta, xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer :: i, j, k, l, il, ir, jl, jr, ll, lr
+        integer :: i, j, k, kk, l, il, ir, jl, jr, ll, lr
         integer :: Np1, Np2
         integer :: encoded_patch_id
         real(wp), dimension(1:3) :: xyz_local, center, offset  !< x, y, z coordinates in local IB frame
@@ -385,8 +385,8 @@ contains
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
         call get_bounding_indices(center(3) - ca_in, center(3) + ca_in, z_cc, ll, lr)
 
-        $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, &
-                            & ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3)
+        $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, &
+                            & offset, ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3)
         do l = ll, lr
             do j = jl, jr
                 do i = il, ir
@@ -399,8 +399,8 @@ contains
                         if (xyz_local(1) >= 0._wp .and. xyz_local(1) <= ca_in) then
                             if (xyz_local(2) >= 0._wp) then
                                 k = 1
-                                do while (airfoil_grid_u(k)%x < xyz_local(1))
-                                    k = k + 1
+                                do kk = 1, Np - 1
+                                    if (airfoil_grid_u(kk)%x < xyz_local(1)) k = kk + 1
                                 end do
                                 if (f_approx_equal(airfoil_grid_u(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) <= airfoil_grid_u(k)%y) then
@@ -415,8 +415,8 @@ contains
                                 end if
                             else
                                 k = 1
-                                do while (airfoil_grid_l(k)%x < xyz_local(1))
-                                    k = k + 1
+                                do kk = 1, Np - 1
+                                    if (airfoil_grid_l(kk)%x < xyz_local(1)) k = kk + 1
                                 end do
                                 if (f_approx_equal(airfoil_grid_l(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) >= airfoil_grid_l(k)%y) then
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 39ec6b1470..b537940341 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -141,7 +141,7 @@ contains
         real(wp), dimension(3) :: vel_IP, vel_norm_IP
         real(wp) :: c_IP
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)  :: Gs
             real(wp), dimension(3)  :: alpha_rho_IP, alpha_IP
             real(wp), dimension(3)  :: r_IP, v_IP, pb_IP, mv_IP
@@ -727,10 +727,10 @@ contains
         real(wp), intent(inout) :: pres_IP
         real(wp), dimension(3), intent(inout) :: vel_IP
         real(wp), intent(inout) :: c_IP
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(inout) :: alpha_IP, alpha_rho_IP
         #:else
-            real(wp), dimension(num_fluids), intent(inout) :: alpha_IP, alpha_rho_IP
+            real(wp), dimension(num_fluids_max), intent(inout) :: alpha_IP, alpha_rho_IP
         #:endif
         real(wp), optional, dimension(:), intent(inout) :: r_IP, v_IP, pb_IP, mv_IP
         real(wp), optional, dimension(:), intent(inout) :: nmom_IP
@@ -892,7 +892,7 @@ contains
         real(wp), dimension(1:3)     :: local_force_contribution, radial_vector, local_torque_contribution
         real(wp)                     :: cell_volume, dx, dy, dz, dynamic_viscosity
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: dynamic_viscosities
         #:else
             real(wp), dimension(num_fluids) :: dynamic_viscosities
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index 8ca939de7d..cc3b3d8c03 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -51,8 +51,8 @@ contains
     !> The main pressure relaxation procedure
     subroutine s_pressure_relaxation_procedure(q_cons_vf)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer                                                :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer                                         :: j, k, l
 
         $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
         do l = 0, p
@@ -144,10 +144,10 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in)                                    :: j, k, l
         real(wp)                                               :: pres_relax, f_pres, df_pres
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: pres_K_init, rho_K_s
         #:else
-            real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+            real(wp), dimension(num_fluids_max) :: pres_K_init, rho_K_s
         #:endif
         integer, parameter :: MAX_ITER = 50
         ! Pressure relaxation convergence tolerance
@@ -216,10 +216,10 @@ contains
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in)                                    :: j, k, l
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha_rho, alpha
         #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, alpha
+            real(wp), dimension(num_fluids_max) :: alpha_rho, alpha
         #:endif
         real(wp)               :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
         real(wp), dimension(2) :: Re
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 8465d4d349..88d8b3c449 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -741,7 +741,7 @@ contains
         real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout)  :: rhs_mv
         type(int_bounds_info), intent(in)                                                          :: ix, iy, iz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(6)    :: moms, msum
             real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
         #:else
@@ -1027,7 +1027,7 @@ contains
         function f_quad(abscX, abscY, wght_in, q, r, s)
 
             $:GPU_ROUTINE(parallelism='[seq]')
-            #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
                 real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in
             #:else
                 real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
@@ -1053,7 +1053,7 @@ contains
         function f_quad2D(abscX, abscY, wght_in, pow)
 
             $:GPU_ROUTINE(parallelism='[seq]')
-            #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
                 real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in
             #:else
                 real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 7672094f55..8aa1d66870 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -142,7 +142,7 @@ contains
         integer, intent(in)                                    :: norm_dir
         type(int_bounds_info), intent(in)                      :: ix, iy, iz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)  :: alpha_rho_L, alpha_rho_R
             real(wp), dimension(3)  :: vel_L, vel_R
             real(wp), dimension(3)  :: alpha_L, alpha_R
@@ -828,7 +828,7 @@ contains
         integer, intent(in)                                    :: norm_dir
         type(int_bounds_info), intent(in)                      :: ix, iy, iz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)    :: alpha_rho_L, alpha_rho_R
             real(wp), dimension(3)    :: vel_L, vel_R
             real(wp), dimension(3)    :: alpha_L, alpha_R
@@ -1699,7 +1699,7 @@ contains
         integer, intent(in)                                    :: norm_dir
         type(int_bounds_info), intent(in)                      :: ix, iy, iz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
             real(wp), dimension(3) :: alpha_L, alpha_R
             real(wp), dimension(3) :: vel_L, vel_R
@@ -1713,7 +1713,7 @@ contains
         real(wp) :: pres_L, pres_R
         real(wp) :: E_L, E_R
         real(wp) :: H_L, H_R
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
             real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
         #:else
@@ -1743,7 +1743,7 @@ contains
         real(wp)               :: xi_L_m1, xi_R_m1  !< xi_L/R - 1, computed without cancellation
         real(wp)               :: xi_M, xi_P
         real(wp)               :: xi_MP, xi_PP
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: R0_L, R0_R
             real(wp), dimension(3) :: V0_L, V0_R
             real(wp), dimension(3) :: P0_L, P0_R
@@ -1761,7 +1761,7 @@ contains
         real(wp)               :: R3Lbar, R3Rbar
         real(wp)               :: R3V2Lbar, R3V2Rbar
         real(wp), dimension(6) :: tau_e_L, tau_e_R
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: xi_field_L, xi_field_R
         #:else
             real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
@@ -3337,7 +3337,7 @@ contains
 
         ! Local variables:
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
         #:else
             real(wp), dimension(num_fluids) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
@@ -4115,7 +4115,7 @@ contains
 
         ! Local variables
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: avg_v_int     !< Averaged interface velocity (\f$v_x, v_y, v_z\f$) (grid directions).
             real(wp), dimension(3) :: avg_dvdx_int  !< Averaged interface \f$\partial v_i/\partial x\f$ (grid dir 1).
             real(wp), dimension(3) :: avg_dvdy_int  !< Averaged interface \f$\partial v_i/\partial y\f$ (grid dir 2).
@@ -4286,7 +4286,7 @@ contains
 
         ! Local variables
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3, 3) :: vel_grad_avg          !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
             real(wp), dimension(3, 3) :: current_tau_shear     !< Current shear stress tensor.
             real(wp), dimension(3, 3) :: current_tau_bulk      !< Current bulk stress tensor.
@@ -4404,12 +4404,12 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
 
         ! Arguments
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3, 3), intent(in)  :: vel_grad_avg
             real(wp), dimension(3, 3), intent(out) :: tau_shear_out
         #:else
-            real(wp), dimension(num_dims, num_dims), intent(in)  :: vel_grad_avg
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
+            real(wp), dimension(3, 3), intent(in)  :: vel_grad_avg
+            real(wp), dimension(3, 3), intent(out) :: tau_shear_out
         #:endif
         real(wp), intent(in) :: Re_shear
         real(wp), intent(in) :: divergence_v
@@ -4438,10 +4438,10 @@ contains
         ! Arguments
         real(wp), intent(in) :: Re_bulk
         real(wp), intent(in) :: divergence_v
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
         #:else
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
+            real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
         #:endif
 
         ! Local variables
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 4a0978919e..08260717ee 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -45,11 +45,11 @@ contains
     function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), dimension(num_vels), intent(in) :: vel
-        real(wp), intent(in)                      :: c
-        integer, intent(in)                       :: j, k, l
-        real(wp)                                  :: cfl_terms
-        real(wp)                                  :: fltr_dtheta
+        real(wp), dimension(3), intent(in) :: vel
+        real(wp), intent(in)               :: c
+        integer, intent(in)                :: j, k, l
+        real(wp)                           :: cfl_terms
+        real(wp)                           :: fltr_dtheta
 
         fltr_dtheta = f_compute_filtered_dtheta(k, l)
 
@@ -74,22 +74,22 @@ contains
 
         $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        type(scalar_field), intent(in), dimension(:) :: q_prim_vf
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), intent(inout), dimension(3) :: alpha
             real(wp), intent(inout), dimension(3) :: vel
         #:else
-            real(wp), intent(inout), dimension(num_fluids) :: alpha
-            real(wp), intent(inout), dimension(num_vels)   :: vel
+            real(wp), intent(inout), dimension(num_fluids_max) :: alpha
+            real(wp), intent(inout), dimension(3)              :: vel
         #:endif
         real(wp), intent(inout)               :: rho, gamma, pi_inf, vel_sum, H, pres
         real(wp), intent(out)                 :: qv
         integer, intent(in)                   :: j, k, l
         real(wp), dimension(2), intent(inout) :: Re
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha_rho, Gs
         #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, Gs
+            real(wp), dimension(num_fluids_max) :: alpha_rho, Gs
         #:endif
         real(wp) :: E, G_local
         integer  :: i
@@ -141,7 +141,7 @@ contains
     subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in), dimension(num_vels)                 :: vel
+        real(wp), intent(in), dimension(3)                        :: vel
         real(wp), intent(in)                                      :: c, rho
         real(wp), dimension(0:m,0:n,0:p), intent(inout)           :: icfl_sf
         real(wp), dimension(0:m,0:n,0:p), intent(inout), optional :: vcfl_sf, Rc_sf
@@ -191,7 +191,7 @@ contains
     subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), dimension(num_vels), intent(in)       :: vel
+        real(wp), dimension(3), intent(in)              :: vel
         real(wp), intent(in)                            :: c, rho
         real(wp), dimension(0:m,0:n,0:p), intent(inout) :: max_dt
         real(wp), dimension(2), intent(in)              :: Re_l
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 0b7caf7a47..f83650375d 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -65,7 +65,7 @@ contains
         integer, intent(in)                                    :: id
         type(int_bounds_info), intent(in)                      :: isx, isy, isz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3, 3) :: Omega
         #:else
             real(wp), dimension(num_dims, num_dims) :: Omega
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 4d142eca55..762d52acaa 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -620,12 +620,12 @@ contains
 
         real(wp) :: rho  !< Cell-avg. density
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: vel    !< Cell-avg. velocity
             real(wp), dimension(3) :: alpha  !< Cell-avg. volume fraction
         #:else
-            real(wp), dimension(num_vels)   :: vel    !< Cell-avg. velocity
-            real(wp), dimension(num_fluids) :: alpha  !< Cell-avg. volume fraction
+            real(wp), dimension(3)              :: vel    !< Cell-avg. velocity
+            real(wp), dimension(num_fluids_max) :: alpha  !< Cell-avg. volume fraction
         #:endif
         real(wp)               :: vel_sum  !< Cell-avg. velocity sum
         real(wp)               :: pres     !< Cell-avg. pressure
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 55cdcc343e..af99f44eb0 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -53,7 +53,7 @@ contains
         real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum  !< Mixture variables
         real(wp), dimension(2) :: Re_visc
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)    :: alpha_visc, alpha_rho_visc
             real(wp), dimension(3, 3) :: tau_Re
         #:else
@@ -1266,14 +1266,14 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        real(wp), dimension(1:3,1:3), intent(inout)           :: viscous_stress_tensor
-        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(in)                                  :: dynamic_viscosity
-        integer, intent(in)                                   :: i, j, k
-        real(wp), dimension(1:3,1:3)                          :: velocity_gradient_tensor
-        real(wp), dimension(1:3)                              :: dx
-        real(wp)                                              :: divergence
-        integer                                               :: l, q  !< iterators
+        real(wp), dimension(1:3,1:3), intent(inout)  :: viscous_stress_tensor
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(in)                         :: dynamic_viscosity
+        integer, intent(in)                          :: i, j, k
+        real(wp), dimension(1:3,1:3)                 :: velocity_gradient_tensor
+        real(wp), dimension(1:3)                     :: dx
+        real(wp)                                     :: divergence
+        integer                                      :: l, q  !< iterators
 
         ! zero the viscous stress, collection of velocity derivatives, and spatial finite differences
         viscous_stress_tensor = 0._wp
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 78e7678b1a..e19c245aec 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -895,7 +895,7 @@ contains
         integer, intent(in)                                                                    :: weno_dir
         type(int_bounds_info), intent(in)                                                      :: is1_weno_d, is2_weno_d, is3_weno_d
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(-3:2) :: dvd
             real(wp), dimension(0:4)  :: poly
             real(wp), dimension(0:4)  :: alpha
diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh
index 1beb016539..119bff39fb 100644
--- a/toolchain/bootstrap/modules.sh
+++ b/toolchain/bootstrap/modules.sh
@@ -48,6 +48,7 @@ if [ -v $u_c ]; then
     log   "$B""DoD$W:     Carpenter Cray (cc) | Carpenter GNU (c) |  Nautilus (n)"
     log   "$OR""Florida$W: HiPerGator (h)"
     log   "$C""WPI $W:   Turing   (t)"
+    log   "$Y""Gatech$W:  CRNCH RoboGator (crnch)"
     log_n "($G""a$W/$G""f$W/$G""s$W/$G""w$W/$B""tuo$W/$C""b$W/$C""e$CR/$C""d/$C""dai$CR/$Y""p$CR/$R""r$CR/$B""cc$CR/$B""c$CR/$B""n$CR/$BR""o$CR/$BR""pa"$CR"/$OR""h"$CR/$C""t""$CR"): "
     read u_c
     log
@@ -106,11 +107,13 @@ fi
 ELEMENTS="$(__extract "$u_c-all") $(__extract "$u_c-$cg")"
 MODULES=`echo "$ELEMENTS" | tr ' ' '\n' | grep -v = | xargs`
 
-log " $ module load $MODULES"
-if ! module load $MODULES; then
-    error "Failed to load modules."
+if [ -n "$MODULES" ]; then
+    log " $ module load $MODULES"
+    if ! module load $MODULES; then
+        error "Failed to load modules."
 
-    return
+        return
+    fi
 fi
 
 # Export variables one line at a time so each can reference previously exported vars
diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt
index 9a41e1cafc..89b51ccc26 100644
--- a/toolchain/dependencies/CMakeLists.txt
+++ b/toolchain/dependencies/CMakeLists.txt
@@ -32,16 +32,16 @@ if (MFC_FFTW)
         message(STATUS "FFTW found.")
         add_custom_target(fftw)
     else()
-        if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+        if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+            message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
+            add_custom_target(fftw)
+        else()
             ExternalProject_Add(fftw
                 URL        "http://www.fftw.org/fftw-3.3.10.tar.gz"
                 CMAKE_ARGS -DBUILD_TESTS=OFF
                            -DBUILD_SHARED_LIBS=OFF
                            "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
             )
-        else()
-            message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
-            add_custom_target(fftw)
         endif()
     endif()
 endif()
@@ -135,6 +135,9 @@ endif()
                             -DCBLAS=OFF
                             -DLAPACKE=OFF
                             -DBUILD_DEPRECATED=OFF
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            "-DCMAKE_EXE_LINKER_FLAGS=-no-pie"
+                            "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}"
                             "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
                             "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
          )
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 01efb1a9b1..e7353185d9 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -421,6 +421,7 @@ def configure(self, case: Case):
             flags.append(f"-DMFC_GCov={'ON' if ARG('gcov') else 'OFF'}")
             flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}")
             flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}")
+            flags.append(f"-DMFC_Intel_AOT={'ON' if ARG('intel_aot') else 'OFF'}")
 
         command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath]
 
diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py
index dda710f602..180044a4f3 100644
--- a/toolchain/mfc/run/input.py
+++ b/toolchain/mfc/run/input.py
@@ -99,6 +99,16 @@ def generate_fpp(self, target) -> None:
 
         thermochem_code = pyro.FortranCodeGenerator().generate("m_thermochem", sol, pyro.CodeGenerationOptions(scalar_type=real_type, directive_offload=directive_str))
 
+        if directive_str == "mp":
+            # ifx -fpp strips !$omp directives produced by C-macro expansion because the
+            # Intel Fortran preprocessor treats '!' as a Fortran comment after expansion.
+            # Rewrite the GPU_ROUTINE macro calls as literal !$omp declare target lines so
+            # the directive is visible to the Fortran front-end, not the C preprocessor.
+            import re
+
+            thermochem_code = thermochem_code.replace("#define GPU_ROUTINE(name) !$omp declare target\n", "")
+            thermochem_code = re.sub(r"[ \t]+GPU_ROUTINE\(\w+\)", "!$omp declare target", thermochem_code)
+
         common.file_write(os.path.join(modules_dir, "m_thermochem.f90"), thermochem_code, True)
 
         cons.unindent()
diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py
index 82e886c064..261d8e65dc 100644
--- a/toolchain/mfc/run/run.py
+++ b/toolchain/mfc/run/run.py
@@ -85,6 +85,19 @@ def __get_template() -> Template:
     raise MFCException(f"Failed to find a template for --computer '{computer}'. Baked-in templates are: {format_list_to_string(list(baked.keys()), 'magenta')}.")
 
 
+def __is_intel_gpu_build(case: input.MFCInputFile) -> bool:
+    cmake_cache = os.path.join(SIMULATION.get_staging_dirpath(case), "CMakeCache.txt")
+    if not os.path.isfile(cmake_cache):
+        return False
+    with open(cmake_cache) as f:
+        content = f.read()
+    # Match compiler ID entry (may or may not be present) or fall back to compiler path
+    if re.search(r"CMAKE_Fortran_COMPILER_ID[^=\n]*=[^\n]*IntelLLVM", content):
+        return True
+    m = re.search(r"CMAKE_Fortran_COMPILER:FILEPATH=([^\n]+)", content)
+    return m is not None and os.path.basename(m.group(1).strip()) in ("ifx", "ifx.exe")
+
+
 def __generate_job_script(targets, case: input.MFCInputFile):
     env = {}
     if ARG("gpus") is not None:
@@ -103,6 +116,23 @@ def __generate_job_script(targets, case: input.MFCInputFile):
     gpu_acc = gpu_mode == gpuConfigOptions.ACC.value
     gpu_mp = gpu_mode == gpuConfigOptions.MP.value
 
+    if gpu_mp and __is_intel_gpu_build(case):
+        # Level Zero tuning for Intel GPU Max (Ponte Vecchio).
+        # COMMAND_BATCH=256: batch up to 256 Level Zero commands before flushing,
+        # allowing the GPU to stay busy while the host prepares the next batch.
+        # Measured ~9% throughput improvement on Intel GPU Max 1100.
+        env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH", "256")
+        # Disable per-kernel indirect access memory tracking (zeMemGetAllocProperties).
+        # MFC's scalar_field pointer arrays trigger indirect-access flags; this tracking
+        # adds ~2100 API calls/step. Disabling it is safe since all GPU allocations are
+        # managed via @:ALLOCATE/@:DEALLOCATE and never aliased with host memory.
+        env.setdefault("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY", "0")
+        if ARG("fastmath"):
+            # -cl-fast-relaxed-math: enables unsafe GPU JIT optimizations (fast transcendentals,
+            # fused MAD, no signed-zero semantics, finite-math-only). Matches nvfortran -gpu=fastmath.
+            # Only applied when the user explicitly requests --fastmath.
+            env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "-cl-fast-relaxed-math")
+
     content = __get_template().render(
         **{**ARGS(), "targets": targets},
         ARG=ARG,
diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py
index 94a37be947..63dacd0884 100644
--- a/toolchain/mfc/state.py
+++ b/toolchain/mfc/state.py
@@ -21,6 +21,7 @@ class MFCConfig:
     single: bool = False
     mixed: bool = False
     fastmath: bool = False
+    intel_aot: bool = False
 
     def __hash__(self):
         return hash(tuple(getattr(self, f.name) for f in dataclasses.fields(self)))
@@ -32,7 +33,8 @@ def from_dict(d: dict):
         r = MFCConfig()
 
         for field in dataclasses.fields(MFCConfig):
-            setattr(r, field.name, d[field.name])
+            if field.name in d:
+                setattr(r, field.name, d[field.name])
 
         return r
 
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 2d8ede6952..468490b9c9 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -112,13 +112,23 @@ def __filter(cases_) -> typing.Tuple[typing.List[TestCase], typing.List[TestCase
             raise MFCException(f"--only filter matched zero test cases. Specified: {ARG('only')}. Check that UUIDs/names are valid.")
 
     # Convergence cases are slow (multiple resolutions × MPI ranks). Skip
-    # by default unless the user explicitly opted in via --only "Convergence"
-    # or a convergence UUID. _filter_only above has already narrowed cases
-    # to the user's selection, so any convergence case still present here
-    # was selected on purpose. Listing (`-l`) shows all cases regardless.
-    if not ARG("only") and not ARG("list"):
-        convergence_cases = [c for c in cases if getattr(c, "kind", "golden") == "convergence"]
-        if convergence_cases:
+    # unless the user explicitly opted in via --only "Convergence" or a
+    # specific convergence UUID. A label like --only "2D" must not
+    # accidentally pull in "Convergence -> 2D -> ..." cases.
+    if not ARG("list"):
+
+        def is_uuid(term):
+            return len(term) == 8 and all(c in "0123456789abcdefABCDEF" for c in term)
+
+        only_terms = ARG("only")
+        only_labels = [t for t in only_terms if not is_uuid(t)]
+        only_uuids = [t for t in only_terms if is_uuid(t)]
+
+        convergence_uuids = {c.get_uuid() for c in cases if getattr(c, "kind", "golden") == "convergence"}
+        user_wants_convergence = "Convergence" in only_labels or any(u in convergence_uuids for u in only_uuids)
+
+        if not user_wants_convergence:
+            convergence_cases = [c for c in cases if getattr(c, "kind", "golden") == "convergence"]
             for c in convergence_cases:
                 cases.remove(c)
                 skipped_cases.append(c)
diff --git a/toolchain/modules b/toolchain/modules
index ea7cb36393..d424203666 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -116,3 +116,12 @@ h-gpu NVCOMPILER_COMM_LIBS_HOME=/apps/compilers/nvhpc/25.9/Linux_x86_64/25.9/com
 t     WPI Turing
 t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
+
+crnch     GT CRNCH RoboGator (Intel GPU Max 1100, Ponte Vecchio)
+crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin/mpiifx
+crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin:${PATH}
+crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0
+crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14
+crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LD_LIBRARY_PATH}
+crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH}
+crnch-gpu I_MPI_FABRICS=shm
diff --git a/toolchain/patches/fypp-linemarker-resync.patch b/toolchain/patches/fypp-linemarker-resync.patch
index 6f6dfadf7f..d8c0c829a5 100644
--- a/toolchain/patches/fypp-linemarker-resync.patch
+++ b/toolchain/patches/fypp-linemarker-resync.patch
@@ -1,25 +1,26 @@
---- a/fypp.py
-+++ b/fypp.py
-@@ -1842,11 +1842,16 @@ class _Renderer:
-             if self._linenums:
-                 # Last line was folded, but no linenums were generated for
-                 # the continuation lines -> current line position is not
-                 # in sync with the one calculated from the last line number
-                 unsync = (
-                     len(foldedlines) and len(foldedlines[-1]) > 1
+--- a/fypp.py	2026-05-14 19:44:34.158817311 -0400
++++ b/fypp.py	2026-05-14 19:44:34.188817564 -0400
+@@ -1848,12 +1848,17 @@
                      and not self._contlinenums)
                  # Eval directive in source consists of more than one line
                  multiline = span[1] - span[0] > 1
 -                if unsync or multiline:
+-                    # For inline eval directives span[0] == span[1]
+-                    # -> next line is span[0] + 1 and not span[1] as for
+-                    # line eval directives
+-                    nextline = max(span[1], span[0] + 1)
+-                    trailing += self._linenumdir(nextline, fname)
 +                # Always emit a resync marker after a $: call. Without this,
 +                # single-line $: calls that expand to multi-line #if/#endif
 +                # blocks (e.g. GPU_PARALLEL_LOOP) cause the compiler to
 +                # attribute the next Fortran statement to the call-site line
 +                # rather than the following source line, producing off-by-1
 +                # errors in backtraces and debugger line info.
-+                if unsync or multiline or True:
-                     # For inline eval directives span[0] == span[1]
-                     # -> next line is span[0] + 1 and not span[1] as for
-                     # line eval directives
-                     nextline = max(span[1], span[0] + 1)
-                     trailing += self._linenumdir(nextline, fname)
++                # For inline eval directives span[0] == span[1]
++                # -> next line is span[0] + 1 and not span[1] as for
++                # line eval directives
++                nextline = max(span[1], span[0] + 1)
++                trailing += self._linenumdir(nextline, fname)
+         else:
+             trailing = ''
+         return result + trailing
diff --git a/toolchain/templates/include/helpers.mako b/toolchain/templates/include/helpers.mako
index a9493f71d0..5c729827c5 100644
--- a/toolchain/templates/include/helpers.mako
+++ b/toolchain/templates/include/helpers.mako
@@ -32,6 +32,15 @@ END
         export ${key}='${value}'
     % endfor
 
+    % if gpu_mp:
+        # ifx + Level Zero: the OpenMP SPIR-V JIT runs on the main thread stack at
+        # startup, consuming 2-4 MB before user code runs. ifx also stack-allocates
+        # compiler-generated temporaries with no size threshold (unlike gfortran's
+        # 32 KB default). Together they overflow the default 12.5 MB stack for any
+        # non-trivial 3D case. Removing the limit avoids SIGSEGV at launch.
+        ulimit -s unlimited
+    % endif
+
     t_start=$(date +%s)
 % else:
     echo MFC case # ${name} @ ${input}:

From 8d2c6b173ba69762186bd47d57c790b0820335e8 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 10:15:13 -0400
Subject: [PATCH 15/30] fix: replace integer kind literals with real literals
 in m_fftw.fpp

---
 src/simulation/m_fftw.fpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 59fb741aef..e5bd89bd9e 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -168,7 +168,7 @@ contains
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, cmplx_size
-                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0._dp, 0._dp)
                 end do
             end do
         end do
@@ -224,7 +224,7 @@ contains
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, cmplx_size
-                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0._dp, 0._dp)
                     end do
                 end do
             end do
@@ -243,7 +243,7 @@ contains
             $:GPU_MKL_DISPATCH()
             ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
 
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            Nfq = min(floor(2._dp*real(i, dp)*pi), cmplx_size)
             $:GPU_UPDATE(device='[Nfq]')
 
             $:GPU_PARALLEL_LOOP(collapse=3)
@@ -277,7 +277,7 @@ contains
         do k = 1, sys_size
             do j = 0, m
                 do l = 1, cmplx_size
-                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0._dp, 0._dp)
                 end do
             end do
         end do
@@ -344,7 +344,7 @@ contains
             do k = 1, sys_size
                 do j = 0, m
                     do l = 1, cmplx_size
-                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0._dp, 0._dp)
                     end do
                 end do
             end do
@@ -369,7 +369,7 @@ contains
 #endif
             #:endcall GPU_HOST_DATA
 
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            Nfq = min(floor(2._dp*real(i, dp)*pi), cmplx_size)
             $:GPU_UPDATE(device='[Nfq]')
 
             $:GPU_PARALLEL_LOOP(collapse=3)
@@ -408,7 +408,7 @@ contains
         Nfq = 3
         do j = 0, m
             do k = 1, sys_size
-                data_fltr_cmplx(:) = (0_dp, 0_dp)
+                data_fltr_cmplx(:) = (0._dp, 0._dp)
                 data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0,0:p)
                 call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
                 data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
@@ -420,10 +420,10 @@ contains
 
         ! Apply Fourier filter to additional rings
         do i = 1, fourier_rings
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            Nfq = min(floor(2._dp*real(i, dp)*pi), cmplx_size)
             do j = 0, m
                 do k = 1, sys_size
-                    data_fltr_cmplx(:) = (0_dp, 0_dp)
+                    data_fltr_cmplx(:) = (0._dp, 0._dp)
                     data_real(1:p + 1) = q_cons_vf(k)%sf(j, i,0:p)
                     call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
                     data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)

From e5728fbca85e7cdd916205e5864d73eb276b487b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 10:48:30 -0400
Subject: [PATCH 16/30] fix: extend VLA guards from USING_AMD to (USING_AMD or
 USING_INTEL) in m_qbmm and m_hyperelastic

---
 src/simulation/m_hyperelastic.fpp | 2 +-
 src/simulation/m_qbmm.fpp         | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 8b0a7ad0dc..0a735697bd 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -77,7 +77,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
 
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: tensora, tensorb
         #:else
             real(wp), dimension(tensor_size) :: tensora, tensorb
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 88d8b3c449..16c5429c92 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -587,7 +587,7 @@ contains
         $:GPU_ROUTINE(function_name='s_coeff_nonpoly',parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(32,0:2,0:2), intent(out) :: coeffs
         #:else
             real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeffs
@@ -666,7 +666,7 @@ contains
         $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(32,0:2,0:2), intent(out) :: coeffs
         #:else
             real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeffs
@@ -748,7 +748,7 @@ contains
             real(wp), dimension(nmom)      :: moms, msum
             real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
         #:endif
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(32,0:2,0:2) :: coeff
         #:else
             real(wp), dimension(nterms,0:2,0:2) :: coeff
@@ -930,7 +930,7 @@ contains
 
             $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', cray_inline=True)
             real(wp), intent(in) :: pres, rho, c
-            #:if USING_AMD
+            #:if (USING_AMD or USING_INTEL)
                 real(wp), dimension(32,0:2,0:2), intent(out) :: coeff
             #:else
                 real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeff

From 0534d69a02446f282b894b3ed9a942678e957be2 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 11:15:09 -0400
Subject: [PATCH 17/30] fix: use shm:ofi + FI_PROVIDER=tcp for Intel MPI on
 crnch-gpu (tcp fabric removed in 2021.x)

---
 toolchain/modules | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/toolchain/modules b/toolchain/modules
index d424203666..c43dd34cdd 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -124,4 +124,5 @@ crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025
 crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14
 crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LD_LIBRARY_PATH}
 crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH}
-crnch-gpu I_MPI_FABRICS=shm
+crnch-gpu I_MPI_FABRICS=shm:ofi
+crnch-gpu FI_PROVIDER=tcp

From 88630228c2b6ccd2975817ffc1d47b07648ea2e1 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 11:33:44 -0400
Subject: [PATCH 18/30] docs: document Intel MPI multi-node SSH bootstrap
 workaround for missing bundled ssh

---
 docs/documentation/intel-gpu-max.md | 35 +++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
index 576e68f6a7..3a04cf2561 100644
--- a/docs/documentation/intel-gpu-max.md
+++ b/docs/documentation/intel-gpu-max.md
@@ -253,6 +253,41 @@ To run a case anyway (testing code correctness on CPU fallback), invoke
 `pre_process` and `simulation` directly from their install paths, bypassing
 the `./mfc.sh run` wrapper that calls `syscheck` first.
 
+### Multi-node MPI with Intel MPI 2021.x
+
+Intel MPI 2021.x expects a bundled `ssh` binary at `$I_MPI_ROOT/bin/ssh` that
+understands an `--external-launcher` flag used by hydra bootstrap. This binary
+is missing from some oneAPI installations, causing SSH bootstrap to fail with
+`unknown option -- -`.
+
+Workaround: create a wrapper that strips the Intel-specific flag:
+
+```bash
+mkdir -p ~/bin
+cat > ~/bin/ssh << 'EOF'
+#!/bin/bash
+args=(-q -o StrictHostKeyChecking=yes -o BatchMode=yes)
+for arg in "$@"; do
+    [[ "$arg" == "--external-launcher" ]] && continue
+    [[ "$arg" == "--" ]] && break
+    args+=("$arg")
+done
+exec /usr/bin/ssh "${args[@]}"
+EOF
+chmod +x ~/bin/ssh
+```
+
+Then run with:
+```bash
+PATH=$HOME/bin:$PATH \
+I_MPI_HYDRA_BOOTSTRAP=rsh \
+I_MPI_HYDRA_BOOTSTRAP_EXEC=$HOME/bin/ssh \
+mpirun -n <ranks> -hosts <node1>,<node2> ./simulation
+```
+
+Nodes must have passwordless SSH from the launch node and no `pam_slurm_adopt`
+blocking. Suppress the SSH login banner on remote nodes with `touch ~/.hushlogin`.
+
 ### `libumf.so.1` not found at runtime
 The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`.
 If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports

From faa9bbb12de24ce817a81840486abf37352bdb12 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 16:23:24 -0400
Subject: [PATCH 19/30] fix: add FI_PROVIDER_PATH to crnch-gpu modules;
 document SLURM GRES and OFI provider requirements

---
 docs/documentation/intel-gpu-max.md | 21 +++++++++++++++++++++
 toolchain/modules                   |  3 ++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
index 3a04cf2561..8ceeb6936a 100644
--- a/docs/documentation/intel-gpu-max.md
+++ b/docs/documentation/intel-gpu-max.md
@@ -288,6 +288,27 @@ mpirun -n <ranks> -hosts <node1>,<node2> ./simulation
 Nodes must have passwordless SSH from the launch node and no `pam_slurm_adopt`
 blocking. Suppress the SSH login banner on remote nodes with `touch ~/.hushlogin`.
 
+**OFI provider path**: Intel MPI 2021.x ships its own libfabric providers in
+`$I_MPI_ROOT/libfabric/lib/prov/`. The system libfabric may not include the tcp
+or shm providers. Always set:
+
+```bash
+export FI_PROVIDER_PATH=$I_MPI_ROOT/libfabric/lib/prov
+```
+
+Without this, `PMPI_Init` aborts with `OFI fi_getinfo() failed: No data available`.
+This is handled automatically by `source ./mfc.sh load -c crnch -m gpu`.
+
+**SLURM GPU access**: on SLURM-managed Intel GPU nodes, processes outside a SLURM
+allocation cannot open `/dev/dri/renderD128`. Always request the GPU resource:
+
+```bash
+#SBATCH --gres=gpu:max_1100:1   # Intel GPU Max 1100
+```
+
+Without `--gres`, `omp_get_num_devices()` returns 0 and the process aborts with
+integer divide-by-zero in `s_initialize_mpi_domain` (rank % num_devices with 0 devices).
+
 ### `libumf.so.1` not found at runtime
 The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`.
 If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports
diff --git a/toolchain/modules b/toolchain/modules
index c43dd34cdd..e12ace3502 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -122,7 +122,8 @@ crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/b
 crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin:${PATH}
 crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0
 crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14
-crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LD_LIBRARY_PATH}
+crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/libfabric/lib:${LD_LIBRARY_PATH}
 crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH}
 crnch-gpu I_MPI_FABRICS=shm:ofi
 crnch-gpu FI_PROVIDER=tcp
+crnch-gpu FI_PROVIDER_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/libfabric/lib/prov

From c296e30844219420e1473545f5e80fb3c3f1c64d Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 17:34:20 -0400
Subject: [PATCH 20/30] docs: document inter-node MPI fix (FI_TCP_IFACE) and
 dash3 renderD128 permission issue

---
 docs/documentation/intel-gpu-max.md | 68 +++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
index 8ceeb6936a..c18fbe7eb2 100644
--- a/docs/documentation/intel-gpu-max.md
+++ b/docs/documentation/intel-gpu-max.md
@@ -309,6 +309,74 @@ allocation cannot open `/dev/dri/renderD128`. Always request the GPU resource:
 Without `--gres`, `omp_get_num_devices()` returns 0 and the process aborts with
 integer divide-by-zero in `s_initialize_mpi_domain` (rank % num_devices with 0 devices).
 
+**Per-node renderD128 permissions on CRNCH**: `dash4` has `renderD128` as
+`crwxrwxrwx` (world-accessible), but `dash3` has `crw-rw----` (render group only).
+`--gres=gpu:max_1100:1` does NOT grant cgroup access on dash3 with the current
+SLURM configuration; `omp_get_num_devices()` returns 0 on dash3 even within a
+SLURM GPU allocation. Contact the CRNCH admin to either fix the device permissions
+on dash3 or configure SLURM device cgroups to grant renderD128 access for GPU jobs.
+Until fixed, 2-node GPU simulation is not possible using dash3+dash4.
+
+**Inter-node MPI: FI_TCP_IFACE must be set dynamically**: The CRNCH dash nodes
+have multiple network interfaces (high-speed 10GbE at `10.10.10.x`, public 1GbE
+at `143.215.138.x/25`). Intel MPI's OFI tcp provider selects the highest-speed
+interface by default. On dash3, this picks `enp200s0f1np1` (10.10.10.32), which
+has no corresponding active interface on dash4. This causes the inter-node MPI
+broadcast to hang silently after `MPI_Init` succeeds.
+
+Fix: set `FI_TCP_IFACE` to the name of the interface with the public IP (which
+is accessible from all nodes). The interface name differs per node, so set it
+dynamically in each rank's startup script:
+
+```bash
+IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}')
+export FI_TCP_IFACE="${IFACE}"
+```
+
+This selects `enp3s0f0` on dash3 and `enp3s0f0np0` on dash4. Combined with
+`srun --mpi=pmi2` for SLURM-native MPI bootstrap (avoiding Intel MPI hydra/SSH),
+this enables successful inter-node MPI communication.
+
+**Recommended 2-node run script pattern** (for when dash3's GPU access is fixed):
+
+```bash
+#!/bin/bash
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=1
+#SBATCH -p rg-nextgen-hpc
+#SBATCH -w dash3,dash4
+#SBATCH --gres=gpu:max_1100:1
+#SBATCH --time=01:00:00
+
+INTEL=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1
+export PATH=${INTEL}/compiler/2025.0/bin:${INTEL}/mpi/2021.14/bin:${PATH}
+export LD_LIBRARY_PATH=${INTEL}/mkl/2025.0/lib:${INTEL}/compiler/2025.0/lib:${INTEL}/2025.0/lib:${INTEL}/mpi/2021.14/lib:${INTEL}/mpi/2021.14/libfabric/lib:${LD_LIBRARY_PATH}
+export FI_PROVIDER_PATH=${INTEL}/mpi/2021.14/libfabric/lib/prov
+export I_MPI_FABRICS="shm:ofi"
+export FI_PROVIDER=tcp
+
+cd /path/to/case
+
+# Step 1: pre-process
+WRAP_SCRIPT=$(mktemp)
+cat > "$WRAP_SCRIPT" << 'EOF'
+IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}')
+export FI_TCP_IFACE="$IFACE"
+exec /path/to/build/install/<hash>/bin/pre_process
+EOF
+chmod +x "$WRAP_SCRIPT"
+srun --mpi=pmi2 -n 2 --ntasks-per-node=1 "$WRAP_SCRIPT"
+
+# Step 2: simulation
+cat > "$WRAP_SCRIPT" << 'EOF'
+IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}')
+export FI_TCP_IFACE="$IFACE"
+exec /path/to/build/install/<hash>/bin/simulation
+EOF
+srun --mpi=pmi2 -n 2 --ntasks-per-node=1 "$WRAP_SCRIPT"
+rm "$WRAP_SCRIPT"
+```
+
 ### `libumf.so.1` not found at runtime
 The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`.
 If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports

From a4de5f2ca405d3f5083dceaa8b31df74bf9e217f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 18 May 2026 22:54:05 -0500
Subject: [PATCH 21/30] fix: add ACES cluster support and ifx SPIR64 ICE
 workarounds for Intel GPU build

Add TAMU HPRC ACES (Intel GPU Max 1100/Ponte Vecchio) cluster entry with
iimpi/2023b + imkl/2023.2.0 modules. Fix three CMake issues needed for the
ifx SPIR-V GPU build:

1. clang-offload-bundler: add bin-llvm/ hint (Intel 2023.x path)
2. MKL SYCL DFT lib: add mkl_sycl fallback name and lib/intel64 hint for
   MKL < 2024 which ships a monolithic mkl_sycl.so instead of mkl_sycl_dft.so
3. ifx 2023.2 SPIR-V backend ICEs: two root causes hit during compilation:
   (a) error #5623 - module-level ! declare target derived-type arrays
       with pointer members accessed via inner sequential loop indices inside
       target regions generate invalid LLVM IR (dominance violation)
   (b) errors #5623/#5633 - complex GPU kernels with ghost_point /
       ib_patch_parameters struct mapping + declare target (seq) routines
       crash the SPIR-V lowering pass
   Workaround: -UMFC_OpenMP per-source flag suppresses #ifdef MFC_OpenMP
   target directives so m_ib_patches, m_surface_tension, m_igr, and
   m_compute_levelset compile CPU-only (all are init or specialized solvers
   called from CPU context, not the hot-path fluid solver kernels).
   m_rhs and m_time_steppers use -O0 to attempt to preserve GPU offload.
---
 CMakeLists.txt                 | 38 +++++++++++++++++++++++++---------
 toolchain/bootstrap/modules.sh |  1 +
 toolchain/modules              |  5 +++++
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c8e649da84..ddb3755550 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -629,7 +629,7 @@ exit 0
                     cmake_path(GET _IFX_REAL PARENT_PATH _ifx_bin)
                     find_program(CLANG_OFFLOAD_BUNDLER
                         NAMES clang-offload-bundler
-                        HINTS "${_ifx_bin}/compiler" "${_ifx_bin}"
+                        HINTS "${_ifx_bin}/compiler" "${_ifx_bin}" "${_ifx_bin}-llvm"
                         REQUIRED)
                     add_custom_command(
                         OUTPUT  "${_mkl_omp_obj}"
@@ -665,9 +665,11 @@ exit 0
                     target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj_host}")
                     # Link MKL threading + core + SYCL DFT backend
                     target_link_options(${a_target} PRIVATE -qmkl=parallel)
-                    find_library(MKL_SYCL_DFT mkl_sycl_dft HINTS "$ENV{MKLROOT}/lib" REQUIRED)
-                    find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH REQUIRED)
-                    find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH REQUIRED)
+                    # mkl_sycl_dft is the name in MKL >= 2023.2; older versions use monolithic mkl_sycl
+                    find_library(MKL_SYCL_DFT NAMES mkl_sycl_dft mkl_sycl
+                        HINTS "$ENV{MKLROOT}/lib" "$ENV{MKLROOT}/lib/intel64" REQUIRED)
+                    find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
+                    find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
                     target_link_libraries(${a_target} PRIVATE ${MKL_SYCL_DFT} ${SYCL_LIB} ${OPENCL_LIB})
                 else()
                     find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
@@ -858,15 +860,31 @@ if (MFC_SIMULATION)
             target_compile_options(simulation PRIVATE -Oipa0)
         endif()
     endif()
-    # ifx SPIR64 ICE: the LLVM inliner pulls !$omp declare target (seq) geometry
-    # routines into target teams loop kernels and generates SPIR-V IR that crashes
-    # llvm-spirv at O1+. -fno-inline keeps them as proper device-side calls.
-    # Each GPU loop calls exactly one geometry routine (split-loop pattern in
-    # m_compute_levelset.fpp), so device-call overhead is small. See PR intel-gpu.
+    # ifx SPIR64 ICEs: multiple patterns in the ifx 2023.2 SPIR-V backend crash
+    # during GPU offload compilation. Two root causes observed:
+    #   (a) Module-level !$omp declare target derived-type arrays with pointer
+    #       members accessed inside inner sequential loops in target regions →
+    #       "Instruction does not dominate all uses!" (error #5623)
+    #   (b) Complex GPU kernels with ghost_point / ib_patch_parameters struct
+    #       mapping + !$omp declare target (seq) geometry routines → segfault
+    #       in SPIR-V lowering (errors #5623/#5633)
+    # -UMFC_OpenMP suppresses all #ifdef MFC_OpenMP target directives so those
+    # files compile as CPU-only. All affected files are either init-only or
+    # specialized solvers called from CPU context, so correctness is preserved.
+    # For m_rhs/m_time_steppers (the hot-path GPU kernels), -O0 is tried first;
+    # if they still ICE they will need -UMFC_OpenMP too.
     if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
         set_source_files_properties(
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_ib_patches.fpp.f90"
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_surface_tension.fpp.f90"
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_igr.fpp.f90"
             "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
-            PROPERTIES COMPILE_OPTIONS "-fno-inline"
+            PROPERTIES COMPILE_FLAGS "-UMFC_OpenMP"
+        )
+        set_source_files_properties(
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_rhs.fpp.f90"
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_time_steppers.fpp.f90"
+            PROPERTIES COMPILE_OPTIONS "-O0"
         )
     endif()
 endif()
diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh
index 119bff39fb..6dc9c0e6e9 100644
--- a/toolchain/bootstrap/modules.sh
+++ b/toolchain/bootstrap/modules.sh
@@ -49,6 +49,7 @@ if [ -v $u_c ]; then
     log   "$OR""Florida$W: HiPerGator (h)"
     log   "$C""WPI $W:   Turing   (t)"
     log   "$Y""Gatech$W:  CRNCH RoboGator (crnch)"
+    log   "$C""TAMU$W:    ACES (aces)"
     log_n "($G""a$W/$G""f$W/$G""s$W/$G""w$W/$B""tuo$W/$C""b$W/$C""e$CR/$C""d/$C""dai$CR/$Y""p$CR/$R""r$CR/$B""cc$CR/$B""c$CR/$B""n$CR/$BR""o$CR/$BR""pa"$CR"/$OR""h"$CR/$C""t""$CR"): "
     read u_c
     log
diff --git a/toolchain/modules b/toolchain/modules
index e12ace3502..6b0b6809a5 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -117,6 +117,11 @@ t     WPI Turing
 t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
 
+aces     TAMU HPRC ACES (Intel GPU Max 1100, Ponte Vecchio)
+aces-all iimpi/2023b imkl/2023.2.0 CMake/3.27.6 Python/3.11.5
+aces-gpu I_MPI_F90=ifx
+aces-gpu FC=mpif90
+
 crnch     GT CRNCH RoboGator (Intel GPU Max 1100, Ponte Vecchio)
 crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin/mpiifx
 crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin:${PATH}

From 6940725c9a4f0c3866f88640fe06f09a2b95ba21 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 07:37:07 -0500
Subject: [PATCH 22/30] fix: remove -O0 workaround for m_rhs/m_time_steppers;
 verified -O3 compiles clean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Direct compilation tests on PVC node confirmed that m_rhs.fpp.f90 and
m_time_steppers.fpp.f90 both compile without ICE at -O3.  The -O0 fallback
was applied preemptively based on code-pattern analysis but was never
actually needed — the build had been blocked by the four CPU-fallback files
(m_ib_patches, m_surface_tension, m_igr, m_compute_levelset), and once those
were fixed the hot-path GPU kernels compiled at full optimization.

All simulation GPU kernels now compile at -O3 with no per-file flag hacks.
---
 CMakeLists.txt | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ddb3755550..538e6e1320 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -871,8 +871,8 @@ if (MFC_SIMULATION)
     # -UMFC_OpenMP suppresses all #ifdef MFC_OpenMP target directives so those
     # files compile as CPU-only. All affected files are either init-only or
     # specialized solvers called from CPU context, so correctness is preserved.
-    # For m_rhs/m_time_steppers (the hot-path GPU kernels), -O0 is tried first;
-    # if they still ICE they will need -UMFC_OpenMP too.
+    # m_rhs and m_time_steppers were tested directly at -O3 and compile cleanly;
+    # no per-file workaround is needed for the hot-path GPU kernels.
     if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
         set_source_files_properties(
             "${CMAKE_BINARY_DIR}/fypp/simulation/m_ib_patches.fpp.f90"
@@ -881,11 +881,6 @@ if (MFC_SIMULATION)
             "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
             PROPERTIES COMPILE_FLAGS "-UMFC_OpenMP"
         )
-        set_source_files_properties(
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_rhs.fpp.f90"
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_time_steppers.fpp.f90"
-            PROPERTIES COMPILE_OPTIONS "-O0"
-        )
     endif()
 endif()
 

From b874530d8c4e783add052a7cde4ad81a765db310 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 07:49:53 -0500
Subject: [PATCH 23/30] fix: upgrade ACES to iimpi/2025a (ifx 2025.1.1); remove
 ifx 2023.2 SPIR-V ICE workarounds

---
 CMakeLists.txt    | 22 ----------------------
 toolchain/modules |  2 +-
 2 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 538e6e1320..a03ce41afe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -860,28 +860,6 @@ if (MFC_SIMULATION)
             target_compile_options(simulation PRIVATE -Oipa0)
         endif()
     endif()
-    # ifx SPIR64 ICEs: multiple patterns in the ifx 2023.2 SPIR-V backend crash
-    # during GPU offload compilation. Two root causes observed:
-    #   (a) Module-level !$omp declare target derived-type arrays with pointer
-    #       members accessed inside inner sequential loops in target regions →
-    #       "Instruction does not dominate all uses!" (error #5623)
-    #   (b) Complex GPU kernels with ghost_point / ib_patch_parameters struct
-    #       mapping + !$omp declare target (seq) geometry routines → segfault
-    #       in SPIR-V lowering (errors #5623/#5633)
-    # -UMFC_OpenMP suppresses all #ifdef MFC_OpenMP target directives so those
-    # files compile as CPU-only. All affected files are either init-only or
-    # specialized solvers called from CPU context, so correctness is preserved.
-    # m_rhs and m_time_steppers were tested directly at -O3 and compile cleanly;
-    # no per-file workaround is needed for the hot-path GPU kernels.
-    if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
-        set_source_files_properties(
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_ib_patches.fpp.f90"
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_surface_tension.fpp.f90"
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_igr.fpp.f90"
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
-            PROPERTIES COMPILE_FLAGS "-UMFC_OpenMP"
-        )
-    endif()
 endif()
 
 if (MFC_POST_PROCESS)
diff --git a/toolchain/modules b/toolchain/modules
index 6b0b6809a5..09ed25f401 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -118,7 +118,7 @@ t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
 
 aces     TAMU HPRC ACES (Intel GPU Max 1100, Ponte Vecchio)
-aces-all iimpi/2023b imkl/2023.2.0 CMake/3.27.6 Python/3.11.5
+aces-all iimpi/2025a imkl/2025.1.0 CMake/3.27.6 Python/3.11.5
 aces-gpu I_MPI_F90=ifx
 aces-gpu FC=mpif90
 

From 577f93f1a7f200d39133c10502dc7f97a07dc19d Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 07:54:11 -0500
Subject: [PATCH 24/30] fix: use CMake/3.31.3 and Python/3.13.1 with
 iimpi/2025a on ACES (module hierarchy requires these versions)

---
 toolchain/modules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolchain/modules b/toolchain/modules
index 09ed25f401..1240772c94 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -118,7 +118,7 @@ t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
 
 aces     TAMU HPRC ACES (Intel GPU Max 1100, Ponte Vecchio)
-aces-all iimpi/2025a imkl/2025.1.0 CMake/3.27.6 Python/3.11.5
+aces-all iimpi/2025a imkl/2025.1.0 CMake/3.31.3 Python/3.13.1
 aces-gpu I_MPI_F90=ifx
 aces-gpu FC=mpif90
 

From 007e84df408d1ea0be804676e1300f1f3f6b09ef Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 08:45:19 -0500
Subject: [PATCH 25/30] fix: suppress inlining for m_compute_levelset on ifx
 IntelLLVM+OpenMP to avoid SPIR-V #5633 ICE

---
 CMakeLists.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a03ce41afe..8eb6d4b34a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -860,6 +860,17 @@ if (MFC_SIMULATION)
             target_compile_options(simulation PRIVATE -Oipa0)
         endif()
     endif()
+    # ifx 2025.1.1 SPIR-V ICE (#5633) on m_compute_levelset: the backend segfaults
+    # when declare-target geometry subroutines (s_sphere_levelset etc.) are inlined
+    # into the target teams loop region. -fno-inline prevents the inlining; the calls
+    # are resolved at SPIR-V link time via !$omp declare target, so GPU execution is
+    # preserved. Tested: -O1/-O2/-O3 all ICE; only -O3 -fno-inline compiles cleanly.
+    if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
+        set_source_files_properties(
+            "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
+            PROPERTIES COMPILE_FLAGS "-fno-inline"
+        )
+    endif()
 endif()
 
 if (MFC_POST_PROCESS)

From 94e4d4b0eca26d69c440f1ede1c1f978b98b83d1 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 09:16:14 -0500
Subject: [PATCH 26/30] test: add ifx #5633 ICE reproducers (matmul in
 declare-target sub)

m_ice_min.f90: 60-line minimum reproducer
  - matmul() inside ! declare target sub called from ! target teams loop
  - ICEs at O1/O2/O3; passes with -fno-inline
  - manual loops (no matmul intrinsic) compile fine, confirming matmul is trigger

m_ice_repro.f90: structured reproducer matching real MFC m_compute_levelset
  - derived-type struct, allocatable module arrays, 10 separate target loops
  - same ICE pattern; confirms -fno-inline per-file workaround

Bisection scripts (run_cl5b.sh, run_cl5c.sh) document the investigation:
  char field, interp_coeffs, loop count all ruled out
  matmul with struct-member or local matrix confirmed as trigger
---
 mini/m_cl4_mini.f90   | 405 ++++++++++++++++++++++++++++++++++++++++++
 mini/m_cl5_mini.f90   | 304 +++++++++++++++++++++++++++++++
 mini/m_ice_min.f90    |  61 +++++++
 mini/m_ice_repro.f90  | 125 +++++++++++++
 mini/run_cl4.sh       |  48 +++++
 mini/run_cl5.sh       |  46 +++++
 mini/run_cl5b.sh      |  93 ++++++++++
 mini/run_cl5c.sh      | 116 ++++++++++++
 mini/run_ice_min.sh   |  68 +++++++
 mini/run_ice_repro.sh |  75 ++++++++
 10 files changed, 1341 insertions(+)
 create mode 100644 mini/m_cl4_mini.f90
 create mode 100644 mini/m_cl5_mini.f90
 create mode 100644 mini/m_ice_min.f90
 create mode 100644 mini/m_ice_repro.f90
 create mode 100755 mini/run_cl4.sh
 create mode 100755 mini/run_cl5.sh
 create mode 100755 mini/run_cl5b.sh
 create mode 100755 mini/run_cl5c.sh
 create mode 100755 mini/run_ice_min.sh
 create mode 100755 mini/run_ice_repro.sh

diff --git a/mini/m_cl4_mini.f90 b/mini/m_cl4_mini.f90
new file mode 100644
index 0000000000..5d3dd75cea
--- /dev/null
+++ b/mini/m_cl4_mini.f90
@@ -0,0 +1,405 @@
+! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! Pattern: single module with 10+ !$omp target teams loop regions, each calling
+! a different !$omp declare target subroutine defined in the SAME module.
+! The subroutines access a derived type that contains a character field
+! (ib_patch_parameters%model_filepath) and 3x3 real arrays (rotation_matrix).
+!
+! Compile (ICE expected at -O3, pass expected at -O3 -fno-inline):
+!   mpif90 -free -fiopenmp -fopenmp-targets=spir64 -fp-model=precise \
+!          -march=native -mno-avx512fp16 -O3 -c m_cl4_mini.f90
+!
+! Fix:
+!   mpif90 ... -O3 -fno-inline -c m_cl4_mini.f90
+
+module m_cl4_mini
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+    integer, parameter :: PATHLEN = 200
+    integer, parameter :: MAX_PATCHES = 10
+
+    ! Replicated from m_derived_types.fpp :: ib_patch_parameters
+    type :: ib_patch_parameters
+        integer  :: geometry
+        real(wp) :: x_centroid, y_centroid, z_centroid
+        real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid
+        real(wp), dimension(1:3)     :: centroid_offset
+        real(wp), dimension(1:3)     :: angles, step_angles
+        real(wp), dimension(1:3,1:3) :: rotation_matrix
+        real(wp), dimension(1:3,1:3) :: rotation_matrix_inverse
+        real(wp) :: c, p, t, m
+        real(wp) :: length_x, length_y, length_z
+        real(wp) :: radius, theta
+        logical  :: slip
+        character(LEN=PATHLEN) :: model_filepath  ! <-- char field; triggers SPIR-V bug
+        real(wp), dimension(1:3) :: model_translate
+        real(wp), dimension(1:3) :: model_scale
+        real(wp), dimension(1:3) :: model_rotate
+        integer  :: model_spc
+        real(wp) :: model_threshold
+        integer  :: moving_ibm
+        real(wp) :: mass, moment
+        real(wp), dimension(1:3) :: force, torque
+        real(wp), dimension(1:3) :: vel, step_vel
+        real(wp), dimension(1:3) :: angular_vel, step_angular_vel
+    end type ib_patch_parameters
+
+    ! Replicated from m_derived_types.fpp :: ghost_point
+    type :: ghost_point
+        integer, dimension(3)        :: loc
+        real(wp), dimension(3)       :: ip_loc
+        integer, dimension(3)        :: ip_grid
+        real(wp), dimension(2, 2, 2) :: interp_coeffs  ! 3-D array in struct
+        integer                      :: ib_patch_id
+        real(wp)                     :: levelset
+        real(wp), dimension(1:3)     :: levelset_norm
+        logical                      :: slip
+        integer, dimension(3)        :: DB
+        integer                      :: x_periodicity, y_periodicity, z_periodicity
+    end type ghost_point
+
+    type :: bounds_info
+        real(wp) :: beg, end
+    end type bounds_info
+
+    ! Module-level variables accessed by the declare-target subroutines
+    type(ib_patch_parameters), dimension(MAX_PATCHES) :: patch_ib
+    real(wp), allocatable :: x_cc(:), y_cc(:), z_cc(:)
+    type(bounds_info) :: x_domain, y_domain, z_domain
+    integer :: num_ibs
+
+    !$omp declare target(patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs)
+
+    public :: ghost_point, s_cl4_dispatch, s_cl4_init, s_cl4_finalize
+
+contains
+
+    ! ------------------------------------------------------------------ init --
+
+    subroutine s_cl4_init(nx, nz)
+        integer, intent(in) :: nx, nz
+        integer :: i
+        allocate(x_cc(nx), y_cc(nx), z_cc(nz))
+        do i = 1, nx
+            x_cc(i) = real(i, wp) * 0.1_wp
+            y_cc(i) = real(i, wp) * 0.1_wp
+        end do
+        do i = 1, nz
+            z_cc(i) = real(i, wp) * 0.1_wp
+        end do
+        x_domain%beg = 0._wp; x_domain%end = real(nx, wp)*0.1_wp
+        y_domain%beg = 0._wp; y_domain%end = real(nx, wp)*0.1_wp
+        z_domain%beg = 0._wp; z_domain%end = real(nz, wp)*0.1_wp
+        num_ibs = 1
+        do i = 1, MAX_PATCHES
+            patch_ib(i)%geometry   = i
+            patch_ib(i)%x_centroid = 0.5_wp
+            patch_ib(i)%y_centroid = 0.5_wp
+            patch_ib(i)%z_centroid = 0.5_wp
+            patch_ib(i)%radius     = 0.2_wp
+            patch_ib(i)%length_x   = 0.4_wp
+            patch_ib(i)%length_y   = 0.4_wp
+            patch_ib(i)%length_z   = 0.4_wp
+            patch_ib(i)%rotation_matrix         = 0._wp
+            patch_ib(i)%rotation_matrix(1,1)    = 1._wp
+            patch_ib(i)%rotation_matrix(2,2)    = 1._wp
+            patch_ib(i)%rotation_matrix(3,3)    = 1._wp
+            patch_ib(i)%rotation_matrix_inverse = patch_ib(i)%rotation_matrix
+            patch_ib(i)%centroid_offset         = 0._wp
+            patch_ib(i)%model_filepath          = 'none'
+        end do
+        !$omp target enter data map(to: patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs)
+    end subroutine s_cl4_init
+
+    subroutine s_cl4_finalize()
+        !$omp target exit data map(delete: patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs)
+        deallocate(x_cc, y_cc, z_cc)
+    end subroutine s_cl4_finalize
+
+    ! ------------------------------------------------------- dispatch (ICE site) --
+
+    !> 10 separate target teams loop regions, each calling a different
+    !! !$omp declare target subroutine defined in THIS module.
+    subroutine s_cl4_dispatch(gps, num_gps)
+        type(ghost_point), dimension(:), intent(inout) :: gps
+        integer, intent(in) :: num_gps
+        integer :: i, patch_id
+
+        ! 3D geometry loops (5 loops, geometries 8-12)
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 8) call s_geo_sphere(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 9) call s_geo_cuboid(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 10) call s_geo_cylinder(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 11) call s_geo_3d_airfoil(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 12) call s_geo_model(gps(i))
+        end do
+
+        ! 2D geometry loops (5 loops, geometries 2-6)
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 2) call s_geo_circle(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 3) call s_geo_rectangle(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 4) call s_geo_airfoil(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 5) call s_geo_ellipse(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 6) call s_geo_triangle(gps(i))
+        end do
+
+    end subroutine s_cl4_dispatch
+
+    ! ----------------------------------------- declare-target geometry routines --
+    ! All defined in the SAME module; each has !$omp declare target.
+    ! The ICE fires when the inliner pulls these into the target teams loop bodies.
+
+    subroutine s_geo_sphere(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dist
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid - real(gp%x_periodicity,wp)*(x_domain%end-x_domain%beg)
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid - real(gp%y_periodicity,wp)*(y_domain%end-y_domain%beg)
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid - real(gp%z_periodicity,wp)*(z_domain%end-z_domain%beg)
+        dist = sqrt(sum(dv**2))
+        gp%levelset = dist - patch_ib(id)%radius
+        if (dist > 0._wp) then
+            gp%levelset_norm = dv/dist
+        else
+            gp%levelset_norm = 0._wp
+        end if
+    end subroutine s_geo_sphere
+
+    subroutine s_geo_cuboid(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dx, dy, dz, lx, ly, lz
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        lx = patch_ib(id)%length_x; ly = patch_ib(id)%length_y; lz = patch_ib(id)%length_z
+        dx = abs(x_cc(i) - patch_ib(id)%x_centroid) - lx/2._wp
+        dy = abs(y_cc(j) - patch_ib(id)%y_centroid) - ly/2._wp
+        dz = abs(z_cc(k) - patch_ib(id)%z_centroid) - lz/2._wp
+        gp%levelset = max(dx, max(dy, dz))
+        gp%levelset_norm(1) = sign(1._wp, x_cc(i) - patch_ib(id)%x_centroid)
+        gp%levelset_norm(2) = sign(1._wp, y_cc(j) - patch_ib(id)%y_centroid)
+        gp%levelset_norm(3) = sign(1._wp, z_cc(k) - patch_ib(id)%z_centroid)
+    end subroutine s_geo_cuboid
+
+    subroutine s_geo_cylinder(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: r2d, dist
+        real(wp), dimension(3) :: dv, rot_dv
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid
+        rot_dv = matmul(patch_ib(id)%rotation_matrix_inverse, dv)
+        r2d = sqrt(rot_dv(1)**2 + rot_dv(2)**2)
+        dist = r2d - patch_ib(id)%radius
+        gp%levelset = dist
+        if (r2d > 0._wp) then
+            gp%levelset_norm(1) = rot_dv(1)/r2d
+            gp%levelset_norm(2) = rot_dv(2)/r2d
+        else
+            gp%levelset_norm(1) = 0._wp
+            gp%levelset_norm(2) = 0._wp
+        end if
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo_cylinder
+
+    subroutine s_geo_3d_airfoil(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp), dimension(3) :: dv, local
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid
+        local = matmul(patch_ib(id)%rotation_matrix_inverse, dv) - patch_ib(id)%centroid_offset
+        gp%levelset = sqrt(sum(local**2)) - patch_ib(id)%radius
+        gp%levelset_norm = local / max(sqrt(sum(local**2)), 1e-12_wp)
+    end subroutine s_geo_3d_airfoil
+
+    subroutine s_geo_model(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid + patch_ib(id)%model_translate(1)*patch_ib(id)%model_scale(1)
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid + patch_ib(id)%model_translate(2)*patch_ib(id)%model_scale(2)
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid + patch_ib(id)%model_translate(3)*patch_ib(id)%model_scale(3)
+        gp%levelset = sqrt(sum(dv**2)) - patch_ib(id)%radius
+        gp%levelset_norm = dv / max(sqrt(sum(dv**2)), 1e-12_wp)
+    end subroutine s_geo_model
+
+    subroutine s_geo_circle(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dist
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid - real(gp%x_periodicity,wp)*(x_domain%end-x_domain%beg)
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid - real(gp%y_periodicity,wp)*(y_domain%end-y_domain%beg)
+        dv(3) = 0._wp
+        dist = sqrt(sum(dv**2))
+        gp%levelset = dist - patch_ib(id)%radius
+        if (dist > 0._wp) then
+            gp%levelset_norm = dv/dist
+        else
+            gp%levelset_norm = 0._wp
+        end if
+    end subroutine s_geo_circle
+
+    subroutine s_geo_rectangle(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dx, dy
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dx = abs(x_cc(i) - patch_ib(id)%x_centroid) - patch_ib(id)%length_x/2._wp
+        dy = abs(y_cc(j) - patch_ib(id)%y_centroid) - patch_ib(id)%length_y/2._wp
+        gp%levelset = max(dx, dy)
+        gp%levelset_norm(1) = sign(1._wp, x_cc(i) - patch_ib(id)%x_centroid)
+        gp%levelset_norm(2) = sign(1._wp, y_cc(j) - patch_ib(id)%y_centroid)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo_rectangle
+
+    subroutine s_geo_airfoil(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp), dimension(3) :: dv, local
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = 0._wp
+        local = matmul(patch_ib(id)%rotation_matrix_inverse, dv) - patch_ib(id)%centroid_offset
+        gp%levelset = sqrt(sum(local**2)) - patch_ib(id)%radius
+        gp%levelset_norm = local / max(sqrt(sum(local**2)), 1e-12_wp)
+    end subroutine s_geo_airfoil
+
+    subroutine s_geo_ellipse(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: ax, ay, dist
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        ax = patch_ib(id)%length_x / 2._wp
+        ay = patch_ib(id)%length_y / 2._wp
+        dv(1) = (x_cc(i) - patch_ib(id)%x_centroid) / ax
+        dv(2) = (y_cc(j) - patch_ib(id)%y_centroid) / ay
+        dv(3) = 0._wp
+        dist = sqrt(dv(1)**2 + dv(2)**2)
+        gp%levelset = dist - 1._wp
+        if (dist > 0._wp) then
+            gp%levelset_norm = dv/dist
+        else
+            gp%levelset_norm = 0._wp
+        end if
+    end subroutine s_geo_ellipse
+
+    subroutine s_geo_triangle(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: d1, d2, d3
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = 0._wp
+        d1 = dv(2) + patch_ib(id)%length_y/2._wp
+        d2 = -dv(2) + dv(1)*sqrt(3._wp)/3._wp + patch_ib(id)%length_y/3._wp
+        d3 = -dv(2) - dv(1)*sqrt(3._wp)/3._wp + patch_ib(id)%length_y/3._wp
+        gp%levelset = min(max(-d1,-d2), max(-d1,-d3))
+        gp%levelset_norm(1) = dv(1) / max(sqrt(sum(dv(1:2)**2)), 1e-12_wp)
+        gp%levelset_norm(2) = dv(2) / max(sqrt(sum(dv(1:2)**2)), 1e-12_wp)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo_triangle
+
+end module m_cl4_mini
+
+program test_cl4
+    use m_cl4_mini
+    implicit none
+
+    integer, parameter :: wp = kind(1.0d0)
+    integer, parameter :: N = 64, NGPS = 32
+    type(ghost_point), allocatable :: gps(:)
+    integer :: i
+
+    call s_cl4_init(N, N)
+
+    allocate(gps(NGPS))
+    do i = 1, NGPS
+        gps(i)%loc           = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1]
+        gps(i)%ib_patch_id   = 1
+        gps(i)%x_periodicity = 0
+        gps(i)%y_periodicity = 0
+        gps(i)%z_periodicity = 0
+        gps(i)%levelset      = 0._wp
+        gps(i)%levelset_norm = 0._wp
+        gps(i)%interp_coeffs = 0._wp
+        gps(i)%slip          = .false.
+    end do
+
+    call s_cl4_dispatch(gps, NGPS)
+
+    write(*,'(a,f12.6)') 'levelset(1) = ', gps(1)%levelset
+    deallocate(gps)
+    call s_cl4_finalize()
+    write(*,'(a)') 'OK'
+
+end program test_cl4
diff --git a/mini/m_cl5_mini.f90 b/mini/m_cl5_mini.f90
new file mode 100644
index 0000000000..2b23ef549d
--- /dev/null
+++ b/mini/m_cl5_mini.f90
@@ -0,0 +1,304 @@
+! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! ICE trigger: multiple !$omp target teams loop regions in one module, each
+! calling a different !$omp declare target subroutine in the SAME module.
+! Subroutines access allocatable module arrays and a derived type with
+! array-of-arrays fields (interp_coeffs(2,2,2)) plus character field.
+!
+! Compile (expect ICE):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -c m_cl5_mini.f90
+! Workaround (passes):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline -c m_cl5_mini.f90
+
+module m_cl5
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+
+    type :: patch_t
+        integer  :: geometry
+        real(wp) :: radius, cx, cy, cz
+        real(wp) :: lx, ly, lz
+        real(wp), dimension(1:3,1:3) :: rot, rot_inv
+        real(wp), dimension(1:3)     :: offset
+        character(LEN=200) :: label    ! char field in device-mapped type
+    end type patch_t
+
+    type :: gp_t
+        integer,  dimension(3)       :: loc
+        real(wp), dimension(2, 2, 2) :: interp_coeffs  ! 3D array in struct
+        integer                      :: pid
+        real(wp)                     :: levelset
+        real(wp), dimension(3)       :: levelset_norm
+        integer                      :: xp, yp, zp
+    end type gp_t
+
+    type(patch_t), dimension(8), target :: patches
+    real(wp), allocatable :: xc(:), yc(:), zc(:)
+    !$omp declare target(patches, xc, yc, zc)
+
+    public :: gp_t, s_init, s_finalize, s_dispatch
+
+contains
+
+    subroutine s_init(n)
+        integer, intent(in) :: n
+        integer :: i
+        allocate(xc(n), yc(n), zc(n))
+        do i = 1, n
+            xc(i) = real(i,wp)*0.1_wp
+            yc(i) = real(i,wp)*0.1_wp
+            zc(i) = real(i,wp)*0.1_wp
+        end do
+        do i = 1, 8
+            patches(i)%geometry = i
+            patches(i)%radius   = 0.5_wp; patches(i)%cx = 0.5_wp
+            patches(i)%cy       = 0.5_wp; patches(i)%cz = 0.5_wp
+            patches(i)%lx       = 0.4_wp; patches(i)%ly = 0.4_wp; patches(i)%lz = 0.4_wp
+            patches(i)%rot      = 0.0_wp
+            patches(i)%rot(1,1) = 1.0_wp; patches(i)%rot(2,2) = 1.0_wp; patches(i)%rot(3,3) = 1.0_wp
+            patches(i)%rot_inv  = patches(i)%rot
+            patches(i)%offset   = 0.0_wp
+            patches(i)%label    = 'none'
+        end do
+        !$omp target enter data map(to: patches, xc, yc, zc)
+    end subroutine s_init
+
+    subroutine s_finalize()
+        !$omp target exit data map(delete: patches, xc, yc, zc)
+        deallocate(xc, yc, zc)
+    end subroutine s_finalize
+
+    subroutine s_dispatch(gps, n)
+        type(gp_t), intent(inout) :: gps(:)
+        integer,    intent(in)    :: n
+        integer :: i
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 1) call s_geo1(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 2) call s_geo2(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 3) call s_geo3(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 4) call s_geo4(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 5) call s_geo5(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 6) call s_geo6(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 7) call s_geo7(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 8) call s_geo8(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 9) call s_geo9(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 10) call s_geo10(gps(i))
+        end do
+
+    end subroutine s_dispatch
+
+    subroutine s_geo1(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx - real(gp%xp,wp)*0.1_wp
+        dv(2) = yc(j) - patches(id)%cy - real(gp%yp,wp)*0.1_wp
+        dv(3) = zc(k) - patches(id)%cz - real(gp%zp,wp)*0.1_wp
+        d = sqrt(sum(dv**2))
+        gp%levelset = d - patches(id)%radius
+        if (d > 0._wp) then; gp%levelset_norm = dv/d; else; gp%levelset_norm = 0._wp; end if
+    end subroutine s_geo1
+
+    subroutine s_geo2(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp) :: dx, dy, dz
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dx = abs(xc(i) - patches(id)%cx) - patches(id)%lx/2._wp
+        dy = abs(yc(j) - patches(id)%cy) - patches(id)%ly/2._wp
+        dz = abs(zc(k) - patches(id)%cz) - patches(id)%lz/2._wp
+        gp%levelset = max(dx, max(dy, dz))
+        gp%levelset_norm(1) = sign(1._wp, xc(i) - patches(id)%cx)
+        gp%levelset_norm(2) = sign(1._wp, yc(j) - patches(id)%cy)
+        gp%levelset_norm(3) = sign(1._wp, zc(k) - patches(id)%cz)
+    end subroutine s_geo2
+
+    subroutine s_geo3(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: r2d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz
+        ldv = matmul(patches(id)%rot_inv, dv)
+        r2d = sqrt(ldv(1)**2 + ldv(2)**2)
+        gp%levelset = r2d - patches(id)%radius
+        if (r2d > 0._wp) then
+            gp%levelset_norm(1) = ldv(1)/r2d; gp%levelset_norm(2) = ldv(2)/r2d
+        else
+            gp%levelset_norm(1) = 0._wp; gp%levelset_norm(2) = 0._wp
+        end if
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo3
+
+    subroutine s_geo4(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz
+        ldv = matmul(patches(id)%rot_inv, dv) - patches(id)%offset
+        d = sqrt(sum(ldv**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = ldv / max(d, 1.0e-12_wp)
+    end subroutine s_geo4
+
+    subroutine s_geo5(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp) :: ax, ay, d
+        real(wp), dimension(3) :: dv
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        ax = patches(id)%lx/2._wp; ay = patches(id)%ly/2._wp
+        dv(1) = (xc(i) - patches(id)%cx)/ax
+        dv(2) = (yc(j) - patches(id)%cy)/ay
+        dv(3) = 0._wp
+        d = sqrt(dv(1)**2 + dv(2)**2)
+        gp%levelset = d - 1._wp
+        if (d > 0._wp) then; gp%levelset_norm = dv/d; else; gp%levelset_norm = 0._wp; end if
+    end subroutine s_geo5
+
+    subroutine s_geo6(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp) :: dx, dy
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dx = abs(xc(i) - patches(id)%cx) - patches(id)%lx/2._wp
+        dy = abs(yc(j) - patches(id)%cy) - patches(id)%ly/2._wp
+        gp%levelset = max(dx, dy)
+        gp%levelset_norm(1) = sign(1._wp, xc(i) - patches(id)%cx)
+        gp%levelset_norm(2) = sign(1._wp, yc(j) - patches(id)%cy)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo6
+
+    subroutine s_geo7(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = 0._wp
+        ldv = matmul(patches(id)%rot_inv, dv) - patches(id)%offset
+        d = sqrt(sum(ldv(1:2)**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = ldv / max(d, 1.0e-12_wp)
+    end subroutine s_geo7
+
+    subroutine s_geo8(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp) :: d1, d2, d3
+        real(wp), dimension(3) :: dv
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = 0._wp
+        d1 = dv(2) + patches(id)%ly/2._wp
+        d2 = -dv(2) + dv(1)*sqrt(3._wp)/3._wp + patches(id)%ly/3._wp
+        d3 = -dv(2) - dv(1)*sqrt(3._wp)/3._wp + patches(id)%ly/3._wp
+        gp%levelset = min(max(-d1,-d2), max(-d1,-d3))
+        gp%levelset_norm(1) = dv(1)/max(sqrt(sum(dv(1:2)**2)),1e-12_wp)
+        gp%levelset_norm(2) = dv(2)/max(sqrt(sum(dv(1:2)**2)),1e-12_wp)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo8
+
+    subroutine s_geo9(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp), dimension(3) :: dv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = xc(i) - patches(id)%cx + patches(id)%lx*patches(id)%offset(1)
+        dv(2) = yc(j) - patches(id)%cy + patches(id)%ly*patches(id)%offset(2)
+        dv(3) = 0._wp
+        d = sqrt(sum(dv**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = dv / max(d, 1.0e-12_wp)
+    end subroutine s_geo9
+
+    subroutine s_geo10(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz
+        ldv = matmul(patches(id)%rot, dv)
+        d = sqrt(sum(ldv**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = matmul(patches(id)%rot_inv, ldv) / max(d, 1.0e-12_wp)
+    end subroutine s_geo10
+
+end module m_cl5
+
+program test_cl5
+    use m_cl5
+    implicit none
+    integer, parameter :: N = 16
+    type(gp_t) :: gps(N)
+    integer :: i
+    call s_init(N)
+    do i = 1, N
+        gps(i)%levelset      = 0.0d0
+        gps(i)%levelset_norm = 0.0d0
+        gps(i)%interp_coeffs = 0.0d0
+        gps(i)%loc           = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1]
+        gps(i)%pid           = mod(i-1,8)+1
+        gps(i)%xp            = 0; gps(i)%yp = 0; gps(i)%zp = 0
+    end do
+    call s_dispatch(gps, N)
+    write(*,*) 'levelset(1) =', gps(1)%levelset
+    call s_finalize()
+end program test_cl5
diff --git a/mini/m_ice_min.f90 b/mini/m_ice_min.f90
new file mode 100644
index 0000000000..8b342ead7e
--- /dev/null
+++ b/mini/m_ice_min.f90
@@ -0,0 +1,61 @@
+! ABSOLUTE MINIMUM reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! Trigger: matmul() inside a !$omp declare target subroutine that is
+!          inlined into a !$omp target teams loop kernel.
+!
+! ifx version: 2025.1.1 20250418   Hardware: Intel GPU Max 1100 (Ponte Vecchio)
+!
+! ICE (O1/O2/O3):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 m_ice_min.f90
+! OK (inlining disabled):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline m_ice_min.f90
+
+module m_min
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+
+    real(wp), dimension(3, 3) :: A  ! module-level 3x3 matrix
+    real(wp), dimension(3)    :: x  ! module-level vector
+    real(wp), dimension(3)    :: b  ! result vector
+    !$omp declare target(A, x, b)
+
+    public :: s_run
+
+contains
+
+    subroutine s_run(n)
+        integer, intent(in) :: n
+        integer :: i
+
+        A(1,:) = [1._wp, 0._wp, 0._wp]
+        A(2,:) = [0._wp, 1._wp, 0._wp]
+        A(3,:) = [0._wp, 0._wp, 1._wp]
+        x = [1._wp, 2._wp, 3._wp]
+        b = 0._wp
+        !$omp target enter data map(to: A, x, b)
+
+        !$omp target teams loop private(i) map(to:A,x) map(tofrom:b)
+        do i = 1, n
+            call s_apply(i)
+        end do
+
+        !$omp target exit data map(from: b)
+    end subroutine s_run
+
+    subroutine s_apply(k)
+        !$omp declare target
+        integer, intent(in) :: k
+        ! matmul inside declare-target sub -- ICE trigger when inlined
+        b = real(k, wp) * matmul(A, x)
+    end subroutine s_apply
+
+end module m_min
+
+program test_min
+    use m_min
+    implicit none
+    call s_run(4)
+end program test_min
diff --git a/mini/m_ice_repro.f90 b/mini/m_ice_repro.f90
new file mode 100644
index 0000000000..d2dbe141de
--- /dev/null
+++ b/mini/m_ice_repro.f90
@@ -0,0 +1,125 @@
+! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! Trigger: matmul() called with a derived-type member (dimension(3,3) array)
+!          inside an !$omp declare target subroutine, called from an
+!          !$omp target teams loop. Module-level allocatable arrays are also
+!          accessed from the declare-target routine.
+!
+! ifx version: 2025.1.1 20250418 (iimpi/2025a)
+! Hardware:    Intel GPU Max 1100 (Ponte Vecchio XT)
+!
+! Compile (ICE -- error #5633: segmentation violation in SPIR-V backend):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -c m_ice_repro.f90
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O2 -c m_ice_repro.f90
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O1 -c m_ice_repro.f90
+!
+! Workaround (OK -- inlining suppressed so matmul is called via link, not inline):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline -c m_ice_repro.f90
+
+module m_ice
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+
+    type :: patch_t
+        real(wp), dimension(3, 3) :: mat  ! 3x3 member -- key field
+        real(wp) :: cx, cy, cz, radius
+    end type patch_t
+
+    type :: gp_t
+        integer  :: pid
+        integer, dimension(3) :: loc
+        real(wp) :: levelset
+        real(wp), dimension(3) :: norm
+    end type gp_t
+
+    type(patch_t), dimension(4) :: patches
+    real(wp), allocatable :: xc(:), yc(:), zc(:)
+    !$omp declare target(patches, xc, yc, zc)
+
+    public :: gp_t, s_init, s_finalize, s_dispatch
+
+contains
+
+    subroutine s_init(n)
+        integer, intent(in) :: n
+        integer :: i
+        allocate(xc(n), yc(n), zc(n))
+        do i = 1, n
+            xc(i) = real(i, wp)*0.1_wp
+            yc(i) = real(i, wp)*0.1_wp
+            zc(i) = real(i, wp)*0.1_wp
+        end do
+        do i = 1, 4
+            patches(i)%cx = 0.5_wp; patches(i)%cy = 0.5_wp; patches(i)%cz = 0.5_wp
+            patches(i)%radius = 0.25_wp
+            patches(i)%mat      = 0.0_wp
+            patches(i)%mat(1,1) = 1.0_wp
+            patches(i)%mat(2,2) = 1.0_wp
+            patches(i)%mat(3,3) = 1.0_wp
+        end do
+        !$omp target enter data map(to: patches, xc, yc, zc)
+    end subroutine s_init
+
+    subroutine s_finalize()
+        !$omp target exit data map(delete: patches, xc, yc, zc)
+        deallocate(xc, yc, zc)
+    end subroutine s_finalize
+
+    ! Single !$omp target teams loop calling a single declare-target sub.
+    subroutine s_dispatch(gps, n)
+        type(gp_t), intent(inout) :: gps(:)
+        integer,    intent(in)    :: n
+        integer :: i
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            call s_apply(gps(i))
+        end do
+
+    end subroutine s_dispatch
+
+    ! !$omp declare target subroutine in the SAME module.
+    ! ICE fires when the inliner pulls this into the target loop at -O1/-O2/-O3.
+    subroutine s_apply(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx
+        dv(2) = yc(j) - patches(id)%cy
+        dv(3) = zc(k) - patches(id)%cz
+        ! matmul with a derived-type member (patches(id)%mat) is the key trigger
+        ldv = matmul(patches(id)%mat, dv)
+        d = sqrt(sum(ldv**2))
+        gp%levelset = d - patches(id)%radius
+        if (d > 0.0_wp) then
+            gp%norm = ldv/d
+        else
+            gp%norm = 0.0_wp
+        end if
+    end subroutine s_apply
+
+end module m_ice
+
+program test_ice
+    use m_ice
+    implicit none
+    integer, parameter :: N = 16
+    type(gp_t) :: gps(N)
+    integer :: i
+    call s_init(N)
+    do i = 1, N
+        gps(i)%pid      = mod(i-1,4)+1
+        gps(i)%loc      = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1]
+        gps(i)%levelset = 0.0d0
+        gps(i)%norm     = 0.0d0
+    end do
+    call s_dispatch(gps, N)
+    write(*,*) 'levelset(1) =', gps(1)%levelset
+    call s_finalize()
+end program test_ice
diff --git a/mini/run_cl4.sh b/mini/run_cl4.sh
new file mode 100755
index 0000000000..742c63cdca
--- /dev/null
+++ b/mini/run_cl4.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:20:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl4.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl4.log
+#SBATCH -J cl4-ice-repro
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64 -fp-model=precise -march=native -mno-avx512fp16"
+
+echo "=== m_cl4_mini (single-module same-module declare-target + char struct) ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/cl4_$$.o /tmp/cl4_$$.x m_cl4_mini.mod
+    out=$($F90 $BASE "$@" "$MINI/m_cl4_mini.f90" -o /tmp/cl4_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl4_$$.o /tmp/cl4_$$.x m_cl4_mini.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5623|5633|internal abort|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117\|preprocessor" | head -3)"
+    fi
+}
+
+try "-O3 (baseline)"           -O3
+try "-O2"                      -O2
+try "-O1"                      -O1
+try "-O3 -fno-inline"          -O3 -fno-inline
+try "-O3 -fno-vectorize"       -O3 -fno-vectorize
+try "-O3 -fno-unroll-loops"    -O3 -fno-unroll-loops
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_cl5.sh b/mini/run_cl5.sh
new file mode 100755
index 0000000000..4a1f0e02d7
--- /dev/null
+++ b/mini/run_cl5.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:15:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5.log
+#SBATCH -J cl5-ice-reduce
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+
+echo "=== m_cl5_mini (reduced: char+array struct + 10 target loops + 10 same-module decl-target subs) ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/cl5_$$.x m_cl5.mod
+    out=$($F90 $BASE "$@" "$MINI/m_cl5_mini.f90" -o /tmp/cl5_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl5_$$.x m_cl5.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5623|5633|internal abort|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117\|preprocessor" | head -3)"
+    fi
+}
+
+try "-O3 (baseline)"           -O3
+try "-O3 -fno-inline"          -O3 -fno-inline
+try "-O2"                      -O2
+try "-O1"                      -O1
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_cl5b.sh b/mini/run_cl5b.sh
new file mode 100755
index 0000000000..f21189914d
--- /dev/null
+++ b/mini/run_cl5b.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:20:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5b.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5b.log
+#SBATCH -J cl5b-ice-bisect
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+SRC="$MINI/m_cl5_mini.f90"
+
+echo "=== m_cl5 bisection: what is the minimum ICE trigger? ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try_src() {
+    local desc="$1"; local src="$2"; shift 2
+    local out
+    rm -f /tmp/cl5b_$$.x m_cl5.mod
+    out=$($F90 $BASE "$@" "$src" -o /tmp/cl5b_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl5b_$$.x m_cl5.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+# Baseline confirms ICE
+try_src "baseline (full cl5)" "$SRC" -O3
+
+# --- Remove char field ---
+sed '/character(LEN=200)/d; s/patches(i)%label.*=.*//g' "$SRC" > /tmp/nochar_$$.f90
+try_src "no char(LEN=200) field" /tmp/nochar_$$.f90 -O3
+rm -f /tmp/nochar_$$.f90
+
+# --- Remove interp_coeffs(2,2,2) ---
+sed '/interp_coeffs/d' "$SRC" > /tmp/no_ic_$$.f90
+try_src "no interp_coeffs(2,2,2)" /tmp/no_ic_$$.f90 -O3
+rm -f /tmp/no_ic_$$.f90
+
+# --- Keep only 5 loops (geo1-5) ---
+python3 - "$SRC" > /tmp/cl5_5loops_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Remove geo6-geo10 loops from s_dispatch
+for g in range(6, 11):
+    src = re.sub(
+        r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g),
+        '\n', src)
+# Remove geo6-geo10 subroutines
+for g in range(6, 11):
+    src = re.sub(
+        r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g),
+        '\n', src, flags=re.DOTALL)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_src "5 loops only (geo1-5)" /tmp/cl5_5loops_$$.f90 -O3
+rm -f /tmp/cl5_5loops_$$.f90
+
+# --- Remove rot/rot_inv (3x3 arrays) from patch_t ---
+sed '/real(wp), dimension(1:3,1:3)/d; /patches(i)%rot/d; /patches(id)%rot/d; /matmul/d' "$SRC" > /tmp/norot_$$.f90
+try_src "no rotation matrices" /tmp/norot_$$.f90 -O3
+rm -f /tmp/norot_$$.f90
+
+# --- Remove allocatable xc/yc/zc (use constants instead) ---
+python3 - "$SRC" > /tmp/noxc_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Replace allocatable declarations with fixed arrays
+src = src.replace('real(wp), allocatable :: xc(:), yc(:), zc(:)', 'real(wp) :: xc(64), yc(64), zc(64)')
+# Remove allocate/deallocate
+src = re.sub(r'\s*allocate\(xc.*?\)\n', '\n', src)
+src = re.sub(r'\s*deallocate\(xc.*?\)\n', '\n', src)
+src = src.replace('integer, intent(in) :: n\n        integer :: i\n        allocate', 'integer, intent(in) :: n\n        integer :: i\n        ! allocate')
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_src "no allocatable xc/yc/zc" /tmp/noxc_$$.f90 -O3
+rm -f /tmp/noxc_$$.f90
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_cl5c.sh b/mini/run_cl5c.sh
new file mode 100755
index 0000000000..6734f50d91
--- /dev/null
+++ b/mini/run_cl5c.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:20:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5c.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5c.log
+#SBATCH -J cl5c-ice-matmul
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+SRC="$MINI/m_cl5_mini.f90"
+
+echo "=== matmul vs rot(3,3) field isolation test ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try_f90() {
+    local desc="$1"; local src="$2"; shift 2
+    local out
+    rm -f /tmp/cl5c_$$.x m_cl5.mod
+    out=$($F90 $BASE "$@" "$src" -o /tmp/cl5c_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl5c_$$.x m_cl5.mod
+    if [ $rc -eq 0 ]; then echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+# A: Keep rot(3,3) field but replace matmul with element-wise multiply
+python3 - "$SRC" > /tmp/no_matmul_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Replace matmul(X, Y) with sum of manually computed dot products (fake matmul)
+# Just replace matmul calls with equivalent array expressions
+src = re.sub(r'matmul\(patches\(id\)%rot_inv, dv\)', 'dv', src)
+src = re.sub(r'matmul\(patches\(id\)%rot_inv, xyz_local\)', 'dv', src)
+src = re.sub(r'matmul\(patches\(id\)%rot, dv\)', 'dv', src)
+src = re.sub(r'matmul\(patches\(id\)%rot_inv, ldv\)', 'ldv', src)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "rot(3,3) field kept, matmul REMOVED (identity)" /tmp/no_matmul_$$.f90 -O3
+rm -f /tmp/no_matmul_$$.f90
+
+# B: Remove rot(3,3) fields but keep matmul via a local array
+python3 - "$SRC" > /tmp/no_rotfield_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Remove rot/rot_inv struct fields
+src = re.sub(r'\s*real\(wp\), dimension\(1:3,1:3\) :: rot, rot_inv\n', '\n', src)
+# Replace struct accesses with local identity
+src = re.sub(r'patches\(id\)%rot_inv', 'reshape([1._wp,0._wp,0._wp,0._wp,1._wp,0._wp,0._wp,0._wp,1._wp],[3,3])', src)
+src = re.sub(r'patches\(id\)%rot\b', 'reshape([1._wp,0._wp,0._wp,0._wp,1._wp,0._wp,0._wp,0._wp,1._wp],[3,3])', src)
+# Remove initialization of rot/rot_inv in s_init
+src = re.sub(r'\s*patches\(i\)%rot.*\n', '\n', src)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "rot(3,3) field REMOVED, matmul kept (local identity)" /tmp/no_rotfield_$$.f90 -O3
+rm -f /tmp/no_rotfield_$$.f90
+
+# C: Both removed — equivalent to previous "no rotation" test
+python3 - "$SRC" > /tmp/neither_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+src = re.sub(r'\s*real\(wp\), dimension\(1:3,1:3\) :: rot, rot_inv\n', '\n', src)
+src = re.sub(r'matmul\(.*?\)', 'dv', src)
+src = re.sub(r'\s*patches\(i\)%rot.*\n', '\n', src)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "both rot(3,3) AND matmul removed" /tmp/neither_$$.f90 -O3
+rm -f /tmp/neither_$$.f90
+
+# D: Original with just geo3 (which has matmul) — minimum loop count?
+python3 - "$SRC" > /tmp/geo3only_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Keep only 1 loop with matmul (geo3)
+for g in [1,2,4,5,6,7,8,9,10]:
+    src = re.sub(
+        r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g),
+        '\n', src)
+    src = re.sub(
+        r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g),
+        '\n', src, flags=re.DOTALL)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "1 loop (geo3 only, has matmul)" /tmp/geo3only_$$.f90 -O3
+rm -f /tmp/geo3only_$$.f90
+
+# E: 2 loops with matmul (geo3 + geo4)
+python3 - "$SRC" > /tmp/geo34_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+for g in [1,2,5,6,7,8,9,10]:
+    src = re.sub(
+        r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g),
+        '\n', src)
+    src = re.sub(
+        r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g),
+        '\n', src, flags=re.DOTALL)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "2 loops with matmul (geo3 + geo4)" /tmp/geo34_$$.f90 -O3
+rm -f /tmp/geo34_$$.f90
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_ice_min.sh b/mini/run_ice_min.sh
new file mode 100755
index 0000000000..41a698074b
--- /dev/null
+++ b/mini/run_ice_min.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:15:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/ice_min.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/ice_min.log
+#SBATCH -J ice-min
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+
+IFX=/sw/eb/sw/intel-compilers/2025.1.1/compiler/2025.1/bin/ifx
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+
+echo "=== ifx #5633 absolute minimum: matmul in declare-target sub called from target teams loop ==="
+$IFX --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/icemin_$$.x m_min.mod
+    out=$($IFX $BASE "$@" "$MINI/m_ice_min.f90" -o /tmp/icemin_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/icemin_$$.x m_min.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5633|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+try "-O3"                   -O3
+try "-O2"                   -O2
+try "-O1"                   -O1
+try "-O0"                   -O0
+try "-O3 -fno-inline"       -O3 -fno-inline
+
+echo ""
+echo "--- Variant: replace matmul with element-wise loop (no matmul intrinsic) ---"
+python3 - "$MINI/m_ice_min.f90" > /tmp/no_matmul_$$.f90 << 'PYEOF'
+import sys
+src = open(sys.argv[1]).read()
+src = src.replace(
+    "        b = real(k, wp) * matmul(A, x)",
+    "        integer :: ii, jj\n        do ii = 1, 3\n            b(ii) = 0._wp\n            do jj = 1, 3\n                b(ii) = b(ii) + A(ii,jj)*x(jj)\n            end do\n            b(ii) = b(ii) * real(k, wp)\n        end do")
+open('/dev/stdout', 'w').write(src)
+PYEOF
+rm -f /tmp/icemin_nm_$$.x m_min.mod
+out=$($IFX $BASE -O3 /tmp/no_matmul_$$.f90 -o /tmp/icemin_nm_$$.x 2>&1)
+rc=$?
+rm -f /tmp/icemin_nm_$$.x m_min.mod /tmp/no_matmul_$$.f90
+if [ $rc -eq 0 ]; then
+    echo "OK   : -O3 no matmul (manual loops) -- matmul intrinsic IS the trigger"
+elif echo "$out" | grep -qiE "5633|segmentation"; then
+    echo "ICE  : -O3 no matmul -- ICE even without matmul intrinsic"
+else
+    echo "ERR  : $(echo "$out" | grep -v '^$' | head -1)"
+fi
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_ice_repro.sh b/mini/run_ice_repro.sh
new file mode 100755
index 0000000000..32e582f6cc
--- /dev/null
+++ b/mini/run_ice_repro.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:15:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/ice_repro.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/ice_repro.log
+#SBATCH -J ice-repro-final
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+# Use ifx directly (not mpif90) since this is a single-file reproducer
+IFX=/sw/eb/sw/intel-compilers/2025.1.1/compiler/2025.1/bin/ifx
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+
+echo "=== ifx #5633 minimal reproducer: matmul(derived_type%mat, vec) in declare-target sub ==="
+$IFX --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/ice_$$.x m_ice.mod
+    out=$($IFX $BASE "$@" "$MINI/m_ice_repro.f90" -o /tmp/ice_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/ice_$$.x m_ice.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5633|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+try "-O3"                      -O3
+try "-O2"                      -O2
+try "-O1"                      -O1
+try "-O3 -fno-inline (fix)"    -O3 -fno-inline
+try "-O0"                      -O0
+
+echo ""
+echo "--- Variant: matmul on local array (no struct member) ---"
+python3 - "$MINI/m_ice_repro.f90" > /tmp/local_mat_$$.f90 << 'PYEOF'
+import sys
+src = open(sys.argv[1]).read()
+# Add local_mat to the declaration block (before first executable)
+src = src.replace(
+    '        real(wp), dimension(3) :: dv, ldv\n        real(wp) :: d',
+    '        real(wp), dimension(3) :: dv, ldv\n        real(wp), dimension(3,3) :: local_mat\n        real(wp) :: d')
+# Copy struct member to local before matmul
+src = src.replace(
+    '        ! matmul with a derived-type member (patches(id)%mat) is the key trigger\n        ldv = matmul(patches(id)%mat, dv)',
+    '        local_mat = patches(id)%mat\n        ! matmul with LOCAL copy (not struct member directly)\n        ldv = matmul(local_mat, dv)')
+open('/dev/stdout', 'w').write(src)
+PYEOF
+rm -f /tmp/ice_lm_$$.x m_ice.mod
+out=$($IFX $BASE -O3 /tmp/local_mat_$$.f90 -o /tmp/ice_lm_$$.x 2>&1)
+rc=$?
+rm -f /tmp/ice_lm_$$.x m_ice.mod /tmp/local_mat_$$.f90
+if [ $rc -eq 0 ]; then
+    echo "OK   : -O3 with local_mat copy -- direct struct access IS the trigger"
+elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+    echo "ICE  : -O3 with local_mat copy -- copying to local doesn't help"
+else
+    echo "ERR  : $(echo "$out" | grep -v '^$' | head -1)"
+fi
+
+echo ""
+echo "=== Done ==="

From 9e01d89963c9ecdd041da3ee747344f9e940483e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 09:59:47 -0500
Subject: [PATCH 27/30] fix: replace matmul intrinsic with f_mv3 in
 m_compute_levelset, remove -fno-inline workaround

The matmul() intrinsic inside ! declare target subroutines triggers
ifx 2025.1.1 SPIR-V ICE #5633 when the subroutine is inlined into a
target teams loop kernel. Manual 3x3 matvec (f_mv3) avoids the intrinsic
entirely, allowing the GPU code path to compile at all opt levels without
the -fno-inline workaround in CMakeLists.txt.
---
 CMakeLists.txt                        | 11 ------
 src/simulation/m_compute_levelset.fpp | 50 +++++++++++++++++----------
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8eb6d4b34a..a03ce41afe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -860,17 +860,6 @@ if (MFC_SIMULATION)
             target_compile_options(simulation PRIVATE -Oipa0)
         endif()
     endif()
-    # ifx 2025.1.1 SPIR-V ICE (#5633) on m_compute_levelset: the backend segfaults
-    # when declare-target geometry subroutines (s_sphere_levelset etc.) are inlined
-    # into the target teams loop region. -fno-inline prevents the inlining; the calls
-    # are resolved at SPIR-V link time via !$omp declare target, so GPU execution is
-    # preserved. Tested: -O1/-O2/-O3 all ICE; only -O3 -fno-inline compiles cleanly.
-    if (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND MFC_OpenMP)
-        set_source_files_properties(
-            "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90"
-            PROPERTIES COMPILE_FLAGS "-fno-inline"
-        )
-    endif()
 endif()
 
 if (MFC_POST_PROCESS)
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 6a5163aac0..cef5e95e91 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -20,6 +20,20 @@ module m_compute_levelset
 
 contains
 
+    !> 3x3 matrix-vector multiply; replaces the matmul() intrinsic which triggers an ifx 2025.1.1 SPIR-V ICE (#5633) when inlined
+    !! into a target teams loop kernel via declare target.
+    pure function f_mv3(M, v) result(w)
+
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        real(wp), intent(in) :: M(3, 3), v(3)
+        real(wp)             :: w(3)
+        w(1) = M(1, 1)*v(1) + M(1, 2)*v(2) + M(1, 3)*v(3)
+        w(2) = M(2, 1)*v(1) + M(2, 2)*v(2) + M(2, 3)*v(3)
+        w(3) = M(3, 1)*v(1) + M(3, 2)*v(2) + M(3, 3)*v(3)
+
+    end function f_mv3
+
     !> Dispatch level-set distance and normal computations for all ghost points based on patch geometry type
     impure subroutine s_apply_levelset(gps, num_gps)
 
@@ -159,7 +173,7 @@ contains
         offset(:) = patch_ib(ib_patch_id)%centroid_offset(:)
 
         xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]  ! get coordinate frame centered on IB
-        xy_local = matmul(inverse_rotation, xy_local)  ! rotate the frame into the IB's coordinate
+        xy_local = f_mv3(inverse_rotation, xy_local)  ! rotate the frame into the IB's coordinate
         xy_local = xy_local - offset  ! airfoils are a patch that require a centroid offset
 
         if (xy_local(2) >= 0._wp) then
@@ -209,7 +223,7 @@ contains
         if (f_approx_equal(dist, 0._wp)) then
             gp%levelset_norm = 0._wp
         else
-            gp%levelset_norm = matmul(rotation, dist_vec(:))/dist  ! convert the normal vector back to global grid coordinates
+            gp%levelset_norm = f_mv3(rotation, dist_vec(:))/dist  ! convert the normal vector back to global grid coordinates
         end if
 
     end subroutine s_airfoil_levelset
@@ -244,7 +258,7 @@ contains
         z_min = -lz/2
 
         xyz_local = [x_cc(i), y_cc(j), z_cc(l)] - center
-        xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
+        xyz_local = f_mv3(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
         xyz_local = xyz_local - offset  ! airfoils are a patch that require a centroid offset
 
         if (xyz_local(2) >= 0._wp) then
@@ -299,13 +313,13 @@ contains
             else
                 normal(3) = 1._wp
             end if
-            gp%levelset_norm = matmul(rotation, normal)
+            gp%levelset_norm = f_mv3(rotation, normal)
         else
             gp%levelset = dist_surf
             if (f_approx_equal(dist_surf, 0._wp)) then
                 gp%levelset_norm = 0._wp
             else
-                gp%levelset_norm = matmul(rotation, dist_vec(:)/dist_surf)
+                gp%levelset_norm = f_mv3(rotation, dist_vec(:)/dist_surf)
             end if
         end if
 
@@ -345,7 +359,7 @@ contains
 
         ! convert grid to local coordinates
         xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
-        xy_local = matmul(inverse_rotation, xy_local)
+        xy_local = f_mv3(inverse_rotation, xy_local)
 
         side_dists(1) = bottom_left(1) - xy_local(1)
         side_dists(2) = top_right(1) - xy_local(1)
@@ -372,7 +386,7 @@ contains
                 dist_vec(2) = side_dists(idx)/abs(side_dists(idx))
             end if
             ! convert the normal vector back into the global coordinate system
-            gp%levelset_norm = matmul(rotation, dist_vec)
+            gp%levelset_norm = f_mv3(rotation, dist_vec)
         else
             gp%levelset_norm = 0._wp
         end if
@@ -408,13 +422,13 @@ contains
         ellipse_coeffs(2) = 0.5_wp*length_y
 
         xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
-        xy_local = matmul(inverse_rotation, xy_local)
+        xy_local = f_mv3(inverse_rotation, xy_local)
 
         normal_vector = xy_local
         ! get the normal direction via the coordinate transformation method
         normal_vector(2) = normal_vector(2)*(ellipse_coeffs(1)/ellipse_coeffs(2))**2._wp
         normal_vector = normal_vector/sqrt(dot_product(normal_vector, normal_vector))  ! normalize the vector
-        gp%levelset_norm = matmul(rotation, normal_vector)  ! save after rotating the vector to the global frame
+        gp%levelset_norm = f_mv3(rotation, normal_vector)  ! save after rotating the vector to the global frame
 
         ! use the normal vector to set up the quadratic equation for the levelset, using A, B, and C in indices 1, 2, and 3
         quadratic_coeffs(1) = (normal_vector(1)/ellipse_coeffs(1))**2 + (normal_vector(2)/ellipse_coeffs(2))**2
@@ -467,7 +481,7 @@ contains
         Back = -length_z/2
 
         xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center  ! get coordinate frame centered on IB
-        xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinate
+        xyz_local = f_mv3(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinate
 
         dist_left = Left - xyz_local(1)
         dist_right = xyz_local(1) - Right
@@ -511,7 +525,7 @@ contains
             end if
         end if
 
-        gp%levelset_norm = matmul(rotation, dist_vec)
+        gp%levelset_norm = f_mv3(rotation, dist_vec)
 
     end subroutine s_cuboid_levelset
 
@@ -600,7 +614,7 @@ contains
         end if
 
         xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center  ! get coordinate frame centered on IB
-        xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
+        xyz_local = f_mv3(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
 
         ! get distance to flat edge of cylinder
         side_pos = dot_product(xyz_local, dist_sides_vec)
@@ -612,15 +626,15 @@ contains
             ! if the closest edge is flat
             gp%levelset = -dist_side
             if (f_approx_equal(dist_side, abs(side_pos - boundary(1)))) then
-                gp%levelset_norm = matmul(rotation, -dist_sides_vec)
+                gp%levelset_norm = f_mv3(rotation, -dist_sides_vec)
             else
-                gp%levelset_norm = matmul(rotation, dist_sides_vec)
+                gp%levelset_norm = f_mv3(rotation, dist_sides_vec)
             end if
         else
             gp%levelset = dist_surface
             xyz_local = xyz_local*dist_surface_vec
             xyz_local = xyz_local/max(norm2(xyz_local), sgm_eps)
-            gp%levelset_norm = matmul(rotation, xyz_local)
+            gp%levelset_norm = f_mv3(rotation, xyz_local)
         end if
 
     end subroutine s_cylinder_levelset
@@ -663,7 +677,7 @@ contains
         if (p > 0) then
             xyz_local(3) = z_cc(k) - center(3)
         end if
-        xyz_local = matmul(inverse_rotation, xyz_local)
+        xyz_local = f_mv3(inverse_rotation, xyz_local)
 
         ! 3D models
         if (p > 0) then
@@ -675,12 +689,12 @@ contains
             gp%levelset = -abs(gp%levelset)
 
             ! Assign the levelset_norm
-            gp%levelset_norm = matmul(rotation, normals(1:3))
+            gp%levelset_norm = f_mv3(rotation, normals(1:3))
         else
             ! 2D models
             call s_distance_normals_2D(patch_id, boundary_edge_count, xyz_local, normals, distance)
             gp%levelset = -abs(distance)
-            gp%levelset_norm = matmul(rotation, normals(1:3))
+            gp%levelset_norm = f_mv3(rotation, normals(1:3))
         end if
 
     end subroutine s_model_levelset

From 4cd3df01ce5dfcddaeec27be71c6dd5ff7fe7760 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 11:25:41 -0500
Subject: [PATCH 28/30] build: enable AOT compilation for Intel GPU, add
 mem=32G for ocloc link step

ocloc runs at link time and requires significantly more than SLURM's 1G
default. 32G is sufficient; nodes have 500G available. Also bumped
time limit to 90min for the longer ocloc pass.
---
 build_intel_gpu.sh | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 build_intel_gpu.sh

diff --git a/build_intel_gpu.sh b/build_intel_gpu.sh
new file mode 100644
index 0000000000..e4e19d3507
--- /dev/null
+++ b/build_intel_gpu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 1:30:00
+#SBATCH --mem=32G
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/build_intel_gpu.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/build_intel_gpu.log
+#SBATCH -J mfc-intel-gpu-build
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0 CMake/3.31.3 Python/3.13.1
+export I_MPI_F90=ifx FC=mpif90
+export UV_CACHE_DIR=/scratch/user/u.sb27915/.cache/uv
+export RUSTUP_HOME=/scratch/user/u.sb27915/.rustup
+export CARGO_HOME=/scratch/user/u.sb27915/.cargo
+export PATH=$CARGO_HOME/bin:$PATH
+
+cd /scratch/user/u.sb27915/MFC-intel
+./mfc.sh build -t simulation --gpu mp --intel-aot -j 8

From dd460d76ae4f2840458890b232aa595b70223585 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 12:24:02 -0500
Subject: [PATCH 29/30] feat: add ACES batch template and fix
 build_intel_gpu.sh to include pre_process

---
 build_intel_gpu.sh            |  2 +-
 toolchain/templates/aces.mako | 55 +++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 toolchain/templates/aces.mako

diff --git a/build_intel_gpu.sh b/build_intel_gpu.sh
index e4e19d3507..7344856d12 100644
--- a/build_intel_gpu.sh
+++ b/build_intel_gpu.sh
@@ -17,4 +17,4 @@ export CARGO_HOME=/scratch/user/u.sb27915/.cargo
 export PATH=$CARGO_HOME/bin:$PATH
 
 cd /scratch/user/u.sb27915/MFC-intel
-./mfc.sh build -t simulation --gpu mp --intel-aot -j 8
+./mfc.sh build -t pre_process simulation --gpu mp --intel-aot -j 8
diff --git a/toolchain/templates/aces.mako b/toolchain/templates/aces.mako
new file mode 100644
index 0000000000..40c38290c0
--- /dev/null
+++ b/toolchain/templates/aces.mako
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+<%namespace name="helpers" file="helpers.mako"/>
+
+% if engine == 'batch':
+#SBATCH --nodes=${nodes}
+#SBATCH --ntasks-per-node=${tasks_per_node}
+#SBATCH --cpus-per-task=1
+#SBATCH --job-name="${name}"
+#SBATCH --time=${walltime}
+% if partition:
+#SBATCH --partition=${partition}
+% else:
+#SBATCH --partition=pvc
+% endif
+% if account:
+#SBATCH --account="${account}"
+% endif
+% if gpu_enabled:
+#SBATCH --gres=gpu:pvc:${tasks_per_node}
+#SBATCH --mem=32G
+% endif
+#SBATCH --output="${name}.out"
+#SBATCH --error="${name}.err"
+% if email:
+#SBATCH --mail-user=${email}
+#SBATCH --mail-type="BEGIN, END, FAIL"
+% endif
+% endif
+
+${helpers.template_prologue()}
+
+ok ":) Loading modules:\n"
+cd "${MFC_ROOT_DIR}"
+. ./mfc.sh load -c aces -m ${'g' if gpu_enabled else 'c'}
+cd - > /dev/null
+echo
+
+% for target in targets:
+    ${helpers.run_prologue(target)}
+
+    % if not mpi:
+        (set -x; ${profiler} "${target.get_install_binpath(case)}")
+    % else:
+        (set -x; ${profiler}                          \
+            mpirun -np ${nodes*tasks_per_node}        \
+                "${target.get_install_binpath(case)}")
+    % endif
+
+    ${helpers.run_epilogue(target)}
+
+    echo
+% endfor
+
+${helpers.template_epilogue()}

From 98036e8d6dd076502898e7222ac007e4f50630bd Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 19 May 2026 14:27:14 -0500
Subject: [PATCH 30/30] fix: implement MFC_Intel_AOT in CMakeLists.txt for
 spir64_gen AOT compilation

The --intel-aot flag was previously only passed to CMake cache but never
used by CMakeLists.txt -- all IntelLLVM OpenMP builds used hardcoded
spir64 (JIT) regardless. This caused zeModuleCreate failures at runtime
since the Level Zero driver could not JIT-compile the embedded SPIR-V.

Add option() declaration for MFC_Intel_AOT and MFC_Intel_AOT_DEVICE,
then branch on MFC_Intel_AOT in the IntelLLVM+OpenMP section to use
spir64_gen + ocloc AOT compilation when enabled. The SHELL: prefix
preserves shell quoting for the -Xopenmp-target-backend argument
(-device pvc) passed through to ocloc at link time.
---
 CMakeLists.txt | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a03ce41afe..57262c5901 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,8 @@ option(MFC_DOCUMENTATION "Build documentation"                               OFF
 option(MFC_ALL           "Build everything"                                  OFF)
 option(MFC_SINGLE_PRECISION "Build single precision"                         OFF)
 option(MFC_MIXED_PRECISION "Build mixed precision"                           OFF)
+option(MFC_Intel_AOT       "Build Intel GPU with AOT (spir64_gen) instead of JIT (spir64)" OFF)
+set(MFC_Intel_AOT_DEVICE "pvc" CACHE STRING "ocloc device target for Intel AOT compilation (e.g. pvc, 0xbda)")
 
 if (MFC_ALL)
     set(MFC_PRE_PROCESS   ON FORCE)
@@ -715,8 +717,19 @@ exit 0
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
                     # Intel GPU: OpenMP target offload to SPIR64 (Xe-HPC / Ponte Vecchio).
                     # GPU FFT uses oneMKL DFTI via the OpenMP dispatch construct.
-                    target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
-                    target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                    if (MFC_Intel_AOT)
+                        # AOT: compile to native ISA via ocloc for Intel GPU Max (pvc).
+                        # Avoids JIT zeModuleCreate failures at runtime on Level Zero.
+                        # SHELL: prevents CMake deduplication and preserves the quoted
+                        # "-device pvc" as a single argument to -Xopenmp-target-backend.
+                        target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64_gen
+                            "SHELL:-Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"")
+                        target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64_gen
+                            "SHELL:-Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"")
+                    else()
+                        target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                        target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                    endif()
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)