diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72258149f3..57262c5901 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,8 @@ option(MFC_DOCUMENTATION "Build documentation"                               OFF
 option(MFC_ALL           "Build everything"                                  OFF)
 option(MFC_SINGLE_PRECISION "Build single precision"                         OFF)
 option(MFC_MIXED_PRECISION "Build mixed precision"                           OFF)
+option(MFC_Intel_AOT       "Build Intel GPU with AOT (spir64_gen) instead of JIT (spir64)" OFF)
+set(MFC_Intel_AOT_DEVICE "pvc" CACHE STRING "ocloc device target for Intel AOT compilation (e.g. pvc, 0xbda)")
 
 if (MFC_ALL)
     set(MFC_PRE_PROCESS   ON FORCE)
@@ -244,6 +246,15 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
 elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
     add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>)
 
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        add_compile_options(-g -Og -traceback -debug -check all)
+    elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
+        add_compile_options(-g -Og -traceback -check bounds)
+    endif()
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>
+                        $<$<COMPILE_LANGUAGE:Fortran>:-fpp>)
+
     if (CMAKE_BUILD_TYPE STREQUAL "Debug")
         add_compile_options(-g -Og -traceback -debug -check all)
     elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
@@ -595,6 +606,73 @@ exit 0
                             HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                         target_link_libraries(${a_target} PRIVATE ${HIPFFT_LIB})
                     endif()
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+                    # Intel GPU: oneMKL DFTI with !$omp dispatch for GPU FFT.
+                    # Requires MKLROOT to be set (via oneAPI module or env).
+                    if (NOT DEFINED ENV{MKLROOT})
+                        message(FATAL_ERROR "MKLROOT is not set. Load oneAPI MKL module before building.")
+                    endif()
+                    # Compile mkl_dfti_omp_offload.f90 in isolation with minimal flags.
+                    # The file uses !$omp declare variant with need_device_addr (OpenMP 5.2)
+                    # which requires the global -free -fpp flags to be absent so the
+                    # compiler parses it in standard fixed/free detection mode only.
+                    set(_mkl_omp_src "$ENV{MKLROOT}/include/mkl_dfti_omp_offload.f90")
+                    if (NOT EXISTS "${_mkl_omp_src}")
+                        message(FATAL_ERROR "mkl_dfti_omp_offload.f90 not found in $ENV{MKLROOT}/include")
+                    endif()
+                    set(_mkl_omp_mod_dir "${CMAKE_CURRENT_BINARY_DIR}/mkl_omp_mods")
+                    set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o")
+                    set(_mkl_omp_obj_host "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload_host.o")
+                    file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}")
+                    # Find clang-offload-bundler (in ifx's bin/compiler/ subdir).
+                    # CMAKE_Fortran_COMPILER may be an MPI wrapper (mpiifx); resolve the
+                    # underlying ifx from PATH so the HINTS point to the right directory.
+                    find_program(_IFX_REAL ifx REQUIRED)
+                    cmake_path(GET _IFX_REAL PARENT_PATH _ifx_bin)
+                    find_program(CLANG_OFFLOAD_BUNDLER
+                        NAMES clang-offload-bundler
+                        HINTS "${_ifx_bin}/compiler" "${_ifx_bin}" "${_ifx_bin}-llvm"
+                        REQUIRED)
+                    add_custom_command(
+                        OUTPUT  "${_mkl_omp_obj}"
+                                "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.mod"
+                        COMMAND "${CMAKE_Fortran_COMPILER}"
+                                -fiopenmp -fopenmp-targets=spir64
+                                -c -I"$ENV{MKLROOT}/include"
+                                "${_mkl_omp_src}"
+                                -o "${_mkl_omp_obj}"
+                        WORKING_DIRECTORY "${_mkl_omp_mod_dir}"
+                        DEPENDS "${_mkl_omp_src}"
+                        COMMENT "Compiling MKL OMP offload Fortran module (mkl_dfti_omp_offload)"
+                    )
+                    # Strip the SPIR-V device bundle so the linked object has only host code.
+                    # The SPIR-V contains Import declarations for MKL SYCL DFT functions that
+                    # the OpenMP Level Zero plugin cannot resolve, causing zeModuleDynamicLink
+                    # failure. With host-only code, !$omp dispatch falls back to CPU for DFT.
+                    add_custom_command(
+                        OUTPUT  "${_mkl_omp_obj_host}"
+                        COMMAND "${CLANG_OFFLOAD_BUNDLER}"
+                                --unbundle --type=o
+                                --targets=host-x86_64-unknown-linux-gnu
+                                --input="${_mkl_omp_obj}"
+                                --output="${_mkl_omp_obj_host}"
+                        DEPENDS "${_mkl_omp_obj}"
+                        COMMENT "Stripping SPIR-V from MKL DFT object (host-only, fixes Level Zero link)"
+                    )
+                    add_custom_target(mkl_omp_offload_mod_${a_target}
+                        DEPENDS "${_mkl_omp_obj_host}")
+                    add_dependencies(${a_target} mkl_omp_offload_mod_${a_target})
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{MKLROOT}/include" "${_mkl_omp_mod_dir}")
+                    target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj_host}")
+                    # Link MKL threading + core + SYCL DFT backend
+                    target_link_options(${a_target} PRIVATE -qmkl=parallel)
+                    # mkl_sycl_dft is the name in MKL >= 2023.2; older versions use monolithic mkl_sycl
+                    find_library(MKL_SYCL_DFT NAMES mkl_sycl_dft mkl_sycl
+                        HINTS "$ENV{MKLROOT}/lib" "$ENV{MKLROOT}/lib/intel64" REQUIRED)
+                    find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
+                    find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE ${MKL_SYCL_DFT} ${SYCL_LIB} ${OPENCL_LIB})
                 else()
                     find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
                     target_link_libraries(${a_target} PRIVATE hipfort::hipfft)
@@ -636,9 +714,22 @@ exit 0
                     target_compile_options(${a_target} PRIVATE "-mp=gpu" "-Minfo=mp")
                     target_link_options(${a_target} PRIVATE "-mp=gpu")
                     set_target_properties(${a_target} PROPERTIES Fortran_FLAGS "-mp=gpu -gpu=ccall")
-                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
-                    target_compile_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
-                    target_link_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+                    # Intel GPU: OpenMP target offload to SPIR64 (Xe-HPC / Ponte Vecchio).
+                    # GPU FFT uses oneMKL DFTI via the OpenMP dispatch construct.
+                    if (MFC_Intel_AOT)
+                        # AOT: compile to native ISA via ocloc for Intel GPU Max (pvc).
+                        # Avoids JIT zeModuleCreate failures at runtime on Level Zero.
+                        # SHELL: prevents CMake deduplication and preserves the quoted
+                        # "-device pvc" as a single argument to -Xopenmp-target-backend.
+                        target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64_gen
+                            "SHELL:-Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"")
+                        target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64_gen
+                            "SHELL:-Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"")
+                    else()
+                        target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                        target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
+                    endif()
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
diff --git a/build_intel_gpu.sh b/build_intel_gpu.sh
new file mode 100644
index 0000000000..7344856d12
--- /dev/null
+++ b/build_intel_gpu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 1:30:00
+#SBATCH --mem=32G
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/build_intel_gpu.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/build_intel_gpu.log
+#SBATCH -J mfc-intel-gpu-build
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0 CMake/3.31.3 Python/3.13.1
+export I_MPI_F90=ifx FC=mpif90
+export UV_CACHE_DIR=/scratch/user/u.sb27915/.cache/uv
+export RUSTUP_HOME=/scratch/user/u.sb27915/.rustup
+export CARGO_HOME=/scratch/user/u.sb27915/.cargo
+export PATH=$CARGO_HOME/bin:$PATH
+
+cd /scratch/user/u.sb27915/MFC-intel
+./mfc.sh build -t pre_process simulation --gpu mp --intel-aot -j 8
diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md
new file mode 100644
index 0000000000..c18fbe7eb2
--- /dev/null
+++ b/docs/documentation/intel-gpu-max.md
@@ -0,0 +1,387 @@
+# Building MFC for Intel Data Center GPU Max (Ponte Vecchio)
+
+This documents how to build and run MFC with Intel GPU Max (Xe-HPC / Ponte Vecchio)
+using ifx OpenMP target offload to SPIR64, as tested on GT CRNCH RoboGator (`dash3`).
+
+## System configuration
+
+| Component | Version / Path |
+|---|---|
+| Hardware | Intel Data Center GPU Max 1100 (Ponte Vecchio, PCI 8086:0bda) |
+| OS | Linux (RHEL 8 compatible, kernel 5.15) |
+| Fortran compiler | ifx 2025.3.3 (`/opt/intel/oneapi/compiler/2025.3/`) |
+| MKL | oneMKL 2026.0 (`/opt/intel/oneapi/mkl/2026.0/`) |
+| SYCL runtime | `libsycl.so` in `/opt/intel/oneapi/compiler/2026.0/lib/` |
+| GPU device | `/dev/dri/renderD128` (requires `render` group membership) |
+
+## Environment setup
+
+Load the required oneAPI environment before building or running:
+
+```bash
+export PATH=/opt/intel/oneapi/compiler/2025.3/bin:$PATH
+export MKLROOT=/opt/intel/oneapi/mkl/2026.0
+export LIBRARY_PATH=/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LIBRARY_PATH
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\
+/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LD_LIBRARY_PATH
+export FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx
+```
+
+> **Important**: `FC` must be set explicitly. Without it, CMake may cache an older
+> ifx (2025.0) from a system module, which does not support `need_device_addr` in
+> the MKL 2026.0 OpenMP offload Fortran module.
+
+> **Important**: `LIBRARY_PATH` (not just `LD_LIBRARY_PATH`) must include the
+> compiler 2026.0 lib directory so the linker finds `libsycl.so` at build time.
+
+> **Important**: `/opt/intel/oneapi/umf/1.1/lib` must be in `LD_LIBRARY_PATH` at
+> runtime. The Level Zero and OpenCL UR adapters in the 2026.0 compiler depend on
+> `libumf.so.1`, which lives in the separate `umf/1.1` package, not in the compiler
+> lib directories themselves.
+
+## Building
+
+```bash
+./mfc.sh build -t simulation --gpu mp --no-mpi -j 8
+```
+
+- `--gpu mp`: OpenMP target offload backend (SPIR64)
+- `--no-mpi`: omit for MPI-enabled runs; include for single-node testing
+- `-j 8`: parallel build jobs
+
+MFC will automatically:
+1. Compile `$MKLROOT/include/mkl_dfti_omp_offload.f90` with minimal flags
+   (no `-free -fpp`) via a CMake `add_custom_command` to avoid OpenMP 5.2
+   clause compatibility issues with global compile flags
+2. Link `-qmkl=parallel` for MKL threading + core
+3. Link `libmkl_sycl_dft`, `libsycl`, `libOpenCL` for GPU FFT dispatch
+
+## GPU FFT implementation
+
+MFC uses oneMKL DFTI with the OpenMP 5.1 `!$omp dispatch` construct for FFT
+in cylindrical geometry (the azimuthal Fourier filter in `m_fftw.fpp`).
+This is activated when `__INTEL_LLVM_COMPILER` is defined (i.e., compiled with ifx).
+
+Key verified properties (oneMKL 2026.0, ifx 2025.3.3):
+- Batch R2C transform with `INPUT_DISTANCE != OUTPUT_DISTANCE` works correctly.
+  MFC uses `real_size = p+1` and `cmplx_size = (p+1)/2+1` which always differ.
+- `!$omp dispatch` correctly dispatches DFTI calls to device-mapped allocatables.
+
+## Running MFC cases
+
+Build all three targets (pre_process, simulation, post_process) before running:
+
+```bash
+./mfc.sh build --gpu mp --no-mpi -j 8
+```
+
+Then run a case normally:
+
+```bash
+./mfc.sh run examples/1D_convergence/case.py --no-build --no-mpi
+```
+
+To run individual stages directly (useful when `syscheck` blocks due to GPU access):
+
+```bash
+export MKLROOT=/opt/intel/oneapi/mkl/2026.0
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\
+/opt/intel/oneapi/compiler/2026.0/lib:\
+/opt/intel/oneapi/compiler/2025.3/lib:\
+${MKLROOT}/lib:\
+/opt/intel/oneapi/tbb/2022.1/lib:\
+$LD_LIBRARY_PATH
+
+cd examples/my_case
+/path/to/build/install/<hash>/bin/pre_process
+/path/to/build/install/<hash>/bin/simulation
+```
+
+The install hashes are printed by `./mfc.sh build`; look for lines like
+`✓ Installed simulation`.
+
+## Chemistry/pyrometheus cases
+
+MFC's pyrometheus-generated thermochemistry (`m_thermochem.f90`) works with the
+Intel GPU build. Pyrometheus emits `!$omp declare target` for all thermochem
+routines when `directive_offload="mp"` is requested (automatically set when
+`--gpu mp` is active).
+
+Verified on ifx 2025.3.3: `m_thermochem.f90` and `m_chemistry.fpp` both compile
+at O3 + SPIR64 without ICE. The `1D_reactive_shocktube` example (H2/O2/Ar, 29
+reactions, 10 species) runs to completion. No Intel-specific source workarounds
+are needed for chemistry beyond the general levelset fix.
+
+To build and run a reactive chemistry case:
+```bash
+# Build (chemistry module generated automatically from cantera_file in case.py)
+./mfc.sh run examples/1D_reactive_shocktube/case.py \
+    --gpu mp --no-mpi -t pre_process simulation
+
+# Or bypass syscheck if no GPU render-group access:
+cd examples/1D_reactive_shocktube
+/path/to/build/install/<pre_hash>/bin/pre_process
+OMP_TARGET_OFFLOAD=DISABLED \
+    /path/to/build/install/<sim_hash>/bin/simulation
+```
+
+ifx warning during chemistry compilation:
+```
+warning #8694: A procedure called by a procedure with the DECLARE TARGET
+attribute must have the DECLARE TARGET attribute.  [GET_MIXTURE_VISCOSITY_MIXAVG]
+```
+This is a false positive from ifx's module-interface tracking; `m_thermochem.f90`
+does declare all routines as target. The warning is harmless and the code runs
+correctly.
+
+## GPU device access
+
+The Intel GPU requires membership in the `render` group (GID 109) to access
+`/dev/dri/renderD128` via Level Zero.
+
+Without render group access, `ZE_RESULT_ERROR_UNINITIALIZED` is returned by
+Level Zero. OpenMP target offload falls back to the CPU host plugin
+(correct results, but no GPU acceleration).
+
+To diagnose GPU visibility:
+
+```bash
+ls -la /dev/dri/renderD128          # should show rw permissions for your user/group=render
+
+# With full LD_LIBRARY_PATH set:
+LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:... \
+    /opt/intel/oneapi/compiler/2026.0/bin/sycl-ls --verbose
+# Look for: "[opencl:gpu]" or "[ext_oneapi_level_zero:gpu]" platforms
+
+LIBOMPTARGET_DEBUG=1 ./simulation   # look for "Level0 NG plugin initialization"
+                                    # and absence of "ZE_RESULT_ERROR_UNINITIALIZED"
+```
+
+To get GPU access:
+- **Interactive shell**: request from system admin to add user to `render` group
+  (`sudo usermod -a -G render $USER`, then re-login)
+- **SLURM**: submit with `--gres=gpu:max_1100=1`; if Level Zero still fails,
+  the SLURM epilog may not have configured device cgroup ACLs for the job user —
+  contact the system admin
+
+> **Note on `sycl-ls`**: the 2026.0 `sycl-ls` requires `libumf.so.1` from
+> `/opt/intel/oneapi/umf/1.1/lib` in `LD_LIBRARY_PATH`, otherwise all adapters
+> fail to load and it reports "No platforms found".
+
+## Link flags (what MFC's CMake generates)
+
+The full set of flags the compiler uses for the simulation target:
+
+**Compile flags:**
+```
+-fiopenmp -fopenmp-targets=spir64 -free -fpp -march=native
+```
+
+**Link flags:**
+```
+-fiopenmp -fopenmp-targets=spir64
+-qmkl=parallel
+-L$MKLROOT/lib -lmkl_sycl_dft
+-L/opt/intel/oneapi/compiler/2026.0/lib -lsycl -lOpenCL
+```
+
+**MKL OMP module (compiled separately, no global flags):**
+```bash
+ifx -fiopenmp -fopenmp-targets=spir64 \
+    -c -I$MKLROOT/include \
+    $MKLROOT/include/mkl_dfti_omp_offload.f90 \
+    -o mkl_dfti_omp_offload.o
+```
+
+## Known issues
+
+### `need_device_addr` compilation error
+`mkl_dfti_omp_offload.f90` from MKL 2026.0 uses `need_device_addr` in
+`!$omp declare variant` (OpenMP 5.2). This requires ifx **2025.3** or newer.
+If CMake finds an older ifx (e.g., 2025.0 from a system module path), the
+compile fails with:
+```
+error #5082: Syntax error, found IDENTIFIER 'NEED_DEVICE_ADDR'
+```
+Fix: set `FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx` before building
+and run `./mfc.sh clean` first so CMake re-detects the compiler.
+
+### Two routines with ifx SPIR64 codegen bugs
+
+**`s_apply_levelset` (`m_compute_levelset.fpp`)** — ifx SPIR64 inliner ICE:
+
+The LLVM inliner (at O1+) pulls `!$omp declare target (seq)` geometry
+routines into the `target teams loop` kernel and generates LLVM IR that
+crashes the SPIR-V converter with a segfault in `llvm-spirv`. At O0 the
+crash does not occur (no inlining). Two fixes combined:
+
+1. **Split loops**: replaced the single if-else dispatch loop with one
+   `GPU_PARALLEL_LOOP` per geometry type so each kernel calls exactly one
+   declare-target routine. The original multi-callee dispatch also triggers
+   `"Instruction does not dominate all uses!"` in llvm-link.
+
+2. **Per-file `-fno-inline`**: in `CMakeLists.txt`, `set_source_files_properties`
+   adds `-fno-inline` to `m_compute_levelset.fpp.f90` for IntelLLVM+OpenMP
+   builds. This prevents the inliner from pulling declare-target routines into
+   the kernel body where they crash the SPIR-V backend. The routines remain
+   callable as proper device-side function calls via `!$omp declare target`.
+
+**`s_pressure_relaxation_procedure` (`m_pressure_relaxation.fpp`)** — SPIR-V
+InvalidArraySize in declare-target helpers:
+
+`!$omp declare target (seq)` routines with `dimension(sys_size)` explicit-shape
+dummy arguments trigger `"InvalidArraySize: Array size must be at least 1"` in
+llvm-spirv. SPIR-V requires compile-time constant array bounds; `sys_size` is
+a runtime module integer. Fixed by changing `dimension(sys_size)` →
+`dimension(:)` (assumed-shape) on all helper routine interfaces. The loop now
+runs on GPU for Intel.
+
+### syscheck GPU assertion
+`syscheck` runs `assert(omp_get_num_devices() > 0)` and aborts if the GPU
+is not accessible. This is a runtime check, not a build issue. See GPU device
+access section above.
+
+To run a case anyway (testing code correctness on CPU fallback), invoke
+`pre_process` and `simulation` directly from their install paths, bypassing
+the `./mfc.sh run` wrapper that calls `syscheck` first.
+
+### Multi-node MPI with Intel MPI 2021.x
+
+Intel MPI 2021.x expects a bundled `ssh` binary at `$I_MPI_ROOT/bin/ssh` that
+understands an `--external-launcher` flag used by hydra bootstrap. This binary
+is missing from some oneAPI installations, causing SSH bootstrap to fail with
+`unknown option -- -`.
+
+Workaround: create a wrapper that strips the Intel-specific flag:
+
+```bash
+mkdir -p ~/bin
+cat > ~/bin/ssh << 'EOF'
+#!/bin/bash
+args=(-q -o StrictHostKeyChecking=yes -o BatchMode=yes)
+for arg in "$@"; do
+    [[ "$arg" == "--external-launcher" ]] && continue
+    [[ "$arg" == "--" ]] && break
+    args+=("$arg")
+done
+exec /usr/bin/ssh "${args[@]}"
+EOF
+chmod +x ~/bin/ssh
+```
+
+Then run with:
+```bash
+PATH=$HOME/bin:$PATH \
+I_MPI_HYDRA_BOOTSTRAP=rsh \
+I_MPI_HYDRA_BOOTSTRAP_EXEC=$HOME/bin/ssh \
+mpirun -n <ranks> -hosts <node1>,<node2> ./simulation
+```
+
+Nodes must have passwordless SSH from the launch node and no `pam_slurm_adopt`
+blocking. Suppress the SSH login banner on remote nodes with `touch ~/.hushlogin`.
+
+**OFI provider path**: Intel MPI 2021.x ships its own libfabric providers in
+`$I_MPI_ROOT/libfabric/lib/prov/`. The system libfabric may not include the tcp
+or shm providers. Always set:
+
+```bash
+export FI_PROVIDER_PATH=$I_MPI_ROOT/libfabric/lib/prov
+```
+
+Without this, `PMPI_Init` aborts with `OFI fi_getinfo() failed: No data available`.
+This is handled automatically by `source ./mfc.sh load -c crnch -m gpu`.
+
+**SLURM GPU access**: on SLURM-managed Intel GPU nodes, processes outside a SLURM
+allocation cannot open `/dev/dri/renderD128`. Always request the GPU resource:
+
+```bash
+#SBATCH --gres=gpu:max_1100:1   # Intel GPU Max 1100
+```
+
+Without `--gres`, `omp_get_num_devices()` returns 0 and the process aborts with
+integer divide-by-zero in `s_initialize_mpi_domain` (rank % num_devices with 0 devices).
+
+**Per-node renderD128 permissions on CRNCH**: `dash4` has `renderD128` as
+`crwxrwxrwx` (world-accessible), but `dash3` has `crw-rw----` (render group only).
+`--gres=gpu:max_1100:1` does NOT grant cgroup access on dash3 with the current
+SLURM configuration; `omp_get_num_devices()` returns 0 on dash3 even within a
+SLURM GPU allocation. Contact the CRNCH admin to either fix the device permissions
+on dash3 or configure SLURM device cgroups to grant renderD128 access for GPU jobs.
+Until fixed, 2-node GPU simulation is not possible using dash3+dash4.
+
+**Inter-node MPI: FI_TCP_IFACE must be set dynamically**: The CRNCH dash nodes
+have multiple network interfaces (high-speed 10GbE at `10.10.10.x`, public 1GbE
+at `143.215.138.x/25`). Intel MPI's OFI tcp provider selects the highest-speed
+interface by default. On dash3, this picks `enp200s0f1np1` (10.10.10.32), which
+has no corresponding active interface on dash4. This causes the inter-node MPI
+broadcast to hang silently after `MPI_Init` succeeds.
+
+Fix: set `FI_TCP_IFACE` to the name of the interface with the public IP (which
+is accessible from all nodes). The interface name differs per node, so set it
+dynamically in each rank's startup script:
+
+```bash
+IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}')
+export FI_TCP_IFACE="${IFACE}"
+```
+
+This selects `enp3s0f0` on dash3 and `enp3s0f0np0` on dash4. Combined with
+`srun --mpi=pmi2` for SLURM-native MPI bootstrap (avoiding Intel MPI hydra/SSH),
+this enables successful inter-node MPI communication.
+
+**Recommended 2-node run script pattern** (for when dash3's GPU access is fixed):
+
+```bash
+#!/bin/bash
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=1
+#SBATCH -p rg-nextgen-hpc
+#SBATCH -w dash3,dash4
+#SBATCH --gres=gpu:max_1100:1
+#SBATCH --time=01:00:00
+
+INTEL=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1
+export PATH=${INTEL}/compiler/2025.0/bin:${INTEL}/mpi/2021.14/bin:${PATH}
+export LD_LIBRARY_PATH=${INTEL}/mkl/2025.0/lib:${INTEL}/compiler/2025.0/lib:${INTEL}/2025.0/lib:${INTEL}/mpi/2021.14/lib:${INTEL}/mpi/2021.14/libfabric/lib:${LD_LIBRARY_PATH}
+export FI_PROVIDER_PATH=${INTEL}/mpi/2021.14/libfabric/lib/prov
+export I_MPI_FABRICS="shm:ofi"
+export FI_PROVIDER=tcp
+
+cd /path/to/case
+
+# Step 1: pre-process
+WRAP_SCRIPT=$(mktemp)
+cat > "$WRAP_SCRIPT" << 'EOF'
+IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}')
+export FI_TCP_IFACE="$IFACE"
+exec /path/to/build/install/<hash>/bin/pre_process
+EOF
+chmod +x "$WRAP_SCRIPT"
+srun --mpi=pmi2 -n 2 --ntasks-per-node=1 "$WRAP_SCRIPT"
+
+# Step 2: simulation
+cat > "$WRAP_SCRIPT" << 'EOF'
+IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}')
+export FI_TCP_IFACE="$IFACE"
+exec /path/to/build/install/<hash>/bin/simulation
+EOF
+srun --mpi=pmi2 -n 2 --ntasks-per-node=1 "$WRAP_SCRIPT"
+rm "$WRAP_SCRIPT"
+```
+
+### `libumf.so.1` not found at runtime
+The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`.
+If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports
+"No platforms found". Fix:
+
+```bash
+export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:$LD_LIBRARY_PATH
+```
diff --git a/examples/3D_performance_test/case.py b/examples/3D_performance_test/case.py
index 4e08a8ea1b..8b26fa07dd 100644
--- a/examples/3D_performance_test/case.py
+++ b/examples/3D_performance_test/case.py
@@ -14,26 +14,13 @@
             "y_domain%end": 4.0e-03 / 1.0e-03,
             "z_domain%beg": 0.0e00,
             "z_domain%end": 4.0e-03 / 1.0e-03,
-            "stretch_x": "T",
-            "a_x": 4.0e00,
-            "x_a": -1.5e-03 / 1.0e-03,
-            "x_b": 1.5e-03 / 1.0e-03,
-            "stretch_y": "T",
-            "a_y": 4.0e00,
-            "y_a": -1.5e-03 / 1.0e-03,
-            "y_b": 1.5e-03 / 1.0e-03,
-            "stretch_z": "T",
-            "a_z": 4.0e00,
-            "z_a": -1.5e-03 / 1.0e-03,
-            "z_b": 1.5e-03 / 1.0e-03,
-            "cyl_coord": "F",
             "m": 200,
             "n": 200,
             "p": 200,
-            "dt": 0.2e-09 / 1.0e-03,
+            "dt": 0.2e-09,
             "t_step_start": 0,
-            "t_step_stop": 30,
-            "t_step_save": 30,
+            "t_step_stop": 10,
+            "t_step_save": 10,
             # Simulation Algorithm Parameters
             "num_patches": 2,
             "model_eqns": 2,
diff --git a/mini/m_cl4_mini.f90 b/mini/m_cl4_mini.f90
new file mode 100644
index 0000000000..5d3dd75cea
--- /dev/null
+++ b/mini/m_cl4_mini.f90
@@ -0,0 +1,405 @@
+! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! Pattern: single module with 10+ !$omp target teams loop regions, each calling
+! a different !$omp declare target subroutine defined in the SAME module.
+! The subroutines access a derived type that contains a character field
+! (ib_patch_parameters%model_filepath) and 3x3 real arrays (rotation_matrix).
+!
+! Compile (ICE expected at -O3, pass expected at -O3 -fno-inline):
+!   mpif90 -free -fiopenmp -fopenmp-targets=spir64 -fp-model=precise \
+!          -march=native -mno-avx512fp16 -O3 -c m_cl4_mini.f90
+!
+! Fix:
+!   mpif90 ... -O3 -fno-inline -c m_cl4_mini.f90
+
+module m_cl4_mini
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+    integer, parameter :: PATHLEN = 200
+    integer, parameter :: MAX_PATCHES = 10
+
+    ! Replicated from m_derived_types.fpp :: ib_patch_parameters
+    type :: ib_patch_parameters
+        integer  :: geometry
+        real(wp) :: x_centroid, y_centroid, z_centroid
+        real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid
+        real(wp), dimension(1:3)     :: centroid_offset
+        real(wp), dimension(1:3)     :: angles, step_angles
+        real(wp), dimension(1:3,1:3) :: rotation_matrix
+        real(wp), dimension(1:3,1:3) :: rotation_matrix_inverse
+        real(wp) :: c, p, t, m
+        real(wp) :: length_x, length_y, length_z
+        real(wp) :: radius, theta
+        logical  :: slip
+        character(LEN=PATHLEN) :: model_filepath  ! <-- char field; triggers SPIR-V bug
+        real(wp), dimension(1:3) :: model_translate
+        real(wp), dimension(1:3) :: model_scale
+        real(wp), dimension(1:3) :: model_rotate
+        integer  :: model_spc
+        real(wp) :: model_threshold
+        integer  :: moving_ibm
+        real(wp) :: mass, moment
+        real(wp), dimension(1:3) :: force, torque
+        real(wp), dimension(1:3) :: vel, step_vel
+        real(wp), dimension(1:3) :: angular_vel, step_angular_vel
+    end type ib_patch_parameters
+
+    ! Replicated from m_derived_types.fpp :: ghost_point
+    type :: ghost_point
+        integer, dimension(3)        :: loc
+        real(wp), dimension(3)       :: ip_loc
+        integer, dimension(3)        :: ip_grid
+        real(wp), dimension(2, 2, 2) :: interp_coeffs  ! 3-D array in struct
+        integer                      :: ib_patch_id
+        real(wp)                     :: levelset
+        real(wp), dimension(1:3)     :: levelset_norm
+        logical                      :: slip
+        integer, dimension(3)        :: DB
+        integer                      :: x_periodicity, y_periodicity, z_periodicity
+    end type ghost_point
+
+    type :: bounds_info
+        real(wp) :: beg, end
+    end type bounds_info
+
+    ! Module-level variables accessed by the declare-target subroutines
+    type(ib_patch_parameters), dimension(MAX_PATCHES) :: patch_ib
+    real(wp), allocatable :: x_cc(:), y_cc(:), z_cc(:)
+    type(bounds_info) :: x_domain, y_domain, z_domain
+    integer :: num_ibs
+
+    !$omp declare target(patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs)
+
+    public :: ghost_point, s_cl4_dispatch, s_cl4_init, s_cl4_finalize
+
+contains
+
+    ! ------------------------------------------------------------------ init --
+
+    subroutine s_cl4_init(nx, nz)
+        integer, intent(in) :: nx, nz
+        integer :: i
+        allocate(x_cc(nx), y_cc(nx), z_cc(nz))
+        do i = 1, nx
+            x_cc(i) = real(i, wp) * 0.1_wp
+            y_cc(i) = real(i, wp) * 0.1_wp
+        end do
+        do i = 1, nz
+            z_cc(i) = real(i, wp) * 0.1_wp
+        end do
+        x_domain%beg = 0._wp; x_domain%end = real(nx, wp)*0.1_wp
+        y_domain%beg = 0._wp; y_domain%end = real(nx, wp)*0.1_wp
+        z_domain%beg = 0._wp; z_domain%end = real(nz, wp)*0.1_wp
+        num_ibs = 1
+        do i = 1, MAX_PATCHES
+            patch_ib(i)%geometry   = i
+            patch_ib(i)%x_centroid = 0.5_wp
+            patch_ib(i)%y_centroid = 0.5_wp
+            patch_ib(i)%z_centroid = 0.5_wp
+            patch_ib(i)%radius     = 0.2_wp
+            patch_ib(i)%length_x   = 0.4_wp
+            patch_ib(i)%length_y   = 0.4_wp
+            patch_ib(i)%length_z   = 0.4_wp
+            patch_ib(i)%rotation_matrix         = 0._wp
+            patch_ib(i)%rotation_matrix(1,1)    = 1._wp
+            patch_ib(i)%rotation_matrix(2,2)    = 1._wp
+            patch_ib(i)%rotation_matrix(3,3)    = 1._wp
+            patch_ib(i)%rotation_matrix_inverse = patch_ib(i)%rotation_matrix
+            patch_ib(i)%centroid_offset         = 0._wp
+            patch_ib(i)%model_filepath          = 'none'
+        end do
+        !$omp target enter data map(to: patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs)
+    end subroutine s_cl4_init
+
+    subroutine s_cl4_finalize()
+        !$omp target exit data map(delete: patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs)
+        deallocate(x_cc, y_cc, z_cc)
+    end subroutine s_cl4_finalize
+
+    ! ------------------------------------------------------- dispatch (ICE site) --
+
+    !> 10 separate target teams loop regions, each calling a different
+    !! !$omp declare target subroutine defined in THIS module.
+    subroutine s_cl4_dispatch(gps, num_gps)
+        type(ghost_point), dimension(:), intent(inout) :: gps
+        integer, intent(in) :: num_gps
+        integer :: i, patch_id
+
+        ! 3D geometry loops (5 loops, geometries 8-12)
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 8) call s_geo_sphere(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 9) call s_geo_cuboid(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 10) call s_geo_cylinder(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 11) call s_geo_3d_airfoil(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 12) call s_geo_model(gps(i))
+        end do
+
+        ! 2D geometry loops (5 loops, geometries 2-6)
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 2) call s_geo_circle(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 3) call s_geo_rectangle(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 4) call s_geo_airfoil(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 5) call s_geo_ellipse(gps(i))
+        end do
+
+        !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs)
+        do i = 1, num_gps
+            patch_id = gps(i)%ib_patch_id
+            if (patch_ib(patch_id)%geometry == 6) call s_geo_triangle(gps(i))
+        end do
+
+    end subroutine s_cl4_dispatch
+
+    ! ----------------------------------------- declare-target geometry routines --
+    ! All defined in the SAME module; each has !$omp declare target.
+    ! The ICE fires when the inliner pulls these into the target teams loop bodies.
+
+    subroutine s_geo_sphere(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dist
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid - real(gp%x_periodicity,wp)*(x_domain%end-x_domain%beg)
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid - real(gp%y_periodicity,wp)*(y_domain%end-y_domain%beg)
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid - real(gp%z_periodicity,wp)*(z_domain%end-z_domain%beg)
+        dist = sqrt(sum(dv**2))
+        gp%levelset = dist - patch_ib(id)%radius
+        if (dist > 0._wp) then
+            gp%levelset_norm = dv/dist
+        else
+            gp%levelset_norm = 0._wp
+        end if
+    end subroutine s_geo_sphere
+
+    subroutine s_geo_cuboid(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dx, dy, dz, lx, ly, lz
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        lx = patch_ib(id)%length_x; ly = patch_ib(id)%length_y; lz = patch_ib(id)%length_z
+        dx = abs(x_cc(i) - patch_ib(id)%x_centroid) - lx/2._wp
+        dy = abs(y_cc(j) - patch_ib(id)%y_centroid) - ly/2._wp
+        dz = abs(z_cc(k) - patch_ib(id)%z_centroid) - lz/2._wp
+        gp%levelset = max(dx, max(dy, dz))
+        gp%levelset_norm(1) = sign(1._wp, x_cc(i) - patch_ib(id)%x_centroid)
+        gp%levelset_norm(2) = sign(1._wp, y_cc(j) - patch_ib(id)%y_centroid)
+        gp%levelset_norm(3) = sign(1._wp, z_cc(k) - patch_ib(id)%z_centroid)
+    end subroutine s_geo_cuboid
+
+    subroutine s_geo_cylinder(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: r2d, dist
+        real(wp), dimension(3) :: dv, rot_dv
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid
+        rot_dv = matmul(patch_ib(id)%rotation_matrix_inverse, dv)
+        r2d = sqrt(rot_dv(1)**2 + rot_dv(2)**2)
+        dist = r2d - patch_ib(id)%radius
+        gp%levelset = dist
+        if (r2d > 0._wp) then
+            gp%levelset_norm(1) = rot_dv(1)/r2d
+            gp%levelset_norm(2) = rot_dv(2)/r2d
+        else
+            gp%levelset_norm(1) = 0._wp
+            gp%levelset_norm(2) = 0._wp
+        end if
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo_cylinder
+
+    subroutine s_geo_3d_airfoil(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp), dimension(3) :: dv, local
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid
+        local = matmul(patch_ib(id)%rotation_matrix_inverse, dv) - patch_ib(id)%centroid_offset
+        gp%levelset = sqrt(sum(local**2)) - patch_ib(id)%radius
+        gp%levelset_norm = local / max(sqrt(sum(local**2)), 1e-12_wp)
+    end subroutine s_geo_3d_airfoil
+
+    subroutine s_geo_model(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j, k
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid + patch_ib(id)%model_translate(1)*patch_ib(id)%model_scale(1)
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid + patch_ib(id)%model_translate(2)*patch_ib(id)%model_scale(2)
+        dv(3) = z_cc(k) - patch_ib(id)%z_centroid + patch_ib(id)%model_translate(3)*patch_ib(id)%model_scale(3)
+        gp%levelset = sqrt(sum(dv**2)) - patch_ib(id)%radius
+        gp%levelset_norm = dv / max(sqrt(sum(dv**2)), 1e-12_wp)
+    end subroutine s_geo_model
+
+    subroutine s_geo_circle(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dist
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid - real(gp%x_periodicity,wp)*(x_domain%end-x_domain%beg)
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid - real(gp%y_periodicity,wp)*(y_domain%end-y_domain%beg)
+        dv(3) = 0._wp
+        dist = sqrt(sum(dv**2))
+        gp%levelset = dist - patch_ib(id)%radius
+        if (dist > 0._wp) then
+            gp%levelset_norm = dv/dist
+        else
+            gp%levelset_norm = 0._wp
+        end if
+    end subroutine s_geo_circle
+
+    subroutine s_geo_rectangle(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: dx, dy
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dx = abs(x_cc(i) - patch_ib(id)%x_centroid) - patch_ib(id)%length_x/2._wp
+        dy = abs(y_cc(j) - patch_ib(id)%y_centroid) - patch_ib(id)%length_y/2._wp
+        gp%levelset = max(dx, dy)
+        gp%levelset_norm(1) = sign(1._wp, x_cc(i) - patch_ib(id)%x_centroid)
+        gp%levelset_norm(2) = sign(1._wp, y_cc(j) - patch_ib(id)%y_centroid)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo_rectangle
+
+    subroutine s_geo_airfoil(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp), dimension(3) :: dv, local
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = 0._wp
+        local = matmul(patch_ib(id)%rotation_matrix_inverse, dv) - patch_ib(id)%centroid_offset
+        gp%levelset = sqrt(sum(local**2)) - patch_ib(id)%radius
+        gp%levelset_norm = local / max(sqrt(sum(local**2)), 1e-12_wp)
+    end subroutine s_geo_airfoil
+
+    subroutine s_geo_ellipse(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: ax, ay, dist
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        ax = patch_ib(id)%length_x / 2._wp
+        ay = patch_ib(id)%length_y / 2._wp
+        dv(1) = (x_cc(i) - patch_ib(id)%x_centroid) / ax
+        dv(2) = (y_cc(j) - patch_ib(id)%y_centroid) / ay
+        dv(3) = 0._wp
+        dist = sqrt(dv(1)**2 + dv(2)**2)
+        gp%levelset = dist - 1._wp
+        if (dist > 0._wp) then
+            gp%levelset_norm = dv/dist
+        else
+            gp%levelset_norm = 0._wp
+        end if
+    end subroutine s_geo_ellipse
+
+    subroutine s_geo_triangle(gp)
+        !$omp declare target
+        type(ghost_point), intent(inout) :: gp
+        real(wp) :: d1, d2, d3
+        real(wp), dimension(3) :: dv
+        integer :: id, i, j
+        id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = x_cc(i) - patch_ib(id)%x_centroid
+        dv(2) = y_cc(j) - patch_ib(id)%y_centroid
+        dv(3) = 0._wp
+        d1 = dv(2) + patch_ib(id)%length_y/2._wp
+        d2 = -dv(2) + dv(1)*sqrt(3._wp)/3._wp + patch_ib(id)%length_y/3._wp
+        d3 = -dv(2) - dv(1)*sqrt(3._wp)/3._wp + patch_ib(id)%length_y/3._wp
+        gp%levelset = min(max(-d1,-d2), max(-d1,-d3))
+        gp%levelset_norm(1) = dv(1) / max(sqrt(sum(dv(1:2)**2)), 1e-12_wp)
+        gp%levelset_norm(2) = dv(2) / max(sqrt(sum(dv(1:2)**2)), 1e-12_wp)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo_triangle
+
+end module m_cl4_mini
+
+program test_cl4
+    use m_cl4_mini
+    implicit none
+
+    integer, parameter :: wp = kind(1.0d0)
+    integer, parameter :: N = 64, NGPS = 32
+    type(ghost_point), allocatable :: gps(:)
+    integer :: i
+
+    call s_cl4_init(N, N)
+
+    allocate(gps(NGPS))
+    do i = 1, NGPS
+        gps(i)%loc           = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1]
+        gps(i)%ib_patch_id   = 1
+        gps(i)%x_periodicity = 0
+        gps(i)%y_periodicity = 0
+        gps(i)%z_periodicity = 0
+        gps(i)%levelset      = 0._wp
+        gps(i)%levelset_norm = 0._wp
+        gps(i)%interp_coeffs = 0._wp
+        gps(i)%slip          = .false.
+    end do
+
+    call s_cl4_dispatch(gps, NGPS)
+
+    write(*,'(a,f12.6)') 'levelset(1) = ', gps(1)%levelset
+    deallocate(gps)
+    call s_cl4_finalize()
+    write(*,'(a)') 'OK'
+
+end program test_cl4
diff --git a/mini/m_cl5_mini.f90 b/mini/m_cl5_mini.f90
new file mode 100644
index 0000000000..2b23ef549d
--- /dev/null
+++ b/mini/m_cl5_mini.f90
@@ -0,0 +1,304 @@
+! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! ICE trigger: multiple !$omp target teams loop regions in one module, each
+! calling a different !$omp declare target subroutine in the SAME module.
+! Subroutines access allocatable module arrays and a derived type with
+! array-of-arrays fields (interp_coeffs(2,2,2)) plus character field.
+!
+! Compile (expect ICE):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -c m_cl5_mini.f90
+! Workaround (passes):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline -c m_cl5_mini.f90
+
+module m_cl5
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+
+    type :: patch_t
+        integer  :: geometry
+        real(wp) :: radius, cx, cy, cz
+        real(wp) :: lx, ly, lz
+        real(wp), dimension(1:3,1:3) :: rot, rot_inv
+        real(wp), dimension(1:3)     :: offset
+        character(LEN=200) :: label    ! char field in device-mapped type
+    end type patch_t
+
+    type :: gp_t
+        integer,  dimension(3)       :: loc
+        real(wp), dimension(2, 2, 2) :: interp_coeffs  ! 3D array in struct
+        integer                      :: pid
+        real(wp)                     :: levelset
+        real(wp), dimension(3)       :: levelset_norm
+        integer                      :: xp, yp, zp
+    end type gp_t
+
+    type(patch_t), dimension(8), target :: patches
+    real(wp), allocatable :: xc(:), yc(:), zc(:)
+    !$omp declare target(patches, xc, yc, zc)
+
+    public :: gp_t, s_init, s_finalize, s_dispatch
+
+contains
+
+    subroutine s_init(n)
+        integer, intent(in) :: n
+        integer :: i
+        allocate(xc(n), yc(n), zc(n))
+        do i = 1, n
+            xc(i) = real(i,wp)*0.1_wp
+            yc(i) = real(i,wp)*0.1_wp
+            zc(i) = real(i,wp)*0.1_wp
+        end do
+        do i = 1, 8
+            patches(i)%geometry = i
+            patches(i)%radius   = 0.5_wp; patches(i)%cx = 0.5_wp
+            patches(i)%cy       = 0.5_wp; patches(i)%cz = 0.5_wp
+            patches(i)%lx       = 0.4_wp; patches(i)%ly = 0.4_wp; patches(i)%lz = 0.4_wp
+            patches(i)%rot      = 0.0_wp
+            patches(i)%rot(1,1) = 1.0_wp; patches(i)%rot(2,2) = 1.0_wp; patches(i)%rot(3,3) = 1.0_wp
+            patches(i)%rot_inv  = patches(i)%rot
+            patches(i)%offset   = 0.0_wp
+            patches(i)%label    = 'none'
+        end do
+        !$omp target enter data map(to: patches, xc, yc, zc)
+    end subroutine s_init
+
+    subroutine s_finalize()
+        !$omp target exit data map(delete: patches, xc, yc, zc)
+        deallocate(xc, yc, zc)
+    end subroutine s_finalize
+
+    subroutine s_dispatch(gps, n)
+        type(gp_t), intent(inout) :: gps(:)
+        integer,    intent(in)    :: n
+        integer :: i
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 1) call s_geo1(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 2) call s_geo2(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 3) call s_geo3(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 4) call s_geo4(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 5) call s_geo5(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 6) call s_geo6(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 7) call s_geo7(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 8) call s_geo8(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 9) call s_geo9(gps(i))
+        end do
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            if (patches(gps(i)%pid)%geometry == 10) call s_geo10(gps(i))
+        end do
+
+    end subroutine s_dispatch
+
+    subroutine s_geo1(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx - real(gp%xp,wp)*0.1_wp
+        dv(2) = yc(j) - patches(id)%cy - real(gp%yp,wp)*0.1_wp
+        dv(3) = zc(k) - patches(id)%cz - real(gp%zp,wp)*0.1_wp
+        d = sqrt(sum(dv**2))
+        gp%levelset = d - patches(id)%radius
+        if (d > 0._wp) then; gp%levelset_norm = dv/d; else; gp%levelset_norm = 0._wp; end if
+    end subroutine s_geo1
+
+    subroutine s_geo2(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp) :: dx, dy, dz
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dx = abs(xc(i) - patches(id)%cx) - patches(id)%lx/2._wp
+        dy = abs(yc(j) - patches(id)%cy) - patches(id)%ly/2._wp
+        dz = abs(zc(k) - patches(id)%cz) - patches(id)%lz/2._wp
+        gp%levelset = max(dx, max(dy, dz))
+        gp%levelset_norm(1) = sign(1._wp, xc(i) - patches(id)%cx)
+        gp%levelset_norm(2) = sign(1._wp, yc(j) - patches(id)%cy)
+        gp%levelset_norm(3) = sign(1._wp, zc(k) - patches(id)%cz)
+    end subroutine s_geo2
+
+    subroutine s_geo3(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: r2d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz
+        ldv = matmul(patches(id)%rot_inv, dv)
+        r2d = sqrt(ldv(1)**2 + ldv(2)**2)
+        gp%levelset = r2d - patches(id)%radius
+        if (r2d > 0._wp) then
+            gp%levelset_norm(1) = ldv(1)/r2d; gp%levelset_norm(2) = ldv(2)/r2d
+        else
+            gp%levelset_norm(1) = 0._wp; gp%levelset_norm(2) = 0._wp
+        end if
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo3
+
+    subroutine s_geo4(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz
+        ldv = matmul(patches(id)%rot_inv, dv) - patches(id)%offset
+        d = sqrt(sum(ldv**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = ldv / max(d, 1.0e-12_wp)
+    end subroutine s_geo4
+
+    subroutine s_geo5(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp) :: ax, ay, d
+        real(wp), dimension(3) :: dv
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        ax = patches(id)%lx/2._wp; ay = patches(id)%ly/2._wp
+        dv(1) = (xc(i) - patches(id)%cx)/ax
+        dv(2) = (yc(j) - patches(id)%cy)/ay
+        dv(3) = 0._wp
+        d = sqrt(dv(1)**2 + dv(2)**2)
+        gp%levelset = d - 1._wp
+        if (d > 0._wp) then; gp%levelset_norm = dv/d; else; gp%levelset_norm = 0._wp; end if
+    end subroutine s_geo5
+
+    subroutine s_geo6(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp) :: dx, dy
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dx = abs(xc(i) - patches(id)%cx) - patches(id)%lx/2._wp
+        dy = abs(yc(j) - patches(id)%cy) - patches(id)%ly/2._wp
+        gp%levelset = max(dx, dy)
+        gp%levelset_norm(1) = sign(1._wp, xc(i) - patches(id)%cx)
+        gp%levelset_norm(2) = sign(1._wp, yc(j) - patches(id)%cy)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo6
+
+    subroutine s_geo7(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = 0._wp
+        ldv = matmul(patches(id)%rot_inv, dv) - patches(id)%offset
+        d = sqrt(sum(ldv(1:2)**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = ldv / max(d, 1.0e-12_wp)
+    end subroutine s_geo7
+
+    subroutine s_geo8(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp) :: d1, d2, d3
+        real(wp), dimension(3) :: dv
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = 0._wp
+        d1 = dv(2) + patches(id)%ly/2._wp
+        d2 = -dv(2) + dv(1)*sqrt(3._wp)/3._wp + patches(id)%ly/3._wp
+        d3 = -dv(2) - dv(1)*sqrt(3._wp)/3._wp + patches(id)%ly/3._wp
+        gp%levelset = min(max(-d1,-d2), max(-d1,-d3))
+        gp%levelset_norm(1) = dv(1)/max(sqrt(sum(dv(1:2)**2)),1e-12_wp)
+        gp%levelset_norm(2) = dv(2)/max(sqrt(sum(dv(1:2)**2)),1e-12_wp)
+        gp%levelset_norm(3) = 0._wp
+    end subroutine s_geo8
+
+    subroutine s_geo9(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j
+        real(wp), dimension(3) :: dv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2)
+        dv(1) = xc(i) - patches(id)%cx + patches(id)%lx*patches(id)%offset(1)
+        dv(2) = yc(j) - patches(id)%cy + patches(id)%ly*patches(id)%offset(2)
+        dv(3) = 0._wp
+        d = sqrt(sum(dv**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = dv / max(d, 1.0e-12_wp)
+    end subroutine s_geo9
+
+    subroutine s_geo10(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz
+        ldv = matmul(patches(id)%rot, dv)
+        d = sqrt(sum(ldv**2))
+        gp%levelset = d - patches(id)%radius
+        gp%levelset_norm = matmul(patches(id)%rot_inv, ldv) / max(d, 1.0e-12_wp)
+    end subroutine s_geo10
+
+end module m_cl5
+
+program test_cl5
+    use m_cl5
+    implicit none
+    integer, parameter :: N = 16
+    type(gp_t) :: gps(N)
+    integer :: i
+    call s_init(N)
+    do i = 1, N
+        gps(i)%levelset      = 0.0d0
+        gps(i)%levelset_norm = 0.0d0
+        gps(i)%interp_coeffs = 0.0d0
+        gps(i)%loc           = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1]
+        gps(i)%pid           = mod(i-1,8)+1
+        gps(i)%xp            = 0; gps(i)%yp = 0; gps(i)%zp = 0
+    end do
+    call s_dispatch(gps, N)
+    write(*,*) 'levelset(1) =', gps(1)%levelset
+    call s_finalize()
+end program test_cl5
diff --git a/mini/m_ice_min.f90 b/mini/m_ice_min.f90
new file mode 100644
index 0000000000..8b342ead7e
--- /dev/null
+++ b/mini/m_ice_min.f90
@@ -0,0 +1,61 @@
+! ABSOLUTE MINIMUM reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! Trigger: matmul() inside a !$omp declare target subroutine that is
+!          inlined into a !$omp target teams loop kernel.
+!
+! ifx version: 2025.1.1 20250418   Hardware: Intel GPU Max 1100 (Ponte Vecchio)
+!
+! ICE (O1/O2/O3):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 m_ice_min.f90
+! OK (inlining disabled):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline m_ice_min.f90
+
+module m_min
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+
+    real(wp), dimension(3, 3) :: A  ! module-level 3x3 matrix
+    real(wp), dimension(3)    :: x  ! module-level vector
+    real(wp), dimension(3)    :: b  ! result vector
+    !$omp declare target(A, x, b)
+
+    public :: s_run
+
+contains
+
+    subroutine s_run(n)
+        integer, intent(in) :: n
+        integer :: i
+
+        A(1,:) = [1._wp, 0._wp, 0._wp]
+        A(2,:) = [0._wp, 1._wp, 0._wp]
+        A(3,:) = [0._wp, 0._wp, 1._wp]
+        x = [1._wp, 2._wp, 3._wp]
+        b = 0._wp
+        !$omp target enter data map(to: A, x, b)
+
+        !$omp target teams loop private(i) map(to:A,x) map(tofrom:b)
+        do i = 1, n
+            call s_apply(i)
+        end do
+
+        !$omp target exit data map(from: b)
+    end subroutine s_run
+
+    subroutine s_apply(k)
+        !$omp declare target
+        integer, intent(in) :: k
+        ! matmul inside declare-target sub -- ICE trigger when inlined
+        b = real(k, wp) * matmul(A, x)
+    end subroutine s_apply
+
+end module m_min
+
+program test_min
+    use m_min
+    implicit none
+    call s_run(4)
+end program test_min
diff --git a/mini/m_ice_repro.f90 b/mini/m_ice_repro.f90
new file mode 100644
index 0000000000..d2dbe141de
--- /dev/null
+++ b/mini/m_ice_repro.f90
@@ -0,0 +1,125 @@
+! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633
+!
+! Trigger: matmul() called with a derived-type member (dimension(3,3) array)
+!          inside an !$omp declare target subroutine, called from an
+!          !$omp target teams loop. Module-level allocatable arrays are also
+!          accessed from the declare-target routine.
+!
+! ifx version: 2025.1.1 20250418 (iimpi/2025a)
+! Hardware:    Intel GPU Max 1100 (Ponte Vecchio XT)
+!
+! Compile (ICE -- error #5633: segmentation violation in SPIR-V backend):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -c m_ice_repro.f90
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O2 -c m_ice_repro.f90
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O1 -c m_ice_repro.f90
+!
+! Workaround (OK -- inlining suppressed so matmul is called via link, not inline):
+!   ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline -c m_ice_repro.f90
+
+module m_ice
+
+    implicit none
+    private
+
+    integer, parameter :: wp = kind(1.0d0)
+
+    type :: patch_t
+        real(wp), dimension(3, 3) :: mat  ! 3x3 member -- key field
+        real(wp) :: cx, cy, cz, radius
+    end type patch_t
+
+    type :: gp_t
+        integer  :: pid
+        integer, dimension(3) :: loc
+        real(wp) :: levelset
+        real(wp), dimension(3) :: norm
+    end type gp_t
+
+    type(patch_t), dimension(4) :: patches
+    real(wp), allocatable :: xc(:), yc(:), zc(:)
+    !$omp declare target(patches, xc, yc, zc)
+
+    public :: gp_t, s_init, s_finalize, s_dispatch
+
+contains
+
+    subroutine s_init(n)
+        integer, intent(in) :: n
+        integer :: i
+        allocate(xc(n), yc(n), zc(n))
+        do i = 1, n
+            xc(i) = real(i, wp)*0.1_wp
+            yc(i) = real(i, wp)*0.1_wp
+            zc(i) = real(i, wp)*0.1_wp
+        end do
+        do i = 1, 4
+            patches(i)%cx = 0.5_wp; patches(i)%cy = 0.5_wp; patches(i)%cz = 0.5_wp
+            patches(i)%radius = 0.25_wp
+            patches(i)%mat      = 0.0_wp
+            patches(i)%mat(1,1) = 1.0_wp
+            patches(i)%mat(2,2) = 1.0_wp
+            patches(i)%mat(3,3) = 1.0_wp
+        end do
+        !$omp target enter data map(to: patches, xc, yc, zc)
+    end subroutine s_init
+
+    subroutine s_finalize()
+        !$omp target exit data map(delete: patches, xc, yc, zc)
+        deallocate(xc, yc, zc)
+    end subroutine s_finalize
+
+    ! Single !$omp target teams loop calling a single declare-target sub.
+    subroutine s_dispatch(gps, n)
+        type(gp_t), intent(inout) :: gps(:)
+        integer,    intent(in)    :: n
+        integer :: i
+
+        !$omp target teams loop private(i) map(tofrom:gps)
+        do i = 1, n
+            call s_apply(gps(i))
+        end do
+
+    end subroutine s_dispatch
+
+    ! !$omp declare target subroutine in the SAME module.
+    ! ICE fires when the inliner pulls this into the target loop at -O1/-O2/-O3.
+    subroutine s_apply(gp)
+        !$omp declare target
+        type(gp_t), intent(inout) :: gp
+        integer :: id, i, j, k
+        real(wp), dimension(3) :: dv, ldv
+        real(wp) :: d
+        id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3)
+        dv(1) = xc(i) - patches(id)%cx
+        dv(2) = yc(j) - patches(id)%cy
+        dv(3) = zc(k) - patches(id)%cz
+        ! matmul with a derived-type member (patches(id)%mat) is the key trigger
+        ldv = matmul(patches(id)%mat, dv)
+        d = sqrt(sum(ldv**2))
+        gp%levelset = d - patches(id)%radius
+        if (d > 0.0_wp) then
+            gp%norm = ldv/d
+        else
+            gp%norm = 0.0_wp
+        end if
+    end subroutine s_apply
+
+end module m_ice
+
+program test_ice
+    use m_ice
+    implicit none
+    integer, parameter :: N = 16
+    type(gp_t) :: gps(N)
+    integer :: i
+    call s_init(N)
+    do i = 1, N
+        gps(i)%pid      = mod(i-1,4)+1
+        gps(i)%loc      = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1]
+        gps(i)%levelset = 0.0d0
+        gps(i)%norm     = 0.0d0
+    end do
+    call s_dispatch(gps, N)
+    write(*,*) 'levelset(1) =', gps(1)%levelset
+    call s_finalize()
+end program test_ice
diff --git a/mini/run_cl4.sh b/mini/run_cl4.sh
new file mode 100755
index 0000000000..742c63cdca
--- /dev/null
+++ b/mini/run_cl4.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:20:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl4.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl4.log
+#SBATCH -J cl4-ice-repro
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64 -fp-model=precise -march=native -mno-avx512fp16"
+
+echo "=== m_cl4_mini (single-module same-module declare-target + char struct) ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/cl4_$$.o /tmp/cl4_$$.x m_cl4_mini.mod
+    out=$($F90 $BASE "$@" "$MINI/m_cl4_mini.f90" -o /tmp/cl4_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl4_$$.o /tmp/cl4_$$.x m_cl4_mini.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5623|5633|internal abort|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117\|preprocessor" | head -3)"
+    fi
+}
+
+try "-O3 (baseline)"           -O3
+try "-O2"                      -O2
+try "-O1"                      -O1
+try "-O3 -fno-inline"          -O3 -fno-inline
+try "-O3 -fno-vectorize"       -O3 -fno-vectorize
+try "-O3 -fno-unroll-loops"    -O3 -fno-unroll-loops
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_cl5.sh b/mini/run_cl5.sh
new file mode 100755
index 0000000000..4a1f0e02d7
--- /dev/null
+++ b/mini/run_cl5.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:15:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5.log
+#SBATCH -J cl5-ice-reduce
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+
+echo "=== m_cl5_mini (reduced: char+array struct + 10 target loops + 10 same-module decl-target subs) ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/cl5_$$.x m_cl5.mod
+    out=$($F90 $BASE "$@" "$MINI/m_cl5_mini.f90" -o /tmp/cl5_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl5_$$.x m_cl5.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5623|5633|internal abort|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117\|preprocessor" | head -3)"
+    fi
+}
+
+try "-O3 (baseline)"           -O3
+try "-O3 -fno-inline"          -O3 -fno-inline
+try "-O2"                      -O2
+try "-O1"                      -O1
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_cl5b.sh b/mini/run_cl5b.sh
new file mode 100755
index 0000000000..f21189914d
--- /dev/null
+++ b/mini/run_cl5b.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:20:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5b.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5b.log
+#SBATCH -J cl5b-ice-bisect
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+SRC="$MINI/m_cl5_mini.f90"
+
+echo "=== m_cl5 bisection: what is the minimum ICE trigger? ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try_src() {
+    local desc="$1"; local src="$2"; shift 2
+    local out
+    rm -f /tmp/cl5b_$$.x m_cl5.mod
+    out=$($F90 $BASE "$@" "$src" -o /tmp/cl5b_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl5b_$$.x m_cl5.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+# Baseline confirms ICE
+try_src "baseline (full cl5)" "$SRC" -O3
+
+# --- Remove char field ---
+sed '/character(LEN=200)/d; s/patches(i)%label.*=.*//g' "$SRC" > /tmp/nochar_$$.f90
+try_src "no char(LEN=200) field" /tmp/nochar_$$.f90 -O3
+rm -f /tmp/nochar_$$.f90
+
+# --- Remove interp_coeffs(2,2,2) ---
+sed '/interp_coeffs/d' "$SRC" > /tmp/no_ic_$$.f90
+try_src "no interp_coeffs(2,2,2)" /tmp/no_ic_$$.f90 -O3
+rm -f /tmp/no_ic_$$.f90
+
+# --- Keep only 5 loops (geo1-5) ---
+python3 - "$SRC" > /tmp/cl5_5loops_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Remove geo6-geo10 loops from s_dispatch
+for g in range(6, 11):
+    src = re.sub(
+        r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g),
+        '\n', src)
+# Remove geo6-geo10 subroutines
+for g in range(6, 11):
+    src = re.sub(
+        r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g),
+        '\n', src, flags=re.DOTALL)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_src "5 loops only (geo1-5)" /tmp/cl5_5loops_$$.f90 -O3
+rm -f /tmp/cl5_5loops_$$.f90
+
+# --- Remove rot/rot_inv (3x3 arrays) from patch_t ---
+sed '/real(wp), dimension(1:3,1:3)/d; /patches(i)%rot/d; /patches(id)%rot/d; /matmul/d' "$SRC" > /tmp/norot_$$.f90
+try_src "no rotation matrices" /tmp/norot_$$.f90 -O3
+rm -f /tmp/norot_$$.f90
+
+# --- Remove allocatable xc/yc/zc (use constants instead) ---
+python3 - "$SRC" > /tmp/noxc_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Replace allocatable declarations with fixed arrays
+src = src.replace('real(wp), allocatable :: xc(:), yc(:), zc(:)', 'real(wp) :: xc(64), yc(64), zc(64)')
+# Remove allocate/deallocate
+src = re.sub(r'\s*allocate\(xc.*?\)\n', '\n', src)
+src = re.sub(r'\s*deallocate\(xc.*?\)\n', '\n', src)
+src = src.replace('integer, intent(in) :: n\n        integer :: i\n        allocate', 'integer, intent(in) :: n\n        integer :: i\n        ! allocate')
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_src "no allocatable xc/yc/zc" /tmp/noxc_$$.f90 -O3
+rm -f /tmp/noxc_$$.f90
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_cl5c.sh b/mini/run_cl5c.sh
new file mode 100755
index 0000000000..6734f50d91
--- /dev/null
+++ b/mini/run_cl5c.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:20:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5c.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5c.log
+#SBATCH -J cl5c-ice-matmul
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+SRC="$MINI/m_cl5_mini.f90"
+
+echo "=== matmul vs rot(3,3) field isolation test ==="
+$F90 --version 2>&1 | head -1
+echo ""
+
+try_f90() {
+    local desc="$1"; local src="$2"; shift 2
+    local out
+    rm -f /tmp/cl5c_$$.x m_cl5.mod
+    out=$($F90 $BASE "$@" "$src" -o /tmp/cl5c_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/cl5c_$$.x m_cl5.mod
+    if [ $rc -eq 0 ]; then echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+# A: Keep rot(3,3) field but replace matmul with element-wise multiply
+python3 - "$SRC" > /tmp/no_matmul_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Replace matmul(X, Y) with sum of manually computed dot products (fake matmul)
+# Just replace matmul calls with equivalent array expressions
+src = re.sub(r'matmul\(patches\(id\)%rot_inv, dv\)', 'dv', src)
+src = re.sub(r'matmul\(patches\(id\)%rot_inv, xyz_local\)', 'dv', src)
+src = re.sub(r'matmul\(patches\(id\)%rot, dv\)', 'dv', src)
+src = re.sub(r'matmul\(patches\(id\)%rot_inv, ldv\)', 'ldv', src)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "rot(3,3) field kept, matmul REMOVED (identity)" /tmp/no_matmul_$$.f90 -O3
+rm -f /tmp/no_matmul_$$.f90
+
+# B: Remove rot(3,3) fields but keep matmul via a local array
+python3 - "$SRC" > /tmp/no_rotfield_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Remove rot/rot_inv struct fields
+src = re.sub(r'\s*real\(wp\), dimension\(1:3,1:3\) :: rot, rot_inv\n', '\n', src)
+# Replace struct accesses with local identity
+src = re.sub(r'patches\(id\)%rot_inv', 'reshape([1._wp,0._wp,0._wp,0._wp,1._wp,0._wp,0._wp,0._wp,1._wp],[3,3])', src)
+src = re.sub(r'patches\(id\)%rot\b', 'reshape([1._wp,0._wp,0._wp,0._wp,1._wp,0._wp,0._wp,0._wp,1._wp],[3,3])', src)
+# Remove initialization of rot/rot_inv in s_init
+src = re.sub(r'\s*patches\(i\)%rot.*\n', '\n', src)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "rot(3,3) field REMOVED, matmul kept (local identity)" /tmp/no_rotfield_$$.f90 -O3
+rm -f /tmp/no_rotfield_$$.f90
+
+# C: Both removed — equivalent to previous "no rotation" test
+python3 - "$SRC" > /tmp/neither_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+src = re.sub(r'\s*real\(wp\), dimension\(1:3,1:3\) :: rot, rot_inv\n', '\n', src)
+src = re.sub(r'matmul\(.*?\)', 'dv', src)
+src = re.sub(r'\s*patches\(i\)%rot.*\n', '\n', src)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "both rot(3,3) AND matmul removed" /tmp/neither_$$.f90 -O3
+rm -f /tmp/neither_$$.f90
+
+# D: Original with just geo3 (which has matmul) — minimum loop count?
+python3 - "$SRC" > /tmp/geo3only_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+# Keep only 1 loop with matmul (geo3)
+for g in [1,2,4,5,6,7,8,9,10]:
+    src = re.sub(
+        r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g),
+        '\n', src)
+    src = re.sub(
+        r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g),
+        '\n', src, flags=re.DOTALL)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "1 loop (geo3 only, has matmul)" /tmp/geo3only_$$.f90 -O3
+rm -f /tmp/geo3only_$$.f90
+
+# E: 2 loops with matmul (geo3 + geo4)
+python3 - "$SRC" > /tmp/geo34_$$.f90 << 'PYEOF'
+import sys, re
+src = open(sys.argv[1]).read()
+for g in [1,2,5,6,7,8,9,10]:
+    src = re.sub(
+        r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g),
+        '\n', src)
+    src = re.sub(
+        r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g),
+        '\n', src, flags=re.DOTALL)
+open('/dev/stdout', 'w').write(src)
+PYEOF
+try_f90 "2 loops with matmul (geo3 + geo4)" /tmp/geo34_$$.f90 -O3
+rm -f /tmp/geo34_$$.f90
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_ice_min.sh b/mini/run_ice_min.sh
new file mode 100755
index 0000000000..41a698074b
--- /dev/null
+++ b/mini/run_ice_min.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:15:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/ice_min.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/ice_min.log
+#SBATCH -J ice-min
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+
+IFX=/sw/eb/sw/intel-compilers/2025.1.1/compiler/2025.1/bin/ifx
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+
+echo "=== ifx #5633 absolute minimum: matmul in declare-target sub called from target teams loop ==="
+$IFX --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/icemin_$$.x m_min.mod
+    out=$($IFX $BASE "$@" "$MINI/m_ice_min.f90" -o /tmp/icemin_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/icemin_$$.x m_min.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5633|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+try "-O3"                   -O3
+try "-O2"                   -O2
+try "-O1"                   -O1
+try "-O0"                   -O0
+try "-O3 -fno-inline"       -O3 -fno-inline
+
+echo ""
+echo "--- Variant: replace matmul with element-wise loop (no matmul intrinsic) ---"
+python3 - "$MINI/m_ice_min.f90" > /tmp/no_matmul_$$.f90 << 'PYEOF'
+import sys
+src = open(sys.argv[1]).read()
+src = src.replace(
+    "        b = real(k, wp) * matmul(A, x)",
+    "        integer :: ii, jj\n        do ii = 1, 3\n            b(ii) = 0._wp\n            do jj = 1, 3\n                b(ii) = b(ii) + A(ii,jj)*x(jj)\n            end do\n            b(ii) = b(ii) * real(k, wp)\n        end do")
+open('/dev/stdout', 'w').write(src)
+PYEOF
+rm -f /tmp/icemin_nm_$$.x m_min.mod
+out=$($IFX $BASE -O3 /tmp/no_matmul_$$.f90 -o /tmp/icemin_nm_$$.x 2>&1)
+rc=$?
+rm -f /tmp/icemin_nm_$$.x m_min.mod /tmp/no_matmul_$$.f90
+if [ $rc -eq 0 ]; then
+    echo "OK   : -O3 no matmul (manual loops) -- matmul intrinsic IS the trigger"
+elif echo "$out" | grep -qiE "5633|segmentation"; then
+    echo "ICE  : -O3 no matmul -- ICE even without matmul intrinsic"
+else
+    echo "ERR  : $(echo "$out" | grep -v '^$' | head -1)"
+fi
+
+echo ""
+echo "=== Done ==="
diff --git a/mini/run_ice_repro.sh b/mini/run_ice_repro.sh
new file mode 100755
index 0000000000..32e582f6cc
--- /dev/null
+++ b/mini/run_ice_repro.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#SBATCH -p pvc
+#SBATCH -N 1
+#SBATCH --gres=gpu:pvc:1
+#SBATCH -t 0:15:00
+#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/ice_repro.log
+#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/ice_repro.log
+#SBATCH -J ice-repro-final
+
+source /etc/profile
+module load iimpi/2025a imkl/2025.1.0
+export I_MPI_F90=ifx
+
+# Use ifx directly (not mpif90) since this is a single-file reproducer
+IFX=/sw/eb/sw/intel-compilers/2025.1.1/compiler/2025.1/bin/ifx
+MINI=/scratch/user/u.sb27915/MFC-intel/mini
+BASE="-free -fiopenmp -fopenmp-targets=spir64"
+
+echo "=== ifx #5633 minimal reproducer: matmul(derived_type%mat, vec) in declare-target sub ==="
+$IFX --version 2>&1 | head -1
+echo ""
+
+try() {
+    local desc="$1"; shift
+    local out
+    rm -f /tmp/ice_$$.x m_ice.mod
+    out=$($IFX $BASE "$@" "$MINI/m_ice_repro.f90" -o /tmp/ice_$$.x 2>&1)
+    local rc=$?
+    rm -f /tmp/ice_$$.x m_ice.mod
+    if [ $rc -eq 0 ]; then
+        echo "OK   : $desc"
+    elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+        echo "ICE  : $desc"
+        echo "     $(echo "$out" | grep -iE "5633|error #" | head -1)"
+    else
+        echo "ERR  : $desc"
+        echo "     $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)"
+    fi
+}
+
+try "-O3"                      -O3
+try "-O2"                      -O2
+try "-O1"                      -O1
+try "-O3 -fno-inline (fix)"    -O3 -fno-inline
+try "-O0"                      -O0
+
+echo ""
+echo "--- Variant: matmul on local array (no struct member) ---"
+python3 - "$MINI/m_ice_repro.f90" > /tmp/local_mat_$$.f90 << 'PYEOF'
+import sys
+src = open(sys.argv[1]).read()
+# Add local_mat to the declaration block (before first executable)
+src = src.replace(
+    '        real(wp), dimension(3) :: dv, ldv\n        real(wp) :: d',
+    '        real(wp), dimension(3) :: dv, ldv\n        real(wp), dimension(3,3) :: local_mat\n        real(wp) :: d')
+# Copy struct member to local before matmul
+src = src.replace(
+    '        ! matmul with a derived-type member (patches(id)%mat) is the key trigger\n        ldv = matmul(patches(id)%mat, dv)',
+    '        local_mat = patches(id)%mat\n        ! matmul with LOCAL copy (not struct member directly)\n        ldv = matmul(local_mat, dv)')
+open('/dev/stdout', 'w').write(src)
+PYEOF
+rm -f /tmp/ice_lm_$$.x m_ice.mod
+out=$($IFX $BASE -O3 /tmp/local_mat_$$.f90 -o /tmp/ice_lm_$$.x 2>&1)
+rc=$?
+rm -f /tmp/ice_lm_$$.x m_ice.mod /tmp/local_mat_$$.f90
+if [ $rc -eq 0 ]; then
+    echo "OK   : -O3 with local_mat copy -- direct struct access IS the trigger"
+elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then
+    echo "ICE  : -O3 with local_mat copy -- copying to local doesn't help"
+else
+    echo "ERR  : $(echo "$out" | grep -v '^$' | head -1)"
+fi
+
+echo ""
+echo "=== Done ==="
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
index 7620e7607f..00f26a99a2 100644
--- a/src/common/include/omp_macros.fpp
+++ b/src/common/include/omp_macros.fpp
@@ -2,7 +2,7 @@
 
 #:set NVIDIA_COMPILER_ID="NVHPC"
 #:set PGI_COMPILER_ID="PGI"
-#:set INTEL_COMPILER_ID="Intel"
+#:set INTEL_COMPILER_ID="IntelLLVM"
 #:set CCE_COMPILER_ID="Cray"
 #:set AMD_COMPILER_ID="LLVMFlang"
 
@@ -182,6 +182,20 @@
         #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) '
     #:elif MFC_COMPILER == AMD_COMPILER_ID
         #:set omp_start_directive = '!$omp target teams distribute parallel do '
+    #:elif MFC_COMPILER == INTEL_COMPILER_ID
+        #! Intel OMP 5.2: bind(a,b) is invalid. Drop explicit firstprivate list:
+        #! ifx rejects firstprivate for declare-target module variables (#7655).
+        #! OMP 5.0 rule: unclaused scalars in target constructs are firstprivate
+        #! by default, so scalar locals (e.g. gp_layers_z) are covered implicitly.
+        #! declare-target vars (e.g. gp_layers) are device-resident -- no mapping
+        #! needed. defaultmap(firstprivate:scalar) also unsupported by ifx (#9061).
+        #:set omp_start_directive = '!$omp target teams loop '
+        #:set clause_val = collapse_val.strip('\n') + &
+            & default_val.strip('\n') + GEN_PRIVATE_STR(private, False).strip('\n') + &
+            & reduction_val.strip('\n') + copy_val.strip('\n') + copyin_val.strip('\n') + &
+            & copyout_val.strip('\n') + create_val.strip('\n') + &
+            & no_create_val.strip('\n') + present_val.strip('\n') + &
+            & deviceptr_val.strip('\n') + attach_val.strip('\n')
     #:else
         #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
     #:endif
@@ -197,6 +211,8 @@
         #:set omp_end_directive = '!$omp end target teams distribute parallel do simd'
     #:elif MFC_COMPILER == AMD_COMPILER_ID
         #:set omp_end_directive = '!$omp end target teams distribute parallel do'
+    #:elif MFC_COMPILER == INTEL_COMPILER_ID
+        #:set omp_end_directive = '!$omp end target teams loop'
     #:else
         #:set omp_end_directive = '!$omp end target teams loop'
     #:endif
@@ -218,7 +234,9 @@
         #:set function_name_val = ''
     #:endif
 
-    #:if MFC_COMPILER == AMD_COMPILER_ID
+    #:if MFC_COMPILER == AMD_COMPILER_ID or MFC_COMPILER == INTEL_COMPILER_ID
+        #! AMD: device_type unsupported. Intel: OpenMP 5.2 requires an enter/to/link/local
+        #! clause alongside device_type; omit device_type entirely for both.
         #:set clause_val = ''
     #:else
         #:set clause_val = nohost_val.strip('\n')
@@ -374,4 +392,10 @@
         $:code
     #:endif
 #:enddef
+
+#:def OMP_MKL_DISPATCH()
+    #:if MFC_COMPILER == INTEL_COMPILER_ID
+        !$omp dispatch
+    #:endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index b1382ec49a..5f3ec2ccb0 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -303,4 +303,12 @@
 
 #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID)
 #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID)
+
+! Dispatch to oneMKL GPU FFT via OpenMP dispatch construct (Intel GPU only)
+#:def GPU_MKL_DISPATCH()
+    #:set omp_code = OMP_MKL_DISPATCH()
+#if defined(MFC_OpenMP)
+    $:omp_code
+#endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp
index 36bee0a23a..fc3f783e98 100644
--- a/src/common/include/shared_parallel_macros.fpp
+++ b/src/common/include/shared_parallel_macros.fpp
@@ -1,12 +1,13 @@
 #:set NVIDIA_COMPILER_ID="NVHPC"
 #:set PGI_COMPILER_ID="PGI"
-#:set INTEL_COMPILER_ID="Intel"
+#:set INTEL_COMPILER_ID="IntelLLVM"
 #:set CCE_COMPILER_ID="Cray"
 #:set AMD_COMPILER_ID="LLVMFlang"
 
 #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID)
 #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID)
 #:set USING_AMD = (MFC_COMPILER == AMD_COMPILER_ID)
+#:set USING_INTEL = (MFC_COMPILER == INTEL_COMPILER_ID)
 
 #:def ASSERT_LIST(data, datatype)
     #:assert data is not None
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 6eb5383f75..33b874c9c5 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -284,11 +284,11 @@ contains
     subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf)
 
         $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
@@ -371,7 +371,7 @@ contains
     subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in, q_T_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout)                                               :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout)                                                      :: q_prim_vf
         real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in, mv_in
         integer, intent(in)                                                                                  :: bc_dir, bc_loc
         integer, intent(in)                                                                                  :: k, l
@@ -619,7 +619,7 @@ contains
     subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in, q_T_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout)                                               :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout)                                                      :: q_prim_vf
         real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in, mv_in
         integer, intent(in)                                                                                  :: bc_dir, bc_loc
         integer, intent(in)                                                                                  :: k, l
@@ -778,7 +778,7 @@ contains
     subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout)                                               :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout)                                                      :: q_prim_vf
         real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), optional, intent(inout) :: pb_in, mv_in
         integer, intent(in)                                                                                  :: k, l
         integer                                                                                              :: j, q, i
@@ -833,11 +833,11 @@ contains
     subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf)
 
         $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  !< bc_x%beg
@@ -986,11 +986,11 @@ contains
 
         $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  !< bc_x%beg
@@ -1158,11 +1158,11 @@ contains
     subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf)
 
         $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in)                                    :: bc_dir, bc_loc
-        integer, intent(in)                                    :: k, l
-        integer                                                :: j, i
-        type(scalar_field), optional, intent(inout)            :: q_T_sf
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
+        type(scalar_field), optional, intent(inout)     :: q_T_sf
 
 #ifdef MFC_SIMULATION
         if (bc_dir == 1) then  !< x-direction
@@ -1461,10 +1461,10 @@ contains
     subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l)
 
         $:GPU_ROUTINE(function_name='s_color_function_periodic', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
-        integer, intent(in)                                        :: bc_dir, bc_loc
-        integer, intent(in)                                        :: k, l
-        integer                                                    :: j, i
+        type(scalar_field), dimension(:), intent(inout) :: c_divs
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
@@ -1516,10 +1516,10 @@ contains
     subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l)
 
         $:GPU_ROUTINE(function_name='s_color_function_reflective', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
-        integer, intent(in)                                        :: bc_dir, bc_loc
-        integer, intent(in)                                        :: k, l
-        integer                                                    :: j, i
+        type(scalar_field), dimension(:), intent(inout) :: c_divs
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
@@ -1595,10 +1595,10 @@ contains
     subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l)
 
         $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True)
-        type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
-        integer, intent(in)                                        :: bc_dir, bc_loc
-        integer, intent(in)                                        :: k, l
-        integer                                                    :: j, i
+        type(scalar_field), dimension(:), intent(inout) :: c_divs
+        integer, intent(in)                             :: bc_dir, bc_loc
+        integer, intent(in)                             :: k, l
+        integer                                         :: j, i
 
         if (bc_dir == 1) then  !< x-direction
             if (bc_loc == -1) then  ! bc_x%beg
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index ec10c6a3dc..5b684ea51c 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -30,8 +30,8 @@ contains
 
         $:GPU_ROUTINE(function_name='compute_viscosity_and_inversion',parallelism='[seq]', cray_inline=True)
 
-        real(wp), intent(inout)                         :: T_L, T_R, Re_L, Re_R
-        real(wp), dimension(num_species), intent(inout) :: Ys_R, Ys_L
+        real(wp), intent(inout)               :: T_L, T_R, Re_L, Re_R
+        real(wp), dimension(:), intent(inout) :: Ys_R, Ys_L
 
         call get_mixture_viscosity_mixavg(T_L, Ys_L, Re_L)
         call get_mixture_viscosity_mixavg(T_R, Ys_R, Re_R)
@@ -114,7 +114,7 @@ contains
         real(wp)                                               :: T
         real(wp)                                               :: rho, omega_m
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: Ys
             real(wp), dimension(10) :: omega
         #:else
@@ -157,7 +157,7 @@ contains
         integer, intent(in)                                    :: idir
         type(scalar_field), intent(in)                         :: q_T_sf
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
             real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
             real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index 6bd9718dbb..e2a36f418c 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -26,11 +26,11 @@ contains
     subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in)                :: vftmp
-        real(wp), dimension(nb), intent(in) :: Rtmp
-        real(wp), intent(out)               :: ntmp
-        real(wp), dimension(nb), intent(in) :: weights
-        real(wp)                            :: R3
+        real(wp), intent(in)               :: vftmp
+        real(wp), dimension(:), intent(in) :: Rtmp
+        real(wp), intent(out)              :: ntmp
+        real(wp), dimension(:), intent(in) :: weights
+        real(wp)                           :: R3
 
         R3 = dot_product(weights, Rtmp**3._wp)
         ntmp = (3._wp/(4._wp*pi))*vftmp/R3
@@ -41,11 +41,11 @@ contains
     subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in)                :: vftmp
-        real(wp), dimension(nb), intent(in) :: nRtmp
-        real(wp), intent(out)               :: ntmp
-        real(wp), dimension(nb), intent(in) :: weights
-        real(wp)                            :: nR3
+        real(wp), intent(in)               :: vftmp
+        real(wp), dimension(:), intent(in) :: nRtmp
+        real(wp), intent(out)              :: ntmp
+        real(wp), dimension(:), intent(in) :: weights
+        real(wp)                           :: nR3
 
         nR3 = dot_product(weights, nRtmp**3._wp)
         ntmp = sqrt((4._wp*pi/3._wp)*nR3/vftmp)
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 7d4b92705c..cc42771028 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -799,716 +799,712 @@ contains
                                 do i = 1, nVar
                                     r = (i - 1) + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
                                     q_comm(i)%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_comm(i)%sf(j + unpack_offset, k, l))) then
-                                        print *, "Error", j, k, l, i
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                                    print *, "Error", j, k, l, i
+                                    call s_mpi_abort("NaN(s) in recv")
+                                end if
 #endif
-                                end do
                             end do
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
+                end do
+                $:END_GPU_PARALLEL_LOOP()
 
-                    if (chem_diff_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                        do l = 0, p
-                            do k = 0, n
-                                do j = -buff_size, -1
-                                    r = nVar + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
-                                    q_T_sf%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                if (chem_diff_comm) then
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, p
+                        do k = 0, n
+                            do j = -buff_size, -1
+                                r = nVar + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
+                                q_T_sf%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_T_sf%sf(j + unpack_offset, k, l))) then
-                                        print *, "Error", j, k, l
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                                print *, "Error", j, k, l
+                                call s_mpi_abort("NaN(s) in recv")
+                            end if
 #endif
-                                end do
-                            end do
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
 
-                    if (qbmm_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do l = 0, p
-                            do k = 0, n
-                                do j = -buff_size, -1
-                                    do i = nVar + 1, nVar + nnode
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
-                                            pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
+            if (qbmm_comm) then
+                $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+                do l = 0, p
+                    do k = 0, n
+                        do j = -buff_size, -1
+                            do i = nVar + 1, nVar + nnode
+                                do q = 1, nb
+                                    r = (i - 1) + (q - 1)*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
+                                    pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                                 end do
                             end do
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do l = 0, p
-                            do k = 0, n
-                                do j = -buff_size, -1
-                                    do i = nVar + 1, nVar + nnode
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
-                                            mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+                do l = 0, p
+                    do k = 0, n
+                        do j = -buff_size, -1
+                            do i = nVar + 1, nVar + nnode
+                                do q = 1, nb
+                                    r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l))
+                                    mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                                 end do
                             end do
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-                #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
-                    do i = 1, nVar
-                        do l = 0, p
-                            do k = -buff_size, -1
-                                do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
-                                    q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:elif mpi_dir == 2
+            $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
+            do i = 1, nVar
+                do l = 0, p
+                    do k = -buff_size, -1
+                        do j = -buff_size, m + buff_size
+                            r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
+                            q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_comm(i)%sf(j, k + unpack_offset, l))) then
-                                        print *, "Error", j, k, l, i
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                            print *, "Error", j, k, l, i
+                            call s_mpi_abort("NaN(s) in recv")
+                        end if
 #endif
-                                end do
-                            end do
-                        end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (chem_diff_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                        do l = 0, p
-                            do k = -buff_size, -1
-                                do j = -buff_size, m + buff_size
-                                    r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
-                                    q_T_sf%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        if (chem_diff_comm) then
+            $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+            do l = 0, p
+                do k = -buff_size, -1
+                    do j = -buff_size, m + buff_size
+                        r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l))
+                        q_T_sf%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_T_sf%sf(j, k + unpack_offset, l))) then
-                                        print *, "Error", j, k, l
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
-#endif
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
+                        print *, "Error", j, k, l
+                        call s_mpi_abort("NaN(s) in recv")
                     end if
-
-                    if (qbmm_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = 0, p
-                                do k = -buff_size, -1
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k &
-                                                 & + buff_size) + buff_size*l))
-                                            pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
+#endif
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+    end if
+
+    if (qbmm_comm) then
+        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+        do i = nVar + 1, nVar + nnode
+            do l = 0, p
+                do k = -buff_size, -1
+                    do j = -buff_size, m + buff_size
+                        do q = 1, nb
+                            r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) &
+                                 & + buff_size*l))
+                            pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = 0, p
-                                do k = -buff_size, -1
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size &
-                                                 & + 1)*((k + buff_size) + buff_size*l))
-                                            mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
+                    end do
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+        do i = nVar + 1, nVar + nnode
+            do l = 0, p
+                do k = -buff_size, -1
+                    do j = -buff_size, m + buff_size
+                        do q = 1, nb
+                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k &
+                                 & + buff_size) + buff_size*l))
+                            mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp)
                         end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-                #:else
-                    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
-                    do i = 1, nVar
-                        do l = -buff_size, -1
-                            do k = -buff_size, n + buff_size
-                                do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n &
-                                         & + 2*buff_size + 1)*(l + buff_size)))
-                                    q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+                    end do
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+    end if
+#:else
+    $:GPU_PARALLEL_LOOP(collapse=4,private='[r]')
+    do i = 1, nVar
+        do l = -buff_size, -1
+            do k = -buff_size, n + buff_size
+                do j = -buff_size, m + buff_size
+                    r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l &
+                         & + buff_size)))
+                    q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_comm(i)%sf(j, k, l + unpack_offset))) then
-                                        print *, "Error", j, k, l, i
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
+                    print *, "Error", j, k, l, i
+                    call s_mpi_abort("NaN(s) in recv")
+                end if
 #endif
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (chem_diff_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
-                        do l = -buff_size, -1
-                            do k = -buff_size, n + buff_size
-                                do j = -buff_size, m + buff_size
-                                    r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n &
-                                                       & + 2*buff_size + 1)*(l + buff_size)))
-                                    q_T_sf%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
-#if defined(__INTEL_COMPILER)
+            end do
+        end do
+    end do
+end do
+$:END_GPU_PARALLEL_LOOP()
+
+if (chem_diff_comm) then
+    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+    do l = -buff_size, -1
+        do k = -buff_size, n + buff_size
+            do j = -buff_size, m + buff_size
+                r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l &
+                                   & + buff_size)))
+                q_T_sf%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp)
+#if defined(__INTEL_COMPILER) && !defined(MFC_GPU)
                                     if (ieee_is_nan(q_T_sf%sf(j, k, l + unpack_offset))) then
-                                        print *, "Error", j, k, l
-                                        call s_mpi_abort("NaN(s) in recv")
-                                    end if
-#endif
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-
-                    if (qbmm_comm) then
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = -buff_size, -1
-                                do k = -buff_size, n + buff_size
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k &
-                                                 & + buff_size) + (n + 2*buff_size + 1)*(l + buff_size)))
-                                            pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
-                        do i = nVar + 1, nVar + nnode
-                            do l = -buff_size, -1
-                                do k = -buff_size, n + buff_size
-                                    do j = -buff_size, m + buff_size
-                                        do q = 1, nb
-                                            r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size &
-                                                 & + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l + buff_size)))
-                                            mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
-                                        end do
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-                #:endif
+                print *, "Error", j, k, l
+                call s_mpi_abort("NaN(s) in recv")
             end if
-        #:endfor
-        call nvtxEndRange
+#endif
+        end do
+    end do
+end do
+$:END_GPU_PARALLEL_LOOP()
+end if
+
+if (qbmm_comm) then
+    $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+    do i = nVar + 1, nVar + nnode
+        do l = -buff_size, -1
+            do k = -buff_size, n + buff_size
+                do j = -buff_size, m + buff_size
+                    do q = 1, nb
+                        r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n &
+                             & + 2*buff_size + 1)*(l + buff_size)))
+                        pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
+                    end do
+                end do
+            end do
+        end do
+    end do
+    $:END_GPU_PARALLEL_LOOP()
+
+    $:GPU_PARALLEL_LOOP(collapse=5,private='[r]')
+    do i = nVar + 1, nVar + nnode
+        do l = -buff_size, -1
+            do k = -buff_size, n + buff_size
+                do j = -buff_size, m + buff_size
+                    do q = 1, nb
+                        r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) &
+                             & + (n + 2*buff_size + 1)*(l + buff_size)))
+                        mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp)
+                    end do
+                end do
+            end do
+        end do
+    end do
+    $:END_GPU_PARALLEL_LOOP()
+end if
+#:endif
+end if
+#:endfor
+call nvtxEndRange
 #endif
 
-    end subroutine s_mpi_sendrecv_variables_buffers
+end subroutine s_mpi_sendrecv_variables_buffers
 
-    !> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction.
-    subroutine s_mpi_decompose_computational_domain
+!> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction.
+subroutine s_mpi_decompose_computational_domain
 
 #ifdef MFC_MPI
-        integer :: num_procs_x, num_procs_y, num_procs_z  !< Optimal number of processors in the x-, y- and z-directions
-        !> Non-optimal number of processors in the x-, y- and z-directions
-        real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z
-        real(wp) :: fct_min        !< Processor factorization (fct) minimization parameter
-        integer  :: MPI_COMM_CART  !< Cartesian processor topology communicator
-        integer  :: rem_cells      !< Remaining cells after distribution among processors
-        integer  :: recon_order    !< WENO or MUSCL reconstruction order
-        integer  :: i, j           !< Generic loop iterators
-        integer  :: ierr           !< Generic flag used to identify and report MPI errors
-
-        if (recon_type == WENO_TYPE) then
-            recon_order = weno_order
-        else
-            recon_order = muscl_order
-        end if
+    integer :: num_procs_x, num_procs_y, num_procs_z  !< Optimal number of processors in the x-, y- and z-directions
+    !> Non-optimal number of processors in the x-, y- and z-directions
+    real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z
+    real(wp) :: fct_min        !< Processor factorization (fct) minimization parameter
+    integer  :: MPI_COMM_CART  !< Cartesian processor topology communicator
+    integer  :: rem_cells      !< Remaining cells after distribution among processors
+    integer  :: recon_order    !< WENO or MUSCL reconstruction order
+    integer  :: i, j           !< Generic loop iterators
+    integer  :: ierr           !< Generic flag used to identify and report MPI errors
+
+    if (recon_type == WENO_TYPE) then
+        recon_order = weno_order
+    else
+        recon_order = muscl_order
+    end if
+
+    if (num_procs == 1 .and. parallel_io) then
+        do i = 1, num_dims
+            start_idx(i) = 0
+        end do
+        return
+    end if
 
-        if (num_procs == 1 .and. parallel_io) then
-            do i = 1, num_dims
-                start_idx(i) = 0
-            end do
-            return
-        end if
+    if (igr) then
+        recon_order = igr_order
+    end if
 
-        if (igr) then
-            recon_order = igr_order
-        end if
+    ! 3D Cartesian Processor Topology
+    if (n > 0) then
+        if (p > 0) then
+            if (fft_wrt) then
+                ! Initial estimate of optimal processor topology
+                num_procs_x = 1
+                num_procs_y = 1
+                num_procs_z = num_procs
+                ierr = -1
 
-        ! 3D Cartesian Processor Topology
-        if (n > 0) then
-            if (p > 0) then
-                if (fft_wrt) then
-                    ! Initial estimate of optimal processor topology
+                ! Benchmarking the quality of this initial guess
+                tmp_num_procs_y = num_procs_y
+                tmp_num_procs_z = num_procs_z
+                fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+
+                ! Optimization of the initial processor topology
+                do i = 1, num_procs
+                    if (mod(num_procs, i) == 0 .and. (n + 1)/i >= num_stcls_min*recon_order) then
+                        tmp_num_procs_y = i
+                        tmp_num_procs_z = num_procs/i
+
+                        if (fct_min >= abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
+                            & /tmp_num_procs_z >= num_stcls_min*recon_order) then
+                            num_procs_y = i
+                            num_procs_z = num_procs/i
+                            fct_min = abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                            ierr = 0
+                        end if
+                    end if
+                end do
+            else
+                if (cyl_coord .and. p > 0) then
+                    ! Pencil blocking for cylindrical coordinates (Fourier filter near axis)
+
+                    ! Initial values of the processor factorization optimization
                     num_procs_x = 1
-                    num_procs_y = 1
-                    num_procs_z = num_procs
+                    num_procs_y = num_procs
+                    num_procs_z = 1
                     ierr = -1
 
-                    ! Benchmarking the quality of this initial guess
+                    ! Computing minimization variable for these initial values
+                    tmp_num_procs_x = num_procs_x
                     tmp_num_procs_y = num_procs_y
                     tmp_num_procs_z = num_procs_z
-                    fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                    fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
 
-                    ! Optimization of the initial processor topology
+                    ! Searching for optimal computational domain distribution
                     do i = 1, num_procs
-                        if (mod(num_procs, i) == 0 .and. (n + 1)/i >= num_stcls_min*recon_order) then
-                            tmp_num_procs_y = i
-                            tmp_num_procs_z = num_procs/i
-
-                            if (fct_min >= abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
-                                & /tmp_num_procs_z >= num_stcls_min*recon_order) then
-                                num_procs_y = i
-                                num_procs_z = num_procs/i
-                                fct_min = abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                        if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
+                            tmp_num_procs_x = i
+                            tmp_num_procs_y = num_procs/i
+
+                            if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
+                                & /tmp_num_procs_y >= num_stcls_min*recon_order) then
+                                num_procs_x = i
+                                num_procs_y = num_procs/i
+                                fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
                                 ierr = 0
                             end if
                         end if
                     end do
                 else
-                    if (cyl_coord .and. p > 0) then
-                        ! Pencil blocking for cylindrical coordinates (Fourier filter near axis)
-
-                        ! Initial values of the processor factorization optimization
-                        num_procs_x = 1
-                        num_procs_y = num_procs
-                        num_procs_z = 1
-                        ierr = -1
-
-                        ! Computing minimization variable for these initial values
-                        tmp_num_procs_x = num_procs_x
-                        tmp_num_procs_y = num_procs_y
-                        tmp_num_procs_z = num_procs_z
-                        fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-
-                        ! Searching for optimal computational domain distribution
-                        do i = 1, num_procs
-                            if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
-                                tmp_num_procs_x = i
-                                tmp_num_procs_y = num_procs/i
-
-                                if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
-                                    & /tmp_num_procs_y >= num_stcls_min*recon_order) then
-                                    num_procs_x = i
-                                    num_procs_y = num_procs/i
-                                    fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-                                    ierr = 0
-                                end if
-                            end if
-                        end do
-                    else
-                        ! Initial estimate of optimal processor topology
-                        num_procs_x = 1
-                        num_procs_y = 1
-                        num_procs_z = num_procs
-                        ierr = -1
-
-                        ! Benchmarking the quality of this initial guess
-                        tmp_num_procs_x = num_procs_x
-                        tmp_num_procs_y = num_procs_y
-                        tmp_num_procs_z = num_procs_z
-                        fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + 10._wp*abs((n + 1) &
-                                             & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
-
-                        ! Optimization of the initial processor topology
-                        do i = 1, num_procs
-                            if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
-                                do j = 1, num_procs/i
-                                    if (mod(num_procs/i, j) == 0 .and. (n + 1)/j >= num_stcls_min*recon_order) then
-                                        tmp_num_procs_x = i
-                                        tmp_num_procs_y = j
-                                        tmp_num_procs_z = num_procs/(i*j)
-
-                                        if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
-                                            & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
-                                            & /tmp_num_procs_z >= num_stcls_min*recon_order) then
-                                            num_procs_x = i
-                                            num_procs_y = j
-                                            num_procs_z = num_procs/(i*j)
-                                            fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
-                                                          & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
-                                            ierr = 0
-                                        end if
-                                    end if
-                                end do
-                            end if
-                        end do
-                    end if
-                end if
-
-                ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation
-                ! exits.
-                if (proc_rank == 0 .and. ierr == -1) then
-                    call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n, p and ' &
-                                     & // 'weno/muscl/igr_order. Exiting.')
-                end if
-
-                ! Creating new communicator using the Cartesian topology
-                call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, num_procs_y, num_procs_z/), (/.true., .true., .true./), &
-                                     & .false., MPI_COMM_CART, ierr)
-
-                ! Finding the Cartesian coordinates of the local process
-                call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr)
-
-                ! Global Parameters for z-direction
-
-                ! Number of remaining cells
-                rem_cells = mod(p + 1, num_procs_z)
-
-                ! Optimal number of cells per processor
-                p = (p + 1)/num_procs_z - 1
-
-                ! Distributing the remaining cells
-                do i = 1, rem_cells
-                    if (proc_coords(3) == i - 1) then
-                        p = p + 1; exit
-                    end if
-                end do
-
-                ! Boundary condition at the beginning
-                if (proc_coords(3) > 0 .or. (bc_z%beg == BC_PERIODIC .and. num_procs_z > 1)) then
-                    proc_coords(3) = proc_coords(3) - 1
-                    call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr)
-                    proc_coords(3) = proc_coords(3) + 1
-                end if
-
-                ! Boundary condition at the end
-                if (proc_coords(3) < num_procs_z - 1 .or. (bc_z%end == BC_PERIODIC .and. num_procs_z > 1)) then
-                    proc_coords(3) = proc_coords(3) + 1
-                    call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr)
-                    proc_coords(3) = proc_coords(3) - 1
-                end if
-
-#ifdef MFC_POST_PROCESS
-                ! Ghost zone at the beginning
-                if (proc_coords(3) > 0 .and. format == 1) then
-                    offset_z%beg = 2
-                else
-                    offset_z%beg = 0
-                end if
+                    ! Initial estimate of optimal processor topology
+                    num_procs_x = 1
+                    num_procs_y = 1
+                    num_procs_z = num_procs
+                    ierr = -1
 
-                ! Ghost zone at the end
-                if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then
-                    offset_z%end = 2
-                else
-                    offset_z%end = 0
-                end if
-#endif
+                    ! Benchmarking the quality of this initial guess
+                    tmp_num_procs_x = num_procs_x
+                    tmp_num_procs_y = num_procs_y
+                    tmp_num_procs_z = num_procs_z
+                    fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + 10._wp*abs((n + 1)/tmp_num_procs_y &
+                                         & - (p + 1)/tmp_num_procs_z)
 
-                ! Beginning and end sub-domain boundary locations
-                if (parallel_io) then
-                    if (proc_coords(3) < rem_cells) then
-                        start_idx(3) = (p + 1)*proc_coords(3)
-                    else
-                        start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
-                    end if
-                else
-#ifdef MFC_PRE_PROCESS
-                    if (old_grid .neqv. .true.) then
-                        dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp)
-
-                        if (proc_coords(3) < rem_cells) then
-                            z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3))
-                            z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1) - (num_procs_z &
-                                                                  & - rem_cells))
-                        else
-                            z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3) + rem_cells)
-                            z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1))
+                    ! Optimization of the initial processor topology
+                    do i = 1, num_procs
+                        if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
+                            do j = 1, num_procs/i
+                                if (mod(num_procs/i, j) == 0 .and. (n + 1)/j >= num_stcls_min*recon_order) then
+                                    tmp_num_procs_x = i
+                                    tmp_num_procs_y = j
+                                    tmp_num_procs_z = num_procs/(i*j)
+
+                                    if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
+                                        & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) &
+                                        & /tmp_num_procs_z >= num_stcls_min*recon_order) then
+                                        num_procs_x = i
+                                        num_procs_y = j
+                                        num_procs_z = num_procs/(i*j)
+                                        fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) &
+                                                      & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z)
+                                        ierr = 0
+                                    end if
+                                end if
+                            end do
                         end if
-                    end if
-#endif
+                    end do
                 end if
+            end if
 
-                ! 2D Cartesian Processor Topology
-            else
-                ! Initial estimate of optimal processor topology
-                num_procs_x = 1
-                num_procs_y = num_procs
-                ierr = -1
-
-                ! Benchmarking the quality of this initial guess
-                tmp_num_procs_x = num_procs_x
-                tmp_num_procs_y = num_procs_y
-                fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-
-                ! Optimization of the initial processor topology
-                do i = 1, num_procs
-                    if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
-                        tmp_num_procs_x = i
-                        tmp_num_procs_y = num_procs/i
-
-                        if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
-                            & /tmp_num_procs_y >= num_stcls_min*recon_order) then
-                            num_procs_x = i
-                            num_procs_y = num_procs/i
-                            fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
-                            ierr = 0
-                        end if
-                    end if
-                end do
-
-                ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation
-                ! exits.
-                if (proc_rank == 0 .and. ierr == -1) then
-                    call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n and ' &
-                                     & // 'weno/muscl/igr_order. Exiting.')
-                end if
+            ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation exits.
+            if (proc_rank == 0 .and. ierr == -1) then
+                call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n, p and ' &
+                                 & // 'weno/muscl/igr_order. Exiting.')
+            end if
 
-                ! Creating new communicator using the Cartesian topology
-                call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, num_procs_y/), (/.true., .true./), .false., MPI_COMM_CART, &
-                                     & ierr)
+            ! Creating new communicator using the Cartesian topology
+            call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, num_procs_y, num_procs_z/), (/.true., .true., .true./), &
+                                 & .false., MPI_COMM_CART, ierr)
 
-                ! Finding the Cartesian coordinates of the local process
-                call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr)
-            end if
+            ! Finding the Cartesian coordinates of the local process
+            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr)
 
-            ! Global Parameters for y-direction
+            ! Global Parameters for z-direction
 
             ! Number of remaining cells
-            rem_cells = mod(n + 1, num_procs_y)
+            rem_cells = mod(p + 1, num_procs_z)
 
             ! Optimal number of cells per processor
-            n = (n + 1)/num_procs_y - 1
+            p = (p + 1)/num_procs_z - 1
 
             ! Distributing the remaining cells
             do i = 1, rem_cells
-                if (proc_coords(2) == i - 1) then
-                    n = n + 1; exit
+                if (proc_coords(3) == i - 1) then
+                    p = p + 1; exit
                 end if
             end do
 
             ! Boundary condition at the beginning
-            if (proc_coords(2) > 0 .or. (bc_y%beg == BC_PERIODIC .and. num_procs_y > 1)) then
-                proc_coords(2) = proc_coords(2) - 1
-                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr)
-                proc_coords(2) = proc_coords(2) + 1
+            if (proc_coords(3) > 0 .or. (bc_z%beg == BC_PERIODIC .and. num_procs_z > 1)) then
+                proc_coords(3) = proc_coords(3) - 1
+                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr)
+                proc_coords(3) = proc_coords(3) + 1
             end if
 
             ! Boundary condition at the end
-            if (proc_coords(2) < num_procs_y - 1 .or. (bc_y%end == BC_PERIODIC .and. num_procs_y > 1)) then
-                proc_coords(2) = proc_coords(2) + 1
-                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr)
-                proc_coords(2) = proc_coords(2) - 1
+            if (proc_coords(3) < num_procs_z - 1 .or. (bc_z%end == BC_PERIODIC .and. num_procs_z > 1)) then
+                proc_coords(3) = proc_coords(3) + 1
+                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr)
+                proc_coords(3) = proc_coords(3) - 1
             end if
 
 #ifdef MFC_POST_PROCESS
             ! Ghost zone at the beginning
-            if (proc_coords(2) > 0 .and. format == 1) then
-                offset_y%beg = 2
+            if (proc_coords(3) > 0 .and. format == 1) then
+                offset_z%beg = 2
             else
-                offset_y%beg = 0
+                offset_z%beg = 0
             end if
 
             ! Ghost zone at the end
-            if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then
-                offset_y%end = 2
+            if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then
+                offset_z%end = 2
             else
-                offset_y%end = 0
+                offset_z%end = 0
             end if
 #endif
 
             ! Beginning and end sub-domain boundary locations
             if (parallel_io) then
-                if (proc_coords(2) < rem_cells) then
-                    start_idx(2) = (n + 1)*proc_coords(2)
+                if (proc_coords(3) < rem_cells) then
+                    start_idx(3) = (p + 1)*proc_coords(3)
                 else
-                    start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
+                    start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
                 end if
             else
 #ifdef MFC_PRE_PROCESS
                 if (old_grid .neqv. .true.) then
-                    dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp)
+                    dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp)
 
-                    if (proc_coords(2) < rem_cells) then
-                        y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2))
-                        y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1) - (num_procs_y &
+                    if (proc_coords(3) < rem_cells) then
+                        z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3))
+                        z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1) - (num_procs_z &
                                                               & - rem_cells))
                     else
-                        y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2) + rem_cells)
-                        y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1))
+                        z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3) + rem_cells)
+                        z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1))
                     end if
                 end if
 #endif
             end if
 
-            ! 1D Cartesian Processor Topology
+            ! 2D Cartesian Processor Topology
         else
-            ! Optimal processor topology
-            num_procs_x = num_procs
+            ! Initial estimate of optimal processor topology
+            num_procs_x = 1
+            num_procs_y = num_procs
+            ierr = -1
+
+            ! Benchmarking the quality of this initial guess
+            tmp_num_procs_x = num_procs_x
+            tmp_num_procs_y = num_procs_y
+            fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
+
+            ! Optimization of the initial processor topology
+            do i = 1, num_procs
+                if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then
+                    tmp_num_procs_x = i
+                    tmp_num_procs_y = num_procs/i
+
+                    if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) &
+                        & /tmp_num_procs_y >= num_stcls_min*recon_order) then
+                        num_procs_x = i
+                        num_procs_y = num_procs/i
+                        fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y)
+                        ierr = 0
+                    end if
+                end if
+            end do
+
+            ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation exits.
+            if (proc_rank == 0 .and. ierr == -1) then
+                call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n and ' &
+                                 & // 'weno/muscl/igr_order. Exiting.')
+            end if
 
             ! Creating new communicator using the Cartesian topology
-            call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), (/.true./), .false., MPI_COMM_CART, ierr)
+            call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, num_procs_y/), (/.true., .true./), .false., MPI_COMM_CART, ierr)
 
             ! Finding the Cartesian coordinates of the local process
-            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, proc_coords, ierr)
+            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr)
         end if
 
-        ! Global Parameters for x-direction
+        ! Global Parameters for y-direction
 
         ! Number of remaining cells
-        rem_cells = mod(m + 1, num_procs_x)
+        rem_cells = mod(n + 1, num_procs_y)
 
         ! Optimal number of cells per processor
-        m = (m + 1)/num_procs_x - 1
+        n = (n + 1)/num_procs_y - 1
 
         ! Distributing the remaining cells
         do i = 1, rem_cells
-            if (proc_coords(1) == i - 1) then
-                m = m + 1; exit
+            if (proc_coords(2) == i - 1) then
+                n = n + 1; exit
             end if
         end do
 
-        call s_update_cell_bounds(cells_bounds, m, n, p)
-
         ! Boundary condition at the beginning
-        if (proc_coords(1) > 0 .or. (bc_x%beg == BC_PERIODIC .and. num_procs_x > 1)) then
-            proc_coords(1) = proc_coords(1) - 1
-            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
-            proc_coords(1) = proc_coords(1) + 1
+        if (proc_coords(2) > 0 .or. (bc_y%beg == BC_PERIODIC .and. num_procs_y > 1)) then
+            proc_coords(2) = proc_coords(2) - 1
+            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr)
+            proc_coords(2) = proc_coords(2) + 1
         end if
 
         ! Boundary condition at the end
-        if (proc_coords(1) < num_procs_x - 1 .or. (bc_x%end == BC_PERIODIC .and. num_procs_x > 1)) then
-            proc_coords(1) = proc_coords(1) + 1
-            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
-            proc_coords(1) = proc_coords(1) - 1
+        if (proc_coords(2) < num_procs_y - 1 .or. (bc_y%end == BC_PERIODIC .and. num_procs_y > 1)) then
+            proc_coords(2) = proc_coords(2) + 1
+            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr)
+            proc_coords(2) = proc_coords(2) - 1
         end if
 
 #ifdef MFC_POST_PROCESS
         ! Ghost zone at the beginning
-        if (proc_coords(1) > 0 .and. format == 1) then
-            offset_x%beg = 2
+        if (proc_coords(2) > 0 .and. format == 1) then
+            offset_y%beg = 2
         else
-            offset_x%beg = 0
+            offset_y%beg = 0
         end if
 
         ! Ghost zone at the end
-        if (proc_coords(1) < num_procs_x - 1 .and. format == 1) then
-            offset_x%end = 2
+        if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then
+            offset_y%end = 2
         else
-            offset_x%end = 0
+            offset_y%end = 0
         end if
 #endif
 
         ! Beginning and end sub-domain boundary locations
         if (parallel_io) then
-            if (proc_coords(1) < rem_cells) then
-                start_idx(1) = (m + 1)*proc_coords(1)
+            if (proc_coords(2) < rem_cells) then
+                start_idx(2) = (n + 1)*proc_coords(2)
             else
-                start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
+                start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
             end if
         else
 #ifdef MFC_PRE_PROCESS
             if (old_grid .neqv. .true.) then
-                dx = (x_domain%end - x_domain%beg)/real(m_glb + 1, wp)
+                dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp)
 
-                if (proc_coords(1) < rem_cells) then
-                    x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1))
-                    x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells))
+                if (proc_coords(2) < rem_cells) then
+                    y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2))
+                    y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1) - (num_procs_y - rem_cells))
                 else
-                    x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells)
-                    x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1))
+                    y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2) + rem_cells)
+                    y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1))
                 end if
             end if
 #endif
         end if
+
+        ! 1D Cartesian Processor Topology
+    else
+        ! Optimal processor topology
+        num_procs_x = num_procs
+
+        ! Creating new communicator using the Cartesian topology
+        call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), (/.true./), .false., MPI_COMM_CART, ierr)
+
+        ! Finding the Cartesian coordinates of the local process
+        call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, proc_coords, ierr)
+    end if
+
+    ! Global Parameters for x-direction
+
+    ! Number of remaining cells
+    rem_cells = mod(m + 1, num_procs_x)
+
+    ! Optimal number of cells per processor
+    m = (m + 1)/num_procs_x - 1
+
+    ! Distributing the remaining cells
+    do i = 1, rem_cells
+        if (proc_coords(1) == i - 1) then
+            m = m + 1; exit
+        end if
+    end do
+
+    call s_update_cell_bounds(cells_bounds, m, n, p)
+
+    ! Boundary condition at the beginning
+    if (proc_coords(1) > 0 .or. (bc_x%beg == BC_PERIODIC .and. num_procs_x > 1)) then
+        proc_coords(1) = proc_coords(1) - 1
+        call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
+        proc_coords(1) = proc_coords(1) + 1
+    end if
+
+    ! Boundary condition at the end
+    if (proc_coords(1) < num_procs_x - 1 .or. (bc_x%end == BC_PERIODIC .and. num_procs_x > 1)) then
+        proc_coords(1) = proc_coords(1) + 1
+        call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
+        proc_coords(1) = proc_coords(1) - 1
+    end if
+
+#ifdef MFC_POST_PROCESS
+    ! Ghost zone at the beginning
+    if (proc_coords(1) > 0 .and. format == 1) then
+        offset_x%beg = 2
+    else
+        offset_x%beg = 0
+    end if
+
+    ! Ghost zone at the end
+    if (proc_coords(1) < num_procs_x - 1 .and. format == 1) then
+        offset_x%end = 2
+    else
+        offset_x%end = 0
+    end if
+#endif
+
+    ! Beginning and end sub-domain boundary locations
+    if (parallel_io) then
+        if (proc_coords(1) < rem_cells) then
+            start_idx(1) = (m + 1)*proc_coords(1)
+        else
+            start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
+        end if
+    else
+#ifdef MFC_PRE_PROCESS
+        if (old_grid .neqv. .true.) then
+            dx = (x_domain%end - x_domain%beg)/real(m_glb + 1, wp)
+
+            if (proc_coords(1) < rem_cells) then
+                x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1))
+                x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells))
+            else
+                x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells)
+                x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1))
+            end if
+        end if
+#endif
+    end if
 #endif
 
-    end subroutine s_mpi_decompose_computational_domain
+end subroutine s_mpi_decompose_computational_domain
 
-    !> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors.
-    !! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of
-    !! cell-boundary locations may be calculated directly from those of the cell-width distributions.
+!> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors.
+!! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of
+!! cell-boundary locations may be calculated directly from those of the cell-width distributions.
 #ifndef MFC_PRE_PROCESS
-    subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc)
+subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc)
 
-        integer, intent(in) :: mpi_dir
-        integer, intent(in) :: pbc_loc
+    integer, intent(in) :: mpi_dir
+    integer, intent(in) :: pbc_loc
 
 #ifdef MFC_MPI
-        integer :: ierr  !< Generic flag used to identify and report MPI errors
+    integer :: ierr  !< Generic flag used to identify and report MPI errors
 
-        if (mpi_dir == 1) then
-            if (pbc_loc == -1) then  ! PBC at the beginning
+    if (mpi_dir == 1) then
+        if (pbc_loc == -1) then  ! PBC at the beginning
 
-                if (bc_x%end >= 0) then  ! PBC at the beginning and end
-                    call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, &
-                                      & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the beginning only
-                    call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
-            else  ! PBC at the end
-                if (bc_x%beg >= 0) then  ! PBC at the end and beginning
-                    call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the end only
-                    call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, &
-                                      & bc_x%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
+            if (bc_x%end >= 0) then  ! PBC at the beginning and end
+                call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, &
+                                  & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the beginning only
+                call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
             end if
-        else if (mpi_dir == 2) then
-            if (pbc_loc == -1) then  ! PBC at the beginning
-
-                if (bc_y%end >= 0) then  ! PBC at the beginning and end
-                    call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, &
-                                      & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the beginning only
-                    call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
-            else  ! PBC at the end
-                if (bc_y%beg >= 0) then  ! PBC at the end and beginning
-                    call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the end only
-                    call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, &
-                                      & bc_y%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
+        else  ! PBC at the end
+            if (bc_x%beg >= 0) then  ! PBC at the end and beginning
+                call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, MPI_COMM_WORLD, &
+                                  & MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the end only
+                call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
             end if
-        else
-            if (pbc_loc == -1) then  ! PBC at the beginning
-
-                if (bc_z%end >= 0) then  ! PBC at the beginning and end
-                    call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, &
-                                      & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the beginning only
-                    call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
-            else  ! PBC at the end
-                if (bc_z%beg >= 0) then  ! PBC at the end and beginning
-                    call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, &
-                                      & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                else  ! PBC at the end only
-                    call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, &
-                                      & bc_z%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                end if
+        end if
+    else if (mpi_dir == 2) then
+        if (pbc_loc == -1) then  ! PBC at the beginning
+
+            if (bc_y%end >= 0) then  ! PBC at the beginning and end
+                call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, &
+                                  & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the beginning only
+                call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            end if
+        else  ! PBC at the end
+            if (bc_y%beg >= 0) then  ! PBC at the end and beginning
+                call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, MPI_COMM_WORLD, &
+                                  & MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the end only
+                call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            end if
+        end if
+    else
+        if (pbc_loc == -1) then  ! PBC at the beginning
+
+            if (bc_z%end >= 0) then  ! PBC at the beginning and end
+                call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, &
+                                  & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the beginning only
+                call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            end if
+        else  ! PBC at the end
+            if (bc_z%beg >= 0) then  ! PBC at the end and beginning
+                call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, MPI_COMM_WORLD, &
+                                  & MPI_STATUS_IGNORE, ierr)
+            else  ! PBC at the end only
+                call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, &
+                                  & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
             end if
         end if
+    end if
 #endif
 
-    end subroutine s_mpi_sendrecv_grid_variables_buffers
+end subroutine s_mpi_sendrecv_grid_variables_buffers
 #endif
 
-    !> Module deallocation and/or disassociation procedures
-    impure subroutine s_finalize_mpi_common_module
+!> Module deallocation and/or disassociation procedures
+impure subroutine s_finalize_mpi_common_module
 
 #ifdef MFC_MPI
-        deallocate (buff_send, buff_recv)
+    deallocate (buff_send, buff_recv)
 #endif
 
-    end subroutine s_finalize_mpi_common_module
+end subroutine s_finalize_mpi_common_module
 
 end module m_mpi_common
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 7f9131550d..f2897c2c97 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -257,15 +257,15 @@ contains
         $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', parallelism='[seq]', cray_noinline=True)
 
         ! initializing variables
-        integer, intent(in)                                 :: j, k, l, MFL
-        real(wp), intent(out)                               :: pS
-        real(wp), dimension(1:), intent(out)                :: p_infpT
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp), intent(in)                                :: rhoe
-        real(wp), intent(out)                               :: TS
-        real(wp)                                            :: gp, gpp, hp, pO, mCP, mQ  !< variables for the Newton Solver
-        real(wp)                                            :: p_infpT_sum
-        integer                                             :: i, ns                     !< generic loop iterators
+        integer, intent(in)                          :: j, k, l, MFL
+        real(wp), intent(out)                        :: pS
+        real(wp), dimension(:), intent(out)          :: p_infpT
+        type(scalar_field), dimension(:), intent(in) :: q_cons_vf
+        real(wp), intent(in)                         :: rhoe
+        real(wp), intent(out)                        :: TS
+        real(wp)                                     :: gp, gpp, hp, pO, mCP, mQ  !< variables for the Newton Solver
+        real(wp)                                     :: p_infpT_sum
+        integer                                      :: i, ns                     !< generic loop iterators
         ! auxiliary variables for the pT-equilibrium solver
         mCP = 0.0_wp; mQ = 0.0_wp; p_infpT_sum = 0._wp
         $:GPU_LOOP(parallelism='[seq]')
@@ -351,16 +351,16 @@ contains
 
         $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', parallelism='[seq]', cray_noinline=True)
 
-        integer, intent(in)                                    :: j, k, l
-        real(wp), intent(inout)                                :: pS
-        real(wp), dimension(1:), intent(in)                    :: p_infpT
-        real(wp), intent(in)                                   :: rhoe
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        real(wp), intent(inout)                                :: TS
+        integer, intent(in)                             :: j, k, l
+        real(wp), intent(inout)                         :: pS
+        real(wp), dimension(1:), intent(in)             :: p_infpT
+        real(wp), intent(in)                            :: rhoe
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        real(wp), intent(inout)                         :: TS
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
         #:else
-            real(wp), dimension(num_fluids) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
+            real(wp), dimension(num_fluids_max) :: p_infpTg  !< stiffness for the participating fluids for pTg-equilibrium
         #:endif
         real(wp), dimension(2, 2) :: Jac, InvJac, TJac                  !< matrices for the Newton Solver
         real(wp), dimension(2)    :: R2D, DeltamP                       !< residual and correction array
@@ -525,10 +525,10 @@ contains
 
         !> @name variables for the correction of the reacting partial densities
         !> @{
-        real(wp), intent(out)                                  :: MCT
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        real(wp), intent(inout)                                :: rM
-        integer, intent(in)                                    :: j, k, l
+        real(wp), intent(out)                           :: MCT
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        real(wp), intent(inout)                         :: rM
+        integer, intent(in)                             :: j, k, l
         !> @}
         if (rM < 0.0_wp) then
             if ((q_cons_vf(lp + eqn_idx%cont%beg - 1)%sf(j, k, &
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 2417da1adf..8d546daf9f 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -91,12 +91,12 @@ contains
         real(wp), intent(in), optional  :: G, pres_mag
 
         ! Chemistry
-        real(wp), dimension(1:num_species), intent(in) :: rhoYks
-        real(wp), dimension(1:num_species)             :: Y_rs
-        real(wp)                                       :: E_e
-        real(wp)                                       :: e_Per_Kg, Pdyn_Per_Kg
-        real(wp)                                       :: T_guess
-        integer                                        :: s  !< Generic loop iterator
+        real(wp), dimension(:), intent(in) :: rhoYks
+        real(wp), dimension(1:num_species) :: Y_rs
+        real(wp)                           :: E_e
+        real(wp)                           :: e_Per_Kg, Pdyn_Per_Kg
+        real(wp)                           :: T_guess
+        integer                            :: s  !< Generic loop iterator
         #:if not chemistry
             ! Depending on model_eqns and bubbles_euler, the appropriate procedure for computing pressure is targeted by the
             ! procedure pointer
@@ -253,8 +253,8 @@ contains
             real(wp), dimension(3), intent(inout)        :: alpha_rho_K, alpha_K
             real(wp), optional, dimension(3), intent(in) :: G
         #:else
-            real(wp), dimension(num_fluids), intent(inout)        :: alpha_rho_K, alpha_K
-            real(wp), optional, dimension(num_fluids), intent(in) :: G
+            real(wp), dimension(:), intent(inout)        :: alpha_rho_K, alpha_K
+            real(wp), optional, dimension(:), intent(in) :: G
         #:endif
         real(wp), dimension(2), intent(out) :: Re_K
         real(wp), optional, intent(out)     :: G_K
@@ -1175,12 +1175,12 @@ contains
     subroutine s_compute_species_fraction(q_vf, k, l, r, alpha_rho_K, alpha_K)
 
         $:GPU_ROUTINE(function_name='s_compute_species_fraction', parallelism='[seq]', cray_noinline=True)
-        type(scalar_field), dimension(sys_size), intent(in) :: q_vf
-        integer, intent(in)                                 :: k, l, r
+        type(scalar_field), dimension(:), intent(in) :: q_vf
+        integer, intent(in)                          :: k, l, r
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K
         #:else
-            real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K
+            real(wp), dimension(:), intent(out) :: alpha_rho_K, alpha_K
         #:endif
         integer  :: i
         real(wp) :: alpha_K_sum
@@ -1256,7 +1256,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(in) :: adv
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: adv
+            real(wp), dimension(:), intent(in) :: adv
         #:endif
         real(wp), intent(in)  :: vel_sum
         real(wp), intent(in)  :: c_c
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 4de261864d..488a84c644 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -129,7 +129,7 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf  !< Primitive variables
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: myalpha, myalpha_rho
         #:else
             real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index a15156405b..200a007754 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -148,7 +148,7 @@ contains
         real(wp)                                               :: pb_local, mv_local, vflux, pbdot
         real(wp)                                               :: n_tait, B_tait
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: Rtmp, Vtmp
             real(wp), dimension(3) :: myalpha, myalpha_rho
         #:else
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index b44eb617b1..c9773ee9f6 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -699,15 +699,15 @@ contains
 
         $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(in)                                :: pinf, rhol, gamma, pi_inf
-        integer, dimension(3), intent(in)                   :: cell
-        real(wp), intent(out)                               :: cson
-        real(wp)                                            :: E, H
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(in)                         :: pinf, rhol, gamma, pi_inf
+        integer, dimension(3), intent(in)            :: cell
+        real(wp), intent(out)                        :: cson
+        real(wp)                                     :: E, H
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: vel
         #:else
-            real(wp), dimension(num_dims) :: vel
+            real(wp), dimension(3) :: vel
         #:endif
         integer :: i
 
@@ -765,19 +765,19 @@ contains
 
         $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', cray_inline=True)
 
-        integer, intent(in)                                 :: bub_id, ptype
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(out)                               :: f_pinfl
-        integer, dimension(3), intent(out)                  :: cell
-        real(wp), intent(out), optional                     :: preterm1, term2, Romega
-        real(wp), dimension(3)                              :: scoord, psi
-        real(wp)                                            :: dc, vol, aux
-        real(wp)                                            :: volgas, term1, Rbeq, denom
-        real(wp)                                            :: charvol, charpres, charvol2, charpres2
-        integer, dimension(3)                               :: cellaux
-        integer                                             :: i, j, k
-        integer                                             :: smearGrid, smearGridz
-        logical                                             :: celloutside
+        integer, intent(in)                          :: bub_id, ptype
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(out)                        :: f_pinfl
+        integer, dimension(3), intent(out)           :: cell
+        real(wp), intent(out), optional              :: preterm1, term2, Romega
+        real(wp), dimension(3)                       :: scoord, psi
+        real(wp)                                     :: dc, vol, aux
+        real(wp)                                     :: volgas, term1, Rbeq, denom
+        real(wp)                                     :: charvol, charpres, charvol2, charpres2
+        integer, dimension(3)                        :: cellaux
+        integer                                      :: i, j, k
+        integer                                      :: smearGrid, smearGridz
+        logical                                      :: celloutside
 
         scoord = mtn_s(bub_id,1:3,2)
         f_pinfl = 0._wp
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 6f1b041d49..bc8d7afe2d 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -473,12 +473,12 @@ contains
         real(wp)                                               :: dqv_dt
         real(wp)                                               :: dpres_ds
 
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20) :: L
         #:else
             real(wp), dimension(sys_size) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)  :: alpha_rho, dalpha_rho_ds, mf
             real(wp), dimension(3)  :: vel, dvel_ds
             real(wp), dimension(3)  :: adv_local, dadv_ds
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index c2c415b1d3..750cbf74b6 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -23,10 +23,10 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), intent(in)               :: rho, c, dpres_ds
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp) :: L1
         L1 = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
@@ -37,15 +37,15 @@ contains
     subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2, c
         real(wp), intent(in) :: dpres_ds
@@ -62,15 +62,15 @@ contains
     subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer              :: i
@@ -86,15 +86,15 @@ contains
     subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer              :: i
@@ -110,15 +110,15 @@ contains
     subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer              :: i
@@ -138,15 +138,15 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_slip_wall_L',parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
 
@@ -162,21 +162,21 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_buffer_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in)  :: dvel_ds
             real(wp), dimension(3), intent(in)  :: dadv_ds
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in)  :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)    :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in)  :: dadv_ds
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -202,15 +202,15 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_inflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: dvel_ds
         #:else
-            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
         #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
 
@@ -226,21 +226,21 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in)  :: dvel_ds
             real(wp), dimension(3), intent(in)  :: dadv_ds
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in)  :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)    :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in)  :: dadv_ds
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -260,19 +260,19 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_force_free_subsonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in) :: dvel_ds
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)   :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -291,19 +291,19 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_constant_pressure_subsonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in) :: dvel_ds
             real(wp), dimension(3), intent(in) :: dadv_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)   :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
@@ -320,10 +320,10 @@ contains
     subroutine s_compute_supersonic_inflow_L(L)
 
         $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', parallelism='[seq]', cray_inline=True)
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
         L(1:eqn_idx%adv%end) = 0._wp
         if (chemistry) L(eqn_idx%species%beg:eqn_idx%species%end) = 0._wp
@@ -336,21 +336,21 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_supersonic_outflow_L', parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(20), intent(inout) :: L
         #:else
-            real(wp), dimension(sys_size), intent(inout) :: L
+            real(wp), dimension(:), intent(inout) :: L
         #:endif
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3), intent(in)  :: mf, dalpha_rho_ds
             real(wp), dimension(3), intent(in)  :: dvel_ds
             real(wp), dimension(3), intent(in)  :: dadv_ds
             real(wp), dimension(10), intent(in) :: dYs_ds
         #:else
-            real(wp), dimension(num_fluids), intent(in)  :: mf, dalpha_rho_ds
-            real(wp), dimension(num_dims), intent(in)    :: dvel_ds
-            real(wp), dimension(num_fluids), intent(in)  :: dadv_ds
-            real(wp), dimension(num_species), intent(in) :: dYs_ds
+            real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(:), intent(in) :: dvel_ds
+            real(wp), dimension(:), intent(in) :: dadv_ds
+            real(wp), dimension(:), intent(in) :: dYs_ds
         #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 9e7519790c..cef5e95e91 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -20,53 +20,100 @@ module m_compute_levelset
 
 contains
 
+    !> 3x3 matrix-vector multiply; replaces the matmul() intrinsic which triggers an ifx 2025.1.1 SPIR-V ICE (#5633) when inlined
+    !! into a target teams loop kernel via declare target.
+    pure function f_mv3(M, v) result(w)
+
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        real(wp), intent(in) :: M(3, 3), v(3)
+        real(wp)             :: w(3)
+        w(1) = M(1, 1)*v(1) + M(1, 2)*v(2) + M(1, 3)*v(3)
+        w(2) = M(2, 1)*v(1) + M(2, 2)*v(2) + M(2, 3)*v(3)
+        w(3) = M(3, 1)*v(1) + M(3, 2)*v(2) + M(3, 3)*v(3)
+
+    end function f_mv3
+
     !> Dispatch level-set distance and normal computations for all ghost points based on patch geometry type
     impure subroutine s_apply_levelset(gps, num_gps)
 
         type(ghost_point), dimension(:), intent(inout) :: gps
         integer, intent(in)                            :: num_gps
-        integer                                        :: i, patch_id, patch_geometry
+        integer                                        :: i, patch_id
 
-        !  3D Patch Geometries
+        ! One GPU loop per geometry type so each kernel calls exactly one
+        ! declare-target routine. A single if-else dispatch over multiple
+        ! declare-target callees triggers an LLVM phi-node dominance error
+        ! in ifx SPIR64 codegen; splitting into separate loops avoids it.
 
         if (p > 0) then
-            $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
-                patch_geometry = patch_ib(patch_id)%geometry
-
-                if (patch_geometry == 8) then
-                    call s_sphere_levelset(gps(i))
-                else if (patch_geometry == 9) then
-                    call s_cuboid_levelset(gps(i))
-                else if (patch_geometry == 10) then
-                    call s_cylinder_levelset(gps(i))
-                else if (patch_geometry == 11) then
-                    call s_3d_airfoil_levelset(gps(i))
-                else if (patch_geometry == 12) then
-                    call s_model_levelset(gps(i))
-                end if
+                if (patch_ib(patch_id)%geometry == 8) call s_sphere_levelset(gps(i))
             end do
             $:END_GPU_PARALLEL_LOOP()
 
-            ! 2D Patch Geometries
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 9) call s_cuboid_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 10) call s_cylinder_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 11) call s_3d_airfoil_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 12) call s_model_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
         else if (n > 0) then
-            $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
             do i = 1, num_gps
                 patch_id = gps(i)%ib_patch_id
-                patch_geometry = patch_ib(patch_id)%geometry
-
-                if (patch_geometry == 2) then
-                    call s_circle_levelset(gps(i))
-                else if (patch_geometry == 3) then
-                    call s_rectangle_levelset(gps(i))
-                else if (patch_geometry == 4) then
-                    call s_airfoil_levelset(gps(i))
-                else if (patch_geometry == 5) then
-                    call s_model_levelset(gps(i))
-                else if (patch_geometry == 6) then
-                    call s_ellipse_levelset(gps(i))
-                end if
+                if (patch_ib(patch_id)%geometry == 2) call s_circle_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 3) call s_rectangle_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 4) call s_airfoil_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 5) call s_model_levelset(gps(i))
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]')
+            do i = 1, num_gps
+                patch_id = gps(i)%ib_patch_id
+                if (patch_ib(patch_id)%geometry == 6) call s_ellipse_levelset(gps(i))
             end do
             $:END_GPU_PARALLEL_LOOP()
         end if
@@ -126,7 +173,7 @@ contains
         offset(:) = patch_ib(ib_patch_id)%centroid_offset(:)
 
         xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]  ! get coordinate frame centered on IB
-        xy_local = matmul(inverse_rotation, xy_local)  ! rotate the frame into the IB's coordinate
+        xy_local = f_mv3(inverse_rotation, xy_local)  ! rotate the frame into the IB's coordinate
         xy_local = xy_local - offset  ! airfoils are a patch that require a centroid offset
 
         if (xy_local(2) >= 0._wp) then
@@ -176,7 +223,7 @@ contains
         if (f_approx_equal(dist, 0._wp)) then
             gp%levelset_norm = 0._wp
         else
-            gp%levelset_norm = matmul(rotation, dist_vec(:))/dist  ! convert the normal vector back to global grid coordinates
+            gp%levelset_norm = f_mv3(rotation, dist_vec(:))/dist  ! convert the normal vector back to global grid coordinates
         end if
 
     end subroutine s_airfoil_levelset
@@ -211,7 +258,7 @@ contains
         z_min = -lz/2
 
         xyz_local = [x_cc(i), y_cc(j), z_cc(l)] - center
-        xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
+        xyz_local = f_mv3(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
         xyz_local = xyz_local - offset  ! airfoils are a patch that require a centroid offset
 
         if (xyz_local(2) >= 0._wp) then
@@ -266,13 +313,13 @@ contains
             else
                 normal(3) = 1._wp
             end if
-            gp%levelset_norm = matmul(rotation, normal)
+            gp%levelset_norm = f_mv3(rotation, normal)
         else
             gp%levelset = dist_surf
             if (f_approx_equal(dist_surf, 0._wp)) then
                 gp%levelset_norm = 0._wp
             else
-                gp%levelset_norm = matmul(rotation, dist_vec(:)/dist_surf)
+                gp%levelset_norm = f_mv3(rotation, dist_vec(:)/dist_surf)
             end if
         end if
 
@@ -312,7 +359,7 @@ contains
 
         ! convert grid to local coordinates
         xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
-        xy_local = matmul(inverse_rotation, xy_local)
+        xy_local = f_mv3(inverse_rotation, xy_local)
 
         side_dists(1) = bottom_left(1) - xy_local(1)
         side_dists(2) = top_right(1) - xy_local(1)
@@ -339,7 +386,7 @@ contains
                 dist_vec(2) = side_dists(idx)/abs(side_dists(idx))
             end if
             ! convert the normal vector back into the global coordinate system
-            gp%levelset_norm = matmul(rotation, dist_vec)
+            gp%levelset_norm = f_mv3(rotation, dist_vec)
         else
             gp%levelset_norm = 0._wp
         end if
@@ -375,13 +422,13 @@ contains
         ellipse_coeffs(2) = 0.5_wp*length_y
 
         xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
-        xy_local = matmul(inverse_rotation, xy_local)
+        xy_local = f_mv3(inverse_rotation, xy_local)
 
         normal_vector = xy_local
         ! get the normal direction via the coordinate transformation method
         normal_vector(2) = normal_vector(2)*(ellipse_coeffs(1)/ellipse_coeffs(2))**2._wp
         normal_vector = normal_vector/sqrt(dot_product(normal_vector, normal_vector))  ! normalize the vector
-        gp%levelset_norm = matmul(rotation, normal_vector)  ! save after rotating the vector to the global frame
+        gp%levelset_norm = f_mv3(rotation, normal_vector)  ! save after rotating the vector to the global frame
 
         ! use the normal vector to set up the quadratic equation for the levelset, using A, B, and C in indices 1, 2, and 3
         quadratic_coeffs(1) = (normal_vector(1)/ellipse_coeffs(1))**2 + (normal_vector(2)/ellipse_coeffs(2))**2
@@ -434,7 +481,7 @@ contains
         Back = -length_z/2
 
         xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center  ! get coordinate frame centered on IB
-        xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinate
+        xyz_local = f_mv3(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinate
 
         dist_left = Left - xyz_local(1)
         dist_right = xyz_local(1) - Right
@@ -478,7 +525,7 @@ contains
             end if
         end if
 
-        gp%levelset_norm = matmul(rotation, dist_vec)
+        gp%levelset_norm = f_mv3(rotation, dist_vec)
 
     end subroutine s_cuboid_levelset
 
@@ -567,7 +614,7 @@ contains
         end if
 
         xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center  ! get coordinate frame centered on IB
-        xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
+        xyz_local = f_mv3(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
 
         ! get distance to flat edge of cylinder
         side_pos = dot_product(xyz_local, dist_sides_vec)
@@ -579,15 +626,15 @@ contains
             ! if the closest edge is flat
             gp%levelset = -dist_side
             if (f_approx_equal(dist_side, abs(side_pos - boundary(1)))) then
-                gp%levelset_norm = matmul(rotation, -dist_sides_vec)
+                gp%levelset_norm = f_mv3(rotation, -dist_sides_vec)
             else
-                gp%levelset_norm = matmul(rotation, dist_sides_vec)
+                gp%levelset_norm = f_mv3(rotation, dist_sides_vec)
             end if
         else
             gp%levelset = dist_surface
             xyz_local = xyz_local*dist_surface_vec
             xyz_local = xyz_local/max(norm2(xyz_local), sgm_eps)
-            gp%levelset_norm = matmul(rotation, xyz_local)
+            gp%levelset_norm = f_mv3(rotation, xyz_local)
         end if
 
     end subroutine s_cylinder_levelset
@@ -630,7 +677,7 @@ contains
         if (p > 0) then
             xyz_local(3) = z_cc(k) - center(3)
         end if
-        xyz_local = matmul(inverse_rotation, xyz_local)
+        xyz_local = f_mv3(inverse_rotation, xyz_local)
 
         ! 3D models
         if (p > 0) then
@@ -642,12 +689,12 @@ contains
             gp%levelset = -abs(gp%levelset)
 
             ! Assign the levelset_norm
-            gp%levelset_norm = matmul(rotation, normals(1:3))
+            gp%levelset_norm = f_mv3(rotation, normals(1:3))
         else
             ! 2D models
             call s_distance_normals_2D(patch_id, boundary_edge_count, xyz_local, normals, distance)
             gp%levelset = -abs(distance)
-            gp%levelset_norm = matmul(rotation, normals(1:3))
+            gp%levelset_norm = f_mv3(rotation, normals(1:3))
         end if
 
     end subroutine s_model_levelset
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 8ffcba6101..169af242b4 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -166,12 +166,12 @@ contains
         integer, intent(in)                                 :: t_step
         real(wp)                                            :: rho  !< Cell-avg. density
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha  !< Cell-avg. volume fraction
             real(wp), dimension(3) :: vel    !< Cell-avg. velocity
         #:else
-            real(wp), dimension(num_fluids) :: alpha  !< Cell-avg. volume fraction
-            real(wp), dimension(num_vels)   :: vel    !< Cell-avg. velocity
+            real(wp), dimension(num_fluids_max) :: alpha  !< Cell-avg. volume fraction
+            real(wp), dimension(3)              :: vel    !< Cell-avg. velocity
         #:endif
         real(wp)               :: vel_sum  !< Cell-avg. velocity sum
         real(wp)               :: pres     !< Cell-avg. pressure
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 2ee806d928..59fb741aef 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -4,7 +4,7 @@
 
 #:include 'macros.fpp'
 
-!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT) for azimuthal Fourier filtering in cylindrical geometries
+!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT/oneMKL) for azimuthal Fourier filtering in cylindrical geometries
 module m_fftw
 
     use, intrinsic :: iso_c_binding
@@ -12,7 +12,14 @@ module m_fftw
     use m_derived_types
     use m_global_parameters
     use m_mpi_proxy
-#if defined(MFC_GPU) && defined(__PGI)
+    ! GPU FFT backend selection:
+    !   cuFFT  - NVHPC/PGI (OpenACC or OpenMP target)
+    !   hipFFT - Cray/AMD (OpenMP target)
+    !   oneMKL - Intel ifx (OpenMP target + dispatch construct)
+    !   FFTW   - CPU-only builds
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+    use mkl_dfti_omp_offload
+#elif defined(MFC_GPU) && defined(__PGI)
     use cufft
 #elif defined(MFC_GPU)
     use hipfort
@@ -34,7 +41,18 @@ module m_fftw
     real(c_double), pointer            :: data_real(:)        !< Real data
     complex(c_double_complex), pointer :: data_cmplx(:)       !< Complex data in Fourier space
     complex(c_double_complex), pointer :: data_fltr_cmplx(:)  !< Filtered complex data in Fourier space
-#if defined(MFC_GPU)
+
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+    $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
+
+    real(dp), allocatable, target    :: data_real_gpu(:)
+    complex(dp), allocatable, target :: data_cmplx_gpu(:)
+    complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:)
+    $:GPU_DECLARE(create='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+
+    type(DFTI_DESCRIPTOR), pointer :: fwd_plan_mkl => null()
+    type(DFTI_DESCRIPTOR), pointer :: bwd_plan_mkl => null()
+#elif defined(MFC_GPU)
     $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
 
     real(dp), allocatable, target    :: data_real_gpu(:)
@@ -62,16 +80,39 @@ contains
     impure subroutine s_initialize_fftw_module
 
         integer :: ierr  !< Generic flag used to identify and report GPU errors
-        ! Size of input array going into DFT
 
         real_size = p + 1
-        ! Size of output array coming out of DFT
         cmplx_size = (p + 1)/2 + 1
 
         x_size = m + 1
         batch_size = x_size*sys_size
 
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
+        $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
+
+        @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
+        @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+        @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+
+        ! Forward R2C descriptor: batch of real_size transforms
+        ierr = DftiCreateDescriptor(fwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_INPUT_DISTANCE, real_size)
+        ierr = DftiSetValue(fwd_plan_mkl, DFTI_OUTPUT_DISTANCE, cmplx_size)
+        ierr = DftiCommitDescriptor(fwd_plan_mkl)
+
+        ! Backward C2R descriptor
+        ierr = DftiCreateDescriptor(bwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_INPUT_DISTANCE, cmplx_size)
+        ierr = DftiSetValue(bwd_plan_mkl, DFTI_OUTPUT_DISTANCE, real_size)
+        ierr = DftiCommitDescriptor(bwd_plan_mkl)
+#elif defined(MFC_GPU)
         rank = 1; istride = 1; ostride = 1
         allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
 
@@ -80,22 +121,7 @@ contains
         oembed(1) = cmplx_size
         $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
         $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
-#else
-        ! Allocate input and output DFT data sizes
-        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
-        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        ! Associate input and output data pointers with allocated memory
-        call c_f_pointer(fftw_real_data, data_real, [real_size])
-        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
-        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
 
-        ! Generate plans for forward and backward DFTs
-        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
-        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
-#endif
-
-#if defined(MFC_GPU)
         @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
         @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
         @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
@@ -111,6 +137,19 @@ contains
         ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, &
                               & HIPFFT_Z2D, batch_size)
 #endif
+#else
+        ! Allocate input and output DFT data sizes
+        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
+        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        ! Associate input and output data pointers with allocated memory
+        call c_f_pointer(fftw_real_data, data_real, [real_size])
+        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
+        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
+
+        ! Generate plans for forward and backward DFTs
+        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
+        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
 #endif
 
     end subroutine s_initialize_fftw_module
@@ -124,7 +163,116 @@ contains
         ! Restrict filter to processors that have cells adjacent to axis
 
         if (bc_y%beg >= 0) return
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, cmplx_size
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_MKL_DISPATCH()
+        ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
+
+        Nfq = 3
+        $:GPU_UPDATE(device='[Nfq]')
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, Nfq
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1) &
+                                        & *cmplx_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_MKL_DISPATCH()
+        ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu)
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1) &
+                                  & *real_size*x_size)/real(real_size, dp)
+                    q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        do i = 1, fourier_rings
+            i2 = i
+            $:GPU_UPDATE(device='[i2]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, cmplx_size
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i2, l)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_MKL_DISPATCH()
+            ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu)
+
+            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            $:GPU_UPDATE(device='[Nfq]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, Nfq
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k &
+                                            & - 1)*cmplx_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_MKL_DISPATCH()
+            ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu)
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k &
+                                      & - 1)*real_size*x_size)/real(real_size, dp)
+                        q_cons_vf(k)%sf(j, i2, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end do
+#elif defined(MFC_GPU)
         $:GPU_PARALLEL_LOOP(collapse=3)
         do k = 1, sys_size
             do j = 0, m
@@ -292,7 +440,13 @@ contains
     !> Finalize the FFTW module
     impure subroutine s_finalize_fftw_module
 
-#if defined(MFC_GPU)
+#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER)
+        integer :: ierr  !< Generic flag used to identify and report GPU errors
+
+        @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
+        ierr = DftiFreeDescriptor(fwd_plan_mkl)
+        ierr = DftiFreeDescriptor(bwd_plan_mkl)
+#elif defined(MFC_GPU)
         integer :: ierr  !< Generic flag used to identify and report GPU errors
 
         @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 90e78d04df..85eb77cbb4 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -77,13 +77,13 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
 
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(10) :: tensora, tensorb
         #:else
             real(wp), dimension(tensor_size) :: tensora, tensorb
         #:endif
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: alpha_k, alpha_rho_k
         #:else
             real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k
@@ -194,13 +194,13 @@ contains
     subroutine s_neoHookean_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        type(scalar_field), dimension(b_size), intent(inout)   :: btensor_in
-        real(wp), intent(in)                                   :: G_param
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: trace
-        real(wp), parameter                                    :: f13 = 1._wp/3._wp
-        integer                                                :: i  !< Generic loop iterators
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout) :: btensor_in
+        real(wp), intent(in)                            :: G_param
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: trace
+        real(wp), parameter                             :: f13 = 1._wp/3._wp
+        integer                                         :: i  !< Generic loop iterators
         ! tensor is the symmetric tensor & calculate the trace of the tensor
         trace = btensor_in(1)%sf(j, k, l) + btensor_in(3)%sf(j, k, l) + btensor_in(6)%sf(j, k, l)
 
@@ -222,13 +222,13 @@ contains
     subroutine s_Mooney_Rivlin_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        type(scalar_field), dimension(b_size), intent(inout)   :: btensor_in
-        real(wp), intent(in)                                   :: G_param
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: trace
-        real(wp), parameter                                    :: f13 = 1._wp/3._wp
-        integer                                                :: i  !< Generic loop iterators
+        type(scalar_field), dimension(:), intent(inout) :: q_prim_vf
+        type(scalar_field), dimension(:), intent(inout) :: btensor_in
+        real(wp), intent(in)                            :: G_param
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: trace
+        real(wp), parameter                             :: f13 = 1._wp/3._wp
+        integer                                         :: i  !< Generic loop iterators
         ! TODO: Make 1D and 2D capable
         trace = btensor_in(1)%sf(j, k, l) + btensor_in(3)%sf(j, k, l) + btensor_in(6)%sf(j, k, l)
 
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index b21483e097..4fba39e80e 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -136,7 +136,7 @@ contains
         integer, intent(in)                :: xp, yp            !< integers containing the periodicity projection information
         real(wp)                           :: f, ca_in, pa, ma, ta
         real(wp)                           :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer                            :: i, j, k, il, ir, jl, jr
+        integer                            :: i, j, k, kk, il, ir, jl, jr
         integer                            :: Np1, Np2
         integer                            :: encoded_patch_id
         real(wp), dimension(1:3)           :: xy_local, offset  !< x and y coordinates in local IB frame
@@ -227,8 +227,8 @@ contains
         call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir)
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
 
-        $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, ma, &
-                            & ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2)
+        $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, &
+                            & ma, ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2)
         do j = jl, jr
             do i = il, ir
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]  ! get coordinate frame centered on IB
@@ -246,8 +246,8 @@ contains
                     end if
                     if (xy_local(2) >= 0._wp) then
                         k = 1
-                        do while (airfoil_grid_u(k)%x < xy_local(1) .and. k <= Np)
-                            k = k + 1
+                        do kk = 1, Np - 1
+                            if (airfoil_grid_u(kk)%x < xy_local(1)) k = kk + 1
                         end do
                         if (f_approx_equal(airfoil_grid_u(k)%x, xy_local(1))) then
                             if (xy_local(2) <= airfoil_grid_u(k)%y) then
@@ -261,8 +261,8 @@ contains
                         end if
                     else
                         k = 1
-                        do while (airfoil_grid_l(k)%x < xy_local(1))
-                            k = k + 1
+                        do kk = 1, Np - 1
+                            if (airfoil_grid_l(kk)%x < xy_local(1)) k = kk + 1
                         end do
                         if (f_approx_equal(airfoil_grid_l(k)%x, xy_local(1))) then
                             if (xy_local(2) >= airfoil_grid_l(k)%y) then
@@ -290,7 +290,7 @@ contains
         type(integer_field), intent(inout) :: ib_markers
         integer, intent(in) :: xp, yp, zp                      !< integers containing the periodicity projection information
         real(wp) :: lz, z_max, z_min, f, ca_in, pa, ma, ta, xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer :: i, j, k, l, il, ir, jl, jr, ll, lr
+        integer :: i, j, k, kk, l, il, ir, jl, jr, ll, lr
         integer :: Np1, Np2
         integer :: encoded_patch_id
         real(wp), dimension(1:3) :: xyz_local, center, offset  !< x, y, z coordinates in local IB frame
@@ -385,8 +385,8 @@ contains
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
         call get_bounding_indices(center(3) - ca_in, center(3) + ca_in, z_cc, ll, lr)
 
-        $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, &
-                            & ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3)
+        $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, &
+                            & offset, ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3)
         do l = ll, lr
             do j = jl, jr
                 do i = il, ir
@@ -399,8 +399,8 @@ contains
                         if (xyz_local(1) >= 0._wp .and. xyz_local(1) <= ca_in) then
                             if (xyz_local(2) >= 0._wp) then
                                 k = 1
-                                do while (airfoil_grid_u(k)%x < xyz_local(1))
-                                    k = k + 1
+                                do kk = 1, Np - 1
+                                    if (airfoil_grid_u(kk)%x < xyz_local(1)) k = kk + 1
                                 end do
                                 if (f_approx_equal(airfoil_grid_u(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) <= airfoil_grid_u(k)%y) then
@@ -415,8 +415,8 @@ contains
                                 end if
                             else
                                 k = 1
-                                do while (airfoil_grid_l(k)%x < xyz_local(1))
-                                    k = k + 1
+                                do kk = 1, Np - 1
+                                    if (airfoil_grid_l(kk)%x < xyz_local(1)) k = kk + 1
                                 end do
                                 if (f_approx_equal(airfoil_grid_l(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) >= airfoil_grid_l(k)%y) then
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 39ec6b1470..0ac42bfff5 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -721,7 +721,7 @@ contains
 
         & nmom_IP, pb_in, mv_in, presb_IP, massv_IP)
         $:GPU_ROUTINE(parallelism='[seq]')
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf  !< Primitive Variables
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf  !< Primitive Variables
         real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(in) :: pb_in, mv_in
         type(ghost_point), intent(in) :: gp
         real(wp), intent(inout) :: pres_IP
@@ -730,7 +730,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3), intent(inout) :: alpha_IP, alpha_rho_IP
         #:else
-            real(wp), dimension(num_fluids), intent(inout) :: alpha_IP, alpha_rho_IP
+            real(wp), dimension(:), intent(inout) :: alpha_IP, alpha_rho_IP
         #:endif
         real(wp), optional, dimension(:), intent(inout) :: r_IP, v_IP, pb_IP, mv_IP
         real(wp), optional, dimension(:), intent(inout) :: nmom_IP
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 5f1fa9d73b..f326d308bd 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -2640,11 +2640,11 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        real(wp), intent(in)                      :: E_L, gamma_L, pi_inf_L, rho_L
-        real(wp), intent(in)                      :: E_R, gamma_R, pi_inf_R, rho_R
-        real(wp), dimension(num_dims), intent(in) :: vel_L, vel_R
-        real(wp), intent(out)                     :: pres_L, pres_R, cfl
-        real(wp)                                  :: a_L, a_R
+        real(wp), intent(in)               :: E_L, gamma_L, pi_inf_L, rho_L
+        real(wp), intent(in)               :: E_R, gamma_R, pi_inf_R, rho_R
+        real(wp), dimension(:), intent(in) :: vel_L, vel_R
+        real(wp), intent(out)              :: pres_L, pres_R, cfl
+        real(wp)                           :: a_L, a_R
 
         if (num_dims == 2) then
             pres_L = (E_L - pi_inf_L - 0.5_wp*rho_L*(vel_L(1)**2._wp + vel_L(2)**2._wp))/gamma_L
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index 8ca939de7d..4cb3e847d6 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -51,8 +51,8 @@ contains
     !> The main pressure relaxation procedure
     subroutine s_pressure_relaxation_procedure(q_cons_vf)
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer                                                :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer                                         :: j, k, l
 
         $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
         do l = 0, p
@@ -71,8 +71,8 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
 
         ! Volume fraction correction
         if (mpp_lim) call s_correct_volume_fractions(q_cons_vf, j, k, l)
@@ -92,9 +92,9 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        integer, intent(in)                                 :: j, k, l
-        integer                                             :: i
+        type(scalar_field), dimension(:), intent(in) :: q_cons_vf
+        integer, intent(in)                          :: j, k, l
+        integer                                      :: i
 
         s_needs_pressure_relaxation = .true.
         $:GPU_LOOP(parallelism='[seq]')
@@ -111,10 +111,10 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: sum_alpha
-        integer                                                :: i
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: sum_alpha
+        integer                                         :: i
 
         sum_alpha = 0._wp
         $:GPU_LOOP(parallelism='[seq]')
@@ -141,13 +141,13 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
-        real(wp)                                               :: pres_relax, f_pres, df_pres
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
+        real(wp)                                        :: pres_relax, f_pres, df_pres
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: pres_K_init, rho_K_s
         #:else
-            real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+            real(wp), dimension(num_fluids_max) :: pres_K_init, rho_K_s
         #:endif
         integer, parameter :: MAX_ITER = 50
         ! Pressure relaxation convergence tolerance
@@ -214,12 +214,12 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in)                                    :: j, k, l
+        type(scalar_field), dimension(:), intent(inout) :: q_cons_vf
+        integer, intent(in)                             :: j, k, l
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: alpha_rho, alpha
         #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, alpha
+            real(wp), dimension(num_fluids_max) :: alpha_rho, alpha
         #:endif
         real(wp)               :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
         real(wp), dimension(2) :: Re
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 8465d4d349..127c47ccfb 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -587,7 +587,7 @@ contains
         $:GPU_ROUTINE(function_name='s_coeff_nonpoly',parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(32,0:2,0:2), intent(out) :: coeffs
         #:else
             real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeffs
@@ -666,7 +666,7 @@ contains
         $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(32,0:2,0:2), intent(out) :: coeffs
         #:else
             real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeffs
@@ -741,14 +741,14 @@ contains
         real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout)  :: rhs_mv
         type(int_bounds_info), intent(in)                                                          :: ix, iy, iz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(6)    :: moms, msum
             real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
         #:else
             real(wp), dimension(nmom)      :: moms, msum
             real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
         #:endif
-        #:if USING_AMD
+        #:if (USING_AMD or USING_INTEL)
             real(wp), dimension(32,0:2,0:2) :: coeff
         #:else
             real(wp), dimension(nterms,0:2,0:2) :: coeff
@@ -930,7 +930,7 @@ contains
 
             $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', cray_inline=True)
             real(wp), intent(in) :: pres, rho, c
-            #:if USING_AMD
+            #:if (USING_AMD or USING_INTEL)
                 real(wp), dimension(32,0:2,0:2), intent(out) :: coeff
             #:else
                 real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeff
@@ -949,8 +949,8 @@ contains
 
             $:GPU_ROUTINE(function_name='s_chyqmom',parallelism='[seq]', cray_inline=True)
 
-            real(wp), dimension(nmom), intent(in)     :: momin
-            real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY
+            real(wp), dimension(nmom), intent(in) :: momin
+            real(wp), dimension(:), intent(inout) :: wght, abscX, abscY
 
             ! Local variables
             real(wp), dimension(0:2,0:2) :: moms
@@ -1027,10 +1027,10 @@ contains
         function f_quad(abscX, abscY, wght_in, q, r, s)
 
             $:GPU_ROUTINE(parallelism='[seq]')
-            #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
                 real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in
             #:else
-                real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
+                real(wp), dimension(:,:), intent(in) :: abscX, abscY, wght_in
             #:endif
             real(wp), intent(in) :: q, r, s
             real(wp)             :: f_quad_RV, f_quad
@@ -1053,10 +1053,10 @@ contains
         function f_quad2D(abscX, abscY, wght_in, pow)
 
             $:GPU_ROUTINE(parallelism='[seq]')
-            #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
                 real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in
             #:else
-                real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
+                real(wp), dimension(:), intent(in) :: abscX, abscY, wght_in
             #:endif
             real(wp), dimension(3), intent(in) :: pow
             real(wp)                           :: f_quad2D
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 0a8bc32212..8ece8fa0ff 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -4389,8 +4389,8 @@ contains
             real(wp), dimension(3, 3), intent(in)  :: vel_grad_avg
             real(wp), dimension(3, 3), intent(out) :: tau_shear_out
         #:else
-            real(wp), dimension(num_dims, num_dims), intent(in)  :: vel_grad_avg
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
+            real(wp), dimension(:,:), intent(in)  :: vel_grad_avg
+            real(wp), dimension(:,:), intent(out) :: tau_shear_out
         #:endif
         real(wp), intent(in) :: Re_shear
         real(wp), intent(in) :: divergence_v
@@ -4422,7 +4422,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
         #:else
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
+            real(wp), dimension(:,:), intent(out) :: tau_bulk_out
         #:endif
 
         ! Local variables
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 4a0978919e..8392fdc6bc 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -45,11 +45,11 @@ contains
     function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), dimension(num_vels), intent(in) :: vel
-        real(wp), intent(in)                      :: c
-        integer, intent(in)                       :: j, k, l
-        real(wp)                                  :: cfl_terms
-        real(wp)                                  :: fltr_dtheta
+        real(wp), dimension(:), intent(in) :: vel
+        real(wp), intent(in)               :: c
+        integer, intent(in)                :: j, k, l
+        real(wp)                           :: cfl_terms
+        real(wp)                           :: fltr_dtheta
 
         fltr_dtheta = f_compute_filtered_dtheta(k, l)
 
@@ -74,13 +74,13 @@ contains
 
         $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', cray_inline=True)
 
-        type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
+        type(scalar_field), intent(in), dimension(:) :: q_prim_vf
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), intent(inout), dimension(3) :: alpha
             real(wp), intent(inout), dimension(3) :: vel
         #:else
-            real(wp), intent(inout), dimension(num_fluids) :: alpha
-            real(wp), intent(inout), dimension(num_vels)   :: vel
+            real(wp), intent(inout), dimension(:) :: alpha
+            real(wp), intent(inout), dimension(:) :: vel
         #:endif
         real(wp), intent(inout)               :: rho, gamma, pi_inf, vel_sum, H, pres
         real(wp), intent(out)                 :: qv
@@ -89,7 +89,7 @@ contains
         #:if not MFC_CASE_OPTIMIZATION and USING_AMD
             real(wp), dimension(3) :: alpha_rho, Gs
         #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, Gs
+            real(wp), dimension(num_fluids_max) :: alpha_rho, Gs
         #:endif
         real(wp) :: E, G_local
         integer  :: i
@@ -141,7 +141,7 @@ contains
     subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(in), dimension(num_vels)                 :: vel
+        real(wp), intent(in), dimension(:)                        :: vel
         real(wp), intent(in)                                      :: c, rho
         real(wp), dimension(0:m,0:n,0:p), intent(inout)           :: icfl_sf
         real(wp), dimension(0:m,0:n,0:p), intent(inout), optional :: vcfl_sf, Rc_sf
@@ -191,7 +191,7 @@ contains
     subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)
 
         $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), dimension(num_vels), intent(in)       :: vel
+        real(wp), dimension(:), intent(in)              :: vel
         real(wp), intent(in)                            :: c, rho
         real(wp), dimension(0:m,0:n,0:p), intent(inout) :: max_dt
         real(wp), dimension(2), intent(in)              :: Re_l
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 0b7caf7a47..f83650375d 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -65,7 +65,7 @@ contains
         integer, intent(in)                                    :: id
         type(int_bounds_info), intent(in)                      :: isx, isy, isz
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3, 3) :: Omega
         #:else
             real(wp), dimension(num_dims, num_dims) :: Omega
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 4d142eca55..762d52acaa 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -620,12 +620,12 @@ contains
 
         real(wp) :: rho  !< Cell-avg. density
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3) :: vel    !< Cell-avg. velocity
             real(wp), dimension(3) :: alpha  !< Cell-avg. volume fraction
         #:else
-            real(wp), dimension(num_vels)   :: vel    !< Cell-avg. velocity
-            real(wp), dimension(num_fluids) :: alpha  !< Cell-avg. volume fraction
+            real(wp), dimension(3)              :: vel    !< Cell-avg. velocity
+            real(wp), dimension(num_fluids_max) :: alpha  !< Cell-avg. volume fraction
         #:endif
         real(wp)               :: vel_sum  !< Cell-avg. velocity sum
         real(wp)               :: pres     !< Cell-avg. pressure
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 55cdcc343e..af99f44eb0 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -53,7 +53,7 @@ contains
         real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum  !< Mixture variables
         real(wp), dimension(2) :: Re_visc
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(3)    :: alpha_visc, alpha_rho_visc
             real(wp), dimension(3, 3) :: tau_Re
         #:else
@@ -1266,14 +1266,14 @@ contains
 
         $:GPU_ROUTINE(parallelism='[seq]')
 
-        real(wp), dimension(1:3,1:3), intent(inout)           :: viscous_stress_tensor
-        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(in)                                  :: dynamic_viscosity
-        integer, intent(in)                                   :: i, j, k
-        real(wp), dimension(1:3,1:3)                          :: velocity_gradient_tensor
-        real(wp), dimension(1:3)                              :: dx
-        real(wp)                                              :: divergence
-        integer                                               :: l, q  !< iterators
+        real(wp), dimension(1:3,1:3), intent(inout)  :: viscous_stress_tensor
+        type(scalar_field), dimension(:), intent(in) :: q_prim_vf
+        real(wp), intent(in)                         :: dynamic_viscosity
+        integer, intent(in)                          :: i, j, k
+        real(wp), dimension(1:3,1:3)                 :: velocity_gradient_tensor
+        real(wp), dimension(1:3)                     :: dx
+        real(wp)                                     :: divergence
+        integer                                      :: l, q  !< iterators
 
         ! zero the viscous stress, collection of velocity derivatives, and spatial finite differences
         viscous_stress_tensor = 0._wp
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 78e7678b1a..e19c245aec 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -895,7 +895,7 @@ contains
         integer, intent(in)                                                                    :: weno_dir
         type(int_bounds_info), intent(in)                                                      :: is1_weno_d, is2_weno_d, is3_weno_d
 
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL)
             real(wp), dimension(-3:2) :: dvd
             real(wp), dimension(0:4)  :: poly
             real(wp), dimension(0:4)  :: alpha
diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh
index 1beb016539..6dc9c0e6e9 100644
--- a/toolchain/bootstrap/modules.sh
+++ b/toolchain/bootstrap/modules.sh
@@ -48,6 +48,8 @@ if [ -v $u_c ]; then
     log   "$B""DoD$W:     Carpenter Cray (cc) | Carpenter GNU (c) |  Nautilus (n)"
     log   "$OR""Florida$W: HiPerGator (h)"
     log   "$C""WPI $W:   Turing   (t)"
+    log   "$Y""Gatech$W:  CRNCH RoboGator (crnch)"
+    log   "$C""TAMU$W:    ACES (aces)"
     log_n "($G""a$W/$G""f$W/$G""s$W/$G""w$W/$B""tuo$W/$C""b$W/$C""e$CR/$C""d/$C""dai$CR/$Y""p$CR/$R""r$CR/$B""cc$CR/$B""c$CR/$B""n$CR/$BR""o$CR/$BR""pa"$CR"/$OR""h"$CR/$C""t""$CR"): "
     read u_c
     log
@@ -106,11 +108,13 @@ fi
 ELEMENTS="$(__extract "$u_c-all") $(__extract "$u_c-$cg")"
 MODULES=`echo "$ELEMENTS" | tr ' ' '\n' | grep -v = | xargs`
 
-log " $ module load $MODULES"
-if ! module load $MODULES; then
-    error "Failed to load modules."
+if [ -n "$MODULES" ]; then
+    log " $ module load $MODULES"
+    if ! module load $MODULES; then
+        error "Failed to load modules."
 
-    return
+        return
+    fi
 fi
 
 # Export variables one line at a time so each can reference previously exported vars
diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt
index d1c9c8b037..f92939e144 100644
--- a/toolchain/dependencies/CMakeLists.txt
+++ b/toolchain/dependencies/CMakeLists.txt
@@ -32,7 +32,10 @@ if (MFC_FFTW)
         message(STATUS "FFTW found.")
         add_custom_target(fftw)
     else()
-        if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+        if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+            message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
+            add_custom_target(fftw)
+        else()
             ExternalProject_Add(fftw
                 URL        "http://www.fftw.org/fftw-3.3.10.tar.gz"
                 CMAKE_ARGS -DBUILD_TESTS=OFF
@@ -40,9 +43,6 @@ if (MFC_FFTW)
                            "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
                            -DCMAKE_POLICY_VERSION_MINIMUM=3.5
             )
-        else()
-            message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.")
-            add_custom_target(fftw)
         endif()
     endif()
 endif()
@@ -134,6 +134,9 @@ endif()
                             -DCBLAS=OFF
                             -DLAPACKE=OFF
                             -DBUILD_DEPRECATED=OFF
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            "-DCMAKE_EXE_LINKER_FLAGS=-no-pie"
+                            "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}"
                             "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
                             "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
          )
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index d5a13fe182..aa4d4ac2af 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -427,6 +427,7 @@ def configure(self, case: Case):
             flags.append(f"-DMFC_GCov={'ON' if ARG('gcov') else 'OFF'}")
             flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}")
             flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}")
+            flags.append(f"-DMFC_Intel_AOT={'ON' if ARG('intel_aot') else 'OFF'}")
 
         command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath]
 
diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py
index dda710f602..180044a4f3 100644
--- a/toolchain/mfc/run/input.py
+++ b/toolchain/mfc/run/input.py
@@ -99,6 +99,16 @@ def generate_fpp(self, target) -> None:
 
         thermochem_code = pyro.FortranCodeGenerator().generate("m_thermochem", sol, pyro.CodeGenerationOptions(scalar_type=real_type, directive_offload=directive_str))
 
+        if directive_str == "mp":
+            # ifx -fpp strips !$omp directives produced by C-macro expansion because the
+            # Intel Fortran preprocessor treats '!' as a Fortran comment after expansion.
+            # Rewrite the GPU_ROUTINE macro calls as literal !$omp declare target lines so
+            # the directive is visible to the Fortran front-end, not the C preprocessor.
+            import re
+
+            thermochem_code = thermochem_code.replace("#define GPU_ROUTINE(name) !$omp declare target\n", "")
+            thermochem_code = re.sub(r"[ \t]+GPU_ROUTINE\(\w+\)", "!$omp declare target", thermochem_code)
+
         common.file_write(os.path.join(modules_dir, "m_thermochem.f90"), thermochem_code, True)
 
         cons.unindent()
diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py
index 82e886c064..261d8e65dc 100644
--- a/toolchain/mfc/run/run.py
+++ b/toolchain/mfc/run/run.py
@@ -85,6 +85,19 @@ def __get_template() -> Template:
     raise MFCException(f"Failed to find a template for --computer '{computer}'. Baked-in templates are: {format_list_to_string(list(baked.keys()), 'magenta')}.")
 
 
+def __is_intel_gpu_build(case: input.MFCInputFile) -> bool:
+    cmake_cache = os.path.join(SIMULATION.get_staging_dirpath(case), "CMakeCache.txt")
+    if not os.path.isfile(cmake_cache):
+        return False
+    with open(cmake_cache) as f:
+        content = f.read()
+    # Match compiler ID entry (may or may not be present) or fall back to compiler path
+    if re.search(r"CMAKE_Fortran_COMPILER_ID[^=\n]*=[^\n]*IntelLLVM", content):
+        return True
+    m = re.search(r"CMAKE_Fortran_COMPILER:FILEPATH=([^\n]+)", content)
+    return m is not None and os.path.basename(m.group(1).strip()) in ("ifx", "ifx.exe")
+
+
 def __generate_job_script(targets, case: input.MFCInputFile):
     env = {}
     if ARG("gpus") is not None:
@@ -103,6 +116,23 @@ def __generate_job_script(targets, case: input.MFCInputFile):
     gpu_acc = gpu_mode == gpuConfigOptions.ACC.value
     gpu_mp = gpu_mode == gpuConfigOptions.MP.value
 
+    if gpu_mp and __is_intel_gpu_build(case):
+        # Level Zero tuning for Intel GPU Max (Ponte Vecchio).
+        # COMMAND_BATCH=256: batch up to 256 Level Zero commands before flushing,
+        # allowing the GPU to stay busy while the host prepares the next batch.
+        # Measured ~9% throughput improvement on Intel GPU Max 1100.
+        env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH", "256")
+        # Disable per-kernel indirect access memory tracking (zeMemGetAllocProperties).
+        # MFC's scalar_field pointer arrays trigger indirect-access flags; this tracking
+        # adds ~2100 API calls/step. Disabling it is safe since all GPU allocations are
+        # managed via @:ALLOCATE/@:DEALLOCATE and never aliased with host memory.
+        env.setdefault("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY", "0")
+        if ARG("fastmath"):
+            # -cl-fast-relaxed-math: enables unsafe GPU JIT optimizations (fast transcendentals,
+            # fused MAD, no signed-zero semantics, finite-math-only). Matches nvfortran -gpu=fastmath.
+            # Only applied when the user explicitly requests --fastmath.
+            env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "-cl-fast-relaxed-math")
+
     content = __get_template().render(
         **{**ARGS(), "targets": targets},
         ARG=ARG,
diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py
index 94a37be947..63dacd0884 100644
--- a/toolchain/mfc/state.py
+++ b/toolchain/mfc/state.py
@@ -21,6 +21,7 @@ class MFCConfig:
     single: bool = False
     mixed: bool = False
     fastmath: bool = False
+    intel_aot: bool = False
 
     def __hash__(self):
         return hash(tuple(getattr(self, f.name) for f in dataclasses.fields(self)))
@@ -32,7 +33,8 @@ def from_dict(d: dict):
         r = MFCConfig()
 
         for field in dataclasses.fields(MFCConfig):
-            setattr(r, field.name, d[field.name])
+            if field.name in d:
+                setattr(r, field.name, d[field.name])
 
         return r
 
diff --git a/toolchain/modules b/toolchain/modules
index ea7cb36393..1240772c94 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -116,3 +116,19 @@ h-gpu NVCOMPILER_COMM_LIBS_HOME=/apps/compilers/nvhpc/25.9/Linux_x86_64/25.9/com
 t     WPI Turing
 t-all slurm
 t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy
+
+aces     TAMU HPRC ACES (Intel GPU Max 1100, Ponte Vecchio)
+aces-all iimpi/2025a imkl/2025.1.0 CMake/3.31.3 Python/3.13.1
+aces-gpu I_MPI_F90=ifx
+aces-gpu FC=mpif90
+
+crnch     GT CRNCH RoboGator (Intel GPU Max 1100, Ponte Vecchio)
+crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin/mpiifx
+crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin:${PATH}
+crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0
+crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14
+crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/libfabric/lib:${LD_LIBRARY_PATH}
+crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH}
+crnch-gpu I_MPI_FABRICS=shm:ofi
+crnch-gpu FI_PROVIDER=tcp
+crnch-gpu FI_PROVIDER_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/libfabric/lib/prov
diff --git a/toolchain/templates/aces.mako b/toolchain/templates/aces.mako
new file mode 100644
index 0000000000..40c38290c0
--- /dev/null
+++ b/toolchain/templates/aces.mako
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+<%namespace name="helpers" file="helpers.mako"/>
+
+% if engine == 'batch':
+#SBATCH --nodes=${nodes}
+#SBATCH --ntasks-per-node=${tasks_per_node}
+#SBATCH --cpus-per-task=1
+#SBATCH --job-name="${name}"
+#SBATCH --time=${walltime}
+% if partition:
+#SBATCH --partition=${partition}
+% else:
+#SBATCH --partition=pvc
+% endif
+% if account:
+#SBATCH --account="${account}"
+% endif
+% if gpu_enabled:
+#SBATCH --gres=gpu:pvc:${tasks_per_node}
+#SBATCH --mem=32G
+% endif
+#SBATCH --output="${name}.out"
+#SBATCH --error="${name}.err"
+% if email:
+#SBATCH --mail-user=${email}
+#SBATCH --mail-type="BEGIN, END, FAIL"
+% endif
+% endif
+
+${helpers.template_prologue()}
+
+ok ":) Loading modules:\n"
+cd "${MFC_ROOT_DIR}"
+. ./mfc.sh load -c aces -m ${'g' if gpu_enabled else 'c'}
+cd - > /dev/null
+echo
+
+% for target in targets:
+    ${helpers.run_prologue(target)}
+
+    % if not mpi:
+        (set -x; ${profiler} "${target.get_install_binpath(case)}")
+    % else:
+        (set -x; ${profiler}                          \
+            mpirun -np ${nodes*tasks_per_node}        \
+                "${target.get_install_binpath(case)}")
+    % endif
+
+    ${helpers.run_epilogue(target)}
+
+    echo
+% endfor
+
+${helpers.template_epilogue()}
diff --git a/toolchain/templates/include/helpers.mako b/toolchain/templates/include/helpers.mako
index a9493f71d0..5c729827c5 100644
--- a/toolchain/templates/include/helpers.mako
+++ b/toolchain/templates/include/helpers.mako
@@ -32,6 +32,15 @@ END
         export ${key}='${value}'
     % endfor
 
+    % if gpu_mp:
+        # ifx + Level Zero: the OpenMP SPIR-V JIT runs on the main thread stack at
+        # startup, consuming 2-4 MB before user code runs. ifx also stack-allocates
+        # compiler-generated temporaries with no size threshold (unlike gfortran's
+        # 32 KB default). Together they overflow the default 12.5 MB stack for any
+        # non-trivial 3D case. Removing the limit avoids SIGSEGV at launch.
+        ulimit -s unlimited
+    % endif
+
     t_start=$(date +%s)
 % else:
     echo MFC case # ${name} @ ${input}: