diff --git a/CMakeLists.txt b/CMakeLists.txt index 72258149f3..57262c5901 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,8 @@ option(MFC_DOCUMENTATION "Build documentation" OFF option(MFC_ALL "Build everything" OFF) option(MFC_SINGLE_PRECISION "Build single precision" OFF) option(MFC_MIXED_PRECISION "Build mixed precision" OFF) +option(MFC_Intel_AOT "Build Intel GPU with AOT (spir64_gen) instead of JIT (spir64)" OFF) +set(MFC_Intel_AOT_DEVICE "pvc" CACHE STRING "ocloc device target for Intel AOT compilation (e.g. pvc, 0xbda)") if (MFC_ALL) set(MFC_PRE_PROCESS ON FORCE) @@ -244,6 +246,15 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang") elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") add_compile_options($<$:-free>) + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + add_compile_options(-g -Og -traceback -debug -check all) + elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug") + add_compile_options(-g -Og -traceback -check bounds) + endif() +elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM") + add_compile_options($<$:-free> + $<$:-fpp>) + if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_options(-g -Og -traceback -debug -check all) elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug") @@ -595,6 +606,73 @@ exit 0 HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED) target_link_libraries(${a_target} PRIVATE ${HIPFFT_LIB}) endif() + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM") + # Intel GPU: oneMKL DFTI with !$omp dispatch for GPU FFT. + # Requires MKLROOT to be set (via oneAPI module or env). + if (NOT DEFINED ENV{MKLROOT}) + message(FATAL_ERROR "MKLROOT is not set. Load oneAPI MKL module before building.") + endif() + # Compile mkl_dfti_omp_offload.f90 in isolation with minimal flags. + # The file uses !$omp declare variant with need_device_addr (OpenMP 5.2) + # which requires the global -free -fpp flags to be absent so the + # compiler parses it in standard fixed/free detection mode only. + set(_mkl_omp_src "$ENV{MKLROOT}/include/mkl_dfti_omp_offload.f90") + if (NOT EXISTS "${_mkl_omp_src}") + message(FATAL_ERROR "mkl_dfti_omp_offload.f90 not found in $ENV{MKLROOT}/include") + endif() + set(_mkl_omp_mod_dir "${CMAKE_CURRENT_BINARY_DIR}/mkl_omp_mods") + set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o") + set(_mkl_omp_obj_host "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload_host.o") + file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}") + # Find clang-offload-bundler (in ifx's bin/compiler/ subdir). + # CMAKE_Fortran_COMPILER may be an MPI wrapper (mpiifx); resolve the + # underlying ifx from PATH so the HINTS point to the right directory. + find_program(_IFX_REAL ifx REQUIRED) + cmake_path(GET _IFX_REAL PARENT_PATH _ifx_bin) + find_program(CLANG_OFFLOAD_BUNDLER + NAMES clang-offload-bundler + HINTS "${_ifx_bin}/compiler" "${_ifx_bin}" "${_ifx_bin}-llvm" + REQUIRED) + add_custom_command( + OUTPUT "${_mkl_omp_obj}" + "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.mod" + COMMAND "${CMAKE_Fortran_COMPILER}" + -fiopenmp -fopenmp-targets=spir64 + -c -I"$ENV{MKLROOT}/include" + "${_mkl_omp_src}" + -o "${_mkl_omp_obj}" + WORKING_DIRECTORY "${_mkl_omp_mod_dir}" + DEPENDS "${_mkl_omp_src}" + COMMENT "Compiling MKL OMP offload Fortran module (mkl_dfti_omp_offload)" + ) + # Strip the SPIR-V device bundle so the linked object has only host code. + # The SPIR-V contains Import declarations for MKL SYCL DFT functions that + # the OpenMP Level Zero plugin cannot resolve, causing zeModuleDynamicLink + # failure. With host-only code, !$omp dispatch falls back to CPU for DFT. + add_custom_command( + OUTPUT "${_mkl_omp_obj_host}" + COMMAND "${CLANG_OFFLOAD_BUNDLER}" + --unbundle --type=o + --targets=host-x86_64-unknown-linux-gnu + --input="${_mkl_omp_obj}" + --output="${_mkl_omp_obj_host}" + DEPENDS "${_mkl_omp_obj}" + COMMENT "Stripping SPIR-V from MKL DFT object (host-only, fixes Level Zero link)" + ) + add_custom_target(mkl_omp_offload_mod_${a_target} + DEPENDS "${_mkl_omp_obj_host}") + add_dependencies(${a_target} mkl_omp_offload_mod_${a_target}) + target_include_directories(${a_target} PRIVATE + "$ENV{MKLROOT}/include" "${_mkl_omp_mod_dir}") + target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj_host}") + # Link MKL threading + core + SYCL DFT backend + target_link_options(${a_target} PRIVATE -qmkl=parallel) + # mkl_sycl_dft is the name in MKL >= 2023.2; older versions use monolithic mkl_sycl + find_library(MKL_SYCL_DFT NAMES mkl_sycl_dft mkl_sycl + HINTS "$ENV{MKLROOT}/lib" "$ENV{MKLROOT}/lib/intel64" REQUIRED) + find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED) + find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED) + target_link_libraries(${a_target} PRIVATE ${MKL_SYCL_DFT} ${SYCL_LIB} ${OPENCL_LIB}) else() find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED) target_link_libraries(${a_target} PRIVATE hipfort::hipfft) @@ -636,9 +714,22 @@ exit 0 target_compile_options(${a_target} PRIVATE "-mp=gpu" "-Minfo=mp") target_link_options(${a_target} PRIVATE "-mp=gpu") set_target_properties(${a_target} PROPERTIES Fortran_FLAGS "-mp=gpu -gpu=ccall") - elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") - target_compile_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64) - target_link_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64) + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM") + # Intel GPU: OpenMP target offload to SPIR64 (Xe-HPC / Ponte Vecchio). + # GPU FFT uses oneMKL DFTI via the OpenMP dispatch construct. + if (MFC_Intel_AOT) + # AOT: compile to native ISA via ocloc for Intel GPU Max (pvc). + # Avoids JIT zeModuleCreate failures at runtime on Level Zero. + # SHELL: prevents CMake deduplication and preserves the quoted + # "-device pvc" as a single argument to -Xopenmp-target-backend. + target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64_gen + "SHELL:-Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"") + target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64_gen + "SHELL:-Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"") + else() + target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64) + target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64) + endif() elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") target_compile_options(${a_target} PRIVATE -fopenmp) target_link_options(${a_target} PRIVATE -fopenmp) diff --git a/build_intel_gpu.sh b/build_intel_gpu.sh new file mode 100644 index 0000000000..7344856d12 --- /dev/null +++ b/build_intel_gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 1:30:00 +#SBATCH --mem=32G +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/build_intel_gpu.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/build_intel_gpu.log +#SBATCH -J mfc-intel-gpu-build + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 CMake/3.31.3 Python/3.13.1 +export I_MPI_F90=ifx FC=mpif90 +export UV_CACHE_DIR=/scratch/user/u.sb27915/.cache/uv +export RUSTUP_HOME=/scratch/user/u.sb27915/.rustup +export CARGO_HOME=/scratch/user/u.sb27915/.cargo +export PATH=$CARGO_HOME/bin:$PATH + +cd /scratch/user/u.sb27915/MFC-intel +./mfc.sh build -t pre_process simulation --gpu mp --intel-aot -j 8 diff --git a/docs/documentation/intel-gpu-max.md b/docs/documentation/intel-gpu-max.md new file mode 100644 index 0000000000..c18fbe7eb2 --- /dev/null +++ b/docs/documentation/intel-gpu-max.md @@ -0,0 +1,387 @@ +# Building MFC for Intel Data Center GPU Max (Ponte Vecchio) + +This documents how to build and run MFC with Intel GPU Max (Xe-HPC / Ponte Vecchio) +using ifx OpenMP target offload to SPIR64, as tested on GT CRNCH RoboGator (`dash3`). + +## System configuration + +| Component | Version / Path | +|---|---| +| Hardware | Intel Data Center GPU Max 1100 (Ponte Vecchio, PCI 8086:0bda) | +| OS | Linux (RHEL 8 compatible, kernel 5.15) | +| Fortran compiler | ifx 2025.3.3 (`/opt/intel/oneapi/compiler/2025.3/`) | +| MKL | oneMKL 2026.0 (`/opt/intel/oneapi/mkl/2026.0/`) | +| SYCL runtime | `libsycl.so` in `/opt/intel/oneapi/compiler/2026.0/lib/` | +| GPU device | `/dev/dri/renderD128` (requires `render` group membership) | + +## Environment setup + +Load the required oneAPI environment before building or running: + +```bash +export PATH=/opt/intel/oneapi/compiler/2025.3/bin:$PATH +export MKLROOT=/opt/intel/oneapi/mkl/2026.0 +export LIBRARY_PATH=/opt/intel/oneapi/compiler/2026.0/lib:\ +/opt/intel/oneapi/compiler/2025.3/lib:\ +${MKLROOT}/lib:\ +/opt/intel/oneapi/tbb/2022.1/lib:\ +$LIBRARY_PATH +export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\ +/opt/intel/oneapi/compiler/2026.0/lib:\ +/opt/intel/oneapi/compiler/2025.3/lib:\ +${MKLROOT}/lib:\ +/opt/intel/oneapi/tbb/2022.1/lib:\ +$LD_LIBRARY_PATH +export FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx +``` + +> **Important**: `FC` must be set explicitly. Without it, CMake may cache an older +> ifx (2025.0) from a system module, which does not support `need_device_addr` in +> the MKL 2026.0 OpenMP offload Fortran module. + +> **Important**: `LIBRARY_PATH` (not just `LD_LIBRARY_PATH`) must include the +> compiler 2026.0 lib directory so the linker finds `libsycl.so` at build time. + +> **Important**: `/opt/intel/oneapi/umf/1.1/lib` must be in `LD_LIBRARY_PATH` at +> runtime. The Level Zero and OpenCL UR adapters in the 2026.0 compiler depend on +> `libumf.so.1`, which lives in the separate `umf/1.1` package, not in the compiler +> lib directories themselves. + +## Building + +```bash +./mfc.sh build -t simulation --gpu mp --no-mpi -j 8 +``` + +- `--gpu mp`: OpenMP target offload backend (SPIR64) +- `--no-mpi`: omit for MPI-enabled runs; include for single-node testing +- `-j 8`: parallel build jobs + +MFC will automatically: +1. Compile `$MKLROOT/include/mkl_dfti_omp_offload.f90` with minimal flags + (no `-free -fpp`) via a CMake `add_custom_command` to avoid OpenMP 5.2 + clause compatibility issues with global compile flags +2. Link `-qmkl=parallel` for MKL threading + core +3. Link `libmkl_sycl_dft`, `libsycl`, `libOpenCL` for GPU FFT dispatch + +## GPU FFT implementation + +MFC uses oneMKL DFTI with the OpenMP 5.1 `!$omp dispatch` construct for FFT +in cylindrical geometry (the azimuthal Fourier filter in `m_fftw.fpp`). +This is activated when `__INTEL_LLVM_COMPILER` is defined (i.e., compiled with ifx). + +Key verified properties (oneMKL 2026.0, ifx 2025.3.3): +- Batch R2C transform with `INPUT_DISTANCE != OUTPUT_DISTANCE` works correctly. + MFC uses `real_size = p+1` and `cmplx_size = (p+1)/2+1` which always differ. +- `!$omp dispatch` correctly dispatches DFTI calls to device-mapped allocatables. + +## Running MFC cases + +Build all three targets (pre_process, simulation, post_process) before running: + +```bash +./mfc.sh build --gpu mp --no-mpi -j 8 +``` + +Then run a case normally: + +```bash +./mfc.sh run examples/1D_convergence/case.py --no-build --no-mpi +``` + +To run individual stages directly (useful when `syscheck` blocks due to GPU access): + +```bash +export MKLROOT=/opt/intel/oneapi/mkl/2026.0 +export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:\ +/opt/intel/oneapi/compiler/2026.0/lib:\ +/opt/intel/oneapi/compiler/2025.3/lib:\ +${MKLROOT}/lib:\ +/opt/intel/oneapi/tbb/2022.1/lib:\ +$LD_LIBRARY_PATH + +cd examples/my_case +/path/to/build/install//bin/pre_process +/path/to/build/install//bin/simulation +``` + +The install hashes are printed by `./mfc.sh build`; look for lines like +`✓ Installed simulation`. + +## Chemistry/pyrometheus cases + +MFC's pyrometheus-generated thermochemistry (`m_thermochem.f90`) works with the +Intel GPU build. Pyrometheus emits `!$omp declare target` for all thermochem +routines when `directive_offload="mp"` is requested (automatically set when +`--gpu mp` is active). + +Verified on ifx 2025.3.3: `m_thermochem.f90` and `m_chemistry.fpp` both compile +at O3 + SPIR64 without ICE. The `1D_reactive_shocktube` example (H2/O2/Ar, 29 +reactions, 10 species) runs to completion. No Intel-specific source workarounds +are needed for chemistry beyond the general levelset fix. + +To build and run a reactive chemistry case: +```bash +# Build (chemistry module generated automatically from cantera_file in case.py) +./mfc.sh run examples/1D_reactive_shocktube/case.py \ + --gpu mp --no-mpi -t pre_process simulation + +# Or bypass syscheck if no GPU render-group access: +cd examples/1D_reactive_shocktube +/path/to/build/install//bin/pre_process +OMP_TARGET_OFFLOAD=DISABLED \ + /path/to/build/install//bin/simulation +``` + +ifx warning during chemistry compilation: +``` +warning #8694: A procedure called by a procedure with the DECLARE TARGET +attribute must have the DECLARE TARGET attribute. [GET_MIXTURE_VISCOSITY_MIXAVG] +``` +This is a false positive from ifx's module-interface tracking; `m_thermochem.f90` +does declare all routines as target. The warning is harmless and the code runs +correctly. + +## GPU device access + +The Intel GPU requires membership in the `render` group (GID 109) to access +`/dev/dri/renderD128` via Level Zero. + +Without render group access, `ZE_RESULT_ERROR_UNINITIALIZED` is returned by +Level Zero. OpenMP target offload falls back to the CPU host plugin +(correct results, but no GPU acceleration). + +To diagnose GPU visibility: + +```bash +ls -la /dev/dri/renderD128 # should show rw permissions for your user/group=render + +# With full LD_LIBRARY_PATH set: +LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:... \ + /opt/intel/oneapi/compiler/2026.0/bin/sycl-ls --verbose +# Look for: "[opencl:gpu]" or "[ext_oneapi_level_zero:gpu]" platforms + +LIBOMPTARGET_DEBUG=1 ./simulation # look for "Level0 NG plugin initialization" + # and absence of "ZE_RESULT_ERROR_UNINITIALIZED" +``` + +To get GPU access: +- **Interactive shell**: request from system admin to add user to `render` group + (`sudo usermod -a -G render $USER`, then re-login) +- **SLURM**: submit with `--gres=gpu:max_1100=1`; if Level Zero still fails, + the SLURM epilog may not have configured device cgroup ACLs for the job user — + contact the system admin + +> **Note on `sycl-ls`**: the 2026.0 `sycl-ls` requires `libumf.so.1` from +> `/opt/intel/oneapi/umf/1.1/lib` in `LD_LIBRARY_PATH`, otherwise all adapters +> fail to load and it reports "No platforms found". + +## Link flags (what MFC's CMake generates) + +The full set of flags the compiler uses for the simulation target: + +**Compile flags:** +``` +-fiopenmp -fopenmp-targets=spir64 -free -fpp -march=native +``` + +**Link flags:** +``` +-fiopenmp -fopenmp-targets=spir64 +-qmkl=parallel +-L$MKLROOT/lib -lmkl_sycl_dft +-L/opt/intel/oneapi/compiler/2026.0/lib -lsycl -lOpenCL +``` + +**MKL OMP module (compiled separately, no global flags):** +```bash +ifx -fiopenmp -fopenmp-targets=spir64 \ + -c -I$MKLROOT/include \ + $MKLROOT/include/mkl_dfti_omp_offload.f90 \ + -o mkl_dfti_omp_offload.o +``` + +## Known issues + +### `need_device_addr` compilation error +`mkl_dfti_omp_offload.f90` from MKL 2026.0 uses `need_device_addr` in +`!$omp declare variant` (OpenMP 5.2). This requires ifx **2025.3** or newer. +If CMake finds an older ifx (e.g., 2025.0 from a system module path), the +compile fails with: +``` +error #5082: Syntax error, found IDENTIFIER 'NEED_DEVICE_ADDR' +``` +Fix: set `FC=/opt/intel/oneapi/compiler/2025.3/bin/ifx` before building +and run `./mfc.sh clean` first so CMake re-detects the compiler. + +### Two routines with ifx SPIR64 codegen bugs + +**`s_apply_levelset` (`m_compute_levelset.fpp`)** — ifx SPIR64 inliner ICE: + +The LLVM inliner (at O1+) pulls `!$omp declare target (seq)` geometry +routines into the `target teams loop` kernel and generates LLVM IR that +crashes the SPIR-V converter with a segfault in `llvm-spirv`. At O0 the +crash does not occur (no inlining). Two fixes combined: + +1. **Split loops**: replaced the single if-else dispatch loop with one + `GPU_PARALLEL_LOOP` per geometry type so each kernel calls exactly one + declare-target routine. The original multi-callee dispatch also triggers + `"Instruction does not dominate all uses!"` in llvm-link. + +2. **Per-file `-fno-inline`**: in `CMakeLists.txt`, `set_source_files_properties` + adds `-fno-inline` to `m_compute_levelset.fpp.f90` for IntelLLVM+OpenMP + builds. This prevents the inliner from pulling declare-target routines into + the kernel body where they crash the SPIR-V backend. The routines remain + callable as proper device-side function calls via `!$omp declare target`. + +**`s_pressure_relaxation_procedure` (`m_pressure_relaxation.fpp`)** — SPIR-V +InvalidArraySize in declare-target helpers: + +`!$omp declare target (seq)` routines with `dimension(sys_size)` explicit-shape +dummy arguments trigger `"InvalidArraySize: Array size must be at least 1"` in +llvm-spirv. SPIR-V requires compile-time constant array bounds; `sys_size` is +a runtime module integer. Fixed by changing `dimension(sys_size)` → +`dimension(:)` (assumed-shape) on all helper routine interfaces. The loop now +runs on GPU for Intel. + +### syscheck GPU assertion +`syscheck` runs `assert(omp_get_num_devices() > 0)` and aborts if the GPU +is not accessible. This is a runtime check, not a build issue. See GPU device +access section above. + +To run a case anyway (testing code correctness on CPU fallback), invoke +`pre_process` and `simulation` directly from their install paths, bypassing +the `./mfc.sh run` wrapper that calls `syscheck` first. + +### Multi-node MPI with Intel MPI 2021.x + +Intel MPI 2021.x expects a bundled `ssh` binary at `$I_MPI_ROOT/bin/ssh` that +understands an `--external-launcher` flag used by hydra bootstrap. This binary +is missing from some oneAPI installations, causing SSH bootstrap to fail with +`unknown option -- -`. + +Workaround: create a wrapper that strips the Intel-specific flag: + +```bash +mkdir -p ~/bin +cat > ~/bin/ssh << 'EOF' +#!/bin/bash +args=(-q -o StrictHostKeyChecking=yes -o BatchMode=yes) +for arg in "$@"; do + [[ "$arg" == "--external-launcher" ]] && continue + [[ "$arg" == "--" ]] && break + args+=("$arg") +done +exec /usr/bin/ssh "${args[@]}" +EOF +chmod +x ~/bin/ssh +``` + +Then run with: +```bash +PATH=$HOME/bin:$PATH \ +I_MPI_HYDRA_BOOTSTRAP=rsh \ +I_MPI_HYDRA_BOOTSTRAP_EXEC=$HOME/bin/ssh \ +mpirun -n -hosts , ./simulation +``` + +Nodes must have passwordless SSH from the launch node and no `pam_slurm_adopt` +blocking. Suppress the SSH login banner on remote nodes with `touch ~/.hushlogin`. + +**OFI provider path**: Intel MPI 2021.x ships its own libfabric providers in +`$I_MPI_ROOT/libfabric/lib/prov/`. The system libfabric may not include the tcp +or shm providers. Always set: + +```bash +export FI_PROVIDER_PATH=$I_MPI_ROOT/libfabric/lib/prov +``` + +Without this, `PMPI_Init` aborts with `OFI fi_getinfo() failed: No data available`. +This is handled automatically by `source ./mfc.sh load -c crnch -m gpu`. + +**SLURM GPU access**: on SLURM-managed Intel GPU nodes, processes outside a SLURM +allocation cannot open `/dev/dri/renderD128`. Always request the GPU resource: + +```bash +#SBATCH --gres=gpu:max_1100:1 # Intel GPU Max 1100 +``` + +Without `--gres`, `omp_get_num_devices()` returns 0 and the process aborts with +integer divide-by-zero in `s_initialize_mpi_domain` (rank % num_devices with 0 devices). + +**Per-node renderD128 permissions on CRNCH**: `dash4` has `renderD128` as +`crwxrwxrwx` (world-accessible), but `dash3` has `crw-rw----` (render group only). +`--gres=gpu:max_1100:1` does NOT grant cgroup access on dash3 with the current +SLURM configuration; `omp_get_num_devices()` returns 0 on dash3 even within a +SLURM GPU allocation. Contact the CRNCH admin to either fix the device permissions +on dash3 or configure SLURM device cgroups to grant renderD128 access for GPU jobs. +Until fixed, 2-node GPU simulation is not possible using dash3+dash4. + +**Inter-node MPI: FI_TCP_IFACE must be set dynamically**: The CRNCH dash nodes +have multiple network interfaces (high-speed 10GbE at `10.10.10.x`, public 1GbE +at `143.215.138.x/25`). Intel MPI's OFI tcp provider selects the highest-speed +interface by default. On dash3, this picks `enp200s0f1np1` (10.10.10.32), which +has no corresponding active interface on dash4. This causes the inter-node MPI +broadcast to hang silently after `MPI_Init` succeeds. + +Fix: set `FI_TCP_IFACE` to the name of the interface with the public IP (which +is accessible from all nodes). The interface name differs per node, so set it +dynamically in each rank's startup script: + +```bash +IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}') +export FI_TCP_IFACE="${IFACE}" +``` + +This selects `enp3s0f0` on dash3 and `enp3s0f0np0` on dash4. Combined with +`srun --mpi=pmi2` for SLURM-native MPI bootstrap (avoiding Intel MPI hydra/SSH), +this enables successful inter-node MPI communication. + +**Recommended 2-node run script pattern** (for when dash3's GPU access is fixed): + +```bash +#!/bin/bash +#SBATCH -N 2 +#SBATCH --ntasks-per-node=1 +#SBATCH -p rg-nextgen-hpc +#SBATCH -w dash3,dash4 +#SBATCH --gres=gpu:max_1100:1 +#SBATCH --time=01:00:00 + +INTEL=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1 +export PATH=${INTEL}/compiler/2025.0/bin:${INTEL}/mpi/2021.14/bin:${PATH} +export LD_LIBRARY_PATH=${INTEL}/mkl/2025.0/lib:${INTEL}/compiler/2025.0/lib:${INTEL}/2025.0/lib:${INTEL}/mpi/2021.14/lib:${INTEL}/mpi/2021.14/libfabric/lib:${LD_LIBRARY_PATH} +export FI_PROVIDER_PATH=${INTEL}/mpi/2021.14/libfabric/lib/prov +export I_MPI_FABRICS="shm:ofi" +export FI_PROVIDER=tcp + +cd /path/to/case + +# Step 1: pre-process +WRAP_SCRIPT=$(mktemp) +cat > "$WRAP_SCRIPT" << 'EOF' +IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}') +export FI_TCP_IFACE="$IFACE" +exec /path/to/build/install//bin/pre_process +EOF +chmod +x "$WRAP_SCRIPT" +srun --mpi=pmi2 -n 2 --ntasks-per-node=1 "$WRAP_SCRIPT" + +# Step 2: simulation +cat > "$WRAP_SCRIPT" << 'EOF' +IFACE=$(ip -o addr show | awk '/143\.215\.138\.[0-9]+\// {print $2; exit}') +export FI_TCP_IFACE="$IFACE" +exec /path/to/build/install//bin/simulation +EOF +srun --mpi=pmi2 -n 2 --ntasks-per-node=1 "$WRAP_SCRIPT" +rm "$WRAP_SCRIPT" +``` + +### `libumf.so.1` not found at runtime +The 2026.0 Level Zero and OpenCL UR adapters link against `libumf.so.1`. +If not in `LD_LIBRARY_PATH`, all adapters fail silently and sycl-ls reports +"No platforms found". Fix: + +```bash +export LD_LIBRARY_PATH=/opt/intel/oneapi/umf/1.1/lib:$LD_LIBRARY_PATH +``` diff --git a/examples/3D_performance_test/case.py b/examples/3D_performance_test/case.py index 4e08a8ea1b..8b26fa07dd 100644 --- a/examples/3D_performance_test/case.py +++ b/examples/3D_performance_test/case.py @@ -14,26 +14,13 @@ "y_domain%end": 4.0e-03 / 1.0e-03, "z_domain%beg": 0.0e00, "z_domain%end": 4.0e-03 / 1.0e-03, - "stretch_x": "T", - "a_x": 4.0e00, - "x_a": -1.5e-03 / 1.0e-03, - "x_b": 1.5e-03 / 1.0e-03, - "stretch_y": "T", - "a_y": 4.0e00, - "y_a": -1.5e-03 / 1.0e-03, - "y_b": 1.5e-03 / 1.0e-03, - "stretch_z": "T", - "a_z": 4.0e00, - "z_a": -1.5e-03 / 1.0e-03, - "z_b": 1.5e-03 / 1.0e-03, - "cyl_coord": "F", "m": 200, "n": 200, "p": 200, - "dt": 0.2e-09 / 1.0e-03, + "dt": 0.2e-09, "t_step_start": 0, - "t_step_stop": 30, - "t_step_save": 30, + "t_step_stop": 10, + "t_step_save": 10, # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, diff --git a/mini/m_cl4_mini.f90 b/mini/m_cl4_mini.f90 new file mode 100644 index 0000000000..5d3dd75cea --- /dev/null +++ b/mini/m_cl4_mini.f90 @@ -0,0 +1,405 @@ +! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633 +! +! Pattern: single module with 10+ !$omp target teams loop regions, each calling +! a different !$omp declare target subroutine defined in the SAME module. +! The subroutines access a derived type that contains a character field +! (ib_patch_parameters%model_filepath) and 3x3 real arrays (rotation_matrix). +! +! Compile (ICE expected at -O3, pass expected at -O3 -fno-inline): +! mpif90 -free -fiopenmp -fopenmp-targets=spir64 -fp-model=precise \ +! -march=native -mno-avx512fp16 -O3 -c m_cl4_mini.f90 +! +! Fix: +! mpif90 ... -O3 -fno-inline -c m_cl4_mini.f90 + +module m_cl4_mini + + implicit none + private + + integer, parameter :: wp = kind(1.0d0) + integer, parameter :: PATHLEN = 200 + integer, parameter :: MAX_PATCHES = 10 + + ! Replicated from m_derived_types.fpp :: ib_patch_parameters + type :: ib_patch_parameters + integer :: geometry + real(wp) :: x_centroid, y_centroid, z_centroid + real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid + real(wp), dimension(1:3) :: centroid_offset + real(wp), dimension(1:3) :: angles, step_angles + real(wp), dimension(1:3,1:3) :: rotation_matrix + real(wp), dimension(1:3,1:3) :: rotation_matrix_inverse + real(wp) :: c, p, t, m + real(wp) :: length_x, length_y, length_z + real(wp) :: radius, theta + logical :: slip + character(LEN=PATHLEN) :: model_filepath ! <-- char field; triggers SPIR-V bug + real(wp), dimension(1:3) :: model_translate + real(wp), dimension(1:3) :: model_scale + real(wp), dimension(1:3) :: model_rotate + integer :: model_spc + real(wp) :: model_threshold + integer :: moving_ibm + real(wp) :: mass, moment + real(wp), dimension(1:3) :: force, torque + real(wp), dimension(1:3) :: vel, step_vel + real(wp), dimension(1:3) :: angular_vel, step_angular_vel + end type ib_patch_parameters + + ! Replicated from m_derived_types.fpp :: ghost_point + type :: ghost_point + integer, dimension(3) :: loc + real(wp), dimension(3) :: ip_loc + integer, dimension(3) :: ip_grid + real(wp), dimension(2, 2, 2) :: interp_coeffs ! 3-D array in struct + integer :: ib_patch_id + real(wp) :: levelset + real(wp), dimension(1:3) :: levelset_norm + logical :: slip + integer, dimension(3) :: DB + integer :: x_periodicity, y_periodicity, z_periodicity + end type ghost_point + + type :: bounds_info + real(wp) :: beg, end + end type bounds_info + + ! Module-level variables accessed by the declare-target subroutines + type(ib_patch_parameters), dimension(MAX_PATCHES) :: patch_ib + real(wp), allocatable :: x_cc(:), y_cc(:), z_cc(:) + type(bounds_info) :: x_domain, y_domain, z_domain + integer :: num_ibs + + !$omp declare target(patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs) + + public :: ghost_point, s_cl4_dispatch, s_cl4_init, s_cl4_finalize + +contains + + ! ------------------------------------------------------------------ init -- + + subroutine s_cl4_init(nx, nz) + integer, intent(in) :: nx, nz + integer :: i + allocate(x_cc(nx), y_cc(nx), z_cc(nz)) + do i = 1, nx + x_cc(i) = real(i, wp) * 0.1_wp + y_cc(i) = real(i, wp) * 0.1_wp + end do + do i = 1, nz + z_cc(i) = real(i, wp) * 0.1_wp + end do + x_domain%beg = 0._wp; x_domain%end = real(nx, wp)*0.1_wp + y_domain%beg = 0._wp; y_domain%end = real(nx, wp)*0.1_wp + z_domain%beg = 0._wp; z_domain%end = real(nz, wp)*0.1_wp + num_ibs = 1 + do i = 1, MAX_PATCHES + patch_ib(i)%geometry = i + patch_ib(i)%x_centroid = 0.5_wp + patch_ib(i)%y_centroid = 0.5_wp + patch_ib(i)%z_centroid = 0.5_wp + patch_ib(i)%radius = 0.2_wp + patch_ib(i)%length_x = 0.4_wp + patch_ib(i)%length_y = 0.4_wp + patch_ib(i)%length_z = 0.4_wp + patch_ib(i)%rotation_matrix = 0._wp + patch_ib(i)%rotation_matrix(1,1) = 1._wp + patch_ib(i)%rotation_matrix(2,2) = 1._wp + patch_ib(i)%rotation_matrix(3,3) = 1._wp + patch_ib(i)%rotation_matrix_inverse = patch_ib(i)%rotation_matrix + patch_ib(i)%centroid_offset = 0._wp + patch_ib(i)%model_filepath = 'none' + end do + !$omp target enter data map(to: patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs) + end subroutine s_cl4_init + + subroutine s_cl4_finalize() + !$omp target exit data map(delete: patch_ib, x_cc, y_cc, z_cc, x_domain, y_domain, z_domain, num_ibs) + deallocate(x_cc, y_cc, z_cc) + end subroutine s_cl4_finalize + + ! ------------------------------------------------------- dispatch (ICE site) -- + + !> 10 separate target teams loop regions, each calling a different + !! !$omp declare target subroutine defined in THIS module. + subroutine s_cl4_dispatch(gps, num_gps) + type(ghost_point), dimension(:), intent(inout) :: gps + integer, intent(in) :: num_gps + integer :: i, patch_id + + ! 3D geometry loops (5 loops, geometries 8-12) + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 8) call s_geo_sphere(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 9) call s_geo_cuboid(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 10) call s_geo_cylinder(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 11) call s_geo_3d_airfoil(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 12) call s_geo_model(gps(i)) + end do + + ! 2D geometry loops (5 loops, geometries 2-6) + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 2) call s_geo_circle(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 3) call s_geo_rectangle(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 4) call s_geo_airfoil(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 5) call s_geo_ellipse(gps(i)) + end do + + !$omp target teams loop private(i,patch_id) map(tofrom:gps) map(to:patch_ib,num_ibs) + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 6) call s_geo_triangle(gps(i)) + end do + + end subroutine s_cl4_dispatch + + ! ----------------------------------------- declare-target geometry routines -- + ! All defined in the SAME module; each has !$omp declare target. + ! The ICE fires when the inliner pulls these into the target teams loop bodies. + + subroutine s_geo_sphere(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: dist + real(wp), dimension(3) :: dv + integer :: id, i, j, k + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid - real(gp%x_periodicity,wp)*(x_domain%end-x_domain%beg) + dv(2) = y_cc(j) - patch_ib(id)%y_centroid - real(gp%y_periodicity,wp)*(y_domain%end-y_domain%beg) + dv(3) = z_cc(k) - patch_ib(id)%z_centroid - real(gp%z_periodicity,wp)*(z_domain%end-z_domain%beg) + dist = sqrt(sum(dv**2)) + gp%levelset = dist - patch_ib(id)%radius + if (dist > 0._wp) then + gp%levelset_norm = dv/dist + else + gp%levelset_norm = 0._wp + end if + end subroutine s_geo_sphere + + subroutine s_geo_cuboid(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: dx, dy, dz, lx, ly, lz + integer :: id, i, j, k + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + lx = patch_ib(id)%length_x; ly = patch_ib(id)%length_y; lz = patch_ib(id)%length_z + dx = abs(x_cc(i) - patch_ib(id)%x_centroid) - lx/2._wp + dy = abs(y_cc(j) - patch_ib(id)%y_centroid) - ly/2._wp + dz = abs(z_cc(k) - patch_ib(id)%z_centroid) - lz/2._wp + gp%levelset = max(dx, max(dy, dz)) + gp%levelset_norm(1) = sign(1._wp, x_cc(i) - patch_ib(id)%x_centroid) + gp%levelset_norm(2) = sign(1._wp, y_cc(j) - patch_ib(id)%y_centroid) + gp%levelset_norm(3) = sign(1._wp, z_cc(k) - patch_ib(id)%z_centroid) + end subroutine s_geo_cuboid + + subroutine s_geo_cylinder(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: r2d, dist + real(wp), dimension(3) :: dv, rot_dv + integer :: id, i, j, k + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid + dv(2) = y_cc(j) - patch_ib(id)%y_centroid + dv(3) = z_cc(k) - patch_ib(id)%z_centroid + rot_dv = matmul(patch_ib(id)%rotation_matrix_inverse, dv) + r2d = sqrt(rot_dv(1)**2 + rot_dv(2)**2) + dist = r2d - patch_ib(id)%radius + gp%levelset = dist + if (r2d > 0._wp) then + gp%levelset_norm(1) = rot_dv(1)/r2d + gp%levelset_norm(2) = rot_dv(2)/r2d + else + gp%levelset_norm(1) = 0._wp + gp%levelset_norm(2) = 0._wp + end if + gp%levelset_norm(3) = 0._wp + end subroutine s_geo_cylinder + + subroutine s_geo_3d_airfoil(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp), dimension(3) :: dv, local + integer :: id, i, j, k + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid + dv(2) = y_cc(j) - patch_ib(id)%y_centroid + dv(3) = z_cc(k) - patch_ib(id)%z_centroid + local = matmul(patch_ib(id)%rotation_matrix_inverse, dv) - patch_ib(id)%centroid_offset + gp%levelset = sqrt(sum(local**2)) - patch_ib(id)%radius + gp%levelset_norm = local / max(sqrt(sum(local**2)), 1e-12_wp) + end subroutine s_geo_3d_airfoil + + subroutine s_geo_model(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp), dimension(3) :: dv + integer :: id, i, j, k + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid + patch_ib(id)%model_translate(1)*patch_ib(id)%model_scale(1) + dv(2) = y_cc(j) - patch_ib(id)%y_centroid + patch_ib(id)%model_translate(2)*patch_ib(id)%model_scale(2) + dv(3) = z_cc(k) - patch_ib(id)%z_centroid + patch_ib(id)%model_translate(3)*patch_ib(id)%model_scale(3) + gp%levelset = sqrt(sum(dv**2)) - patch_ib(id)%radius + gp%levelset_norm = dv / max(sqrt(sum(dv**2)), 1e-12_wp) + end subroutine s_geo_model + + subroutine s_geo_circle(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: dist + real(wp), dimension(3) :: dv + integer :: id, i, j + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid - real(gp%x_periodicity,wp)*(x_domain%end-x_domain%beg) + dv(2) = y_cc(j) - patch_ib(id)%y_centroid - real(gp%y_periodicity,wp)*(y_domain%end-y_domain%beg) + dv(3) = 0._wp + dist = sqrt(sum(dv**2)) + gp%levelset = dist - patch_ib(id)%radius + if (dist > 0._wp) then + gp%levelset_norm = dv/dist + else + gp%levelset_norm = 0._wp + end if + end subroutine s_geo_circle + + subroutine s_geo_rectangle(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: dx, dy + integer :: id, i, j + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2) + dx = abs(x_cc(i) - patch_ib(id)%x_centroid) - patch_ib(id)%length_x/2._wp + dy = abs(y_cc(j) - patch_ib(id)%y_centroid) - patch_ib(id)%length_y/2._wp + gp%levelset = max(dx, dy) + gp%levelset_norm(1) = sign(1._wp, x_cc(i) - patch_ib(id)%x_centroid) + gp%levelset_norm(2) = sign(1._wp, y_cc(j) - patch_ib(id)%y_centroid) + gp%levelset_norm(3) = 0._wp + end subroutine s_geo_rectangle + + subroutine s_geo_airfoil(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp), dimension(3) :: dv, local + integer :: id, i, j + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid + dv(2) = y_cc(j) - patch_ib(id)%y_centroid + dv(3) = 0._wp + local = matmul(patch_ib(id)%rotation_matrix_inverse, dv) - patch_ib(id)%centroid_offset + gp%levelset = sqrt(sum(local**2)) - patch_ib(id)%radius + gp%levelset_norm = local / max(sqrt(sum(local**2)), 1e-12_wp) + end subroutine s_geo_airfoil + + subroutine s_geo_ellipse(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: ax, ay, dist + real(wp), dimension(3) :: dv + integer :: id, i, j + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2) + ax = patch_ib(id)%length_x / 2._wp + ay = patch_ib(id)%length_y / 2._wp + dv(1) = (x_cc(i) - patch_ib(id)%x_centroid) / ax + dv(2) = (y_cc(j) - patch_ib(id)%y_centroid) / ay + dv(3) = 0._wp + dist = sqrt(dv(1)**2 + dv(2)**2) + gp%levelset = dist - 1._wp + if (dist > 0._wp) then + gp%levelset_norm = dv/dist + else + gp%levelset_norm = 0._wp + end if + end subroutine s_geo_ellipse + + subroutine s_geo_triangle(gp) + !$omp declare target + type(ghost_point), intent(inout) :: gp + real(wp) :: d1, d2, d3 + real(wp), dimension(3) :: dv + integer :: id, i, j + id = gp%ib_patch_id; i = gp%loc(1); j = gp%loc(2) + dv(1) = x_cc(i) - patch_ib(id)%x_centroid + dv(2) = y_cc(j) - patch_ib(id)%y_centroid + dv(3) = 0._wp + d1 = dv(2) + patch_ib(id)%length_y/2._wp + d2 = -dv(2) + dv(1)*sqrt(3._wp)/3._wp + patch_ib(id)%length_y/3._wp + d3 = -dv(2) - dv(1)*sqrt(3._wp)/3._wp + patch_ib(id)%length_y/3._wp + gp%levelset = min(max(-d1,-d2), max(-d1,-d3)) + gp%levelset_norm(1) = dv(1) / max(sqrt(sum(dv(1:2)**2)), 1e-12_wp) + gp%levelset_norm(2) = dv(2) / max(sqrt(sum(dv(1:2)**2)), 1e-12_wp) + gp%levelset_norm(3) = 0._wp + end subroutine s_geo_triangle + +end module m_cl4_mini + +program test_cl4 + use m_cl4_mini + implicit none + + integer, parameter :: wp = kind(1.0d0) + integer, parameter :: N = 64, NGPS = 32 + type(ghost_point), allocatable :: gps(:) + integer :: i + + call s_cl4_init(N, N) + + allocate(gps(NGPS)) + do i = 1, NGPS + gps(i)%loc = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1] + gps(i)%ib_patch_id = 1 + gps(i)%x_periodicity = 0 + gps(i)%y_periodicity = 0 + gps(i)%z_periodicity = 0 + gps(i)%levelset = 0._wp + gps(i)%levelset_norm = 0._wp + gps(i)%interp_coeffs = 0._wp + gps(i)%slip = .false. + end do + + call s_cl4_dispatch(gps, NGPS) + + write(*,'(a,f12.6)') 'levelset(1) = ', gps(1)%levelset + deallocate(gps) + call s_cl4_finalize() + write(*,'(a)') 'OK' + +end program test_cl4 diff --git a/mini/m_cl5_mini.f90 b/mini/m_cl5_mini.f90 new file mode 100644 index 0000000000..2b23ef549d --- /dev/null +++ b/mini/m_cl5_mini.f90 @@ -0,0 +1,304 @@ +! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633 +! +! ICE trigger: multiple !$omp target teams loop regions in one module, each +! calling a different !$omp declare target subroutine in the SAME module. +! Subroutines access allocatable module arrays and a derived type with +! array-of-arrays fields (interp_coeffs(2,2,2)) plus character field. +! +! Compile (expect ICE): +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -c m_cl5_mini.f90 +! Workaround (passes): +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline -c m_cl5_mini.f90 + +module m_cl5 + + implicit none + private + + integer, parameter :: wp = kind(1.0d0) + + type :: patch_t + integer :: geometry + real(wp) :: radius, cx, cy, cz + real(wp) :: lx, ly, lz + real(wp), dimension(1:3,1:3) :: rot, rot_inv + real(wp), dimension(1:3) :: offset + character(LEN=200) :: label ! char field in device-mapped type + end type patch_t + + type :: gp_t + integer, dimension(3) :: loc + real(wp), dimension(2, 2, 2) :: interp_coeffs ! 3D array in struct + integer :: pid + real(wp) :: levelset + real(wp), dimension(3) :: levelset_norm + integer :: xp, yp, zp + end type gp_t + + type(patch_t), dimension(8), target :: patches + real(wp), allocatable :: xc(:), yc(:), zc(:) + !$omp declare target(patches, xc, yc, zc) + + public :: gp_t, s_init, s_finalize, s_dispatch + +contains + + subroutine s_init(n) + integer, intent(in) :: n + integer :: i + allocate(xc(n), yc(n), zc(n)) + do i = 1, n + xc(i) = real(i,wp)*0.1_wp + yc(i) = real(i,wp)*0.1_wp + zc(i) = real(i,wp)*0.1_wp + end do + do i = 1, 8 + patches(i)%geometry = i + patches(i)%radius = 0.5_wp; patches(i)%cx = 0.5_wp + patches(i)%cy = 0.5_wp; patches(i)%cz = 0.5_wp + patches(i)%lx = 0.4_wp; patches(i)%ly = 0.4_wp; patches(i)%lz = 0.4_wp + patches(i)%rot = 0.0_wp + patches(i)%rot(1,1) = 1.0_wp; patches(i)%rot(2,2) = 1.0_wp; patches(i)%rot(3,3) = 1.0_wp + patches(i)%rot_inv = patches(i)%rot + patches(i)%offset = 0.0_wp + patches(i)%label = 'none' + end do + !$omp target enter data map(to: patches, xc, yc, zc) + end subroutine s_init + + subroutine s_finalize() + !$omp target exit data map(delete: patches, xc, yc, zc) + deallocate(xc, yc, zc) + end subroutine s_finalize + + subroutine s_dispatch(gps, n) + type(gp_t), intent(inout) :: gps(:) + integer, intent(in) :: n + integer :: i + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 1) call s_geo1(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 2) call s_geo2(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 3) call s_geo3(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 4) call s_geo4(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 5) call s_geo5(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 6) call s_geo6(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 7) call s_geo7(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 8) call s_geo8(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 9) call s_geo9(gps(i)) + end do + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + if (patches(gps(i)%pid)%geometry == 10) call s_geo10(gps(i)) + end do + + end subroutine s_dispatch + + subroutine s_geo1(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j, k + real(wp), dimension(3) :: dv + real(wp) :: d + id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = xc(i) - patches(id)%cx - real(gp%xp,wp)*0.1_wp + dv(2) = yc(j) - patches(id)%cy - real(gp%yp,wp)*0.1_wp + dv(3) = zc(k) - patches(id)%cz - real(gp%zp,wp)*0.1_wp + d = sqrt(sum(dv**2)) + gp%levelset = d - patches(id)%radius + if (d > 0._wp) then; gp%levelset_norm = dv/d; else; gp%levelset_norm = 0._wp; end if + end subroutine s_geo1 + + subroutine s_geo2(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j, k + real(wp) :: dx, dy, dz + id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dx = abs(xc(i) - patches(id)%cx) - patches(id)%lx/2._wp + dy = abs(yc(j) - patches(id)%cy) - patches(id)%ly/2._wp + dz = abs(zc(k) - patches(id)%cz) - patches(id)%lz/2._wp + gp%levelset = max(dx, max(dy, dz)) + gp%levelset_norm(1) = sign(1._wp, xc(i) - patches(id)%cx) + gp%levelset_norm(2) = sign(1._wp, yc(j) - patches(id)%cy) + gp%levelset_norm(3) = sign(1._wp, zc(k) - patches(id)%cz) + end subroutine s_geo2 + + subroutine s_geo3(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j, k + real(wp), dimension(3) :: dv, ldv + real(wp) :: r2d + id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz + ldv = matmul(patches(id)%rot_inv, dv) + r2d = sqrt(ldv(1)**2 + ldv(2)**2) + gp%levelset = r2d - patches(id)%radius + if (r2d > 0._wp) then + gp%levelset_norm(1) = ldv(1)/r2d; gp%levelset_norm(2) = ldv(2)/r2d + else + gp%levelset_norm(1) = 0._wp; gp%levelset_norm(2) = 0._wp + end if + gp%levelset_norm(3) = 0._wp + end subroutine s_geo3 + + subroutine s_geo4(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j, k + real(wp), dimension(3) :: dv, ldv + real(wp) :: d + id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz + ldv = matmul(patches(id)%rot_inv, dv) - patches(id)%offset + d = sqrt(sum(ldv**2)) + gp%levelset = d - patches(id)%radius + gp%levelset_norm = ldv / max(d, 1.0e-12_wp) + end subroutine s_geo4 + + subroutine s_geo5(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j + real(wp) :: ax, ay, d + real(wp), dimension(3) :: dv + id = gp%pid; i = gp%loc(1); j = gp%loc(2) + ax = patches(id)%lx/2._wp; ay = patches(id)%ly/2._wp + dv(1) = (xc(i) - patches(id)%cx)/ax + dv(2) = (yc(j) - patches(id)%cy)/ay + dv(3) = 0._wp + d = sqrt(dv(1)**2 + dv(2)**2) + gp%levelset = d - 1._wp + if (d > 0._wp) then; gp%levelset_norm = dv/d; else; gp%levelset_norm = 0._wp; end if + end subroutine s_geo5 + + subroutine s_geo6(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j + real(wp) :: dx, dy + id = gp%pid; i = gp%loc(1); j = gp%loc(2) + dx = abs(xc(i) - patches(id)%cx) - patches(id)%lx/2._wp + dy = abs(yc(j) - patches(id)%cy) - patches(id)%ly/2._wp + gp%levelset = max(dx, dy) + gp%levelset_norm(1) = sign(1._wp, xc(i) - patches(id)%cx) + gp%levelset_norm(2) = sign(1._wp, yc(j) - patches(id)%cy) + gp%levelset_norm(3) = 0._wp + end subroutine s_geo6 + + subroutine s_geo7(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j + real(wp), dimension(3) :: dv, ldv + real(wp) :: d + id = gp%pid; i = gp%loc(1); j = gp%loc(2) + dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = 0._wp + ldv = matmul(patches(id)%rot_inv, dv) - patches(id)%offset + d = sqrt(sum(ldv(1:2)**2)) + gp%levelset = d - patches(id)%radius + gp%levelset_norm = ldv / max(d, 1.0e-12_wp) + end subroutine s_geo7 + + subroutine s_geo8(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j + real(wp) :: d1, d2, d3 + real(wp), dimension(3) :: dv + id = gp%pid; i = gp%loc(1); j = gp%loc(2) + dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = 0._wp + d1 = dv(2) + patches(id)%ly/2._wp + d2 = -dv(2) + dv(1)*sqrt(3._wp)/3._wp + patches(id)%ly/3._wp + d3 = -dv(2) - dv(1)*sqrt(3._wp)/3._wp + patches(id)%ly/3._wp + gp%levelset = min(max(-d1,-d2), max(-d1,-d3)) + gp%levelset_norm(1) = dv(1)/max(sqrt(sum(dv(1:2)**2)),1e-12_wp) + gp%levelset_norm(2) = dv(2)/max(sqrt(sum(dv(1:2)**2)),1e-12_wp) + gp%levelset_norm(3) = 0._wp + end subroutine s_geo8 + + subroutine s_geo9(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j + real(wp), dimension(3) :: dv + real(wp) :: d + id = gp%pid; i = gp%loc(1); j = gp%loc(2) + dv(1) = xc(i) - patches(id)%cx + patches(id)%lx*patches(id)%offset(1) + dv(2) = yc(j) - patches(id)%cy + patches(id)%ly*patches(id)%offset(2) + dv(3) = 0._wp + d = sqrt(sum(dv**2)) + gp%levelset = d - patches(id)%radius + gp%levelset_norm = dv / max(d, 1.0e-12_wp) + end subroutine s_geo9 + + subroutine s_geo10(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j, k + real(wp), dimension(3) :: dv, ldv + real(wp) :: d + id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = xc(i) - patches(id)%cx; dv(2) = yc(j) - patches(id)%cy; dv(3) = zc(k) - patches(id)%cz + ldv = matmul(patches(id)%rot, dv) + d = sqrt(sum(ldv**2)) + gp%levelset = d - patches(id)%radius + gp%levelset_norm = matmul(patches(id)%rot_inv, ldv) / max(d, 1.0e-12_wp) + end subroutine s_geo10 + +end module m_cl5 + +program test_cl5 + use m_cl5 + implicit none + integer, parameter :: N = 16 + type(gp_t) :: gps(N) + integer :: i + call s_init(N) + do i = 1, N + gps(i)%levelset = 0.0d0 + gps(i)%levelset_norm = 0.0d0 + gps(i)%interp_coeffs = 0.0d0 + gps(i)%loc = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1] + gps(i)%pid = mod(i-1,8)+1 + gps(i)%xp = 0; gps(i)%yp = 0; gps(i)%zp = 0 + end do + call s_dispatch(gps, N) + write(*,*) 'levelset(1) =', gps(1)%levelset + call s_finalize() +end program test_cl5 diff --git a/mini/m_ice_min.f90 b/mini/m_ice_min.f90 new file mode 100644 index 0000000000..8b342ead7e --- /dev/null +++ b/mini/m_ice_min.f90 @@ -0,0 +1,61 @@ +! ABSOLUTE MINIMUM reproducer for ifx 2025.1.1 SPIR-V ICE #5633 +! +! Trigger: matmul() inside a !$omp declare target subroutine that is +! inlined into a !$omp target teams loop kernel. +! +! ifx version: 2025.1.1 20250418 Hardware: Intel GPU Max 1100 (Ponte Vecchio) +! +! ICE (O1/O2/O3): +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 m_ice_min.f90 +! OK (inlining disabled): +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline m_ice_min.f90 + +module m_min + + implicit none + private + + integer, parameter :: wp = kind(1.0d0) + + real(wp), dimension(3, 3) :: A ! module-level 3x3 matrix + real(wp), dimension(3) :: x ! module-level vector + real(wp), dimension(3) :: b ! result vector + !$omp declare target(A, x, b) + + public :: s_run + +contains + + subroutine s_run(n) + integer, intent(in) :: n + integer :: i + + A(1,:) = [1._wp, 0._wp, 0._wp] + A(2,:) = [0._wp, 1._wp, 0._wp] + A(3,:) = [0._wp, 0._wp, 1._wp] + x = [1._wp, 2._wp, 3._wp] + b = 0._wp + !$omp target enter data map(to: A, x, b) + + !$omp target teams loop private(i) map(to:A,x) map(tofrom:b) + do i = 1, n + call s_apply(i) + end do + + !$omp target exit data map(from: b) + end subroutine s_run + + subroutine s_apply(k) + !$omp declare target + integer, intent(in) :: k + ! matmul inside declare-target sub -- ICE trigger when inlined + b = real(k, wp) * matmul(A, x) + end subroutine s_apply + +end module m_min + +program test_min + use m_min + implicit none + call s_run(4) +end program test_min diff --git a/mini/m_ice_repro.f90 b/mini/m_ice_repro.f90 new file mode 100644 index 0000000000..d2dbe141de --- /dev/null +++ b/mini/m_ice_repro.f90 @@ -0,0 +1,125 @@ +! Minimal reproducer for ifx 2025.1.1 SPIR-V ICE #5633 +! +! Trigger: matmul() called with a derived-type member (dimension(3,3) array) +! inside an !$omp declare target subroutine, called from an +! !$omp target teams loop. Module-level allocatable arrays are also +! accessed from the declare-target routine. +! +! ifx version: 2025.1.1 20250418 (iimpi/2025a) +! Hardware: Intel GPU Max 1100 (Ponte Vecchio XT) +! +! Compile (ICE -- error #5633: segmentation violation in SPIR-V backend): +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -c m_ice_repro.f90 +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O2 -c m_ice_repro.f90 +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O1 -c m_ice_repro.f90 +! +! Workaround (OK -- inlining suppressed so matmul is called via link, not inline): +! ifx -free -fiopenmp -fopenmp-targets=spir64 -O3 -fno-inline -c m_ice_repro.f90 + +module m_ice + + implicit none + private + + integer, parameter :: wp = kind(1.0d0) + + type :: patch_t + real(wp), dimension(3, 3) :: mat ! 3x3 member -- key field + real(wp) :: cx, cy, cz, radius + end type patch_t + + type :: gp_t + integer :: pid + integer, dimension(3) :: loc + real(wp) :: levelset + real(wp), dimension(3) :: norm + end type gp_t + + type(patch_t), dimension(4) :: patches + real(wp), allocatable :: xc(:), yc(:), zc(:) + !$omp declare target(patches, xc, yc, zc) + + public :: gp_t, s_init, s_finalize, s_dispatch + +contains + + subroutine s_init(n) + integer, intent(in) :: n + integer :: i + allocate(xc(n), yc(n), zc(n)) + do i = 1, n + xc(i) = real(i, wp)*0.1_wp + yc(i) = real(i, wp)*0.1_wp + zc(i) = real(i, wp)*0.1_wp + end do + do i = 1, 4 + patches(i)%cx = 0.5_wp; patches(i)%cy = 0.5_wp; patches(i)%cz = 0.5_wp + patches(i)%radius = 0.25_wp + patches(i)%mat = 0.0_wp + patches(i)%mat(1,1) = 1.0_wp + patches(i)%mat(2,2) = 1.0_wp + patches(i)%mat(3,3) = 1.0_wp + end do + !$omp target enter data map(to: patches, xc, yc, zc) + end subroutine s_init + + subroutine s_finalize() + !$omp target exit data map(delete: patches, xc, yc, zc) + deallocate(xc, yc, zc) + end subroutine s_finalize + + ! Single !$omp target teams loop calling a single declare-target sub. + subroutine s_dispatch(gps, n) + type(gp_t), intent(inout) :: gps(:) + integer, intent(in) :: n + integer :: i + + !$omp target teams loop private(i) map(tofrom:gps) + do i = 1, n + call s_apply(gps(i)) + end do + + end subroutine s_dispatch + + ! !$omp declare target subroutine in the SAME module. + ! ICE fires when the inliner pulls this into the target loop at -O1/-O2/-O3. + subroutine s_apply(gp) + !$omp declare target + type(gp_t), intent(inout) :: gp + integer :: id, i, j, k + real(wp), dimension(3) :: dv, ldv + real(wp) :: d + id = gp%pid; i = gp%loc(1); j = gp%loc(2); k = gp%loc(3) + dv(1) = xc(i) - patches(id)%cx + dv(2) = yc(j) - patches(id)%cy + dv(3) = zc(k) - patches(id)%cz + ! matmul with a derived-type member (patches(id)%mat) is the key trigger + ldv = matmul(patches(id)%mat, dv) + d = sqrt(sum(ldv**2)) + gp%levelset = d - patches(id)%radius + if (d > 0.0_wp) then + gp%norm = ldv/d + else + gp%norm = 0.0_wp + end if + end subroutine s_apply + +end module m_ice + +program test_ice + use m_ice + implicit none + integer, parameter :: N = 16 + type(gp_t) :: gps(N) + integer :: i + call s_init(N) + do i = 1, N + gps(i)%pid = mod(i-1,4)+1 + gps(i)%loc = [mod(i,N)+1, mod(i*2,N)+1, mod(i*3,N)+1] + gps(i)%levelset = 0.0d0 + gps(i)%norm = 0.0d0 + end do + call s_dispatch(gps, N) + write(*,*) 'levelset(1) =', gps(1)%levelset + call s_finalize() +end program test_ice diff --git a/mini/run_cl4.sh b/mini/run_cl4.sh new file mode 100755 index 0000000000..742c63cdca --- /dev/null +++ b/mini/run_cl4.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 0:20:00 +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl4.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl4.log +#SBATCH -J cl4-ice-repro + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 +export I_MPI_F90=ifx + +F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90 +MINI=/scratch/user/u.sb27915/MFC-intel/mini +BASE="-free -fiopenmp -fopenmp-targets=spir64 -fp-model=precise -march=native -mno-avx512fp16" + +echo "=== m_cl4_mini (single-module same-module declare-target + char struct) ===" +$F90 --version 2>&1 | head -1 +echo "" + +try() { + local desc="$1"; shift + local out + rm -f /tmp/cl4_$$.o /tmp/cl4_$$.x m_cl4_mini.mod + out=$($F90 $BASE "$@" "$MINI/m_cl4_mini.f90" -o /tmp/cl4_$$.x 2>&1) + local rc=$? + rm -f /tmp/cl4_$$.o /tmp/cl4_$$.x m_cl4_mini.mod + if [ $rc -eq 0 ]; then + echo "OK : $desc" + elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then + echo "ICE : $desc" + echo " $(echo "$out" | grep -iE "5623|5633|internal abort|error #" | head -1)" + else + echo "ERR : $desc" + echo " $(echo "$out" | grep -v "^$\|warning\|remark\|#5117\|preprocessor" | head -3)" + fi +} + +try "-O3 (baseline)" -O3 +try "-O2" -O2 +try "-O1" -O1 +try "-O3 -fno-inline" -O3 -fno-inline +try "-O3 -fno-vectorize" -O3 -fno-vectorize +try "-O3 -fno-unroll-loops" -O3 -fno-unroll-loops + +echo "" +echo "=== Done ===" diff --git a/mini/run_cl5.sh b/mini/run_cl5.sh new file mode 100755 index 0000000000..4a1f0e02d7 --- /dev/null +++ b/mini/run_cl5.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 0:15:00 +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5.log +#SBATCH -J cl5-ice-reduce + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 +export I_MPI_F90=ifx + +F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90 +MINI=/scratch/user/u.sb27915/MFC-intel/mini +BASE="-free -fiopenmp -fopenmp-targets=spir64" + +echo "=== m_cl5_mini (reduced: char+array struct + 10 target loops + 10 same-module decl-target subs) ===" +$F90 --version 2>&1 | head -1 +echo "" + +try() { + local desc="$1"; shift + local out + rm -f /tmp/cl5_$$.x m_cl5.mod + out=$($F90 $BASE "$@" "$MINI/m_cl5_mini.f90" -o /tmp/cl5_$$.x 2>&1) + local rc=$? + rm -f /tmp/cl5_$$.x m_cl5.mod + if [ $rc -eq 0 ]; then + echo "OK : $desc" + elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then + echo "ICE : $desc" + echo " $(echo "$out" | grep -iE "5623|5633|internal abort|error #" | head -1)" + else + echo "ERR : $desc" + echo " $(echo "$out" | grep -v "^$\|warning\|remark\|#5117\|preprocessor" | head -3)" + fi +} + +try "-O3 (baseline)" -O3 +try "-O3 -fno-inline" -O3 -fno-inline +try "-O2" -O2 +try "-O1" -O1 + +echo "" +echo "=== Done ===" diff --git a/mini/run_cl5b.sh b/mini/run_cl5b.sh new file mode 100755 index 0000000000..f21189914d --- /dev/null +++ b/mini/run_cl5b.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 0:20:00 +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5b.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5b.log +#SBATCH -J cl5b-ice-bisect + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 +export I_MPI_F90=ifx + +F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90 +MINI=/scratch/user/u.sb27915/MFC-intel/mini +BASE="-free -fiopenmp -fopenmp-targets=spir64" +SRC="$MINI/m_cl5_mini.f90" + +echo "=== m_cl5 bisection: what is the minimum ICE trigger? ===" +$F90 --version 2>&1 | head -1 +echo "" + +try_src() { + local desc="$1"; local src="$2"; shift 2 + local out + rm -f /tmp/cl5b_$$.x m_cl5.mod + out=$($F90 $BASE "$@" "$src" -o /tmp/cl5b_$$.x 2>&1) + local rc=$? + rm -f /tmp/cl5b_$$.x m_cl5.mod + if [ $rc -eq 0 ]; then + echo "OK : $desc" + elif echo "$out" | grep -qiE "5623|5633|internal abort|segmentation"; then + echo "ICE : $desc" + else + echo "ERR : $desc" + echo " $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)" + fi +} + +# Baseline confirms ICE +try_src "baseline (full cl5)" "$SRC" -O3 + +# --- Remove char field --- +sed '/character(LEN=200)/d; s/patches(i)%label.*=.*//g' "$SRC" > /tmp/nochar_$$.f90 +try_src "no char(LEN=200) field" /tmp/nochar_$$.f90 -O3 +rm -f /tmp/nochar_$$.f90 + +# --- Remove interp_coeffs(2,2,2) --- +sed '/interp_coeffs/d' "$SRC" > /tmp/no_ic_$$.f90 +try_src "no interp_coeffs(2,2,2)" /tmp/no_ic_$$.f90 -O3 +rm -f /tmp/no_ic_$$.f90 + +# --- Keep only 5 loops (geo1-5) --- +python3 - "$SRC" > /tmp/cl5_5loops_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +# Remove geo6-geo10 loops from s_dispatch +for g in range(6, 11): + src = re.sub( + r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g), + '\n', src) +# Remove geo6-geo10 subroutines +for g in range(6, 11): + src = re.sub( + r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g), + '\n', src, flags=re.DOTALL) +open('/dev/stdout', 'w').write(src) +PYEOF +try_src "5 loops only (geo1-5)" /tmp/cl5_5loops_$$.f90 -O3 +rm -f /tmp/cl5_5loops_$$.f90 + +# --- Remove rot/rot_inv (3x3 arrays) from patch_t --- +sed '/real(wp), dimension(1:3,1:3)/d; /patches(i)%rot/d; /patches(id)%rot/d; /matmul/d' "$SRC" > /tmp/norot_$$.f90 +try_src "no rotation matrices" /tmp/norot_$$.f90 -O3 +rm -f /tmp/norot_$$.f90 + +# --- Remove allocatable xc/yc/zc (use constants instead) --- +python3 - "$SRC" > /tmp/noxc_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +# Replace allocatable declarations with fixed arrays +src = src.replace('real(wp), allocatable :: xc(:), yc(:), zc(:)', 'real(wp) :: xc(64), yc(64), zc(64)') +# Remove allocate/deallocate +src = re.sub(r'\s*allocate\(xc.*?\)\n', '\n', src) +src = re.sub(r'\s*deallocate\(xc.*?\)\n', '\n', src) +src = src.replace('integer, intent(in) :: n\n integer :: i\n allocate', 'integer, intent(in) :: n\n integer :: i\n ! allocate') +open('/dev/stdout', 'w').write(src) +PYEOF +try_src "no allocatable xc/yc/zc" /tmp/noxc_$$.f90 -O3 +rm -f /tmp/noxc_$$.f90 + +echo "" +echo "=== Done ===" diff --git a/mini/run_cl5c.sh b/mini/run_cl5c.sh new file mode 100755 index 0000000000..6734f50d91 --- /dev/null +++ b/mini/run_cl5c.sh @@ -0,0 +1,116 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 0:20:00 +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/cl5c.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/cl5c.log +#SBATCH -J cl5c-ice-matmul + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 +export I_MPI_F90=ifx + +F90=/sw/eb/sw/impi/2021.15.0-intel-compilers-2025.1.1/mpi/2021.15/bin/mpif90 +MINI=/scratch/user/u.sb27915/MFC-intel/mini +BASE="-free -fiopenmp -fopenmp-targets=spir64" +SRC="$MINI/m_cl5_mini.f90" + +echo "=== matmul vs rot(3,3) field isolation test ===" +$F90 --version 2>&1 | head -1 +echo "" + +try_f90() { + local desc="$1"; local src="$2"; shift 2 + local out + rm -f /tmp/cl5c_$$.x m_cl5.mod + out=$($F90 $BASE "$@" "$src" -o /tmp/cl5c_$$.x 2>&1) + local rc=$? + rm -f /tmp/cl5c_$$.x m_cl5.mod + if [ $rc -eq 0 ]; then echo "OK : $desc" + elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then + echo "ICE : $desc" + else + echo "ERR : $desc" + echo " $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)" + fi +} + +# A: Keep rot(3,3) field but replace matmul with element-wise multiply +python3 - "$SRC" > /tmp/no_matmul_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +# Replace matmul(X, Y) with sum of manually computed dot products (fake matmul) +# Just replace matmul calls with equivalent array expressions +src = re.sub(r'matmul\(patches\(id\)%rot_inv, dv\)', 'dv', src) +src = re.sub(r'matmul\(patches\(id\)%rot_inv, xyz_local\)', 'dv', src) +src = re.sub(r'matmul\(patches\(id\)%rot, dv\)', 'dv', src) +src = re.sub(r'matmul\(patches\(id\)%rot_inv, ldv\)', 'ldv', src) +open('/dev/stdout', 'w').write(src) +PYEOF +try_f90 "rot(3,3) field kept, matmul REMOVED (identity)" /tmp/no_matmul_$$.f90 -O3 +rm -f /tmp/no_matmul_$$.f90 + +# B: Remove rot(3,3) fields but keep matmul via a local array +python3 - "$SRC" > /tmp/no_rotfield_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +# Remove rot/rot_inv struct fields +src = re.sub(r'\s*real\(wp\), dimension\(1:3,1:3\) :: rot, rot_inv\n', '\n', src) +# Replace struct accesses with local identity +src = re.sub(r'patches\(id\)%rot_inv', 'reshape([1._wp,0._wp,0._wp,0._wp,1._wp,0._wp,0._wp,0._wp,1._wp],[3,3])', src) +src = re.sub(r'patches\(id\)%rot\b', 'reshape([1._wp,0._wp,0._wp,0._wp,1._wp,0._wp,0._wp,0._wp,1._wp],[3,3])', src) +# Remove initialization of rot/rot_inv in s_init +src = re.sub(r'\s*patches\(i\)%rot.*\n', '\n', src) +open('/dev/stdout', 'w').write(src) +PYEOF +try_f90 "rot(3,3) field REMOVED, matmul kept (local identity)" /tmp/no_rotfield_$$.f90 -O3 +rm -f /tmp/no_rotfield_$$.f90 + +# C: Both removed — equivalent to previous "no rotation" test +python3 - "$SRC" > /tmp/neither_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +src = re.sub(r'\s*real\(wp\), dimension\(1:3,1:3\) :: rot, rot_inv\n', '\n', src) +src = re.sub(r'matmul\(.*?\)', 'dv', src) +src = re.sub(r'\s*patches\(i\)%rot.*\n', '\n', src) +open('/dev/stdout', 'w').write(src) +PYEOF +try_f90 "both rot(3,3) AND matmul removed" /tmp/neither_$$.f90 -O3 +rm -f /tmp/neither_$$.f90 + +# D: Original with just geo3 (which has matmul) — minimum loop count? +python3 - "$SRC" > /tmp/geo3only_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +# Keep only 1 loop with matmul (geo3) +for g in [1,2,4,5,6,7,8,9,10]: + src = re.sub( + r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g), + '\n', src) + src = re.sub( + r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g), + '\n', src, flags=re.DOTALL) +open('/dev/stdout', 'w').write(src) +PYEOF +try_f90 "1 loop (geo3 only, has matmul)" /tmp/geo3only_$$.f90 -O3 +rm -f /tmp/geo3only_$$.f90 + +# E: 2 loops with matmul (geo3 + geo4) +python3 - "$SRC" > /tmp/geo34_$$.f90 << 'PYEOF' +import sys, re +src = open(sys.argv[1]).read() +for g in [1,2,5,6,7,8,9,10]: + src = re.sub( + r'\s*!\$omp target teams loop[^\n]*\n\s*do i = 1, n\n\s*if \(patches.*geometry == %d\) call s_geo%d.*\n\s*end do\n' % (g,g), + '\n', src) + src = re.sub( + r'\n\s*subroutine s_geo%d\(gp\).*?end subroutine s_geo%d\n' % (g, g), + '\n', src, flags=re.DOTALL) +open('/dev/stdout', 'w').write(src) +PYEOF +try_f90 "2 loops with matmul (geo3 + geo4)" /tmp/geo34_$$.f90 -O3 +rm -f /tmp/geo34_$$.f90 + +echo "" +echo "=== Done ===" diff --git a/mini/run_ice_min.sh b/mini/run_ice_min.sh new file mode 100755 index 0000000000..41a698074b --- /dev/null +++ b/mini/run_ice_min.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 0:15:00 +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/ice_min.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/ice_min.log +#SBATCH -J ice-min + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 + +IFX=/sw/eb/sw/intel-compilers/2025.1.1/compiler/2025.1/bin/ifx +MINI=/scratch/user/u.sb27915/MFC-intel/mini +BASE="-free -fiopenmp -fopenmp-targets=spir64" + +echo "=== ifx #5633 absolute minimum: matmul in declare-target sub called from target teams loop ===" +$IFX --version 2>&1 | head -1 +echo "" + +try() { + local desc="$1"; shift + local out + rm -f /tmp/icemin_$$.x m_min.mod + out=$($IFX $BASE "$@" "$MINI/m_ice_min.f90" -o /tmp/icemin_$$.x 2>&1) + local rc=$? + rm -f /tmp/icemin_$$.x m_min.mod + if [ $rc -eq 0 ]; then + echo "OK : $desc" + elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then + echo "ICE : $desc" + echo " $(echo "$out" | grep -iE "5633|error #" | head -1)" + else + echo "ERR : $desc" + echo " $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)" + fi +} + +try "-O3" -O3 +try "-O2" -O2 +try "-O1" -O1 +try "-O0" -O0 +try "-O3 -fno-inline" -O3 -fno-inline + +echo "" +echo "--- Variant: replace matmul with element-wise loop (no matmul intrinsic) ---" +python3 - "$MINI/m_ice_min.f90" > /tmp/no_matmul_$$.f90 << 'PYEOF' +import sys +src = open(sys.argv[1]).read() +src = src.replace( + " b = real(k, wp) * matmul(A, x)", + " integer :: ii, jj\n do ii = 1, 3\n b(ii) = 0._wp\n do jj = 1, 3\n b(ii) = b(ii) + A(ii,jj)*x(jj)\n end do\n b(ii) = b(ii) * real(k, wp)\n end do") +open('/dev/stdout', 'w').write(src) +PYEOF +rm -f /tmp/icemin_nm_$$.x m_min.mod +out=$($IFX $BASE -O3 /tmp/no_matmul_$$.f90 -o /tmp/icemin_nm_$$.x 2>&1) +rc=$? +rm -f /tmp/icemin_nm_$$.x m_min.mod /tmp/no_matmul_$$.f90 +if [ $rc -eq 0 ]; then + echo "OK : -O3 no matmul (manual loops) -- matmul intrinsic IS the trigger" +elif echo "$out" | grep -qiE "5633|segmentation"; then + echo "ICE : -O3 no matmul -- ICE even without matmul intrinsic" +else + echo "ERR : $(echo "$out" | grep -v '^$' | head -1)" +fi + +echo "" +echo "=== Done ===" diff --git a/mini/run_ice_repro.sh b/mini/run_ice_repro.sh new file mode 100755 index 0000000000..32e582f6cc --- /dev/null +++ b/mini/run_ice_repro.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH -p pvc +#SBATCH -N 1 +#SBATCH --gres=gpu:pvc:1 +#SBATCH -t 0:15:00 +#SBATCH -o /scratch/user/u.sb27915/MFC-intel/mini/ice_repro.log +#SBATCH -e /scratch/user/u.sb27915/MFC-intel/mini/ice_repro.log +#SBATCH -J ice-repro-final + +source /etc/profile +module load iimpi/2025a imkl/2025.1.0 +export I_MPI_F90=ifx + +# Use ifx directly (not mpif90) since this is a single-file reproducer +IFX=/sw/eb/sw/intel-compilers/2025.1.1/compiler/2025.1/bin/ifx +MINI=/scratch/user/u.sb27915/MFC-intel/mini +BASE="-free -fiopenmp -fopenmp-targets=spir64" + +echo "=== ifx #5633 minimal reproducer: matmul(derived_type%mat, vec) in declare-target sub ===" +$IFX --version 2>&1 | head -1 +echo "" + +try() { + local desc="$1"; shift + local out + rm -f /tmp/ice_$$.x m_ice.mod + out=$($IFX $BASE "$@" "$MINI/m_ice_repro.f90" -o /tmp/ice_$$.x 2>&1) + local rc=$? + rm -f /tmp/ice_$$.x m_ice.mod + if [ $rc -eq 0 ]; then + echo "OK : $desc" + elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then + echo "ICE : $desc" + echo " $(echo "$out" | grep -iE "5633|error #" | head -1)" + else + echo "ERR : $desc" + echo " $(echo "$out" | grep -v "^$\|warning\|remark\|#5117" | head -2)" + fi +} + +try "-O3" -O3 +try "-O2" -O2 +try "-O1" -O1 +try "-O3 -fno-inline (fix)" -O3 -fno-inline +try "-O0" -O0 + +echo "" +echo "--- Variant: matmul on local array (no struct member) ---" +python3 - "$MINI/m_ice_repro.f90" > /tmp/local_mat_$$.f90 << 'PYEOF' +import sys +src = open(sys.argv[1]).read() +# Add local_mat to the declaration block (before first executable) +src = src.replace( + ' real(wp), dimension(3) :: dv, ldv\n real(wp) :: d', + ' real(wp), dimension(3) :: dv, ldv\n real(wp), dimension(3,3) :: local_mat\n real(wp) :: d') +# Copy struct member to local before matmul +src = src.replace( + ' ! matmul with a derived-type member (patches(id)%mat) is the key trigger\n ldv = matmul(patches(id)%mat, dv)', + ' local_mat = patches(id)%mat\n ! matmul with LOCAL copy (not struct member directly)\n ldv = matmul(local_mat, dv)') +open('/dev/stdout', 'w').write(src) +PYEOF +rm -f /tmp/ice_lm_$$.x m_ice.mod +out=$($IFX $BASE -O3 /tmp/local_mat_$$.f90 -o /tmp/ice_lm_$$.x 2>&1) +rc=$? +rm -f /tmp/ice_lm_$$.x m_ice.mod /tmp/local_mat_$$.f90 +if [ $rc -eq 0 ]; then + echo "OK : -O3 with local_mat copy -- direct struct access IS the trigger" +elif echo "$out" | grep -qiE "5633|internal abort|segmentation"; then + echo "ICE : -O3 with local_mat copy -- copying to local doesn't help" +else + echo "ERR : $(echo "$out" | grep -v '^$' | head -1)" +fi + +echo "" +echo "=== Done ===" diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp index 7620e7607f..00f26a99a2 100644 --- a/src/common/include/omp_macros.fpp +++ b/src/common/include/omp_macros.fpp @@ -2,7 +2,7 @@ #:set NVIDIA_COMPILER_ID="NVHPC" #:set PGI_COMPILER_ID="PGI" -#:set INTEL_COMPILER_ID="Intel" +#:set INTEL_COMPILER_ID="IntelLLVM" #:set CCE_COMPILER_ID="Cray" #:set AMD_COMPILER_ID="LLVMFlang" @@ -182,6 +182,20 @@ #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) ' #:elif MFC_COMPILER == AMD_COMPILER_ID #:set omp_start_directive = '!$omp target teams distribute parallel do ' + #:elif MFC_COMPILER == INTEL_COMPILER_ID + #! Intel OMP 5.2: bind(a,b) is invalid. Drop explicit firstprivate list: + #! ifx rejects firstprivate for declare-target module variables (#7655). + #! OMP 5.0 rule: unclaused scalars in target constructs are firstprivate + #! by default, so scalar locals (e.g. gp_layers_z) are covered implicitly. + #! declare-target vars (e.g. gp_layers) are device-resident -- no mapping + #! needed. defaultmap(firstprivate:scalar) also unsupported by ifx (#9061). + #:set omp_start_directive = '!$omp target teams loop ' + #:set clause_val = collapse_val.strip('\n') + & + & default_val.strip('\n') + GEN_PRIVATE_STR(private, False).strip('\n') + & + & reduction_val.strip('\n') + copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') #:else #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' #:endif @@ -197,6 +211,8 @@ #:set omp_end_directive = '!$omp end target teams distribute parallel do simd' #:elif MFC_COMPILER == AMD_COMPILER_ID #:set omp_end_directive = '!$omp end target teams distribute parallel do' + #:elif MFC_COMPILER == INTEL_COMPILER_ID + #:set omp_end_directive = '!$omp end target teams loop' #:else #:set omp_end_directive = '!$omp end target teams loop' #:endif @@ -218,7 +234,9 @@ #:set function_name_val = '' #:endif - #:if MFC_COMPILER == AMD_COMPILER_ID + #:if MFC_COMPILER == AMD_COMPILER_ID or MFC_COMPILER == INTEL_COMPILER_ID + #! AMD: device_type unsupported. Intel: OpenMP 5.2 requires an enter/to/link/local + #! clause alongside device_type; omit device_type entirely for both. #:set clause_val = '' #:else #:set clause_val = nohost_val.strip('\n') @@ -374,4 +392,10 @@ $:code #:endif #:enddef + +#:def OMP_MKL_DISPATCH() + #:if MFC_COMPILER == INTEL_COMPILER_ID + !$omp dispatch + #:endif +#:enddef ! New line at end of file is required for FYPP diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index b1382ec49a..5f3ec2ccb0 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -303,4 +303,12 @@ #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID) #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID) + +! Dispatch to oneMKL GPU FFT via OpenMP dispatch construct (Intel GPU only) +#:def GPU_MKL_DISPATCH() + #:set omp_code = OMP_MKL_DISPATCH() +#if defined(MFC_OpenMP) + $:omp_code +#endif +#:enddef ! New line at end of file is required for FYPP diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp index 36bee0a23a..fc3f783e98 100644 --- a/src/common/include/shared_parallel_macros.fpp +++ b/src/common/include/shared_parallel_macros.fpp @@ -1,12 +1,13 @@ #:set NVIDIA_COMPILER_ID="NVHPC" #:set PGI_COMPILER_ID="PGI" -#:set INTEL_COMPILER_ID="Intel" +#:set INTEL_COMPILER_ID="IntelLLVM" #:set CCE_COMPILER_ID="Cray" #:set AMD_COMPILER_ID="LLVMFlang" #:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID) #:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID) #:set USING_AMD = (MFC_COMPILER == AMD_COMPILER_ID) +#:set USING_INTEL = (MFC_COMPILER == INTEL_COMPILER_ID) #:def ASSERT_LIST(data, datatype) #:assert data is not None diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp index 6eb5383f75..33b874c9c5 100644 --- a/src/common/m_boundary_common.fpp +++ b/src/common/m_boundary_common.fpp @@ -284,11 +284,11 @@ contains subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf) $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i - type(scalar_field), optional, intent(inout) :: q_T_sf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i + type(scalar_field), optional, intent(inout) :: q_T_sf if (bc_dir == 1) then !< x-direction if (bc_loc == -1) then ! bc_x%beg @@ -371,7 +371,7 @@ contains subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in, q_T_sf) $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in, mv_in integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -619,7 +619,7 @@ contains subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in, q_T_sf) $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in, mv_in integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -778,7 +778,7 @@ contains subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l) $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), optional, intent(inout) :: pb_in, mv_in integer, intent(in) :: k, l integer :: j, q, i @@ -833,11 +833,11 @@ contains subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf) $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i - type(scalar_field), optional, intent(inout) :: q_T_sf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i + type(scalar_field), optional, intent(inout) :: q_T_sf if (bc_dir == 1) then !< x-direction if (bc_loc == -1) then !< bc_x%beg @@ -986,11 +986,11 @@ contains $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i - type(scalar_field), optional, intent(inout) :: q_T_sf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i + type(scalar_field), optional, intent(inout) :: q_T_sf if (bc_dir == 1) then !< x-direction if (bc_loc == -1) then !< bc_x%beg @@ -1158,11 +1158,11 @@ contains subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l, q_T_sf) $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i - type(scalar_field), optional, intent(inout) :: q_T_sf + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i + type(scalar_field), optional, intent(inout) :: q_T_sf #ifdef MFC_SIMULATION if (bc_dir == 1) then !< x-direction @@ -1461,10 +1461,10 @@ contains subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_periodic', parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i + type(scalar_field), dimension(:), intent(inout) :: c_divs + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i if (bc_dir == 1) then !< x-direction if (bc_loc == -1) then ! bc_x%beg @@ -1516,10 +1516,10 @@ contains subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_reflective', parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i + type(scalar_field), dimension(:), intent(inout) :: c_divs + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i if (bc_dir == 1) then !< x-direction if (bc_loc == -1) then ! bc_x%beg @@ -1595,10 +1595,10 @@ contains subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs - integer, intent(in) :: bc_dir, bc_loc - integer, intent(in) :: k, l - integer :: j, i + type(scalar_field), dimension(:), intent(inout) :: c_divs + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer :: j, i if (bc_dir == 1) then !< x-direction if (bc_loc == -1) then ! bc_x%beg diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index ec10c6a3dc..5b684ea51c 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -30,8 +30,8 @@ contains $:GPU_ROUTINE(function_name='compute_viscosity_and_inversion',parallelism='[seq]', cray_inline=True) - real(wp), intent(inout) :: T_L, T_R, Re_L, Re_R - real(wp), dimension(num_species), intent(inout) :: Ys_R, Ys_L + real(wp), intent(inout) :: T_L, T_R, Re_L, Re_R + real(wp), dimension(:), intent(inout) :: Ys_R, Ys_L call get_mixture_viscosity_mixavg(T_L, Ys_L, Re_L) call get_mixture_viscosity_mixavg(T_R, Ys_R, Re_R) @@ -114,7 +114,7 @@ contains real(wp) :: T real(wp) :: rho, omega_m - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(10) :: Ys real(wp), dimension(10) :: omega #:else @@ -157,7 +157,7 @@ contains integer, intent(in) :: idir type(scalar_field), intent(in) :: q_T_sf - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp index 6bd9718dbb..e2a36f418c 100644 --- a/src/common/m_helper.fpp +++ b/src/common/m_helper.fpp @@ -26,11 +26,11 @@ contains subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), intent(in) :: vftmp - real(wp), dimension(nb), intent(in) :: Rtmp - real(wp), intent(out) :: ntmp - real(wp), dimension(nb), intent(in) :: weights - real(wp) :: R3 + real(wp), intent(in) :: vftmp + real(wp), dimension(:), intent(in) :: Rtmp + real(wp), intent(out) :: ntmp + real(wp), dimension(:), intent(in) :: weights + real(wp) :: R3 R3 = dot_product(weights, Rtmp**3._wp) ntmp = (3._wp/(4._wp*pi))*vftmp/R3 @@ -41,11 +41,11 @@ contains subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), intent(in) :: vftmp - real(wp), dimension(nb), intent(in) :: nRtmp - real(wp), intent(out) :: ntmp - real(wp), dimension(nb), intent(in) :: weights - real(wp) :: nR3 + real(wp), intent(in) :: vftmp + real(wp), dimension(:), intent(in) :: nRtmp + real(wp), intent(out) :: ntmp + real(wp), dimension(:), intent(in) :: weights + real(wp) :: nR3 nR3 = dot_product(weights, nRtmp**3._wp) ntmp = sqrt((4._wp*pi/3._wp)*nR3/vftmp) diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 7d4b92705c..cc42771028 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -799,716 +799,712 @@ contains do i = 1, nVar r = (i - 1) + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) q_comm(i)%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) -#if defined(__INTEL_COMPILER) +#if defined(__INTEL_COMPILER) && !defined(MFC_GPU) if (ieee_is_nan(q_comm(i)%sf(j + unpack_offset, k, l))) then - print *, "Error", j, k, l, i - call s_mpi_abort("NaN(s) in recv") - end if + print *, "Error", j, k, l, i + call s_mpi_abort("NaN(s) in recv") + end if #endif - end do end do end do end do - $:END_GPU_PARALLEL_LOOP() + end do + $:END_GPU_PARALLEL_LOOP() - if (chem_diff_comm) then - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - r = nVar + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) - q_T_sf%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) -#if defined(__INTEL_COMPILER) + if (chem_diff_comm) then + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + r = nVar + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) + q_T_sf%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) +#if defined(__INTEL_COMPILER) && !defined(MFC_GPU) if (ieee_is_nan(q_T_sf%sf(j + unpack_offset, k, l))) then - print *, "Error", j, k, l - call s_mpi_abort("NaN(s) in recv") - end if + print *, "Error", j, k, l + call s_mpi_abort("NaN(s) in recv") + end if #endif - end do - end do end do - $:END_GPU_PARALLEL_LOOP() - end if + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if - if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = nVar + 1, nVar + nnode - do q = 1, nb - r = (i - 1) + (q - 1)*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) - pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do - end do + if (qbmm_comm) then + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = nVar + 1, nVar + nnode + do q = 1, nb + r = (i - 1) + (q - 1)*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) + pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do - $:END_GPU_PARALLEL_LOOP() - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = nVar + 1, nVar + nnode - do q = 1, nb - r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) - mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do - end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = nVar + 1, nVar + nnode + do q = 1, nb + r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*(j + buff_size*((k + 1) + (n + 1)*l)) + mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do - $:END_GPU_PARALLEL_LOOP() - end if - #:elif mpi_dir == 2 - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do i = 1, nVar - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l)) - q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) -#if defined(__INTEL_COMPILER) + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l)) + q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) +#if defined(__INTEL_COMPILER) && !defined(MFC_GPU) if (ieee_is_nan(q_comm(i)%sf(j, k + unpack_offset, l))) then - print *, "Error", j, k, l, i - call s_mpi_abort("NaN(s) in recv") - end if + print *, "Error", j, k, l, i + call s_mpi_abort("NaN(s) in recv") + end if #endif - end do - end do - end do end do - $:END_GPU_PARALLEL_LOOP() - - if (chem_diff_comm) then - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l)) - q_T_sf%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) -#if defined(__INTEL_COMPILER) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + if (chem_diff_comm) then + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l)) + q_T_sf%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) +#if defined(__INTEL_COMPILER) && !defined(MFC_GPU) if (ieee_is_nan(q_T_sf%sf(j, k + unpack_offset, l))) then - print *, "Error", j, k, l - call s_mpi_abort("NaN(s) in recv") - end if -#endif - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() + print *, "Error", j, k, l + call s_mpi_abort("NaN(s) in recv") end if - - if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + nnode - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k & - & + buff_size) + buff_size*l)) - pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do - end do - end do - end do +#endif + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (qbmm_comm) then + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + nnode + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) & + & + buff_size*l)) + pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do - $:END_GPU_PARALLEL_LOOP() - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + nnode - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size & - & + 1)*((k + buff_size) + buff_size*l)) - mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do - end do - end do - end do + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + nnode + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k & + & + buff_size) + buff_size*l)) + mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do - $:END_GPU_PARALLEL_LOOP() - end if - #:else - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do i = 1, nVar - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n & - & + 2*buff_size + 1)*(l + buff_size))) - q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) -#if defined(__INTEL_COMPILER) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if +#:else + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = (i - 1) + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l & + & + buff_size))) + q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) +#if defined(__INTEL_COMPILER) && !defined(MFC_GPU) if (ieee_is_nan(q_comm(i)%sf(j, k, l + unpack_offset))) then - print *, "Error", j, k, l, i - call s_mpi_abort("NaN(s) in recv") - end if + print *, "Error", j, k, l, i + call s_mpi_abort("NaN(s) in recv") + end if #endif - end do - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - - if (chem_diff_comm) then - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n & - & + 2*buff_size + 1)*(l + buff_size))) - q_T_sf%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) -#if defined(__INTEL_COMPILER) + end do + end do + end do +end do +$:END_GPU_PARALLEL_LOOP() + +if (chem_diff_comm) then + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = nVar + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l & + & + buff_size))) + q_T_sf%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) +#if defined(__INTEL_COMPILER) && !defined(MFC_GPU) if (ieee_is_nan(q_T_sf%sf(j, k, l + unpack_offset))) then - print *, "Error", j, k, l - call s_mpi_abort("NaN(s) in recv") - end if -#endif - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - end if - - if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + nnode - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k & - & + buff_size) + (n + 2*buff_size + 1)*(l + buff_size))) - pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) - end do - end do - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + nnode - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size & - & + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l + buff_size))) - mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) - end do - end do - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - end if - #:endif + print *, "Error", j, k, l + call s_mpi_abort("NaN(s) in recv") end if - #:endfor - call nvtxEndRange +#endif + end do + end do +end do +$:END_GPU_PARALLEL_LOOP() +end if + +if (qbmm_comm) then + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + nnode + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n & + & + 2*buff_size + 1)*(l + buff_size))) + pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) + end do + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + nnode + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*nnode + nb*nnode + v_size*((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) & + & + (n + 2*buff_size + 1)*(l + buff_size))) + mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) + end do + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() +end if +#:endif +end if +#:endfor +call nvtxEndRange #endif - end subroutine s_mpi_sendrecv_variables_buffers +end subroutine s_mpi_sendrecv_variables_buffers - !> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction. - subroutine s_mpi_decompose_computational_domain +!> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction. +subroutine s_mpi_decompose_computational_domain #ifdef MFC_MPI - integer :: num_procs_x, num_procs_y, num_procs_z !< Optimal number of processors in the x-, y- and z-directions - !> Non-optimal number of processors in the x-, y- and z-directions - real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z - real(wp) :: fct_min !< Processor factorization (fct) minimization parameter - integer :: MPI_COMM_CART !< Cartesian processor topology communicator - integer :: rem_cells !< Remaining cells after distribution among processors - integer :: recon_order !< WENO or MUSCL reconstruction order - integer :: i, j !< Generic loop iterators - integer :: ierr !< Generic flag used to identify and report MPI errors - - if (recon_type == WENO_TYPE) then - recon_order = weno_order - else - recon_order = muscl_order - end if + integer :: num_procs_x, num_procs_y, num_procs_z !< Optimal number of processors in the x-, y- and z-directions + !> Non-optimal number of processors in the x-, y- and z-directions + real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z + real(wp) :: fct_min !< Processor factorization (fct) minimization parameter + integer :: MPI_COMM_CART !< Cartesian processor topology communicator + integer :: rem_cells !< Remaining cells after distribution among processors + integer :: recon_order !< WENO or MUSCL reconstruction order + integer :: i, j !< Generic loop iterators + integer :: ierr !< Generic flag used to identify and report MPI errors + + if (recon_type == WENO_TYPE) then + recon_order = weno_order + else + recon_order = muscl_order + end if + + if (num_procs == 1 .and. parallel_io) then + do i = 1, num_dims + start_idx(i) = 0 + end do + return + end if - if (num_procs == 1 .and. parallel_io) then - do i = 1, num_dims - start_idx(i) = 0 - end do - return - end if + if (igr) then + recon_order = igr_order + end if - if (igr) then - recon_order = igr_order - end if + ! 3D Cartesian Processor Topology + if (n > 0) then + if (p > 0) then + if (fft_wrt) then + ! Initial estimate of optimal processor topology + num_procs_x = 1 + num_procs_y = 1 + num_procs_z = num_procs + ierr = -1 - ! 3D Cartesian Processor Topology - if (n > 0) then - if (p > 0) then - if (fft_wrt) then - ! Initial estimate of optimal processor topology + ! Benchmarking the quality of this initial guess + tmp_num_procs_y = num_procs_y + tmp_num_procs_z = num_procs_z + fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) + + ! Optimization of the initial processor topology + do i = 1, num_procs + if (mod(num_procs, i) == 0 .and. (n + 1)/i >= num_stcls_min*recon_order) then + tmp_num_procs_y = i + tmp_num_procs_z = num_procs/i + + if (fct_min >= abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) & + & /tmp_num_procs_z >= num_stcls_min*recon_order) then + num_procs_y = i + num_procs_z = num_procs/i + fct_min = abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) + ierr = 0 + end if + end if + end do + else + if (cyl_coord .and. p > 0) then + ! Pencil blocking for cylindrical coordinates (Fourier filter near axis) + + ! Initial values of the processor factorization optimization num_procs_x = 1 - num_procs_y = 1 - num_procs_z = num_procs + num_procs_y = num_procs + num_procs_z = 1 ierr = -1 - ! Benchmarking the quality of this initial guess + ! Computing minimization variable for these initial values + tmp_num_procs_x = num_procs_x tmp_num_procs_y = num_procs_y tmp_num_procs_z = num_procs_z - fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) + fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) - ! Optimization of the initial processor topology + ! Searching for optimal computational domain distribution do i = 1, num_procs - if (mod(num_procs, i) == 0 .and. (n + 1)/i >= num_stcls_min*recon_order) then - tmp_num_procs_y = i - tmp_num_procs_z = num_procs/i - - if (fct_min >= abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) & - & /tmp_num_procs_z >= num_stcls_min*recon_order) then - num_procs_y = i - num_procs_z = num_procs/i - fct_min = abs((n + 1)/tmp_num_procs_y - (p + 1)/tmp_num_procs_z) + if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then + tmp_num_procs_x = i + tmp_num_procs_y = num_procs/i + + if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) & + & /tmp_num_procs_y >= num_stcls_min*recon_order) then + num_procs_x = i + num_procs_y = num_procs/i + fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) ierr = 0 end if end if end do else - if (cyl_coord .and. p > 0) then - ! Pencil blocking for cylindrical coordinates (Fourier filter near axis) - - ! Initial values of the processor factorization optimization - num_procs_x = 1 - num_procs_y = num_procs - num_procs_z = 1 - ierr = -1 - - ! Computing minimization variable for these initial values - tmp_num_procs_x = num_procs_x - tmp_num_procs_y = num_procs_y - tmp_num_procs_z = num_procs_z - fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) - - ! Searching for optimal computational domain distribution - do i = 1, num_procs - if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then - tmp_num_procs_x = i - tmp_num_procs_y = num_procs/i - - if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) & - & /tmp_num_procs_y >= num_stcls_min*recon_order) then - num_procs_x = i - num_procs_y = num_procs/i - fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) - ierr = 0 - end if - end if - end do - else - ! Initial estimate of optimal processor topology - num_procs_x = 1 - num_procs_y = 1 - num_procs_z = num_procs - ierr = -1 - - ! Benchmarking the quality of this initial guess - tmp_num_procs_x = num_procs_x - tmp_num_procs_y = num_procs_y - tmp_num_procs_z = num_procs_z - fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + 10._wp*abs((n + 1) & - & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) - - ! Optimization of the initial processor topology - do i = 1, num_procs - if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then - do j = 1, num_procs/i - if (mod(num_procs/i, j) == 0 .and. (n + 1)/j >= num_stcls_min*recon_order) then - tmp_num_procs_x = i - tmp_num_procs_y = j - tmp_num_procs_z = num_procs/(i*j) - - if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) & - & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) & - & /tmp_num_procs_z >= num_stcls_min*recon_order) then - num_procs_x = i - num_procs_y = j - num_procs_z = num_procs/(i*j) - fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) & - & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) - ierr = 0 - end if - end if - end do - end if - end do - end if - end if - - ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation - ! exits. - if (proc_rank == 0 .and. ierr == -1) then - call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n, p and ' & - & // 'weno/muscl/igr_order. Exiting.') - end if - - ! Creating new communicator using the Cartesian topology - call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, num_procs_y, num_procs_z/), (/.true., .true., .true./), & - & .false., MPI_COMM_CART, ierr) - - ! Finding the Cartesian coordinates of the local process - call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr) - - ! Global Parameters for z-direction - - ! Number of remaining cells - rem_cells = mod(p + 1, num_procs_z) - - ! Optimal number of cells per processor - p = (p + 1)/num_procs_z - 1 - - ! Distributing the remaining cells - do i = 1, rem_cells - if (proc_coords(3) == i - 1) then - p = p + 1; exit - end if - end do - - ! Boundary condition at the beginning - if (proc_coords(3) > 0 .or. (bc_z%beg == BC_PERIODIC .and. num_procs_z > 1)) then - proc_coords(3) = proc_coords(3) - 1 - call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr) - proc_coords(3) = proc_coords(3) + 1 - end if - - ! Boundary condition at the end - if (proc_coords(3) < num_procs_z - 1 .or. (bc_z%end == BC_PERIODIC .and. num_procs_z > 1)) then - proc_coords(3) = proc_coords(3) + 1 - call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr) - proc_coords(3) = proc_coords(3) - 1 - end if - -#ifdef MFC_POST_PROCESS - ! Ghost zone at the beginning - if (proc_coords(3) > 0 .and. format == 1) then - offset_z%beg = 2 - else - offset_z%beg = 0 - end if + ! Initial estimate of optimal processor topology + num_procs_x = 1 + num_procs_y = 1 + num_procs_z = num_procs + ierr = -1 - ! Ghost zone at the end - if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then - offset_z%end = 2 - else - offset_z%end = 0 - end if -#endif + ! Benchmarking the quality of this initial guess + tmp_num_procs_x = num_procs_x + tmp_num_procs_y = num_procs_y + tmp_num_procs_z = num_procs_z + fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + 10._wp*abs((n + 1)/tmp_num_procs_y & + & - (p + 1)/tmp_num_procs_z) - ! Beginning and end sub-domain boundary locations - if (parallel_io) then - if (proc_coords(3) < rem_cells) then - start_idx(3) = (p + 1)*proc_coords(3) - else - start_idx(3) = (p + 1)*proc_coords(3) + rem_cells - end if - else -#ifdef MFC_PRE_PROCESS - if (old_grid .neqv. .true.) then - dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp) - - if (proc_coords(3) < rem_cells) then - z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3)) - z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1) - (num_procs_z & - & - rem_cells)) - else - z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3) + rem_cells) - z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1)) + ! Optimization of the initial processor topology + do i = 1, num_procs + if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then + do j = 1, num_procs/i + if (mod(num_procs/i, j) == 0 .and. (n + 1)/j >= num_stcls_min*recon_order) then + tmp_num_procs_x = i + tmp_num_procs_y = j + tmp_num_procs_z = num_procs/(i*j) + + if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) & + & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) .and. (p + 1) & + & /tmp_num_procs_z >= num_stcls_min*recon_order) then + num_procs_x = i + num_procs_y = j + num_procs_z = num_procs/(i*j) + fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + abs((n + 1) & + & /tmp_num_procs_y - (p + 1)/tmp_num_procs_z) + ierr = 0 + end if + end if + end do end if - end if -#endif + end do end if + end if - ! 2D Cartesian Processor Topology - else - ! Initial estimate of optimal processor topology - num_procs_x = 1 - num_procs_y = num_procs - ierr = -1 - - ! Benchmarking the quality of this initial guess - tmp_num_procs_x = num_procs_x - tmp_num_procs_y = num_procs_y - fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) - - ! Optimization of the initial processor topology - do i = 1, num_procs - if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then - tmp_num_procs_x = i - tmp_num_procs_y = num_procs/i - - if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) & - & /tmp_num_procs_y >= num_stcls_min*recon_order) then - num_procs_x = i - num_procs_y = num_procs/i - fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) - ierr = 0 - end if - end if - end do - - ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation - ! exits. - if (proc_rank == 0 .and. ierr == -1) then - call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n and ' & - & // 'weno/muscl/igr_order. Exiting.') - end if + ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation exits. + if (proc_rank == 0 .and. ierr == -1) then + call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n, p and ' & + & // 'weno/muscl/igr_order. Exiting.') + end if - ! Creating new communicator using the Cartesian topology - call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, num_procs_y/), (/.true., .true./), .false., MPI_COMM_CART, & - & ierr) + ! Creating new communicator using the Cartesian topology + call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, num_procs_y, num_procs_z/), (/.true., .true., .true./), & + & .false., MPI_COMM_CART, ierr) - ! Finding the Cartesian coordinates of the local process - call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr) - end if + ! Finding the Cartesian coordinates of the local process + call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr) - ! Global Parameters for y-direction + ! Global Parameters for z-direction ! Number of remaining cells - rem_cells = mod(n + 1, num_procs_y) + rem_cells = mod(p + 1, num_procs_z) ! Optimal number of cells per processor - n = (n + 1)/num_procs_y - 1 + p = (p + 1)/num_procs_z - 1 ! Distributing the remaining cells do i = 1, rem_cells - if (proc_coords(2) == i - 1) then - n = n + 1; exit + if (proc_coords(3) == i - 1) then + p = p + 1; exit end if end do ! Boundary condition at the beginning - if (proc_coords(2) > 0 .or. (bc_y%beg == BC_PERIODIC .and. num_procs_y > 1)) then - proc_coords(2) = proc_coords(2) - 1 - call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr) - proc_coords(2) = proc_coords(2) + 1 + if (proc_coords(3) > 0 .or. (bc_z%beg == BC_PERIODIC .and. num_procs_z > 1)) then + proc_coords(3) = proc_coords(3) - 1 + call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr) + proc_coords(3) = proc_coords(3) + 1 end if ! Boundary condition at the end - if (proc_coords(2) < num_procs_y - 1 .or. (bc_y%end == BC_PERIODIC .and. num_procs_y > 1)) then - proc_coords(2) = proc_coords(2) + 1 - call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr) - proc_coords(2) = proc_coords(2) - 1 + if (proc_coords(3) < num_procs_z - 1 .or. (bc_z%end == BC_PERIODIC .and. num_procs_z > 1)) then + proc_coords(3) = proc_coords(3) + 1 + call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr) + proc_coords(3) = proc_coords(3) - 1 end if #ifdef MFC_POST_PROCESS ! Ghost zone at the beginning - if (proc_coords(2) > 0 .and. format == 1) then - offset_y%beg = 2 + if (proc_coords(3) > 0 .and. format == 1) then + offset_z%beg = 2 else - offset_y%beg = 0 + offset_z%beg = 0 end if ! Ghost zone at the end - if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then - offset_y%end = 2 + if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then + offset_z%end = 2 else - offset_y%end = 0 + offset_z%end = 0 end if #endif ! Beginning and end sub-domain boundary locations if (parallel_io) then - if (proc_coords(2) < rem_cells) then - start_idx(2) = (n + 1)*proc_coords(2) + if (proc_coords(3) < rem_cells) then + start_idx(3) = (p + 1)*proc_coords(3) else - start_idx(2) = (n + 1)*proc_coords(2) + rem_cells + start_idx(3) = (p + 1)*proc_coords(3) + rem_cells end if else #ifdef MFC_PRE_PROCESS if (old_grid .neqv. .true.) then - dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp) + dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp) - if (proc_coords(2) < rem_cells) then - y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2)) - y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1) - (num_procs_y & + if (proc_coords(3) < rem_cells) then + z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3)) + z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1) - (num_procs_z & & - rem_cells)) else - y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2) + rem_cells) - y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1)) + z_domain%beg = z_domain%beg + dz*real((p + 1)*proc_coords(3) + rem_cells) + z_domain%end = z_domain%end - dz*real((p + 1)*(num_procs_z - proc_coords(3) - 1)) end if end if #endif end if - ! 1D Cartesian Processor Topology + ! 2D Cartesian Processor Topology else - ! Optimal processor topology - num_procs_x = num_procs + ! Initial estimate of optimal processor topology + num_procs_x = 1 + num_procs_y = num_procs + ierr = -1 + + ! Benchmarking the quality of this initial guess + tmp_num_procs_x = num_procs_x + tmp_num_procs_y = num_procs_y + fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + + ! Optimization of the initial processor topology + do i = 1, num_procs + if (mod(num_procs, i) == 0 .and. (m + 1)/i >= num_stcls_min*recon_order) then + tmp_num_procs_x = i + tmp_num_procs_y = num_procs/i + + if (fct_min >= abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) .and. (n + 1) & + & /tmp_num_procs_y >= num_stcls_min*recon_order) then + num_procs_x = i + num_procs_y = num_procs/i + fct_min = abs((m + 1)/tmp_num_procs_x - (n + 1)/tmp_num_procs_y) + ierr = 0 + end if + end if + end do + + ! Verifying that a valid decomposition of the computational domain has been established. If not, the simulation exits. + if (proc_rank == 0 .and. ierr == -1) then + call s_mpi_abort('Unsupported combination of values ' // 'of num_procs, m, n and ' & + & // 'weno/muscl/igr_order. Exiting.') + end if ! Creating new communicator using the Cartesian topology - call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), (/.true./), .false., MPI_COMM_CART, ierr) + call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, num_procs_y/), (/.true., .true./), .false., MPI_COMM_CART, ierr) ! Finding the Cartesian coordinates of the local process - call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, proc_coords, ierr) + call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr) end if - ! Global Parameters for x-direction + ! Global Parameters for y-direction ! Number of remaining cells - rem_cells = mod(m + 1, num_procs_x) + rem_cells = mod(n + 1, num_procs_y) ! Optimal number of cells per processor - m = (m + 1)/num_procs_x - 1 + n = (n + 1)/num_procs_y - 1 ! Distributing the remaining cells do i = 1, rem_cells - if (proc_coords(1) == i - 1) then - m = m + 1; exit + if (proc_coords(2) == i - 1) then + n = n + 1; exit end if end do - call s_update_cell_bounds(cells_bounds, m, n, p) - ! Boundary condition at the beginning - if (proc_coords(1) > 0 .or. (bc_x%beg == BC_PERIODIC .and. num_procs_x > 1)) then - proc_coords(1) = proc_coords(1) - 1 - call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr) - proc_coords(1) = proc_coords(1) + 1 + if (proc_coords(2) > 0 .or. (bc_y%beg == BC_PERIODIC .and. num_procs_y > 1)) then + proc_coords(2) = proc_coords(2) - 1 + call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr) + proc_coords(2) = proc_coords(2) + 1 end if ! Boundary condition at the end - if (proc_coords(1) < num_procs_x - 1 .or. (bc_x%end == BC_PERIODIC .and. num_procs_x > 1)) then - proc_coords(1) = proc_coords(1) + 1 - call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr) - proc_coords(1) = proc_coords(1) - 1 + if (proc_coords(2) < num_procs_y - 1 .or. (bc_y%end == BC_PERIODIC .and. num_procs_y > 1)) then + proc_coords(2) = proc_coords(2) + 1 + call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr) + proc_coords(2) = proc_coords(2) - 1 end if #ifdef MFC_POST_PROCESS ! Ghost zone at the beginning - if (proc_coords(1) > 0 .and. format == 1) then - offset_x%beg = 2 + if (proc_coords(2) > 0 .and. format == 1) then + offset_y%beg = 2 else - offset_x%beg = 0 + offset_y%beg = 0 end if ! Ghost zone at the end - if (proc_coords(1) < num_procs_x - 1 .and. format == 1) then - offset_x%end = 2 + if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then + offset_y%end = 2 else - offset_x%end = 0 + offset_y%end = 0 end if #endif ! Beginning and end sub-domain boundary locations if (parallel_io) then - if (proc_coords(1) < rem_cells) then - start_idx(1) = (m + 1)*proc_coords(1) + if (proc_coords(2) < rem_cells) then + start_idx(2) = (n + 1)*proc_coords(2) else - start_idx(1) = (m + 1)*proc_coords(1) + rem_cells + start_idx(2) = (n + 1)*proc_coords(2) + rem_cells end if else #ifdef MFC_PRE_PROCESS if (old_grid .neqv. .true.) then - dx = (x_domain%end - x_domain%beg)/real(m_glb + 1, wp) + dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp) - if (proc_coords(1) < rem_cells) then - x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1)) - x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells)) + if (proc_coords(2) < rem_cells) then + y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2)) + y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1) - (num_procs_y - rem_cells)) else - x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells) - x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1)) + y_domain%beg = y_domain%beg + dy*real((n + 1)*proc_coords(2) + rem_cells) + y_domain%end = y_domain%end - dy*real((n + 1)*(num_procs_y - proc_coords(2) - 1)) end if end if #endif end if + + ! 1D Cartesian Processor Topology + else + ! Optimal processor topology + num_procs_x = num_procs + + ! Creating new communicator using the Cartesian topology + call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), (/.true./), .false., MPI_COMM_CART, ierr) + + ! Finding the Cartesian coordinates of the local process + call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, proc_coords, ierr) + end if + + ! Global Parameters for x-direction + + ! Number of remaining cells + rem_cells = mod(m + 1, num_procs_x) + + ! Optimal number of cells per processor + m = (m + 1)/num_procs_x - 1 + + ! Distributing the remaining cells + do i = 1, rem_cells + if (proc_coords(1) == i - 1) then + m = m + 1; exit + end if + end do + + call s_update_cell_bounds(cells_bounds, m, n, p) + + ! Boundary condition at the beginning + if (proc_coords(1) > 0 .or. (bc_x%beg == BC_PERIODIC .and. num_procs_x > 1)) then + proc_coords(1) = proc_coords(1) - 1 + call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr) + proc_coords(1) = proc_coords(1) + 1 + end if + + ! Boundary condition at the end + if (proc_coords(1) < num_procs_x - 1 .or. (bc_x%end == BC_PERIODIC .and. num_procs_x > 1)) then + proc_coords(1) = proc_coords(1) + 1 + call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr) + proc_coords(1) = proc_coords(1) - 1 + end if + +#ifdef MFC_POST_PROCESS + ! Ghost zone at the beginning + if (proc_coords(1) > 0 .and. format == 1) then + offset_x%beg = 2 + else + offset_x%beg = 0 + end if + + ! Ghost zone at the end + if (proc_coords(1) < num_procs_x - 1 .and. format == 1) then + offset_x%end = 2 + else + offset_x%end = 0 + end if +#endif + + ! Beginning and end sub-domain boundary locations + if (parallel_io) then + if (proc_coords(1) < rem_cells) then + start_idx(1) = (m + 1)*proc_coords(1) + else + start_idx(1) = (m + 1)*proc_coords(1) + rem_cells + end if + else +#ifdef MFC_PRE_PROCESS + if (old_grid .neqv. .true.) then + dx = (x_domain%end - x_domain%beg)/real(m_glb + 1, wp) + + if (proc_coords(1) < rem_cells) then + x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1)) + x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells)) + else + x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells) + x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1)) + end if + end if +#endif + end if #endif - end subroutine s_mpi_decompose_computational_domain +end subroutine s_mpi_decompose_computational_domain - !> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors. - !! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of - !! cell-boundary locations may be calculated directly from those of the cell-width distributions. +!> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors. +!! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of +!! cell-boundary locations may be calculated directly from those of the cell-width distributions. #ifndef MFC_PRE_PROCESS - subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc) +subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc) - integer, intent(in) :: mpi_dir - integer, intent(in) :: pbc_loc + integer, intent(in) :: mpi_dir + integer, intent(in) :: pbc_loc #ifdef MFC_MPI - integer :: ierr !< Generic flag used to identify and report MPI errors + integer :: ierr !< Generic flag used to identify and report MPI errors - if (mpi_dir == 1) then - if (pbc_loc == -1) then ! PBC at the beginning + if (mpi_dir == 1) then + if (pbc_loc == -1) then ! PBC at the beginning - if (bc_x%end >= 0) then ! PBC at the beginning and end - call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, & - & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the beginning only - call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, & - & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - else ! PBC at the end - if (bc_x%beg >= 0) then ! PBC at the end and beginning - call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, & - & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the end only - call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, & - & bc_x%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if + if (bc_x%end >= 0) then ! PBC at the beginning and end + call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, & + & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + else ! PBC at the beginning only + call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if - else if (mpi_dir == 2) then - if (pbc_loc == -1) then ! PBC at the beginning - - if (bc_y%end >= 0) then ! PBC at the beginning and end - call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, & - & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the beginning only - call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, & - & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - else ! PBC at the end - if (bc_y%beg >= 0) then ! PBC at the end and beginning - call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, & - & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the end only - call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, & - & bc_y%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if + else ! PBC at the end + if (bc_x%beg >= 0) then ! PBC at the end and beginning + call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, MPI_COMM_WORLD, & + & MPI_STATUS_IGNORE, ierr) + else ! PBC at the end only + call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if - else - if (pbc_loc == -1) then ! PBC at the beginning - - if (bc_z%end >= 0) then ! PBC at the beginning and end - call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, & - & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the beginning only - call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, & - & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - else ! PBC at the end - if (bc_z%beg >= 0) then ! PBC at the end and beginning - call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, & - & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the end only - call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, & - & bc_z%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if + end if + else if (mpi_dir == 2) then + if (pbc_loc == -1) then ! PBC at the beginning + + if (bc_y%end >= 0) then ! PBC at the beginning and end + call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, & + & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + else ! PBC at the beginning only + call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + end if + else ! PBC at the end + if (bc_y%beg >= 0) then ! PBC at the end and beginning + call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, MPI_COMM_WORLD, & + & MPI_STATUS_IGNORE, ierr) + else ! PBC at the end only + call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + end if + end if + else + if (pbc_loc == -1) then ! PBC at the beginning + + if (bc_z%end >= 0) then ! PBC at the beginning and end + call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, & + & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + else ! PBC at the beginning only + call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + end if + else ! PBC at the end + if (bc_z%beg >= 0) then ! PBC at the end and beginning + call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, MPI_COMM_WORLD, & + & MPI_STATUS_IGNORE, ierr) + else ! PBC at the end only + call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if end if + end if #endif - end subroutine s_mpi_sendrecv_grid_variables_buffers +end subroutine s_mpi_sendrecv_grid_variables_buffers #endif - !> Module deallocation and/or disassociation procedures - impure subroutine s_finalize_mpi_common_module +!> Module deallocation and/or disassociation procedures +impure subroutine s_finalize_mpi_common_module #ifdef MFC_MPI - deallocate (buff_send, buff_recv) + deallocate (buff_send, buff_recv) #endif - end subroutine s_finalize_mpi_common_module +end subroutine s_finalize_mpi_common_module end module m_mpi_common diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp index 7f9131550d..f2897c2c97 100644 --- a/src/common/m_phase_change.fpp +++ b/src/common/m_phase_change.fpp @@ -257,15 +257,15 @@ contains $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', parallelism='[seq]', cray_noinline=True) ! initializing variables - integer, intent(in) :: j, k, l, MFL - real(wp), intent(out) :: pS - real(wp), dimension(1:), intent(out) :: p_infpT - type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf - real(wp), intent(in) :: rhoe - real(wp), intent(out) :: TS - real(wp) :: gp, gpp, hp, pO, mCP, mQ !< variables for the Newton Solver - real(wp) :: p_infpT_sum - integer :: i, ns !< generic loop iterators + integer, intent(in) :: j, k, l, MFL + real(wp), intent(out) :: pS + real(wp), dimension(:), intent(out) :: p_infpT + type(scalar_field), dimension(:), intent(in) :: q_cons_vf + real(wp), intent(in) :: rhoe + real(wp), intent(out) :: TS + real(wp) :: gp, gpp, hp, pO, mCP, mQ !< variables for the Newton Solver + real(wp) :: p_infpT_sum + integer :: i, ns !< generic loop iterators ! auxiliary variables for the pT-equilibrium solver mCP = 0.0_wp; mQ = 0.0_wp; p_infpT_sum = 0._wp $:GPU_LOOP(parallelism='[seq]') @@ -351,16 +351,16 @@ contains $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', parallelism='[seq]', cray_noinline=True) - integer, intent(in) :: j, k, l - real(wp), intent(inout) :: pS - real(wp), dimension(1:), intent(in) :: p_infpT - real(wp), intent(in) :: rhoe - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - real(wp), intent(inout) :: TS + integer, intent(in) :: j, k, l + real(wp), intent(inout) :: pS + real(wp), dimension(1:), intent(in) :: p_infpT + real(wp), intent(in) :: rhoe + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + real(wp), intent(inout) :: TS #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium #:else - real(wp), dimension(num_fluids) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium + real(wp), dimension(num_fluids_max) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium #:endif real(wp), dimension(2, 2) :: Jac, InvJac, TJac !< matrices for the Newton Solver real(wp), dimension(2) :: R2D, DeltamP !< residual and correction array @@ -525,10 +525,10 @@ contains !> @name variables for the correction of the reacting partial densities !> @{ - real(wp), intent(out) :: MCT - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - real(wp), intent(inout) :: rM - integer, intent(in) :: j, k, l + real(wp), intent(out) :: MCT + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + real(wp), intent(inout) :: rM + integer, intent(in) :: j, k, l !> @} if (rM < 0.0_wp) then if ((q_cons_vf(lp + eqn_idx%cont%beg - 1)%sf(j, k, & diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp index 2417da1adf..8d546daf9f 100644 --- a/src/common/m_variables_conversion.fpp +++ b/src/common/m_variables_conversion.fpp @@ -91,12 +91,12 @@ contains real(wp), intent(in), optional :: G, pres_mag ! Chemistry - real(wp), dimension(1:num_species), intent(in) :: rhoYks - real(wp), dimension(1:num_species) :: Y_rs - real(wp) :: E_e - real(wp) :: e_Per_Kg, Pdyn_Per_Kg - real(wp) :: T_guess - integer :: s !< Generic loop iterator + real(wp), dimension(:), intent(in) :: rhoYks + real(wp), dimension(1:num_species) :: Y_rs + real(wp) :: E_e + real(wp) :: e_Per_Kg, Pdyn_Per_Kg + real(wp) :: T_guess + integer :: s !< Generic loop iterator #:if not chemistry ! Depending on model_eqns and bubbles_euler, the appropriate procedure for computing pressure is targeted by the ! procedure pointer @@ -253,8 +253,8 @@ contains real(wp), dimension(3), intent(inout) :: alpha_rho_K, alpha_K real(wp), optional, dimension(3), intent(in) :: G #:else - real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K - real(wp), optional, dimension(num_fluids), intent(in) :: G + real(wp), dimension(:), intent(inout) :: alpha_rho_K, alpha_K + real(wp), optional, dimension(:), intent(in) :: G #:endif real(wp), dimension(2), intent(out) :: Re_K real(wp), optional, intent(out) :: G_K @@ -1175,12 +1175,12 @@ contains subroutine s_compute_species_fraction(q_vf, k, l, r, alpha_rho_K, alpha_K) $:GPU_ROUTINE(function_name='s_compute_species_fraction', parallelism='[seq]', cray_noinline=True) - type(scalar_field), dimension(sys_size), intent(in) :: q_vf - integer, intent(in) :: k, l, r + type(scalar_field), dimension(:), intent(in) :: q_vf + integer, intent(in) :: k, l, r #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K #:else - real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K + real(wp), dimension(:), intent(out) :: alpha_rho_K, alpha_K #:endif integer :: i real(wp) :: alpha_K_sum @@ -1256,7 +1256,7 @@ contains #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3), intent(in) :: adv #:else - real(wp), dimension(num_fluids), intent(in) :: adv + real(wp), dimension(:), intent(in) :: adv #:endif real(wp), intent(in) :: vel_sum real(wp), intent(in) :: c_c diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp index 4de261864d..488a84c644 100644 --- a/src/simulation/m_acoustic_src.fpp +++ b/src/simulation/m_acoustic_src.fpp @@ -129,7 +129,7 @@ contains type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf !< Primitive variables type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: myalpha, myalpha_rho #:else real(wp), dimension(num_fluids) :: myalpha, myalpha_rho diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp index a15156405b..200a007754 100644 --- a/src/simulation/m_bubbles_EE.fpp +++ b/src/simulation/m_bubbles_EE.fpp @@ -148,7 +148,7 @@ contains real(wp) :: pb_local, mv_local, vflux, pbdot real(wp) :: n_tait, B_tait - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: Rtmp, Vtmp real(wp), dimension(3) :: myalpha, myalpha_rho #:else diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp index b44eb617b1..c9773ee9f6 100644 --- a/src/simulation/m_bubbles_EL.fpp +++ b/src/simulation/m_bubbles_EL.fpp @@ -699,15 +699,15 @@ contains $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', parallelism='[seq]', cray_inline=True) - type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf - real(wp), intent(in) :: pinf, rhol, gamma, pi_inf - integer, dimension(3), intent(in) :: cell - real(wp), intent(out) :: cson - real(wp) :: E, H + type(scalar_field), dimension(:), intent(in) :: q_prim_vf + real(wp), intent(in) :: pinf, rhol, gamma, pi_inf + integer, dimension(3), intent(in) :: cell + real(wp), intent(out) :: cson + real(wp) :: E, H #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: vel #:else - real(wp), dimension(num_dims) :: vel + real(wp), dimension(3) :: vel #:endif integer :: i @@ -765,19 +765,19 @@ contains $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', cray_inline=True) - integer, intent(in) :: bub_id, ptype - type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf - real(wp), intent(out) :: f_pinfl - integer, dimension(3), intent(out) :: cell - real(wp), intent(out), optional :: preterm1, term2, Romega - real(wp), dimension(3) :: scoord, psi - real(wp) :: dc, vol, aux - real(wp) :: volgas, term1, Rbeq, denom - real(wp) :: charvol, charpres, charvol2, charpres2 - integer, dimension(3) :: cellaux - integer :: i, j, k - integer :: smearGrid, smearGridz - logical :: celloutside + integer, intent(in) :: bub_id, ptype + type(scalar_field), dimension(:), intent(in) :: q_prim_vf + real(wp), intent(out) :: f_pinfl + integer, dimension(3), intent(out) :: cell + real(wp), intent(out), optional :: preterm1, term2, Romega + real(wp), dimension(3) :: scoord, psi + real(wp) :: dc, vol, aux + real(wp) :: volgas, term1, Rbeq, denom + real(wp) :: charvol, charpres, charvol2, charpres2 + integer, dimension(3) :: cellaux + integer :: i, j, k + integer :: smearGrid, smearGridz + logical :: celloutside scoord = mtn_s(bub_id,1:3,2) f_pinfl = 0._wp diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp index 6f1b041d49..bc8d7afe2d 100644 --- a/src/simulation/m_cbc.fpp +++ b/src/simulation/m_cbc.fpp @@ -473,12 +473,12 @@ contains real(wp) :: dqv_dt real(wp) :: dpres_ds - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20) :: L #:else real(wp), dimension(sys_size) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: alpha_rho, dalpha_rho_ds, mf real(wp), dimension(3) :: vel, dvel_ds real(wp), dimension(3) :: adv_local, dadv_ds diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp index c2c415b1d3..750cbf74b6 100644 --- a/src/simulation/m_compute_cbc.fpp +++ b/src/simulation/m_compute_cbc.fpp @@ -23,10 +23,10 @@ contains $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(3), intent(in) :: lambda real(wp), intent(in) :: rho, c, dpres_ds - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: dvel_ds #:else - real(wp), dimension(num_dims), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dvel_ds #:endif real(wp) :: L1 L1 = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1))) @@ -37,15 +37,15 @@ contains subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds) $:GPU_ROUTINE(parallelism='[seq]') - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds #:else - real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds + real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds #:endif real(wp), intent(in) :: lambda_factor, lambda2, c real(wp), intent(in) :: dpres_ds @@ -62,15 +62,15 @@ contains subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds) $:GPU_ROUTINE(parallelism='[seq]') - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: dvel_ds #:else - real(wp), dimension(num_dims), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dvel_ds #:endif real(wp), intent(in) :: lambda_factor, lambda2 integer :: i @@ -86,15 +86,15 @@ contains subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds) $:GPU_ROUTINE(parallelism='[seq]') - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: dadv_ds #:else - real(wp), dimension(num_fluids), intent(in) :: dadv_ds + real(wp), dimension(:), intent(in) :: dadv_ds #:endif real(wp), intent(in) :: lambda_factor, lambda2 integer :: i @@ -110,15 +110,15 @@ contains subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds) $:GPU_ROUTINE(parallelism='[seq]') - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(10), intent(in) :: dYs_ds #:else - real(wp), dimension(num_species), intent(in) :: dYs_ds + real(wp), dimension(:), intent(in) :: dYs_ds #:endif real(wp), intent(in) :: lambda_factor, lambda2 integer :: i @@ -138,15 +138,15 @@ contains $:GPU_ROUTINE(function_name='s_compute_slip_wall_L',parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: dvel_ds #:else - real(wp), dimension(num_dims), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dvel_ds #:endif real(wp), intent(in) :: rho, c, dpres_ds @@ -162,21 +162,21 @@ contains $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_buffer_L', parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds real(wp), dimension(3), intent(in) :: dvel_ds real(wp), dimension(3), intent(in) :: dadv_ds real(wp), dimension(10), intent(in) :: dYs_ds #:else - real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds - real(wp), dimension(num_dims), intent(in) :: dvel_ds - real(wp), dimension(num_fluids), intent(in) :: dadv_ds - real(wp), dimension(num_species), intent(in) :: dYs_ds + real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds + real(wp), dimension(:), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dadv_ds + real(wp), dimension(:), intent(in) :: dYs_ds #:endif real(wp), intent(in) :: rho, c real(wp), intent(in) :: dpres_ds @@ -202,15 +202,15 @@ contains $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_inflow_L', parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: dvel_ds #:else - real(wp), dimension(num_dims), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dvel_ds #:endif real(wp), intent(in) :: rho, c, dpres_ds @@ -226,21 +226,21 @@ contains $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_outflow_L', parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds real(wp), dimension(3), intent(in) :: dvel_ds real(wp), dimension(3), intent(in) :: dadv_ds real(wp), dimension(10), intent(in) :: dYs_ds #:else - real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds - real(wp), dimension(num_dims), intent(in) :: dvel_ds - real(wp), dimension(num_fluids), intent(in) :: dadv_ds - real(wp), dimension(num_species), intent(in) :: dYs_ds + real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds + real(wp), dimension(:), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dadv_ds + real(wp), dimension(:), intent(in) :: dYs_ds #:endif real(wp), intent(in) :: rho, c real(wp), intent(in) :: dpres_ds @@ -260,19 +260,19 @@ contains $:GPU_ROUTINE(function_name='s_compute_force_free_subsonic_outflow_L', parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds real(wp), dimension(3), intent(in) :: dvel_ds real(wp), dimension(3), intent(in) :: dadv_ds #:else - real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds - real(wp), dimension(num_dims), intent(in) :: dvel_ds - real(wp), dimension(num_fluids), intent(in) :: dadv_ds + real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds + real(wp), dimension(:), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dadv_ds #:endif real(wp), intent(in) :: rho, c real(wp), intent(in) :: dpres_ds @@ -291,19 +291,19 @@ contains $:GPU_ROUTINE(function_name='s_compute_constant_pressure_subsonic_outflow_L', parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds real(wp), dimension(3), intent(in) :: dvel_ds real(wp), dimension(3), intent(in) :: dadv_ds #:else - real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds - real(wp), dimension(num_dims), intent(in) :: dvel_ds - real(wp), dimension(num_fluids), intent(in) :: dadv_ds + real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds + real(wp), dimension(:), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dadv_ds #:endif real(wp), intent(in) :: rho, c real(wp), intent(in) :: dpres_ds @@ -320,10 +320,10 @@ contains subroutine s_compute_supersonic_inflow_L(L) $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', parallelism='[seq]', cray_inline=True) - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif L(1:eqn_idx%adv%end) = 0._wp if (chemistry) L(eqn_idx%species%beg:eqn_idx%species%end) = 0._wp @@ -336,21 +336,21 @@ contains $:GPU_ROUTINE(function_name='s_compute_supersonic_outflow_L', parallelism='[seq]', cray_inline=True) real(wp), dimension(3), intent(in) :: lambda - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(20), intent(inout) :: L #:else - real(wp), dimension(sys_size), intent(inout) :: L + real(wp), dimension(:), intent(inout) :: L #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds real(wp), dimension(3), intent(in) :: dvel_ds real(wp), dimension(3), intent(in) :: dadv_ds real(wp), dimension(10), intent(in) :: dYs_ds #:else - real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds - real(wp), dimension(num_dims), intent(in) :: dvel_ds - real(wp), dimension(num_fluids), intent(in) :: dadv_ds - real(wp), dimension(num_species), intent(in) :: dYs_ds + real(wp), dimension(:), intent(in) :: mf, dalpha_rho_ds + real(wp), dimension(:), intent(in) :: dvel_ds + real(wp), dimension(:), intent(in) :: dadv_ds + real(wp), dimension(:), intent(in) :: dYs_ds #:endif real(wp), intent(in) :: rho, c real(wp), intent(in) :: dpres_ds diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp index 9e7519790c..cef5e95e91 100644 --- a/src/simulation/m_compute_levelset.fpp +++ b/src/simulation/m_compute_levelset.fpp @@ -20,53 +20,100 @@ module m_compute_levelset contains + !> 3x3 matrix-vector multiply; replaces the matmul() intrinsic which triggers an ifx 2025.1.1 SPIR-V ICE (#5633) when inlined + !! into a target teams loop kernel via declare target. + pure function f_mv3(M, v) result(w) + + $:GPU_ROUTINE(parallelism='[seq]') + + real(wp), intent(in) :: M(3, 3), v(3) + real(wp) :: w(3) + w(1) = M(1, 1)*v(1) + M(1, 2)*v(2) + M(1, 3)*v(3) + w(2) = M(2, 1)*v(1) + M(2, 2)*v(2) + M(2, 3)*v(3) + w(3) = M(3, 1)*v(1) + M(3, 2)*v(2) + M(3, 3)*v(3) + + end function f_mv3 + !> Dispatch level-set distance and normal computations for all ghost points based on patch geometry type impure subroutine s_apply_levelset(gps, num_gps) type(ghost_point), dimension(:), intent(inout) :: gps integer, intent(in) :: num_gps - integer :: i, patch_id, patch_geometry + integer :: i, patch_id - ! 3D Patch Geometries + ! One GPU loop per geometry type so each kernel calls exactly one + ! declare-target routine. A single if-else dispatch over multiple + ! declare-target callees triggers an LLVM phi-node dominance error + ! in ifx SPIR64 codegen; splitting into separate loops avoids it. if (p > 0) then - $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]') + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]') do i = 1, num_gps patch_id = gps(i)%ib_patch_id - patch_geometry = patch_ib(patch_id)%geometry - - if (patch_geometry == 8) then - call s_sphere_levelset(gps(i)) - else if (patch_geometry == 9) then - call s_cuboid_levelset(gps(i)) - else if (patch_geometry == 10) then - call s_cylinder_levelset(gps(i)) - else if (patch_geometry == 11) then - call s_3d_airfoil_levelset(gps(i)) - else if (patch_geometry == 12) then - call s_model_levelset(gps(i)) - end if + if (patch_ib(patch_id)%geometry == 8) call s_sphere_levelset(gps(i)) end do $:END_GPU_PARALLEL_LOOP() - ! 2D Patch Geometries + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 9) call s_cuboid_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 10) call s_cylinder_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 11) call s_3d_airfoil_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[patch_ib(1:num_ibs), Np]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 12) call s_model_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() else if (n > 0) then - $:GPU_PARALLEL_LOOP(private='[i, patch_id, patch_geometry]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]') + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]') do i = 1, num_gps patch_id = gps(i)%ib_patch_id - patch_geometry = patch_ib(patch_id)%geometry - - if (patch_geometry == 2) then - call s_circle_levelset(gps(i)) - else if (patch_geometry == 3) then - call s_rectangle_levelset(gps(i)) - else if (patch_geometry == 4) then - call s_airfoil_levelset(gps(i)) - else if (patch_geometry == 5) then - call s_model_levelset(gps(i)) - else if (patch_geometry == 6) then - call s_ellipse_levelset(gps(i)) - end if + if (patch_ib(patch_id)%geometry == 2) call s_circle_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 3) call s_rectangle_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 4) call s_airfoil_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 5) call s_model_levelset(gps(i)) + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[i, patch_id]', copy='[gps]', copyin='[Np, patch_ib(1:num_ibs)]') + do i = 1, num_gps + patch_id = gps(i)%ib_patch_id + if (patch_ib(patch_id)%geometry == 6) call s_ellipse_levelset(gps(i)) end do $:END_GPU_PARALLEL_LOOP() end if @@ -126,7 +173,7 @@ contains offset(:) = patch_ib(ib_patch_id)%centroid_offset(:) xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB - xy_local = matmul(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinate + xy_local = f_mv3(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinate xy_local = xy_local - offset ! airfoils are a patch that require a centroid offset if (xy_local(2) >= 0._wp) then @@ -176,7 +223,7 @@ contains if (f_approx_equal(dist, 0._wp)) then gp%levelset_norm = 0._wp else - gp%levelset_norm = matmul(rotation, dist_vec(:))/dist ! convert the normal vector back to global grid coordinates + gp%levelset_norm = f_mv3(rotation, dist_vec(:))/dist ! convert the normal vector back to global grid coordinates end if end subroutine s_airfoil_levelset @@ -211,7 +258,7 @@ contains z_min = -lz/2 xyz_local = [x_cc(i), y_cc(j), z_cc(l)] - center - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates + xyz_local = f_mv3(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates xyz_local = xyz_local - offset ! airfoils are a patch that require a centroid offset if (xyz_local(2) >= 0._wp) then @@ -266,13 +313,13 @@ contains else normal(3) = 1._wp end if - gp%levelset_norm = matmul(rotation, normal) + gp%levelset_norm = f_mv3(rotation, normal) else gp%levelset = dist_surf if (f_approx_equal(dist_surf, 0._wp)) then gp%levelset_norm = 0._wp else - gp%levelset_norm = matmul(rotation, dist_vec(:)/dist_surf) + gp%levelset_norm = f_mv3(rotation, dist_vec(:)/dist_surf) end if end if @@ -312,7 +359,7 @@ contains ! convert grid to local coordinates xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] - xy_local = matmul(inverse_rotation, xy_local) + xy_local = f_mv3(inverse_rotation, xy_local) side_dists(1) = bottom_left(1) - xy_local(1) side_dists(2) = top_right(1) - xy_local(1) @@ -339,7 +386,7 @@ contains dist_vec(2) = side_dists(idx)/abs(side_dists(idx)) end if ! convert the normal vector back into the global coordinate system - gp%levelset_norm = matmul(rotation, dist_vec) + gp%levelset_norm = f_mv3(rotation, dist_vec) else gp%levelset_norm = 0._wp end if @@ -375,13 +422,13 @@ contains ellipse_coeffs(2) = 0.5_wp*length_y xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] - xy_local = matmul(inverse_rotation, xy_local) + xy_local = f_mv3(inverse_rotation, xy_local) normal_vector = xy_local ! get the normal direction via the coordinate transformation method normal_vector(2) = normal_vector(2)*(ellipse_coeffs(1)/ellipse_coeffs(2))**2._wp normal_vector = normal_vector/sqrt(dot_product(normal_vector, normal_vector)) ! normalize the vector - gp%levelset_norm = matmul(rotation, normal_vector) ! save after rotating the vector to the global frame + gp%levelset_norm = f_mv3(rotation, normal_vector) ! save after rotating the vector to the global frame ! use the normal vector to set up the quadratic equation for the levelset, using A, B, and C in indices 1, 2, and 3 quadratic_coeffs(1) = (normal_vector(1)/ellipse_coeffs(1))**2 + (normal_vector(2)/ellipse_coeffs(2))**2 @@ -434,7 +481,7 @@ contains Back = -length_z/2 xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinate + xyz_local = f_mv3(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinate dist_left = Left - xyz_local(1) dist_right = xyz_local(1) - Right @@ -478,7 +525,7 @@ contains end if end if - gp%levelset_norm = matmul(rotation, dist_vec) + gp%levelset_norm = f_mv3(rotation, dist_vec) end subroutine s_cuboid_levelset @@ -567,7 +614,7 @@ contains end if xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates + xyz_local = f_mv3(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates ! get distance to flat edge of cylinder side_pos = dot_product(xyz_local, dist_sides_vec) @@ -579,15 +626,15 @@ contains ! if the closest edge is flat gp%levelset = -dist_side if (f_approx_equal(dist_side, abs(side_pos - boundary(1)))) then - gp%levelset_norm = matmul(rotation, -dist_sides_vec) + gp%levelset_norm = f_mv3(rotation, -dist_sides_vec) else - gp%levelset_norm = matmul(rotation, dist_sides_vec) + gp%levelset_norm = f_mv3(rotation, dist_sides_vec) end if else gp%levelset = dist_surface xyz_local = xyz_local*dist_surface_vec xyz_local = xyz_local/max(norm2(xyz_local), sgm_eps) - gp%levelset_norm = matmul(rotation, xyz_local) + gp%levelset_norm = f_mv3(rotation, xyz_local) end if end subroutine s_cylinder_levelset @@ -630,7 +677,7 @@ contains if (p > 0) then xyz_local(3) = z_cc(k) - center(3) end if - xyz_local = matmul(inverse_rotation, xyz_local) + xyz_local = f_mv3(inverse_rotation, xyz_local) ! 3D models if (p > 0) then @@ -642,12 +689,12 @@ contains gp%levelset = -abs(gp%levelset) ! Assign the levelset_norm - gp%levelset_norm = matmul(rotation, normals(1:3)) + gp%levelset_norm = f_mv3(rotation, normals(1:3)) else ! 2D models call s_distance_normals_2D(patch_id, boundary_edge_count, xyz_local, normals, distance) gp%levelset = -abs(distance) - gp%levelset_norm = matmul(rotation, normals(1:3)) + gp%levelset_norm = f_mv3(rotation, normals(1:3)) end if end subroutine s_model_levelset diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 8ffcba6101..169af242b4 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -166,12 +166,12 @@ contains integer, intent(in) :: t_step real(wp) :: rho !< Cell-avg. density - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: alpha !< Cell-avg. volume fraction real(wp), dimension(3) :: vel !< Cell-avg. velocity #:else - real(wp), dimension(num_fluids) :: alpha !< Cell-avg. volume fraction - real(wp), dimension(num_vels) :: vel !< Cell-avg. velocity + real(wp), dimension(num_fluids_max) :: alpha !< Cell-avg. volume fraction + real(wp), dimension(3) :: vel !< Cell-avg. velocity #:endif real(wp) :: vel_sum !< Cell-avg. velocity sum real(wp) :: pres !< Cell-avg. pressure diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index 2ee806d928..59fb741aef 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -4,7 +4,7 @@ #:include 'macros.fpp' -!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT) for azimuthal Fourier filtering in cylindrical geometries +!> @brief Forward and inverse FFT wrappers (FFTW/cuFFT/hipFFT/oneMKL) for azimuthal Fourier filtering in cylindrical geometries module m_fftw use, intrinsic :: iso_c_binding @@ -12,7 +12,14 @@ module m_fftw use m_derived_types use m_global_parameters use m_mpi_proxy -#if defined(MFC_GPU) && defined(__PGI) + ! GPU FFT backend selection: + ! cuFFT - NVHPC/PGI (OpenACC or OpenMP target) + ! hipFFT - Cray/AMD (OpenMP target) + ! oneMKL - Intel ifx (OpenMP target + dispatch construct) + ! FFTW - CPU-only builds +#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER) + use mkl_dfti_omp_offload +#elif defined(MFC_GPU) && defined(__PGI) use cufft #elif defined(MFC_GPU) use hipfort @@ -34,7 +41,18 @@ module m_fftw real(c_double), pointer :: data_real(:) !< Real data complex(c_double_complex), pointer :: data_cmplx(:) !< Complex data in Fourier space complex(c_double_complex), pointer :: data_fltr_cmplx(:) !< Filtered complex data in Fourier space -#if defined(MFC_GPU) + +#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER) + $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]') + + real(dp), allocatable, target :: data_real_gpu(:) + complex(dp), allocatable, target :: data_cmplx_gpu(:) + complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:) + $:GPU_DECLARE(create='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') + + type(DFTI_DESCRIPTOR), pointer :: fwd_plan_mkl => null() + type(DFTI_DESCRIPTOR), pointer :: bwd_plan_mkl => null() +#elif defined(MFC_GPU) $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]') real(dp), allocatable, target :: data_real_gpu(:) @@ -62,16 +80,39 @@ contains impure subroutine s_initialize_fftw_module integer :: ierr !< Generic flag used to identify and report GPU errors - ! Size of input array going into DFT real_size = p + 1 - ! Size of output array coming out of DFT cmplx_size = (p + 1)/2 + 1 x_size = m + 1 batch_size = x_size*sys_size -#if defined(MFC_GPU) +#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER) + $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]') + $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]') + + @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size)) + @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) + @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) + + ! Forward R2C descriptor: batch of real_size transforms + ierr = DftiCreateDescriptor(fwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size) + ierr = DftiSetValue(fwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size) + ierr = DftiSetValue(fwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE) + ierr = DftiSetValue(fwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX) + ierr = DftiSetValue(fwd_plan_mkl, DFTI_INPUT_DISTANCE, real_size) + ierr = DftiSetValue(fwd_plan_mkl, DFTI_OUTPUT_DISTANCE, cmplx_size) + ierr = DftiCommitDescriptor(fwd_plan_mkl) + + ! Backward C2R descriptor + ierr = DftiCreateDescriptor(bwd_plan_mkl, DFTI_DOUBLE, DFTI_REAL, 1, real_size) + ierr = DftiSetValue(bwd_plan_mkl, DFTI_NUMBER_OF_TRANSFORMS, batch_size) + ierr = DftiSetValue(bwd_plan_mkl, DFTI_PLACEMENT, DFTI_NOT_INPLACE) + ierr = DftiSetValue(bwd_plan_mkl, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX) + ierr = DftiSetValue(bwd_plan_mkl, DFTI_INPUT_DISTANCE, cmplx_size) + ierr = DftiSetValue(bwd_plan_mkl, DFTI_OUTPUT_DISTANCE, real_size) + ierr = DftiCommitDescriptor(bwd_plan_mkl) +#elif defined(MFC_GPU) rank = 1; istride = 1; ostride = 1 allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank)) @@ -80,22 +121,7 @@ contains oembed(1) = cmplx_size $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]') $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]') -#else - ! Allocate input and output DFT data sizes - fftw_real_data = fftw_alloc_real(int(real_size, c_size_t)) - fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) - fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) - ! Associate input and output data pointers with allocated memory - call c_f_pointer(fftw_real_data, data_real, [real_size]) - call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size]) - call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size]) - ! Generate plans for forward and backward DFTs - fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE) - bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE) -#endif - -#if defined(MFC_GPU) @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size)) @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) @@ -111,6 +137,19 @@ contains ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, & & HIPFFT_Z2D, batch_size) #endif +#else + ! Allocate input and output DFT data sizes + fftw_real_data = fftw_alloc_real(int(real_size, c_size_t)) + fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) + fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) + ! Associate input and output data pointers with allocated memory + call c_f_pointer(fftw_real_data, data_real, [real_size]) + call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size]) + call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size]) + + ! Generate plans for forward and backward DFTs + fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE) + bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE) #endif end subroutine s_initialize_fftw_module @@ -124,7 +163,116 @@ contains ! Restrict filter to processors that have cells adjacent to axis if (bc_y%beg >= 0) return -#if defined(MFC_GPU) +#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER) + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, cmplx_size + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_MKL_DISPATCH() + ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu) + + Nfq = 3 + $:GPU_UPDATE(device='[Nfq]') + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1) & + & *cmplx_size*x_size) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_MKL_DISPATCH() + ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu) + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1) & + & *real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + do i = 1, fourier_rings + i2 = i + $:GPU_UPDATE(device='[i2]') + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, cmplx_size + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i2, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_MKL_DISPATCH() + ierr = DftiComputeForward(fwd_plan_mkl, data_real_gpu, data_cmplx_gpu) + + Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + $:GPU_UPDATE(device='[Nfq]') + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k & + & - 1)*cmplx_size*x_size) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_MKL_DISPATCH() + ierr = DftiComputeBackward(bwd_plan_mkl, data_fltr_cmplx_gpu, data_real_gpu) + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k & + & - 1)*real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, i2, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end do +#elif defined(MFC_GPU) $:GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m @@ -292,7 +440,13 @@ contains !> Finalize the FFTW module impure subroutine s_finalize_fftw_module -#if defined(MFC_GPU) +#if defined(MFC_GPU) && defined(__INTEL_LLVM_COMPILER) + integer :: ierr !< Generic flag used to identify and report GPU errors + + @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu) + ierr = DftiFreeDescriptor(fwd_plan_mkl) + ierr = DftiFreeDescriptor(bwd_plan_mkl) +#elif defined(MFC_GPU) integer :: ierr !< Generic flag used to identify and report GPU errors @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu) diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp index 90e78d04df..85eb77cbb4 100644 --- a/src/simulation/m_hyperelastic.fpp +++ b/src/simulation/m_hyperelastic.fpp @@ -77,13 +77,13 @@ contains type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(10) :: tensora, tensorb #:else real(wp), dimension(tensor_size) :: tensora, tensorb #:endif - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: alpha_k, alpha_rho_k #:else real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k @@ -194,13 +194,13 @@ contains subroutine s_neoHookean_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - type(scalar_field), dimension(b_size), intent(inout) :: btensor_in - real(wp), intent(in) :: G_param - integer, intent(in) :: j, k, l - real(wp) :: trace - real(wp), parameter :: f13 = 1._wp/3._wp - integer :: i !< Generic loop iterators + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf + type(scalar_field), dimension(:), intent(inout) :: btensor_in + real(wp), intent(in) :: G_param + integer, intent(in) :: j, k, l + real(wp) :: trace + real(wp), parameter :: f13 = 1._wp/3._wp + integer :: i !< Generic loop iterators ! tensor is the symmetric tensor & calculate the trace of the tensor trace = btensor_in(1)%sf(j, k, l) + btensor_in(3)%sf(j, k, l) + btensor_in(6)%sf(j, k, l) @@ -222,13 +222,13 @@ contains subroutine s_Mooney_Rivlin_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - type(scalar_field), dimension(b_size), intent(inout) :: btensor_in - real(wp), intent(in) :: G_param - integer, intent(in) :: j, k, l - real(wp) :: trace - real(wp), parameter :: f13 = 1._wp/3._wp - integer :: i !< Generic loop iterators + type(scalar_field), dimension(:), intent(inout) :: q_prim_vf + type(scalar_field), dimension(:), intent(inout) :: btensor_in + real(wp), intent(in) :: G_param + integer, intent(in) :: j, k, l + real(wp) :: trace + real(wp), parameter :: f13 = 1._wp/3._wp + integer :: i !< Generic loop iterators ! TODO: Make 1D and 2D capable trace = btensor_in(1)%sf(j, k, l) + btensor_in(3)%sf(j, k, l) + btensor_in(6)%sf(j, k, l) diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index b21483e097..4fba39e80e 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -136,7 +136,7 @@ contains integer, intent(in) :: xp, yp !< integers containing the periodicity projection information real(wp) :: f, ca_in, pa, ma, ta real(wp) :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c - integer :: i, j, k, il, ir, jl, jr + integer :: i, j, k, kk, il, ir, jl, jr integer :: Np1, Np2 integer :: encoded_patch_id real(wp), dimension(1:3) :: xy_local, offset !< x and y coordinates in local IB frame @@ -227,8 +227,8 @@ contains call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir) call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr) - $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, ma, & - & ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2) + $:GPU_PARALLEL_LOOP(private='[i, j, xy_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, & + & ma, ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2) do j = jl, jr do i = il, ir xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB @@ -246,8 +246,8 @@ contains end if if (xy_local(2) >= 0._wp) then k = 1 - do while (airfoil_grid_u(k)%x < xy_local(1) .and. k <= Np) - k = k + 1 + do kk = 1, Np - 1 + if (airfoil_grid_u(kk)%x < xy_local(1)) k = kk + 1 end do if (f_approx_equal(airfoil_grid_u(k)%x, xy_local(1))) then if (xy_local(2) <= airfoil_grid_u(k)%y) then @@ -261,8 +261,8 @@ contains end if else k = 1 - do while (airfoil_grid_l(k)%x < xy_local(1)) - k = k + 1 + do kk = 1, Np - 1 + if (airfoil_grid_l(kk)%x < xy_local(1)) k = kk + 1 end do if (f_approx_equal(airfoil_grid_l(k)%x, xy_local(1))) then if (xy_local(2) >= airfoil_grid_l(k)%y) then @@ -290,7 +290,7 @@ contains type(integer_field), intent(inout) :: ib_markers integer, intent(in) :: xp, yp, zp !< integers containing the periodicity projection information real(wp) :: lz, z_max, z_min, f, ca_in, pa, ma, ta, xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c - integer :: i, j, k, l, il, ir, jl, jr, ll, lr + integer :: i, j, k, kk, l, il, ir, jl, jr, ll, lr integer :: Np1, Np2 integer :: encoded_patch_id real(wp), dimension(1:3) :: xyz_local, center, offset !< x, y, z coordinates in local IB frame @@ -385,8 +385,8 @@ contains call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr) call get_bounding_indices(center(3) - ca_in, center(3) + ca_in, z_cc, ll, lr) - $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, f]', copyin='[encoded_patch_id, center, inverse_rotation, offset, & - & ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3) + $:GPU_PARALLEL_LOOP(private='[i, j, l, xyz_local, k, kk, f]', copyin='[encoded_patch_id, center, inverse_rotation, & + & offset, ma, ca_in, airfoil_grid_u, airfoil_grid_l, z_min, z_max]', collapse=3) do l = ll, lr do j = jl, jr do i = il, ir @@ -399,8 +399,8 @@ contains if (xyz_local(1) >= 0._wp .and. xyz_local(1) <= ca_in) then if (xyz_local(2) >= 0._wp) then k = 1 - do while (airfoil_grid_u(k)%x < xyz_local(1)) - k = k + 1 + do kk = 1, Np - 1 + if (airfoil_grid_u(kk)%x < xyz_local(1)) k = kk + 1 end do if (f_approx_equal(airfoil_grid_u(k)%x, xyz_local(1))) then if (xyz_local(2) <= airfoil_grid_u(k)%y) then @@ -415,8 +415,8 @@ contains end if else k = 1 - do while (airfoil_grid_l(k)%x < xyz_local(1)) - k = k + 1 + do kk = 1, Np - 1 + if (airfoil_grid_l(kk)%x < xyz_local(1)) k = kk + 1 end do if (f_approx_equal(airfoil_grid_l(k)%x, xyz_local(1))) then if (xyz_local(2) >= airfoil_grid_l(k)%y) then diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 39ec6b1470..0ac42bfff5 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -721,7 +721,7 @@ contains & nmom_IP, pb_in, mv_in, presb_IP, massv_IP) $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf !< Primitive Variables + type(scalar_field), dimension(:), intent(in) :: q_prim_vf !< Primitive Variables real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(in) :: pb_in, mv_in type(ghost_point), intent(in) :: gp real(wp), intent(inout) :: pres_IP @@ -730,7 +730,7 @@ contains #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3), intent(inout) :: alpha_IP, alpha_rho_IP #:else - real(wp), dimension(num_fluids), intent(inout) :: alpha_IP, alpha_rho_IP + real(wp), dimension(:), intent(inout) :: alpha_IP, alpha_rho_IP #:endif real(wp), optional, dimension(:), intent(inout) :: r_IP, v_IP, pb_IP, mv_IP real(wp), optional, dimension(:), intent(inout) :: nmom_IP diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 5f1fa9d73b..f326d308bd 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -2640,11 +2640,11 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - real(wp), intent(in) :: E_L, gamma_L, pi_inf_L, rho_L - real(wp), intent(in) :: E_R, gamma_R, pi_inf_R, rho_R - real(wp), dimension(num_dims), intent(in) :: vel_L, vel_R - real(wp), intent(out) :: pres_L, pres_R, cfl - real(wp) :: a_L, a_R + real(wp), intent(in) :: E_L, gamma_L, pi_inf_L, rho_L + real(wp), intent(in) :: E_R, gamma_R, pi_inf_R, rho_R + real(wp), dimension(:), intent(in) :: vel_L, vel_R + real(wp), intent(out) :: pres_L, pres_R, cfl + real(wp) :: a_L, a_R if (num_dims == 2) then pres_L = (E_L - pi_inf_L - 0.5_wp*rho_L*(vel_L(1)**2._wp + vel_L(2)**2._wp))/gamma_L diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp index 8ca939de7d..4cb3e847d6 100644 --- a/src/simulation/m_pressure_relaxation.fpp +++ b/src/simulation/m_pressure_relaxation.fpp @@ -51,8 +51,8 @@ contains !> The main pressure relaxation procedure subroutine s_pressure_relaxation_procedure(q_cons_vf) - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - integer :: j, k, l + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + integer :: j, k, l $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3) do l = 0, p @@ -71,8 +71,8 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - integer, intent(in) :: j, k, l + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + integer, intent(in) :: j, k, l ! Volume fraction correction if (mpp_lim) call s_correct_volume_fractions(q_cons_vf, j, k, l) @@ -92,9 +92,9 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf - integer, intent(in) :: j, k, l - integer :: i + type(scalar_field), dimension(:), intent(in) :: q_cons_vf + integer, intent(in) :: j, k, l + integer :: i s_needs_pressure_relaxation = .true. $:GPU_LOOP(parallelism='[seq]') @@ -111,10 +111,10 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - integer, intent(in) :: j, k, l - real(wp) :: sum_alpha - integer :: i + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + integer, intent(in) :: j, k, l + real(wp) :: sum_alpha + integer :: i sum_alpha = 0._wp $:GPU_LOOP(parallelism='[seq]') @@ -141,13 +141,13 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - integer, intent(in) :: j, k, l - real(wp) :: pres_relax, f_pres, df_pres + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + integer, intent(in) :: j, k, l + real(wp) :: pres_relax, f_pres, df_pres #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: pres_K_init, rho_K_s #:else - real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s + real(wp), dimension(num_fluids_max) :: pres_K_init, rho_K_s #:endif integer, parameter :: MAX_ITER = 50 ! Pressure relaxation convergence tolerance @@ -214,12 +214,12 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - integer, intent(in) :: j, k, l + type(scalar_field), dimension(:), intent(inout) :: q_cons_vf + integer, intent(in) :: j, k, l #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: alpha_rho, alpha #:else - real(wp), dimension(num_fluids) :: alpha_rho, alpha + real(wp), dimension(num_fluids_max) :: alpha_rho, alpha #:endif real(wp) :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha real(wp), dimension(2) :: Re diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp index 8465d4d349..127c47ccfb 100644 --- a/src/simulation/m_qbmm.fpp +++ b/src/simulation/m_qbmm.fpp @@ -587,7 +587,7 @@ contains $:GPU_ROUTINE(function_name='s_coeff_nonpoly',parallelism='[seq]', cray_inline=True) real(wp), intent(in) :: pres, rho, c - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(32,0:2,0:2), intent(out) :: coeffs #:else real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeffs @@ -666,7 +666,7 @@ contains $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', cray_inline=True) real(wp), intent(in) :: pres, rho, c - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(32,0:2,0:2), intent(out) :: coeffs #:else real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeffs @@ -741,14 +741,14 @@ contains real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: rhs_mv type(int_bounds_info), intent(in) :: ix, iy, iz - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(6) :: moms, msum real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht #:else real(wp), dimension(nmom) :: moms, msum real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht #:endif - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(32,0:2,0:2) :: coeff #:else real(wp), dimension(nterms,0:2,0:2) :: coeff @@ -930,7 +930,7 @@ contains $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', cray_inline=True) real(wp), intent(in) :: pres, rho, c - #:if USING_AMD + #:if (USING_AMD or USING_INTEL) real(wp), dimension(32,0:2,0:2), intent(out) :: coeff #:else real(wp), dimension(nterms,0:2,0:2), intent(out) :: coeff @@ -949,8 +949,8 @@ contains $:GPU_ROUTINE(function_name='s_chyqmom',parallelism='[seq]', cray_inline=True) - real(wp), dimension(nmom), intent(in) :: momin - real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY + real(wp), dimension(nmom), intent(in) :: momin + real(wp), dimension(:), intent(inout) :: wght, abscX, abscY ! Local variables real(wp), dimension(0:2,0:2) :: moms @@ -1027,10 +1027,10 @@ contains function f_quad(abscX, abscY, wght_in, q, r, s) $:GPU_ROUTINE(parallelism='[seq]') - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in #:else - real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in + real(wp), dimension(:,:), intent(in) :: abscX, abscY, wght_in #:endif real(wp), intent(in) :: q, r, s real(wp) :: f_quad_RV, f_quad @@ -1053,10 +1053,10 @@ contains function f_quad2D(abscX, abscY, wght_in, pow) $:GPU_ROUTINE(parallelism='[seq]') - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in #:else - real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in + real(wp), dimension(:), intent(in) :: abscX, abscY, wght_in #:endif real(wp), dimension(3), intent(in) :: pow real(wp) :: f_quad2D diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp index 0a8bc32212..8ece8fa0ff 100644 --- a/src/simulation/m_riemann_solvers.fpp +++ b/src/simulation/m_riemann_solvers.fpp @@ -4389,8 +4389,8 @@ contains real(wp), dimension(3, 3), intent(in) :: vel_grad_avg real(wp), dimension(3, 3), intent(out) :: tau_shear_out #:else - real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg - real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out + real(wp), dimension(:,:), intent(in) :: vel_grad_avg + real(wp), dimension(:,:), intent(out) :: tau_shear_out #:endif real(wp), intent(in) :: Re_shear real(wp), intent(in) :: divergence_v @@ -4422,7 +4422,7 @@ contains #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3, 3), intent(out) :: tau_bulk_out #:else - real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out + real(wp), dimension(:,:), intent(out) :: tau_bulk_out #:endif ! Local variables diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp index 4a0978919e..8392fdc6bc 100644 --- a/src/simulation/m_sim_helpers.fpp +++ b/src/simulation/m_sim_helpers.fpp @@ -45,11 +45,11 @@ contains function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), dimension(num_vels), intent(in) :: vel - real(wp), intent(in) :: c - integer, intent(in) :: j, k, l - real(wp) :: cfl_terms - real(wp) :: fltr_dtheta + real(wp), dimension(:), intent(in) :: vel + real(wp), intent(in) :: c + integer, intent(in) :: j, k, l + real(wp) :: cfl_terms + real(wp) :: fltr_dtheta fltr_dtheta = f_compute_filtered_dtheta(k, l) @@ -74,13 +74,13 @@ contains $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', cray_inline=True) - type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf + type(scalar_field), intent(in), dimension(:) :: q_prim_vf #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), intent(inout), dimension(3) :: alpha real(wp), intent(inout), dimension(3) :: vel #:else - real(wp), intent(inout), dimension(num_fluids) :: alpha - real(wp), intent(inout), dimension(num_vels) :: vel + real(wp), intent(inout), dimension(:) :: alpha + real(wp), intent(inout), dimension(:) :: vel #:endif real(wp), intent(inout) :: rho, gamma, pi_inf, vel_sum, H, pres real(wp), intent(out) :: qv @@ -89,7 +89,7 @@ contains #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: alpha_rho, Gs #:else - real(wp), dimension(num_fluids) :: alpha_rho, Gs + real(wp), dimension(num_fluids_max) :: alpha_rho, Gs #:endif real(wp) :: E, G_local integer :: i @@ -141,7 +141,7 @@ contains subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), intent(in), dimension(num_vels) :: vel + real(wp), intent(in), dimension(:) :: vel real(wp), intent(in) :: c, rho real(wp), dimension(0:m,0:n,0:p), intent(inout) :: icfl_sf real(wp), dimension(0:m,0:n,0:p), intent(inout), optional :: vcfl_sf, Rc_sf @@ -191,7 +191,7 @@ contains subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), dimension(num_vels), intent(in) :: vel + real(wp), dimension(:), intent(in) :: vel real(wp), intent(in) :: c, rho real(wp), dimension(0:m,0:n,0:p), intent(inout) :: max_dt real(wp), dimension(2), intent(in) :: Re_l diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp index 0b7caf7a47..f83650375d 100644 --- a/src/simulation/m_surface_tension.fpp +++ b/src/simulation/m_surface_tension.fpp @@ -65,7 +65,7 @@ contains integer, intent(in) :: id type(int_bounds_info), intent(in) :: isx, isy, isz - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3, 3) :: Omega #:else real(wp), dimension(num_dims, num_dims) :: Omega diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 4d142eca55..762d52acaa 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -620,12 +620,12 @@ contains real(wp) :: rho !< Cell-avg. density - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: vel !< Cell-avg. velocity real(wp), dimension(3) :: alpha !< Cell-avg. volume fraction #:else - real(wp), dimension(num_vels) :: vel !< Cell-avg. velocity - real(wp), dimension(num_fluids) :: alpha !< Cell-avg. volume fraction + real(wp), dimension(3) :: vel !< Cell-avg. velocity + real(wp), dimension(num_fluids_max) :: alpha !< Cell-avg. volume fraction #:endif real(wp) :: vel_sum !< Cell-avg. velocity sum real(wp) :: pres !< Cell-avg. pressure diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp index 55cdcc343e..af99f44eb0 100644 --- a/src/simulation/m_viscous.fpp +++ b/src/simulation/m_viscous.fpp @@ -53,7 +53,7 @@ contains real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum !< Mixture variables real(wp), dimension(2) :: Re_visc - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(3) :: alpha_visc, alpha_rho_visc real(wp), dimension(3, 3) :: tau_Re #:else @@ -1266,14 +1266,14 @@ contains $:GPU_ROUTINE(parallelism='[seq]') - real(wp), dimension(1:3,1:3), intent(inout) :: viscous_stress_tensor - type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf - real(wp), intent(in) :: dynamic_viscosity - integer, intent(in) :: i, j, k - real(wp), dimension(1:3,1:3) :: velocity_gradient_tensor - real(wp), dimension(1:3) :: dx - real(wp) :: divergence - integer :: l, q !< iterators + real(wp), dimension(1:3,1:3), intent(inout) :: viscous_stress_tensor + type(scalar_field), dimension(:), intent(in) :: q_prim_vf + real(wp), intent(in) :: dynamic_viscosity + integer, intent(in) :: i, j, k + real(wp), dimension(1:3,1:3) :: velocity_gradient_tensor + real(wp), dimension(1:3) :: dx + real(wp) :: divergence + integer :: l, q !< iterators ! zero the viscous stress, collection of velocity derivatives, and spatial finite differences viscous_stress_tensor = 0._wp diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index 78e7678b1a..e19c245aec 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -895,7 +895,7 @@ contains integer, intent(in) :: weno_dir type(int_bounds_info), intent(in) :: is1_weno_d, is2_weno_d, is3_weno_d - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if not MFC_CASE_OPTIMIZATION and (USING_AMD or USING_INTEL) real(wp), dimension(-3:2) :: dvd real(wp), dimension(0:4) :: poly real(wp), dimension(0:4) :: alpha diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh index 1beb016539..6dc9c0e6e9 100644 --- a/toolchain/bootstrap/modules.sh +++ b/toolchain/bootstrap/modules.sh @@ -48,6 +48,8 @@ if [ -v $u_c ]; then log "$B""DoD$W: Carpenter Cray (cc) | Carpenter GNU (c) | Nautilus (n)" log "$OR""Florida$W: HiPerGator (h)" log "$C""WPI $W: Turing (t)" + log "$Y""Gatech$W: CRNCH RoboGator (crnch)" + log "$C""TAMU$W: ACES (aces)" log_n "($G""a$W/$G""f$W/$G""s$W/$G""w$W/$B""tuo$W/$C""b$W/$C""e$CR/$C""d/$C""dai$CR/$Y""p$CR/$R""r$CR/$B""cc$CR/$B""c$CR/$B""n$CR/$BR""o$CR/$BR""pa"$CR"/$OR""h"$CR/$C""t""$CR"): " read u_c log @@ -106,11 +108,13 @@ fi ELEMENTS="$(__extract "$u_c-all") $(__extract "$u_c-$cg")" MODULES=`echo "$ELEMENTS" | tr ' ' '\n' | grep -v = | xargs` -log " $ module load $MODULES" -if ! module load $MODULES; then - error "Failed to load modules." +if [ -n "$MODULES" ]; then + log " $ module load $MODULES" + if ! module load $MODULES; then + error "Failed to load modules." - return + return + fi fi # Export variables one line at a time so each can reference previously exported vars diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt index d1c9c8b037..f92939e144 100644 --- a/toolchain/dependencies/CMakeLists.txt +++ b/toolchain/dependencies/CMakeLists.txt @@ -32,7 +32,10 @@ if (MFC_FFTW) message(STATUS "FFTW found.") add_custom_target(fftw) else() - if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.") + add_custom_target(fftw) + else() ExternalProject_Add(fftw URL "http://www.fftw.org/fftw-3.3.10.tar.gz" CMAKE_ARGS -DBUILD_TESTS=OFF @@ -40,9 +43,6 @@ if (MFC_FFTW) "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 ) - else() - message(WARNING "The Fortran compiler vendor is Cray so FFTW3 will not be built. We will use cray-fftw instead.") - add_custom_target(fftw) endif() endif() endif() @@ -134,6 +134,9 @@ endif() -DCBLAS=OFF -DLAPACKE=OFF -DBUILD_DEPRECATED=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + "-DCMAKE_EXE_LINKER_FLAGS=-no-pie" + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" ) diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index d5a13fe182..aa4d4ac2af 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -427,6 +427,7 @@ def configure(self, case: Case): flags.append(f"-DMFC_GCov={'ON' if ARG('gcov') else 'OFF'}") flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}") flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}") + flags.append(f"-DMFC_Intel_AOT={'ON' if ARG('intel_aot') else 'OFF'}") command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath] diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index dda710f602..180044a4f3 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -99,6 +99,16 @@ def generate_fpp(self, target) -> None: thermochem_code = pyro.FortranCodeGenerator().generate("m_thermochem", sol, pyro.CodeGenerationOptions(scalar_type=real_type, directive_offload=directive_str)) + if directive_str == "mp": + # ifx -fpp strips !$omp directives produced by C-macro expansion because the + # Intel Fortran preprocessor treats '!' as a Fortran comment after expansion. + # Rewrite the GPU_ROUTINE macro calls as literal !$omp declare target lines so + # the directive is visible to the Fortran front-end, not the C preprocessor. + import re + + thermochem_code = thermochem_code.replace("#define GPU_ROUTINE(name) !$omp declare target\n", "") + thermochem_code = re.sub(r"[ \t]+GPU_ROUTINE\(\w+\)", "!$omp declare target", thermochem_code) + common.file_write(os.path.join(modules_dir, "m_thermochem.f90"), thermochem_code, True) cons.unindent() diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 82e886c064..261d8e65dc 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -85,6 +85,19 @@ def __get_template() -> Template: raise MFCException(f"Failed to find a template for --computer '{computer}'. Baked-in templates are: {format_list_to_string(list(baked.keys()), 'magenta')}.") +def __is_intel_gpu_build(case: input.MFCInputFile) -> bool: + cmake_cache = os.path.join(SIMULATION.get_staging_dirpath(case), "CMakeCache.txt") + if not os.path.isfile(cmake_cache): + return False + with open(cmake_cache) as f: + content = f.read() + # Match compiler ID entry (may or may not be present) or fall back to compiler path + if re.search(r"CMAKE_Fortran_COMPILER_ID[^=\n]*=[^\n]*IntelLLVM", content): + return True + m = re.search(r"CMAKE_Fortran_COMPILER:FILEPATH=([^\n]+)", content) + return m is not None and os.path.basename(m.group(1).strip()) in ("ifx", "ifx.exe") + + def __generate_job_script(targets, case: input.MFCInputFile): env = {} if ARG("gpus") is not None: @@ -103,6 +116,23 @@ def __generate_job_script(targets, case: input.MFCInputFile): gpu_acc = gpu_mode == gpuConfigOptions.ACC.value gpu_mp = gpu_mode == gpuConfigOptions.MP.value + if gpu_mp and __is_intel_gpu_build(case): + # Level Zero tuning for Intel GPU Max (Ponte Vecchio). + # COMMAND_BATCH=256: batch up to 256 Level Zero commands before flushing, + # allowing the GPU to stay busy while the host prepares the next batch. + # Measured ~9% throughput improvement on Intel GPU Max 1100. + env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH", "256") + # Disable per-kernel indirect access memory tracking (zeMemGetAllocProperties). + # MFC's scalar_field pointer arrays trigger indirect-access flags; this tracking + # adds ~2100 API calls/step. Disabling it is safe since all GPU allocations are + # managed via @:ALLOCATE/@:DEALLOCATE and never aliased with host memory. + env.setdefault("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY", "0") + if ARG("fastmath"): + # -cl-fast-relaxed-math: enables unsafe GPU JIT optimizations (fast transcendentals, + # fused MAD, no signed-zero semantics, finite-math-only). Matches nvfortran -gpu=fastmath. + # Only applied when the user explicitly requests --fastmath. + env.setdefault("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "-cl-fast-relaxed-math") + content = __get_template().render( **{**ARGS(), "targets": targets}, ARG=ARG, diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index 94a37be947..63dacd0884 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -21,6 +21,7 @@ class MFCConfig: single: bool = False mixed: bool = False fastmath: bool = False + intel_aot: bool = False def __hash__(self): return hash(tuple(getattr(self, f.name) for f in dataclasses.fields(self))) @@ -32,7 +33,8 @@ def from_dict(d: dict): r = MFCConfig() for field in dataclasses.fields(MFCConfig): - setattr(r, field.name, d[field.name]) + if field.name in d: + setattr(r, field.name, d[field.name]) return r diff --git a/toolchain/modules b/toolchain/modules index ea7cb36393..1240772c94 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -116,3 +116,19 @@ h-gpu NVCOMPILER_COMM_LIBS_HOME=/apps/compilers/nvhpc/25.9/Linux_x86_64/25.9/com t WPI Turing t-all slurm t-cpu gcc/12.1.0/i6yk33f openmpi/4.1.3/ebae7zc python/3.13.5/6anz4qy + +aces TAMU HPRC ACES (Intel GPU Max 1100, Ponte Vecchio) +aces-all iimpi/2025a imkl/2025.1.0 CMake/3.31.3 Python/3.13.1 +aces-gpu I_MPI_F90=ifx +aces-gpu FC=mpif90 + +crnch GT CRNCH RoboGator (Intel GPU Max 1100, Ponte Vecchio) +crnch-gpu FC=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin/mpiifx +crnch-gpu PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/bin:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/bin:${PATH} +crnch-gpu MKLROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0 +crnch-gpu I_MPI_ROOT=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14 +crnch-gpu LD_LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/libfabric/lib:${LD_LIBRARY_PATH} +crnch-gpu LIBRARY_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mkl/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/compiler/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/2025.0/lib:/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/lib:${LIBRARY_PATH} +crnch-gpu I_MPI_FABRICS=shm:ofi +crnch-gpu FI_PROVIDER=tcp +crnch-gpu FI_PROVIDER_PATH=/net/projects/tools/x86_64/rhel-8/intel-oneapi/2025.1/mpi/2021.14/libfabric/lib/prov diff --git a/toolchain/templates/aces.mako b/toolchain/templates/aces.mako new file mode 100644 index 0000000000..40c38290c0 --- /dev/null +++ b/toolchain/templates/aces.mako @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +<%namespace name="helpers" file="helpers.mako"/> + +% if engine == 'batch': +#SBATCH --nodes=${nodes} +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --cpus-per-task=1 +#SBATCH --job-name="${name}" +#SBATCH --time=${walltime} +% if partition: +#SBATCH --partition=${partition} +% else: +#SBATCH --partition=pvc +% endif +% if account: +#SBATCH --account="${account}" +% endif +% if gpu_enabled: +#SBATCH --gres=gpu:pvc:${tasks_per_node} +#SBATCH --mem=32G +% endif +#SBATCH --output="${name}.out" +#SBATCH --error="${name}.err" +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, FAIL" +% endif +% endif + +${helpers.template_prologue()} + +ok ":) Loading modules:\n" +cd "${MFC_ROOT_DIR}" +. ./mfc.sh load -c aces -m ${'g' if gpu_enabled else 'c'} +cd - > /dev/null +echo + +% for target in targets: + ${helpers.run_prologue(target)} + + % if not mpi: + (set -x; ${profiler} "${target.get_install_binpath(case)}") + % else: + (set -x; ${profiler} \ + mpirun -np ${nodes*tasks_per_node} \ + "${target.get_install_binpath(case)}") + % endif + + ${helpers.run_epilogue(target)} + + echo +% endfor + +${helpers.template_epilogue()} diff --git a/toolchain/templates/include/helpers.mako b/toolchain/templates/include/helpers.mako index a9493f71d0..5c729827c5 100644 --- a/toolchain/templates/include/helpers.mako +++ b/toolchain/templates/include/helpers.mako @@ -32,6 +32,15 @@ END export ${key}='${value}' % endfor + % if gpu_mp: + # ifx + Level Zero: the OpenMP SPIR-V JIT runs on the main thread stack at + # startup, consuming 2-4 MB before user code runs. ifx also stack-allocates + # compiler-generated temporaries with no size threshold (unlike gfortran's + # 32 KB default). Together they overflow the default 12.5 MB stack for any + # non-trivial 3D case. Removing the limit avoids SIGSEGV at launch. + ulimit -s unlimited + % endif + t_start=$(date +%s) % else: echo MFC case # ${name} @ ${input}: