From e1bfa9fe6f94c85933c96935f5382bbfe1242f2b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 6 May 2026 18:56:39 -0400 Subject: [PATCH 1/3] Fix Cray compiler GPU routine macros: INLINENEVER directive and cray_inline flag --- src/common/include/parallel_macros.fpp | 5 ++++- src/common/m_variables_conversion.fpp | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index b1382ec49a..05ab575283 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -67,10 +67,11 @@ #:if not isinstance(function_name, str) #:stop "When using cray_noinline, function name must be given and given as a string" #:endif - #:set cray_noinline_directive = ('!DIR$ NOINLINE ' + function_name).strip('\n') + #:set cray_noinline_directive = ('!DIR$ INLINENEVER ' + function_name).strip('\n') #ifdef _CRAYFTN #if MFC_OpenACC $:acc_directive + $:cray_noinline_directive #elif MFC_OpenMP $:omp_directive #else @@ -91,6 +92,7 @@ #ifdef _CRAYFTN #if MFC_OpenACC $:acc_directive + $:cray_directive #elif MFC_OpenMP $:omp_directive #else @@ -98,6 +100,7 @@ #endif #elif MFC_OpenACC $:acc_directive + $:cray_directive #elif MFC_OpenMP $:omp_directive #endif diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp index 2417da1adf..3b0126179d 100644 --- a/src/common/m_variables_conversion.fpp +++ b/src/common/m_variables_conversion.fpp @@ -80,7 +80,7 @@ contains !> Compute the pressure from the appropriate equation of state subroutine s_compute_pressure(energy, alf, dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T, stress, mom, G, pres_mag) - $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', cray_noinline=True) + $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', cray_inline=True) real(stp), intent(in) :: energy, alf real(wp), intent(in) :: dyn_p @@ -499,7 +499,6 @@ contains real(wp) :: E, D !< Prim/Cons variables within Newton-Raphson iteration real(wp) :: f, dGa_dW, dp_dW, df_dW !< Functions within Newton-Raphson iteration integer :: iter !< Newton-Raphson iteration counter - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, Re_K, nRtmp, rho_K, gamma_K, pi_inf_K, qv_K, dyn_pres_K, & & rhoYks, B, pres, vftmp, nbub_sc, G_K, T, pres_mag, Ga, B2, m2, S, W, dW, E, D, f, dGa_dW, dp_dW, & & df_dW, iter]') From ee97ba7fbc88d443f9c2345ec617751faf7390e7 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 6 May 2026 20:43:08 -0400 Subject: [PATCH 2/3] Fix cray_inline: don't emit INLINEALWAYS on non-Cray OpenACC builds --- src/common/include/parallel_macros.fpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index 05ab575283..d8e230dcab 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -100,7 +100,6 @@ #endif #elif MFC_OpenACC $:acc_directive - $:cray_directive #elif MFC_OpenMP $:omp_directive #endif From 1529f98c4cd6da8c2810a8c98f970b4f3981fa78 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 12 May 2026 16:01:36 -0500 Subject: [PATCH 3/3] style: apply ffmt v0.4.0 formatting --- src/common/include/2dHardcodedIC.fpp | 8 ++++---- src/common/include/case.fpp | 2 -- src/simulation/m_ib_patches.fpp | 4 ++-- src/simulation/m_rhs.fpp | 6 +++--- src/simulation/m_riemann_solvers.fpp | 25 ++++++++++++------------- src/simulation/m_weno.fpp | 12 ++++++------ 6 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/common/include/2dHardcodedIC.fpp b/src/common/include/2dHardcodedIC.fpp index 3f4dc4bcbb..bb84830264 100644 --- a/src/common/include/2dHardcodedIC.fpp +++ b/src/common/include/2dHardcodedIC.fpp @@ -285,11 +285,11 @@ & 0) = 1.0*(1.0 - (1.0/1.0)*(5.0/(2.0*pi))*(5.0/(8.0*1.0*(1.4 + 1.0)*pi))*exp(2.0*1.0*(1.0 - (x_cc(i) & & - patch_icpp(1)%x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0)))**1.4 q_prim_vf(eqn_idx%mom%beg + 0)%sf(i, j, & - & 0) = 0.0 + (y_cc(j) - patch_icpp(1)%y_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) - patch_icpp(1) & - & %x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0)) + & 0) = 0.0 + (y_cc(j) - patch_icpp(1)%y_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) & + & - patch_icpp(1)%x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0)) q_prim_vf(eqn_idx%mom%beg + 1)%sf(i, j, & - & 0) = 0.0 - (x_cc(i) - patch_icpp(1)%x_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) - patch_icpp(1) & - & %x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0)) + & 0) = 0.0 - (x_cc(i) - patch_icpp(1)%x_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) & + & - patch_icpp(1)%x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0)) end if case (281) ! Acoustic pulse ! This is patch is hard-coded for test suite optimization used in the 2D_acoustic_pulse case: This analytic patch uses diff --git a/src/common/include/case.fpp b/src/common/include/case.fpp index aa0e0637b9..8f5fc4777b 100644 --- a/src/common/include/case.fpp +++ b/src/common/include/case.fpp @@ -4,10 +4,8 @@ ! For pre-process. #:def analytical() - #:enddef ! For moving immersed boundaries in simulation #:def mib_analytical() - #:enddef diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index cf3ad9ecfa..bbdab66204 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -395,8 +395,8 @@ contains do l = ll, lr do j = jl, jr do i = il, ir - xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), & - & z_cc(l) - center(3)] ! get coordinate frame centered on IB + ! get coordinate frame centered on IB + xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(l) - center(3)] xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates xyz_local = xyz_local - offset ! airfoils are a patch that require a centroid offset diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index ce475532b8..3d90a589d3 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -517,9 +517,8 @@ contains type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in - real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), & - & intent(inout) & - & :: rhs_pb ! TODO :: I think these other two variables need to be stp as well, but it doesn't compile like that right now + ! TODO :: I think these other two variables need to be stp as well, but it doesn't compile like that right now + real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: rhs_pb real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: mv_in real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: rhs_mv integer, intent(in) :: t_step @@ -530,6 +529,7 @@ contains integer(kind=8) :: i, j, k, l, q !< Generic loop iterators ! RHS: halo exchange -> reconstruct -> Riemann solve -> flux difference -> source terms + call nvtxStartRange("COMPUTE-RHS") call cpu_time(t_start) diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp index 33661b54d7..88a2210c3f 100644 --- a/src/simulation/m_riemann_solvers.fpp +++ b/src/simulation/m_riemann_solvers.fpp @@ -423,11 +423,11 @@ contains pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp) #:endif E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L - E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R & - & + pres_mag%R ! includes magnetic energy + ! includes magnetic energy + E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R H_L = (E_L + pres_L - pres_mag%L)/rho_L - H_R = (E_R + pres_R - pres_mag%R) & - & /rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + H_R = (E_R + pres_R - pres_mag%R)/rho_R else E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R @@ -754,9 +754,8 @@ contains & eqn_idx%psi) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, & & eqn_idx%psi)))/(s_M - s_P) else - flux_rs${XYZ}$_vf(j, k, l, & - & eqn_idx%B%beg + norm_dir - 1) & - & = 0._wp ! Without hyperbolic cleaning, make sure flux of B_normal is identically zero + ! Without hyperbolic cleaning, make sure flux of B_normal is identically zero + flux_rs${XYZ}$_vf(j, k, l, eqn_idx%B%beg + norm_dir - 1) = 0._wp end if end if flux_src_rs${XYZ}$_vf(j, k, l, eqn_idx%adv%beg) = 0._wp @@ -1113,11 +1112,11 @@ contains pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp) pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp) E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L - E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R & - & + pres_mag%R ! includes magnetic energy + ! includes magnetic energy + E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R H_L = (E_L + pres_L - pres_mag%L)/rho_L - H_R = (E_R + pres_R - pres_mag%R) & - & /rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + H_R = (E_R + pres_R - pres_mag%R)/rho_R else E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R @@ -3463,8 +3462,8 @@ contains E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L - H_no_mag%R = (E%R + pres%R - pres_mag%R) & - & /rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! (2) Compute fast wave speeds call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, & diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index c89e35d438..2de3ea1150 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -1178,8 +1178,8 @@ contains delta(:) = 0._wp beta(:) = weno_eps - if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3,k, l, & - & i) ! temporary field value array for clarity + ! temporary field value array for clarity + if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3,k, l, i) if (.not. teno) then dvd(2) = v_rs_ws_${XYZ}$ (j + 3, k, l, i) - v_rs_ws_${XYZ}$ (j + 2, k, l, i) @@ -1279,8 +1279,8 @@ contains tau = abs(beta(3) - beta(0)) ! Equation 50 $:GPU_LOOP(parallelism='[seq]') do q = 0, weno_num_stencils - alpha(q) = d_cbL_${XYZ}$ (q, & - & j)*(1._wp + (tau/beta(q))**wenoz_q) ! wenoz_q = 2,3,4 for stability + ! wenoz_q = 2,3,4 for stability + alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q) end do else if (teno) then #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3 @@ -1353,8 +1353,8 @@ contains else if (wenoz) then $:GPU_LOOP(parallelism='[seq]') do q = 0, weno_num_stencils - alpha(q) = d_cbR_${XYZ}$ (q, & - & j)*(1._wp + (tau/beta(q))**wenoz_q) ! wenoz_q = 2,3,4 for stability + ! wenoz_q = 2,3,4 for stability + alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q) end do else if (teno) then $:GPU_LOOP(parallelism='[seq]')