From e1bfa9fe6f94c85933c96935f5382bbfe1242f2b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Wed, 6 May 2026 18:56:39 -0400
Subject: [PATCH 1/3] Fix Cray compiler GPU routine macros: INLINENEVER
 directive and cray_inline flag

---
 src/common/include/parallel_macros.fpp | 5 ++++-
 src/common/m_variables_conversion.fpp  | 3 +--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index b1382ec49a..05ab575283 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -67,10 +67,11 @@
         #:if not isinstance(function_name, str)
             #:stop "When using cray_noinline, function name must be given and given as a string"
         #:endif
-        #:set cray_noinline_directive = ('!DIR$ NOINLINE ' + function_name).strip('\n')
+        #:set cray_noinline_directive = ('!DIR$ INLINENEVER ' + function_name).strip('\n')
 #ifdef _CRAYFTN
 #if MFC_OpenACC
         $:acc_directive
+        $:cray_noinline_directive
 #elif MFC_OpenMP
         $:omp_directive
 #else
@@ -91,6 +92,7 @@
 #ifdef _CRAYFTN
 #if MFC_OpenACC
         $:acc_directive
+        $:cray_directive
 #elif MFC_OpenMP
         $:omp_directive
 #else
@@ -98,6 +100,7 @@
 #endif
 #elif MFC_OpenACC
         $:acc_directive
+        $:cray_directive
 #elif MFC_OpenMP
         $:omp_directive
 #endif
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 2417da1adf..3b0126179d 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -80,7 +80,7 @@ contains
     !> Compute the pressure from the appropriate equation of state
     subroutine s_compute_pressure(energy, alf, dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T, stress, mom, G, pres_mag)
 
-        $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', cray_noinline=True)
+        $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', cray_inline=True)
 
         real(stp), intent(in)           :: energy, alf
         real(wp), intent(in)            :: dyn_p
@@ -499,7 +499,6 @@ contains
         real(wp)               :: E, D                     !< Prim/Cons variables within Newton-Raphson iteration
         real(wp)               :: f, dGa_dW, dp_dW, df_dW  !< Functions within Newton-Raphson iteration
         integer                :: iter                     !< Newton-Raphson iteration counter
-
         $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, Re_K, nRtmp, rho_K, gamma_K, pi_inf_K, qv_K, dyn_pres_K, &
                             & rhoYks, B, pres, vftmp, nbub_sc, G_K, T, pres_mag, Ga, B2, m2, S, W, dW, E, D, f, dGa_dW, dp_dW, &
                             & df_dW, iter]')

From ee97ba7fbc88d443f9c2345ec617751faf7390e7 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Wed, 6 May 2026 20:43:08 -0400
Subject: [PATCH 2/3] Fix cray_inline: don't emit INLINEALWAYS on non-Cray
 OpenACC builds

---
 src/common/include/parallel_macros.fpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index 05ab575283..d8e230dcab 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -100,7 +100,6 @@
 #endif
 #elif MFC_OpenACC
         $:acc_directive
-        $:cray_directive
 #elif MFC_OpenMP
         $:omp_directive
 #endif

From 1529f98c4cd6da8c2810a8c98f970b4f3981fa78 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 12 May 2026 16:01:36 -0500
Subject: [PATCH 3/3] style: apply ffmt v0.4.0 formatting

---
 src/common/include/2dHardcodedIC.fpp |  8 ++++----
 src/common/include/case.fpp          |  2 --
 src/simulation/m_ib_patches.fpp      |  4 ++--
 src/simulation/m_rhs.fpp             |  6 +++---
 src/simulation/m_riemann_solvers.fpp | 25 ++++++++++++-------------
 src/simulation/m_weno.fpp            | 12 ++++++------
 6 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/src/common/include/2dHardcodedIC.fpp b/src/common/include/2dHardcodedIC.fpp
index 3f4dc4bcbb..bb84830264 100644
--- a/src/common/include/2dHardcodedIC.fpp
+++ b/src/common/include/2dHardcodedIC.fpp
@@ -285,11 +285,11 @@
                       & 0) = 1.0*(1.0 - (1.0/1.0)*(5.0/(2.0*pi))*(5.0/(8.0*1.0*(1.4 + 1.0)*pi))*exp(2.0*1.0*(1.0 - (x_cc(i) &
                       & - patch_icpp(1)%x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0)))**1.4
             q_prim_vf(eqn_idx%mom%beg + 0)%sf(i, j, &
-                      & 0) = 0.0 + (y_cc(j) - patch_icpp(1)%y_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) - patch_icpp(1) &
-                      & %x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0))
+                      & 0) = 0.0 + (y_cc(j) - patch_icpp(1)%y_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) &
+                      & - patch_icpp(1)%x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0))
             q_prim_vf(eqn_idx%mom%beg + 1)%sf(i, j, &
-                      & 0) = 0.0 - (x_cc(i) - patch_icpp(1)%x_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) - patch_icpp(1) &
-                      & %x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0))
+                      & 0) = 0.0 - (x_cc(i) - patch_icpp(1)%x_centroid)*(5.0/(2.0*pi))*exp(1.0*(1.0 - (x_cc(i) &
+                      & - patch_icpp(1)%x_centroid)**2.0 - (y_cc(j) - patch_icpp(1)%y_centroid)**2.0))
         end if
     case (281)  ! Acoustic pulse
         ! This is patch is hard-coded for test suite optimization used in the 2D_acoustic_pulse case: This analytic patch uses
diff --git a/src/common/include/case.fpp b/src/common/include/case.fpp
index aa0e0637b9..8f5fc4777b 100644
--- a/src/common/include/case.fpp
+++ b/src/common/include/case.fpp
@@ -4,10 +4,8 @@
 
 ! For pre-process.
 #:def analytical()
-
 #:enddef
 
 ! For moving immersed boundaries in simulation
 #:def mib_analytical()
-
 #:enddef
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index cf3ad9ecfa..bbdab66204 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -395,8 +395,8 @@ contains
         do l = ll, lr
             do j = jl, jr
                 do i = il, ir
-                    xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), &
-                                      & z_cc(l) - center(3)]  ! get coordinate frame centered on IB
+                    ! get coordinate frame centered on IB
+                    xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(l) - center(3)]
                     xyz_local = matmul(inverse_rotation, xyz_local)  ! rotate the frame into the IB's coordinates
                     xyz_local = xyz_local - offset  ! airfoils are a patch that require a centroid offset
 
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index ce475532b8..3d90a589d3 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -517,9 +517,8 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout)                                     :: rhs_vf
         real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: pb_in
 
-        real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), &
-             & intent(inout) &
-             & :: rhs_pb  ! TODO :: I think these other two variables need to be stp as well, but it doesn't compile like that right now
+        ! TODO :: I think these other two variables need to be stp as well, but it doesn't compile like that right now
+        real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: rhs_pb
         real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: mv_in
         real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(inout) :: rhs_mv
         integer, intent(in) :: t_step
@@ -530,6 +529,7 @@ contains
         integer(kind=8) :: i, j, k, l, q  !< Generic loop iterators
 
         ! RHS: halo exchange -> reconstruct -> Riemann solve -> flux difference -> source terms
+
         call nvtxStartRange("COMPUTE-RHS")
 
         call cpu_time(t_start)
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 33661b54d7..88a2210c3f 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -423,11 +423,11 @@ contains
                                     pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
                                 #:endif
                                 E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L
-                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R &
-                                    & + pres_mag%R  ! includes magnetic energy
+                                ! includes magnetic energy
+                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R
                                 H_L = (E_L + pres_L - pres_mag%L)/rho_L
-                                H_R = (E_R + pres_R - pres_mag%R) &
-                                       & /rho_R  ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                                ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                                H_R = (E_R + pres_R - pres_mag%R)/rho_R
                             else
                                 E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
                                 E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
@@ -754,9 +754,8 @@ contains
                                                           & eqn_idx%psi) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, &
                                                           & eqn_idx%psi)))/(s_M - s_P)
                                     else
-                                        flux_rs${XYZ}$_vf(j, k, l, &
-                                                          & eqn_idx%B%beg + norm_dir - 1) &
-                                                          & = 0._wp  ! Without hyperbolic cleaning, make sure flux of B_normal is identically zero
+                                        ! Without hyperbolic cleaning, make sure flux of B_normal is identically zero
+                                        flux_rs${XYZ}$_vf(j, k, l, eqn_idx%B%beg + norm_dir - 1) = 0._wp
                                     end if
                                 end if
                                 flux_src_rs${XYZ}$_vf(j, k, l, eqn_idx%adv%beg) = 0._wp
@@ -1113,11 +1112,11 @@ contains
                                 pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
                                 pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
                                 E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L
-                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R &
-                                    & + pres_mag%R  ! includes magnetic energy
+                                ! includes magnetic energy
+                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R
                                 H_L = (E_L + pres_L - pres_mag%L)/rho_L
-                                H_R = (E_R + pres_R - pres_mag%R) &
-                                       & /rho_R  ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                                ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                                H_R = (E_R + pres_R - pres_mag%R)/rho_R
                             else
                                 E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
                                 E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
@@ -3463,8 +3462,8 @@ contains
                             E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L
                             E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R  ! includes magnetic energy
                             H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L
-                            H_no_mag%R = (E%R + pres%R - pres_mag%R) &
-                                          & /rho%R  ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                            ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                            H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R
 
                             ! (2) Compute fast wave speeds
                             call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, &
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index c89e35d438..2de3ea1150 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -1178,8 +1178,8 @@ contains
                                         delta(:) = 0._wp
                                         beta(:) = weno_eps
 
-                                        if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3,k, l, &
-                                            & i)  ! temporary field value array for clarity
+                                        ! temporary field value array for clarity
+                                        if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3,k, l, i)
 
                                         if (.not. teno) then
                                             dvd(2) = v_rs_ws_${XYZ}$ (j + 3, k, l, i) - v_rs_ws_${XYZ}$ (j + 2, k, l, i)
@@ -1279,8 +1279,8 @@ contains
                                             tau = abs(beta(3) - beta(0))  ! Equation 50
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
-                                                alpha(q) = d_cbL_${XYZ}$ (q, &
-                                                      & j)*(1._wp + (tau/beta(q))**wenoz_q)  ! wenoz_q = 2,3,4 for stability
+                                                ! wenoz_q = 2,3,4 for stability
+                                                alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q)
                                             end do
                                         else if (teno) then
                                             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
@@ -1353,8 +1353,8 @@ contains
                                         else if (wenoz) then
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
-                                                alpha(q) = d_cbR_${XYZ}$ (q, &
-                                                      & j)*(1._wp + (tau/beta(q))**wenoz_q)  ! wenoz_q = 2,3,4 for stability
+                                                ! wenoz_q = 2,3,4 for stability
+                                                alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q)
                                             end do
                                         else if (teno) then
                                             $:GPU_LOOP(parallelism='[seq]')