MFlowCode
diff --git a/‎.github/scripts/prebuild-case-optimization.sh‎
Lines changed: 75 additions & 1 deletion b/‎.github/scripts/prebuild-case-optimization.sh‎
Lines changed: 75 additions & 1 deletion
diff --git a/‎.github/scripts/run_parallel_benchmarks.sh‎
Lines changed: 6 additions & 0 deletions b/‎.github/scripts/run_parallel_benchmarks.sh‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 27 additions & 3 deletions b/‎.github/workflows/test.yml‎
Lines changed: 27 additions & 3 deletions
diff --git a/‎benchmarks/igr/case.py‎
Lines changed: 4 additions & 3 deletions b/‎benchmarks/igr/case.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/common/include/acc_macros.fpp‎
Lines changed: 4 additions & 4 deletions b/‎src/common/include/acc_macros.fpp‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/common/include/omp_macros.fpp‎
Lines changed: 2 additions & 2 deletions b/‎src/common/include/omp_macros.fpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/common/include/shared_parallel_macros.fpp‎
Lines changed: 25 additions & 0 deletions b/‎src/common/include/shared_parallel_macros.fpp‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎src/common/m_global_parameters_common.fpp‎
Lines changed: 1 addition & 0 deletions b/‎src/common/m_global_parameters_common.fpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/pre_process/m_mpi_proxy.fpp‎
Lines changed: 2 additions & 2 deletions b/‎src/pre_process/m_mpi_proxy.fpp‎
Lines changed: 2 additions & 2 deletions
@@ -22,12 +22,44 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
+# Optional sharding (format "i/N", e.g. "1/2"), set by submit-slurm-job.sh's
+# [shard] argument via $job_shard: shard i builds every Nth case of the sorted
+# case list. Unset = build all cases in one job (default; other clusters).
+shard="${job_shard:-}"
+if [ -n "$shard" ]; then
+    # Validate full shape: must be exactly "digits/digits" — one slash with
+    # non-empty, purely numeric, non-leading-zero parts on both sides.
+    # Split at the first slash (idx) and at the last slash (count), then check
+    # each part so "1/" "/2" "//" "a/b" are rejected before any arithmetic.
+    shard_idx="${shard%%/*}"
+    shard_count="${shard##*/}"
+    # Each part must be non-empty, all digits, and must not start with 0.
+    case "$shard_idx" in
+        ''|*[!0-9]*|0*) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
+    esac
+    case "$shard_count" in
+        ''|*[!0-9]*|0*) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
+    esac
+    # Reassemble and compare: this is the only check that rejects "12" (no
+    # slash: idx and count both equal the whole string) and "1/2/3" (extra
+    # slash: idx=1, count=3 pass the digit checks, but "1/3" != "1/2/3").
+    if [ "$shard" != "$shard_idx/$shard_count" ]; then
+        echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1
+    fi
+    if [ "$shard_idx" -lt 1 ] || [ "$shard_idx" -gt "$shard_count" ]; then
+        echo "ERROR: bad shard '$shard' (expected i/N with 1 <= i <= N)"; exit 1
+    fi
+fi
+
 # Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
 # build.sh first, so we must preserve them and only clean MFC target staging.
+# Sharded jobs share one workspace and run concurrently, so the workflow
+# cleans once before submitting them — cleaning here would wipe a sibling
+# shard's in-progress build.
 if [ "$cluster" = "phoenix" ]; then
     source .github/scripts/clean-build.sh
     clean_build
-else
+elif [ -z "$shard" ]; then
     find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
     find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
 fi
@@ -40,7 +72,49 @@ case "$job_interface" in
     *)   echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
 esac
 
+# Case-optimized simulation builds land in per-case hash-named staging dirs,
+# but syscheck/pre_process/post_process hash identically across these cases.
+# Concurrent shards must not build those shared staging dirs simultaneously:
+# shard 1 builds them first and drops a done marker; other shards wait for it,
+# after which their builds no-op in the shared dirs.
+if [ -n "$shard" ] && [ "$shard_count" -gt 1 ]; then
+    shared_marker_done="build/.prebuild-shared-targets-done"
+    shared_marker_failed="build/.prebuild-shared-targets-failed"
+    set -- benchmarks/*/case.py
+    first_case="$1"
+    if [ "$shard_idx" -eq 1 ]; then
+        # Remove both markers at the start so reruns and manual invocations
+        # never observe stale state from a prior run.
+        rm -f "$shared_marker_done" "$shared_marker_failed"
+        echo "=== Shard 1/$shard_count: building shared targets ==="
+        # Write the failure marker if the build exits non-zero so other shards
+        # can detect the failure immediately instead of waiting 90 minutes.
+        trap 'touch "$shared_marker_failed"' ERR
+        ./mfc.sh build -i "$first_case" -t syscheck pre_process post_process --case-optimization $gpu_opts -j 8
+        trap - ERR
+        touch "$shared_marker_done"
+    else
+        echo "=== Shard $shard_idx/$shard_count: waiting for shard 1 to build shared targets ==="
+        waited=0
+        until [ -f "$shared_marker_done" ]; do
+            if [ -f "$shared_marker_failed" ]; then
+                echo "ERROR: shard 1 failed to build shared targets; see shard 1 log"; exit 1
+            fi
+            if [ "$waited" -ge 5400 ]; then
+                echo "ERROR: timed out waiting for $shared_marker_done"; exit 1
+            fi
+            sleep 30
+            waited=$((waited + 30))
+        done
+    fi
+fi
+
+idx=0
 for case in benchmarks/*/case.py; do
+    idx=$((idx + 1))
+    if [ -n "$shard" ] && [ $(((idx - 1) % shard_count)) -ne $((shard_idx - 1)) ]; then
+        continue
+    fi
     echo "=== Pre-building: $case ==="
     ./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
 done
@@ -62,6 +62,12 @@ bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out
 if [ "$pr_exit" -ne 0 ]; then
     echo "PR job exited with code: $pr_exit"
     tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo "  Could not read PR log"
+    # The PR benchmark run genuinely failed (cases crashed/hung/SIGTERM'd, not a
+    # monitor false-positive -- run_monitored_slurm_job.sh re-checks sacct). Fail
+    # the job instead of falling through to the YAML-exists check, which would let
+    # a broken PR pass green as long as a partial YAML was written. Scoped to PR
+    # only: a master/baseline infra flake stays a warning and does not red-cross.
+    exit 1
 else
     echo "PR job completed successfully"
 fi
 
@@ -402,6 +402,13 @@ jobs:
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'cpu'
             interface: 'none'
+            shard: '1/2'
+          - runner:       'frontier'
+            cluster:      'frontier_amd'
+            cluster_name: 'Oak Ridge | Frontier (AMD)'
+            device: 'cpu'
+            interface: 'none'
+            shard: '2/2'
     runs-on:
       group:  phoenix
       labels: ${{ matrix.runner }}
@@ -420,7 +427,7 @@ jobs:
 
       - name: Fetch Dependencies
         if:   matrix.cluster != 'phoenix'
-        timeout-minutes: 60
+        timeout-minutes: 120
         run:  bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Build
@@ -523,7 +530,22 @@ jobs:
 
       - name: Pre-Build (SLURM)
         if:   matrix.cluster == 'frontier_amd'
-        run:  bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }}
+        # AMD flang is slow enough that one serial pre-build job exceeds its
+        # walltime, so split the case list across two concurrent SLURM jobs.
+        # The shards share this workspace and skip their in-job staging clean,
+        # so clean once here on the login node before submitting. Both shared-
+        # target markers are cleared: a stale failure marker left by an earlier
+        # run would otherwise abort a waiting shard that polls before shard 1
+        # has had a chance to remove it.
+        run:  |
+          find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
+          find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
+          rm -f build/.prebuild-shared-targets-done build/.prebuild-shared-targets-failed
+          bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 1/2 &
+          pid1=$!
+          bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 2/2 &
+          pid2=$!
+          rc=0
+          wait "$pid1" || rc=1
+          wait "$pid2" || rc=1
+          exit $rc
 
       - name: Build & Run Case-Optimization Tests
         if:   matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
@@ -546,6 +568,8 @@ jobs:
         if:   always()
         run:  |
           for f in prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out \
+                   prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-1-of-2.out \
+                   prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-2-of-2.out \
                    run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out; do
             [ -f "$f" ] && echo "=== $f ===" && cat "$f"
           done
@@ -556,5 +580,5 @@ jobs:
         with:
           name: case-opt-${{ strategy.job-index }}-${{ matrix.cluster }}-${{ matrix.interface }}
           path: |
-            prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
+            prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}*.out
             run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
@@ -102,10 +102,11 @@
             "patch_icpp(1)%length_x": 2 * math.pi * L,
             "patch_icpp(1)%length_y": 2 * math.pi * L,
             "patch_icpp(1)%length_z": 2 * math.pi * L,
-            "patch_icpp(1)%vel(1)": f"{V0}*sin(x/{L})*cos(y/{L})*sin(z/{L})",
-            "patch_icpp(1)%vel(2)": f"-{V0}*cos(x/{L})*sin(y/{L})*sin(z/{L})",
+            "patch_icpp(1)%vel(1)": 0.0,
+            "patch_icpp(1)%vel(2)": 0.0,
             "patch_icpp(1)%vel(3)": 0,
-            "patch_icpp(1)%pres": f"{P0} + ({rho0}*{V0}**2/16)*(cos(2*x/{L}) + cos(2*y/{L}))*(cos(2*z/{L}) + 2)",
+            "patch_icpp(1)%pres": 0.0,
+            "patch_icpp(1)%hcid": 380,
             "patch_icpp(1)%alpha_rho(1)": 1,
             "patch_icpp(1)%alpha(1)": 1,
             # Fluids Physical Parameters
 
@@ -121,8 +121,8 @@
         & copyout_val.strip('\n') + create_val.strip('\n') + &
         & no_create_val.strip('\n') + present_val.strip('\n') + &
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
-    #:set acc_directive = '!$acc parallel ' + &
-        & acc_clause_val + extraAccArgs_val.strip('\n')
+    #:set acc_directive = FOLD_DIRECTIVE('!$acc parallel ' + &
+        & acc_clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n')
     #:set end_acc_directive = '!$acc end parallel'
     $:acc_directive
     $:code
@@ -153,8 +153,8 @@
         & copyout_val.strip('\n') + create_val.strip('\n') + &
         & no_create_val.strip('\n') + present_val.strip('\n') + &
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
-    #:set acc_directive = '!$acc parallel loop ' + &
-        & clause_val + extraAccArgs_val.strip('\n')
+    #:set acc_directive = FOLD_DIRECTIVE('!$acc parallel loop ' + &
+        & clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n')
     $:acc_directive
 #:enddef
 
 
@@ -141,7 +141,7 @@
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
 
     #:set omp_clause_val = omp_clause_val.strip('\n')
-    #:set omp_directive = '!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n')
+    #:set omp_directive = FOLD_DIRECTIVE('!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n')
 
     #:set omp_end_directive = '!$omp end target teams'
     $:omp_directive
@@ -186,7 +186,7 @@
         #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
     #:endif
 
-    #:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n')
+    #:set omp_directive = FOLD_DIRECTIVE(omp_start_directive + clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n')
     $:omp_directive
 #:enddef
 
 
@@ -117,4 +117,29 @@
     #:endif
     $:extraArgs_val
 #:enddef
+
+#:def FOLD_DIRECTIVE(directive, sentinel, width=200)
+    #! Fold a long GPU directive across free-form continuation lines so it stays
+    #! under nvfortran's ~1000-char source-line limit. Tokens are clause(args)
+    #! groups (matched up to the first ')') or whitespace-free words, re-joined
+    #! by single spaces; the sentinel plus '&' (e.g. '!$acc&') opens each
+    #! continuation line, which fypp's generic folder omits. A break is made
+    #! before any token that would push the current line past `width`.
+    #! NOTE(review): args with nested parentheses split at the first ')'.
+    #:set _toks = re.findall(r'\w+\([^)]*\)|\S+', directive)
+    #:set _lines = []
+    #:set _cur = ''
+    #:for _t in _toks
+        #:if _cur == ''
+            #:set _cur = _t
+        #:elif len(_cur) + 1 + len(_t) > width
+            #:set _lines = _lines + [_cur + ' &']
+            #:set _cur = sentinel + '& ' + _t
+        #:else
+            #:set _cur = _cur + ' ' + _t
+        #:endif
+    #:endfor
+    #:set _lines = _lines + [_cur]
+    $:'\n'.join(_lines)
+#:enddef
 ! New line at end of file is required for FYPP
@@ -419,6 +419,7 @@ contains
             muscl_order = dflt_int
             num_fluids = dflt_int
             igr = .false.
+            igr_order = dflt_int
             mhd = .false.
             relativity = .false.
         #:endif
 
@@ -80,7 +80,7 @@ contains
 
         ! manual: patch_icpp (complex members: alter_patch, sph_har_coeff, size() arrays)
         do i = 1, num_patches_max
-            #:for VAR in [ 'geometry', 'smooth_patch_id']
+            #:for VAR in [ 'geometry', 'smooth_patch_id', 'hcid']
                 call MPI_BCAST(patch_icpp(i)%${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
             #:endfor
 
@@ -91,7 +91,7 @@ contains
             #:for VAR in [ 'x_centroid', 'y_centroid', 'z_centroid',           &
                 & 'length_x', 'length_y', 'length_z', 'radius', 'epsilon',     &
                 & 'beta', 'smooth_coeff', 'rho', 'p0', 'm0', 'r0', 'v0',       &
-                & 'pres', 'gamma', 'pi_inf', 'hcid', 'cv', 'qv', 'qvp',        &
+                & 'pres', 'gamma', 'pi_inf', 'cv', 'qv', 'qvp',        &
                 & 'cf_val', 'Bx', 'By', 'Bz']
                 call MPI_BCAST(patch_icpp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
             #:endfor