Skip to content

Commit 6b1d0de

Browse files
committed
feat: Intel GPU Max (Ponte Vecchio) OpenMP target offload support
Add end-to-end support for building and running MFC on Intel Data Center GPU Max (Ponte Vecchio) using ifx 2025.0+ with OpenMP target offload to SPIR-V/SPIR64. Verified on GT CRNCH RoboGator (dash4) with Intel GPU Max 1100. All 161 1D regression tests pass.

## Compiler and build system
- Recognize IntelLLVM compiler ID throughout CMakeLists.txt (was Intel)
- Add -fiopenmp -fopenmp-targets=spir64 compile/link flags for GPU builds
- Add -fp-model=precise to prevent ifx FP reassociation in SPIR-V kernels
- Add -fpp to global compile flags for Intel preprocessor compatibility
- Link MKL parallel, libmkl_sycl_dft, libsycl, libOpenCL for oneMKL FFT
- Strip SPIR-V from mkl_dfti_omp_offload.o via clang-offload-bundler to fix zeModuleDynamicLink Level Zero failures
- Add --intel-aot flag: AOT compilation via ocloc to native PVC ISA, eliminates ~30 min Level Zero JIT delay (test runs: 30 min -> 14 sec)
- Add IntelLLVM to no-FFTW-from-source list in dependencies/CMakeLists.txt
- Fix LAPACK PIE link error with ifx on Ubuntu 22.04

## GPU kernel fixes
- omp_macros.fpp: add Intel-specific OMP_PARALLEL_LOOP, END_OMP_PARALLEL_LOOP, OMP_ROUTINE, OMP_MKL_DISPATCH branches for SPIR-V codegen
- parallel_macros.fpp: add GPU_MKL_DISPATCH() macro for oneMKL dispatch
- shared_parallel_macros.fpp: add USING_INTEL Fypp variable; extend all #:if not MFC_CASE_OPTIMIZATION and USING_AMD guards to include USING_INTEL and bare #:if USING_AMD guards for dimension(sys_size) in m_cbc/m_compute_cbc
- m_fftw.fpp: oneMKL DFTI + !$omp dispatch GPU FFT path for Intel
- m_compute_levelset.fpp: split single if-else dispatch to fix multi-callee phi-node issue and inliner ICE; add -fno-inline workaround
- m_riemann_solvers.fpp, m_variables_conversion.fpp, m_bubbles_EE.fpp, m_weno.fpp, m_sim_helpers.fpp, m_pressure_relaxation.fpp, m_boundary_common, m_chemistry.fpp, m_phase_change.fpp, m_bubbles_EL.fpp, m_viscous.fpp, m_ibm.fpp, m_hyperelastic.fpp, m_acoustic_src.fpp, m_surface_tension.fpp, m_data_output.fpp, m_qbmm.fpp, m_compute_cbc.fpp, m_cbc.fpp, m_ib_patches.fpp: explicit array sizes in GPU_ROUTINE arguments (no assumed-shape in SPIR-V) and extend VLA guards to USING_INTEL for non-case-optimized GPU builds
- m_helper.fpp: Intel-specific workarounds for SPIR-V codegen

## Toolchain
- Add GT CRNCH RoboGator (crnch) module entry with Intel oneAPI 2025.1
- run.py: Intel GPU detection, set LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=256 and SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY=0 for ~16% speedup
- run/input.py: post-process pyrometheus m_thermochem.f90 for --gpu mp (replace C-macro GPU_ROUTINE with literal !$omp declare target)
- build.py, state.py: --intel-aot flag and ocloc device selection
- test.py: --binary mpirun support to bypass SLURM srun slot limits on CRNCH
- bootstrap/modules.sh: crnch module bootstrap
- templates/include/helpers.mako: Intel MPI I_MPI_FABRICS=shm hint
- modules: crnch entry (Intel oneAPI 2025.1, mpiifx, GPU Max 1100)

## Documentation
- docs/documentation/intel-gpu-max.md: full build, run, troubleshoot guide
1 parent 1139cc4 commit 6b1d0de

40 files changed

Lines changed: 1479 additions & 790 deletions

CMakeLists.txt

Lines changed: 150 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ option(MFC_DOCUMENTATION "Build documentation" OFF
3131
option(MFC_ALL "Build everything" OFF)
3232
option(MFC_SINGLE_PRECISION "Build single precision" OFF)
3333
option(MFC_MIXED_PRECISION "Build mixed precision" OFF)
34+
option(MFC_Intel_AOT "Build Intel GPU with AOT compilation (no JIT)" OFF)
35+
set(MFC_Intel_AOT_DEVICE "pvc" CACHE STRING "Intel GPU AOT target device (e.g. pvc, dg2)")
3436

3537
if (MFC_ALL)
3638
set(MFC_PRE_PROCESS ON FORCE)
@@ -249,6 +251,42 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
249251
elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
250252
add_compile_options(-g -Og -traceback -check bounds)
251253
endif()
254+
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
255+
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>
256+
$<$<COMPILE_LANGUAGE:Fortran>:-fpp>)
257+
258+
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
259+
add_compile_options(-g -Og -traceback -debug -check all)
260+
elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug")
261+
add_compile_options(-g -Og -traceback -check bounds)
262+
endif()
263+
264+
# mpiifx calls 'ifx' via eval. If ifx is not reachable through PATH, cmake's
# compiler capability tests (OpenMP detection, IPO check) fail with
# "ifx: not found". When -- and only when -- that is the case, locate ifx
# relative to mpiifx's oneAPI tree and prepend its directory to PATH, so the
# tests work without the user having to load modules before every build.
# Note: set(ENV{PATH} ...) affects this configure run only, not the build.
get_filename_component(_fc_name "${CMAKE_Fortran_COMPILER}" NAME)
if (_fc_name MATCHES "^mpi")
    # Respect an ifx the user already exposed (e.g. through a loaded module):
    # prepending a globbed directory unconditionally could shadow it with a
    # different compiler version. Search PATH only, and drop the cache entry
    # afterwards so the probe is repeated on every configure.
    find_program(_ifx_on_path NAMES ifx PATHS ENV PATH NO_DEFAULT_PATH)
    if (NOT _ifx_on_path)
        # Climb four directory levels from the wrapper to the oneAPI root
        # (assumes a <oneapi>/mpi/<version>/bin/mpiifx layout -- confirm for
        # non-standard installs).
        get_filename_component(_mpi_bin  "${CMAKE_Fortran_COMPILER}" DIRECTORY)
        get_filename_component(_mpi_ver  "${_mpi_bin}"               DIRECTORY)
        get_filename_component(_mpi_root "${_mpi_ver}"               DIRECTORY)
        get_filename_component(_oneapi   "${_mpi_root}"              DIRECTORY)
        file(GLOB _ifx_bins "${_oneapi}/compiler/*/bin/ifx")
        if (_ifx_bins)
            # Several compiler versions may be installed side by side. Sort in
            # natural descending order so a "latest" entry or the highest
            # version number wins, rather than the lexicographically first hit.
            list(SORT _ifx_bins COMPARE NATURAL ORDER DESCENDING)
            list(GET _ifx_bins 0 _ifx_bin)
            get_filename_component(_ifx_dir "${_ifx_bin}" DIRECTORY)
            set(ENV{PATH} "${_ifx_dir}:$ENV{PATH}")
            message(STATUS "MFC: mpiifx detected — added ifx to PATH: ${_ifx_dir}")
        endif()
    endif()
    unset(_ifx_on_path CACHE)
    unset(_ifx_on_path)
    unset(_mpi_bin)
    unset(_mpi_ver)
    unset(_mpi_root)
    unset(_oneapi)
    unset(_ifx_bins)
    unset(_ifx_bin)
    unset(_ifx_dir)
endif()
unset(_fc_name)
252290
elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_Fortran_COMPILER_ID STREQUAL "PGI"))
253291
add_compile_options(
254292
$<$<COMPILE_LANGUAGE:Fortran>:-Mfreeform>
@@ -560,8 +598,18 @@ exit 0
560598
)
561599

562600
if (MFC_MPI AND ARGS_MPI)
563-
find_package(MPI COMPONENTS Fortran REQUIRED)
564-
601+
# MPI discovery. If the Fortran compiler is itself an Intel MPI wrapper (its
# file name starts with "mpi", e.g. mpiifx), cmake's FindMPI is bypassed: it
# probes the wrapper with -showme:compile, which hangs on Intel MPI, and the
# wrapper already injects all MPI include and link flags. In that case an
# empty imported MPI::MPI_Fortran target is declared (once) so the target
# name still exists. Every other compiler goes through find_package(MPI).
get_filename_component(_fc_basename "${CMAKE_Fortran_COMPILER}" NAME)
if (NOT (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM" AND _fc_basename MATCHES "^mpi"))
    find_package(MPI COMPONENTS Fortran REQUIRED)
elseif (NOT TARGET MPI::MPI_Fortran)
    add_library(MPI::MPI_Fortran INTERFACE IMPORTED)
endif()
612+
565613
target_compile_definitions(${a_target} PRIVATE MFC_MPI)
566614
if(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang" AND
567615
DEFINED ENV{CRAY_MPICH_INC} AND NOT "$ENV{CRAY_MPICH_INC}" STREQUAL "")
@@ -595,6 +643,71 @@ exit 0
595643
HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
596644
target_link_libraries(${a_target} PRIVATE ${HIPFFT_LIB})
597645
endif()
646+
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
647+
# Intel GPU: oneMKL DFTI with !$omp dispatch for GPU FFT.
648+
# Requires MKLROOT to be set (via oneAPI module or env).
649+
if (NOT DEFINED ENV{MKLROOT})
650+
message(FATAL_ERROR "MKLROOT is not set. Load oneAPI MKL module before building.")
651+
endif()
652+
# Compile mkl_dfti_omp_offload.f90 in isolation with minimal flags.
653+
# The file uses !$omp declare variant with need_device_addr (OpenMP 5.2)
654+
# which requires the global -free -fpp flags to be absent so the
655+
# compiler parses it in standard fixed/free detection mode only.
656+
set(_mkl_omp_src "$ENV{MKLROOT}/include/mkl_dfti_omp_offload.f90")
657+
if (NOT EXISTS "${_mkl_omp_src}")
658+
message(FATAL_ERROR "mkl_dfti_omp_offload.f90 not found in $ENV{MKLROOT}/include")
659+
endif()
660+
set(_mkl_omp_mod_dir "${CMAKE_CURRENT_BINARY_DIR}/mkl_omp_mods")
661+
set(_mkl_omp_obj "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.o")
662+
set(_mkl_omp_obj_host "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload_host.o")
663+
file(MAKE_DIRECTORY "${_mkl_omp_mod_dir}")
664+
# Find clang-offload-bundler (in ifx's bin/compiler/ subdir).
665+
# CMAKE_Fortran_COMPILER may be an MPI wrapper (mpiifx); resolve the
666+
# underlying ifx from PATH so the HINTS point to the right directory.
667+
find_program(_IFX_REAL ifx REQUIRED)
668+
cmake_path(GET _IFX_REAL PARENT_PATH _ifx_bin)
669+
find_program(CLANG_OFFLOAD_BUNDLER
670+
NAMES clang-offload-bundler
671+
HINTS "${_ifx_bin}/compiler" "${_ifx_bin}"
672+
REQUIRED)
673+
# Rule 1: compile Intel's mkl_dfti_omp_offload.f90 with OpenMP SPIR64 offload
# enabled. The command runs inside ${_mkl_omp_mod_dir}; the .mod file is
# declared as an output in that directory next to the object.
# Each argument is quoted as a whole ("-I<dir>", not -I"<dir>") and VERBATIM
# is set, so paths are escaped correctly by every generator instead of
# relying on a shell to strip embedded quote characters.
add_custom_command(
    OUTPUT  "${_mkl_omp_obj}"
            "${_mkl_omp_mod_dir}/mkl_dfti_omp_offload.mod"
    COMMAND "${CMAKE_Fortran_COMPILER}"
            -fiopenmp -fopenmp-targets=spir64
            -c "-I$ENV{MKLROOT}/include"
            "${_mkl_omp_src}"
            -o "${_mkl_omp_obj}"
    WORKING_DIRECTORY "${_mkl_omp_mod_dir}"
    DEPENDS "${_mkl_omp_src}"
    COMMENT "Compiling MKL OMP offload Fortran module (mkl_dfti_omp_offload)"
    VERBATIM
)
# Rule 2: strip the SPIR-V device bundle so the linked object has only host code.
# The SPIR-V contains Import declarations for MKL SYCL DFT functions that
# the OpenMP Level Zero plugin cannot resolve, causing zeModuleDynamicLink
# failure. With host-only code, !$omp dispatch falls back to CPU for DFT.
add_custom_command(
    OUTPUT  "${_mkl_omp_obj_host}"
    COMMAND "${CLANG_OFFLOAD_BUNDLER}"
            --unbundle --type=o
            --targets=host-x86_64-unknown-linux-gnu
            "--input=${_mkl_omp_obj}"
            "--output=${_mkl_omp_obj_host}"
    DEPENDS "${_mkl_omp_obj}"
    COMMENT "Stripping SPIR-V from MKL DFT object (host-only, fixes Level Zero link)"
    VERBATIM
)
699+
add_custom_target(mkl_omp_offload_mod_${a_target}
700+
DEPENDS "${_mkl_omp_obj_host}")
701+
add_dependencies(${a_target} mkl_omp_offload_mod_${a_target})
702+
target_include_directories(${a_target} PRIVATE
703+
"$ENV{MKLROOT}/include" "${_mkl_omp_mod_dir}")
704+
target_link_libraries(${a_target} PRIVATE "${_mkl_omp_obj_host}")
705+
# Link MKL threading + core + SYCL DFT backend
706+
target_link_options(${a_target} PRIVATE -qmkl=parallel)
707+
find_library(MKL_SYCL_DFT mkl_sycl_dft HINTS "$ENV{MKLROOT}/lib" REQUIRED)
708+
find_library(SYCL_LIB sycl HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
709+
find_library(OPENCL_LIB OpenCL HINTS ENV LIBRARY_PATH "${_ifx_bin}/../lib" REQUIRED)
710+
target_link_libraries(${a_target} PRIVATE ${MKL_SYCL_DFT} ${SYCL_LIB} ${OPENCL_LIB})
598711
else()
599712
find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
600713
target_link_libraries(${a_target} PRIVATE hipfort::hipfft)
@@ -636,9 +749,23 @@ exit 0
636749
target_compile_options(${a_target} PRIVATE "-mp=gpu" "-Minfo=mp")
637750
target_link_options(${a_target} PRIVATE "-mp=gpu")
638751
set_target_properties(${a_target} PROPERTIES Fortran_FLAGS "-mp=gpu -gpu=ccall")
639-
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
640-
target_compile_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
641-
target_link_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64)
752+
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
753+
# Intel GPU: OpenMP target offload to SPIR64 (Xe-HPC / Ponte Vecchio).
754+
# GPU FFT uses oneMKL DFTI via the OpenMP dispatch construct.
755+
# MFC_Intel_AOT=ON: compile native GPU ISA at build time (no runtime JIT).
756+
if(MFC_Intel_AOT)
    # AOT: compile native GPU ISA at build time with ocloc via -Xopenmp-target-backend.
    # ocloc uses single-dash flags (-device pvc, not --device pvc).
    # -Xopenmp-target-backend goes only on link because ocloc runs at link time;
    # putting it on compile options too causes ocloc to see -device twice.
    #
    # An empty device name would expand to a bare "-device " backend option
    # and only fail later, inside ocloc, at link time. Reject it at configure
    # time with an actionable message instead.
    if("${MFC_Intel_AOT_DEVICE}" STREQUAL "")
        message(FATAL_ERROR
            "MFC_Intel_AOT is ON but MFC_Intel_AOT_DEVICE is empty. "
            "Set it to an ocloc device name (e.g. pvc, dg2).")
    endif()
    target_compile_options(${a_target} PRIVATE
        -fiopenmp -fopenmp-targets=spir64_gen -fp-model=precise)
    # SHELL: keeps the quoted "-device <name>" pair together as one backend
    # argument and exempts the group from option de-duplication.
    target_link_options(${a_target} PRIVATE
        "SHELL:-fiopenmp -fopenmp-targets=spir64_gen -Xopenmp-target-backend \"-device ${MFC_Intel_AOT_DEVICE}\"")
else()
    # JIT: ship generic SPIR64 device code.
    target_compile_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64 -fp-model=precise)
    target_link_options(${a_target} PRIVATE -fiopenmp -fopenmp-targets=spir64)
endif()
642769
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
643770
target_compile_options(${a_target} PRIVATE -fopenmp)
644771
target_link_options(${a_target} PRIVATE -fopenmp)
@@ -753,6 +880,13 @@ if (MFC_PRE_PROCESS)
753880
# via cross-file inlining. Safe to disable IPA for the whole target
754881
# (CPU-only, no GPU device-call requirements). See PR #1286.
755882
target_compile_options(pre_process PRIVATE -Oipa0)
883+
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
884+
# ifx stack-allocates compiler-generated temporaries by default; large
885+
# ones (e.g. 8 MB MPI I/O contiguous copies) overflow the stack when the
886+
# Level Zero / OpenMP offload runtime has already consumed stack space.
887+
# Note: -heap-arrays triggers an ICE in ifx's SPIR-V backend for simulation
888+
# (GPU device code), so it is applied only to CPU-only targets here.
889+
target_compile_options(pre_process PRIVATE $<$<COMPILE_LANGUAGE:Fortran>:-heap-arrays>)
756890
endif()
757891
endif()
758892

@@ -782,6 +916,17 @@ if (MFC_SIMULATION)
782916
target_compile_options(simulation PRIVATE -Oipa0)
783917
endif()
784918
endif()
919+
# Workaround for an ifx SPIR64 internal compiler error: at O1+ the LLVM
# inliner pulls the !$omp declare target (seq) geometry routines into the
# target teams loop kernels and emits SPIR-V IR that crashes llvm-spirv.
# Compiling the level-set source with -fno-inline keeps those routines as
# real device-side calls. The cost is small because each GPU loop calls
# exactly one geometry routine (split-loop pattern in m_compute_levelset.fpp).
# See PR intel-gpu.
if (MFC_OpenMP AND CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
    set(_mfc_levelset_src
        "${CMAKE_BINARY_DIR}/fypp/simulation/m_compute_levelset.fpp.f90")
    set_source_files_properties("${_mfc_levelset_src}"
        PROPERTIES COMPILE_OPTIONS "-fno-inline")
    unset(_mfc_levelset_src)
endif()
785930
endif()
786931

787932
if (MFC_POST_PROCESS)

0 commit comments

Comments
 (0)