diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b374a1a3..dd14a2eba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,9 @@ include(TableGen) include(AddLLVM) include(AddMLIR) include(HandleLLVMOptions) -string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}") +if(NOT WIN32) + string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}") +endif() include(MLIRDetectPythonEnv) mlir_configure_python_dev_packages() @@ -45,6 +47,22 @@ set(MLIR_PYTHON_PACKAGE_PREFIX "_mlir" CACHE STRING "" FORCE) set(MLIR_BINDINGS_PYTHON_INSTALL_PREFIX "python_packages/flydsl/${MLIR_PYTHON_PACKAGE_PREFIX}" CACHE STRING "" FORCE) +# On Windows, MLIR's default SelfOwningTypeID scheme does not work across DLL +# boundaries: the TypeID's `static SelfOwningTypeID id` member cannot be auto-imported +# from a DLL without explicit __declspec(dllimport). So FlyDSL's Fly-defined types +# (defined in obj.MLIRFlyDialect embedded in FlyPythonCAPI.dll) are unreachable from +# .pyd extensions that reference them, causing link errors or runtime TypeID mismatches. +# +# Switch our compilation to string-based FallbackTypeIDs globally. Our own types all +# route through the process-shared `registerImplicitTypeID` exported from +# FlyPythonCAPI.dll, giving consistent TypeIDs across every DLL. Upstream MLIR's +# prebuilt static libs (MLIRIR.lib, etc.) were compiled with SelfOwning TypeIDs, +# but those are now linked into FlyPythonCAPI.dll as a single copy — internal usage +# within that DLL still agrees with itself, so no mismatch there. 
+if(WIN32) + add_compile_definitions(MLIR_USE_FALLBACK_TYPE_IDS=1) +endif() + add_subdirectory(include/flydsl) add_subdirectory(lib) add_subdirectory(tools) diff --git a/docs/windows_build_guide.md b/docs/windows_build_guide.md new file mode 100644 index 000000000..ca1866da0 --- /dev/null +++ b/docs/windows_build_guide.md @@ -0,0 +1,144 @@ +# FlyDSL on Windows (experimental) + +This guide covers building and running FlyDSL natively on Windows. The Windows +port uses the **TheRock ROCm SDK** (installed as a Python package into a venv) +rather than a system-wide ROCm install. + +> **Status**: experimental. 301 / 310 unit tests pass (97%). See +> [Known limitations](#known-limitations) for what doesn't pass yet. + +## Prerequisites + +1. **Windows 10/11, x64** with an AMD GPU supported by ROCm. Tested on RDNA4 (`gfx1200`, Radeon RX 9060 XT). +2. **Visual Studio 2022 Build Tools** (or full VS 2022) with the *Desktop + development with C++* workload — provides the MSVC toolchain. +3. **LLVM/Clang** in PATH — `clang-cl`, `llvm-ar`, `llvm-ml`, `lld-link`. The + official LLVM Windows installer puts these on PATH. +4. **Ninja** in PATH (`choco install ninja` or bundled with VS). +5. **Git** in PATH. +6. **Python 3.12** (the build has only been exercised against 3.12). +7. **PowerShell 5.1+** (ships with Windows). + +All commands below assume you're running in a **Developer PowerShell / x64 +Native Tools Command Prompt**, or have run `vcvarsall.bat amd64` first so that +`cl.exe`/`link.exe` are on PATH. + +## 1. Create a venv and install TheRock ROCm SDK + +TheRock ships ROCm as a Python wheel that installs into your venv. FlyDSL on +Windows currently expects this layout (it finds `ld.lld.exe`, OCML bitcode, +and `amdhip64_7.dll` relative to the SDK root). + +```powershell +python -m venv C:\path\to\flydsl-venv +C:\path\to\flydsl-venv\Scripts\Activate.ps1 + +# Install TheRock ROCm SDK for your GPU family and initialize. 
For RDNA4 / gfx1200: +pip install --pre --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all torch torchaudio torchvision rocm[libraries,devel] +rocm-sdk init +``` + +After install, locate the SDK development root — typically: +`...\flydsl-venv\Lib\site-packages\_rocm_sdk_devel` + +## 2. Set environment variables + +```powershell +$env:ROCM_PATH = "C:\path\to\flydsl-venv\Lib\site-packages\_rocm_sdk_devel" +# Optional: force a specific GPU arch when torch-rocm auto-detect isn't right. +# $env:FLYDSL_GPU_ARCH = "gfx1200" +``` + +## 3. Build LLVM/MLIR + +This pins the commit from `thirdparty/llvm-hash.txt`, builds with Ninja + +`clang-cl`, and installs into `..\llvm-project\mlir_install\`. + +```powershell +# From the FlyDSL repo root: +.\scripts\build_llvm.ps1 -Arch gfx1200 # substitute your GPU arch +# or rely on FLYDSL_GPU_ARCH env var; default is gfx942. +``` + +Notes: +- The script passes `/DMLIR_USE_FALLBACK_TYPE_IDS=1` globally — required to + make MLIR's TypeIDs work across multiple DLLs on Windows. +- Build takes 30–60 min on a typical workstation. Use `-Jobs N` to cap + parallelism (default is `NUMBER_OF_PROCESSORS / 2`). +- Output: `..\llvm-project\mlir_install\` — keep this around. + +## 4. Build FlyDSL + +```powershell +$env:MLIR_PATH = "C:\llvm-project\mlir_install" # or wherever step 3 installed +.\scripts\build.ps1 +``` + +This produces `build-fly\python_packages\flydsl\` with the compiled extension +modules, runtime DLLs, and python sources. + +## 5. Install FlyDSL into the venv + +```powershell +pip install -e . +``` + +This runs `setup.py` which creates a directory junction from +`python\flydsl\_mlir` → `build-fly\python_packages\flydsl\_mlir` (junction +instead of symlink so no admin / Developer Mode required). + +## 6. Run tests + +```powershell +$env:PYTHONPATH = "$PWD\build-fly\python_packages;$PWD" +python -m pytest tests\unit\ -q +``` + +Expected: ~301 passed / ~4 failed / 5 skipped. See +[Known limitations](#known-limitations). 
+ +## How the Windows-specific bits work + +| Component | Linux behavior | Windows behavior | +|---|---|---| +| TypeID | `SelfOwningTypeID` (pointer identity across `.so`) | `MLIR_USE_FALLBACK_TYPE_IDS=1` — string-based, works across DLLs | +| Symbol export | `-fvisibility=hidden` + version script | `WINDOWS_EXPORT_ALL_SYMBOLS` on `FlyPythonCAPI.dll`, with `obj.MLIRFlyDialect` / `obj.MLIRFlyROCDLDialect` added as direct sources and upstream `MLIRIR.lib` / `MLIRSupport.lib` extracted via `llvm-ar x` so auto-`.def` generation sees them | +| lld for ROCDL | `<toolkit>/llvm/bin/ld.lld` — matches `/opt/rocm` layout | Staging junction at `%LOCALAPPDATA%\flydsl\rocm_toolkit\` unifies TheRock's `lib/llvm/bin/ld.lld.exe` + `lib/llvm/amdgcn/bitcode/` into the layout MLIR expects | +| Runtime DLL search | `RPATH=$ORIGIN` | `os.add_dll_directory` + ctypes pre-load of `_mlir_libs\*.dll` before JIT engine init (LLVM's `LoadLibraryPermanently` doesn't search DLL-local dirs) | +| GPU arch detect | `rocm_agent_enumerator` | Falls back to `torch.cuda.get_device_properties(0).gcnArchName` since TheRock doesn't ship the enumerator | +| `_mlir` package link | Symlink | Directory junction (no admin needed) | + +## Known limitations + +- **Multi-stream correctness**: 2 `test_multi_stream_launch` tests fail + (`test_two_streams_independent`, `test_diamond_pipeline_with_event_sync`). + Single-stream launches work correctly. +- **Disk cache test fragility**: `test_fp_math_reaches_pipeline` passes solo + but fails in-suite because cached compilation artifacts bypass the + monkey-patched hook. Not Windows-specific. Run with `FLYDSL_RUNTIME_ENABLE_CACHE=0`. +- **Torch profiler test**: `test_cache_disabled_run_perftest_does_not_crash` + hits a `DataFrame.host_time_sum` attribute error inside torch.profiler — + a version-compatibility problem, not a FlyDSL issue. +- **No CI coverage** yet — every build is verified manually. 
+- **Only gfx1200 exercised** on Windows; other arches should work + if your TheRock SDK + GPU combination is supported. + +## Troubleshooting + +- **`MLIR_FOUND=FALSE` or `LLVMNVPTXCodeGen` missing at configure time**: + the ROCm SDK ships its own `LLVMConfig.cmake`. Ensure `-DLLVM_DIR=...` is + passed explicitly (the scripts do this). Don't add `_rocm_sdk_devel` to + `CMAKE_PREFIX_PATH`. +- **`_ITERATOR_DEBUG_LEVEL` link mismatch**: you're building FlyDSL as Debug + while MLIR was built Release. Keep both at Release (`build.ps1` passes + `-DCMAKE_BUILD_TYPE=Release`). +- **`ModuleNotFoundError: flydsl._mlir`**: the editable install junction + didn't get created. `cd` into the repo and run: + `New-Item -ItemType Junction -Path python\flydsl\_mlir -Target build-fly\python_packages\flydsl\_mlir`. +- **`hipErrorNoBinaryForGpu`**: your kernel was compiled for the wrong arch. + Set `FLYDSL_GPU_ARCH` to the value printed by + `python -c "import torch; print(torch.cuda.get_device_properties(0).gcnArchName)"`. +- **`rocm amdgcn bitcode path ... does not exist`** during compilation: + the toolkit staging junction failed. Check + `%LOCALAPPDATA%\flydsl\rocm_toolkit\amdgcn\bitcode\` and that `ROCM_PATH` + points at TheRock's `_rocm_sdk_devel`. 
diff --git a/kernels/custom_all_reduce.py b/kernels/custom_all_reduce.py index 5f6b76314..21b1a78ed 100644 --- a/kernels/custom_all_reduce.py +++ b/kernels/custom_all_reduce.py @@ -130,8 +130,10 @@ def _get_gpu_arch(cls) -> str: pass if not arch: try: + import shutil import subprocess - r = subprocess.run(["rocminfo"], capture_output=True, text=True, timeout=10) + rocminfo = shutil.which("rocminfo") or "rocminfo" + r = subprocess.run([rocminfo], capture_output=True, text=True, timeout=10) for line in r.stdout.splitlines(): if "Name:" in line and "gfx" in line.lower(): arch = line.split(":")[-1].strip() @@ -147,7 +149,13 @@ def _load_hip(cls): if cls._hip is not None: return cls._hip import ctypes - for name in ("libamdhip64.so", "libamdhip64.so.6", "libamdhip64.so.5"): + import sys + + if sys.platform == "win32": + hip_candidates = ("amdhip64.dll", "amdhip64_7.dll", "amdhip64_6.dll") + else: + hip_candidates = ("libamdhip64.so", "libamdhip64.so.6", "libamdhip64.so.5") + for name in hip_candidates: try: cls._hip = ctypes.CDLL(name) break diff --git a/lib/Bindings/Python/BindingUtils.h b/lib/Bindings/Python/BindingUtils.h index 305875ace..c1bdf37dd 100644 --- a/lib/Bindings/Python/BindingUtils.h +++ b/lib/Bindings/Python/BindingUtils.h @@ -4,9 +4,11 @@ #ifndef FLYDSL_BINDINGS_PYTHON_BINDINGUTILS_H #define FLYDSL_BINDINGS_PYTHON_BINDINGUTILS_H +// Nanobind.h must come before Interop.h: nanobind includes Python.h which +// defines PyObject — Interop.h uses PyObject but doesn't include Python.h. 
+#include "mlir/Bindings/Python/Nanobind.h" #include "mlir-c/Bindings/Python/Interop.h" #include "mlir/Bindings/Python/IRCore.h" -#include "mlir/Bindings/Python/Nanobind.h" #include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/CAPI/Support.h" diff --git a/lib/CAPI/Dialect/Fly/CMakeLists.txt b/lib/CAPI/Dialect/Fly/CMakeLists.txt index 2ec143a1f..ee4dbc49f 100644 --- a/lib/CAPI/Dialect/Fly/CMakeLists.txt +++ b/lib/CAPI/Dialect/Fly/CMakeLists.txt @@ -1,5 +1,21 @@ -add_mlir_public_c_api_library(MLIRCPIFly - FlyDialect.cpp - LINK_LIBS PUBLIC - MLIRFlyDialect -) +# On Windows, MLIRFlyDialect's OBJECT files are added directly to FlyPythonCAPI +# (see python/mlir_flydsl/CMakeLists.txt) so that WINDOWS_EXPORT_ALL_SYMBOLS exports +# the mlir::fly::* C++ symbols. Linking MLIRFlyDialect here transitively would cause +# duplicate-symbol errors at FlyPythonCAPI.dll link time. +if(WIN32) + add_mlir_public_c_api_library(MLIRCPIFly + FlyDialect.cpp + ) + # Still need to compile against the dialect's headers (TableGen'd types). + add_dependencies(obj.MLIRCPIFly MLIRFlyIncGen) + target_include_directories(obj.MLIRCPIFly PRIVATE + ${CMAKE_BINARY_DIR}/include + ${CMAKE_SOURCE_DIR}/include + ) +else() + add_mlir_public_c_api_library(MLIRCPIFly + FlyDialect.cpp + LINK_LIBS PUBLIC + MLIRFlyDialect + ) +endif() diff --git a/lib/CAPI/Dialect/FlyROCDL/CMakeLists.txt b/lib/CAPI/Dialect/FlyROCDL/CMakeLists.txt index c30d361d6..7b37611a9 100644 --- a/lib/CAPI/Dialect/FlyROCDL/CMakeLists.txt +++ b/lib/CAPI/Dialect/FlyROCDL/CMakeLists.txt @@ -1,6 +1,23 @@ -add_mlir_public_c_api_library(MLIRCPIFlyROCDL - FlyROCDLDialect.cpp - LINK_LIBS PUBLIC - MLIRFlyROCDLDialect - MLIRFlyToROCDL -) +# On Windows, MLIRFlyROCDLDialect's OBJECT files are added directly to FlyPythonCAPI +# so WINDOWS_EXPORT_ALL_SYMBOLS exports the C++ symbols. Avoid duplicate linkage here. 
+# MLIRFlyToROCDL is still linked as a static lib — its symbols are only used through +# the CAPI registerFlyToROCDLConversionPass wrapper, which IS in an embedded obj. +if(WIN32) + add_mlir_public_c_api_library(MLIRCPIFlyROCDL + FlyROCDLDialect.cpp + LINK_LIBS PUBLIC + MLIRFlyToROCDL + ) + add_dependencies(obj.MLIRCPIFlyROCDL MLIRFlyROCDLIncGen) + target_include_directories(obj.MLIRCPIFlyROCDL PRIVATE + ${CMAKE_BINARY_DIR}/include + ${CMAKE_SOURCE_DIR}/include + ) +else() + add_mlir_public_c_api_library(MLIRCPIFlyROCDL + FlyROCDLDialect.cpp + LINK_LIBS PUBLIC + MLIRFlyROCDLDialect + MLIRFlyToROCDL + ) +endif() diff --git a/lib/Runtime/FlyRocmRuntimeWrappers.cpp b/lib/Runtime/FlyRocmRuntimeWrappers.cpp index 87b35d20d..473a0de83 100644 --- a/lib/Runtime/FlyRocmRuntimeWrappers.cpp +++ b/lib/Runtime/FlyRocmRuntimeWrappers.cpp @@ -19,6 +19,12 @@ #include "mlir/ExecutionEngine/CRunnerUtils.h" #include "hip/hip_runtime.h" +#ifdef _WIN32 +#define FLY_RUNTIME_EXPORT __declspec(dllexport) +#else +#define FLY_RUNTIME_EXPORT __attribute__((visibility("default"))) +#endif + #define HIP_REPORT_IF_ERROR(expr) \ [](hipError_t result) { \ if (!result) \ @@ -31,31 +37,31 @@ thread_local static int32_t defaultDevice = 0; -extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { +extern "C" FLY_RUNTIME_EXPORT hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { hipModule_t module = nullptr; HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data)); return module; } -extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) { +extern "C" FLY_RUNTIME_EXPORT hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) { (void)data; (void)optLevel; assert(false && "This function is not available in HIP."); return nullptr; } -extern "C" void mgpuModuleUnload(hipModule_t module) { +extern "C" FLY_RUNTIME_EXPORT void mgpuModuleUnload(hipModule_t module) { HIP_REPORT_IF_ERROR(hipModuleUnload(module)); } -extern "C" hipFunction_t 
mgpuModuleGetFunction(hipModule_t module, +extern "C" FLY_RUNTIME_EXPORT hipFunction_t mgpuModuleGetFunction(hipModule_t module, const char *name) { hipFunction_t function = nullptr; HIP_REPORT_IF_ERROR(hipModuleGetFunction(&function, module, name)); return function; } -extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, +extern "C" FLY_RUNTIME_EXPORT void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, @@ -66,7 +72,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, stream, params, extra)); } -extern "C" void mgpuLaunchClusterKernel(hipFunction_t function, +extern "C" FLY_RUNTIME_EXPORT void mgpuLaunchClusterKernel(hipFunction_t function, intptr_t clusterX, intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY, @@ -135,76 +141,76 @@ extern "C" void mgpuLaunchClusterKernel(hipFunction_t function, #endif } -extern "C" hipStream_t mgpuStreamCreate() { +extern "C" FLY_RUNTIME_EXPORT hipStream_t mgpuStreamCreate() { hipStream_t stream = nullptr; HIP_REPORT_IF_ERROR(hipStreamCreate(&stream)); return stream; } -extern "C" void mgpuStreamDestroy(hipStream_t stream) { +extern "C" FLY_RUNTIME_EXPORT void mgpuStreamDestroy(hipStream_t stream) { HIP_REPORT_IF_ERROR(hipStreamDestroy(stream)); } -extern "C" void mgpuStreamSynchronize(hipStream_t stream) { +extern "C" FLY_RUNTIME_EXPORT void mgpuStreamSynchronize(hipStream_t stream) { HIP_REPORT_IF_ERROR(hipStreamSynchronize(stream)); } -extern "C" void mgpuStreamWaitEvent(hipStream_t stream, hipEvent_t event) { +extern "C" FLY_RUNTIME_EXPORT void mgpuStreamWaitEvent(hipStream_t stream, hipEvent_t event) { HIP_REPORT_IF_ERROR(hipStreamWaitEvent(stream, event, /*flags=*/0)); } -extern "C" hipEvent_t mgpuEventCreate() { +extern "C" FLY_RUNTIME_EXPORT hipEvent_t mgpuEventCreate() { hipEvent_t event = nullptr; HIP_REPORT_IF_ERROR(hipEventCreateWithFlags(&event, 
hipEventDisableTiming)); return event; } -extern "C" void mgpuEventDestroy(hipEvent_t event) { +extern "C" FLY_RUNTIME_EXPORT void mgpuEventDestroy(hipEvent_t event) { HIP_REPORT_IF_ERROR(hipEventDestroy(event)); } -extern "C" void mgpuEventSynchronize(hipEvent_t event) { +extern "C" FLY_RUNTIME_EXPORT void mgpuEventSynchronize(hipEvent_t event) { HIP_REPORT_IF_ERROR(hipEventSynchronize(event)); } -extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) { +extern "C" FLY_RUNTIME_EXPORT void mgpuEventRecord(hipEvent_t event, hipStream_t stream) { HIP_REPORT_IF_ERROR(hipEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/, +extern "C" FLY_RUNTIME_EXPORT void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/, bool /*isHostShared*/) { void *ptr = nullptr; HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes)); return ptr; } -extern "C" void mgpuMemFree(void *ptr, hipStream_t /*stream*/) { +extern "C" FLY_RUNTIME_EXPORT void mgpuMemFree(void *ptr, hipStream_t /*stream*/) { HIP_REPORT_IF_ERROR(hipFree(ptr)); } -extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, +extern "C" FLY_RUNTIME_EXPORT void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, hipStream_t stream) { HIP_REPORT_IF_ERROR( hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream)); } -extern "C" void mgpuMemset32(void *dst, int value, size_t count, +extern "C" FLY_RUNTIME_EXPORT void mgpuMemset32(void *dst, int value, size_t count, hipStream_t stream) { HIP_REPORT_IF_ERROR(hipMemsetD32Async(reinterpret_cast<hipDeviceptr_t>(dst), value, count, stream)); } -extern "C" void mgpuMemset16(void *dst, int shortValue, size_t count, +extern "C" FLY_RUNTIME_EXPORT void mgpuMemset16(void *dst, int shortValue, size_t count, hipStream_t stream) { HIP_REPORT_IF_ERROR(hipMemsetD16Async(reinterpret_cast<hipDeviceptr_t>(dst), shortValue, count, stream)); } -extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { +extern "C" FLY_RUNTIME_EXPORT void 
mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { HIP_REPORT_IF_ERROR(hipHostRegister(ptr, sizeBytes, /*flags=*/0)); } -extern "C" void +extern "C" FLY_RUNTIME_EXPORT void mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, int64_t elementSizeBytes) { int64_t *sizes = descriptor->sizes; @@ -232,11 +238,11 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, mgpuMemHostRegister(ptr, sizeBytes); } -extern "C" void mgpuMemHostUnregister(void *ptr) { +extern "C" FLY_RUNTIME_EXPORT void mgpuMemHostUnregister(void *ptr) { HIP_REPORT_IF_ERROR(hipHostUnregister(ptr)); } -extern "C" void +extern "C" FLY_RUNTIME_EXPORT void mgpuMemHostUnregisterMemRef(int64_t /*rank*/, StridedMemRefType<char, 1> *descriptor, int64_t elementSizeBytes) { @@ -251,7 +257,7 @@ static void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) { hipHostGetDevicePointer((void **)devicePtr, hostPtr, /*flags=*/0)); } -extern "C" StridedMemRefType<float, 1> +extern "C" FLY_RUNTIME_EXPORT StridedMemRefType<float, 1> mgpuMemGetDeviceMemRef1dFloat(float * /*allocated*/, float *aligned, int64_t offset, int64_t size, int64_t stride) { float *devicePtr = nullptr; @@ -259,7 +265,7 @@ mgpuMemGetDeviceMemRef1dFloat(float * /*allocated*/, float *aligned, return {devicePtr, devicePtr, offset, {size}, {stride}}; } -extern "C" StridedMemRefType<int32_t, 1> +extern "C" FLY_RUNTIME_EXPORT StridedMemRefType<int32_t, 1> mgpuMemGetDeviceMemRef1dInt32(int32_t * /*allocated*/, int32_t *aligned, int64_t offset, int64_t size, int64_t stride) { int32_t *devicePtr = nullptr; @@ -267,7 +273,7 @@ mgpuMemGetDeviceMemRef1dInt32(int32_t * /*allocated*/, int32_t *aligned, return {devicePtr, devicePtr, offset, {size}, {stride}}; } -extern "C" void mgpuSetDefaultDevice(int32_t device) { +extern "C" FLY_RUNTIME_EXPORT void mgpuSetDefaultDevice(int32_t device) { defaultDevice = device; HIP_REPORT_IF_ERROR(hipSetDevice(device)); } diff --git a/python/flydsl/_compat.py b/python/flydsl/_compat.py index 867ccedc8..a440d452b 100644 --- 
a/python/flydsl/_compat.py +++ b/python/flydsl/_compat.py @@ -9,6 +9,24 @@ import ctypes import os +import sys + + +def _default_comgr_path() -> str: + """Return the default path to the system ``libamd_comgr`` library.""" + if sys.platform == "win32": + rocm = os.environ.get("ROCM_PATH") or os.environ.get("HIP_PATH", "") + if rocm: + return os.path.join(rocm, "bin", "amd_comgr.dll") + return "amd_comgr.dll" + return "/opt/rocm/lib/libamd_comgr.so.3" + + +def _comgr_sim_name() -> str: + """Return the simulator-side comgr library name.""" + if sys.platform == "win32": + return "amd_comgr.dll" + return "libamd_comgr.so.3" def _maybe_preload_system_comgr() -> None: @@ -32,10 +50,8 @@ def _maybe_preload_system_comgr() -> None: if not in_ffm_session: return - system_comgr = os.environ.get( - "FLYDSL_COMGR_PRELOAD_PATH", "/opt/rocm/lib/libamd_comgr.so.3" - ) - sim_comgr = os.path.join(model_path, "rocm", "libamd_comgr.so.3") + system_comgr = os.environ.get("FLYDSL_COMGR_PRELOAD_PATH", _default_comgr_path()) + sim_comgr = os.path.join(model_path, "rocm", _comgr_sim_name()) if not (os.path.exists(system_comgr) and os.path.exists(sim_comgr)): return diff --git a/python/flydsl/compiler/backends/rocm.py b/python/flydsl/compiler/backends/rocm.py index 5188d1529..e1a70bd6a 100644 --- a/python/flydsl/compiler/backends/rocm.py +++ b/python/flydsl/compiler/backends/rocm.py @@ -5,6 +5,7 @@ from ...runtime.device import get_rocm_arch, is_rdna_arch from ...utils import env +from ...utils.platform import rocm_toolkit_path, shared_lib_glob, shared_lib_name from .base import BaseBackend, GPUTarget @@ -83,7 +84,7 @@ def pipeline_fragments(self, *, compile_hints: dict) -> List[str]: if env.debug.enable_debug_info else [] ), - f'gpu-module-to-binary{{format=fatbin opts="{" ".join(bin_cli_opts)}"}}', + f'gpu-module-to-binary{{format=fatbin opts="{" ".join(bin_cli_opts)}" toolkit={rocm_toolkit_path()}}}', ] def gpu_module_targets(self) -> List[str]: @@ -94,15 +95,16 @@ def 
gpu_module_targets(self) -> List[str]: def native_lib_patterns(self) -> List[str]: return [ - "_mlirDialectsFly*.so", - "libFly*.so", - "libfly_jit_runtime.so", - "libmlir_rocm_runtime.so", - "_mlirRegisterEverything*.so", + shared_lib_glob("_mlirDialectsFly*.so"), + shared_lib_glob("libFly*.so"), + shared_lib_name("libfly_jit_runtime.so"), + shared_lib_name("libmlir_rocm_runtime.so"), + shared_lib_glob("_mlirRegisterEverything*.so"), ] def jit_runtime_lib_basenames(self) -> List[str]: return [ - "libfly_jit_runtime.so", - "libmlir_c_runner_utils.so", + shared_lib_name("libfly_jit_runtime.so"), + shared_lib_name("libmlir_c_runner_utils.so"), + shared_lib_name("libmlir_rocm_runtime.so"), ] diff --git a/python/flydsl/compiler/jit_executor.py b/python/flydsl/compiler/jit_executor.py index 0905b09ee..7a19852e1 100644 --- a/python/flydsl/compiler/jit_executor.py +++ b/python/flydsl/compiler/jit_executor.py @@ -14,6 +14,8 @@ @lru_cache(maxsize=1) def _resolve_runtime_libs() -> List[str]: + import sys + from .backends import get_backend backend = get_backend() @@ -25,6 +27,20 @@ def _resolve_runtime_libs() -> List[str]: f"Required JIT runtime library not found: {lib}\n" f"Please rebuild the project." ) + + # Windows: LLVM's LoadLibraryPermanently uses default search order which does + # not include the DLL's own directory. Pre-load each runtime lib (and any + # sibling .dll dependencies) via ctypes by full path; ctypes then resolves + # load-time deps from the DLL's own directory and add_dll_directory() paths. 
+ if sys.platform == "win32": + import os + os.add_dll_directory(str(mlir_libs_dir)) + for dep in mlir_libs_dir.glob("*.dll"): + try: + ctypes.CDLL(str(dep)) # no mode=: ctypes ignores it on Windows (only winmode counts) + except OSError: + pass + return [str(p) for p in libs] diff --git a/python/flydsl/compiler/jit_function.py b/python/flydsl/compiler/jit_function.py index f8be93aaf..6674a6f77 100644 --- a/python/flydsl/compiler/jit_function.py +++ b/python/flydsl/compiler/jit_function.py @@ -19,6 +19,7 @@ from .._mlir.passmanager import PassManager from ..expr.typing import Stream from ..utils import env, log +from ..utils.platform import rocm_toolkit_path from .ast_rewriter import ASTRewriter from .backends import compile_backend_name, get_backend from .jit_argument import convert_to_jit_arguments @@ -40,8 +41,8 @@ def _flydsl_key() -> str: Covers: 1. All Python source files under flydsl.compiler.*, flydsl.expr.*, flydsl.runtime.*, flydsl.utils.* - 2. Native shared libraries (_mlirDialectsFly*.so, libFly*.so, libfly_jit_runtime.so, - libmlir_rocm_runtime.so) + 2. Native shared libraries (e.g. _mlirDialectsFly*, libFly*, fly_jit_runtime, + mlir_rocm_runtime — .so on Linux, .dll/.pyd on Windows) 3. 
flydsl.__version__ Any change to compiler code, pass pipeline, runtime wrappers, or C++ @@ -304,7 +305,7 @@ def _dump_isa(*, dump_dir: Path, ctx: ir.Context, asm: str, verify: bool, stage_ "ensure-debug-info-scope-on-llvm-func{emission-kind=LineTablesOnly}," if env.debug.enable_debug_info else "" ) pm = PassManager.parse( - f'builtin.module({di_pass}gpu-module-to-binary{{format=isa opts="{"-g" if env.debug.enable_debug_info else ""}" section= toolkit=}})', + f'builtin.module({di_pass}gpu-module-to-binary{{format=isa opts="{"-g" if env.debug.enable_debug_info else ""}" section= toolkit={rocm_toolkit_path()}}})', context=ctx, ) pm.enable_verifier(bool(verify)) diff --git a/python/flydsl/runtime/device.py b/python/flydsl/runtime/device.py index 50ffc1287..7a933ace6 100644 --- a/python/flydsl/runtime/device.py +++ b/python/flydsl/runtime/device.py @@ -3,18 +3,38 @@ import functools import os +import shutil import subprocess +import sys from typing import Optional _ROCM_AGENT_TIMEOUT_S = int(os.environ.get("FLYDSL_ROCM_AGENT_TIMEOUT", "300")) +def _find_rocm_tool(name: str) -> Optional[str]: + """Locate a ROCm CLI tool, checking ROCM_PATH/HIP_PATH on Windows.""" + found = shutil.which(name) + if found: + return found + if sys.platform == "win32": + for env_var in ("ROCM_PATH", "HIP_PATH"): + prefix = os.environ.get(env_var) + if prefix: + candidate = os.path.join(prefix, "bin", name + ".exe") + if os.path.isfile(candidate): + return candidate + return None + + def _arch_from_rocm_agent_enumerator() -> Optional[str]: """Query rocm_agent_enumerator (standard ROCm tool) for the first GPU arch.""" + tool = _find_rocm_tool("rocm_agent_enumerator") + if tool is None: + return None try: out = subprocess.check_output( - ["rocm_agent_enumerator", "-name"], + [tool, "-name"], text=True, timeout=_ROCM_AGENT_TIMEOUT_S, stderr=subprocess.DEVNULL, @@ -43,6 +63,17 @@ def get_rocm_arch() -> str: if arch: return arch.split(":", 1)[0] + # Fallback: query torch-rocm. 
TheRock SDK on Windows doesn't ship + # rocm_agent_enumerator, but torch.cuda reports gcnArchName on ROCm. + try: + import torch + if torch.cuda.is_available(): + gcn = getattr(torch.cuda.get_device_properties(0), "gcnArchName", None) + if gcn: + return gcn.split(":", 1)[0] + except Exception: + pass + return "gfx942" @@ -53,9 +84,12 @@ def get_rocm_device_count() -> int: Uses the same invocation as :func:`_arch_from_rocm_agent_enumerator`. Returns 0 when the tool is unavailable or no discrete GPU agents are reported. """ + tool = _find_rocm_tool("rocm_agent_enumerator") + if tool is None: + return 0 try: out = subprocess.check_output( - ["rocm_agent_enumerator", "-name"], + [tool, "-name"], text=True, timeout=5, stderr=subprocess.DEVNULL, diff --git a/python/flydsl/utils/platform.py b/python/flydsl/utils/platform.py new file mode 100644 index 000000000..6df270b93 --- /dev/null +++ b/python/flydsl/utils/platform.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 FlyDSL Project Contributors + +"""Cross-platform helpers for Windows / Linux compatibility.""" + +import os +import sys + +IS_WINDOWS = sys.platform == "win32" + +
+def _ensure_junction(link: str, target: str) -> bool:
+    """Make ``link`` a Windows directory junction to ``target``.
+
+    Junctions don't require admin (unlike symlinks). A junction that is dangling
+    or points somewhere else is replaced, so switching ``ROCM_PATH`` to another
+    SDK takes effect. A non-empty real directory at ``link`` is never removed.
+
+    Returns True if ``link`` points at ``target`` afterwards; if ``link`` could
+    not be replaced, returns whether it is at least an existing directory.
+    """
+    import subprocess
+
+    # lexists (not exists): a dangling junction must be seen so it can be replaced.
+    if os.path.lexists(link):
+        try:
+            if os.path.samefile(link, target):
+                return True
+        except OSError:
+            pass
+        # rmdir drops a junction (or an empty directory) without touching the
+        # target's contents; it fails on a non-empty real directory.
+        try:
+            os.rmdir(link)
+        except OSError:
+            return os.path.isdir(link)
+    os.makedirs(os.path.dirname(link), exist_ok=True)
+    try:
+        subprocess.run(
+            ["cmd", "/c", "mklink", "/J", link, target],
+            check=True,
+            capture_output=True,
+        )
+        return True
+    except (OSError, subprocess.CalledProcessError):
+        return False
+ + +def rocm_toolkit_path() -> str: + """Path MLIR's gpu-module-to-binary should use as ``toolkit``. 
+ + MLIR's ROCDL target appends ``llvm/bin/ld.lld`` (for linking) and + ``amdgcn/bitcode`` (for OCML/OCKL bitcode) to this path. + + - Linux ROCm at ``/opt/rocm`` has both at the standard relative locations → + empty/default toolkit works. + - Windows TheRock SDK has ``ld.lld.exe`` at ``<ROCM_PATH>/lib/llvm/bin/`` and + bitcode at ``<ROCM_PATH>/lib/llvm/amdgcn/bitcode/``. No single toolkit path + satisfies both MLIR-expected sub-paths, so we stage a directory under the + user's LocalAppData with directory junctions that unify the layout: + <staging>/llvm → <ROCM_PATH>/lib/llvm + <staging>/amdgcn → <ROCM_PATH>/lib/llvm/amdgcn + + Returns an empty string if no override is needed; falls back to ``<ROCM_PATH>/lib`` if staging fails. + """ + if not IS_WINDOWS: + return "" + rocm = os.environ.get("ROCM_PATH") or os.environ.get("HIP_PATH") + if not rocm: + return "" + llvm_dir = os.path.join(rocm, "lib", "llvm") + ld_lld = os.path.join(llvm_dir, "bin", "ld.lld.exe") + amdgcn = os.path.join(llvm_dir, "amdgcn") + if not (os.path.isfile(ld_lld) and os.path.isdir(amdgcn)): + return "" + + cache_root = os.environ.get("LOCALAPPDATA") or os.path.expanduser("~") + staging = os.path.join(cache_root, "flydsl", "rocm_toolkit") + ok_llvm = _ensure_junction(os.path.join(staging, "llvm"), llvm_dir) + ok_amdgcn = _ensure_junction(os.path.join(staging, "amdgcn"), amdgcn) + if ok_llvm and ok_amdgcn: + # MLIR pass-option parser is brace/colon sensitive — use forward slashes. + return staging.replace("\\", "/") + + # Fallback: lld still works from <ROCM_PATH>/lib (OCML won't resolve). + candidate = os.path.join(rocm, "lib") + if os.path.isfile(os.path.join(candidate, "llvm", "bin", "ld.lld.exe")): + return candidate.replace("\\", "/") + return "" + + +def shared_lib_ext() -> str: + """Return the native shared-library file extension for the current platform.""" + if IS_WINDOWS: + return ".dll" + return ".so" + + +def shared_lib_name(basename: str) -> str: + """Convert a Linux-style shared library name to the platform equivalent. 
+ + Examples (Linux → Windows): + libfoo.so → foo.dll + libfoo.so.3 → foo.dll + _bar*.so → _bar*.pyd (Python extension) + """ + if not IS_WINDOWS: + return basename + + # Python extension modules: _name*.so → _name*.pyd + if basename.startswith("_") and basename.endswith(".so"): + return basename[:-3] + ".pyd" + + # Versioned sonames: libfoo.so.3 → foo.dll + name = basename + if ".so." in name: + name = name[: name.index(".so.")] + elif name.endswith(".so"): + name = name[:-3] + + # Drop lib prefix (Windows convention) + if name.startswith("lib"): + name = name[3:] + + return name + ".dll" + + +def shared_lib_glob(pattern: str) -> str: + """Convert a Linux glob pattern for shared libraries to the platform equivalent. + + Examples (Linux → Windows): + _mlirDialectsFly*.so → _mlirDialectsFly*.pyd + libFly*.so → Fly*.dll + libfoo.so → foo.dll + """ + if not IS_WINDOWS: + return pattern + + # Python extension globs: _name*.so → _name*.pyd + if pattern.startswith("_") and pattern.endswith(".so"): + return pattern[:-3] + ".pyd" + + # lib*.so globs + name = pattern + if name.endswith(".so"): + name = name[:-3] + if name.startswith("lib"): + name = name[3:] + return name + ".dll" diff --git a/python/mlir_flydsl/CMakeLists.txt b/python/mlir_flydsl/CMakeLists.txt index 07a51ac00..e0efe5af7 100644 --- a/python/mlir_flydsl/CMakeLists.txt +++ b/python/mlir_flydsl/CMakeLists.txt @@ -25,12 +25,25 @@ declare_mlir_dialect_python_bindings( GEN_ENUM_BINDINGS ) -# NOTE: Do NOT link MLIRFlyDialect/MLIRFlyROCDLDialect here via PRIVATE_LINK_LIBS. +# NOTE: Do NOT link MLIRFlyDialect/MLIRFlyROCDLDialect here via PRIVATE_LINK_LIBS on Linux. # These symbols are already provided by FlyPythonCAPI.so (via MLIRCPIFly's transitive # dependencies in EMBED_CAPI_LINK_LIBS). Statically linking them here creates DUPLICATE # TypeID static variables, causing "storage uniquer isn't initialized" errors at runtime. 
# The COMMON_CAPI_LINK_LIBS FlyPythonCAPI in add_mlir_python_modules ensures _mlirDialectsFly.so # links to FlyPythonCAPI.so, which provides all needed symbols. +# +# Windows: FlyPythonCAPI.dll is built with WINDOWS_EXPORT_ALL_SYMBOLS (set below), so it +# exports ALL C++ symbols from its statically-linked deps (MLIRFlyDialect, MLIRArithDialect, +# etc.) via an auto-generated .def file. The .pyd extensions then resolve all mlir::fly:: +# symbols from FlyPythonCAPI.lib at link time — no second static copy, no duplicate +# storage uniquers. This is the correct Windows equivalent of ELF's global symbol table. +if(WIN32) + set(_FLY_EXTRA_LINK_LIBS "") + set(_FLY_ROCDL_EXTRA_LINK_LIBS "") +else() + set(_FLY_EXTRA_LINK_LIBS "") + set(_FLY_ROCDL_EXTRA_LINK_LIBS "") +endif() declare_mlir_python_extension(FlyPythonSources.Core.fly MODULE_NAME _mlirDialectsFly @@ -42,6 +55,7 @@ declare_mlir_python_extension(FlyPythonSources.Core.fly FlyExtension.cpp PRIVATE_LINK_LIBS LLVMSupport + ${_FLY_EXTRA_LINK_LIBS} ) declare_mlir_python_extension(FlyPythonSources.Core.fly_rocdl @@ -53,6 +67,7 @@ declare_mlir_python_extension(FlyPythonSources.Core.fly_rocdl FlyROCDLExtension.cpp PRIVATE_LINK_LIBS LLVMSupport + ${_FLY_ROCDL_EXTRA_LINK_LIBS} ) # NOTE: Do NOT link MLIRFlyToROCDL or other C++ libs via PRIVATE_LINK_LIBS. @@ -162,6 +177,86 @@ add_mlir_python_modules(FlyPythonModules FlyPythonCAPI ) +# --------------------------------------------------------------------------- +# Windows: cross-DLL TypeID resolution via string fallback +# +# MLIR's MLIR_DECLARE_EXPLICIT_SELF_OWNING_TYPE_ID declares a static DATA +# member (SelfOwningTypeID id) with no __declspec(dllimport/dllexport). +# On Windows, static DATA members defined in a DLL cannot be accessed by +# consumers without dllimport — the linker sees an unresolved symbol because +# the .obj references the symbol directly, not via __imp_ indirection. 
+# +# The correct fix is MLIR_USE_FALLBACK_TYPE_IDS: when true, TypeID uses +# string-based comparison (FallbackTypeIDResolver) instead of pointer +# identity of a cross-DLL static. This is exactly what the comment in +# TypeID.h recommends for "complex shared library setups". No DATA symbols +# cross DLL boundaries, so no dllimport/dllexport is required. +# +# We define this globally so FlyPythonCAPI and all .pyd extensions use the +# same TypeID resolution strategy. The fallback is slightly slower but +# correct and is the standard approach for Windows MLIR ports. +# --------------------------------------------------------------------------- +if(WIN32) + # MLIR_USE_FALLBACK_TYPE_IDS is set at the top-level CMakeLists.txt so it applies + # to every compilation unit (including obj.MLIRFlyDialect). See comment there. + + # Export ALL C++ symbols from FlyPythonCAPI.dll via an auto-generated .def file. + # WINDOWS_EXPORT_ALL_SYMBOLS scans .obj files directly linked into the DLL; it does + # NOT scan linked static libraries. To make the mlir::fly::* C++ symbols visible to + # .pyd extensions (which directly #include & call these APIs in FlyExtension.cpp / + # TiledOpTraits.cpp), we add the OBJECT library outputs of MLIRFlyDialect and + # MLIRFlyROCDLDialect as direct sources of FlyPythonCAPI. The scan then includes + # all Fly dialect symbols in the auto-generated exports.def. + # + # NOTE: We disable the transitive MLIRFlyDialect/MLIRFlyROCDLDialect link chain + # on Windows (see lib/CAPI/Dialect/{Fly,FlyROCDL}/CMakeLists.txt) to avoid double + # linkage of the same symbols. + set_target_properties(FlyPythonCAPI PROPERTIES + WINDOWS_EXPORT_ALL_SYMBOLS ON) + + target_sources(FlyPythonCAPI PRIVATE + $ + $ + ) + # TableGen'd headers must exist before FlyPythonCAPI compiles, since the embedded + # objects reference them via their .cpp.obj metadata. 
+ add_dependencies(FlyPythonCAPI MLIRFlyDialect MLIRFlyROCDLDialect) + + # -------------------------------------------------------------------------- + # Extract .obj files from upstream MLIR static archives so WINDOWS_EXPORT_ALL_SYMBOLS + # can scan them and re-export their C++ symbols from FlyPythonCAPI.dll. + # + # The .pyd extensions (FlyExtension.cpp, TiledOpTraits.cpp) directly use C++ APIs + # from MLIRIR (Float8E5M2Type, MLIRContext::getTypeUniquer, etc.) and MLIRSupport + # (StorageUniquer::*). These live in .lib archives which WINDOWS_EXPORT_ALL_SYMBOLS + # cannot scan — so we extract their .obj files at configure time and inject them + # as direct sources. The MSVC/LLD linker resolves symbols from .obj inputs before + # searching .lib archives, so no duplicate-definition errors arise. + # -------------------------------------------------------------------------- + set(_FLY_EXPORT_STATIC_LIBS MLIRIR MLIRSupport) + set(_FLY_EXTRA_OBJS "") + foreach(_lib IN LISTS _FLY_EXPORT_STATIC_LIBS) + set(_lib_path "${MLIR_DIR}/../../../lib/${_lib}.lib") + get_filename_component(_lib_path "${_lib_path}" ABSOLUTE) + if(NOT EXISTS "${_lib_path}") + message(FATAL_ERROR "Cannot locate ${_lib}.lib for Windows exports augmentation at ${_lib_path}") + endif() + set(_extract_dir "${CMAKE_BINARY_DIR}/extracted_objs/${_lib}") + file(MAKE_DIRECTORY "${_extract_dir}") + execute_process( + COMMAND "${LLVM_TOOLS_BINARY_DIR}/llvm-ar.exe" x "${_lib_path}" + WORKING_DIRECTORY "${_extract_dir}" + RESULT_VARIABLE _extract_res + ) + if(NOT _extract_res EQUAL 0) + message(FATAL_ERROR "llvm-ar x ${_lib}.lib failed (result=${_extract_res})") + endif() + file(GLOB _lib_objs "${_extract_dir}/*.obj") + list(APPEND _FLY_EXTRA_OBJS ${_lib_objs}) + endforeach() + target_sources(FlyPythonCAPI PRIVATE ${_FLY_EXTRA_OBJS}) +endif() + ################################################################################ # Type Stubs Generation 
################################################################################ @@ -170,24 +265,50 @@ set(_FLYDSL_PYTHON_PACKAGES_DIR "${MLIR_BINARY_DIR}/python_packages") set(_MLIR_LIBS_DIR "${FlyPythonModules_ROOT_PREFIX}/_mlir_libs") set(_STUB_MARKER_FILE "${_MLIR_LIBS_DIR}/.stubs_generated") -add_custom_command( - OUTPUT "${_STUB_MARKER_FILE}" - COMMAND /bin/bash -c "\ - PYTHONPATH='${_FLYDSL_PYTHON_PACKAGES_DIR}' \ - '${Python3_EXECUTABLE}' -m nanobind.stubgen \ - -q -r \ - -m flydsl._mlir._mlir_libs._mlir \ - -m flydsl._mlir._mlir_libs._mlirDialectsFly \ - -m flydsl._mlir._mlir_libs._mlirDialectsFlyROCDL \ - -m flydsl._mlir._mlir_libs._mlirDialectsGPU \ - -m flydsl._mlir._mlir_libs._mlirDialectsLLVM \ - -O '${_MLIR_LIBS_DIR}' \ - || echo 'Warning: nanobind.stubgen not available -- skipping stub generation'" - COMMAND ${CMAKE_COMMAND} -E touch "${_STUB_MARKER_FILE}" - DEPENDS CopyFlyPythonSources - COMMENT "Generating Python stub files for all extension modules" - VERBATIM -) +if(WIN32) + # NOTE: omit `-m flydsl._mlir._mlir_libs._mlir` on Windows. That extension has + # submodules (ir, passmanager, rewrite) and nanobind.stubgen -r generates a + # sibling directory `_mlir/` containing `__init__.pyi` + submodule stubs. + # On Windows, Python's import system treats that directory as a PEP 420 + # namespace package and shadows the `_mlir.cp312-win_amd64.pyd`, so + # `from ._mlir import ir` fails. The stubs are ergonomics-only; skipping + # upstream MLIR's stubs is acceptable. The Fly-specific extensions don't + # have Python submodules and generate flat `.pyi` files that don't conflict. 
+ add_custom_command( + OUTPUT "${_STUB_MARKER_FILE}" + COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${_FLYDSL_PYTHON_PACKAGES_DIR}" + "${Python3_EXECUTABLE}" -m nanobind.stubgen + -q -r + -m flydsl._mlir._mlir_libs._mlirDialectsFly + -m flydsl._mlir._mlir_libs._mlirDialectsFlyROCDL + -m flydsl._mlir._mlir_libs._mlirDialectsGPU + -m flydsl._mlir._mlir_libs._mlirDialectsLLVM + -O "${_MLIR_LIBS_DIR}" + COMMAND ${CMAKE_COMMAND} -E touch "${_STUB_MARKER_FILE}" + DEPENDS CopyFlyPythonSources + COMMENT "Generating Python stub files for all extension modules" + VERBATIM + ) +else() + add_custom_command( + OUTPUT "${_STUB_MARKER_FILE}" + COMMAND /bin/bash -c "\ + PYTHONPATH='${_FLYDSL_PYTHON_PACKAGES_DIR}' \ + '${Python3_EXECUTABLE}' -m nanobind.stubgen \ + -q -r \ + -m flydsl._mlir._mlir_libs._mlir \ + -m flydsl._mlir._mlir_libs._mlirDialectsFly \ + -m flydsl._mlir._mlir_libs._mlirDialectsFlyROCDL \ + -m flydsl._mlir._mlir_libs._mlirDialectsGPU \ + -m flydsl._mlir._mlir_libs._mlirDialectsLLVM \ + -O '${_MLIR_LIBS_DIR}' \ + || echo 'Warning: nanobind.stubgen not available -- skipping stub generation'" + COMMAND ${CMAKE_COMMAND} -E touch "${_STUB_MARKER_FILE}" + DEPENDS CopyFlyPythonSources + COMMENT "Generating Python stub files for all extension modules" + VERBATIM + ) +endif() add_custom_target(FlyPythonStubs ALL DEPENDS "${_STUB_MARKER_FILE}" @@ -212,25 +333,71 @@ add_custom_target(CopyFlyPythonSources ALL "${MLIR_BINARY_DIR}/python_packages/flydsl/_mlir/dialects/_fly_rocdl_enum_gen.py" COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" - "${_MLIR_LIBS_DIR}/libmlir_c_runner_utils.so" + "${_MLIR_LIBS_DIR}/$" COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "${_MLIR_LIBS_DIR}/$" - COMMAND bash -c "patchelf --set-rpath '\\$$ORIGIN' '${_MLIR_LIBS_DIR}/libmlir_c_runner_utils.so'" - COMMAND bash -c "patchelf --set-rpath '\\$$ORIGIN' '${_MLIR_LIBS_DIR}/$'" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "$" + "${_MLIR_LIBS_DIR}/$" COMMENT "Copying python/flydsl sources to 
build/python_packages/flydsl" DEPENDS FlyPythonModules ) add_dependencies(CopyFlyPythonSources FlyPythonModules) +# Windows: upstream MLIRPythonSources.Core copies `_mlir_libs/_mlir/py.typed` and +# `_mlir_libs/_mlirRegisterEverything/py.typed` from the MLIR source tree. On Linux +# this is harmless — the `.so` extension takes precedence over a same-named +# namespace-package directory. On Windows, Python's import system treats those +# directories as PEP 420 namespace packages and SHADOWS the `_mlir.cp312-win_amd64.pyd` +# / `_mlirRegisterEverything.cp312-win_amd64.pyd` extensions, breaking +# `from ._mlir import ir`. The stubs are ergonomics-only; safe to drop on Windows. +if(WIN32) + add_custom_command(TARGET CopyFlyPythonSources POST_BUILD + COMMAND ${CMAKE_COMMAND} -E rm -rf + "${_MLIR_LIBS_DIR}/_mlir" + "${_MLIR_LIBS_DIR}/_mlirRegisterEverything" + COMMENT "Removing py.typed shadow directories that conflict with .pyd extensions" + VERBATIM + ) +endif() + +# patchelf RPATH fixup — Linux only (Windows uses PATH / DLL search order) +if(UNIX AND NOT APPLE) + add_custom_command(TARGET CopyFlyPythonSources POST_BUILD + COMMAND bash -c "patchelf --set-rpath '\\$$ORIGIN' '${_MLIR_LIBS_DIR}/$'" || true + COMMAND bash -c "patchelf --set-rpath '\\$$ORIGIN' '${_MLIR_LIBS_DIR}/$'" || true + COMMENT "Setting RPATH on copied runtime libraries" + VERBATIM + ) +endif() + ################################################################################ # FlyJitRuntime — thin ROCm runtime with GPU module caching ################################################################################ -file(GLOB _ROCM_SEARCH_PATHS LIST_DIRECTORIES true "/opt/rocm*") -list(SORT _ROCM_SEARCH_PATHS ORDER DESCENDING) +if(WIN32) + # On Windows, ROCm / HIP is typically found via ROCM_PATH or HIP_PATH env vars, + # or installed by TheRock into the Python venv. 
+ set(_ROCM_SEARCH_PATHS) + if(DEFINED ENV{ROCM_PATH}) + list(APPEND _ROCM_SEARCH_PATHS "$ENV{ROCM_PATH}") + endif() + if(DEFINED ENV{HIP_PATH}) + list(APPEND _ROCM_SEARCH_PATHS "$ENV{HIP_PATH}") + endif() + # Also check the active Python prefix (TheRock installs HIP into the venv) + if(DEFINED Python3_EXECUTABLE) + get_filename_component(_PY_PREFIX "${Python3_EXECUTABLE}" DIRECTORY) + get_filename_component(_PY_PREFIX "${_PY_PREFIX}" DIRECTORY) + list(APPEND _ROCM_SEARCH_PATHS "${_PY_PREFIX}") + endif() +else() + file(GLOB _ROCM_SEARCH_PATHS LIST_DIRECTORIES true "/opt/rocm*") + list(SORT _ROCM_SEARCH_PATHS ORDER DESCENDING) +endif() find_package(hip REQUIRED CONFIG PATHS ${_ROCM_SEARCH_PATHS}) add_library(FlyJitRuntime SHARED @@ -241,10 +408,15 @@ target_include_directories(FlyJitRuntime PRIVATE ${MLIR_INCLUDE_DIRS} ) target_compile_features(FlyJitRuntime PRIVATE cxx_std_17) +if(WIN32) + target_compile_definitions(FlyJitRuntime PRIVATE _CRT_SECURE_NO_WARNINGS) +endif() target_link_libraries(FlyJitRuntime PRIVATE hip::host hip::amdhip64) set_target_properties(FlyJitRuntime PROPERTIES OUTPUT_NAME "fly_jit_runtime" LIBRARY_OUTPUT_DIRECTORY "${_MLIR_LIBS_DIR}" + RUNTIME_OUTPUT_DIRECTORY "${_MLIR_LIBS_DIR}" # Windows puts DLLs here + ARCHIVE_OUTPUT_DIRECTORY "${_MLIR_LIBS_DIR}" # Windows import libs ) add_dependencies(FlyPythonCAPI FlyJitRuntime) diff --git a/scripts/build.ps1 b/scripts/build.ps1 new file mode 100644 index 000000000..130a5dab7 --- /dev/null +++ b/scripts/build.ps1 @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 FlyDSL Project Contributors +# +# Windows build script for FlyDSL (PowerShell equivalent of build.sh) + +$ErrorActionPreference = "Stop" + +$SCRIPT_DIR = Split-Path -Parent $MyInvocation.MyCommand.Definition +$REPO_ROOT = (Resolve-Path "$SCRIPT_DIR\..").Path + +# --------------------------------------------------------------------------- +# Build directory (default: build-fly\, overridable via FLY_BUILD_DIR) +# 
--------------------------------------------------------------------------- +$BUILD_DIR = if ($env:FLY_BUILD_DIR) { $env:FLY_BUILD_DIR } else { "$REPO_ROOT\build-fly" } +if (-not [System.IO.Path]::IsPathRooted($BUILD_DIR)) { + $BUILD_DIR = "$REPO_ROOT\$BUILD_DIR" +} + +# --------------------------------------------------------------------------- +# Parallelism: default to processor count, overridable via -jN argument +# --------------------------------------------------------------------------- +$PARALLEL_JOBS = $env:NUMBER_OF_PROCESSORS +foreach ($arg in $args) { + if ($arg -match '^-j(\d+)$') { + $PARALLEL_JOBS = $Matches[1] + } +} + +# --------------------------------------------------------------------------- +# Discover MLIR_PATH +# --------------------------------------------------------------------------- +if (-not $env:MLIR_PATH) { + $BASE_DIR = (Resolve-Path "$REPO_ROOT\..").Path + $candidates = @( + "$BASE_DIR\llvm-project-flydsl\build-flydsl\mlir_install", + "$BASE_DIR\llvm-project\build-flydsl\mlir_install", + "$BASE_DIR\llvm-project\mlir_install" + ) + foreach ($p in $candidates) { + if (Test-Path "$p\lib\cmake\mlir") { + Write-Host "Auto-detected MLIR_PATH: $p" + $env:MLIR_PATH = $p + break + } + } +} + +if (-not $env:MLIR_PATH) { + Write-Error "MLIR_PATH not set and could not be auto-detected. Build LLVM/MLIR first or set `$env:MLIR_PATH to your mlir_install path." 
+ exit 1 +} + +# --------------------------------------------------------------------------- +# CMake generator: prefer Ninja, fall back to Visual Studio +# --------------------------------------------------------------------------- +$GENERATOR = $null +$CMAKE_CACHE = "$BUILD_DIR\CMakeCache.txt" +if (Test-Path $CMAKE_CACHE) { + $cached = Select-String -Path $CMAKE_CACHE -Pattern '^CMAKE_GENERATOR:INTERNAL=(.+)$' + if ($cached) { + $GENERATOR = $cached.Matches[0].Groups[1].Value + } +} +if (-not $GENERATOR) { + if (Get-Command ninja -ErrorAction SilentlyContinue) { + $GENERATOR = "Ninja" + } else { + $GENERATOR = "NMake Makefiles" + } +} + +Write-Host "==============================================" +Write-Host "FlyDSL Build (Windows)" +Write-Host " REPO_ROOT: $REPO_ROOT" +Write-Host " BUILD_DIR: $BUILD_DIR" +Write-Host " MLIR_PATH: $($env:MLIR_PATH)" +Write-Host " PARALLEL: -j$PARALLEL_JOBS" +Write-Host " GENERATOR: $GENERATOR" +Write-Host "==============================================" + +# --------------------------------------------------------------------------- +# Initialize git submodules if needed +# --------------------------------------------------------------------------- +if (-not (Test-Path "$REPO_ROOT\thirdparty\dlpack\include\dlpack\dlpack.h")) { + Write-Host "Initializing git submodules..." 
+ git -C $REPO_ROOT submodule update --init --recursive +} + +# --------------------------------------------------------------------------- +# Remove editable-install symlink/junction that would cause conflicts +# --------------------------------------------------------------------------- +$EDITABLE_MLIR_LINK = "$REPO_ROOT\python\flydsl\_mlir" +if (Test-Path $EDITABLE_MLIR_LINK) { + $item = Get-Item $EDITABLE_MLIR_LINK -Force + if ($item.Attributes -band [System.IO.FileAttributes]::ReparsePoint) { + Write-Host "Removing editable-install symlink/junction: $EDITABLE_MLIR_LINK" + cmd /c rmdir $EDITABLE_MLIR_LINK + } +} + +# --------------------------------------------------------------------------- +# Discover Python and nanobind +# --------------------------------------------------------------------------- +$PYTHON = (Get-Command python -ErrorAction SilentlyContinue).Source +if (-not $PYTHON) { + $PYTHON = (Get-Command python3 -ErrorAction SilentlyContinue).Source +} +if (-not $PYTHON) { + Write-Error "Python not found in PATH" + exit 1 +} + +$NANOBIND_DIR = $null +try { + $NANOBIND_DIR = & $PYTHON -c "import nanobind, os; print(os.path.dirname(nanobind.__file__) + '/cmake')" 2>$null +} catch {} + +# --------------------------------------------------------------------------- +# CMake configure +# --------------------------------------------------------------------------- +if (-not (Test-Path $BUILD_DIR)) { + New-Item -ItemType Directory -Path $BUILD_DIR -Force | Out-Null +} + +# Detect compiler +$CC = $null +if (Get-Command clang-cl -ErrorAction SilentlyContinue) { + $CC = "clang-cl" +} elseif (-not (Get-Command cl -ErrorAction SilentlyContinue)) { + Write-Warning "No C/C++ compiler found. Install Visual Studio Build Tools or LLVM/Clang." 
+} + +$cmake_args = @( + "-G", $GENERATOR, + $REPO_ROOT, + "-DMLIR_DIR=$($env:MLIR_PATH)\lib\cmake\mlir", + "-DLLVM_DIR=$($env:MLIR_PATH)\lib\cmake\llvm", + "-DCMAKE_BUILD_TYPE=Release", + "-DPython3_EXECUTABLE=$PYTHON" +) + +if ($CC) { + $cmake_args += "-DCMAKE_C_COMPILER=$CC" + $cmake_args += "-DCMAKE_CXX_COMPILER=$CC" +} +if ($NANOBIND_DIR) { + $cmake_args += "-Dnanobind_DIR=$NANOBIND_DIR" +} + +# Add HIP/ROCm paths for Windows (TheRock) +if ($env:ROCM_PATH) { + $cmake_args += "-DCMAKE_PREFIX_PATH=$($env:ROCM_PATH)" +} +if ($env:HIP_PATH) { + $cmake_args += "-Dhip_DIR=$($env:HIP_PATH)\lib\cmake\hip" +} + +Write-Host "Configuring CMake..." +Push-Location $BUILD_DIR +try { + cmake @cmake_args + if ($LASTEXITCODE -ne 0) { throw "CMake configure failed" } + + # --------------------------------------------------------------------------- + # Build + # --------------------------------------------------------------------------- + Write-Host "Building with -j$PARALLEL_JOBS..." + cmake --build . -j $PARALLEL_JOBS + if ($LASTEXITCODE -ne 0) { throw "CMake build failed" } +} finally { + Pop-Location +} + +# --------------------------------------------------------------------------- +# Done +# --------------------------------------------------------------------------- +$PYTHON_PKG_DIR = "$BUILD_DIR\python_packages" + +Write-Host "" +Write-Host "==============================================" +Write-Host "Build complete!" +Write-Host "" +Write-Host "Usage (no install):" +Write-Host " `$env:PYTHONPATH = '$PYTHON_PKG_DIR;' + `$env:PYTHONPATH" +Write-Host "" +Write-Host "Or install as editable package:" +Write-Host " cd $REPO_ROOT; pip install -e ." 
+Write-Host "==============================================" diff --git a/scripts/build_llvm.ps1 b/scripts/build_llvm.ps1 new file mode 100644 index 000000000..f317067f4 --- /dev/null +++ b/scripts/build_llvm.ps1 @@ -0,0 +1,179 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 FlyDSL Project Contributors +# +# Windows build script for LLVM/MLIR (PowerShell equivalent of build_llvm.sh) + +param( + [int]$Jobs = 0, + [switch]$NoInstall, + # GPU arch for ROCM_TEST_CHIPSET. TheRock SDK doesn't ship + # rocm_agent_enumerator, so MLIR's ROCM runner config needs an explicit arch. + # Override via -Arch or the FLYDSL_GPU_ARCH env var (defaults to gfx942). + [string]$Arch = $(if ($env:FLYDSL_GPU_ARCH) { $env:FLYDSL_GPU_ARCH } else { "gfx942" }) +) + +$ErrorActionPreference = "Stop" + +$SCRIPT_DIR = Split-Path -Parent $MyInvocation.MyCommand.Definition +$REPO_ROOT = (Resolve-Path "$SCRIPT_DIR\..").Path +$BASE_DIR = (Resolve-Path "$REPO_ROOT\..").Path + +$LLVM_SRC_DIR = "$BASE_DIR\llvm-project" +$LLVM_BUILD_DIR = "$LLVM_SRC_DIR\build-flydsl" +$LLVM_INSTALL_DIR = if ($env:LLVM_INSTALL_DIR) { $env:LLVM_INSTALL_DIR } else { "$LLVM_SRC_DIR\mlir_install" } +$LLVM_PACKAGE_INSTALL = if ($NoInstall) { "0" } else { "1" } + +# Read LLVM commit hash +$LLVM_HASH_FILE = "$REPO_ROOT\thirdparty\llvm-hash.txt" +$LLVM_COMMIT = if ($env:LLVM_COMMIT) { $env:LLVM_COMMIT } else { (Get-Content $LLVM_HASH_FILE -Raw).Trim() } + +if ($LLVM_COMMIT.Length -lt 40) { + Write-Error "LLVM_COMMIT must be a full 40-char SHA (got '$LLVM_COMMIT')" + exit 1 +} + +Write-Host "Base directory: $BASE_DIR" +Write-Host "LLVM Source: $LLVM_SRC_DIR" +Write-Host "LLVM Build: $LLVM_BUILD_DIR" +Write-Host "LLVM Install: $LLVM_INSTALL_DIR" +Write-Host "LLVM Commit: $LLVM_COMMIT" + +# 1. 
Clone LLVM +$LLVM_REMOTE = if ($env:LLVM_REMOTE) { $env:LLVM_REMOTE } else { "https://github.com/llvm/llvm-project.git" } + +if (-not (Test-Path $LLVM_SRC_DIR)) { + Write-Host "Fetching llvm-project commit $LLVM_COMMIT (shallow, single commit)..." + git init $LLVM_SRC_DIR + Push-Location $LLVM_SRC_DIR + git remote add origin $LLVM_REMOTE +} else { + Push-Location $LLVM_SRC_DIR +} + +$commitExists = git cat-file -e "${LLVM_COMMIT}^{commit}" 2>$null; $commitFound = $LASTEXITCODE -eq 0 +if (-not $commitFound) { + Write-Host "Fetching commit $LLVM_COMMIT ..." + git fetch --depth 1 origin $LLVM_COMMIT +} +git checkout $LLVM_COMMIT +Pop-Location + +# 2. Create Build Directory +if (-not (Test-Path $LLVM_BUILD_DIR)) { + New-Item -ItemType Directory -Path $LLVM_BUILD_DIR -Force | Out-Null +} + +# 3. Install Python deps +Write-Host "Installing Python dependencies..." +pip install nanobind numpy pybind11 + +# 4. Detect tools +$PYTHON = (Get-Command python -ErrorAction SilentlyContinue).Source +if (-not $PYTHON) { $PYTHON = (Get-Command python3 -ErrorAction SilentlyContinue).Source } + +$GENERATOR = "NMake Makefiles" +if (Get-Command ninja -ErrorAction SilentlyContinue) { + $GENERATOR = "Ninja" + Write-Host "Using Ninja generator." +} + +$NANOBIND_DIR = & $PYTHON -c "import nanobind, os; print(os.path.dirname(nanobind.__file__) + '/cmake')" + +# 5. Determine compiler - prefer clang-cl on Windows for LLVM builds +$CC = $null +$CXX = $null +if (Get-Command clang-cl -ErrorAction SilentlyContinue) { + $CC = "clang-cl" + $CXX = "clang-cl" + Write-Host "Using clang-cl compiler." +} elseif (Get-Command cl -ErrorAction SilentlyContinue) { + Write-Host "Using MSVC cl compiler." +} else { + Write-Error "No C/C++ compiler found. Install Visual Studio Build Tools or LLVM/Clang." + exit 1 +} + +# 6. Configure CMake +Write-Host "Configuring LLVM..." 
+$cmake_args = @( + "-G", $GENERATOR, + "-S", "$LLVM_SRC_DIR\llvm", + "-B", $LLVM_BUILD_DIR, + "-DLLVM_ENABLE_PROJECTS=mlir;clang", + "-DLLVM_TARGETS_TO_BUILD=X86;NVPTX;AMDGPU", + "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_CXX_STANDARD=17", + "-DCMAKE_CXX_FLAGS=/DMLIR_USE_FALLBACK_TYPE_IDS=1", + "-DCMAKE_C_FLAGS=/DMLIR_USE_FALLBACK_TYPE_IDS=1", + "-DLLVM_ENABLE_ASSERTIONS=ON", + "-DLLVM_INSTALL_UTILS=ON", + "-DMLIR_ENABLE_BINDINGS_PYTHON=ON", + "-DMLIR_ENABLE_ROCM_RUNNER=ON", + "-DROCM_TEST_CHIPSET=$Arch", + "-DMLIR_BINDINGS_PYTHON_NB_DOMAIN=mlir", + "-DPython3_EXECUTABLE=$PYTHON", + "-Dnanobind_DIR=$NANOBIND_DIR", + "-DBUILD_SHARED_LIBS=OFF", + "-DLLVM_BUILD_LLVM_DYLIB=OFF", + "-DLLVM_LINK_LLVM_DYLIB=OFF", + "-DMLIR_INCLUDE_TESTS=OFF" +) + +# Skip compiler-rt on Windows (not needed and may cause issues) +# Don't set RPATH on Windows (not applicable) + +if ($CC) { + $cmake_args += "-DCMAKE_C_COMPILER=$CC" + $cmake_args += "-DCMAKE_CXX_COMPILER=$CXX" +} + +# When using clang-cl, ml64 (MSVC assembler) is typically not available. +# Use llvm-ml with -m64 for 64-bit MASM assembly (BLAKE3 SSE/AVX). +$LLVM_ML = (Get-Command llvm-ml -ErrorAction SilentlyContinue).Source +if (-not (Get-Command ml64 -ErrorAction SilentlyContinue) -and $LLVM_ML) { + $cmake_args += "-DCMAKE_ASM_MASM_COMPILER=$LLVM_ML" + $cmake_args += "-DCMAKE_ASM_MASM_FLAGS=-m64" + Write-Host "Using llvm-ml as MASM assembler (ml64 not found)." +} + +cmake @cmake_args +if ($LASTEXITCODE -ne 0) { throw "CMake configure failed" } + +# 7. Build +if ($Jobs -eq 0) { + $Jobs = [Math]::Max(1, [int]($env:NUMBER_OF_PROCESSORS) / 2) +} +Write-Host "Starting build with $Jobs parallel jobs..." +$buildStart = Get-Date +Write-Host "Build started at: $buildStart" +# Ninja prints [N/total] per step; -v adds full compile commands so progress is always visible. +cmake --build $LLVM_BUILD_DIR -j $Jobs -v +if ($LASTEXITCODE -ne 0) { throw "CMake build failed" } + +# 8. 
Install +if ($LLVM_PACKAGE_INSTALL -eq "1") { + Write-Host "==============================================" + Write-Host "Installing MLIR/LLVM to a clean prefix..." + + if (Test-Path $LLVM_INSTALL_DIR) { + Remove-Item -Recurse -Force $LLVM_INSTALL_DIR + } + New-Item -ItemType Directory -Path $LLVM_INSTALL_DIR -Force | Out-Null + + cmake --install $LLVM_BUILD_DIR --prefix $LLVM_INSTALL_DIR + if ($LASTEXITCODE -ne 0) { throw "CMake install failed" } + + if (-not (Test-Path "$LLVM_INSTALL_DIR\lib\cmake\mlir")) { + Write-Error "Install prefix missing lib\cmake\mlir: $LLVM_INSTALL_DIR" + exit 1 + } +} + +Write-Host "==============================================" +Write-Host "LLVM/MLIR build completed successfully!" +Write-Host "" +Write-Host "To build FlyDSL, set:" +Write-Host " `$env:MLIR_PATH = '$LLVM_INSTALL_DIR'" +Write-Host "Then run:" +Write-Host " .\scripts\build.ps1" +Write-Host "==============================================" diff --git a/scripts/generate_summary.py b/scripts/generate_summary.py index 30c3895d9..5e745b944 100644 --- a/scripts/generate_summary.py +++ b/scripts/generate_summary.py @@ -79,8 +79,10 @@ def test_summary(summary: Path) -> None: install_outcome = os.environ.get("SUMMARY_INSTALL_OUTCOME", "unknown") tests_outcome = os.environ.get("SUMMARY_TESTS_OUTCOME", "unknown") bench_outcome = os.environ.get("SUMMARY_BENCHMARKS_OUTCOME", "unknown") - test_log = os.environ.get("SUMMARY_TEST_LOG", "/tmp/test_output.log") - bench_log = os.environ.get("SUMMARY_BENCH_LOG", "/tmp/bench_output.log") + import tempfile + _tmp = tempfile.gettempdir() + test_log = os.environ.get("SUMMARY_TEST_LOG", os.path.join(_tmp, "test_output.log")) + bench_log = os.environ.get("SUMMARY_BENCH_LOG", os.path.join(_tmp, "bench_output.log")) _out(summary, f"## Test Summary (`{runner}`)") _out(summary) diff --git a/setup.py b/setup.py index 7ff2a0ea3..878c9e4fc 100644 --- a/setup.py +++ b/setup.py @@ -191,18 +191,35 @@ def _assert_embedded_mlir_exists() -> None: try: env = 
dict(os.environ) env.setdefault("FLY_BUILD_DIR", str(BUILD_DIR_REL)) - subprocess.run(["bash", "scripts/build.sh"], cwd=str(REPO_ROOT), check=True, env=env) + if sys.platform == "win32": + build_script = REPO_ROOT / "scripts" / "build.ps1" + if build_script.exists(): + subprocess.run( + ["powershell", "-ExecutionPolicy", "Bypass", "-File", str(build_script)], + cwd=str(REPO_ROOT), check=True, env=env, + ) + else: + raise FileNotFoundError( + "Windows build script not found. " + "Run: python scripts/build.py or use CMake directly." + ) + else: + subprocess.run(["bash", "scripts/build.sh"], cwd=str(REPO_ROOT), check=True, env=env) except Exception as e: raise RuntimeError( - "Failed to build via `scripts/build.sh`.\n" + "Failed to build.\n" f"Original error: {e}\n" ) from e if not EMBEDDED__MLIR.exists(): + build_hint = ( + "Build first: .\\scripts\\build.ps1" if sys.platform == "win32" + else "Build first: bash scripts/build.sh" + ) raise RuntimeError( "Embedded MLIR python runtime not found at " f"{EMBEDDED__MLIR}.\n\n" - "Build first: bash scripts/build.sh\n\n" + f"{build_hint}\n\n" "Controls:\n" " - FLY_REBUILD=auto (default): build iff missing\n" " - FLY_REBUILD=1: always rebuild\n" @@ -216,6 +233,10 @@ def _assert_embedded_mlir_exists() -> None: def _strip_embedded_shared_libs() -> None: """Strip debug symbols from embedded shared libraries to reduce wheel size.""" + if sys.platform == "win32": + # Windows: no strip equivalent needed; MSVC Release builds are already optimized. + return + strip_bin = shutil.which("strip") if not strip_bin: print("Warning: strip not found; skipping binary stripping.") @@ -366,15 +387,37 @@ def _ensure_python_embedded_mlir_package() -> None: # Path exists but is neither a working directory nor a symlink we can manage. raise RuntimeError(f"{dst} exists but is not a usable symlink/directory; please remove it and retry.") # Prefer a relative symlink so the repo remains relocatable. 
+ # On Windows, symlinks require Developer Mode or admin privileges. + # Fall back to a directory junction (no special privileges) or copy. try: dst.symlink_to(target, target_is_directory=True) - except Exception as e: - raise RuntimeError( - f"Failed to create symlink {dst} -> {target}.\n" - "Either create it manually, or install with PYTHONPATH pointing at " - "`build/python_packages/flydsl`.\n" - f"Original error: {e}" - ) from e + except OSError: + if sys.platform == "win32": + # Try a directory junction (works without special privileges on Windows) + abs_target = (dst.parent / target).resolve() + try: + subprocess.run( + ["cmd", "/c", "mklink", "/J", str(dst), str(abs_target)], + check=True, capture_output=True, + ) + except Exception: + # Last resort: copy the directory + try: + shutil.copytree(str(abs_target), str(dst)) + except Exception as e2: + raise RuntimeError( + f"Failed to link or copy {dst} -> {abs_target}.\n" + "Enable Developer Mode in Windows Settings for symlink support,\n" + "or install with PYTHONPATH pointing at " + "`build/python_packages/flydsl`.\n" + f"Original error: {e2}" + ) from e2 + else: + raise RuntimeError( + f"Failed to create symlink {dst} -> {target}.\n" + "Either create it manually, or install with PYTHONPATH pointing at " + "`build/python_packages/flydsl`.\n" + ) if not IS_WHEEL_BUILD: @@ -417,12 +460,18 @@ def _ensure_python_embedded_mlir_package() -> None: # otherwise the wheel will miss required runtime deps and be unusable. 
package_data={ "flydsl._mlir": [ + # Linux shared libraries "_mlir_libs/_*.so", "_mlir_libs/libFlyPythonCAPI.so.*", "_mlir_libs/libnanobind-*.so", "_mlir_libs/libMLIRPythonSupport-*.so", "_mlir_libs/lib*.so", "_mlir_libs/lib*.so.*", + # Windows shared libraries + "_mlir_libs/*.dll", + "_mlir_libs/*.pyd", + "_mlir_libs/*.lib", + # Type stubs "*.pyi", ], }, diff --git a/tests/unit/test_compile_hints.py b/tests/unit/test_compile_hints.py index bc11bb690..e66292261 100644 --- a/tests/unit/test_compile_hints.py +++ b/tests/unit/test_compile_hints.py @@ -75,11 +75,14 @@ def test_int_round_trip(self): assert restored == 2147483647 def test_str_round_trip(self): + import os + import tempfile _fly = self._get_fly() - old = _fly.set_llvm_option_str("module-summary-dot-file", "/tmp/test.dot") + test_path = os.path.join(tempfile.gettempdir(), "test.dot") + old = _fly.set_llvm_option_str("module-summary-dot-file", test_path) assert old == "" restored = _fly.set_llvm_option_str("module-summary-dot-file", old) - assert restored == "/tmp/test.dot" + assert restored == test_path def test_unknown_option_raises(self): _fly = self._get_fly()