Skip to content

Commit 0b10794

Browse files
committed
Add initial Windows support
1 parent b1688aa commit 0b10794

19 files changed

Lines changed: 1107 additions & 96 deletions

File tree

CMakeLists.txt

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ include(TableGen)
2424
include(AddLLVM)
2525
include(AddMLIR)
2626
include(HandleLLVMOptions)
27-
string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
27+
if(NOT WIN32)
28+
string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
29+
endif()
2830
include(MLIRDetectPythonEnv)
2931
mlir_configure_python_dev_packages()
3032

@@ -45,6 +47,22 @@ set(MLIR_PYTHON_PACKAGE_PREFIX "_mlir" CACHE STRING "" FORCE)
4547
set(MLIR_BINDINGS_PYTHON_INSTALL_PREFIX "python_packages/flydsl/${MLIR_PYTHON_PACKAGE_PREFIX}" CACHE STRING "" FORCE)
4648

4749

50+
# On Windows, MLIR's default SelfOwningTypeID scheme does not work across DLL
51+
# boundaries: the TypeID's `static SelfOwningTypeID id` member cannot be auto-imported
52+
# from a DLL without explicit __declspec(dllimport). So FlyDSL's Fly-defined types
53+
# (defined in obj.MLIRFlyDialect embedded in FlyPythonCAPI.dll) are unreachable from
54+
# .pyd extensions that reference them, causing link errors or runtime TypeID mismatches.
55+
#
56+
# Switch our compilation to string-based FallbackTypeIDs globally. Our own types all
57+
# route through the process-shared `registerImplicitTypeID` exported from
58+
# FlyPythonCAPI.dll, giving consistent TypeIDs across every DLL. Upstream MLIR's
59+
# prebuilt static libs (MLIRIR.lib, etc.) were compiled with SelfOwning TypeIDs,
60+
# but those are now linked into FlyPythonCAPI.dll as a single copy — internal usage
61+
# within that DLL still agrees with itself, so no mismatch there.
62+
if(WIN32)
63+
add_compile_definitions(MLIR_USE_FALLBACK_TYPE_IDS=1)
64+
endif()
65+
4866
add_subdirectory(include/flydsl)
4967
add_subdirectory(lib)
5068
add_subdirectory(tools)

docs/windows_build_guide.md

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# FlyDSL on Windows (experimental)
2+
3+
This guide covers building and running FlyDSL natively on Windows. The Windows
4+
port uses the **TheRock ROCm SDK** (installed as a Python package into a venv)
5+
rather than a system-wide ROCm install.
6+
7+
> **Status**: experimental. 301 / 310 unit tests pass (97%). See
8+
> [Known limitations](#known-limitations) for what doesn't work yet.
9+
10+
## Prerequisites
11+
12+
1. **Windows 10/11, x64** with an AMD GPU supported by ROCm. Tested on RDNA4 (`gfx1200`, Radeon RX 9060 XT).
13+
2. **Visual Studio 2022 Build Tools** (or full VS 2022) with the *Desktop
14+
development with C++* workload — provides the MSVC toolchain.
15+
3. **LLVM/Clang** in PATH — `clang-cl`, `llvm-ar`, `llvm-ml`, `lld-link`. The
16+
official LLVM Windows installer puts these on PATH.
17+
4. **Ninja** in PATH (`choco install ninja` or bundled with VS).
18+
5. **Git** in PATH.
19+
6. **Python 3.12** (the build has only been exercised against 3.12).
20+
7. **PowerShell 5.1+** (ships with Windows).
21+
22+
All commands below assume you're running in a **Developer PowerShell / x64
23+
Native Tools Command Prompt**, or have run `vcvarsall.bat amd64` first so that
24+
`cl.exe`/`link.exe` are on PATH.
25+
26+
## 1. Create a venv and install TheRock ROCm SDK
27+
28+
TheRock ships ROCm as a Python wheel that installs into your venv. FlyDSL on
29+
Windows currently expects this layout (it finds `ld.lld.exe`, OCML bitcode,
30+
and `amdhip64_7.dll` relative to the SDK root).
31+
32+
```powershell
33+
python -m venv C:\path\to\flydsl-venv
34+
C:\path\to\flydsl-venv\Scripts\Activate.ps1
35+
36+
# Install TheRock ROCm SDK for your GPU family and initialize. For RDNA4 / gfx1200:
37+
pip install --pre --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all torch torchaudio torchvision "rocm[libraries,devel]"
38+
rocm-sdk init
39+
```
40+
41+
After install, locate the SDK development root — typically:
42+
`...\flydsl-venv\Lib\site-packages\_rocm_sdk_devel`
43+
44+
## 2. Set environment variables
45+
46+
```powershell
47+
$env:ROCM_PATH = "C:\path\to\flydsl-venv\Lib\site-packages\_rocm_sdk_devel"
48+
# Optional: force a specific GPU arch when torch-rocm auto-detect isn't right.
49+
# $env:FLYDSL_GPU_ARCH = "gfx1200"
50+
```
51+
52+
## 3. Build LLVM/MLIR
53+
54+
This pins the commit from `thirdparty/llvm-hash.txt`, builds with Ninja +
55+
`clang-cl`, and installs into `..\llvm-project\mlir_install\`.
56+
57+
```powershell
58+
# From the FlyDSL repo root:
59+
.\scripts\build_llvm.ps1 -Arch gfx1200 # substitute your GPU arch
60+
# or rely on FLYDSL_GPU_ARCH env var; default is gfx942.
61+
```
62+
63+
Notes:
64+
- The script passes `/DMLIR_USE_FALLBACK_TYPE_IDS=1` globally — required to
65+
make MLIR's TypeIDs work across multiple DLLs on Windows.
66+
- Build takes 30–60 min on a typical workstation. Use `-Jobs N` to cap
67+
parallelism (default is `NUMBER_OF_PROCESSORS / 2`).
68+
- Output: `..\llvm-project\mlir_install\` — keep this around.
69+
70+
## 4. Build FlyDSL
71+
72+
```powershell
73+
$env:MLIR_PATH = "C:\llvm-project\mlir_install" # or wherever step 3 installed
74+
.\scripts\build.ps1
75+
```
76+
77+
This produces `build-fly\python_packages\flydsl\` with the compiled extension
78+
modules, runtime DLLs, and python sources.
79+
80+
## 5. Install FlyDSL into the venv
81+
82+
```powershell
83+
pip install -e .
84+
```
85+
86+
This runs `setup.py` which creates a directory junction from
87+
`python\flydsl\_mlir` → `build-fly\python_packages\flydsl\_mlir` (junction
88+
instead of symlink so no admin / Developer Mode required).
89+
90+
## 6. Run tests
91+
92+
```powershell
93+
$env:PYTHONPATH = "$PWD\build-fly\python_packages;$PWD"
94+
python -m pytest tests\unit\ -q
95+
```
96+
97+
Expected: ~301 passed / ~4 failed / 5 skipped. See
98+
[Known limitations](#known-limitations).
99+
100+
## How the Windows-specific bits work
101+
102+
| Component | Linux behavior | Windows behavior |
103+
|---|---|---|
104+
| TypeID | `SelfOwningTypeID` (pointer identity across `.so`) | `MLIR_USE_FALLBACK_TYPE_IDS=1` — string-based, works across DLLs |
105+
| Symbol export | `-fvisibility=hidden` + version script | `WINDOWS_EXPORT_ALL_SYMBOLS` on `FlyPythonCAPI.dll`, with `obj.MLIRFlyDialect` / `obj.MLIRFlyROCDLDialect` added as direct sources and upstream `MLIRIR.lib` / `MLIRSupport.lib` extracted via `llvm-ar x` so auto-`.def` generation sees them |
106+
| lld for ROCDL | `<toolkit>/llvm/bin/ld.lld` — matches `/opt/rocm` layout | Staging junction at `%LOCALAPPDATA%\flydsl\rocm_toolkit\` unifies TheRock's `lib/llvm/bin/ld.lld.exe` + `lib/llvm/amdgcn/bitcode/` into the layout MLIR expects |
107+
| Runtime DLL search | `RPATH=$ORIGIN` | `os.add_dll_directory` + ctypes pre-load of `_mlir_libs\*.dll` before JIT engine init (LLVM's `LoadLibraryPermanently` doesn't search DLL-local dirs) |
108+
| GPU arch detect | `rocm_agent_enumerator` | Falls back to `torch.cuda.get_device_properties(0).gcnArchName` since TheRock doesn't ship the enumerator |
109+
| `_mlir` package link | Symlink | Directory junction (no admin needed) |
110+
111+
## Known limitations
112+
113+
- **Multi-stream correctness**: 2 `test_multi_stream_launch` tests fail
114+
(`test_two_streams_independent`, `test_diamond_pipeline_with_event_sync`).
115+
Single-stream launches work correctly.
116+
- **Disk cache test fragility**: `test_fp_math_reaches_pipeline` passes solo
117+
but fails in-suite because cached compilation artifacts bypass the monkey-
118+
patched hook. Not Windows-specific. To work around it, run the suite with `FLYDSL_RUNTIME_ENABLE_CACHE=0`.
119+
- **Torch profiler test**: `test_cache_disabled_run_perftest_does_not_crash`
120+
hits a `DataFrame.host_time_sum` attribute error inside torch.profiler —
121+
version compat, not a FlyDSL issue.
122+
- **No CI coverage** yet — every build is verified manually.
123+
- **Only gfx1200 exercised** on Windows; other arches should work
124+
if your TheRock SDK + GPU combination is supported.
125+
126+
## Troubleshooting
127+
128+
- **`MLIR_FOUND=FALSE` or `LLVMNVPTXCodeGen` missing at configure time**:
129+
the ROCm SDK ships its own `LLVMConfig.cmake`, which CMake may pick up instead of the LLVM/MLIR install from step 3. Ensure `-DLLVM_DIR=...` is
130+
passed explicitly (the scripts do this). Don't add `_rocm_sdk_devel` to
131+
`CMAKE_PREFIX_PATH`.
132+
- **`_ITERATOR_DEBUG_LEVEL` link mismatch**: you're building FlyDSL as Debug
133+
while MLIR was built Release. Keep both at Release (`build.ps1` passes
134+
`-DCMAKE_BUILD_TYPE=Release`).
135+
- **`ModuleNotFoundError: flydsl._mlir`**: the editable install junction
136+
didn't get created. `cd` into the repo and run:
137+
`New-Item -ItemType Junction -Path python\flydsl\_mlir -Target build-fly\python_packages\flydsl\_mlir`.
138+
- **`hipErrorNoBinaryForGpu`**: your kernel was compiled for the wrong arch.
139+
Set `FLYDSL_GPU_ARCH` to the value printed by
140+
`python -c "import torch; print(torch.cuda.get_device_properties(0).gcnArchName)"`.
141+
- **`rocm amdgcn bitcode path ... does not exist`** during compilation:
142+
the toolkit staging junction failed. Check
143+
`%LOCALAPPDATA%\flydsl\rocm_toolkit\amdgcn\bitcode\` and that `ROCM_PATH`
144+
points at TheRock's `_rocm_sdk_devel`.

kernels/custom_all_reduce.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,10 @@ def _get_gpu_arch(cls) -> str:
130130
pass
131131
if not arch:
132132
try:
133+
import shutil
133134
import subprocess
134-
r = subprocess.run(["rocminfo"], capture_output=True, text=True, timeout=10)
135+
rocminfo = shutil.which("rocminfo") or "rocminfo"
136+
r = subprocess.run([rocminfo], capture_output=True, text=True, timeout=10)
135137
for line in r.stdout.splitlines():
136138
if "Name:" in line and "gfx" in line.lower():
137139
arch = line.split(":")[-1].strip()
@@ -147,7 +149,13 @@ def _load_hip(cls):
147149
if cls._hip is not None:
148150
return cls._hip
149151
import ctypes
150-
for name in ("libamdhip64.so", "libamdhip64.so.6", "libamdhip64.so.5"):
152+
import sys
153+
154+
if sys.platform == "win32":
155+
hip_candidates = ("amdhip64.dll", "amdhip64_7.dll", "amdhip64_6.dll")
156+
else:
157+
hip_candidates = ("libamdhip64.so", "libamdhip64.so.6", "libamdhip64.so.5")
158+
for name in hip_candidates:
151159
try:
152160
cls._hip = ctypes.CDLL(name)
153161
break

lib/Bindings/Python/BindingUtils.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
#ifndef FLYDSL_BINDINGS_PYTHON_BINDINGUTILS_H
55
#define FLYDSL_BINDINGS_PYTHON_BINDINGUTILS_H
66

7+
// Nanobind.h must come before Interop.h: nanobind includes Python.h which
8+
// defines PyObject — Interop.h uses PyObject but doesn't include Python.h.
9+
#include "mlir/Bindings/Python/Nanobind.h"
710
#include "mlir-c/Bindings/Python/Interop.h"
811
#include "mlir/Bindings/Python/IRCore.h"
9-
#include "mlir/Bindings/Python/Nanobind.h"
1012
#include "mlir/Bindings/Python/NanobindAdaptors.h"
1113
#include "mlir/CAPI/IR.h"
1214
#include "mlir/CAPI/Support.h"
Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,21 @@
1-
add_mlir_public_c_api_library(MLIRCPIFly
2-
FlyDialect.cpp
3-
LINK_LIBS PUBLIC
4-
MLIRFlyDialect
5-
)
1+
# On Windows, MLIRFlyDialect's OBJECT files are added directly to FlyPythonCAPI
2+
# (see python/mlir_flydsl/CMakeLists.txt) so that WINDOWS_EXPORT_ALL_SYMBOLS exports
3+
# the mlir::fly::* C++ symbols. Linking MLIRFlyDialect here transitively would cause
4+
# duplicate-symbol errors at FlyPythonCAPI.dll link time.
5+
if(WIN32)
6+
add_mlir_public_c_api_library(MLIRCPIFly
7+
FlyDialect.cpp
8+
)
9+
# Still need to compile against the dialect's headers (TableGen'd types).
10+
add_dependencies(obj.MLIRCPIFly MLIRFlyIncGen)
11+
target_include_directories(obj.MLIRCPIFly PRIVATE
12+
${CMAKE_BINARY_DIR}/include
13+
${CMAKE_SOURCE_DIR}/include
14+
)
15+
else()
16+
add_mlir_public_c_api_library(MLIRCPIFly
17+
FlyDialect.cpp
18+
LINK_LIBS PUBLIC
19+
MLIRFlyDialect
20+
)
21+
endif()
Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
1-
add_mlir_public_c_api_library(MLIRCPIFlyROCDL
2-
FlyROCDLDialect.cpp
3-
LINK_LIBS PUBLIC
4-
MLIRFlyROCDLDialect
5-
MLIRFlyToROCDL
6-
)
1+
# On Windows, MLIRFlyROCDLDialect's OBJECT files are added directly to FlyPythonCAPI
2+
# so WINDOWS_EXPORT_ALL_SYMBOLS exports the C++ symbols. Avoid duplicate linkage here.
3+
# MLIRFlyToROCDL is still linked as a static lib — its symbols are only used through
4+
# the CAPI registerFlyToROCDLConversionPass wrapper, which IS in an embedded obj.
5+
if(WIN32)
6+
add_mlir_public_c_api_library(MLIRCPIFlyROCDL
7+
FlyROCDLDialect.cpp
8+
LINK_LIBS PUBLIC
9+
MLIRFlyToROCDL
10+
)
11+
add_dependencies(obj.MLIRCPIFlyROCDL MLIRFlyROCDLIncGen)
12+
target_include_directories(obj.MLIRCPIFlyROCDL PRIVATE
13+
${CMAKE_BINARY_DIR}/include
14+
${CMAKE_SOURCE_DIR}/include
15+
)
16+
else()
17+
add_mlir_public_c_api_library(MLIRCPIFlyROCDL
18+
FlyROCDLDialect.cpp
19+
LINK_LIBS PUBLIC
20+
MLIRFlyROCDLDialect
21+
MLIRFlyToROCDL
22+
)
23+
endif()

0 commit comments

Comments
 (0)