Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
833100d
feat(ascend): add Ascend framework layer — runtime, type mapping, bui…
Apr 8, 2026
e4b7e49
style(ascend): apply `clang-format` to framework headers
Apr 8, 2026
6d72245
fix(ascend): adapt `Memcpy`/`Memset` arity, assert workspace alloc, r…
Apr 8, 2026
a0ab3d0
feat(ascend): add GEMM kernel, NPU test infra, and example integration
Apr 8, 2026
9bd3db8
fix(ascend): move `aclrtMalloc` out of `assert()` in `WorkspacePool`
Apr 8, 2026
6b782a2
fix(nvidia): restore `CUDA::cublasLt` link dependency
Apr 8, 2026
0fc990f
feat(test): add `--devices` option to pytest for platform-name filtering
Apr 8, 2026
9cfac6d
fix(nvidia): add missing include and work around NVCC `std::forward` bug
Apr 10, 2026
0f08022
fix(ci): upgrade NVIDIA CI image to 25.12 and restore `std::forward`
zhangyue207 Apr 10, 2026
4c6adba
fix: add explicit narrowing casts in `RotaryEmbedding` initializer list
Apr 10, 2026
91689d5
style: fix lint issues from PR review
Apr 10, 2026
7628b2f
style: fix lint issues in `feat/ascend-framework`
Apr 10, 2026
537fc6d
feat(ascend): add 9 Ascend operator kernels
Apr 8, 2026
6341457
test(ascend): add NPU stream injection and new operator tests
Apr 8, 2026
e1fa963
docs: add Ascend FlashAttention design spec
Apr 8, 2026
aa4703d
Revert "docs: add Ascend FlashAttention design spec"
Apr 8, 2026
ffe99fe
feat(ascend): optimize all operator dispatch (P0-P4) and add Cast/Cat…
Apr 9, 2026
c85dcc6
fix(ascend): stabilize `WorkspacePool` pointers and remove dead code
Apr 9, 2026
8b458ed
fix(cat): support negative dim and document TensorList caching assump…
Apr 9, 2026
c1ee4b6
feat(dsl): add cross-platform DSL framework with `@manual_op` codegen
Apr 10, 2026
fa5bb45
feat(dsl): add C++ template bricks for binary elementwise and reduce-…
Apr 10, 2026
1da2e1c
feat(dsl): add `@infini_op` compiler with DAG parser, pattern matcher…
Apr 10, 2026
1e9f167
feat(dsl): add implementation_index system, DSL-generate Mul/Swiglu, …
zhangyue207 Apr 11, 2026
067c85a
feat(dsl): add CUDA unary elementwise brick template
zhangyue207 Apr 11, 2026
293bc4e
feat(dsl): add CPU unary elementwise brick template
zhangyue207 Apr 11, 2026
aaaa641
feat(dsl): add unary elementwise codegen for `@infini_op`
zhangyue207 Apr 11, 2026
7cb62bd
test(dsl): add performance benchmark comparing DSL vs hand-written ke…
zhangyue207 Apr 11, 2026
e437333
feat(dsl): add unary elementwise brick and migrate Cast to @infini_op
zhangyue207 Apr 11, 2026
57fde3a
refactor(dsl): extract binding generation into dsl/compiler/bindings.py
zhangyue207 Apr 11, 2026
c6fcc17
feat(dsl): integrate binding generation into `python -m dsl`
zhangyue207 Apr 11, 2026
02904b6
build: replace generate_wrappers.py with python -m dsl in CMake
zhangyue207 Apr 11, 2026
4d71ce2
feat(nvidia): add Matmul operator with cuBLASLt + cuBLAS fallback
zhangyue207 Apr 11, 2026
f446b8f
feat(nvidia): add Linear operator with CUDA implementation
zhangyue207 Apr 11, 2026
c73e03a
feat(nvidia): add Cat (concatenation) operator with CUDA kernel
zhangyue207 Apr 11, 2026
70012e5
Merge branch 'worktree-agent-aa86cbd7' into feat/cross-platform-dsl
zhangyue207 Apr 11, 2026
bd911d2
Merge branch 'worktree-agent-a077bda1' into feat/cross-platform-dsl
zhangyue207 Apr 11, 2026
e3705c1
Merge branch 'worktree-agent-a2f03dad' into feat/cross-platform-dsl
zhangyue207 Apr 11, 2026
9c4e010
feat(nvidia): add Cat, Linear, and Matmul CUDA kernels
zhangyue207 Apr 11, 2026
5412680
feat(nvidia): add fused AddRmsNorm CUDA kernel
zhangyue207 Apr 11, 2026
cf18c08
feat(nvidia): add ReshapeAndCache CUDA kernel for KV cache
zhangyue207 Apr 11, 2026
e7e49d9
Merge branch 'worktree-agent-ab5f4b23' into feat/cross-platform-dsl
zhangyue207 Apr 11, 2026
79df991
Merge branch 'worktree-agent-a09ccf36' into feat/cross-platform-dsl
zhangyue207 Apr 11, 2026
ea2346f
feat(nvidia): add AddRmsNorm, ReshapeAndCache, and RotaryEmbedding CU…
zhangyue207 Apr 11, 2026
893f82e
fix: address code review findings for Batch 1+2 operators
zhangyue207 Apr 11, 2026
87fbf77
feat(nvidia): add FlashAttention via FlashInfer header-only integration
zhangyue207 Apr 11, 2026
2c70d33
fix(build): auto-detect CUDA architecture from GPU hardware
zhangyue207 Apr 11, 2026
0526f71
test: add comprehensive CUDA operator benchmark and baseline report
zhangyue207 Apr 11, 2026
9986795
fix(docs): correct CausalSoftmax optimization suggestion in benchmark…
zhangyue207 Apr 11, 2026
c91a0a9
perf(nvidia): upgrade Linear to cuBLASLt for 13% speedup
zhangyue207 Apr 11, 2026
5d3edf4
docs(test): annotate Gemm cuBLASLt performance advantage and precisio…
zhangyue207 Apr 11, 2026
be129fc
refactor(dsl): scan generated/ for operator specializations in bindin…
zhangyue207 Apr 12, 2026
7f882f8
refactor: separate hand-written and generated code
zhangyue207 Apr 12, 2026
b37dc07
refactor(dsl): auto-generate BLAS wrapper for Gemm cuBLAS
zhangyue207 Apr 12, 2026
3a2e6e2
feat(nvidia): add FlashAttention single decode path via FlashInfer
zhangyue207 Apr 12, 2026
aabf242
feat(nvidia): add batch prefill and paged decode for FlashAttention
zhangyue207 Apr 12, 2026
32e83b8
perf(nvidia): replace per-sequence loops with FlashInfer native batch…
zhangyue207 Apr 12, 2026
0e092dc
perf(nvidia): replace per-call cudaMalloc with pre-allocated workspac…
zhangyue207 Apr 12, 2026
8d0efdf
perf(cuda): add vectorized binary elementwise kernel for contiguous t…
zhangyue207 Apr 12, 2026
6f116c2
perf(nvidia): refactor CudaAdd and CudaSwiglu to use vectorized brick
zhangyue207 Apr 12, 2026
e871da1
perf(cuda): add grid-stride loop to unary elementwise kernel for cont…
zhangyue207 Apr 12, 2026
873a2a3
docs: add 5-round optimization log with performance data
zhangyue207 Apr 12, 2026
cd8ae08
perf(cuda): vectorized load for unary elementwise kernel
zhangyue207 Apr 12, 2026
666c436
docs: update optimization log with 5 rounds of profiling and analysis
zhangyue207 Apr 12, 2026
743eb3d
perf(cuda): single-pass RmsNorm with shared memory caching
zhangyue207 Apr 12, 2026
2e0fccb
perf(cuda): single-pass AddRmsNorm with shared memory caching
zhangyue207 Apr 12, 2026
ffab633
revert: restore smem-cache RmsNorm without vectorized global load
zhangyue207 Apr 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# FlashInfer submodule: header-only FlashAttention kernels integrated by the
# NVIDIA backend (see the "FlashAttention via FlashInfer" work in this PR).
[submodule "third_party/flashinfer"]
path = third_party/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
176 changes: 176 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Build

InfiniOps uses CMake + scikit-build-core. The library is compiled into a shared `libinfiniops` and an optional Python extension `ops`.

### C++ only

```bash
mkdir build && cd build
cmake .. -DWITH_CPU=ON # or -DWITH_NVIDIA=ON, -DWITH_METAX=ON, etc.
make -j$(nproc)
```

### Python package (pip / editable install)

```bash
pip install .[dev] # installs infiniops + dev tools
# or for an editable build:
pip install -e .[dev]
```

`pyproject.toml` sets `AUTO_DETECT_DEVICES=ON` and `GENERATE_PYTHON_BINDINGS=ON` automatically during `pip install`.

### Backend CMake flags

| Flag | Backend |
|------|---------|
| `-DWITH_CPU=ON` | CPU (OpenMP) |
| `-DWITH_NVIDIA=ON` | NVIDIA CUDA (requires CUDAToolkit) |
| `-DWITH_ILUVATAR=ON` | Iluvatar (clang++ with `-x ivcore`) |
| `-DWITH_METAX=ON` | MetaX (requires `$MACA_PATH`) |
| `-DWITH_CAMBRICON=ON` | Cambricon (requires `$NEUWARE_HOME`) |
| `-DWITH_MOORE=ON` | Moore |
| `-DWITH_ASCEND=ON` | Ascend NPU (requires `$ASCEND_HOME_PATH` or the default toolkit path) |

`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive — build one GPU backend at a time.

## Testing

```bash
pytest tests/ # run all tests
pytest tests/test_add.py # run one test file
pytest tests/test_add.py::test_add # run a single test
pytest tests/ --benchmark # run with performance benchmarks
pytest tests/ -v --tb=short # verbose output
```

Tests auto-parametrize on `dtype` (float32/float16/bfloat16) and `device` (cpu, and cuda/mlu if available). Tests import `infini.ops`, so the package must be installed (or built and on `PYTHONPATH`).

## Linting

```bash
ruff check .
ruff format .
```

## Code Style

Follow PEP 8 as the primary style guide. For areas PEP 8 does not cover in detail, refer to the GDScript style guide for non-syntax conventions. Always run `ruff format && ruff check` before committing.

### Comments

- Comments must be complete English sentences: capitalize the first word, end with punctuation.
- Use Markdown backtick syntax for code references within comments (e.g. `` `variable_name` ``).
- Error messages and framework-conventional strings (e.g. `pytest.skip` reasons) follow their own conventions — typically lowercase, no trailing period.

### Docstrings

- Follow PEP 257. One-line docstrings stay on a single line. Multi-line docstrings have a summary line, a blank line, then the description.

### Blank lines

- No blank line between a function signature and its body when there is no docstring or comment.
- Add a blank line before and after `if`, `for`, `while`, and similar compound statements.
- Add a blank line before a `return` statement unless it is directly inside an `if`/`for`/`while` block body.

## CI

The `.ci/` directory implements a multi-platform, resource-aware CI system with Docker-based execution, GitHub integration, and cross-machine job dispatch.

### Configuration

`config.yaml` uses a **platform-centric** structure that normalizes to flat `{platform}_{job}` names at load time (e.g. `nvidia_gpu`). Each platform defines its Docker image, setup commands, volumes, env vars, and jobs. Jobs inherit platform-level defaults.

Supported platforms: **nvidia**, **iluvatar**, **ascend** (ascend not ready yet).

### Building images

```bash
python .ci/build.py --platform nvidia # build one platform
python .ci/build.py --platform all # build all platforms
python .ci/build.py --platform nvidia --force # skip Dockerfile change detection
python .ci/build.py --push --dry-run # push to registry (preview)
```

Dockerfiles live in `.ci/images/{platform}/Dockerfile`. Proxy variables from the host are forwarded automatically.

### Running the pipeline locally

```bash
python .ci/run.py # auto-detect platform, run all jobs
python .ci/run.py --job gpu --stage test # run specific job/stage
python .ci/run.py --job gpu --gpu-id 0,2 # override GPU allocation
python .ci/run.py --image-tag stable # use a specific image tag
python .ci/run.py --dry-run # preview docker commands
```

Platform is auto-detected by checking for `nvidia-smi` or `ixsmi` on PATH.

### Agent (scheduler + webhook server)

`agent.py` provides a resource-aware scheduler with GitHub webhook support and REST API:

```bash
# Start the agent (webhook server + scheduler)
python .ci/agent.py serve --port 8080 --webhook-secret <secret>

# Dispatch jobs to remote agents via HTTP
python .ci/agent.py run --branch feat/xxx --platform nvidia
python .ci/agent.py run --job nvidia_gpu --dry-run
```

**Key capabilities:**

- **Resource-aware scheduling** — dynamically allocates GPUs based on utilization threshold; queues jobs when resources are busy.
- **GitHub webhooks** — triggers jobs on push/PR events (`/webhook` endpoint, HMAC-SHA256 verified).
- **REST API** — `/api/run` (trigger jobs, Bearer token auth), `/api/job/{id}` (query status), `/status` (queue + resources), `/health`.
- **GitHub commit status** — reports pending/success/failure per job via `github_status.py`.
- **Cross-machine dispatch** — sends jobs to remote platform agents and polls for results.

### Module overview

| File | Purpose |
|------|---------|
| `config.yaml` | Platform-centric CI configuration |
| `build.py` | Docker image builder with change detection |
| `run.py` | Standalone Docker CI runner (clone, setup, stages) |
| `agent.py` | Scheduler, webhook server, remote dispatch CLI |
| `utils.py` | Config normalization (`normalize_config`), git helpers |
| `ci_resource.py` | GPU/memory detection and thread-safe allocation (`ResourcePool`) |
| `github_status.py` | GitHub Commit Status API wrapper (zero external deps) |

### Tests

```bash
pytest .ci/tests/ # run all CI tests
pytest .ci/tests/test_agent.py # test scheduler and webhooks
```

## Architecture

### C++ layer (`src/`)

- **`src/base/<op>.h`** — Abstract base class for each operator (e.g. `Add`, `Gemm`, `RmsNorm`). Declares the constructor (capturing tensor metadata) and a pure-virtual `operator()`.
- **`src/<backend>/<op>.*`** — Backend-specific specializations: `src/cpu/`, `src/cuda/`, `src/nvidia/`, `src/metax/`, `src/cambricon/`, `src/iluvatar/`. Each provides `template<> class Operator<Add, Device::Type::kNvidia>`.
- **`src/operator.h`** — `Operator<Key, Device>` template that dispatches to the correct device specialization at `make()` time via `DispatchFunc`. Also caches constructed operator descriptors keyed on tensor shape/dtype/strides.
- **`src/tensor.h` / `src/device.h` / `src/data_type.h`** — Core data model: `Tensor` (pointer + shape + strides + dtype + device), `Device`, `DataType`.
- **`src/dispatcher.h`** — `DispatchFunc` selects the right device at runtime based on `Device::Type` and the compile-time `ActiveDevices` set.

### Python bindings

Python bindings are **auto-generated** by `scripts/generate_wrappers.py` using libclang to parse `src/base/<op>.h`. The generated output lands in `generated/bindings/ops.cc` and `generated/include/`. Bindings expose each operator both as a callable class (stateful, with constructor) and as a free function (`infini.ops.add(input, other, out)`).

### Test framework (`tests/`)

- `conftest.py` implements the `@pytest.mark.auto_act_and_assert` marker: the test function returns a `Payload(func, ref, args, kwargs, rtol, atol)` and the framework calls both, clones tensors for the reference, and asserts `torch.allclose`.
- `device` and `dtype` fixtures are auto-parametrized in `conftest.py`; individual tests can override with explicit `@pytest.mark.parametrize`.
- `tests/utils.py` provides `randn_strided`, `randint_strided`, `empty_strided`, `clone_strided` to create tensors with arbitrary strides.

### Adding a new operator

1. Create `src/base/<op>.h` with an abstract class inheriting `Operator<OpName>`.
2. Implement backend specializations in `src/<backend>/`.
3. Re-run `scripts/generate_wrappers.py` (or rebuild with `GENERATE_PYTHON_BINDINGS=ON`) to regenerate Python bindings.
4. Add a `tests/test_<op>.py` using the `Payload` / `auto_act_and_assert` pattern.
52 changes: 49 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF)
# Hardware backend toggles. The GPU-like backends (NVIDIA, Iluvatar, MetaX,
# Moore, Ascend) are mutually exclusive; a configure-time check below rejects
# enabling more than one.
option(WITH_METAX "Enable MetaX backend" OFF)
option(WITH_CAMBRICON "Enable Cambricon backend" OFF)
option(WITH_MOORE "Enable Moore backend" OFF)
option(WITH_ASCEND "Enable Ascend backend" OFF)

# `AUTO_DETECT_DEVICES` probes the host (device nodes such as `/dev/davinci0`,
# vendor env vars such as `ASCEND_HOME_PATH`) and switches matching backends
# on. `GENERATE_PYTHON_BINDINGS` enables generation of the Python bindings.
option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
Expand All @@ -28,6 +29,31 @@ if(AUTO_DETECT_DEVICES)
if(NVIDIA_DEV_FILES)
    set(WITH_NVIDIA ON)
    message(STATUS "Auto-detected NVIDIA environment.")

    # Query the installed GPU for its compute capability so we compile for
    # the correct SM architecture. Leaving this to CMake's default can pick
    # an older arch (e.g. SM75) whose binaries lack newer features (bf16
    # needs SM80+) and then fail at runtime. Respect any value the user
    # already supplied.
    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        execute_process(
            COMMAND nvidia-smi --query-gpu=compute_cap --format=csv,noheader
            OUTPUT_VARIABLE _detected_caps
            OUTPUT_STRIP_TRAILING_WHITESPACE
            ERROR_QUIET
        )

        if(_detected_caps)
            # Only the first listed GPU is used; "8.0" is folded to "80".
            string(REGEX MATCH "([0-9]+)\\.([0-9]+)" _first_cap "${_detected_caps}")
            string(REPLACE "." "" _sm_arch "${_first_cap}")

            if(_sm_arch)
                set(CMAKE_CUDA_ARCHITECTURES "${_sm_arch}" CACHE STRING
                    "CUDA architectures (auto-detected from GPU)")
                message(STATUS "Auto-detected CUDA architecture: SM${_sm_arch}")
            endif()
        endif()
    endif()
endif()

file(GLOB ILUVATAR_DEV_FILES "/dev/iluvatar*")
Expand Down Expand Up @@ -71,20 +97,25 @@ if(AUTO_DETECT_DEVICES)
set(WITH_MOORE OFF)
set(WITH_MOORE OFF CACHE BOOL "Enable Moore backend" FORCE)
endif()

if(DEFINED ENV{ASCEND_HOME_PATH} OR EXISTS "/dev/davinci0")
set(WITH_ASCEND ON)
message(STATUS "Auto-detected Ascend environment.")
endif()
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

# Exactly one GPU-like backend may be enabled; collect the enabled ones so a
# misconfiguration fails fast at configure time.
set(_enabled_gpu_backends "")
foreach(_gpu_backend IN ITEMS WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE WITH_ASCEND)
    if(${_gpu_backend})
        list(APPEND _enabled_gpu_backends "${_gpu_backend}")
    endif()
endforeach()
list(LENGTH _enabled_gpu_backends _gpu_backend_count)

if(_gpu_backend_count GREATER 1)
    message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.")
endif()

if(WITH_NVIDIA)
Expand Down Expand Up @@ -178,8 +209,23 @@ if(WITH_CAMBRICON)
find_library(CAMBRICON_PAPI_LIB NAMES cnpapi HINTS "${NEUWARE_HOME}/lib64" REQUIRED)
endif()

if(WITH_ASCEND)
    add_compile_definitions(WITH_ASCEND=1)

    # Resolve the Ascend toolkit root. Precedence: an explicit `ASCEND_HOME`
    # cache value wins, then the `ASCEND_HOME_PATH` environment variable,
    # then the standard installation path.
    if(NOT DEFINED ASCEND_HOME)
        if(DEFINED ENV{ASCEND_HOME_PATH} AND NOT "$ENV{ASCEND_HOME_PATH}" STREQUAL "")
            set(ASCEND_HOME "$ENV{ASCEND_HOME_PATH}" CACHE PATH "Ascend toolkit root")
        else()
            set(ASCEND_HOME "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "Ascend toolkit root")
        endif()
    endif()

    # Fail early with actionable guidance: both the cache variable and the
    # environment variable are valid ways to point at the toolkit, so name
    # both (the original message only mentioned the env var).
    if(NOT EXISTS "${ASCEND_HOME}")
        message(FATAL_ERROR "`WITH_ASCEND` is ON but `${ASCEND_HOME}` was not found. Set the `ASCEND_HOME` CMake variable or the `ASCEND_HOME_PATH` environment variable to the Ascend toolkit root.")
    endif()

    message(STATUS "Using Ascend from `${ASCEND_HOME}`.")
endif()

# Fall back to the CPU backend when no accelerator backend was enabled.
# NOTE(review): an explicit `-DWITH_CPU=ON` combined with an accelerator
# backend never reaches this branch — presumably `WITH_CPU` is handled
# elsewhere in this file; verify.
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND)
    add_compile_definitions(WITH_CPU=1)
endif()

Expand Down
Loading