Skip to content

Commit 10f35f3

Browse files
authored
Merge branch 'main' into feat/dflash-vlm-sglang
2 parents c0bb555 + a8ae45f commit 10f35f3

7 files changed

Lines changed: 70 additions & 49 deletions

File tree

.github/workflows/publish_pypi.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
with:
2222
python-version: '3.11'
2323

24-
- run: python setup.py sdist build
24+
- run: pip install build && python -m build --sdist
2525

2626
# publish to PyPI if executed on the main branch
2727
- name: Publish package to PyPI
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# delete_gpu_process.sh — free idle GPUs on a shared CI runner.
#
# For every GPU reporting 0% utilization, kill any stale compute processes
# left behind by a previous job. GPUs with nonzero utilization are skipped.
#
# Requirements:
#   - nvidia-smi on PATH
#   - docker, with permission to run --privileged --pid=host (this script is
#     expected to run inside a CI container whose PID namespace does NOT
#     contain the stale host processes, so a plain `kill` cannot reach them).
set -uo pipefail

echo "=== Checking GPU Utilization ==="

# One line per GPU: "<index>, <utilization>"
nvidia-smi --query-gpu=index,utilization.gpu --format=csv,noheader,nounits |
while IFS=',' read -r gpu_index utilization; do
  # Strip the padding spaces nvidia-smi puts around CSV fields.
  gpu_index=${gpu_index// /}
  utilization=${utilization// /}

  # Guard: utilization can be non-numeric (e.g. "[N/A]") on some driver
  # states; the original `[ ... -eq 0 ]` would error out there. Skip instead.
  if ! [[ "$utilization" =~ ^[0-9]+$ ]]; then
    echo "GPU $gpu_index reported non-numeric utilization '$utilization' — skipping."
    continue
  fi

  if (( utilization == 0 )); then
    echo "GPU $gpu_index has 0% utilization — checking for processes..."

    # PIDs of compute apps bound to this GPU, one per line.
    pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader --id="$gpu_index" | tr -d ' ')

    if [ -z "$pids" ]; then
      echo "  No processes found on GPU $gpu_index."
    else
      echo "  Killing processes on GPU $gpu_index: $pids"
      # Intentionally unquoted: split $pids on newlines, one PID per word.
      for pid in $pids; do
        echo "  Killing PID $pid..."
        # SIGKILL from a privileged container sharing the host PID namespace —
        # the only way to reach processes outside our own namespace. Tolerate
        # failure (PID may have already exited) so the CI step does not abort.
        docker run --rm --privileged --pid=host ubuntu bash -c "kill -9 $pid" \
          && echo "  PID $pid killed." \
          || echo "  Failed to kill PID $pid (may have already exited)."
      done
    fi
  else
    echo "GPU $gpu_index is active ($utilization% utilization) — skipping."
  fi
done

echo ""

.github/workflows/test.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
runs-on: [self-hosted]
2020
container:
2121
image: lmsysorg/sglang:v0.5.5 # we lock to this version to avoid repeated docker pull
22-
options: --gpus all --shm-size=2g --rm -v /dev/shm
22+
options: --gpus all --shm-size=2g --rm -v /dev/shm --privileged
2323
steps:
2424
- name: Checkout code
2525
uses: actions/checkout@v4
@@ -49,6 +49,11 @@ jobs:
4949
uv pip install setuptools
5050
MAX_JOBS=8 uv pip install -v ".[fa]" --prerelease=allow --no-build-isolation
5151
52+
- name: Kill GPU processes
53+
shell: bash
54+
run: |
55+
bash .github/workflows/scripts/delete_gpu_process.sh
56+
5257
- name: Run test
5358
timeout-minutes: 30
5459
shell: bash
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"architectures": [
3+
"LlamaForCausalLMEagle3"
4+
],
5+
"attention_bias": false,
6+
"attention_dropout": 0.0,
7+
"bos_token_id": 151643,
8+
"eos_token_id": 151645,
9+
"head_dim": 128,
10+
"hidden_act": "silu",
11+
"hidden_size": 2048,
12+
"initializer_range": 0.02,
13+
"intermediate_size": 8192,
14+
"max_position_embeddings": 4096,
15+
"model_type": "llama",
16+
"num_attention_heads": 16,
17+
"num_hidden_layers": 1,
18+
"num_key_value_heads": 4,
19+
"rms_norm_eps": 1e-06,
20+
"rope_scaling": null,
21+
"rope_theta": 1000000.0,
22+
"tie_word_embeddings": false,
23+
"torch_dtype": "bfloat16",
24+
"use_cache": true,
25+
"vocab_size": 151936,
26+
"draft_vocab_size": 32000
27+
}

pyproject.toml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "specforge"
7-
dynamic = ["version", "description"]
7+
dynamic = ["version"]
88
readme = "README.md"
99
requires-python = ">=3.11"
10+
description = "SpecForge: Speculative Decoding Training Framework"
11+
authors = [{name = "SGLang Team"}]
12+
urls = {Homepage = "https://github.com/sgl-project/SpecForge"}
1013
dependencies = [
1114
"pre-commit",
1215
"torch==2.9.1",
@@ -29,8 +32,8 @@ dependencies = [
2932
"yunchang",
3033
]
3134

32-
[tool.setuptools]
33-
packages = ["specforge"]
35+
[tool.setuptools.packages.find]
36+
exclude = ["configs*", "scripts*", "tests*"]
3437

3538
[project.optional-dependencies]
3639
dev = [
@@ -41,4 +44,3 @@ fa = ["flash-attn"]
4144

4245
[tool.setuptools.dynamic]
4346
version = {file = "version.txt"}
44-
description = {file = "README.md"}

setup.py

Lines changed: 0 additions & 33 deletions
This file was deleted.

tests/test_modeling/test_target/test_sglang_backend/test_sglang_backend.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ def test_dense(rank, world_size, port, tp_size):
3434
mem_fraction_static=0.4,
3535
# enable_torch_compile=True,
3636
enable_nccl_nvls=True,
37-
# enable_symm_mem=True,
3837
enable_symm_mem=False,
3938
enable_torch_compile=True,
4039
enable_dp_attention=False,
@@ -73,9 +72,6 @@ def test_moe(rank, world_size, port, tp_size):
7372
mem_fraction_static=0.4,
7473
enable_torch_compile=True,
7574
enable_nccl_nvls=True,
76-
# enable_symm_mem=True,
77-
# enable_dp_attention=True,
78-
# enable_dp_lm_head=True,
7975
enable_symm_mem=False,
8076
enable_dp_attention=False,
8177
enable_dp_lm_head=False,
@@ -213,11 +209,8 @@ def test_vlm(rank, world_size, port, tp_size):
213209
attention_backend="fa3",
214210
mem_fraction_static=0.75,
215211
enable_torch_compile=True,
216-
# enable_nccl_nvls=False,
217212
enable_nccl_nvls=True,
218213
enable_symm_mem=False, # Disable to avoid nccl_allocator compilation issues
219-
# enable_dp_attention=True,
220-
# enable_dp_lm_head=True,
221214
enable_dp_attention=False,
222215
enable_dp_lm_head=False,
223216
enable_piecewise_cuda_graph=True,
@@ -376,10 +369,7 @@ def test_vlm_multi_batch(rank, world_size, port, tp_size):
376369
mem_fraction_static=0.4,
377370
enable_nccl_nvls=True,
378371
enable_torch_compile=True,
379-
# enable_nccl_nvls=False,
380372
enable_symm_mem=False,
381-
# enable_dp_attention=True,
382-
# enable_dp_lm_head=True,
383373
enable_dp_attention=False,
384374
enable_dp_lm_head=False,
385375
enable_piecewise_cuda_graph=True,

0 commit comments

Comments
 (0)