16 changes: 15 additions & 1 deletion README.md
@@ -269,6 +269,11 @@ cd slime
bash ../openclaw-combine/run_qwen3_4b_openclaw_combine.sh
```

**Qwen3.5-4B** (hybrid text backbone):
```bash
bash ../openclaw-combine/run_qwen35_4b_openclaw_combine.sh
```

This method combines the binary RL and OPD objectives into a single training loss, which achieves the best optimization results.
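At its core, the combination is a weighted sum of the two objectives. A minimal sketch (the helper below is hypothetical; the real implementation is `combine_loss.combine_loss_function`, weighted by the script's `OPENCLAW_COMBINE_W_RL`/`OPENCLAW_COMBINE_W_OPD` variables):

```python
def combine_loss(rl_loss: float, opd_loss: float,
                 w_rl: float = 1.0, w_opd: float = 1.0) -> float:
    """Weighted sum of the RL and OPD losses (hypothetical sketch).

    Both weights default to 1.0, matching the run script's
    OPENCLAW_COMBINE_W_RL / OPENCLAW_COMBINE_W_OPD defaults.
    """
    return w_rl * rl_loss + w_opd * opd_loss
```

Setting either weight to 0.0 recovers pure RL or pure OPD training.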

See [`./openclaw-combine/README.md`](./openclaw-combine/README.md) for algorithm details.
@@ -299,6 +304,11 @@ cd slime
bash ../openclaw-rl/run_qwen3_4b_openclaw_rl.sh
```

**Qwen3.5-4B** (hybrid text backbone):
```bash
bash ../openclaw-rl/run_qwen35_4b_openclaw_rl.sh
```

The PRM will automatically judge response quality from next-state feedback. We recommend providing frequent feedback (e.g., 👍/👎) to help the model optimize effectively.
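The 👍/👎 signals can be folded into scalar rewards. A minimal sketch, with a hypothetical function name and mapping (not the repo's actual API):

```python
def feedback_to_reward(feedback: str) -> float:
    """Map a binary user signal to a scalar reward (hypothetical mapping)."""
    mapping = {"thumbs_up": 1.0, "thumbs_down": 0.0}
    # Fall back to a neutral reward when no explicit feedback is given;
    # in that case the PRM's judgment from next-state feedback supplies the score.
    return mapping.get(feedback, 0.5)
```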

See [`./openclaw-rl/README.md`](./openclaw-rl/README.md) for algorithm details.
@@ -329,6 +339,11 @@ cd slime
bash ../openclaw-opd/run_qwen3_4b_openclaw_opd.sh
```

**Qwen3.5-4B** (hybrid text backbone):
```bash
bash ../openclaw-opd/run_qwen35_4b_openclaw_opd.sh
```

The system extracts hindsight hints from your feedback and distills them into the policy at the token level. We recommend providing concrete feedback (e.g., "you should have checked the file first" or "don't use that library").
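Token-level distillation of a hint-conditioned teacher can be sketched as a per-token KL divergence between teacher and student next-token distributions. This is a pure-Python illustration with hypothetical names, not the repo's actual loss:

```python
import math

def softmax(logits):
    """Numerically stable softmax over a list of logits."""
    m = max(logits)
    exps = [math.exp(x - m) for x in logits]
    z = sum(exps)
    return [e / z for e in exps]

def token_kl(teacher_logits, student_logits):
    """Forward KL(teacher || student) over the vocabulary at one position."""
    p, q = softmax(teacher_logits), softmax(student_logits)
    return sum(pi * math.log(pi / qi) for pi, qi in zip(p, q) if pi > 0)

def token_level_distill_loss(teacher_seq, student_seq, mask):
    """Average the per-token KL over unmasked (response) positions."""
    num = sum(m * token_kl(t, s) for t, s, m in zip(teacher_seq, student_seq, mask))
    return num / max(sum(mask), 1)
```

Conceptually, the teacher here is the policy conditioned on the hindsight hint, while the student sees only the original context.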

See [`./openclaw-opd/README.md`](./openclaw-opd/README.md) for algorithm details.
@@ -558,4 +573,3 @@ When using OpenClaw-RL, please do not provide sensitive personal information dur
---



7 changes: 7 additions & 0 deletions openclaw-combine/README.md
@@ -43,6 +43,13 @@ cd slime
bash ../openclaw-combine/run_qwen3_4b_openclaw_combine.sh
```

Qwen3.5-4B alternative:

```bash
cd slime
bash ../openclaw-combine/run_qwen35_4b_openclaw_combine.sh
```

### Key Environment Variables

| Variable | Default | Description |
225 changes: 225 additions & 0 deletions openclaw-combine/run_qwen35_4b_openclaw_combine.sh
@@ -0,0 +1,225 @@
#!/bin/bash

SKIP_CLUSTER_CLEANUP=${SKIP_CLUSTER_CLEANUP:-0}
if [ "${SKIP_CLUSTER_CLEANUP}" != "1" ]; then
  pkill -9 sglang
  sleep 3
  ray stop --force
  pkill -9 ray
  pkill -9 python
  sleep 3
  pkill -9 ray
  pkill -9 python
fi

set -ex

export PYTHONUNBUFFERED=1
export PYTHONFAULTHANDLER=1
export FLASHINFER_WORKSPACE_BASE="${FLASHINFER_WORKSPACE_BASE:-/tmp}"

NUM_GPUS=${NUM_GPUS:-8}
ACTOR_GPUS=${ACTOR_GPUS:-4}
ROLLOUT_GPUS=${ROLLOUT_GPUS:-2}
PRM_GPUS=${PRM_GPUS:-2}

if (( ACTOR_GPUS + ROLLOUT_GPUS + PRM_GPUS > NUM_GPUS )); then
  echo "ACTOR_GPUS + ROLLOUT_GPUS + PRM_GPUS must be <= NUM_GPUS"
  echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, PRM_GPUS=${PRM_GPUS}, NUM_GPUS=${NUM_GPUS}"
  exit 1
fi

export RAY_health_check_failure_threshold=20
export RAY_health_check_period_ms=5000
export RAY_health_check_timeout_ms=30000
export RAY_num_heartbeats_timeout=60

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)"
SLIME_ROOT="$(cd -- "${SCRIPT_DIR}/../slime" &>/dev/null && pwd)"
MEGATRON_ROOT="$(cd -- "${REPO_ROOT}/Megatron-LM" &>/dev/null && pwd)"
source "${SLIME_ROOT}/scripts/models/qwen3.5-4B.sh"

HF_CKPT=${HF_CKPT:-/absolute/path/to/Qwen3.5-4B}
REF_LOAD=${REF_LOAD:-${HF_CKPT}}
SAVE_CKPT=${SAVE_CKPT:-/absolute/path/to/OpenClaw-RL/ckpt/qwen35-4b-openclaw-combine}
PRM_MODEL_PATH=${PRM_MODEL_PATH:-/absolute/path/to/Qwen3.5-4B}

export SGLANG_API_KEY="${SGLANG_API_KEY}"
export SERVED_MODEL_NAME="qwen3.5-4b"
export HOST="0.0.0.0"
export PORT="30000"
export OPENCLAW_RECORD_ENABLED="${OPENCLAW_RECORD_ENABLED:-1}"
export OPENCLAW_RECORD_FILE="${SCRIPT_DIR}/results/qwen35_4b_record.jsonl"
export TP="2"
export CONTEXT_LENGTH="32768"
export MEM_FRACTION_STATIC="0.8"
export REASONING_PARSER="${REASONING_PARSER:-qwen3}"
export TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-qwen25}"
export SGLANG_LANGUAGE_ONLY="${SGLANG_LANGUAGE_ONLY:-1}"
export PRM_M="${PRM_M:-1}"
TRAIN_TP=${TRAIN_TP:-4}
ROLLOUT_NUM_GPUS_PER_ENGINE=${ROLLOUT_NUM_GPUS_PER_ENGINE:-2}
PRM_NUM_GPUS_PER_ENGINE=${PRM_NUM_GPUS_PER_ENGINE:-2}
PRM_ENABLE=${PRM_ENABLE:-1}
export OPENCLAW_OPD_TEACHER_LP_MAX_CONCURRENCY="${OPENCLAW_OPD_TEACHER_LP_MAX_CONCURRENCY:-1}"
export OPENCLAW_COMBINE_W_RL="${OPENCLAW_COMBINE_W_RL:-1.0}"
export OPENCLAW_COMBINE_W_OPD="${OPENCLAW_COMBINE_W_OPD:-1.0}"

CKPT_ARGS=(
--megatron-to-hf-mode bridge
--hf-checkpoint "${HF_CKPT}"
--ref-load "${REF_LOAD}"
--save "${SAVE_CKPT}"
--save-interval 100
--rotary-base 10000000
)

ROLLOUT_ARGS=(
--disable-rollout-global-dataset
--rollout-function-path openclaw_combine_rollout.generate_rollout_openclaw_combine

--num-rollout 100000000
--rollout-batch-size 16
--n-samples-per-prompt 1
--rollout-max-response-len 8192
--rollout-max-context-len 32768
--rollout-temperature 0.6
--reward-key score

--num-steps-per-rollout 1
)

PERF_ARGS=(
--tensor-model-parallel-size "${TRAIN_TP}"
--sequence-parallel
--pipeline-model-parallel-size 1
--context-parallel-size 1
--expert-model-parallel-size 1
--expert-tensor-parallel-size 1

--recompute-granularity full
--recompute-method uniform
--recompute-num-layers 1

--use-dynamic-batch-size
--max-tokens-per-gpu 32768
--log-probs-chunk-size 1024
)

COMBINE_ARGS=(
--advantage-estimator grpo
--disable-rewards-normalization
--loss-type custom_loss
--custom-loss-function-path combine_loss.combine_loss_function
--use-kl-loss
--kl-loss-coef 0.0
--kl-loss-type low_var_kl
--entropy-coef 0.00
--eps-clip 0.2
--eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
--optimizer adam
--lr 1e-5
--lr-decay-style constant
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.98
--optimizer-cpu-offload
--overlap-cpu-optimizer-d2h-h2d
--use-precision-aware-optimizer
)

EVAL_ARGS=()

SGLANG_ARGS=(
--rollout-num-gpus-per-engine "${ROLLOUT_NUM_GPUS_PER_ENGINE}"
--sglang-tool-call-parser "${TOOL_CALL_PARSER}"
--sglang-mem-fraction-static 0.8
--sglang-context-length 32768
--sglang-reasoning-parser "${REASONING_PARSER}"
)

if [ "${SGLANG_LANGUAGE_ONLY}" = "1" ]; then
  SGLANG_ARGS+=(--sglang-language-only)
fi

if [ "${PRM_ENABLE}" = "1" ]; then
  PRM_ARGS=(
    --prm-enable
    --prm-num-gpus "${PRM_GPUS}"
    --prm-num-gpus-per-engine "${PRM_NUM_GPUS_PER_ENGINE}"
    --prm-model-path "${PRM_MODEL_PATH}"
    --prm-m "${PRM_M}"
    --prm-temperature "${PRM_TEMPERATURE:-0.6}"
    --prm-max-new-tokens "${PRM_MAX_NEW_TOKENS:-8192}"
  )
else
  PRM_ARGS=()
fi

CUSTOM_ARGS=(
--custom-generate-function-path openclaw_combine_api_server.generate
--custom-rm-path openclaw_combine_api_server.reward_func
)

MISC_ARGS=(
--attention-dropout 0.0
--hidden-dropout 0.0
--accumulate-allreduce-grads-in-fp32
--attention-softmax-in-fp32
--attention-backend flash
)

USE_WANDB=${USE_WANDB:-1}
WANDB_PROJECT=${WANDB_PROJECT:-openclaw_rl}
WANDB_KEY_VALUE=${WANDB_KEY:-${WANDB_API_KEY:-}}
if [ "${USE_WANDB}" = "1" ] && [ -n "${WANDB_KEY_VALUE}" ]; then
  WANDB_ARGS=(
    --use-wandb
    --wandb-project "${WANDB_PROJECT}"
    --wandb-group qwen35-4b-openclaw-combine
    --wandb-key "${WANDB_KEY_VALUE}"
  )
else
  WANDB_ARGS=()
fi

export OPENCLAW_EVAL_MODE="${OPENCLAW_EVAL_MODE:-1}"

export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
export no_proxy="127.0.0.1,${MASTER_ADDR}"
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus "${NUM_GPUS}" --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

RUNTIME_ENV_JSON="{
\"env_vars\": {
\"PYTHONPATH\": \"${MEGATRON_ROOT}:${SCRIPT_DIR}:${SCRIPT_DIR}/../openclaw-opd:${SLIME_ROOT}\",
\"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
\"FLASHINFER_WORKSPACE_BASE\": \"${FLASHINFER_WORKSPACE_BASE}\",
\"OPENCLAW_EVAL_MODE\": \"${OPENCLAW_EVAL_MODE}\",
\"OPENCLAW_COMBINE_W_RL\": \"${OPENCLAW_COMBINE_W_RL}\",
\"OPENCLAW_COMBINE_W_OPD\": \"${OPENCLAW_COMBINE_W_OPD}\"
}
}"

ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json="${RUNTIME_ENV_JSON}" \
-- python3 train_async.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node "${ACTOR_GPUS}" \
--rollout-num-gpus "${ROLLOUT_GPUS}" \
--num-gpus-per-node "${NUM_GPUS}" \
${MODEL_ARGS[@]} \
${CKPT_ARGS[@]} \
${ROLLOUT_ARGS[@]} \
${OPTIMIZER_ARGS[@]} \
${COMBINE_ARGS[@]} \
${PERF_ARGS[@]} \
${EVAL_ARGS[@]} \
${SGLANG_ARGS[@]} \
${MISC_ARGS[@]} \
${WANDB_ARGS[@]} \
${CUSTOM_ARGS[@]} \
${PRM_ARGS[@]}
7 changes: 7 additions & 0 deletions openclaw-opd/README.md
@@ -32,6 +32,13 @@ cd slime
bash ../openclaw-opd/run_qwen3_4b_openclaw_opd.sh
```

Qwen3.5-4B alternative:

```bash
cd slime
bash ../openclaw-opd/run_qwen35_4b_openclaw_opd.sh
```

## Option B: Top-K Logits Distillation (SDFT/SDPO-style)

Following [SDFT](https://arxiv.org/abs/2601.19897) and [SDPO](https://arxiv.org/abs/2601.20802), instead of single-token teacher targets, we distill the teacher's top-K distribution per position. Note that we use the teacher's top-K rather than the student's top-K (the setting in their original papers); see issue #7. We will compare teacher top-K and student top-K later.
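A minimal sketch of teacher-side top-K distillation at a single position (pure Python, hypothetical names; renormalized cross-entropy over the teacher's top-K token ids is one common formulation):

```python
import math

def _softmax(vals):
    """Numerically stable softmax over a list of logits."""
    m = max(vals)
    exps = [math.exp(v - m) for v in vals]
    z = sum(exps)
    return [e / z for e in exps]

def topk_distill_loss(teacher_logits, student_logits, k=3):
    """Cross-entropy against the teacher's top-K distribution at one position.

    The K token indices come from the TEACHER (the choice discussed above),
    and both distributions are renormalized over that K-way slice.
    """
    idx = sorted(range(len(teacher_logits)), key=lambda i: -teacher_logits[i])[:k]
    p_t = _softmax([teacher_logits[i] for i in idx])
    log_q = [math.log(q) for q in _softmax([student_logits[i] for i in idx])]
    return -sum(p * lq for p, lq in zip(p_t, log_q))
```

Student-side K would instead take `idx` from `student_logits`, which changes which tokens receive gradient.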