
Commit 3cc3b8f

Author: Gang Li
Commit message: add disco algorithm, recipe, and CI tests
1 parent e90f18c commit 3cc3b8f

12 files changed

Lines changed: 1834 additions & 1 deletion

File tree

.github/workflows/e2e_disco.yml

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#   - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#   - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#   - Since the cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded from them when
#     - a new workflow yaml is added to `.github/workflows`
#     - new tests are added to a workflow mentioned in 2.

name: e2e_disco

on:
  # Trigger the workflow on push or pull request,
  # but only for the main and v0.* branches.
  # For push, only anti-patterns are specified for now, so the trigger
  # is more conservative and achieves higher coverage.
  push:
    branches:
      - main
      - v0.*
    paths:
      - "verl/*.py"
      # Other entrypoints
      - "!examples/*trainer*"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      - "!recipe/**"
      - "recipe/disco"
      # Entrypoints
      - ".github/workflows/e2e_disco.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/special_e2e/run_disco.sh"
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Other recipes
      - "!recipe/**"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # Home
      - "recipe/disco"
      # Entrypoints
      - ".github/workflows/e2e_disco.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/special_e2e/run_disco.sh"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions: read repository contents only.
permissions:
  contents: read

jobs:
  e2e_disco:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install --no-deps -e .[test,gpu]
      - name: Prepare GSM8K dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py
      - name: Run the E2E test with the DisCO algorithm
        run: |
          ray stop --force
          bash tests/special_e2e/run_disco.sh

recipe/disco/README.md

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
<h1 align="center">🚀 DisCO: Reinforcing Large Reasoning Models with Discriminative Constrained Optimization</h1>
<p align="center"><img alt="DisCO" src="https://github.com/Optimization-AI/DisCO/blob/main/assets/disco-final.png" width="300"/></p>
<p align="center" style="font-size: 5px;"><em>Credit: the above image was generated by Sora</em></p>

📝 [Paper@arXiv](https://arxiv.org/abs/2505.12366) | 🏠 [Repo@GitHub](https://github.com/Optimization-AI/DisCO)

---

### 💡 Introducing **DisCO**: *Discriminative Constrained Optimization*

**DisCO** is a new RL framework grounded in **discriminative learning**. It trains models by **increasing scores for positive answers while decreasing those for negatives**, enabling:

* **No early entropy collapse**
* **Faster convergence**
* 📉 **More stable training**
* ⚖️ **Handles sparse rewards** – robust to imbalanced data with advanced discriminative approaches

---
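The discriminative idea above can be sketched in a few lines of PyTorch. This is a schematic illustration only, not the recipe's actual implementation: the function name and the squared-hinge constraint penalty are illustrative, only the `'logL'` (length-normalized log-likelihood) score function is shown, and the `'Lratio'` variant is omitted.

```python
import torch

def disco_loss_sketch(logp, old_logp, is_positive, mask, beta=1e3, delta=1e-4):
    """Schematic DisCO-style loss (illustration only, not verl's implementation).

    logp / old_logp: (batch, seq) per-token log-probs under the current / behavior policy.
    is_positive: (batch,) bool, whether the sampled answer was rewarded as correct.
    mask: (batch, seq) response mask. Assumes the batch contains both positives and negatives.
    """
    # 'logL' score: length-normalized sequence log-likelihood.
    lengths = mask.sum(-1).clamp(min=1)
    score = (logp * mask).sum(-1) / lengths

    # Discriminative term: push scores of positive answers above negatives.
    pos, neg = score[is_positive], score[~is_positive]
    disc = pos.mean() - neg.mean()

    # Constraint term (illustrative): keep the policy close to the behavior
    # policy via a squared-hinge penalty on an estimated KL, with weight beta
    # and tolerance delta (hyper-parameter names taken from the recipe config).
    kl_est = ((old_logp - logp) * mask).sum(-1) / lengths
    penalty = beta * torch.clamp(kl_est.mean() - delta, min=0.0) ** 2

    # Minimize the negative discriminative gap plus the constraint penalty.
    return -disc + penalty
```

In this sketch, a batch where positive and negative answers have identical scores and the policy has not moved yields zero loss, which matches the intuition that there is nothing left to discriminate.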

### 📈 Quick Results

On six math reasoning benchmarks with a 1.5B model, **DisCO outperforms GRPO and its variants**:

* **+7% vs GRPO**
* **+6% vs DAPO**

**DisCO with an 8k response length is on par with or even better than GRPO with a 32k response length.**

---

- [Model Checkpoints](#model-checkpoints)
- [Quickstart](#quickstart)
- [More Results](#more-results)
- [Citing DisCO](#citing-disco)

## Model Checkpoints

- DisCO (Log-L) finetuned DeepSeek-R1-Distill-Qwen-1.5B model: [DisCO-1.5B-logL](https://huggingface.co/ganglii/DisCO-1.5B-logL)
- DisCO (L-Ratio) finetuned DeepSeek-R1-Distill-Qwen-1.5B model: [DisCO-1.5B-Lratio](https://huggingface.co/ganglii/DisCO-1.5B-Lratio)
- DisCO (Log-L) finetuned DeepSeek-R1-Distill-Qwen-7B model: [DisCO-7B-logL](https://huggingface.co/ganglii/DisCO-7B-logL)
- DisCO (L-Ratio) finetuned DeepSeek-R1-Distill-Qwen-7B model: [DisCO-7B-Lratio](https://huggingface.co/ganglii/DisCO-7B-Lratio)

## Quickstart

1. Prepare the datasets:

```bash
bash prepare_data.sh # This downloads the datasets to the current folder
```

2. Run a training script:

```bash
cd verl # Repo root
bash recipe/disco/run_disco_1.5b.sh # or other scripts
```

## More Results

Comparison with baseline models and baseline methods for fine-tuning 1.5B models. OpenAI-o1-preview is included as a reference. MRL denotes the Max Response Length used in training/testing. The shaded models were trained by other works, and the shaded numbers are those reported in their original works or in DeepScaleR. All other results are evaluated either on existing models or on models trained by us using different approaches. The methods in the bottom area all fine-tune the DeepSeek-R1-Distill-Qwen-1.5B model on the same DeepScaleR dataset. DS is short for DeepSeek-R1; DSR is short for DeepScaleR.

<p align="center"><img alt="Comparison with baselines on 1.5B model" src="https://github.com/Optimization-AI/DisCO/blob/main/assets/1p5model.png" width="800"/></p>

Comparison with baseline models and baseline methods for fine-tuning 7B models. The methods in the bottom area all fine-tune the DeepSeek-R1-Distill-Qwen-7B model on the same DeepScaleR dataset.

<p align="center"><img alt="Comparison with baselines on 7B model" src="https://github.com/Optimization-AI/DisCO/blob/main/assets/7Bmodel.png" width="800"/></p>

Training dynamics of different methods: the left two panels are for fine-tuning the 1.5B model and the right two for the 7B model. Panels (a) and (c) plot the training reward (averaged over the generated outputs for the questions used in each step) against the number of training steps; panels (b) and (d) plot the generation entropy against training steps.

<p align="center"><img alt="Training Dynamics" src="https://github.com/Optimization-AI/DisCO/blob/main/assets/training-dyanmics.png" width="800"/></p>

## Citing DisCO

If you find DisCO useful in your research, please consider citing the following paper:

```bibtex
@article{li2025disco,
  title={DisCO: Reinforcing Large Reasoning Models with Discriminative Constrained Optimization},
  author={Li, Gang and Lin, Ming and Galanti, Tomer and Tu, Zhengzhong and Yang, Tianbao},
  journal={arXiv preprint arXiv:2505.12366},
  year={2025}
}
```
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
hydra:
  searchpath:
    - file://verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

data:
  gen_batch_size: ${data.train_batch_size}

actor_rollout_ref:
  actor:
    policy_loss:
      loss_mode: 'disco'
      score_func: 'logL' # score function used in disco. Options: 'logL', 'Lratio'
      delta: 1e-4
      beta: 1e3
      tau: 10

reward_model:
  reward_manager: naive

custom_reward_function:
  # The path to the file containing your customized reward function.
  # If not specified, pre-implemented reward functions will be used.
  path: recipe/disco/reward/deepscaler_reward.py
  # The name of the reward function within the specified file. Default is 'compute_score'.
  name: deepscaler_reward_fn

algorithm:
  filter_groups:
    _target_: verl.trainer.config.FilterGroupsConfig
    enable: False # Set explicitly so it is not forgotten
    metric: null # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0 # Non-positive values mean no upper limit

trainer:
  project_name: verl-disco
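The `custom_reward_function` section above points at a plain Python callable resolved by `path` and `name`. A minimal hypothetical stand-in is sketched below; the signature follows verl's common `compute_score` convention, which is an assumption here, and the matching logic is deliberately naive (the actual `deepscaler_reward.py` does proper math-answer extraction).

```python
# Hypothetical custom reward function in the shape verl's naive reward
# manager expects. Signature and logic are illustrative, not this commit's code.
def deepscaler_reward_fn(data_source, solution_str, ground_truth, extra_info=None):
    """Return 1.0 if the final line of the solution matches the ground truth, else 0.0."""
    stripped = solution_str.strip()
    # Naive check for illustration: exact match on the last non-empty line.
    answer = stripped.splitlines()[-1] if stripped else ""
    return 1.0 if answer.strip() == str(ground_truth).strip() else 0.0
```

A binary 0/1 reward like this is exactly what DisCO's discriminative objective consumes: it partitions sampled answers into positives and negatives.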

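The `algorithm.filter_groups` block in the config above is disabled by default; its comments hint at a group-filtering idea. A minimal sketch of how such filtering could work is shown below, assuming each group holds the per-response metric values for one question; this is illustrative, not verl's `FilterGroupsConfig` implementation.

```python
# Illustrative group filtering: drop question groups whose chosen metric
# (e.g. 'acc') is identical across all sampled responses, since such groups
# carry no discriminative learning signal.
def filter_groups(groups, metric="acc"):
    kept = []
    for responses in groups:  # each group: list of per-response metric dicts for one question
        values = [r[metric] for r in responses]
        if max(values) > min(values):  # mixed outcomes -> informative group
            kept.append(responses)
    return kept
```

Under this reading, `max_num_gen_batches` would cap how many extra generation batches are drawn while collecting enough informative groups, with non-positive values meaning no cap.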