Commit 7dd7d9c

[Discrete Diffusion] Add DFlash pipeline
Adds DFlashPipeline + DFlashTokenDiffusionScheduler for block-diffusion speculative decoding with a draft DFlash model and a target causal LM. Verified against the six bug patterns surfaced in the LLaDA2 review (#13598). DFlash sidesteps most of them by being batch_size=1 only and relying on the causal default for attention; the applicable patterns (#3 callback bindings, #4 EOS at first generated position, #6 inner progress-bar config preservation) are pinned by regression tests. Public surface mirrors the LLaDA2 / SDAR / IDLM conventions: lazy import, dummy objects, scheduler + output dataclass, pipeline + output dataclass, fast tests for both, scheduler doc page, pipeline doc page. Sample/train scripts under examples/discrete_diffusion/.
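Of the regression-tested patterns, #4 (EOS at the first generated position) reduces to trimming logic along these lines. This is an illustrative toy, not the pipeline's actual code; `trim_at_eos` is a hypothetical helper name:

```python
def trim_at_eos(generated, eos_id):
    """Cut a generated token list at the first EOS, keeping the EOS itself.
    Must behave correctly even when EOS is the very first generated token."""
    for i, tok in enumerate(generated):
        if tok == eos_id:
            return generated[: i + 1]
    return generated

print(trim_at_eos([5, 7, 2, 9], eos_id=2))  # [5, 7, 2]
print(trim_at_eos([2, 7, 9], eos_id=2))     # [2]  <- EOS at first position
print(trim_at_eos([5, 7], eos_id=2))        # [5, 7]  <- no EOS at all
```

A naive implementation that searches from index 1 (assuming at least one "real" token precedes EOS) would return the whole sequence in the second case, which is exactly the bug pattern the tests pin.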
1 parent a851ce1 commit 7dd7d9c

16 files changed

Lines changed: 2227 additions & 0 deletions

File tree

docs/source/en/_toctree.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -648,6 +648,8 @@
        title: Z-Image
      title: Image
  - sections:
+      - local: api/pipelines/dflash
+        title: DFlash
      - local: api/pipelines/llada2
        title: LLaDA2
      title: Text
@@ -711,6 +713,8 @@
        title: DDPMScheduler
      - local: api/schedulers/deis
        title: DEISMultistepScheduler
+      - local: api/schedulers/dflash_token_diffusion
+        title: DFlashTokenDiffusionScheduler
      - local: api/schedulers/multistep_dpm_solver_inverse
        title: DPMSolverMultistepInverse
      - local: api/schedulers/multistep_dpm_solver
```
docs/source/en/api/pipelines/dflash.md

Lines changed: 24 additions & 0 deletions
```md
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# DFlash

`DFlashPipeline` performs block-diffusion speculative decoding using a diffusion draft model and a target causal LM.
The draft model is conditioned on target hidden features extracted during prefill and verification steps.

## DFlashPipeline

[[autodoc]] DFlashPipeline
  - all
  - __call__

## DFlashPipelineOutput

[[autodoc]] pipelines.DFlashPipelineOutput
```
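The target-side verification in speculative decoding can be sketched with a toy greedy loop. This is purely illustrative, with plain Python stand-ins for the models (`verify_block` and `target_argmax` are hypothetical names; the real acceptance logic lives in the scheduler and operates on model logits):

```python
def verify_block(drafted, target_argmax, context):
    """Greedy speculative verification: accept the longest prefix of the
    drafted block that the target model would itself have produced, then
    append the target's own token at the first mismatch."""
    accepted = []
    for tok in drafted:
        expected = target_argmax(context + accepted)
        if tok != expected:
            accepted.append(expected)  # target's correction ends the block
            break
        accepted.append(tok)
    return accepted


# Toy "target model": always predicts the successor of the last token.
def target_argmax(seq):
    return seq[-1] + 1


context = [0, 1, 2]
print(verify_block([3, 4, 9, 10], target_argmax, context))  # [3, 4, 5]
```

The draft's first two tokens match the target's own greedy choices and are accepted; at the first mismatch the target's token replaces the drafted one and the block ends. A fully correct draft block is accepted whole, which is where the speedup comes from.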
docs/source/en/api/schedulers/dflash_token_diffusion.md

Lines changed: 22 additions & 0 deletions
```md
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# DFlashTokenDiffusionScheduler

`DFlashTokenDiffusionScheduler` implements the acceptance and posterior sampling logic used in DFlash-style block
diffusion speculative decoding.

## DFlashTokenDiffusionScheduler

[[autodoc]] DFlashTokenDiffusionScheduler

## DFlashTokenDiffusionSchedulerOutput

[[autodoc]] schedulers.scheduling_dflash_token_diffusion.DFlashTokenDiffusionSchedulerOutput
```
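The drafting side of block diffusion can be illustrated with a confidence-ordered unmasking step, as used in MaskGIT-style token diffusion. This toy is not the actual scheduler (`unmask_step` is a hypothetical name and the confidence scheme is one common choice):

```python
MASK = None  # sentinel for a still-masked position in the block

def unmask_step(block, scores, k):
    """One toy denoising step: reveal the k masked positions whose draft
    confidence is highest; everything else stays masked for a later step."""
    masked = [i for i, t in enumerate(block) if t is MASK]
    chosen = sorted(masked, key=lambda i: scores[i][1], reverse=True)[:k]
    return [scores[i][0] if i in chosen else t for i, t in enumerate(block)]

# scores[i] = (proposed token, confidence) for each position in the block.
scores = [(7, 0.9), (3, 0.2), (5, 0.8), (1, 0.4)]
step1 = unmask_step([MASK] * 4, scores, k=2)  # reveal the two most confident
step2 = unmask_step(step1, scores, k=2)       # reveal the remaining two
print(step1)  # [7, None, 5, None]
print(step2)  # [7, 3, 5, 1]
```

Running the block through a few such steps fills every masked position, yielding the drafted block that the target model then verifies.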
examples/discrete_diffusion/sample_dflash.py

Lines changed: 145 additions & 0 deletions
```python
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Sample script for DFlash speculative decoding.

Example:
    python sample_dflash.py \
        --draft_model_id z-lab/Qwen3-8B-DFlash-b16 \
        --target_model_id Qwen/Qwen3-8B \
        --prompt "How many positive whole-number divisors does 196 have?" \
        --max_new_tokens 256
"""

import argparse

import torch
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from diffusers import DFlashPipeline


def main():
    parser = argparse.ArgumentParser(description="Run DFlash speculative decoding.")
    parser.add_argument(
        "--draft_model_id",
        type=str,
        default="z-lab/Qwen3-8B-DFlash-b16",
        help="Draft model ID or local path.",
    )
    parser.add_argument(
        "--target_model_id",
        type=str,
        default="Qwen/Qwen3-8B",
        help="Target model ID or local path.",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="How many positive whole-number divisors does 196 have?",
        help="Prompt text to generate from.",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=2048,
        help="Maximum number of new tokens to generate.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Sampling temperature.",
    )
    parser.add_argument(
        "--use_chat_template",
        action="store_true",
        help="Use the tokenizer chat template for the prompt.",
    )
    parser.add_argument(
        "--add_generation_prompt",
        action="store_true",
        help="Add the generation prompt when using the chat template.",
    )
    parser.add_argument(
        "--enable_thinking",
        action="store_true",
        help="Enable chat-template thinking mode if supported by the tokenizer.",
    )
    parser.add_argument(
        "--mask_token",
        type=str,
        default="<|MASK|>",
        help="Mask token to add if the tokenizer does not define one.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device to run inference on.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="auto",
        choices=["auto", "float32", "float16", "bfloat16"],
        help="Model dtype.",
    )

    args = parser.parse_args()

    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
    torch_dtype = dtype_map.get(args.dtype)

    print(f"Loading draft model: {args.draft_model_id}")
    print(f"Loading target model: {args.target_model_id}")
    dtype_arg = torch_dtype if torch_dtype is not None else "auto"
    # Draft model is a custom DFlashDraftModel; use AutoModel so trust_remote_code routes to the class in `auto_map`.
    draft_model = AutoModel.from_pretrained(
        args.draft_model_id,
        trust_remote_code=True,
        dtype=dtype_arg,
        device_map=args.device,
    )
    target_model = AutoModelForCausalLM.from_pretrained(
        args.target_model_id,
        dtype=dtype_arg,
        device_map=args.device,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.target_model_id)
    if tokenizer.mask_token is None:
        tokenizer.add_special_tokens({"mask_token": args.mask_token})
    pipe = DFlashPipeline(draft_model=draft_model, target_model=target_model, tokenizer=tokenizer)

    chat_kwargs = {"enable_thinking": args.enable_thinking}

    print(f"\nPrompt: {args.prompt}")
    output = pipe(
        prompt=args.prompt,
        max_new_tokens=args.max_new_tokens,
        temperature=args.temperature,
        use_chat_template=args.use_chat_template,
        add_generation_prompt=args.add_generation_prompt,
        chat_template_kwargs=chat_kwargs,
    )

    print("\nGenerated text:")
    print(output.texts[0])
    print(f"\nGenerated {output.sequences.shape[1]} tokens")


if __name__ == "__main__":
    main()
```
