evalstate
diff --git a/‎docs/source/en/_toctree.yml‎
Lines changed: 6 additions & 0 deletions b/‎docs/source/en/_toctree.yml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/source/en/continuous_batching.md‎
Lines changed: 15 additions & 0 deletions b/‎docs/source/en/continuous_batching.md‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/glm_moe_dsa.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/en/model_doc/glm_moe_dsa.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/en/model_doc/hy_v3.md‎
Lines changed: 64 additions & 0 deletions b/‎docs/source/en/model_doc/hy_v3.md‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/olmo.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/source/en/model_doc/olmo.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/olmo2.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/source/en/model_doc/olmo2.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/olmo3.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/source/en/model_doc/olmo3.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/openai_privacy_filter.md‎
Lines changed: 103 additions & 0 deletions b/‎docs/source/en/model_doc/openai_privacy_filter.md‎
Lines changed: 103 additions & 0 deletions
diff --git a/‎docs/source/en/model_doc/slanet.md‎
Lines changed: 80 additions & 0 deletions b/‎docs/source/en/model_doc/slanet.md‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎docs/source/en/quantization/torchao.md‎
Lines changed: 5 additions & 7 deletions b/‎docs/source/en/quantization/torchao.md‎
Lines changed: 5 additions & 7 deletions
@@ -661,6 +661,8 @@
         title: HunYuanDenseV1
       - local: model_doc/hunyuan_v1_moe
         title: HunYuanMoEV1
+      - local: model_doc/hy_v3
+        title: HYV3
       - local: model_doc/ibert
         title: I-BERT
       - local: model_doc/jais2
@@ -767,6 +769,8 @@
         title: OLMoE
       - local: model_doc/olmo_hybrid
         title: OlmoHybrid
+      - local: model_doc/openai_privacy_filter
+        title: OpenAI Privacy Filter
       - local: model_doc/opt
         title: OPT
       - local: model_doc/pegasus
@@ -1365,6 +1369,8 @@
         title: SigLIP
       - local: model_doc/siglip2
         title: SigLIP2
+      - local: model_doc/slanet
+        title: SLANet
       - local: model_doc/slanext
         title: SLANeXt
       - local: model_doc/smollm3
 
@@ -124,6 +124,20 @@ Cancel a request with [`~ContinuousBatchingManager.cancel_request`].
 manager.cancel_request(request_id="my_request")
 ```
 
+### Per-request sampling parameters
+
+Enable `per_request_processors` to apply `temperature`, `top_k`, and `top_p` independently to each request within the same forward pass. This lets a single batch mix requests with different sampling parameters, for example creative, high-temperature outputs alongside precise, low-temperature ones.
+
+```py
+cb_config = ContinuousBatchingConfig(per_request_processors=True)
+
+# each request gets its own sampling parameters
+manager.add_request(input_ids=inputs_a, temperature=0.9, top_p=0.95)
+manager.add_request(input_ids=inputs_b, temperature=0.1, top_k=10)
+```
+
+The logits processor associated with a parameter is only created at runtime if that parameter is set to a non-default value in [`GenerationConfig`]. For example, set `temperature` to a value other than `None` or `1` to support per-request temperature control. Requests with a temperature of `1` can still be added afterwards.
+
 ### Retrieving results
 
 Iterate over the manager to receive results as they arrive.
@@ -174,6 +188,7 @@ By default, `num_blocks` and `max_batch_tokens` are inferred automatically from
 | Prefix caching | ↓ shared KV blocks | ✓ skips redundant prefill | ✓ TTFT |
 | Paged attention | ↓ no fragmentation | ✓ dynamic batch membership | |
 | Sliding window | ↓ bounded KV per layer | | |
+| Per-request processors | | ✓ mixed sampling params per batch | |
 
 ```py
 from transformers.generation import ContinuousBatchingConfig
 
@@ -16,7 +16,7 @@ limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
 
 -->
-*This model was released on {release_date} and added to Hugging Face Transformers on 2026-02-08.*
+*This model was released on 2026-02-17 and added to Hugging Face Transformers on 2026-02-09.*
 
 <div style="float: right;">
     <div class="flex flex-wrap space-x-1">
 
@@ -0,0 +1,64 @@
+<!--Copyright 2026 THL A29 Limited, a Tencent company and The HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on {release_date} and added to Hugging Face Transformers on 2026-04-22.*
+
+# Hy3-preview
+
+## Overview
+
+Hy3-preview is a large-scale Mixture-of-Experts (MoE) language model developed by the Tencent HunYuan team. It features a dense-MoE hybrid architecture with 192 routed experts and 1 always-active shared expert per MoE layer, achieving strong performance with efficient inference via sparse expert activation.
+
+Key architectural features:
+
+- **Dense-MoE hybrid**: The first layer uses a dense FFN; all subsequent layers use MoE with top-k routing (default k=8).
+- **Shared experts**: Each MoE layer includes 1 shared expert that processes all tokens alongside the routed experts.
+- **Sigmoid routing with expert-bias correction**: Tokens are routed via sigmoid scoring (not softmax) with a learned per-expert bias for load balancing.
+- **QK-Norm**: Per-head RMSNorm applied to query and key projections before attention for improved training stability.
+
+## Usage tips
+
+- Load with `AutoModelForCausalLM`. The model requires multiple GPUs due to its size.
+- Set `output_router_logits=True` in the config or forward call to collect per-layer MoE router logits. Note that this model does not compute an auxiliary load-balancing loss; `aux_loss` is always `None`.
+- The model supports `gradient_checkpointing` to reduce memory during fine-tuning.
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+model_id = "tencent/Hy3-preview"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+)
+
+inputs = tokenizer("The future of artificial intelligence is", return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+## HYV3Config
+
+[[autodoc]] HYV3Config
+
+## HYV3Model
+
+[[autodoc]] HYV3Model
+    - forward
+
+## HYV3ForCausalLM
+
+[[autodoc]] HYV3ForCausalLM
+    - forward
@@ -127,3 +127,8 @@ print(tokenizer.decode(output[0]))
 
 [[autodoc]] OlmoForCausalLM
     - forward
+
+## OlmoForSequenceClassification
+
+[[autodoc]] OlmoForSequenceClassification
+    - forward
@@ -136,3 +136,8 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 
 [[autodoc]] Olmo2ForCausalLM
     - forward
+
+## Olmo2ForSequenceClassification
+
+[[autodoc]] Olmo2ForSequenceClassification
+    - forward
@@ -129,6 +129,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 
 [[autodoc]] Olmo3ForCausalLM
 
+## Olmo3ForSequenceClassification
+
+[[autodoc]] Olmo3ForSequenceClassification
+    - forward
+
 ## Olmo3Model
 
 [[autodoc]] Olmo3Model
 
@@ -0,0 +1,103 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-04-22 and added to Hugging Face Transformers on 2026-04-22.*
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# OpenAI Privacy Filter
+
+OpenAI Privacy Filter is a bidirectional token-classification model for personally identifiable information (PII) detection and masking in text. It is intended for high-throughput data sanitization workflows where teams need a fast, context-aware, and tunable model that they can run on-premises.
+
+OpenAI Privacy Filter is first pretrained autoregressively to arrive at a checkpoint with an architecture similar to gpt-oss, albeit of a smaller size. That checkpoint is then converted into a bidirectional token classifier over a privacy label taxonomy and post-trained with a supervised classification loss. (For architecture details about gpt-oss, please see the gpt-oss model card.) Instead of generating text token-by-token, this model labels an input sequence in a single forward pass, then decodes coherent spans with a constrained Viterbi procedure. For each input token, the model predicts a probability distribution over the label taxonomy, which consists of 8 output categories.
+
+Highlights:
+
+- Permissive Apache 2.0 license: ideal for experimentation, customization, and commercial deployment.
+- Small size: Runs in a web browser or on a laptop – 1.5B parameters total and 50M active parameters.
+- Fine-tunable: Adapt the model to specific data distributions through easy and data-efficient fine-tuning.
+- Long-context: 128,000-token context window enables processing long text with high throughput and no chunking.
+- Runtime control: configure precision/recall tradeoffs and detected span lengths through preset operating points.
+
+The example below demonstrates how to detect privacy-sensitive tokens with [`Pipeline`] or the [`AutoModelForTokenClassification`] class.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+from transformers import pipeline
+
+classifier = pipeline(
+    task="token-classification",
+    model="openai/privacy-filter",
+)
+classifier("My name is Alice Smith")
+```
+
+</hfoption>
+<hfoption id="AutoModelForTokenClassification">
+
+```py
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("openai/privacy-filter")
+model = AutoModelForTokenClassification.from_pretrained("openai/privacy-filter", device_map="auto")
+
+inputs = tokenizer("My name is Alice Smith", return_tensors="pt").to(model.device)
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+predicted_token_class_ids = outputs.logits.argmax(dim=-1)
+predicted_token_classes = [model.config.id2label[token_id.item()] for token_id in predicted_token_class_ids[0]]
+print(predicted_token_classes)
+```
+
+</hfoption>
+</hfoptions>
+
+- Developed by: OpenAI
+- Funded by: OpenAI
+- Shared by: OpenAI
+- Model type: Bidirectional token classification model for privacy span detection
+- Language(s): Primarily English; selected multilingual robustness evaluation reported
+- License: [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
+
+- Source repository: https://github.com/openai/privacy-filter
+- Model weights: https://huggingface.co/openai/privacy-filter
+- Demo: https://huggingface.co/spaces/openai/privacy-filter
+
+## Resources
+
+- [Token classification task guide](../tasks/token_classification)
+
+## OpenAIPrivacyFilterConfig
+
+[[autodoc]] OpenAIPrivacyFilterConfig
+
+## OpenAIPrivacyFilterModel
+
+[[autodoc]] OpenAIPrivacyFilterModel
+    - forward
+
+## OpenAIPrivacyFilterForTokenClassification
+
+[[autodoc]] OpenAIPrivacyFilterForTokenClassification
+    - forward
@@ -0,0 +1,80 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2025-03-07 and added to Hugging Face Transformers on 2026-04-22.*
+
+# SLANet
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+**SLANet** and **SLANet_plus** are part of a series of dedicated lightweight models for table structure recognition, focusing on accurately recognizing table structures in documents and natural scenes. For more details about the SLANet series of models, please refer to the [official documentation](https://www.paddleocr.ai/latest/en/version3.x/module_usage/table_structure_recognition.html).
+
+## Model Architecture
+
+SLANet is a table structure recognition model developed by Baidu PaddlePaddle Vision Team. The model significantly improves the accuracy and inference speed of table structure recognition by adopting a CPU-friendly lightweight backbone network PP-LCNet, a high-low-level feature fusion module CSP-PAN, and a feature decoding module SLA Head that aligns structural and positional information.
+
+## Usage
+
+### Single input inference
+
+The example below demonstrates how to recognize the structure of a table with SLANet using `AutoModelForTableRecognition`.
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
+from io import BytesIO
+
+import httpx
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModelForTableRecognition
+
+model_path="PaddlePaddle/SLANet_plus_safetensors"
+model = AutoModelForTableRecognition.from_pretrained(model_path, device_map="auto")
+image_processor = AutoImageProcessor.from_pretrained(model_path)
+
+image_url = "https://example.com/table.png"  # replace with the URL of a table image
+image = Image.open(BytesIO(httpx.get(image_url).content))
+inputs = image_processor(images=image, return_tensors="pt").to(model.device)
+outputs = model(**inputs)
+
+result = image_processor.post_process_table_recognition(outputs)
+
+print(result['structure'])
+print(result['structure_score'])
+```
+
+</hfoption>
+</hfoptions>
+
+## SLANetConfig
+
+[[autodoc]] SLANetConfig
+
+## SLANetForTableRecognition
+
+[[autodoc]] SLANetForTableRecognition
+
+## SLANetBackbone
+
+[[autodoc]] SLANetBackbone
+
+## SLANetSLAHead
+
+[[autodoc]] SLANetSLAHead
+
@@ -328,11 +328,9 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
 from torchao.quantization import Int4WeightOnlyConfig
-from torchao.dtypes import Int4XPULayout
-from torchao.quantization.quant_primitives import ZeroPointDomain
 
 
-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT, int4_packing_format="plain_int32")
+quant_config = Int4WeightOnlyConfig(group_size=128, int4_packing_format="plain_int32")
 quantization_config = TorchAoConfig(quant_type=quant_config)
 
 # Load and quantize the model
@@ -345,7 +343,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
 
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device).to(quantized_model.dtype)
 
 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -395,9 +393,9 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.prototype.int4_opaque_tensor import Int4WeightOnlyOpaqueTensorConfig
+from torchao.prototype.quantization.int4 import PrototypeInt4WeightOnlyConfig
 
-quantization_config = TorchAoConfig(Int4WeightOnlyOpaqueTensorConfig(group_size=128))
+quantization_config = TorchAoConfig(PrototypeInt4WeightOnlyConfig(group_size=128, int4_choose_qparams_algorithm="tinygemm"))
 
 # Load and quantize the model
 quantized_model = AutoModelForCausalLM.from_pretrained(
@@ -409,7 +407,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
 
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device).to(quantized_model.dtype)
 
 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")