
Commit 41c658c: Merge branch 'OpenNMT:master' into master

2 parents c937f94 + 1251f7c

13 files changed: 515 additions & 15 deletions

`.github/workflows/ci.yml` (26 additions & 3 deletions)

```diff
@@ -109,7 +109,7 @@ jobs:
           -DBUILD_TESTS=ON \
           .
         make -j $(nproc) install
-
+
       - name: Build Ruy
         if: matrix.backend == 'ruy'
         run: |
@@ -121,7 +121,7 @@ jobs:
           $CMAKE_EXTRA_OPTIONS \
           .
         make -j $(nproc) install
-
+
       - name: Download test data
         run: |
           wget https://opennmt-models.s3.amazonaws.com/transliteration-aren-all.tar.gz
@@ -229,7 +229,7 @@ jobs:
          ls -l
          find .
          pip install ${{ matrix.wheel_pattern }}
-
+
       - name: Test Python wheel
         run: |
           pytest -v python/tests/ --ignore=python/tests/test_opennmt_tf.py
@@ -295,6 +295,29 @@ jobs:
        with:
          submodules: recursive

+      - name: Show disk and docker usage (before cleanup)
+        run: |
+          df -h
+          echo " -= Docker System =-"
+          docker system df || true
+
+      - name: Free disk space (cleanup heavy preinstalled directories + docker prune)
+        run: |
+          echo " -= Removing big preinstalled directories (shouldn't remove the needed tools) =-"
+          sudo rm -rf /opt/hostedtoolcache || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /usr/lib/jvm || true
+          sudo rm -rf /usr/local/lib/android || true
+          echo " -= Running docker prune =-"
+          docker system prune -af --volumes || true
+          docker builder prune -af || true
+
+      - name: Show disk and docker usage (after cleanup)
+        run: |
+          df -h
+          echo " -= Docker System =-"
+          docker system df || true
+
       - name: Build Docker images
         run: |
           ./docker/build_all.sh
```
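The new workflow steps follow a "measure, delete, re-measure" pattern to reclaim runner disk space before the Docker build. A minimal, self-contained sketch of that pattern, exercised on a throwaway directory rather than the runner's preinstalled toolchains (`/tmp/demo_cleanup` is a made-up path used only for illustration):

```shell
# Create some disposable bulk, report its size, then remove it and
# confirm the space is reclaimable -- mirroring the df/rm -rf/df flow above.
mkdir -p /tmp/demo_cleanup
dd if=/dev/zero of=/tmp/demo_cleanup/blob bs=1M count=8 2>/dev/null
du -sh /tmp/demo_cleanup
rm -rf /tmp/demo_cleanup || true
df -h / | tail -1
```

The `|| true` suffix matches the workflow's convention: cleanup is best-effort and must never fail the job.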

`CHANGELOG.md` (14 additions & 1 deletion)

```diff
@@ -4,7 +4,20 @@

 ### Fixes and improvements

-## [v4.6.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.6.1) (2025-10-07)
+## [v4.6.2](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.6.2) (2025-12-05)
+
+### New features
+
+* Qwen 3 support (#1943) by [@jordimas](https://github.com/jordimas)
+* Gemma 3 text support (#1936) by [@jordimas](https://github.com/jordimas)
+
+### Fixes and improvements
+
+* Fixed pkg_resources Deprecated Warning (#1911) by [@thawancomt](https://github.com/thawancomt)
+* Disable INT8 for sm120 - Blackwell GPUs (#1937) by [@Purfview](https://github.com/Purfview)
+* FIX: package libctranslate2.so in wheel to avoid build fail (#1920) by [@yzewei](https://github.com/yzewei)
+
+## [v4.6.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.6.1) (2025-11-07)

 ### New features
```
`docker/Dockerfile` (6 additions & 6 deletions)

```diff
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04 as builder
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 as builder

 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -77,21 +77,21 @@ RUN cd python && \
     python3 -m pip --no-cache-dir install -r install_requirements.txt && \
     python3 setup.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT

-FROM nvidia/cuda:12.2.2-base-ubuntu22.04
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04

 # We remove the cuda-compat package because it conflicts with the CUDA Enhanced Compatibility.
 # See e.g. https://github.com/NVIDIA/nvidia-docker/issues/1515
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-        libcublas-12-2 \
-        libcudnn8=8.9.7.29-1+cuda12.2 \
-        libnccl2=2.19.3-1+cuda12.2 \
+        libcublas-12-4 \
+        libcudnn9-cuda-12 \
+        libnccl2 \
         libopenmpi3=4.1.2-2ubuntu1 \
         openmpi-bin \
         libgomp1 \
         python3-pip \
         && \
-    apt-get purge -y cuda-compat-12-2 && \
+    apt-get purge -y cuda-compat-12-4 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
```
`docs/guides/transformers.md` (80 additions & 1 deletion)

````diff
@@ -8,6 +8,8 @@ CTranslate2 supports selected models from Hugging Face's [Transformers](https://
 * CodeGen
 * DistilBERT
 * Falcon
+* Gemma 2
+* Gemma 3 (text only)
 * Llama
 * M2M100
 * MarianMT
@@ -20,6 +22,8 @@ CTranslate2 supports selected models from Hugging Face's [Transformers](https://
 * GPT-NeoX
 * OPT
 * Pegasus
+* Qwen 2.5
+* Qwen 3
 * T5
 * Whisper
 * XLM-RoBERTa
@@ -80,7 +84,7 @@ print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target), skip_special_tok

 ## BERT

-[BERT](https://huggingface.co/docs/transformers/model_doc/bert) is pretrained model on English language using a masked language modeling objective.
+[BERT](https://huggingface.co/docs/transformers/model_doc/bert) is a model pretrained on English text using a masked language modeling objective.

 CTranslate2 only implements the `BertModel` class from Transformers which includes the Transformer encoder and the pooling layer. Task-specific layers should be run with PyTorch as shown in the example below.

@@ -183,6 +187,43 @@ output = tokenizer.decode(results[0].sequences_ids[0])
 print(output)
 ```

+## Gemma 3 (text only)
+
+[Gemma 3](https://ai.google.dev/gemma/docs/core) is Google's latest family of lightweight, open-weight AI models, built on the same technology as Gemini.
+
+Gemma models come in two flavors: instruction-tuned (it) models and base models.
+
+Instruction-tuned models expect a specific [prompt template format](https://ai.google.dev/gemma/docs/core/prompt-structure), which you should use.
+
+When converting an instruction-tuned model, CTranslate2 sets `<end_of_turn>` as the default end-of-sequence token.
+
+To convert a model:
+
+```bash
+ct2-transformers-converter --model google/gemma-3-1b-it --output_dir gemma-3-1b-it
+```
+
+Gemma 3 usage sample:
+
+```python
+from transformers import AutoTokenizer
+import ctranslate2
+
+tok = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
+gen = ctranslate2.Generator("gemma-3-1b-it")
+
+prompt = "<start_of_turn>user\nGenerate a 200 word text talking about George Orwell.<end_of_turn>\n<start_of_turn>model\n"
+tokens = tok.convert_ids_to_tokens(tok.encode(prompt))
+
+res = gen.generate_batch([tokens], max_length=2048, sampling_temperature=0.1, include_prompt_in_result=False)
+print(tok.convert_tokens_to_string(res[0].sequences[0]))
+```
+
 ## Llama 2

 [Llama 2](https://ai.meta.com/llama/) is a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters.
@@ -446,6 +487,44 @@ output = tokenizer.decode(results[0].sequences_ids[0])
 print(output)
 ```

+## Qwen 3
+
+[Qwen 3](https://github.com/QwenLM/Qwen3) is a collection of large language models developed by the Alibaba Group. A key feature is the ability to switch between a "thinking mode" for complex reasoning and a "non-thinking mode" for efficient general chat.
+
+To convert a model:
+
+```bash
+ct2-transformers-converter --model Qwen/Qwen3-4B --quantization float16 --output_dir qwen3-4b-ct2
+```
+
+Usage sample:
+
+You can use the converted model for text generation with `ctranslate2.Generator`. For Qwen 3 instruction-tuned models, use the Hugging Face tokenizer's `apply_chat_template` method to correctly format your prompts, especially when dealing with the optional "thinking mode". MoE model variants are currently not supported.
+
+```python
+import ctranslate2
+import transformers
+
+generator = ctranslate2.Generator("qwen3-4b-ct2")
+tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
+
+def generate(prompt):
+    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt, add_special_tokens=False))
+    results = generator.generate_batch([tokens], max_length=2048, sampling_temperature=0.7, include_prompt_in_result=False)
+    return tokenizer.decode(results[0].sequences_ids[0])
+
+prompt_base = """<|im_start|>user
+A train leaves Station A at 60 mph heading towards Station B, 300 miles away. At the same time, another train leaves Station B at 40 mph heading towards Station A. When will they meet and how far from Station A?
+<|im_end|>
+<|im_start|>assistant"""
+
+print("Non-thinking:\n" + "-"*60)
+print(generate(prompt_base + "\n<think></think>\n"))
+
+print("\nThinking:\n" + "="*60)
+print(generate(prompt_base))
+```
+
 ## T5

 [T5](https://huggingface.co/docs/transformers/model_doc/t5) is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which each task is converted into a text-to-text format.
````
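The Qwen 3 sample hardcodes ChatML-style `<|im_start|>` markup; in practice `apply_chat_template` generates this markup from a list of messages. As a rough, dependency-free illustration of what that formatting looks like for Qwen-style models (the real template lives in the model's tokenizer config, so this hand-rolled builder is an assumption for illustration, not the library API):

```python
def qwen_chat_prompt(messages, add_generation_prompt=True):
    # Build ChatML-style markup of the form used in the Qwen 3 example.
    parts = []
    for m in messages:
        parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n")
    if add_generation_prompt:
        # Open the assistant turn so the model continues from here.
        parts.append("<|im_start|>assistant\n")
    return "".join(parts)

prompt = qwen_chat_prompt([{"role": "user", "content": "Hello!"}])
print(prompt)
```

For production use, prefer `tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)` so the formatting always matches the model's own template.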

`include/ctranslate2/layers/attention.h` (3 additions & 0 deletions)

```diff
@@ -2,6 +2,7 @@

 #include "ctranslate2/layers/attention_layer.h"
 #include "ctranslate2/padder.h"
+#include "ctranslate2/layers/transformer.h"

 namespace ctranslate2 {
   namespace layers {
@@ -65,6 +66,8 @@ namespace ctranslate2 {
       dim_t _relative_right_max_position;
       const bool _merge_time_and_head_dims;
       const dim_t _cache_time_dim;
+      std::unique_ptr<const LayerNorm> _q_norm;  // Query normalization
+      std::unique_ptr<const LayerNorm> _k_norm;  // Key normalization
     };
   }
 }
```
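The new `_q_norm` / `_k_norm` members point at per-head query/key normalization ("QK-norm"), which recent architectures such as Gemma 3 and Qwen 3 apply before computing attention scores. A minimal pure-Python sketch of the idea, assuming RMS-style normalization over the head dimension (the shapes, epsilon, and identity scale here are illustrative assumptions, not CTranslate2's implementation):

```python
import math

def rms_norm(vec, weight, eps=1e-6):
    # RMS-normalize one head-dim vector, then apply a learned per-dimension scale.
    rms = math.sqrt(sum(x * x for x in vec) / len(vec) + eps)
    return [x / rms * w for x, w in zip(vec, weight)]

# One query head vector (head_dim = 4) and an identity scale; the same
# normalization would be applied to key vectors via _k_norm.
q_head = [0.5, -1.0, 2.0, 0.25]
scale = [1.0, 1.0, 1.0, 1.0]

q_normed = rms_norm(q_head, scale)
# After normalization the vector has approximately unit RMS, which keeps
# attention logits in a stable range regardless of the raw projection scale.
print(q_normed)
```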

`python/ctranslate2/__init__.py` (11 additions & 3 deletions)

```diff
@@ -5,10 +5,18 @@
 import glob
 import os

-import pkg_resources
-
 module_name = sys.modules[__name__].__name__
-package_dir = pkg_resources.resource_filename(module_name, "")
+
+# importlib.resources.files is only available on Python >= 3.9.
+try:
+    from importlib.resources import files
+
+    # Replaces the deprecated pkg_resources lookup.
+    package_dir = str(files(module_name))
+except ImportError:
+    import pkg_resources
+
+    package_dir = pkg_resources.resource_filename(module_name, "")

 add_dll_directory = getattr(os, "add_dll_directory", None)
 if add_dll_directory is not None:
```
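The try/except pattern above can be exercised against any importable package. A self-contained sketch using the stdlib `json` package as a stand-in for `ctranslate2`:

```python
try:
    # importlib.resources.files is available from Python 3.9 onward.
    from importlib.resources import files

    def package_dir(name):
        # files() returns a Traversable rooted at the package directory.
        return str(files(name))
except ImportError:
    # Deprecated fallback for older interpreters.
    import pkg_resources

    def package_dir(name):
        return pkg_resources.resource_filename(name, "")

print(package_dir("json"))
```

On Python >= 3.9 only the `importlib.resources` branch runs, so the deprecated `pkg_resources` import (and its warning) is never triggered.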
