diff --git a/examples/grpo/cosyvoice3/pretrained_to_huggingface.py b/examples/grpo/cosyvoice3/pretrained_to_huggingface.py
new file mode 100644
index 000000000..ddec609b7
--- /dev/null
+++ b/examples/grpo/cosyvoice3/pretrained_to_huggingface.py
@@ -0,0 +1,198 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage: Instruct TTS
+ python3 infer.py \
+ --token2wav-path /workspace/CosyVoice2-0.5B \
+ --prompt-text "吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" \
+ --prompt-speech-path ./assets/prompt_audio.wav \
+ --model-path ./transformers_cosyvoice2_llm \
+ --input-text "用四川话说<|endofprompt|>扁担长,板凳宽,扁担绑在板凳上。吃葡萄不吐葡萄皮,不吃葡萄倒吐葡萄皮。"
+"""
+from cosyvoice.cli.cosyvoice import CosyVoice3
+import sys
+from argparse import ArgumentParser
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+sys.path.append("/workspace/CosyVoice/third_party/Matcha-TTS")
+
+
+def get_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--pretrained-cosyvoice3-path",
+ type=str,
+ default="/workspace/CosyVoice2-0.5B",
+ help="Token2Wav path, default to %(default)r"
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default='./transformers_cosyvoice3_llm',
+ help="The path to save the model"
+ )
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = get_args()
+ cosy3_model = CosyVoice3(args.pretrained_cosyvoice3_path)
+
+ # text tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(f"{args.pretrained_cosyvoice3_path}/CosyVoice-BlankEN")
+
+ llm = cosy3_model.model.llm.llm.model
+
+ # speech token embedding (with sos/eos, etc), removing llm embedding
+ speech_embedding = cosy3_model.model.llm.speech_embedding
+ llm_decoder = cosy3_model.model.llm.llm_decoder
+
+ special_tokens = {
+ 'eos_token': '<|endoftext|>',
+ 'pad_token': '<|endoftext|>',
+ 'additional_special_tokens': [
+ '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
+ '[breath]', '', '', '[noise]', '[laughter]', '[cough]', '[clucking]', '[accent]', '[quick_breath]',
+ "", "", "[hissing]", "[sigh]", "[vocalized-noise]", "[lipsmack]", "[mn]", "<|endofsystem|>",
+ "[AA]", "[AA0]", "[AA1]", "[AA2]", "[AE]", "[AE0]", "[AE1]", "[AE2]", "[AH]", "[AH0]", "[AH1]", "[AH2]",
+ "[AO]", "[AO0]", "[AO1]", "[AO2]", "[AW]", "[AW0]", "[AW1]", "[AW2]", "[AY]", "[AY0]", "[AY1]", "[AY2]",
+ "[B]", "[CH]", "[D]", "[DH]", "[EH]", "[EH0]", "[EH1]", "[EH2]", "[ER]", "[ER0]", "[ER1]", "[ER2]", "[EY]",
+ "[EY0]", "[EY1]", "[EY2]", "[F]", "[G]", "[HH]", "[IH]", "[IH0]", "[IH1]", "[IH2]", "[IY]", "[IY0]", "[IY1]",
+ "[IY2]", "[JH]", "[K]", "[L]", "[M]", "[N]", "[NG]", "[OW]", "[OW0]", "[OW1]", "[OW2]", "[OY]", "[OY0]",
+ "[OY1]", "[OY2]", "[P]", "[R]", "[S]", "[SH]", "[T]", "[TH]", "[UH]", "[UH0]", "[UH1]", "[UH2]", "[UW]",
+ "[UW0]", "[UW1]", "[UW2]", "[V]", "[W]", "[Y]", "[Z]", "[ZH]",
+ "[a]", "[ai]", "[an]", "[ang]", "[ao]", "[b]", "[c]", "[ch]", "[d]", "[e]", "[ei]", "[en]", "[eng]", "[f]",
+ "[g]", "[h]", "[i]", "[ian]", "[in]", "[ing]", "[iu]", "[ià]", "[iàn]", "[iàng]", "[iào]", "[iá]", "[ián]",
+ "[iáng]", "[iáo]", "[iè]", "[ié]", "[iòng]", "[ióng]", "[iù]", "[iú]", "[iā]", "[iān]", "[iāng]", "[iāo]",
+ "[iē]", "[iě]", "[iōng]", "[iū]", "[iǎ]", "[iǎn]", "[iǎng]", "[iǎo]", "[iǒng]", "[iǔ]", "[j]", "[k]", "[l]",
+ "[m]", "[n]", "[o]", "[ong]", "[ou]", "[p]", "[q]", "[r]", "[s]", "[sh]", "[t]", "[u]", "[uang]", "[ue]",
+ "[un]", "[uo]", "[uà]", "[uài]", "[uàn]", "[uàng]", "[uá]", "[uái]", "[uán]", "[uáng]", "[uè]", "[ué]", "[uì]",
+ "[uí]", "[uò]", "[uó]", "[uā]", "[uāi]", "[uān]", "[uāng]", "[uē]", "[uě]", "[uī]", "[uō]", "[uǎ]", "[uǎi]",
+ "[uǎn]", "[uǎng]", "[uǐ]", "[uǒ]", "[vè]", "[w]", "[x]", "[y]", "[z]", "[zh]", "[à]", "[ài]", "[àn]", "[àng]",
+ "[ào]", "[á]", "[ái]", "[án]", "[áng]", "[áo]", "[è]", "[èi]", "[èn]", "[èng]", "[èr]", "[é]", "[éi]", "[én]",
+ "[éng]", "[ér]", "[ì]", "[ìn]", "[ìng]", "[í]", "[ín]", "[íng]", "[ò]", "[òng]", "[òu]", "[ó]", "[óng]", "[óu]",
+ "[ù]", "[ùn]", "[ú]", "[ún]", "[ā]", "[āi]", "[ān]", "[āng]", "[āo]", "[ē]", "[ēi]", "[ēn]", "[ēng]", "[ě]",
+ "[ěi]", "[ěn]", "[ěng]", "[ěr]", "[ī]", "[īn]", "[īng]", "[ō]", "[ōng]", "[ōu]", "[ū]", "[ūn]", "[ǎ]", "[ǎi]",
+ "[ǎn]", "[ǎng]", "[ǎo]", "[ǐ]", "[ǐn]", "[ǐng]", "[ǒ]", "[ǒng]", "[ǒu]", "[ǔ]", "[ǔn]", "[ǘ]", "[ǚ]", "[ǜ]"
+ ]
+ }
+ tokenizer.add_special_tokens(special_tokens)
+
+ original_tokenizer_vocab_size = len(tokenizer)
+ cosyvoice3_token_size = 6561
+ total_speech_tokens = cosyvoice3_token_size + 200
+
+ new_tokens = [
+ f"<|s_{i}|>" for i in range(total_speech_tokens)
+ ] + [
+ "<|sos|>", "<|eos|>", "<|task_id|>"
+ ]
+ num_added_tokens = tokenizer.add_tokens(new_tokens)
+
+ speech_start_idx = tokenizer.convert_tokens_to_ids("<|s_0|>")
+
+ speech_end_idx = tokenizer.convert_tokens_to_ids(
+ f"<|s_{total_speech_tokens - 1}|>"
+ ) + 1
+
+ assert speech_start_idx != tokenizer.unk_token_id, "missing <|s_0|> in tokenizer"
+ assert (speech_end_idx - speech_start_idx) == total_speech_tokens, (
+ f"speech token span mismatch: got {speech_end_idx - speech_start_idx}, "
+ f"expected {total_speech_tokens}"
+ )
+
+ llm.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)
+ vocab_size = llm.get_input_embeddings().weight.shape[0]
+
+ feature_size = speech_embedding.embedding_dim
+ print(f'feature_size: {feature_size}, vocab_size: {vocab_size}')
+
+ new_lm_head = torch.nn.Linear(
+ in_features=feature_size,
+ out_features=vocab_size,
+ bias=False
+ )
+
+ control_source_tokens = {
+ "sos": f"<|s_{cosyvoice3_token_size + 0}|>",
+ "eos": f"<|s_{cosyvoice3_token_size + 1}|>",
+ "task_id": f"<|s_{cosyvoice3_token_size + 2}|>",
+ }
+ alias_source_map = {
+ "<|sos|>": control_source_tokens["sos"],
+ "<|eos|>": control_source_tokens["eos"],
+ "<|task_id|>": control_source_tokens["task_id"],
+ }
+
+ # output lm head
+ with torch.no_grad():
+ # set the weight and bias of the new lm_head to 0
+ new_lm_head.weight.data.zero_()
+
+ target_slice = slice(speech_start_idx, speech_end_idx)
+
+ assert llm_decoder.weight.shape[0] == (target_slice.stop - target_slice.start), \
+ f"dim mistach: llm_decoder {llm_decoder.weight.shape[0]} vs 目标切片 {target_slice.stop - target_slice.start}"
+
+ new_lm_head.weight[target_slice] = llm_decoder.weight
+
+ for alias_token, source_token in alias_source_map.items():
+ alias_id = tokenizer.convert_tokens_to_ids(alias_token)
+ source_id = tokenizer.convert_tokens_to_ids(source_token)
+ assert alias_id != tokenizer.unk_token_id, f"missing alias token: {alias_token}"
+ assert source_id != tokenizer.unk_token_id, f"missing source token: {source_token}"
+ new_lm_head.weight[alias_id] = new_lm_head.weight[source_id]
+
+ llm.lm_head = new_lm_head
+
+ input_embeddings = llm.get_input_embeddings()
+
+ with torch.no_grad():
+ input_embeddings.weight[target_slice] = speech_embedding.weight
+
+ for alias_token, source_token in alias_source_map.items():
+ alias_id = tokenizer.convert_tokens_to_ids(alias_token)
+ source_id = tokenizer.convert_tokens_to_ids(source_token)
+ input_embeddings.weight[alias_id] = input_embeddings.weight[source_id]
+
+ alias_eos_token_id = tokenizer.convert_tokens_to_ids("<|eos|>")
+ real_eos_token_id = tokenizer.convert_tokens_to_ids(control_source_tokens["eos"])
+
+ llm.generation_config.eos_token_id = [alias_eos_token_id, real_eos_token_id]
+ llm.generation_config.pad_token_id = tokenizer.pad_token_id
+ llm.generation_config.temperature = 1.0
+ llm.generation_config.top_p = 0.8
+ llm.generation_config.top_k = 25
+
+ llm.config.eos_token_id = real_eos_token_id
+ llm.config.vocab_size = vocab_size
+ llm.config.tie_word_embeddings = False
+ llm.config.use_bias = False
+ llm.to(torch.bfloat16)
+ llm.save_pretrained(args.save_path)
+
+ TEMPLATE = (
+ "{%- for message in messages %}"
+ "{%- if message['role'] == 'user' %}"
+ "{{- '<|sos|>' + message['content'] + '<|task_id|>' }}"
+ "{%- elif message['role'] == 'assistant' %}"
+ "{{- message['content']}}"
+ "{%- endif %}"
+ "{%- endfor %}"
+ )
+
+ tokenizer.chat_template = TEMPLATE
+ tokenizer.save_pretrained(args.save_path)
\ No newline at end of file