diff --git a/README.md b/README.md index 6308ac2cd..7968de2d4 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,13 @@ # centos sudo yum install sox sox-devel ``` + +### llama-cpp-python Backend (optional) +For CPU/low-VRAM inference using GGUF quantized models: +```sh +pip install llama-cpp-python +``` +Download quantized GGUF model from [Ferraronp/CosyVoice3-qwen2.5-0.5b-speech-gguf](https://huggingface.co/Ferraronp/CosyVoice3-qwen2.5-0.5b-speech-gguf). ### Model download @@ -151,6 +158,17 @@ Follow the code in `example.py` for detailed usage of each model. python example.py ``` +#### llama.cpp Usage +Replace the default model initialization with: +```python +cosyvoice = AutoModel( + model_dir='pretrained_models/Fun-CosyVoice3-0.5B', + load_llama_cpp=True, + gguf_model_path='/path/to/model.gguf' +) +``` +All existing inference methods (`inference_zero_shot`, etc.) work unchanged. + #### vLLM Usage CosyVoice2/3 now supports **vLLM 0.11.x+ (V1 engine)** and **vLLM 0.9.0 (legacy)**. Older vllm version(<0.9.0) do not support CosyVoice inference, and versions in between (e.g., 0.10.x) are not tested. diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index 7ab04a70f..b95883aa3 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -13,7 +13,9 @@ # limitations under the License. import os import time -from typing import Generator +import threading +from typing import Generator, List +import numpy as np from tqdm import tqdm from hyperpyyaml import load_hyperpyyaml from modelscope import snapshot_download @@ -188,9 +190,33 @@ def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk class CosyVoice3(CosyVoice2): - def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1): + def __init__( + self, + model_dir, + load_trt=False, + load_vllm=False, + fp16=False, + trt_concurrent=1, + # llama.cpp parameters + load_llama_cpp=False, + gguf_model_path=None, + ): self.model_dir = model_dir self.fp16 = fp16 + self.gguf_model_path = gguf_model_path + + # match model's training context length + self.llm_n_ctx = 32768 + # standard llama params + self.llm_temperature = 0.8 + self.llm_top_p = 0.95 + self.llm_top_k = 25 + + if load_llama_cpp and not gguf_model_path: + raise ValueError('gguf_model_path must be provided when load_llama_cpp=True') + if load_llama_cpp and not os.path.exists(gguf_model_path): + raise FileNotFoundError('gguf_model_path not found: {}'.format(gguf_model_path)) + if not os.path.exists(model_dir): model_dir = snapshot_download(model_dir) hyper_yaml_path = '{}/cosyvoice3.yaml'.format(model_dir) @@ -209,10 +235,15 @@ def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_c if torch.cuda.is_available() is False and (load_trt is True or fp16 is True): load_trt, fp16 = False, False logging.warning('no cuda device, set load_trt/fp16 to False') + self.model = CosyVoice3Model(configs['llm'], configs['flow'], configs['hift'], fp16) + + # When using llama.cpp, skip loading PyTorch LLM weights to save VRAM self.model.load('{}/llm.pt'.format(model_dir), '{}/flow.pt'.format(model_dir), - '{}/hift.pt'.format(model_dir)) + '{}/hift.pt'.format(model_dir), + load_llm=not load_llama_cpp) + if load_vllm: self.model.load_vllm('{}/vllm'.format(model_dir)) if load_trt: @@ -222,8 +253,347 @@ def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_c '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir), trt_concurrent, self.fp16) + + # Initialize llama.cpp if GGUF 
path provided + self._llama_cpp_loaded = False + if load_llama_cpp: + self._init_speech_token_metadata() + self._load_llama_cpp(gguf_model_path) + logging.info('CosyVoice3 initialized with llama.cpp backend (gguf={})'.format(gguf_model_path)) + del configs + # ------------------------------------------------------------------------- + # llama.cpp integration + # ------------------------------------------------------------------------- + + def _init_speech_token_metadata(self): + """Initialize speech token ID constants for llama.cpp token mapping.""" + self.base_speech_token_size = 6561 + self.embedding_size = 6561 + 200 + self.speech_token_offset = 151936 + self.sos_speech_idx = self.base_speech_token_size + 0 # 6561 + self.eos_speech_idx = self.base_speech_token_size + 1 # 6562 + self.task_id_speech_idx = self.base_speech_token_size + 2 # 6563 + + def _load_llama_cpp(self, gguf_model_path): + """Load GGUF model via llama-cpp-python.""" + from llama_cpp import Llama + + self.llm_gguf = Llama( + model_path=gguf_model_path, + n_gpu_layers=-1, + n_ctx=self.llm_n_ctx, + logits_all=True, + verbose=False, + temperature=self.llm_temperature, + top_p=self.llm_top_p, + top_k=self.llm_top_k, + ) + + self.sos_token_id = self.speech_token_offset + self.sos_speech_idx + self.eos_token_id = self.speech_token_offset + self.eos_speech_idx + self.task_id_token_id = self.speech_token_offset + self.task_id_speech_idx + + self._llama_cpp_loaded = True + + def _sample_speech_token_constrained(self, logit_pos): + """Sample next token constrained to speech tokens + EOS only. + + Uses manual logit extraction at the correct position. + Fallback when built-in sample() produces text tokens. + """ + logits = np.array(self.llm_gguf.scores[logit_pos], dtype=np.float32) + n_vocab = len(logits) + + # Mask: only allow speech tokens [offset, offset+base_size) and EOS + valid = np.full(n_vocab, False) + s = self.speech_token_offset + e = min(s + self.base_speech_token_size, n_vocab) + valid[s:e] = True + if self.eos_token_id < n_vocab: + valid[self.eos_token_id] = True + logits[~valid] = -np.inf + + logits = logits / max(self.llm_temperature, 1e-8) + logits -= logits[valid].max() + probs = np.exp(logits) + probs /= probs.sum() + + if self.llm_top_k > 0: + top_k = min(self.llm_top_k, int(np.sum(probs > 0))) + if top_k > 0: + threshold = np.sort(probs)[-top_k] + probs[probs < threshold] = 0.0 + probs /= probs.sum() + + if self.llm_top_p < 1.0: + sorted_idx = np.argsort(probs)[::-1] + cum = np.cumsum(probs[sorted_idx]) + cutoff = np.searchsorted(cum, self.llm_top_p) + 1 + keep = sorted_idx[:cutoff] + mask = np.zeros_like(probs) + mask[keep] = probs[keep] + probs = mask / mask.sum() + + return int(np.random.choice(n_vocab, p=probs)) + + def _run_llama_cpp_inference( + self, + text_token_ids: List[int], + prompt_text_token_ids: List[int], + prompt_speech_tokens: List[int], + ) -> List[int]: + """ + Run llama.cpp inference to generate speech tokens. + + Uses pre-tokenized IDs from the CosyVoice frontend (same as PyTorch path). 
+ Format: [SOS] + prompt_text_ids + text_ids + [TASK_ID] + offset(prompt_speech_tokens) + """ + all_text_ids = prompt_text_token_ids + text_token_ids + prompt_speech_ids = [self.speech_token_offset + t for t in prompt_speech_tokens] + input_ids = [self.sos_token_id] + all_text_ids + [self.task_id_token_id] + prompt_speech_ids + + self.llm_gguf.reset() + self.llm_gguf.eval(input_ids) + + # Track position for constrained sampling fallback + n_past = len(input_ids) + + speech_tokens = [] + raw_generated = [] + max_new_tokens = 2048 + + for i in range(max_new_tokens): + # Use built-in sample() (position-aware, like FastCosyVoice) + next_token_id = self.llm_gguf.sample() + + # If built-in sample returns text token, retry with constrained sampling + if (next_token_id != self.eos_token_id and + not (self.speech_token_offset <= next_token_id < self.speech_token_offset + self.base_speech_token_size)): + if i == 0: + logging.info('Built-in sample() returned text token {} on step 0, switching to constrained'.format(next_token_id)) + next_token_id = self._sample_speech_token_constrained(logit_pos=n_past - 1) + + raw_generated.append(next_token_id) + + if next_token_id == self.eos_token_id: + break + + if self.speech_token_offset <= next_token_id < self.speech_token_offset + self.base_speech_token_size: + speech_tokens.append(next_token_id - self.speech_token_offset) + else: + break + + self.llm_gguf.eval([next_token_id]) + n_past += 1 + + return speech_tokens + + def _llama_cpp_job( + self, + text_token_ids: List[int], + prompt_text_token_ids: List[int], + prompt_speech_tokens: List[int], + tokens_list: list, + llm_end_flag: dict, + tokens_lock: threading.Lock, + ): + """Thread target: generate all speech tokens via llama.cpp and fill shared tokens_list.""" + try: + speech_tokens = self._run_llama_cpp_inference( + text_token_ids=text_token_ids, + prompt_text_token_ids=prompt_text_token_ids, + prompt_speech_tokens=prompt_speech_tokens, + ) + with tokens_lock: + tokens_list.extend(speech_tokens) + except Exception as e: + logging.error('llama.cpp inference error: {}'.format(e), exc_info=True) + finally: + llm_end_flag['done'] = True + + # ------------------------------------------------------------------------- + # Overridden inference methods with llama.cpp support + # ------------------------------------------------------------------------- + + def _extract_token_ids(self, model_input): + """Extract token ID lists from frontend model_input dict.""" + text_ids = model_input['text'].squeeze(0).tolist() + prompt_text_ids = model_input.get('prompt_text', torch.zeros(1, 0, dtype=torch.int32)).squeeze(0).tolist() + prompt_speech_ids = model_input.get('llm_prompt_speech_token', torch.zeros(1, 0, dtype=torch.int32)).squeeze(0).tolist() + return text_ids, prompt_text_ids, prompt_speech_ids + + def inference_zero_shot(self, tts_text, prompt_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True): + if not self._llama_cpp_loaded: + yield from super().inference_zero_shot(tts_text, prompt_text, prompt_wav, zero_shot_spk_id, stream, speed, text_frontend) + return + + # Generator text input: consume into string (llama.cpp needs full text upfront) + if hasattr(tts_text, '__next__'): + tts_text = ''.join(tts_text) + logging.info('Consumed generator text: {}'.format(tts_text[:100])) + + prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend) + for text_chunk in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)): + 
if (not isinstance(text_chunk, Generator)) and len(text_chunk) < 0.5 * len(prompt_text): + logging.warning('synthesis text {} too short than prompt text'.format(text_chunk)) + + model_input = self.frontend.frontend_zero_shot(text_chunk, prompt_text, prompt_wav, self.sample_rate, zero_shot_spk_id) + start_time = time.time() + logging.info('synthesis text {}'.format(text_chunk)) + + text_ids, prompt_text_ids, prompt_speech_ids = self._extract_token_ids(model_input) + + if stream: + tokens_list = [] + tokens_lock = threading.Lock() + llm_end_flag = {'done': False} + + llm_thread = threading.Thread( + target=self._llama_cpp_job, + args=(text_ids, prompt_text_ids, prompt_speech_ids, + tokens_list, llm_end_flag, tokens_lock), + daemon=True + ) + llm_thread.start() + + for model_output in self.model.tts_stream_external_llm( + tokens_list=tokens_list, + tokens_lock=tokens_lock, + llm_end_flag=llm_end_flag, + **{k: v for k, v in model_input.items() if k.startswith('flow') or k.startswith('prompt_speech')} + ): + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output + start_time = time.time() + + llm_thread.join(timeout=5.0) + else: + speech_tokens = self._run_llama_cpp_inference( + text_token_ids=text_ids, + prompt_text_token_ids=prompt_text_ids, + prompt_speech_tokens=prompt_speech_ids, + ) + model_output = self.model.tts_with_external_tokens( + tokens=speech_tokens, + speed=speed, + **{k: v for k, v in model_input.items() if k.startswith('flow') or k.startswith('prompt_speech')} + ) + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output + + def inference_cross_lingual(self, tts_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True): + if not self._llama_cpp_loaded: + yield from super().inference_cross_lingual(tts_text, prompt_wav, zero_shot_spk_id, stream, speed, text_frontend) + return + + if hasattr(tts_text, '__next__'): + tts_text = ''.join(tts_text) + + for text_chunk in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)): + model_input = self.frontend.frontend_cross_lingual(text_chunk, prompt_wav, self.sample_rate, zero_shot_spk_id) + start_time = time.time() + logging.info('synthesis text {}'.format(text_chunk)) + + text_ids, prompt_text_ids, prompt_speech_ids = self._extract_token_ids(model_input) + + if stream: + tokens_list = [] + tokens_lock = threading.Lock() + llm_end_flag = {'done': False} + + llm_thread = threading.Thread( + target=self._llama_cpp_job, + args=(text_ids, prompt_text_ids, prompt_speech_ids, + tokens_list, llm_end_flag, tokens_lock), + daemon=True + ) + llm_thread.start() + + for model_output in self.model.tts_stream_external_llm( + tokens_list=tokens_list, + tokens_lock=tokens_lock, + llm_end_flag=llm_end_flag, + **{k: v for k, v in model_input.items() if k.startswith('flow') or k.startswith('prompt_speech')} + ): + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output + start_time = time.time() + + llm_thread.join(timeout=5.0) + else: + speech_tokens = self._run_llama_cpp_inference( + text_token_ids=text_ids, + prompt_text_token_ids=prompt_text_ids, + 
prompt_speech_tokens=prompt_speech_ids, + ) + model_output = self.model.tts_with_external_tokens( + tokens=speech_tokens, + speed=speed, + **{k: v for k, v in model_input.items() if k.startswith('flow') or k.startswith('prompt_speech')} + ) + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output + + def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True): + if not self._llama_cpp_loaded: + yield from super().inference_instruct2(tts_text, instruct_text, prompt_wav, zero_shot_spk_id, stream, speed, text_frontend) + return + + if hasattr(tts_text, '__next__'): + tts_text = ''.join(tts_text) + + for text_chunk in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)): + model_input = self.frontend.frontend_instruct2(text_chunk, instruct_text, prompt_wav, self.sample_rate, zero_shot_spk_id) + start_time = time.time() + logging.info('synthesis text {}'.format(text_chunk)) + + text_ids, prompt_text_ids, prompt_speech_ids = self._extract_token_ids(model_input) + + if stream: + tokens_list = [] + tokens_lock = threading.Lock() + llm_end_flag = {'done': False} + + llm_thread = threading.Thread( + target=self._llama_cpp_job, + args=(text_ids, prompt_text_ids, prompt_speech_ids, + tokens_list, llm_end_flag, tokens_lock), + daemon=True + ) + llm_thread.start() + + for model_output in self.model.tts_stream_external_llm( + tokens_list=tokens_list, + tokens_lock=tokens_lock, + llm_end_flag=llm_end_flag, + **{k: v for k, v in model_input.items() if k.startswith('flow') or k.startswith('prompt_speech')} + ): + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output + start_time = time.time() + + llm_thread.join(timeout=5.0) + else: + speech_tokens = self._run_llama_cpp_inference( + text_token_ids=text_ids, + prompt_text_token_ids=prompt_text_ids, + prompt_speech_tokens=prompt_speech_ids, + ) + model_output = self.model.tts_with_external_tokens( + tokens=speech_tokens, + speed=speed, + **{k: v for k, v in model_input.items() if k.startswith('flow') or k.startswith('prompt_speech')} + ) + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output def AutoModel(**kwargs): if not os.path.exists(kwargs['model_dir']): diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 92a15d985..94f9ae61c 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -62,9 +62,10 @@ def __init__(self, self.hift_cache_dict = {} self.silent_tokens = [] - def load(self, llm_model, flow_model, hift_model): - self.llm.load_state_dict(torch.load(llm_model, map_location=self.device, weights_only=True), strict=True) - self.llm.to(self.device).eval() + def load(self, llm_model, flow_model, hift_model, load_llm=True): + if load_llm: + self.llm.load_state_dict(torch.load(llm_model, map_location=self.device, weights_only=True), strict=True) + self.llm.to(self.device).eval() self.flow.load_state_dict(torch.load(flow_model, map_location=self.device, weights_only=True), strict=True) self.flow.to(self.device).eval() # in case hift_model is a hifigan model @@ -448,3 +449,110 @@ def token2wav(self, token, 
prompt_token, prompt_feat, embedding, token_offset, u tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:] self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1] return tts_speech + + def tts_with_external_tokens( + self, + tokens, + flow_embedding=torch.zeros(0, 192), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), + speed=1.0, + **kwargs + ): + """Non-streaming TTS with pre-generated speech tokens (for llama.cpp backend).""" + if not tokens: + return {'tts_speech': torch.zeros(1, 0)} + + this_uuid = str(uuid.uuid1()) + with self.lock: + self.hift_cache_dict[this_uuid] = None + + tokens_gpu = torch.tensor(tokens, dtype=torch.int32, device=self.device).unsqueeze(0) + tts_speech = self.token2wav( + token=tokens_gpu, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + token_offset=0, + uuid=this_uuid, + finalize=True, + speed=speed, + ) + + with self.lock: + self.hift_cache_dict.pop(this_uuid) + return {'tts_speech': tts_speech.cpu()} + + def tts_stream_external_llm( + self, + tokens_list, + tokens_lock, + llm_end_flag, + flow_embedding=torch.zeros(0, 192), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), + **kwargs + ): + """Streaming TTS with external LLM providing tokens via shared list + lock.""" + this_uuid = str(uuid.uuid1()) + with self.lock: + self.hift_cache_dict[this_uuid] = None + + token_offset = 0 + prompt_token_pad = int( + np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) + * self.token_hop_len + - flow_prompt_speech_token.shape[1] + ) + + try: + while True: + time.sleep(0.1) + this_token_hop_len = ( + self.token_hop_len + prompt_token_pad if token_offset == 0 + else self.token_hop_len + ) + with tokens_lock: + available = len(tokens_list) - token_offset + need = this_token_hop_len + self.flow.pre_lookahead_len + + if available >= need: + with tokens_lock: + batch = list(tokens_list[:token_offset + need]) + this_tts_speech_token = torch.tensor(batch).unsqueeze(0) + tts_speech = self.token2wav( + token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + token_offset=token_offset, + uuid=this_uuid, + stream=True, + finalize=False, + ) + token_offset += this_token_hop_len + yield {'tts_speech': tts_speech.cpu()} + + if llm_end_flag['done'] and available < need: + break + + # Final batch + with tokens_lock: + final_tokens = list(tokens_list) + if final_tokens and token_offset < len(final_tokens): + this_tts_speech_token = torch.tensor(final_tokens).unsqueeze(0) + tts_speech = self.token2wav( + token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + token_offset=token_offset, + uuid=this_uuid, + finalize=True, + ) + yield {'tts_speech': tts_speech.cpu()} + finally: + with self.lock: + self.hift_cache_dict.pop(this_uuid, None) + if torch.cuda.is_available(): + torch.cuda.empty_cache()
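
The streaming path added in `model.py` hands tokens from the llama.cpp thread to the flow/hift stage through three shared objects: a plain Python list of speech token ids, a `threading.Lock` guarding it, and a `{'done': bool}` flag. Below is a minimal sketch of that contract, assuming a loaded `CosyVoice3Model` and a frontend `model_input` dict; the `stream_with_external_producer` helper and its argument names are illustrative only and are not part of this change:

```python
import threading


def stream_with_external_producer(model, model_input, producer_fn):
    """Drive CosyVoice3Model.tts_stream_external_llm with an arbitrary token producer.

    `model` is a loaded CosyVoice3Model and `model_input` is the dict returned by the
    frontend (frontend_zero_shot / frontend_cross_lingual / frontend_instruct2).
    `producer_fn(tokens_list, llm_end_flag, tokens_lock)` must append speech token ids
    to tokens_list (while holding tokens_lock) and finally set llm_end_flag['done'] = True,
    mirroring what CosyVoice3._llama_cpp_job does. Illustrative helper, not part of the PR.
    """
    tokens_list, tokens_lock = [], threading.Lock()
    llm_end_flag = {'done': False}

    # Producer runs in the background, exactly like the llm_thread in inference_zero_shot.
    threading.Thread(
        target=producer_fn,
        args=(tokens_list, llm_end_flag, tokens_lock),
        daemon=True,
    ).start()

    # Consumer: only the flow_* / prompt_speech_* entries of model_input are needed
    # by the vocoder path, matching the filter used in the overridden inference methods.
    yield from model.tts_stream_external_llm(
        tokens_list=tokens_list,
        tokens_lock=tokens_lock,
        llm_end_flag=llm_end_flag,
        **{k: v for k, v in model_input.items()
           if k.startswith('flow') or k.startswith('prompt_speech')},
    )
```

In normal use this wiring is handled internally by `CosyVoice3(..., load_llama_cpp=True, gguf_model_path=...)`; the sketch only makes the producer/consumer handoff explicit.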