From 424127eea54be60491a3050eb853a9f6182e71ee Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 6 May 2026 17:57:29 +0800 Subject: [PATCH 1/2] fix kernels --- requirements/framework.txt | 2 +- swift/infer_engine/transformers_engine.py | 25 ++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index cc6f80636f..66ad03f7f4 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -32,7 +32,7 @@ sortedcontainers>=1.5.9 tensorboard tiktoken tqdm -transformers>=4.33,<5.8.0 +transformers>=4.33,<5.9.0 transformers_stream_generator trl>=0.15,<1.0 uvicorn diff --git a/swift/infer_engine/transformers_engine.py b/swift/infer_engine/transformers_engine.py index ac7e963717..01cfede92a 100644 --- a/swift/infer_engine/transformers_engine.py +++ b/swift/infer_engine/transformers_engine.py @@ -7,6 +7,7 @@ import time import torch import torch.nn.functional as F +from contextlib import contextmanager from copy import deepcopy from PIL import Image from queue import Queue @@ -21,7 +22,7 @@ from swift.model import get_model_processor from swift.template import Template from swift.tuners import Swift -from swift.utils import get_last_valid_indices, safe_snapshot_download, to_device +from swift.utils import get_last_valid_indices, safe_snapshot_download, to_device, use_hf_hub from .infer_engine import InferEngine from .protocol import (ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingResponse, @@ -240,7 +241,8 @@ def _infer_stream(self, inputs: Dict[str, Any], *, generation_config: Generation def _model_generate(**kwargs): if is_torch_npu_available(): torch.npu.set_device(self.model.device) - self.template.generate(self.model, **kwargs) + with self._patch_kernels(): + self.template.generate(self.model, **kwargs) generate_kwargs = self.template.prepare_generate_kwargs(generate_kwargs, model=self.model) thread = Thread(target=_model_generate, kwargs=generate_kwargs) @@ -384,6 +386,22 @@ def _infer_forward(self, inputs: Dict[str, Any], adapter_request: Optional[Adapt res.append(ChatCompletionResponse(model=self.model_name, choices=choices, usage=usage_info)) return res + @contextmanager + def _patch_kernels(self): + use_hf = self.use_hf + if use_hf is None: + use_hf = True if use_hf_hub() else False + if not use_hf: + try: + from modelscope import patch_hub, unpatch_hub + except ImportError: + use_hf = True + if not use_hf: + patch_hub() + yield + if not use_hf: + unpatch_hub() + def _infer_full(self, inputs: Dict[str, Any], *, generation_config: GenerationConfig, adapter_request: Optional[AdapterRequest], request_config: RequestConfig, template_inputs) -> List[ChatCompletionResponse]: @@ -394,7 +412,8 @@ def _infer_full(self, inputs: Dict[str, Any], *, generation_config: GenerationCo generate_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) generate_kwargs = self.template.prepare_generate_kwargs(generate_kwargs, model=self.model) - output = dict(self.template.generate(self.model, **generate_kwargs)) + with self._patch_kernels(): + output = dict(self.template.generate(self.model, **generate_kwargs)) output.pop('past_key_values', None) batched_generate_ids = output['sequences'] batched_generate_ids = self.template.get_generate_ids(batched_generate_ids, num_prompt_tokens) From f683b405f3e3d958bd21247073247350c2ffa2d8 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 6 May 2026 20:08:00 +0800 Subject: [PATCH 2/2] fix --- swift/infer_engine/transformers_engine.py | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/swift/infer_engine/transformers_engine.py b/swift/infer_engine/transformers_engine.py index 01cfede92a..6dbbe686e9 100644 --- a/swift/infer_engine/transformers_engine.py +++ b/swift/infer_engine/transformers_engine.py @@ -388,18 +388,23 @@ def _infer_forward(self, inputs: Dict[str, Any], adapter_request: Optional[Adapt @contextmanager def _patch_kernels(self): - use_hf = self.use_hf - if use_hf is None: - use_hf = True if use_hf_hub() else False - if not use_hf: - try: - from modelscope import patch_hub, unpatch_hub - except ImportError: - use_hf = True - if not use_hf: + use_hf = self.use_hf if self.use_hf is not None else use_hf_hub() + if use_hf: + yield + return + + try: + from modelscope import patch_hub, unpatch_hub + except ImportError: + yield + return + try: patch_hub() - yield - if not use_hf: + except AttributeError: + pass + try: + yield + finally: unpatch_hub() def _infer_full(self, inputs: Dict[str, Any], *, generation_config: GenerationConfig,