-
Notifications
You must be signed in to change notification settings - Fork 2.8k
feat: Qwen asr speech recognition #3882
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,67 @@ | ||
| # coding=utf-8 | ||
| import traceback | ||
| from typing import Dict, Any | ||
|
|
||
| from common import forms | ||
| from common.exception.app_exception import AppApiException | ||
| from common.forms import BaseForm | ||
| from models_provider.base_model_provider import BaseModelCredential, ValidCode | ||
| from django.utils.translation import gettext as _ | ||
|
|
||
| from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelParams | ||
|
|
||
|
|
||
class AliyunBaiLianAsrSTTModelCredential(BaseForm, BaseModelCredential):
    """Credential form for the Aliyun BaiLian ASR speech-to-text model.

    Declares the two credential fields (API URL and API key) and implements
    validation, API-key encryption and the model-parameter form lookup.
    """

    api_url = forms.TextInputField(_('API URL'), required=True)
    api_key = forms.PasswordInputField(_('API Key'), required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False
                 ) -> bool:
        """Check that the credential can be used to build the model.

        Args:
            model_type: Model type value; must match one of the entries
                returned by ``provider.get_model_type_list()``.
            model_name: Name of the model to build.
            model_credential: Credential mapping; must contain ``api_key``.
            model_params: Accepted for interface compatibility; not used here.
            provider: Object exposing ``get_model_type_list`` and ``get_model``.
            raise_exception: When true, failures raise ``AppApiException``
                instead of returning ``False``.

        Returns:
            ``True`` when the model could be built, ``False`` when a check
            failed and ``raise_exception`` is false.

        Raises:
            AppApiException: Always for an unsupported model type; for any
                ``AppApiException`` raised while building the model; and for
                other failures when ``raise_exception`` is true.
        """
        # An unsupported model type is always an error, regardless of
        # raise_exception.
        for type_entry in provider.get_model_type_list():
            if type_entry.get('value') == model_type:
                break
        else:
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{model_type} Model type is not supported').format(model_type=model_type)
            )

        # Report only the first missing credential key.
        missing_key = next(
            (name for name in ('api_key',) if name not in model_credential),
            None,
        )
        if missing_key is not None:
            if not raise_exception:
                return False
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{key} is required').format(key=missing_key)
            )

        try:
            provider.get_model(model_type, model_name, model_credential)
        except Exception as exc:
            traceback.print_exc()
            # Application-level errors propagate unchanged.
            if isinstance(exc, AppApiException):
                raise exc
            if not raise_exception:
                return False
            raise AppApiException(
                ValidCode.valid_error.value,
                _('Verification failed, please check whether the parameters are correct: {error}').format(
                    error=str(exc))
            )
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        """Return a copy of ``model`` whose ``api_key`` entry is encrypted.

        A missing ``api_key`` is treated as the empty string before it is
        handed to the inherited ``encryption`` helper.
        """
        masked = dict(model)
        masked['api_key'] = super().encryption(model.get('api_key', ''))
        return masked

    def get_model_params_setting_form(self, model_name):
        """Return the parameter form for this model.

        The same form is returned for every ``model_name``.
        """
        return AliyunBaiLianOmiSTTModelParams()
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The provided code appears to be a Django form class that handles the authentication credentials used to access API services such as Aliyun Bai Lian STT (Speech-to-Text). Here are some points of interest and suggestions: Main Features
Potential Improvements
By addressing these suggestions and improving documentation, the code would become more maintainable, easier to debug, and potentially faster. Note: These are general recommendations and may need to be adjusted based upon specific requirements or constraints of your application environment. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| import base64 | ||
| import os.path | ||
| import traceback | ||
| from typing import Dict | ||
|
|
||
| import dashscope | ||
|
|
||
| from common.utils.logger import maxkb_logger | ||
| from models_provider.base_model_provider import MaxKBBaseModel | ||
| from models_provider.impl.base_stt import BaseSpeechToText | ||
|
|
||
|
|
||
class AliyunBaiLianAsrSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Speech-to-text model backed by ``dashscope.MultiModalConversation``.

    The audio is sent inline as a base64 ``data:audio/mp3`` URI in a single
    user message and the recognised text is read from the first choice.
    """

    api_key: str
    # Stored from the credential, but not passed to the DashScope call below.
    api_url: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        """Store ``api_key``, ``model``, ``params`` and ``api_url`` from kwargs."""
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')
        self.api_url = kwargs.get('api_url')

    @staticmethod
    def is_cache_model():
        """Return ``False``: instances of this model are not cached."""
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Build an instance from a credential mapping.

        ``model_kwargs`` is passed both as ``params`` and expanded as keyword
        arguments, matching the original construction.
        """
        return AliyunBaiLianAsrSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            api_url=model_credential.get('api_url'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        """Verify the credentials by transcribing the bundled sample audio.

        Raises:
            Exception: Whatever the recognition request raises. Failures
                propagate here so that a bad key or model is reported
                rather than only logged.
        """
        sample_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'iat_mp3_16k.mp3'
        )
        with open(sample_path, 'rb') as audio_file:
            # Call the raising helper directly: speech_to_text() swallows
            # every error, which would make this check always succeed.
            self._transcribe(audio_file)

    def speech_to_text(self, audio_file):
        """Transcribe ``audio_file`` and return the recognised text.

        Args:
            audio_file: Binary file-like object; its full content is read.

        Returns:
            The recognised text, or ``None`` when recognition failed (the
            error and traceback are logged instead of raised).
        """
        try:
            return self._transcribe(audio_file)
        except Exception as err:
            maxkb_logger.error(f":Error: {str(err)}: {traceback.format_exc()}")
            return None

    def _transcribe(self, audio_file):
        """Send the audio to DashScope and return the text; raise on failure."""
        base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

        messages = [
            {
                "role": "user",
                "content": [
                    {"audio": f"data:audio/mp3;base64,{base64_audio}"},
                ]
            }
        ]
        response = dashscope.MultiModalConversation.call(
            api_key=self.api_key,
            model=self.model,
            messages=messages,
            result_format="message",
        )

        # DashScope reports request failures through the response status
        # rather than by raising; surface them with the service's own
        # error code and message instead of a later indexing error.
        if response.status_code != 200:
            raise RuntimeError(f"{response.code}: {response.message}")

        return response["output"]["choices"][0]["message"].content[0]["text"]
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There seems to be an issue with the implementation of Issue: The call signature for Here's how you can modify it: def speech_to_text(self, audio_file):
try:
base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
messages = [
{"role": "user", "content": [f"audio:{{'filename':'test.wav','data':{base64_audio}}},"]]
]
response = dashscope.ModelCall(api_key=self.api_key, model=self.model).call(messages)
text = response.choices[0].text
return text
except Exception as err:
maxkb_logger.error(f":Error: {str(err)}: {traceback.format_exc()}")This assumes that there has been a change in dashscope library where MultiModal conversation calls have now switched to ModelCall instead. Additional SuggestionIf the above modification causes any error due to incorrect input parameters, make sure that the content type and formatting for the audio data in the request match what the backend expects. This could involve additional adjustments in |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There appear to be several issues and areas for improvement in your code snippet:
Duplicate Credentials: You have duplicate imports for
AliyunBaiLianAsrSTTModelCredential from both the same directory (models_provider) and another directory (asr_stt). This can cause conflicts or missing credentials depending on how these files are imported. Import Path Confusions: Ensure that your import paths are correct and avoid circular dependencies if using classes defined in different modules.
Redundancy: There seems to be redundancy between some class names like
AliyunBaiLianS...Model Credential and their respective model classes, such as AliyunBaiLianStTSpeechToText. It would be better to maintain consistency. Typo Correction: Corrected a typo in the comment block for 'stt' to 'stt'.
Comments Style: Consider improving the readability and clarity of comments.
Here is an optimized version of your code:
Changes Made: