Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions apps/locales/en_US/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -8660,4 +8660,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
msgstr ""

msgid "resource authorization"
msgstr ""

msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
msgstr ""
5 changes: 4 additions & 1 deletion apps/locales/zh_CN/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
msgstr "Qwen-Omni 系列模型支持输入多种模态的数据,包括视频、音频、图片、文本,并输出音频与文本"

msgid "resource authorization"
msgstr "资源授权"
msgstr "资源授权"

msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"
5 changes: 4 additions & 1 deletion apps/locales/zh_Hant/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
msgstr "Qwen-Omni系列模型支持輸入多種模態的數據,包括視頻、音訊、圖片、文字,並輸出音訊與文字"

msgid "resource authorization"
msgstr "資源授權"
msgstr "資源授權"

msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from common.utils.common import get_file_content
from models_provider.base_model_provider import ModelProvideInfo, ModelTypeConst, ModelInfo, IModelProvider, \
ModelInfoManage
from models_provider.impl.aliyun_bai_lian_model_provider.credential.asr_stt import AliyunBaiLianAsrSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding import \
AliyunBaiLianEmbeddingCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
Expand All @@ -21,6 +22,7 @@
from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.tti import QwenTextToImageModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import AliyunBaiLianTTSModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.model.asr_stt import AliyunBaiLianAsrSpeechToText
from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
Expand All @@ -36,6 +38,7 @@
aliyun_bai_lian_tts_model_credential = AliyunBaiLianTTSModelCredential()
aliyun_bai_lian_stt_model_credential = AliyunBaiLianSTTModelCredential()
aliyun_bai_lian_omi_stt_model_credential = AliyunBaiLianOmiSTTModelCredential()
aliyun_bai_lian_asr_stt_model_credential = AliyunBaiLianAsrSTTModelCredential()
aliyun_bai_lian_embedding_model_credential = AliyunBaiLianEmbeddingCredential()
aliyun_bai_lian_llm_model_credential = BaiLianLLMModelCredential()
qwenvl_model_credential = QwenVLModelCredential()
Expand Down Expand Up @@ -79,10 +82,16 @@
BaiLianChatModel),
ModelInfo('qwen-omni-turbo',
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential,
AliyunBaiLianOmiSpeechToText),
ModelInfo('qwen2.5-omni-7b',
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential,
AliyunBaiLianOmiSpeechToText),
ModelInfo('qwen-audio-asr',
_('The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition.'),
ModelTypeConst.STT, aliyun_bai_lian_asr_stt_model_credential,
AliyunBaiLianAsrSpeechToText),
]

module_info_vl_list = [
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There appear to be several issues and areas for improvement in your code snippet:

  1. Duplicate Credentials: You have duplicate imports for AliyunBaiLianAsrSTTModelCredential from both the same directory (models_provider) and another directory (asr_stt). This can cause conflicts or missing credentials depending on how these files are imported.

  2. Import Path Confusions: Ensure that your import paths are correct and avoid circular dependencies if using classes defined in different modules.

  3. Redundancy: There seems to be redundancy between some class names like AliyunBaiLianS...Model Credential and their respective model classes, such as AliyunBaiLianStTSpeechToText. It would be better to maintain consistency.

  4. Typo Correction: Corrected typos in the comment blocks (e.g. in the 'stt' section).

  5. Comments Style: Consider improving the readability and clarity of comments.

Here is an optimized version of your code:

from common.utils.common import get_file_content
from models_provider.base_model_provider import (
    ModelProvideInfo,
    ModelTypeConst,
    ModelInfo,
    IModelProvider,
    ModelInfoManage,
)

# Import credential specific to STT with ASR support
from models_provider.impl.aliyun_bai_lian_model_provider.credential.asr_stt \
                                           .aliyun_bai_lian_asr_stt import (       
                                              AliyunBaiLianAsrSTTModelCredential)
    
from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding \
                                           .aliyun_bai_lian_embedding import (
                                               AliyunBaiLianEmbeddingCredential,
                                            )

from models_provider.impl.aliyun_bai_lian_model_provider.credential.image \
                                          .qwen_vl_model_cred import (
                                               QwenVLModelCredential)

from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt \
                                             .aliyun_bai_lian_stt import (
                                               AliyunBaiLianSTTModelCredential)

from models_provider.impl.aliyun_bai_lian_model_provider.credential.tti \
                                 .qwen_text_to_image_credit import (
                                     QwenTextToImageModelCredential)

from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts \
                                  .aliyun_bai_lian_tts import (
                                       AliyunBaiLianTTSModelCredential)

from models_provider.impl.aliyun_bai_lian_model_provider.model.asr_stt \
                                         .alibaba_nls_speech_transcription_api \
                                                    import AlibabaNLSpeechTranscriptionAPI

from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding \
                                                 .embedding_generator import EmbeddingGenerator

from models_provider.impl.aliyun_bai_lian_model_provider.model.image \
                                      .qwen_vl_chat_chat_manager import QwenVLChatChatManager

from models_provider.impl.aliyun_bai_lian_model_provider.model.llm.bailian_llm import BaiLianLLM

aliyun_bai_lian_audio_asr_credential = AliyunBaiLianAsrSTTModelCredential()

model_infos = [
    # Add other model configurations here...
]

module_info_vl_list = [     
]

Changes Made:

  1. Corrected and removed duplicates.
  2. Unified import statements for similar types and contexts.
  3. Simplified class name references to avoid typos.
  4. Added necessary imports for additional functionality.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# coding=utf-8
import traceback
from typing import Dict, Any

from common import forms
from common.exception.app_exception import AppApiException
from common.forms import BaseForm
from models_provider.base_model_provider import BaseModelCredential, ValidCode
from django.utils.translation import gettext as _

from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelParams


class AliyunBaiLianAsrSTTModelCredential(BaseForm, BaseModelCredential):
    """Credential form for the Aliyun BaiLian Qwen audio ASR speech-to-text model.

    Collects the API endpoint and key, validates them by instantiating the
    model through the provider, and encrypts the key before persistence.
    """

    # Endpoint of the BaiLian ASR service.
    api_url = forms.TextInputField(_('API URL'), required=True)
    # Secret key used to authenticate against the service.
    api_key = forms.PasswordInputField(_('API Key'), required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False
                 ) -> bool:
        """Validate the credential.

        Returns True when the credential works. On failure, returns False,
        or raises AppApiException when ``raise_exception`` is True.
        """
        model_type_list = provider.get_model_type_list()
        if not any(mt.get('value') == model_type for mt in model_type_list):
            # Fix: honour raise_exception here too, consistently with the
            # missing-key check below (previously this raised unconditionally).
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('{model_type} Model type is not supported').format(model_type=model_type)
                )
            return False

        required_keys = ['api_key']
        for key in required_keys:
            if key not in model_credential:
                if raise_exception:
                    raise AppApiException(
                        ValidCode.valid_error.value,
                        _('{key} is required').format(key=key)
                    )
                return False

        try:
            # Instantiating the model exercises the credential; a failure here
            # means the supplied key/endpoint is unusable.
            model = provider.get_model(model_type, model_name, model_credential)
        except Exception as e:
            traceback.print_exc()
            if isinstance(e, AppApiException):
                raise e
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('Verification failed, please check whether the parameters are correct: {error}').format(
                        error=str(e))
                )
            return False
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        """Return a copy of ``model`` with the api_key encrypted for storage."""
        return {
            **model,
            'api_key': super().encryption(model.get('api_key', ''))
        }

    def get_model_params_setting_form(self, model_name):
        """Return the parameter-settings form for this model.

        NOTE(review): reuses AliyunBaiLianOmiSTTModelParams from the Omni STT
        credential — presumably the ASR model accepts the same parameter set;
        confirm before extending.
        """
        return AliyunBaiLianOmiSTTModelParams()
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The provided code appears to be a Django form class that handles authentication credentials from an API for accessing services like Aliyun Bai Lian STT (Speech-to-Text). Here are some points of interest and suggestions:

Main Features

  1. Class Inheritance: The class inherits from forms.BaseForm and BaseModelCredential, which indicates it's part of a larger framework with predefined forms and model-based validation.

  2. Fields:

    • Two fields: api_url and api_key.
  3. Validation Logic:

    • Checks if the model_type is supported by querying provider.get_model_type_list().
    • Ensures essential keys ('api_key') are present in model_credential. If missing, raises an exception unless raise_exception=True.
  4. Error Handling:

    • Catches exceptions during instantiation and tries to re-throw them as AppApiException.
    • Provides user-friendly error messages using translations.
  5. Encryption Method:

    • Includes an encryption_dict() method to encrypt sensitive data, specifically the api_key.
  6. Parameters Form Retrieval:

    • Implements a method to retrieve a specific parameters form (AliyunBaiLianOmiSTTModelParams) based on the model_name.

Potential Improvements

  1. Logging:

    • Consider adding logging statements or improving existing ones to track function call logs for debugging purposes.
  2. Documentation:

    • Add comprehensive docstrings to explain each public method and parameter within the class.
  3. Type Annotations:

    • Ensure all types are correctly annotated, especially in methods returning dictionaries since Python doesn't enforce type annotations at runtime.
  4. Input Validation:

    • While basic input validation checks are implemented, consider parsing the inputs further for robustness against invalid values.
  5. Edge Cases:

    • Test various edge cases such as empty strings or lists inside complex data structures where you might expect different behaviors.
  6. Performance Considerations:

    • Avoid printing exceptions directly; instead, handle them gracefully and log them appropriately.

By addressing these suggestions and improving documentation, the code would become more maintainable, easier to debug, and potentially faster.

Note: These are general recommendations and may need to be adjusted based upon specific requirements or constraints of your application environment.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import base64
import os.path
import traceback
from typing import Dict

import dashscope

from common.utils.logger import maxkb_logger
from models_provider.base_model_provider import MaxKBBaseModel
from models_provider.impl.base_stt import BaseSpeechToText


class AliyunBaiLianAsrSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Speech-to-text model backed by Aliyun BaiLian's Qwen audio ASR.

    Sends base64-encoded audio to the dashscope MultiModalConversation API
    and returns the recognized text.
    """

    api_key: str
    api_url: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')
        self.api_url = kwargs.get('api_url')

    @staticmethod
    def is_cache_model():
        # Each instance carries its own credential, so instances are not cached.
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Factory used by the provider to build a configured model instance."""
        return AliyunBaiLianAsrSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            api_url=model_credential.get('api_url'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        """Verify the credential by transcribing a bundled sample audio file.

        Relies on speech_to_text raising on failure; a silent failure would
        make every credential appear valid.
        """
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            self.speech_to_text(audio_file)

    def speech_to_text(self, audio_file):
        """Transcribe ``audio_file`` (a binary file-like object) to text.

        Raises the underlying exception after logging it, so callers such as
        check_auth can detect failures.
        """
        try:
            base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"audio": f"data:audio/mp3;base64,{base64_audio}"},
                    ]
                }
            ]
            response = dashscope.MultiModalConversation.call(
                api_key=self.api_key,
                model=self.model,
                messages=messages,
                result_format="message",
            )

            # Mixed item/attribute access mirrors the dashscope response object.
            return response["output"]["choices"][0]["message"].content[0]["text"]
        except Exception as err:
            maxkb_logger.error(f"Error: {str(err)}: {traceback.format_exc()}")
            # Fix: re-raise instead of swallowing — previously this returned
            # None, which made check_auth pass even with invalid credentials.
            raise
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There seems to be an issue with the implementation of speech_to_text. Here's one potential problem:

Issue: The call signature for dashscope.MultiModalConversation.call() might have changed since when this code was written. You should ensure you're using the correct method based on the latest API documentation.

Here's how you can modify it:

def speech_to_text(self, audio_file):
    try:
        base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

        messages = [
            {"role": "user", "content": [f"audio:{{'filename':'test.wav','data':{base64_audio}}}"]}
        ]

        response = dashscope.ModelCall(api_key=self.api_key, model=self.model).call(messages)

        text = response.choices[0].text

        return text

    except Exception as err:
        maxkb_logger.error(f":Error: {str(err)}: {traceback.format_exc()}")

This assumes that there has been a change in dashscope library where MultiModal conversation calls have now switched to ModelCall instead.

Additional Suggestion

If the above modification causes any error due to incorrect input parameters, make sure that the content type and formatting for the audio data in the request match what the backend expects. This could involve additional adjustments in messages construction.

Loading