diff --git a/docs/en/user_guides/models.md b/docs/en/user_guides/models.md index ad9ee7899..a7636d6ef 100644 --- a/docs/en/user_guides/models.md +++ b/docs/en/user_guides/models.md @@ -96,10 +96,74 @@ models = [ ] ``` +### Authentication + +The `key` parameter defaults to `'ENV'`, which reads from the `OPENAI_API_KEY` environment variable. +If `OPENAI_API_KEY` is not set, the model will attempt to fall back to +Azure Managed Identity (`DefaultAzureCredential`) — no extra configuration is needed beyond installing the `azure-identity` package. + +You can also pass a key directly: + +```python +key='sk-...', # Explicit API key +key='ENV', # Read from OPENAI_API_KEY env var (default); falls back to Azure Managed Identity +``` + +### Azure OpenAI + +To use Azure OpenAI endpoints, set `azure_endpoint` and `azure_api_version` to reference your Azure resource. +Authentication: if `OPENAI_API_KEY` is set it will be used, +otherwise Azure Managed Identity is used as a fallback. + +```python +from opencompass.models import OpenAISDK + +models = [ + dict( + type=OpenAISDK, + path='gpt-4', + azure_endpoint='https://{resource-name}.openai.azure.com', + azure_api_version='2024-12-01-preview', + tokenizer_path='gpt-4', + meta_template=dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ]), + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + ), +] +``` + +### Reasoning Effort + +For OpenAI reasoning models (o1, o3, o4, gpt-5), you can control the amount of reasoning +with the `reasoning_effort` parameter. Valid values are `'low'`, `'medium'`, and `'high'` +(case-insensitive). Defaults to `None` (use the model's default behavior). + +```python +from opencompass.models import OpenAISDK + +models = [ + dict( + type=OpenAISDK, + path='o3', + reasoning_effort='high', + openai_api_base='https://api.openai.com/v1/', + max_out_len=4096, + max_seq_len=32768, + ), +] +``` + We have provided several examples for API-based models. 
Please refer to ```bash configs +├── eval_api_demo.py +├── eval_api_azure_openai_demo.py ├── eval_zhipu.py ├── eval_xunfei.py └── eval_minimax.py diff --git a/docs/zh_cn/user_guides/models.md b/docs/zh_cn/user_guides/models.md index 9a69bf761..c07f6c57e 100644 --- a/docs/zh_cn/user_guides/models.md +++ b/docs/zh_cn/user_guides/models.md @@ -88,10 +88,70 @@ models = [ ] ``` +### 认证方式 + +`key` 参数默认为 `'ENV'`,会从环境变量 `OPENAI_API_KEY` 中读取。如果未设置 `OPENAI_API_KEY`, +模型会自动回退到 Azure 托管身份(`DefaultAzureCredential`)进行认证,无需额外配置。 + +你也可以直接传入密钥: + +```python +key='sk-...', # 直接指定 API Key +key='ENV', # 从 OPENAI_API_KEY 环境变量读取(默认);未设置时自动回退到 Azure 托管身份 +``` + +### Azure OpenAI + +使用 Azure OpenAI 时,设置 `azure_endpoint` 和 `azure_api_version` 指向你的 Azure 资源即可。 +认证方式自动处理:如果设置了 `OPENAI_API_KEY` 则使用该密钥,否则自动回退到 Azure 托管身份。 + +```python +from opencompass.models import OpenAISDK + +models = [ + dict( + type=OpenAISDK, + path='gpt-4', + azure_endpoint='https://{resource-name}.openai.azure.com', + azure_api_version='2024-12-01-preview', + tokenizer_path='gpt-4', + meta_template=dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ]), + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + ), +] +``` + +### 推理力度(Reasoning Effort) + +对于 OpenAI 推理模型(o1、o3、o4、gpt-5),可以通过 `reasoning_effort` 参数控制推理深度。 +有效值为 `'low'`、`'medium'`、`'high'`(不区分大小写)。默认为 `None`(使用模型的默认行为)。 + +```python +from opencompass.models import OpenAISDK + +models = [ + dict( + type=OpenAISDK, + path='o3', + reasoning_effort='high', # 控制推理深度 + openai_api_base='https://api.openai.com/v1/', + max_out_len=4096, + max_seq_len=32768, + ), +] +``` + 我们也提供了API模型的评测示例,请参考 ```bash configs +├── eval_api_azure_openai_demo.py ├── eval_zhipu.py ├── eval_xunfei.py └── eval_minimax.py diff --git a/examples/eval_api_azure_openai_demo.py b/examples/eval_api_azure_openai_demo.py new file mode 100644 index 000000000..abd886739 --- /dev/null +++ b/examples/eval_api_azure_openai_demo.py @@ -0,0 +1,57 @@ 
+""" +Example configuration of using Azure OpenAI models. + +If OPENAI_API_KEY is not set, Azure Managed Identity (DefaultAzureCredential) +is used automatically as a fallback. +""" + +from mmengine.config import read_base + +from opencompass.models import OpenAI, OpenAISDK + +with read_base(): + from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \ + gsm8k_datasets + +# API template for chat models +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], +) + +models = [ + dict( + abbr='Azure-GPT-5.1', + type=OpenAI, + path='gpt-5.1', + tokenizer_path='gpt-5', + # Azure OpenAI endpoint format: + openai_api_base='https://{resource-name}.openai.azure.com/openai/deployments/{deployment-name}/chat/completions?api-version=2024-12-01-preview', + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + retry=2, + ), + dict( + abbr='Azure-GPT-5.1-SDK', + type=OpenAISDK, + path='gpt-5.1', + tokenizer_path='gpt-5', + # Azure OpenAI endpoint format: + azure_endpoint='https://{resource-name}.openai.azure.com', + azure_api_version='2024-12-01-preview', + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + retry=2, + ), +] + +# Datasets to evaluate +datasets = gsm8k_datasets diff --git a/opencompass/models/base_api.py b/opencompass/models/base_api.py index ab33c609f..4d6951e31 100644 --- a/opencompass/models/base_api.py +++ b/opencompass/models/base_api.py @@ -310,6 +310,9 @@ def parse_template(self, prompt_template: PromptType, for item in prompt[1:]: if item['role'] == last_role: new_prompt[-1]['prompt'] += '\n' + item['prompt'] + if item.get('image'): + existing = new_prompt[-1].get('image', []) + new_prompt[-1]['image'] = existing + item['image'] else: last_role = item['role'] new_prompt.append(item) @@ -452,6 +455,8 @@ def _role2api_role(self, res['prompt'] = 
merged_prompt.get('begin', '') res['prompt'] += merged_prompt.get('prompt', '') res['prompt'] += merged_prompt.get('end', '') + if merged_prompt.get('image'): + res['image'] = merged_prompt['image'] return res, True diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 53a3749f4..488e01eb0 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -10,6 +10,11 @@ import httpx import jieba import requests +try: + from azure.identity import DefaultAzureCredential, get_bearer_token_provider +except ImportError: # azure-identity is optional; only needed for the Managed Identity fallback + DefaultAzureCredential = None + get_bearer_token_provider = None from tqdm import tqdm from opencompass.registry import MODELS @@ -44,9 +45,10 @@ class OpenAI(BaseAPIModel): retry (int): Number of retires if the API call fails. Defaults to 2. key (str or List[str]): OpenAI key(s). In particular, when it is set to "ENV", the key will be fetched from the environment - variable $OPENAI_API_KEY, as how openai defaults to be. If it's a - list, the keys will be used in round-robin manner. Defaults to - 'ENV'. + variable $OPENAI_API_KEY. If the variable is not set, Azure + Managed Identity (DefaultAzureCredential) will be used as a + fallback. If it's a list, the keys will be used in round-robin + manner. Defaults to 'ENV'. org (str or List[str], optional): OpenAI organization(s). If not specified, OpenAI uses the default organization bound to each API key. 
If specified, the orgs will be posted with each request in @@ -124,6 +126,8 @@ def __init__( self.tokenizer = None self.tokenizer_type = None self._init_tokenizer() + self.azure_credential = None + self.use_azure_identity = False if max_workers is None: cpu_count = os.cpu_count() or 1 @@ -131,11 +135,24 @@ def __init__( else: self.max_workers = max_workers + # Resolve API keys: try explicit key, then env var, then Azure identity if isinstance(key, str): if key == 'ENV': - if 'OPENAI_API_KEY' not in os.environ: - raise ValueError('OpenAI API key is not set.') - self.keys = os.getenv('OPENAI_API_KEY').split(',') + if 'OPENAI_API_KEY' in os.environ: + self.keys = os.getenv('OPENAI_API_KEY').split(',') + else: + self.logger.warning( + 'OPENAI_API_KEY is not set. Will try to use Azure Managed Identity for authentication.' + ) + try: + self.azure_credential = DefaultAzureCredential() + self.use_azure_identity = self.azure_credential is not None + self.keys = ['AZURE_TOKEN'] # placeholder to indicate Azure token usage + except Exception as e: + self.logger.warning( + f'Azure Managed Identity is not available: {e}. 
' + 'OPENAI_API_KEY and managed identity are unavailable.') + raise ValueError('OpenAI API key is not set and Azure Managed Identity is not provided.') else: self.keys = [key] else: @@ -162,20 +179,26 @@ def __init__( self.proxy_url = openai_proxy_url def _next_valid_key(self): - with self._key_lock: - if len(self.invalid_keys) == len(self.keys): - raise RuntimeError('All keys have insufficient quota.') + # Get authentication token + if self.azure_credential: + token = self.azure_credential.get_token( + 'https://cognitiveservices.azure.com/.default') + key = token.token + else: + with self._key_lock: + if len(self.invalid_keys) == len(self.keys): + raise RuntimeError('All keys have insufficient quota.') - # find the next valid key - while True: - self.key_ctr += 1 - if self.key_ctr == len(self.keys): - self.key_ctr = 0 + # find the next valid key + while True: + self.key_ctr += 1 + if self.key_ctr == len(self.keys): + self.key_ctr = 0 - if self.keys[self.key_ctr] not in self.invalid_keys: - break + if self.keys[self.key_ctr] not in self.invalid_keys: + break - key = self.keys[self.key_ctr] + key = self.keys[self.key_ctr] return key def generate( @@ -563,13 +586,29 @@ def bin_trim_wrapper(text): if mode != 'none': input_content = bin_trim_wrapper(input_content) processed_prompts.append(input_content) - msg = {'content': input_content} + msg = {} if item['role'] == 'HUMAN': msg['role'] = 'user' elif item['role'] == 'BOT': msg['role'] = 'assistant' elif item['role'] == 'SYSTEM': msg['role'] = 'system' + # Build multi-part content when images are present + images = [ + img for img in item.get('image', []) if img + ] + if images: + content_parts = [ + {'type': 'text', 'text': input_content} + ] + for img_url in images: + content_parts.append({ + 'type': 'image_url', + 'image_url': {'url': img_url}, + }) + msg['content'] = content_parts + else: + msg['content'] = input_content messages.append(msg) input_len = sum( get_token_len_func(prompt) for prompt in 
processed_prompts) @@ -595,6 +634,8 @@ def bin_trim_wrapper(text): @MODELS.register_module() class OpenAISDK(OpenAI): + VALID_REASONING_EFFORTS = {None, 'low', 'medium', 'high'} + def __init__( self, path: str = 'gpt-3.5-turbo', @@ -606,6 +647,8 @@ def __init__( org: str | List[str] | None = None, meta_template: Dict | None = None, openai_api_base: str | List[str] = OPENAISDK_API_BASE, + azure_endpoint: Optional[str] = None, + azure_api_version: Optional[str] = '2024-12-01-preview', openai_proxy_url: Optional[str] = None, mode: str = 'none', logprobs: bool | None = False, @@ -620,6 +663,7 @@ def __init__( max_workers: Optional[int] = None, openai_extra_kwargs: Dict | None = None, timeout: int = 3600, + reasoning_effort: Optional[str] = None, ): super().__init__( path, @@ -646,6 +690,9 @@ def __init__( self.openai_api_base = random.choice(openai_api_base) else: self.openai_api_base = openai_api_base + + self.azure_endpoint = azure_endpoint + self.azure_api_version = azure_api_version self.timeout = timeout self.http_client_cfg = http_client_cfg @@ -657,10 +704,18 @@ def __init__( self.think_tag = think_tag self.openai_extra_kwargs = openai_extra_kwargs + if reasoning_effort: + reasoning_effort = reasoning_effort.lower() + if reasoning_effort not in self.VALID_REASONING_EFFORTS: + raise ValueError( + f'Invalid reasoning_effort: {reasoning_effort}. 
' + f'Must be one of {self.VALID_REASONING_EFFORTS}') + self.reasoning_effort = reasoning_effort + def _create_fresh_client(self): """Create a fresh OpenAI client.""" import httpx - from openai import OpenAI + from openai import OpenAI, AzureOpenAI # Get current key (with key rotation) current_key = self._next_valid_key() @@ -678,9 +733,21 @@ def _create_fresh_client(self): timeout=httpx.Timeout(self.timeout), limits=limits) - return OpenAI(base_url=self.openai_api_base, - api_key=current_key, - http_client=http_client) + # Initialize OpenAI client with appropriate authentication + if self.azure_endpoint: + return AzureOpenAI( + azure_endpoint=self.azure_endpoint, + api_key=current_key if not self.azure_credential else None, + api_version=self.azure_api_version, + azure_ad_token_provider=get_bearer_token_provider(self.azure_credential, 'https://cognitiveservices.azure.com/.default') if self.azure_credential else None, + http_client=http_client, + ) + else: + return OpenAI( + base_url=self.openai_api_base, + api_key=current_key, + http_client=http_client, + ) def _generate( self, @@ -722,6 +789,7 @@ def _generate( messages=messages, extra_body=self.extra_body, ) + query_data['reasoning_effort'] = self.reasoning_effort else: query_data = dict( model=self.path, @@ -838,6 +906,8 @@ def _generate( @MODELS.register_module() class OpenAISDKRollout(OpenAI): + VALID_REASONING_EFFORTS = {None, 'low', 'medium', 'high'} + def __init__( self, path: str = 'gpt-3.5-turbo', @@ -849,6 +919,8 @@ def __init__( org: str | List[str] | None = None, meta_template: Dict | None = None, openai_api_base: str | List[str] = OPENAISDK_API_BASE, + azure_endpoint: Optional[str] = None, + azure_api_version: Optional[str] = '2024-12-01-preview', openai_proxy_url: Optional[str] = None, mode: str = 'none', logprobs: bool | None = False, @@ -862,6 +934,7 @@ def __init__( think_tag: str = '', max_workers: Optional[int] = None, openai_extra_kwargs: Dict | None = None, + reasoning_effort: Optional[str] = 
None, ): super().__init__( path, @@ -883,7 +956,7 @@ def __init__( verbose=verbose, max_workers=max_workers, ) - from openai import OpenAI + from openai import OpenAI, AzureOpenAI # support multiple api_base for acceleration if isinstance(openai_api_base, List): @@ -898,18 +971,37 @@ def __init__( 'https://': self.proxy_url, } - self.openai_client = OpenAI( - base_url=self.openai_api_base, - api_key=key, - http_client=httpx.Client( - **http_client_cfg) if http_client_cfg else None, - ) - - if self.verbose: - self.logger.info(f'Used openai_client: {self.openai_client}') - self.status_code_mappings = status_code_mappings - self.think_tag = think_tag - self.openai_extra_kwargs = openai_extra_kwargs + # Initialize OpenAI client with appropriate authentication + if azure_endpoint: + self.openai_client = AzureOpenAI( + azure_endpoint=azure_endpoint, + api_key=key if not self.azure_credential else None, + api_version=azure_api_version, + azure_ad_token_provider=get_bearer_token_provider(self.azure_credential, 'https://cognitiveservices.azure.com/.default') if self.azure_credential else None, + http_client=httpx.Client( + **http_client_cfg) if http_client_cfg else None, + ) + else: + self.openai_client = OpenAI( + base_url=self.openai_api_base, + api_key=key, + http_client=httpx.Client( + **http_client_cfg) if http_client_cfg else None, + ) + + if self.verbose: + self.logger.info(f'Used openai_client: {self.openai_client}') + self.status_code_mappings = status_code_mappings + self.think_tag = think_tag + self.openai_extra_kwargs = openai_extra_kwargs + + if reasoning_effort is not None: + reasoning_effort = reasoning_effort.lower() + if reasoning_effort not in self.VALID_REASONING_EFFORTS: + raise ValueError( + f'Invalid reasoning_effort: {reasoning_effort}. 
' + f'Must be one of {self.VALID_REASONING_EFFORTS}') + self.reasoning_effort = reasoning_effort def _generate( self, @@ -953,6 +1039,7 @@ def _generate( messages=messages, extra_body=self.extra_body, ) + query_data['reasoning_effort'] = self.reasoning_effort else: query_data = dict( model=self.path, @@ -968,6 +1055,12 @@ def _generate( self.acquire() try: + # Update API key with fresh Azure token if using Azure identity + if self.use_azure_identity: + token = self.azure_credential.get_token( + 'https://cognitiveservices.azure.com/.default') + self.openai_client.api_key = token.token + if self.verbose: self.logger.info('Start calling OpenAI API') diff --git a/opencompass/models/openai_streaming.py b/opencompass/models/openai_streaming.py index 63f9505a2..2890b3920 100644 --- a/opencompass/models/openai_streaming.py +++ b/opencompass/models/openai_streaming.py @@ -38,6 +38,8 @@ def __init__(self, org: str | List[str] | None = None, meta_template: Dict | None = None, openai_api_base: str | List[str] = OPENAISDK_API_BASE, + azure_endpoint: Optional[str] = None, + azure_api_version: Optional[str] = '2024-12-01-preview', openai_proxy_url: Optional[str] = None, mode: str = 'none', logprobs: bool | None = False, @@ -54,7 +56,8 @@ def __init__(self, stream_chunk_size: int = 1, timeout: int = 3600, finish_reason_confirm: bool = True, - max_workers: Optional[int] = None): + max_workers: Optional[int] = None, + reasoning_effort: Optional[str] = None): super().__init__( path=path, max_seq_len=max_seq_len, @@ -65,6 +68,8 @@ def __init__(self, org=org, meta_template=meta_template, openai_api_base=openai_api_base, + azure_endpoint=azure_endpoint, + azure_api_version=azure_api_version, openai_proxy_url=openai_proxy_url, mode=mode, logprobs=logprobs, @@ -78,6 +83,7 @@ def __init__(self, think_tag=think_tag, openai_extra_kwargs=openai_extra_kwargs, max_workers=max_workers, + reasoning_effort=reasoning_effort, ) self.stream = stream @@ -128,6 +134,7 @@ def _generate( 
extra_body=self.extra_body, stream=self.stream, # Enable streaming ) + query_data['reasoning_effort'] = self.reasoning_effort else: query_data = dict( model=self.path, diff --git a/opencompass/utils/prompt.py b/opencompass/utils/prompt.py index cef6a31dd..830aae51a 100644 --- a/opencompass/utils/prompt.py +++ b/opencompass/utils/prompt.py @@ -99,6 +99,11 @@ def format(self, **kwargs) -> PromptList: new_item = deepcopy(item) if 'prompt' in item: new_item['prompt'] = safe_format(item['prompt'], **kwargs) + if 'image' in item: + new_item['image'] = [ + safe_format(img, **kwargs) + for img in item['image'] + ] new_list.append(new_item) else: new_list.append(safe_format(item, **kwargs)) diff --git a/requirements/api.txt b/requirements/api.txt index e8cd3d156..84d3431e2 100644 --- a/requirements/api.txt +++ b/requirements/api.txt @@ -4,6 +4,7 @@ anthropic dashscope # openai openai +azure-identity # xunfei spark_ai_python sseclient-py==1.7.2 diff --git a/setup.py b/setup.py index 90c98565e..0ca756456 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,7 @@ def do_setup(): }, license='Apache License 2.0', include_package_data=True, - packages=find_packages(), + packages=find_packages(exclude=['autotest', 'autotest.*']), keywords=[ 'AI', 'NLP',