diff --git a/docs/en/user_guides/models.md b/docs/en/user_guides/models.md
index ad9ee7899..a7636d6ef 100644
--- a/docs/en/user_guides/models.md
+++ b/docs/en/user_guides/models.md
@@ -96,10 +96,74 @@ models = [
 ]
 ```
 
+### Authentication
+
+The `key` parameter defaults to `'ENV'`, which reads from the `OPENAI_API_KEY` environment variable.
+If `OPENAI_API_KEY` is not set, the model will attempt to fall back to
+Azure Managed Identity (`DefaultAzureCredential`) — no extra configuration is needed.
+
+You can also pass a key directly:
+
+```python
+key='sk-...',           # Explicit API key
+key='ENV',              # Read from OPENAI_API_KEY env var (default); falls back to Azure Managed Identity
+```
+
+### Azure OpenAI
+
+To use Azure OpenAI endpoints, set `azure_endpoint` and `azure_api_version` to reference your Azure resource.
+Authentication: if `OPENAI_API_KEY` is set it will be used,
+otherwise Azure Managed Identity is used as a fallback.
+
+```python
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        type=OpenAISDK,
+        path='gpt-4',
+        azure_endpoint='https://{resource-name}.openai.azure.com',
+        azure_api_version='2024-12-01-preview',
+        tokenizer_path='gpt-4',
+        meta_template=dict(round=[
+            dict(role='HUMAN', api_role='HUMAN'),
+            dict(role='BOT', api_role='BOT', generate=True),
+        ]),
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+    ),
+]
+```
+
+### Reasoning Effort
+
+For OpenAI reasoning models (o1, o3, o4, gpt-5), you can control the amount of reasoning
+with the `reasoning_effort` parameter. Valid values are `'low'`, `'medium'`, and `'high'`
+(case-insensitive). Defaults to `None` (use the model's default behavior).
+
+```python
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        type=OpenAISDK,
+        path='o3',
+        reasoning_effort='high',
+        openai_api_base='https://api.openai.com/v1/',
+        max_out_len=4096,
+        max_seq_len=32768,
+    ),
+]
+```
+
 We have provided several examples for API-based models. Please refer to
 
 ```bash
 configs
+├── eval_api_demo.py
+├── eval_api_azure_openai_demo.py
 ├── eval_zhipu.py
 ├── eval_xunfei.py
 └── eval_minimax.py
diff --git a/docs/zh_cn/user_guides/models.md b/docs/zh_cn/user_guides/models.md
index 9a69bf761..c07f6c57e 100644
--- a/docs/zh_cn/user_guides/models.md
+++ b/docs/zh_cn/user_guides/models.md
@@ -88,10 +88,70 @@ models = [
 ]
 ```
 
+### 认证方式
+
+`key` 参数默认为 `'ENV'`，会从环境变量 `OPENAI_API_KEY` 中读取。如果未设置 `OPENAI_API_KEY`，
+模型会自动回退到 Azure 托管身份（`DefaultAzureCredential`）进行认证，无需额外配置。
+
+你也可以直接传入密钥：
+
+```python
+key='sk-...',           # 直接指定 API Key
+key='ENV',              # 从 OPENAI_API_KEY 环境变量读取（默认）；未设置时自动回退到 Azure 托管身份
+```
+
+### Azure OpenAI
+
+使用 Azure OpenAI 时，设置 `azure_endpoint` 和 `azure_api_version` 指向你的 Azure 资源即可。
+认证方式自动处理：如果设置了 `OPENAI_API_KEY` 则使用该密钥，否则自动回退到 Azure 托管身份。
+
+```python
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        type=OpenAISDK,
+        path='gpt-4',
+        azure_endpoint='https://{resource-name}.openai.azure.com',
+        azure_api_version='2024-12-01-preview',
+        tokenizer_path='gpt-4',
+        meta_template=dict(round=[
+            dict(role='HUMAN', api_role='HUMAN'),
+            dict(role='BOT', api_role='BOT', generate=True),
+        ]),
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+    ),
+]
+```
+
+### 推理力度（Reasoning Effort）
+
+对于 OpenAI 推理模型（o1、o3、o4、gpt-5），可以通过 `reasoning_effort` 参数控制推理深度。
+有效值为 `'low'`、`'medium'`、`'high'`（不区分大小写）。默认为 `None`（使用模型的默认行为）。
+
+```python
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        type=OpenAISDK,
+        path='o3',
+        reasoning_effort='high',                 # 控制推理深度
+        openai_api_base='https://api.openai.com/v1/',
+        max_out_len=4096,
+        max_seq_len=32768,
+    ),
+]
+```
+
 我们也提供了API模型的评测示例，请参考
 
 ```bash
 configs
+├── eval_api_azure_openai_demo.py
 ├── eval_zhipu.py
 ├── eval_xunfei.py
 └── eval_minimax.py
diff --git a/examples/eval_api_azure_openai_demo.py b/examples/eval_api_azure_openai_demo.py
new file mode 100644
index 000000000..abd886739
--- /dev/null
+++ b/examples/eval_api_azure_openai_demo.py
@@ -0,0 +1,57 @@
+"""
+Example configuration for using Azure OpenAI models.
+
+If OPENAI_API_KEY is not set, Azure Managed Identity (DefaultAzureCredential)
+is used automatically as a fallback.
+"""
+
+from mmengine.config import read_base
+
+from opencompass.models import OpenAI, OpenAISDK
+
+with read_base():
+    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
+        gsm8k_datasets
+
+# Role mapping for chat APIs: HUMAN and BOT turns; BOT turns are generated
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+)
+
+models = [
+    dict(
+        abbr='Azure-GPT-5.1',
+        type=OpenAI,
+        path='gpt-5.1',
+        tokenizer_path='gpt-5',
+        # Raw HTTP client: full chat-completions URL (deployment + api-version)
+        openai_api_base='https://{resource-name}.openai.azure.com/openai/deployments/{deployment-name}/chat/completions?api-version=2024-12-01-preview',
+        meta_template=api_meta_template,
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=2,
+    ),
+    dict(
+        abbr='Azure-GPT-5.1-SDK',
+        type=OpenAISDK,
+        path='gpt-5.1',
+        tokenizer_path='gpt-5',
+        # SDK client: resource endpoint plus API version (selects AzureOpenAI)
+        azure_endpoint='https://{resource-name}.openai.azure.com',
+        azure_api_version='2024-12-01-preview',
+        meta_template=api_meta_template,
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=2,
+    ),
+]
+
+# Evaluate on the demo GSM8K chat dataset imported above
+datasets = gsm8k_datasets
diff --git a/opencompass/models/base_api.py b/opencompass/models/base_api.py
index ab33c609f..4d6951e31 100644
--- a/opencompass/models/base_api.py
+++ b/opencompass/models/base_api.py
@@ -310,6 +310,9 @@ def parse_template(self, prompt_template: PromptType,
             for item in prompt[1:]:
                 if item['role'] == last_role:
                     new_prompt[-1]['prompt'] += '\n' + item['prompt']
+                    if item.get('image'):
+                        existing = new_prompt[-1].get('image', [])
+                        new_prompt[-1]['image'] = existing + item['image']
                 else:
                     last_role = item['role']
                     new_prompt.append(item)
@@ -452,6 +455,8 @@ def _role2api_role(self,
         res['prompt'] = merged_prompt.get('begin', '')
         res['prompt'] += merged_prompt.get('prompt', '')
         res['prompt'] += merged_prompt.get('end', '')
+        if merged_prompt.get('image'):
+            res['image'] = merged_prompt['image']
         return res, True
 
 
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 53a3749f4..488e01eb0 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -10,6 +10,13 @@
 import httpx
 import jieba
 import requests
+
+try:
+    from azure.identity import (DefaultAzureCredential,
+                                get_bearer_token_provider)
+except ImportError:  # azure-identity is optional; only needed for Azure AD
+    DefaultAzureCredential = get_bearer_token_provider = None
+
 from tqdm import tqdm
 
 from opencompass.registry import MODELS
@@ -44,9 +45,10 @@ class OpenAI(BaseAPIModel):
         retry (int): Number of retires if the API call fails. Defaults to 2.
         key (str or List[str]): OpenAI key(s). In particular, when it
             is set to "ENV", the key will be fetched from the environment
-            variable $OPENAI_API_KEY, as how openai defaults to be. If it's a
-            list, the keys will be used in round-robin manner. Defaults to
-            'ENV'.
+            variable $OPENAI_API_KEY. If the variable is not set, Azure
+            Managed Identity (DefaultAzureCredential) will be used as a
+            fallback. If it's a list, the keys will be used in round-robin
+            manner. Defaults to 'ENV'.
         org (str or List[str], optional): OpenAI organization(s). If not
             specified, OpenAI uses the default organization bound to each API
             key. If specified, the orgs will be posted with each request in
@@ -124,6 +126,8 @@ def __init__(
         self.tokenizer = None
         self.tokenizer_type = None
         self._init_tokenizer()
+        self.azure_credential = None
+        self.use_azure_identity = False
 
         if max_workers is None:
             cpu_count = os.cpu_count() or 1
@@ -131,11 +135,24 @@ def __init__(
         else:
             self.max_workers = max_workers
 
+        # Resolve API keys: try explicit key, then env var, then Azure identity
         if isinstance(key, str):
             if key == 'ENV':
-                if 'OPENAI_API_KEY' not in os.environ:
-                    raise ValueError('OpenAI API key is not set.')
-                self.keys = os.getenv('OPENAI_API_KEY').split(',')
+                if 'OPENAI_API_KEY' in os.environ:
+                    self.keys = os.getenv('OPENAI_API_KEY').split(',')
+                else:
+                    self.logger.warning(
+                        'OPENAI_API_KEY is not set. Will try to use Azure Managed Identity for authentication.'
+                    )
+                    try:
+                        self.azure_credential = DefaultAzureCredential()
+                        self.use_azure_identity = self.azure_credential is not None
+                        self.keys = ['AZURE_TOKEN']  # placeholder to indicate Azure token usage
+                    except Exception as e:
+                        self.logger.warning(
+                            f'Azure Managed Identity is not available: {e}. '
+                            'OPENAI_API_KEY and managed identity are unavailable.')
+                        raise ValueError('OpenAI API key is not set and Azure Managed Identity is not provided.')
             else:
                 self.keys = [key]
         else:
@@ -162,20 +179,26 @@ def __init__(
             self.proxy_url = openai_proxy_url
 
     def _next_valid_key(self):
-        with self._key_lock:
-            if len(self.invalid_keys) == len(self.keys):
-                raise RuntimeError('All keys have insufficient quota.')
+        # Azure identity active: use a bearer token; otherwise rotate API keys
+        if self.azure_credential:
+            token = self.azure_credential.get_token(
+                'https://cognitiveservices.azure.com/.default')
+            key = token.token
+        else:
+            with self._key_lock:
+                if len(self.invalid_keys) == len(self.keys):
+                    raise RuntimeError('All keys have insufficient quota.')
 
-            # find the next valid key
-            while True:
-                self.key_ctr += 1
-                if self.key_ctr == len(self.keys):
-                    self.key_ctr = 0
+                # round-robin: advance (wrapping) to a key not marked invalid
+                while True:
+                    self.key_ctr += 1
+                    if self.key_ctr == len(self.keys):
+                        self.key_ctr = 0
 
-                if self.keys[self.key_ctr] not in self.invalid_keys:
-                    break
+                    if self.keys[self.key_ctr] not in self.invalid_keys:
+                        break
 
-            key = self.keys[self.key_ctr]
+                key = self.keys[self.key_ctr]
         return key
 
     def generate(
@@ -563,13 +586,29 @@ def bin_trim_wrapper(text):
                     if mode != 'none':
                         input_content = bin_trim_wrapper(input_content)
                     processed_prompts.append(input_content)
-                    msg = {'content': input_content}
+                    msg = {}
                     if item['role'] == 'HUMAN':
                         msg['role'] = 'user'
                     elif item['role'] == 'BOT':
                         msg['role'] = 'assistant'
                     elif item['role'] == 'SYSTEM':
                         msg['role'] = 'system'
+                    # Build multi-part content when images are present
+                    images = [
+                        img for img in item.get('image', []) if img
+                    ]
+                    if images:
+                        content_parts = [
+                            {'type': 'text', 'text': input_content}
+                        ]
+                        for img_url in images:
+                            content_parts.append({
+                                'type': 'image_url',
+                                'image_url': {'url': img_url},
+                            })
+                        msg['content'] = content_parts
+                    else:
+                        msg['content'] = input_content
                     messages.append(msg)
                 input_len = sum(
                     get_token_len_func(prompt) for prompt in processed_prompts)
@@ -595,6 +634,8 @@ def bin_trim_wrapper(text):
 @MODELS.register_module()
 class OpenAISDK(OpenAI):
 
+    VALID_REASONING_EFFORTS = {None, 'low', 'medium', 'high'}
+
     def __init__(
         self,
         path: str = 'gpt-3.5-turbo',
@@ -606,6 +647,8 @@ def __init__(
         org: str | List[str] | None = None,
         meta_template: Dict | None = None,
         openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+        azure_endpoint: Optional[str] = None,
+        azure_api_version: Optional[str] = '2024-12-01-preview',
         openai_proxy_url: Optional[str] = None,
         mode: str = 'none',
         logprobs: bool | None = False,
@@ -620,6 +663,7 @@ def __init__(
         max_workers: Optional[int] = None,
         openai_extra_kwargs: Dict | None = None,
         timeout: int = 3600,
+        reasoning_effort: Optional[str] = None,
     ):
         super().__init__(
             path,
@@ -646,6 +690,9 @@ def __init__(
             self.openai_api_base = random.choice(openai_api_base)
         else:
             self.openai_api_base = openai_api_base
+        
+        self.azure_endpoint = azure_endpoint
+        self.azure_api_version = azure_api_version
 
         self.timeout = timeout
         self.http_client_cfg = http_client_cfg
@@ -657,10 +704,18 @@ def __init__(
         self.think_tag = think_tag
         self.openai_extra_kwargs = openai_extra_kwargs
 
+        if reasoning_effort:
+            reasoning_effort = reasoning_effort.lower()
+        if reasoning_effort not in self.VALID_REASONING_EFFORTS:
+            raise ValueError(
+                f'Invalid reasoning_effort: {reasoning_effort}. '
+                f'Must be one of {self.VALID_REASONING_EFFORTS}')
+        self.reasoning_effort = reasoning_effort
+
     def _create_fresh_client(self):
         """Create a fresh OpenAI client."""
         import httpx
-        from openai import OpenAI
+        from openai import OpenAI, AzureOpenAI
 
         # Get current key (with key rotation)
         current_key = self._next_valid_key()
@@ -678,9 +733,21 @@ def _create_fresh_client(self):
                                    timeout=httpx.Timeout(self.timeout),
                                    limits=limits)
 
-        return OpenAI(base_url=self.openai_api_base,
-                      api_key=current_key,
-                      http_client=http_client)
+        # Azure endpoints use AzureOpenAI; everything else the plain client
+        if self.azure_endpoint:
+            scope = 'https://cognitiveservices.azure.com/.default'
+            provider = (get_bearer_token_provider(self.azure_credential, scope)
+                        if self.azure_credential else None)
+            return AzureOpenAI(
+                azure_endpoint=self.azure_endpoint,
+                api_key=None if provider else current_key,
+                api_version=self.azure_api_version,
+                azure_ad_token_provider=provider,
+                http_client=http_client,
+            )
+        return OpenAI(base_url=self.openai_api_base,
+                      api_key=current_key,
+                      http_client=http_client)
 
     def _generate(
         self,
@@ -722,6 +789,7 @@ def _generate(
                     messages=messages,
                     extra_body=self.extra_body,
                 )
+                query_data['reasoning_effort'] = self.reasoning_effort
             else:
                 query_data = dict(
                     model=self.path,
@@ -838,6 +906,8 @@ def _generate(
 @MODELS.register_module()
 class OpenAISDKRollout(OpenAI):
 
+    VALID_REASONING_EFFORTS = {None, 'low', 'medium', 'high'}
+
     def __init__(
         self,
         path: str = 'gpt-3.5-turbo',
@@ -849,6 +919,8 @@ def __init__(
         org: str | List[str] | None = None,
         meta_template: Dict | None = None,
         openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+        azure_endpoint: Optional[str] = None,
+        azure_api_version: Optional[str] = '2024-12-01-preview',
         openai_proxy_url: Optional[str] = None,
         mode: str = 'none',
         logprobs: bool | None = False,
@@ -862,6 +934,7 @@ def __init__(
         think_tag: str = '</think>',
         max_workers: Optional[int] = None,
         openai_extra_kwargs: Dict | None = None,
+        reasoning_effort: Optional[str] = None,
     ):
         super().__init__(
             path,
@@ -883,7 +956,7 @@ def __init__(
             verbose=verbose,
             max_workers=max_workers,
         )
-        from openai import OpenAI
+        from openai import OpenAI, AzureOpenAI
 
         # support multiple api_base for acceleration
         if isinstance(openai_api_base, List):
@@ -898,18 +971,40 @@ def __init__(
                     'https://': self.proxy_url,
                 }
 
-        self.openai_client = OpenAI(
-            base_url=self.openai_api_base,
-            api_key=key,
-            http_client=httpx.Client(
-                **http_client_cfg) if http_client_cfg else None,
-        )
+        # Initialize OpenAI client with appropriate authentication
+        http_client = (httpx.Client(**http_client_cfg)
+                       if http_client_cfg else None)
+        if azure_endpoint:
+            scope = 'https://cognitiveservices.azure.com/.default'
+            provider = (get_bearer_token_provider(self.azure_credential, scope)
+                        if self.azure_credential else None)
+            self.openai_client = AzureOpenAI(
+                azure_endpoint=azure_endpoint,
+                api_key=None if provider else key,
+                api_version=azure_api_version,
+                azure_ad_token_provider=provider,
+                http_client=http_client,
+            )
+        else:
+            self.openai_client = OpenAI(
+                base_url=self.openai_api_base,
+                api_key=key,
+                http_client=http_client,
+            )
 
         if self.verbose:
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
         self.think_tag = think_tag
         self.openai_extra_kwargs = openai_extra_kwargs
+
+        if reasoning_effort is not None:
+            reasoning_effort = reasoning_effort.lower()
+        if reasoning_effort not in self.VALID_REASONING_EFFORTS:
+            raise ValueError(
+                f'Invalid reasoning_effort: {reasoning_effort}. '
+                f'Must be one of {self.VALID_REASONING_EFFORTS}')
+        self.reasoning_effort = reasoning_effort
 
     def _generate(
         self,
@@ -953,6 +1039,7 @@ def _generate(
                     messages=messages,
                     extra_body=self.extra_body,
                 )
+                query_data['reasoning_effort'] = self.reasoning_effort
             else:
                 query_data = dict(
                     model=self.path,
@@ -968,6 +1055,12 @@ def _generate(
 
             self.acquire()
             try:
+                # Update API key with fresh Azure token if using Azure identity
+                if self.use_azure_identity:
+                    token = self.azure_credential.get_token(
+                        'https://cognitiveservices.azure.com/.default')
+                    self.openai_client.api_key = token.token
+
                 if self.verbose:
                     self.logger.info('Start calling OpenAI API')
 
diff --git a/opencompass/models/openai_streaming.py b/opencompass/models/openai_streaming.py
index 63f9505a2..2890b3920 100644
--- a/opencompass/models/openai_streaming.py
+++ b/opencompass/models/openai_streaming.py
@@ -38,6 +38,8 @@ def __init__(self,
                  org: str | List[str] | None = None,
                  meta_template: Dict | None = None,
                  openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+                 azure_endpoint: Optional[str] = None,
+                 azure_api_version: Optional[str] = '2024-12-01-preview',
                  openai_proxy_url: Optional[str] = None,
                  mode: str = 'none',
                  logprobs: bool | None = False,
@@ -54,7 +56,8 @@ def __init__(self,
                  stream_chunk_size: int = 1,
                  timeout: int = 3600,
                  finish_reason_confirm: bool = True,
-                 max_workers: Optional[int] = None):
+                 max_workers: Optional[int] = None,
+                 reasoning_effort: Optional[str] = None):
         super().__init__(
             path=path,
             max_seq_len=max_seq_len,
@@ -65,6 +68,8 @@ def __init__(self,
             org=org,
             meta_template=meta_template,
             openai_api_base=openai_api_base,
+            azure_endpoint=azure_endpoint,
+            azure_api_version=azure_api_version,
             openai_proxy_url=openai_proxy_url,
             mode=mode,
             logprobs=logprobs,
@@ -78,6 +83,7 @@ def __init__(self,
             think_tag=think_tag,
             openai_extra_kwargs=openai_extra_kwargs,
             max_workers=max_workers,
+            reasoning_effort=reasoning_effort,
         )
 
         self.stream = stream
@@ -128,6 +134,7 @@ def _generate(
                     extra_body=self.extra_body,
                     stream=self.stream,  # Enable streaming
                 )
+                query_data['reasoning_effort'] = self.reasoning_effort
             else:
                 query_data = dict(
                     model=self.path,
diff --git a/opencompass/utils/prompt.py b/opencompass/utils/prompt.py
index cef6a31dd..830aae51a 100644
--- a/opencompass/utils/prompt.py
+++ b/opencompass/utils/prompt.py
@@ -99,6 +99,11 @@ def format(self, **kwargs) -> PromptList:
                 new_item = deepcopy(item)
                 if 'prompt' in item:
                     new_item['prompt'] = safe_format(item['prompt'], **kwargs)
+                if 'image' in item:
+                    new_item['image'] = [
+                        safe_format(img, **kwargs)
+                        for img in item['image']
+                    ]
                 new_list.append(new_item)
             else:
                 new_list.append(safe_format(item, **kwargs))
diff --git a/requirements/api.txt b/requirements/api.txt
index e8cd3d156..84d3431e2 100644
--- a/requirements/api.txt
+++ b/requirements/api.txt
@@ -4,6 +4,7 @@ anthropic
 dashscope
 # openai
 openai
+azure-identity
  # xunfei
 spark_ai_python
 sseclient-py==1.7.2
diff --git a/setup.py b/setup.py
index 90c98565e..0ca756456 100644
--- a/setup.py
+++ b/setup.py
@@ -134,7 +134,7 @@ def do_setup():
         },
         license='Apache License 2.0',
         include_package_data=True,
-        packages=find_packages(),
+        packages=find_packages(exclude=['autotest', 'autotest.*']),
         keywords=[
             'AI',
             'NLP',