Commit 00d431e

Merge pull request BerriAI#4807 from BerriAI/litellm_return-response_headers
[Feat] Return response headers on `litellm.completion`, `litellm.embedding`
2 parents: 36cb63c + f622562

5 files changed: 397 additions & 10 deletions

docs/my-website/docs/providers/openai.md

Lines changed: 98 additions & 0 deletions
@@ -238,6 +238,104 @@ response = completion(
## Advanced

### Getting OpenAI API Response Headers

Set `litellm.return_response_headers = True` to get raw response headers from OpenAI.

You can expect to always get the `_response_headers` field from the `litellm.completion()` and `litellm.embedding()` functions.

<Tabs>
<TabItem value="litellm.completion" label="litellm.completion">

```python
litellm.return_response_headers = True

# /chat/completion
response = completion(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": "hi",
        }
    ],
)
print(f"response: {response}")
print("_response_headers=", response._response_headers)
```
</TabItem>

<TabItem value="litellm.completion - streaming" label="litellm.completion + stream">
269+
270+
```python
271+
litellm.return_response_headers = True
272+
273+
# /chat/completion
274+
response = completion(
275+
model="gpt-4o-mini",
276+
stream=True,
277+
messages=[
278+
{
279+
"role": "user",
280+
"content": "hi",
281+
}
282+
],
283+
)
284+
print(f"response: {response}")
285+
print("response_headers=", response._response_headers)
286+
for chunk in response:
287+
print(chunk)
288+
```
289+
</TabItem>
290+
291+
<TabItem value="litellm.embedding" label="litellm.embedding">
292+
293+
```python
294+
litellm.return_response_headers = True
295+
296+
# embedding
297+
embedding_response = litellm.embedding(
298+
model="text-embedding-ada-002",
299+
input="hello",
300+
)
301+
302+
embedding_response_headers = embedding_response._response_headers
303+
print("embedding_response_headers=", embedding_response_headers)
304+
```
305+
306+
</TabItem>
307+
</Tabs>
308+
Expected Response Headers from OpenAI:

```json
{
    "date": "Sat, 20 Jul 2024 22:05:23 GMT",
    "content-type": "application/json",
    "transfer-encoding": "chunked",
    "connection": "keep-alive",
    "access-control-allow-origin": "*",
    "openai-model": "text-embedding-ada-002",
    "openai-organization": "*****",
    "openai-processing-ms": "20",
    "openai-version": "2020-10-01",
    "strict-transport-security": "max-age=15552000; includeSubDomains; preload",
    "x-ratelimit-limit-requests": "5000",
    "x-ratelimit-limit-tokens": "5000000",
    "x-ratelimit-remaining-requests": "4999",
    "x-ratelimit-remaining-tokens": "4999999",
    "x-ratelimit-reset-requests": "12ms",
    "x-ratelimit-reset-tokens": "0s",
    "x-request-id": "req_cc37487bfd336358231a17034bcfb4d9",
    "cf-cache-status": "DYNAMIC",
    "set-cookie": "__cf_bm=E_FJY8fdAIMBzBE2RZI2.OkMIO3lf8Hz.ydBQJ9m3q8-1721513123-1.0.1.1-6OK0zXvtd5s9Jgqfz66cU9gzQYpcuh_RLaUZ9dOgxR9Qeq4oJlu.04C09hOTCFn7Hg.k.2tiKLOX24szUE2shw; path=/; expires=Sat, 20-Jul-24 22:35:23 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, *cfuvid=SDndIImxiO3U0aBcVtoy1TBQqYeQtVDo1L6*Nlpp7EU-1721513123215-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
    "x-content-type-options": "nosniff",
    "server": "cloudflare",
    "cf-ray": "8a66409b4f8acee9-SJC",
    "content-encoding": "br",
    "alt-svc": "h3=\":443\"; ma=86400"
}
```

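These rate-limit headers make simple client-side throttling possible. Below is a minimal sketch (not part of the original docs; the helper name and the fixed sleep are illustrative) that backs off when OpenAI reports no remaining requests:

```python
import time

import litellm
from litellm import completion

litellm.return_response_headers = True

def completion_with_backoff(**kwargs):
    """Call completion() and sleep briefly when OpenAI reports that
    no requests remain in the current rate-limit window."""
    response = completion(**kwargs)
    headers = response._response_headers or {}
    remaining = int(headers.get("x-ratelimit-remaining-requests", "1"))
    if remaining == 0:
        # "x-ratelimit-reset-requests" is a duration string like "12ms";
        # a fixed one-second sleep keeps this sketch simple.
        time.sleep(1)
    return response
```
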
### Parallel Function calling

See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)

litellm/llms/openai.py

Lines changed: 164 additions & 10 deletions
```diff
@@ -784,6 +784,34 @@ async def make_openai_chat_completion_request(
         except Exception as e:
             raise e
 
+    def make_sync_openai_chat_completion_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
+        - call chat.completions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = openai_client.chat.completions.with_raw_response.create(
+                    **data, timeout=timeout
+                )
+
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.chat.completions.create(
+                    **data, timeout=timeout
+                )
+                return None, response
+        except Exception as e:
+            raise e
+
     def completion(
         self,
         model_response: ModelResponse,
```

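For context, the helper above wraps the OpenAI SDK's raw-response interface. A standalone sketch of that pattern (not part of this diff; assumes `OPENAI_API_KEY` is set and the model name is illustrative):

```python
from openai import OpenAI

client = OpenAI()

# .with_raw_response returns the HTTP response, so headers are accessible;
# .parse() then yields the usual typed ChatCompletion object.
raw_response = client.chat.completions.with_raw_response.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)
headers = dict(raw_response.headers)
response = raw_response.parse()
print(headers.get("x-request-id"), response.choices[0].message.content)
```
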
```diff
@@ -916,7 +944,15 @@ def completion(
                     },
                 )
 
-                response = openai_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
+                headers, response = (
+                    self.make_sync_openai_chat_completion_request(
+                        openai_client=openai_client,
+                        data=data,
+                        timeout=timeout,
+                    )
+                )
+
+                logging_obj.model_call_details["response_headers"] = headers
                 stringified_response = response.model_dump()
                 logging_obj.post_call(
                     input=messages,
```

```diff
@@ -927,6 +963,7 @@ def completion(
                 return convert_to_model_response_object(
                     response_object=stringified_response,
                     model_response_object=model_response,
+                    _response_headers=headers,
                 )
             except openai.UnprocessableEntityError as e:
                 ## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
```

```diff
@@ -1043,6 +1080,25 @@ async def acompletion(
                 },
             )
 
+            headers, response = await self.make_openai_chat_completion_request(
+                openai_aclient=openai_aclient, data=data, timeout=timeout
+            )
+            stringified_response = response.model_dump()
+            logging_obj.post_call(
+                input=data["messages"],
+                api_key=api_key,
+                original_response=stringified_response,
+                additional_args={"complete_input_dict": data},
+            )
+            logging_obj.model_call_details["response_headers"] = headers
+            return convert_to_model_response_object(
+                response_object=stringified_response,
+                model_response_object=model_response,
+                hidden_params={"headers": headers},
+                _response_headers=headers,
+            )
+        except Exception as e:
+            raise e
             headers, response = await self.make_openai_chat_completion_request(
                 openai_aclient=openai_aclient, data=data, timeout=timeout
             )
```

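A usage sketch for the async path patched above (illustrative; `litellm.acompletion` is the public async entry point):

```python
import asyncio

import litellm

litellm.return_response_headers = True

async def main():
    # Routes through the acompletion path above; the headers end up on
    # the returned ModelResponse as _response_headers.
    response = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hi"}],
    )
    print(response._response_headers)

asyncio.run(main())
```
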
```diff
@@ -1122,13 +1178,20 @@ def streaming(
                 "complete_input_dict": data,
             },
         )
-        response = openai_client.chat.completions.create(**data, timeout=timeout)
+        headers, response = self.make_sync_openai_chat_completion_request(
+            openai_client=openai_client,
+            data=data,
+            timeout=timeout,
+        )
+
+        logging_obj.model_call_details["response_headers"] = headers
         streamwrapper = CustomStreamWrapper(
             completion_stream=response,
             model=model,
             custom_llm_provider="openai",
             logging_obj=logging_obj,
             stream_options=data.get("stream_options", None),
+            _response_headers=headers,
         )
         return streamwrapper
```

```diff
@@ -1170,8 +1233,30 @@ async def async_streaming(
                 },
             )
 
+            headers, response = await self.make_openai_chat_completion_request(
+                openai_aclient=openai_aclient, data=data, timeout=timeout
+            )
+            logging_obj.model_call_details["response_headers"] = headers
+            streamwrapper = CustomStreamWrapper(
+                completion_stream=response,
+                model=model,
+                custom_llm_provider="openai",
+                logging_obj=logging_obj,
+                stream_options=data.get("stream_options", None),
+                _response_headers=headers,
+            )
+            return streamwrapper
+        except (
+            Exception
+        ) as e:  # need to exception handle here. async exceptions don't get caught in sync functions.
+            if response is not None and hasattr(response, "text"):
+                raise OpenAIError(
+                    status_code=500,
+                    message=f"{str(e)}\n\nOriginal Response: {response.text}",
+
             headers, response = await self.make_openai_chat_completion_request(
                 openai_aclient=openai_aclient, data=data, timeout=timeout
+
             )
             logging_obj.model_call_details["response_headers"] = headers
             streamwrapper = CustomStreamWrapper(
```

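Since the headers come from the initial HTTP response, they are set on the stream wrapper before any chunks are consumed. A usage sketch (illustrative, mirroring the docs example earlier in this commit):

```python
import litellm
from litellm import completion

litellm.return_response_headers = True

stream = completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
    stream=True,
)
# Rate-limit state is readable up front, before iterating the stream.
print(stream._response_headers.get("x-ratelimit-remaining-requests"))
for chunk in stream:
    print(chunk)
```
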
```diff
@@ -1252,6 +1337,32 @@ async def make_openai_embedding_request(
         except Exception as e:
             raise e
 
+    def make_sync_openai_embedding_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call embeddings.create.with_raw_response when litellm.return_response_headers is True
+        - call embeddings.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = openai_client.embeddings.with_raw_response.create(
+                    **data, timeout=timeout
+                )  # type: ignore
+
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
     async def aembedding(
         self,
         input: list,
```

```diff
@@ -1286,7 +1397,12 @@ async def aembedding(
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(
+                response_object=stringified_response,
+                model_response_object=model_response,
+                response_type="embedding",
+                _response_headers=headers,
+            )  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(
```

```diff
@@ -1347,17 +1463,26 @@ def embedding(
                 client=client,
             )
 
-            ## COMPLETION CALL
-            response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
+            ## embedding CALL
+            headers: Optional[Dict] = None
+            headers, sync_embedding_response = self.make_sync_openai_embedding_request(
+                openai_client=openai_client, data=data, timeout=timeout
+            )  # type: ignore
+
             ## LOGGING
+            logging_obj.model_call_details["response_headers"] = headers
             logging_obj.post_call(
                 input=input,
                 api_key=api_key,
                 additional_args={"complete_input_dict": data},
-                original_response=response,
+                original_response=sync_embedding_response,
             )
-
-            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(
+                response_object=sync_embedding_response.model_dump(),
+                model_response_object=model_response,
+                _response_headers=headers,
+                response_type="embedding",
+            )  # type: ignore
         except OpenAIError as e:
             exception_mapping_worked = True
             raise e
```

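A hypothetical pytest-style check for the embedding path (the PR's actual tests live in files not shown on this page; the header name follows the expected-headers example above):

```python
import litellm

def test_embedding_returns_response_headers():
    # Assumes OPENAI_API_KEY is set in the environment.
    litellm.return_response_headers = True
    response = litellm.embedding(
        model="text-embedding-ada-002",
        input="hello",
    )
    assert response._response_headers is not None
    assert "x-request-id" in response._response_headers
```
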
```diff
@@ -1520,6 +1645,33 @@ async def make_openai_audio_transcriptions_request(
         except Exception as e:
             raise e
 
+    def make_sync_openai_audio_transcriptions_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call openai_client.audio.transcriptions.with_raw_response.create when litellm.return_response_headers is True
+        - call openai_client.audio.transcriptions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = (
+                    openai_client.audio.transcriptions.with_raw_response.create(
+                        **data, timeout=timeout
+                    )
+                )  # type: ignore
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
     def audio_transcriptions(
         self,
         model: str,
```

```diff
@@ -1555,8 +1707,10 @@ def audio_transcriptions(
                 timeout=timeout,
                 max_retries=max_retries,
             )
-            response = openai_client.audio.transcriptions.create(
-                **data, timeout=timeout  # type: ignore
+            response = self.make_sync_openai_audio_transcriptions_request(
+                openai_client=openai_client,
+                data=data,
+                timeout=timeout,
             )
 
             if isinstance(response, BaseModel):
```

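Finally, a usage sketch for the transcription path (illustrative; assumes a local `speech.mp3` and that `litellm.transcription` routes through the `audio_transcriptions` method shown above):

```python
import litellm

litellm.return_response_headers = True

# Pass an open binary file handle, as with the OpenAI SDK.
with open("speech.mp3", "rb") as audio_file:
    transcript = litellm.transcription(
        model="whisper-1",
        file=audio_file,
    )
print(transcript)
```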