-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathcalling_the_endpoint_asynchronously.py
More file actions
46 lines (37 loc) · 1.32 KB
/
calling_the_endpoint_asynchronously.py
File metadata and controls
46 lines (37 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
from time import sleep
from verda import VerdaClient
from verda.inference_client import AsyncStatus
# Configuration - replace with your deployment name
DEPLOYMENT_NAME = os.environ.get('VERDA_DEPLOYMENT_NAME')

# Get client secret and id from environment variables
CLIENT_ID = os.environ.get('VERDA_CLIENT_ID')
CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET')
INFERENCE_KEY = os.environ.get('VERDA_INFERENCE_KEY')

# Fail fast with a clear message when required settings are absent,
# instead of passing None into the client and failing with a cryptic
# error deep inside the SDK.
_required = {
    'VERDA_DEPLOYMENT_NAME': DEPLOYMENT_NAME,
    'VERDA_CLIENT_ID': CLIENT_ID,
    'VERDA_CLIENT_SECRET': CLIENT_SECRET,
    'VERDA_INFERENCE_KEY': INFERENCE_KEY,
}
_missing = [name for name, value in _required.items() if not value]
if _missing:
    raise SystemExit(
        f"Missing required environment variables: {', '.join(_missing)}"
    )

# Verda client instance
verda = VerdaClient(
    CLIENT_ID,
    CLIENT_SECRET,
    inference_key=INFERENCE_KEY,
)

# Get the deployment
deployment = verda.containers.get_deployment_by_name(DEPLOYMENT_NAME)

# Make an asynchronous request to the endpoint.
# This example demonstrates calling a SGLang deployment which serves LLMs
# using an OpenAI-compatible API format.
data = {
    'model': 'deepseek-ai/deepseek-llm-7b-chat',
    'prompt': 'Is consciousness fundamentally computational, or is there something more to subjective experience that cannot be reduced to information processing?',
    'max_tokens': 128,
    'temperature': 0.7,
    'top_p': 0.9,
}
header = {'Content-Type': 'application/json'}
response = deployment.run(
    data=data,
    path='v1/completions',
    headers=header,
)

# Poll until the async job reaches Completed, printing progress as we go.
# The wait is bounded: the original loop spun unconditionally, so a job
# that never reaches Completed (e.g. one that fails server-side) would
# hang this script forever.
POLL_INTERVAL_SECONDS = 1
MAX_POLLS = 300  # give up after ~5 minutes of polling
for _ in range(MAX_POLLS):
    status = response.status()
    if status == AsyncStatus.Completed:
        break
    # NOTE(review): if AsyncStatus exposes a terminal failure state
    # (e.g. Failed), it should be checked here to abort early instead of
    # polling until the timeout — confirm against the verda SDK docs.
    print(response.status_json())
    sleep(POLL_INTERVAL_SECONDS)
else:
    # for/else: reached only when the loop exhausted MAX_POLLS without break
    raise SystemExit('Timed out waiting for the async inference job to complete.')

print(response.output())