Skip to content

Commit e489a5c

Browse files
authored
server: support OAI /v1/audio/transcriptions API (#21863)
* server: support OAI /v1/audio/transcriptions API * address autoreview comments * correct default response_format value
1 parent e21cdc1 commit e489a5c

9 files changed

Lines changed: 194 additions & 38 deletions

File tree

tools/server/server-common.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,6 +1433,60 @@ json convert_responses_to_chatcmpl(const json & response_body) {
14331433
return chatcmpl_body;
14341434
}
14351435

1436+
json convert_transcriptions_to_chatcmpl(
1437+
const json & inp_body,
1438+
const std::map<std::string, raw_buffer> & in_files,
1439+
std::vector<raw_buffer> & out_files) {
1440+
// TODO @ngxson : this function may need to be improved in the future
1441+
// handle input files
1442+
out_files.clear();
1443+
auto it = in_files.find("file");
1444+
if (it != in_files.end()) {
1445+
out_files.push_back(it->second);
1446+
} else {
1447+
throw std::invalid_argument("No input file found for transcription");
1448+
}
1449+
1450+
// handle input data
1451+
std::string prompt = json_value(inp_body, "prompt", std::string());
1452+
std::string language = json_value(inp_body, "language", std::string());
1453+
std::string response_format = json_value(inp_body, "response_format", std::string("json"));
1454+
if (response_format != "json") {
1455+
throw std::invalid_argument("Only 'json' response_format is supported for transcription");
1456+
}
1457+
if (prompt.empty()) {
1458+
prompt = "Transcribe audio to text";
1459+
}
1460+
if (!language.empty()) {
1461+
prompt += string_format(" (language: %s)", language.c_str());
1462+
}
1463+
prompt += mtmd_default_marker();
1464+
1465+
json chatcmpl_body = inp_body; // copy all fields
1466+
chatcmpl_body["messages"] = json::array({
1467+
{
1468+
{"role", "user"},
1469+
{"content", prompt},
1470+
},
1471+
});
1472+
1473+
// because input from form-data, everything is string, we need to correct the types here
1474+
std::string stream = json_value(inp_body, "stream", std::string("false"));
1475+
chatcmpl_body["stream"] = stream == "true";
1476+
1477+
if (inp_body.contains("max_tokens")) {
1478+
std::string inp = inp_body["max_tokens"].get<std::string>();
1479+
chatcmpl_body["max_tokens"] = std::stoul(inp);
1480+
}
1481+
1482+
if (inp_body.contains("temperature")) {
1483+
std::string inp = inp_body["temperature"].get<std::string>();
1484+
chatcmpl_body["temperature"] = std::stof(inp);
1485+
}
1486+
1487+
return chatcmpl_body;
1488+
}
1489+
14361490
json convert_anthropic_to_oai(const json & body) {
14371491
json oai_body;
14381492

tools/server/server-common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,12 @@ json oaicompat_chat_params_parse(
305305
// convert OpenAI Responses API format to OpenAI Chat Completions API format
306306
json convert_responses_to_chatcmpl(const json & body);
307307

308+
// convert OpenAI transcriptions API format to OpenAI Chat Completions API format
309+
// body      : text fields of the transcription request as a JSON object
// in_files  : uploaded files keyed by form field name; an entry named "file" is required
// out_files : cleared, then receives the content of the "file" entry
// throws std::invalid_argument if no "file" entry exists or if response_format is not "json"
json convert_transcriptions_to_chatcmpl(
310+
const json & body,
311+
const std::map<std::string, raw_buffer> & in_files,
312+
std::vector<raw_buffer> & out_files);
313+
308314
// convert Anthropic Messages API format to OpenAI Chat Completions API format
309315
json convert_anthropic_to_oai(const json & body);
310316

tools/server/server-context.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3732,6 +3732,33 @@ void server_routes::init_routes() {
37323732
TASK_RESPONSE_TYPE_OAI_RESP);
37333733
};
37343734

3735+
// Handler for the OAI transcriptions routes: rewrites the request as a chat
// completion carrying the uploaded audio, then runs it through the regular
// completion path with the ASR response format.
this->post_transcriptions_oai = [this](const server_http_req & req) {
    auto res = create_response();

    // transcription needs a multimodal model with audio input enabled
    const bool audio_supported = meta->has_mtmd && meta->chat_params.allow_audio;
    if (!audio_supported) {
        res->error(format_error_response("The current model does not support audio input.", ERROR_TYPE_NOT_SUPPORTED));
        return res;
    }

    // req.body holds the request's text fields as JSON, req.files the uploaded buffers
    std::vector<raw_buffer> audio_files;
    const json form_fields = json::parse(req.body);
    json chatcmpl_request = convert_transcriptions_to_chatcmpl(form_fields, req.files, audio_files);

    SRV_DBG("%s\n", "Request converted: OpenAI Transcriptions -> OpenAI Chat Completions");
    SRV_DBG("converted request: %s\n", chatcmpl_request.dump().c_str());

    json parsed_params = oaicompat_chat_params_parse(chatcmpl_request, meta->chat_params, audio_files);

    return handle_completions_impl(
        req,
        SERVER_TASK_TYPE_COMPLETION,
        parsed_params,
        audio_files,
        TASK_RESPONSE_TYPE_OAI_ASR);
};
3761+
37353762
this->post_anthropic_messages = [this](const server_http_req & req) {
37363763
auto res = create_response();
37373764
std::vector<raw_buffer> files;

tools/server/server-context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ struct server_routes {
111111
server_http_context::handler_t post_completions_oai;
112112
server_http_context::handler_t post_chat_completions;
113113
server_http_context::handler_t post_responses_oai;
114+
server_http_context::handler_t post_transcriptions_oai;
114115
server_http_context::handler_t post_anthropic_messages;
115116
server_http_context::handler_t post_anthropic_count_tokens;
116117
server_http_context::handler_t post_apply_template;

tools/server/server-http.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ void server_http_context::get(const std::string & path, const server_http_contex
428428
req.path,
429429
build_query_string(req),
430430
req.body,
431+
{},
431432
req.is_connection_closed
432433
});
433434
server_http_res_ptr response = handler(*request);
@@ -437,12 +438,39 @@ void server_http_context::get(const std::string & path, const server_http_contex
437438

438439
void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
439440
pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
441+
std::string body = req.body;
442+
std::map<std::string, raw_buffer> files;
443+
444+
if (req.is_multipart_form_data()) {
445+
// translate text fields to a JSON object and use it as the body
446+
json form_json = json::object();
447+
for (const auto & [key, field] : req.form.fields) {
448+
if (form_json.contains(key)) {
449+
// if the key already exists, convert it to an array
450+
if (!form_json[key].is_array()) {
451+
json existing_value = form_json[key];
452+
form_json[key] = json::array({existing_value});
453+
}
454+
form_json[key].push_back(field.content);
455+
} else {
456+
form_json[key] = field.content;
457+
}
458+
}
459+
body = form_json.dump();
460+
461+
// populate files from multipart form
462+
for (const auto & [key, file] : req.form.files) {
463+
files[key] = raw_buffer(file.content.begin(), file.content.end());
464+
}
465+
}
466+
440467
server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
441468
get_params(req),
442469
get_headers(req),
443470
req.path,
444471
build_query_string(req),
445-
req.body,
472+
body,
473+
std::move(files),
446474
req.is_connection_closed
447475
});
448476
server_http_res_ptr response = handler(*request);

tools/server/server-http.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <map>
66
#include <string>
77
#include <thread>
8+
#include <vector>
9+
#include <cstdint>
810

911
struct common_params;
1012

@@ -32,13 +34,15 @@ struct server_http_res {
3234
// unique pointer, used by set_chunked_content_provider
3335
// httplib requires the stream provider to be stored in heap
3436
using server_http_res_ptr = std::unique_ptr<server_http_res>;
37+
using raw_buffer = std::vector<uint8_t>;
3538

3639
struct server_http_req {
3740
std::map<std::string, std::string> params; // path_params + query_params
3841
std::map<std::string, std::string> headers; // used by MCP proxy
3942
std::string path;
4043
std::string query_string; // query parameters string (e.g. "action=save")
4144
std::string body;
45+
std::map<std::string, raw_buffer> files; // used for file uploads (form data)
4246
const std::function<bool()> & should_stop;
4347

4448
std::string get_param(const std::string & key, const std::string & def = "") const {

tools/server/server-task.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,8 @@ json server_task_result_cmpl_final::to_json() {
725725
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
726726
case TASK_RESPONSE_TYPE_OAI_RESP:
727727
return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
728+
case TASK_RESPONSE_TYPE_OAI_ASR:
729+
return to_json_oaicompat_asr();
728730
case TASK_RESPONSE_TYPE_ANTHROPIC:
729731
return stream ? to_json_anthropic_stream() : to_json_anthropic();
730732
default:
@@ -1102,6 +1104,21 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
11021104
return server_sent_events;
11031105
}
11041106

1107+
// Build the "transcript.text.done" event for a finished transcription:
// the full text plus token usage. Keys are inserted in the same order as
// they appear in the emitted object.
json server_task_result_cmpl_final::to_json_oaicompat_asr() {
    json cache_details = json::object();
    cache_details["cached_tokens"] = n_prompt_tokens_cache;

    json usage = json::object();
    usage["type"]                 = "tokens";
    usage["input_tokens"]         = n_prompt_tokens;
    usage["output_tokens"]        = n_decoded;
    usage["total_tokens"]         = n_decoded + n_prompt_tokens;
    usage["input_tokens_details"] = cache_details;

    json done_event = json::object();
    done_event["type"]  = "transcript.text.done";
    done_event["text"]  = content;
    done_event["usage"] = usage;
    return done_event;
}
1121+
11051122
json server_task_result_cmpl_final::to_json_anthropic() {
11061123
std::string stop_reason = "max_tokens";
11071124
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@@ -1400,6 +1417,8 @@ json server_task_result_cmpl_partial::to_json() {
14001417
return to_json_oaicompat_chat();
14011418
case TASK_RESPONSE_TYPE_OAI_RESP:
14021419
return to_json_oaicompat_resp();
1420+
case TASK_RESPONSE_TYPE_OAI_ASR:
1421+
return to_json_oaicompat_asr();
14031422
case TASK_RESPONSE_TYPE_ANTHROPIC:
14041423
return to_json_anthropic();
14051424
default:
@@ -1650,6 +1669,14 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
16501669
return events;
16511670
}
16521671

1672+
// Build the "transcript.text.delta" event carrying this partial result's text.
json server_task_result_cmpl_partial::to_json_oaicompat_asr() {
    json delta_event = json::object();
    delta_event["type"]  = "transcript.text.delta";
    delta_event["delta"] = content;
    return delta_event;
}
1679+
16531680
json server_task_result_cmpl_partial::to_json_anthropic() {
16541681
json events = json::array();
16551682
bool first = (n_decoded == 1);

tools/server/server-task.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ enum task_response_type {
3434
TASK_RESPONSE_TYPE_OAI_CHAT,
3535
TASK_RESPONSE_TYPE_OAI_CMPL,
3636
TASK_RESPONSE_TYPE_OAI_RESP,
37+
TASK_RESPONSE_TYPE_OAI_ASR, // transcriptions API
3738
TASK_RESPONSE_TYPE_OAI_EMBD,
3839
TASK_RESPONSE_TYPE_ANTHROPIC,
3940
};
@@ -401,6 +402,8 @@ struct server_task_result_cmpl_final : server_task_result {
401402

402403
json to_json_oaicompat_resp_stream();
403404

405+
json to_json_oaicompat_asr();
406+
404407
json to_json_anthropic();
405408

406409
json to_json_anthropic_stream();
@@ -457,6 +460,8 @@ struct server_task_result_cmpl_partial : server_task_result {
457460

458461
json to_json_oaicompat_resp();
459462

463+
json to_json_oaicompat_asr();
464+
460465
json to_json_anthropic();
461466
};
462467

tools/server/server.cpp

Lines changed: 41 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ int main(int argc, char ** argv) {
145145
routes.post_completions_oai = models_routes->proxy_post;
146146
routes.post_chat_completions = models_routes->proxy_post;
147147
routes.post_responses_oai = models_routes->proxy_post;
148+
routes.post_transcriptions_oai = models_routes->proxy_post;
148149
routes.post_anthropic_messages = models_routes->proxy_post;
149150
routes.post_anthropic_count_tokens = models_routes->proxy_post;
150151
routes.post_infill = models_routes->proxy_post;
@@ -160,48 +161,51 @@ int main(int argc, char ** argv) {
160161
routes.post_slots = models_routes->proxy_post;
161162

162163
// custom routes for router
163-
routes.get_props = models_routes->get_router_props;
164-
routes.get_models = models_routes->get_router_models;
165-
ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
166-
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
164+
routes.get_props = models_routes->get_router_props;
165+
routes.get_models = models_routes->get_router_models;
166+
167+
ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
168+
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
167169
}
168170

169-
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
170-
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
171-
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
172-
ctx_http.get ("/props", ex_wrapper(routes.get_props));
173-
ctx_http.post("/props", ex_wrapper(routes.post_props));
174-
ctx_http.post("/api/show", ex_wrapper(routes.get_api_show));
175-
ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
176-
ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
177-
ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
178-
ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy
179-
ctx_http.post("/completions", ex_wrapper(routes.post_completions));
180-
ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai));
181-
ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
182-
ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
183-
ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
184-
ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
185-
ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai));
186-
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
171+
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
172+
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
173+
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
174+
ctx_http.get ("/props", ex_wrapper(routes.get_props));
175+
ctx_http.post("/props", ex_wrapper(routes.post_props));
176+
ctx_http.post("/api/show", ex_wrapper(routes.get_api_show));
177+
ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
178+
ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
179+
ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
180+
ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy
181+
ctx_http.post("/completions", ex_wrapper(routes.post_completions));
182+
ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai));
183+
ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
184+
ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
185+
ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
186+
ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
187+
ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai));
188+
ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
189+
ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
190+
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
187191
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
188-
ctx_http.post("/infill", ex_wrapper(routes.post_infill));
189-
ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy
190-
ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings));
191-
ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai));
192-
ctx_http.post("/rerank", ex_wrapper(routes.post_rerank));
193-
ctx_http.post("/reranking", ex_wrapper(routes.post_rerank));
194-
ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank));
195-
ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank));
196-
ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize));
197-
ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize));
198-
ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template));
192+
ctx_http.post("/infill", ex_wrapper(routes.post_infill));
193+
ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy
194+
ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings));
195+
ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai));
196+
ctx_http.post("/rerank", ex_wrapper(routes.post_rerank));
197+
ctx_http.post("/reranking", ex_wrapper(routes.post_rerank));
198+
ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank));
199+
ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank));
200+
ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize));
201+
ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize));
202+
ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template));
199203
// LoRA adapters hotswap
200-
ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters));
201-
ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters));
204+
ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters));
205+
ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters));
202206
// Save & load slots
203-
ctx_http.get ("/slots", ex_wrapper(routes.get_slots));
204-
ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots));
207+
ctx_http.get ("/slots", ex_wrapper(routes.get_slots));
208+
ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots));
205209
// CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
206210
if (params.webui_mcp_proxy) {
207211
SRV_WRN("%s", "-----------------\n");

0 commit comments

Comments
 (0)