Skip to content

Commit e164e99

Browse files
committed
examples : add parakeet-server example
This commit adds an HTTP server for parakeet, similar to whisper-server. The shared functionality has been extracted into examples/server-common.h to avoid code duplication.
1 parent 6edfe60 commit e164e99

8 files changed

Lines changed: 1002 additions & 343 deletions

File tree

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ else()
109109
add_subdirectory(vad-speech-segments)
110110
add_subdirectory(parakeet-cli)
111111
add_subdirectory(parakeet-quantize)
112+
add_subdirectory(parakeet-server)
112113
if (WHISPER_SDL2)
113114
add_subdirectory(stream)
114115
add_subdirectory(command)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Build script for the parakeet-server example executable.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(TARGET parakeet-server)

# The server is built from its own source plus the shared server helpers
# that live one directory up.
add_executable(${TARGET}
    parakeet-server.cpp
    ../server-common.cpp)

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE common json_cpp parakeet ${CMAKE_THREAD_LIBS_INIT})

# Winsock is required for the socket code on Windows.
if (WIN32)
    target_link_libraries(${TARGET} PRIVATE ws2_32)
endif()

install(TARGETS ${TARGET} RUNTIME)

examples/parakeet-server/parakeet-server.cpp

Lines changed: 427 additions & 0 deletions
Large diffs are not rendered by default.

examples/server-common.cpp

Lines changed: 356 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,356 @@
1+
#include "server-common.h"
#include "common-whisper.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <memory>
#include <random>
#include <sstream>
14+
15+
#if defined (_WIN32)
16+
#include <windows.h>
17+
#endif
18+
19+
// Names of the response formats referred to by the server code.
const std::string json_format {"json"};
const std::string text_format {"text"};
const std::string srt_format  {"srt"};
const std::string vjson_format{"verbose_json"};
const std::string vtt_format  {"vtt"};
24+
25+
namespace {
    // Callback invoked on the first termination request; installed by
    // setup_signal_handler().
    std::function<void()> g_shutdown_callback;

    // Set once the first termination request has been seen.
    std::atomic_flag g_is_terminating = ATOMIC_FLAG_INIT;

    // Handles a termination request.
    //
    // The first request runs the registered shutdown callback (if any).
    // Any later request prints a message and exits the process with status 1.
    //
    // NOTE(review): fprintf/exit are not on the POSIX async-signal-safe list —
    // confirm this is acceptable for the way the handler is installed.
    void signal_handler(int /*signal*/) {
        const bool already_terminating = g_is_terminating.test_and_set();
        if (already_terminating) {
            std::fprintf(stderr, "Received second interrupt, terminating immediately.\n");
            std::exit(1);
        }

        if (!g_shutdown_callback) {
            return;
        }
        g_shutdown_callback();
    }
}
39+
40+
// Interprets a textual flag as a boolean.
//
// Returns true only for the exact (case-sensitive) strings "true", "1",
// "yes" and "y"; every other input, including the empty string, is false.
bool parse_str_to_bool(const std::string & s) {
    static const char * const truthy_values[] = {"true", "1", "yes", "y"};

    for (const char * candidate : truthy_values) {
        if (s == candidate) {
            return true;
        }
    }
    return false;
}
46+
47+
// Checks that an `ffmpeg` executable can be launched through the shell.
//
// Runs `ffmpeg -version` via system(). When the command succeeds a
// confirmation is printed and true is returned. When it fails an installation
// hint is printed and the process is terminated with a failure exit status,
// so the function never returns false.
bool check_ffmpeg_availability() {
    const int result = system("ffmpeg -version");
    if (result != 0) {
        std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed "
                  << "and that its executable is included in your system's PATH. "
                  << std::endl;
        // A missing prerequisite is an error: report it through the exit status
        // so that scripts launching the server can detect the failure.
        exit(EXIT_FAILURE);
    }

    std::cout << "ffmpeg is available." << std::endl;
    return true;
}
58+
59+
// Builds a file name of the form
//   <path><separator><prefix>-<YYYYmmdd-HHMMSS>-<random><extension>
// using the current local time and a random integer in [0, 1e9].
//
// Only the name is produced; no file is created.
std::string generate_temp_filename(const std::string & path, const std::string & prefix, const std::string & extension) {
    const auto now        = std::chrono::system_clock::now();
    const auto now_time_t = std::chrono::system_clock::to_time_t(now);

    // Use the re-entrant localtime variants: std::localtime returns a pointer
    // to shared static storage and may return null, which would be
    // dereferenced by std::put_time below.
    std::tm local_tm{};
#if defined (_WIN32)
    localtime_s(&local_tm, &now_time_t);
#else
    localtime_r(&now_time_t, &local_tm);
#endif

    // One generator per thread so concurrent callers do not race on its state.
    thread_local std::mt19937 rng{std::random_device{}()};
    std::uniform_int_distribution<long long> dist(0, 1000000000LL);

    // preferred_separator is wchar_t on Windows; streaming a wchar_t into a
    // narrow stream prints its integer value (before C++20), so narrow it first.
    const char separator = static_cast<char>(std::filesystem::path::preferred_separator);

    std::stringstream ss;
    ss << path
       << separator
       << prefix
       << "-"
       << std::put_time(&local_tm, "%Y%m%d-%H%M%S")
       << "-"
       << dist(rng)
       << extension;

    return ss.str();
}
78+
79+
// Converts the file at `temp_filename` in place to 16 kHz, 16-bit PCM WAV by
// invoking the `ffmpeg` command line tool through the shell.
//
// ffmpeg writes to "<temp_filename>_temp.wav"; on success the original file is
// removed and the converted file is renamed to `temp_filename`.
//
// @param temp_filename  path of the file to convert (also the output path)
// @param error_resp     receives a JSON error message when false is returned
// @param stereo         true -> 2 output channels, false -> 1 channel
// @return true on success, false if ffmpeg, the removal or the rename failed
//
// NOTE(review): temp_filename is interpolated into a shell command line with
// only surrounding double quotes — this assumes the name is generated by the
// server and never contains quotes or other shell metacharacters; confirm.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp, bool stereo) {
    const std::string converted_filename_temp = temp_filename + "_temp.wav";

    std::ostringstream cmd_stream;
    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac " << (stereo ? 2 : 1)
               << " -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
    const std::string cmd = cmd_stream.str();

    if (std::system(cmd.c_str()) != 0) {
        // ffmpeg may have left a partial output file behind; best-effort cleanup.
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
        return false;
    }

    // The original is removed before the rename so that the rename also works
    // on platforms where rename() refuses to overwrite an existing file.
    if (std::remove(temp_filename.c_str()) != 0) {
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
        return false;
    }

    if (std::rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
        return false;
    }

    return true;
}
102+
103+
// Registers `shutdown_callback` as the action to run on the first termination
// request and installs the platform handler that dispatches to
// signal_handler(): SIGINT and SIGTERM on Unix-like systems, the console
// Ctrl+C event on Windows.
void setup_signal_handler(std::function<void()> shutdown_callback) {
    g_shutdown_callback = std::move(shutdown_callback);

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction action;
    action.sa_handler = signal_handler;
    sigemptyset(&action.sa_mask);
    action.sa_flags = 0;

    const int handled_signals[] = {SIGINT, SIGTERM};
    for (const int signum : handled_signals) {
        sigaction(signum, &action, NULL);
    }
#elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        if (ctrl_type != CTRL_C_EVENT) {
            return false;
        }
        signal_handler(SIGINT);
        return true;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
}
120+
121+
static std::string ms_to_timestamp(int64_t t_ms, bool comma = false) {
122+
// to_timestamp expects centiseconds, our adapter uses milliseconds
123+
return ::to_timestamp(t_ms / 10, comma);
124+
}
125+
126+
127+
std::string format_text(const transcription_result & result) {
128+
std::stringstream ss;
129+
const int n_segments = result.n_segments();
130+
for (int i = 0; i < n_segments; ++i) {
131+
auto seg = result.get_segment(i);
132+
auto speaker = result.get_speaker ? result.get_speaker(i) : std::string();
133+
ss << speaker << seg.text << "\n";
134+
}
135+
return ss.str();
136+
}
137+
138+
std::string format_srt(const transcription_result & result, int offset_n) {
139+
std::stringstream ss;
140+
const int n_segments = result.n_segments();
141+
for (int i = 0; i < n_segments; ++i) {
142+
auto seg = result.get_segment(i);
143+
auto speaker = result.get_speaker ? result.get_speaker(i) : std::string();
144+
145+
ss << i + 1 + offset_n << "\n";
146+
ss << ms_to_timestamp(seg.t0, true) << " --> " << ms_to_timestamp(seg.t1, true) << "\n";
147+
ss << speaker << seg.text << "\n\n";
148+
}
149+
return ss.str();
150+
}
151+
152+
std::string format_vtt(const transcription_result & result) {
153+
std::stringstream ss;
154+
ss << "WEBVTT\n\n";
155+
156+
const int n_segments = result.n_segments();
157+
for (int i = 0; i < n_segments; ++i) {
158+
auto seg = result.get_segment(i);
159+
std::string speaker_tag;
160+
161+
if (result.get_speaker) {
162+
auto speaker_id = result.get_speaker(i);
163+
if (!speaker_id.empty()) {
164+
speaker_tag = "<v Speaker" + speaker_id + ">";
165+
}
166+
}
167+
168+
ss << ms_to_timestamp(seg.t0) << " --> " << ms_to_timestamp(seg.t1) << "\n";
169+
ss << speaker_tag << seg.text << "\n\n";
170+
}
171+
return ss.str();
172+
}
173+
174+
std::string format_json(const transcription_result & result) {
175+
std::string text = format_text(result);
176+
json jres = json{{"text", text}};
177+
return jres.dump(-1, ' ', false, json::error_handler_t::replace);
178+
}
179+
180+
std::string format_verbose_json(
181+
const transcription_result & result,
182+
float temperature,
183+
float duration,
184+
bool no_timestamps,
185+
bool token_timestamps) {
186+
std::string text = format_text(result);
187+
std::string task = result.get_task ? result.get_task() : std::string("transcribe");
188+
std::string language = result.get_language ? result.get_language() : std::string();
189+
190+
json jres = json{
191+
{"task", task},
192+
{"language", language},
193+
{"duration", duration},
194+
{"text", text},
195+
{"segments", json::array()}
196+
};
197+
198+
// Merge language probability data into the top-level response.
199+
// Adapters return a json object whose keys are merged directly, allowing
200+
// model-specific fields (e.g. whisper's detected_language) to appear at
201+
// the top level alongside the standard language_probabilities map.
202+
if (result.get_language_probabilities) {
203+
json lang_data = result.get_language_probabilities();
204+
for (auto & [key, val] : lang_data.items()) {
205+
jres[key] = val;
206+
}
207+
}
208+
209+
const int n_segments = result.n_segments();
210+
for (int i = 0; i < n_segments; ++i) {
211+
auto seg = result.get_segment(i);
212+
213+
json segment = json{
214+
{"id", i},
215+
{"text", seg.text},
216+
};
217+
218+
if (!no_timestamps) {
219+
segment["start"] = seg.t0 * 0.001f; // ms -> seconds
220+
segment["end"] = seg.t1 * 0.001f;
221+
}
222+
223+
if (result.get_speaker) {
224+
auto speaker_id = result.get_speaker(i);
225+
if (!speaker_id.empty()) {
226+
segment["speaker"] = speaker_id;
227+
}
228+
}
229+
230+
// Build word-level tokens by merging partial UTF-8 tokens
231+
std::vector<json> words;
232+
int n_tokens = (int)seg.tokens.size();
233+
float total_logprob = 0.0f;
234+
235+
for (int j = 0; j < n_tokens; ++j) {
236+
auto & tok = seg.tokens[j];
237+
238+
// Merge trailing partial UTF-8 bytes into complete words
239+
std::string word_text = tok.text;
240+
int64_t word_t1 = tok.t1;
241+
242+
while (j + 1 < n_tokens) {
243+
int trailing = utf8_trailing_bytes_needed(word_text);
244+
if (trailing <= 0) break;
245+
246+
++j;
247+
auto & next_tok = seg.tokens[j];
248+
word_text += next_tok.text;
249+
if (next_tok.t1 > word_t1) {
250+
word_t1 = next_tok.t1;
251+
}
252+
}
253+
254+
json word = json{{"word", word_text}};
255+
if (!no_timestamps && token_timestamps) {
256+
word["start"] = tok.t0 * 0.001f;
257+
word["end"] = word_t1 * 0.001f;
258+
}
259+
word["probability"] = tok.prob;
260+
261+
// Approximate logprob from probability
262+
float logprob = tok.prob > 0.0f ? std::log(tok.prob + 1e-10f) : -1e10f;
263+
total_logprob += logprob;
264+
265+
words.push_back(word);
266+
}
267+
268+
segment["words"] = words;
269+
segment["tokens"] = json::array();
270+
for (auto & tok : seg.tokens) {
271+
segment["tokens"].push_back(tok.id);
272+
}
273+
274+
segment["temperature"] = temperature;
275+
int n_word_tokens = (int)seg.tokens.size();
276+
segment["avg_logprob"] = n_word_tokens > 0 ? total_logprob / n_word_tokens : 0.0f;
277+
segment["no_speech_prob"] = seg.no_speech_prob;
278+
279+
jres["segments"].push_back(segment);
280+
}
281+
282+
return jres.dump(-1, ' ', false, json::error_handler_t::replace);
283+
}
284+
285+
// Registers the routes, handlers and timeouts shared by the example servers.
//
// Routes (all prefixed with sparams.request_path):
//   GET  "/"                      -> serves `default_content` as text/html
//   OPTIONS <inference_path>      -> empty response (CORS preflight)
//   POST <inference_path>         -> `inference_handler`
//   POST "/load"                  -> `load_handler` (only when it is set)
//   GET  "/health"                -> {"status":"ok"} when `state` is
//                                    SERVER_STATE_READY, otherwise 503 with
//                                    {"status":"loading model"}
//
// Also installs default headers (including `server_name`), an exception
// handler (500), an error handler (400 / 404) and the read/write timeouts
// from `sparams`.
//
// Lifetime: `state` is captured by reference by the health handler and must
// outlive `svr`. `default_content` is copied into its handler, so the caller
// may pass a temporary.
void setup_server_common(
        httplib::Server & svr,
        const server_params & sparams,
        std::atomic<server_state> & state,
        std::function<void(const httplib::Request &, httplib::Response &)> load_handler,
        std::function<void(const httplib::Request &, httplib::Response &)> inference_handler,
        const std::string & default_content,
        const std::string & server_name) {

    svr.set_default_headers({
        {"Server", server_name},
        {"Access-Control-Allow-Origin", "*"},
        {"Access-Control-Allow-Headers", "content-type, authorization"}
    });

    // Default index page.
    // The handler is stored inside the server and runs after this function has
    // returned, so it keeps its own copy of the page instead of a reference to
    // the caller's argument.
    svr.Get(sparams.request_path + "/", [default_content](const httplib::Request &, httplib::Response & res) {
        res.set_content(default_content, "text/html");
    });

    // CORS preflight
    svr.Options(sparams.request_path + sparams.inference_path,
        [](const httplib::Request &, httplib::Response &) {});

    // Inference endpoint
    svr.Post(sparams.request_path + sparams.inference_path, inference_handler);

    // Model reload endpoint
    if (load_handler) {
        svr.Post(sparams.request_path + "/load", load_handler);
    }

    // Health check
    svr.Get(sparams.request_path + "/health", [&state](const httplib::Request &, httplib::Response & res) {
        const server_state current_state = state.load();
        if (current_state == SERVER_STATE_READY) {
            res.set_content("{\"status\":\"ok\"}", "application/json");
        } else {
            res.set_content("{\"status\":\"loading model\"}", "application/json");
            res.status = 503;
        }
    });

    // Exception handler
    svr.set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
        const char fmt[] = "500 Internal Server Error\n%s";
        char buf[BUFSIZ];
        try {
            std::rethrow_exception(std::move(ep));
        } catch (const std::exception & e) {
            snprintf(buf, sizeof(buf), fmt, e.what());
        } catch (...) {
            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
        }
        res.set_content(buf, "text/plain");
        res.status = 500;
    });

    // Error handler
    svr.set_error_handler([](const httplib::Request & req, httplib::Response & res) {
        if (res.status == 400) {
            res.set_content("Invalid request", "text/plain");
        } else if (res.status != 500) {
            res.set_content("File Not Found (" + req.path + ")", "text/plain");
            res.status = 404;
        }
    });

    svr.set_read_timeout(sparams.read_timeout);
    svr.set_write_timeout(sparams.write_timeout);
}

0 commit comments

Comments
 (0)