|
| 1 | +#include "server-common.h" |
| 2 | +#include "common-whisper.h" |
| 3 | + |
| 4 | +#include <cstdio> |
| 5 | +#include <csignal> |
| 6 | +#include <random> |
| 7 | +#include <sstream> |
| 8 | +#include <memory> |
| 9 | +#include <fstream> |
| 10 | +#include <algorithm> |
| 11 | +#include <cmath> |
| 12 | +#include <cstdlib> |
| 13 | +#include <filesystem> |
| 14 | + |
| 15 | +#if defined (_WIN32) |
| 16 | +#include <windows.h> |
| 17 | +#endif |
| 18 | + |
// Names of the response formats referenced by the server
// (presumably matched against a request's response-format field — confirm against callers).
const std::string json_format{"json"};
const std::string text_format{"text"};
const std::string srt_format{"srt"};
const std::string vjson_format{"verbose_json"};
const std::string vtt_format{"vtt"};
| 24 | + |
namespace {
    // Callback run on the first interrupt; assigned by setup_signal_handler().
    std::function<void()> g_shutdown_callback;

    // Set by the first interrupt so that a second one can be told apart.
    std::atomic_flag g_is_terminating = ATOMIC_FLAG_INIT;

    // Interrupt handler.
    //
    // First delivery: marks the process as terminating and invokes the
    // registered shutdown callback (if any).
    // Second delivery: prints a notice and terminates the process immediately
    // with exit status 1.
    void signal_handler(int /*signal*/) {
        if (g_is_terminating.test_and_set()) {
            // Best-effort notice; stderr is unbuffered so nothing needs flushing.
            fprintf(stderr, "Received second interrupt, terminating immediately.\n");
            // std::_Exit terminates without running atexit handlers or static
            // destructors. exit() would run them from inside the handler, which
            // is not async-signal-safe and can deadlock or crash.
            std::_Exit(1);
        }
        if (g_shutdown_callback) {
            g_shutdown_callback();
        }
    }
}
| 39 | + |
// Interprets a textual flag.
// Returns true only for the exact (case-sensitive) spellings
// "true", "1", "yes" and "y"; every other input yields false.
bool parse_str_to_bool(const std::string & s) {
    static const char * const truthy[] = {"true", "1", "yes", "y"};
    for (const char * candidate : truthy) {
        if (s == candidate) {
            return true;
        }
    }
    return false;
}
| 46 | + |
// Verifies that the `ffmpeg` executable can be launched by running
// `ffmpeg -version` through the system shell.
//
// Returns true when the command succeeds. When it fails, an explanatory
// message is written to stderr and the process is terminated with a
// failure exit status (this function does not return in that case).
bool check_ffmpeg_availability() {
    const int result = std::system("ffmpeg -version");
    if (result != 0) {
        std::cerr << "ffmpeg is not found. Please ensure that ffmpeg is installed "
                  << "and that its executable is included in your system's PATH. "
                  << std::endl;
        // A missing dependency is an error: report a non-zero status so that
        // scripts and service managers can detect the failed start-up.
        std::exit(EXIT_FAILURE);
    }
    std::cout << "ffmpeg is available." << std::endl;
    return true;
}
| 58 | + |
// Builds a file name of the form
//   <path><separator><prefix>-<YYYYmmdd-HHMMSS>-<random><extension>
// using the current local time and a random integer in [0, 1e9].
//
// Only the name is produced; no file is created and uniqueness is not
// guaranteed (it relies on the timestamp plus the random suffix).
std::string generate_temp_filename(const std::string & path, const std::string & prefix, const std::string & extension) {
    const std::time_t now_time_t = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());

    // std::localtime returns a pointer to shared static storage, which races
    // when several threads build names at once; use the re-entrant variants.
    std::tm local_tm{};
#if defined (_WIN32)
    localtime_s(&local_tm, &now_time_t);
#else
    localtime_r(&now_time_t, &local_tm);
#endif

    // One generator per thread: a single shared engine would be mutated
    // concurrently without synchronization.
    thread_local std::mt19937 rng{std::random_device{}()};
    std::uniform_int_distribution<long long> dist(0, 1000000000LL);

    std::stringstream ss;
    ss << path
       // preferred_separator is wchar_t on Windows; streaming it into a narrow
       // stream would print its numeric value instead of the character.
       << static_cast<char>(std::filesystem::path::preferred_separator)
       << prefix
       << "-"
       << std::put_time(&local_tm, "%Y%m%d-%H%M%S")
       << "-"
       << dist(rng)
       << extension;

    return ss.str();
}
| 78 | + |
// Converts the file at `temp_filename` in place to 16 kHz, 16-bit PCM WAV
// (mono, or stereo when `stereo` is true) by invoking ffmpeg through the
// system shell.
//
// ffmpeg writes to "<temp_filename>_temp.wav"; on success the original file
// is removed and the converted file is renamed over it.
//
// Returns true on success. On failure returns false, stores a JSON error
// string in `error_resp`, and removes the intermediate file (best effort).
//
// NOTE(review): `temp_filename` is interpolated into a shell command inside
// double quotes without escaping. Callers must only pass server-generated
// names, never client-controlled text.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp, bool stereo) {
    const std::string converted_filename_temp = temp_filename + "_temp.wav";

    std::ostringstream cmd_stream;
    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac " << (stereo ? 2 : 1)
               << " -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
    const std::string cmd = cmd_stream.str();

    if (std::system(cmd.c_str()) != 0) {
        // ffmpeg may have left a partial output file behind.
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
        return false;
    }

    // Remove the original first: rename() does not replace an existing
    // destination on every platform.
    if (std::remove(temp_filename.c_str()) != 0) {
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
        return false;
    }

    if (std::rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
        return false;
    }
    return true;
}
| 102 | + |
| 103 | +void setup_signal_handler(std::function<void()> shutdown_callback) { |
| 104 | + g_shutdown_callback = std::move(shutdown_callback); |
| 105 | + |
| 106 | +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) |
| 107 | + struct sigaction sigint_action; |
| 108 | + sigint_action.sa_handler = signal_handler; |
| 109 | + sigemptyset(&sigint_action.sa_mask); |
| 110 | + sigint_action.sa_flags = 0; |
| 111 | + sigaction(SIGINT, &sigint_action, NULL); |
| 112 | + sigaction(SIGTERM, &sigint_action, NULL); |
| 113 | +#elif defined (_WIN32) |
| 114 | + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { |
| 115 | + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; |
| 116 | + }; |
| 117 | + SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); |
| 118 | +#endif |
| 119 | +} |
| 120 | + |
| 121 | +static std::string ms_to_timestamp(int64_t t_ms, bool comma = false) { |
| 122 | + // to_timestamp expects centiseconds, our adapter uses milliseconds |
| 123 | + return ::to_timestamp(t_ms / 10, comma); |
| 124 | +} |
| 125 | + |
| 126 | + |
| 127 | +std::string format_text(const transcription_result & result) { |
| 128 | + std::stringstream ss; |
| 129 | + const int n_segments = result.n_segments(); |
| 130 | + for (int i = 0; i < n_segments; ++i) { |
| 131 | + auto seg = result.get_segment(i); |
| 132 | + auto speaker = result.get_speaker ? result.get_speaker(i) : std::string(); |
| 133 | + ss << speaker << seg.text << "\n"; |
| 134 | + } |
| 135 | + return ss.str(); |
| 136 | +} |
| 137 | + |
| 138 | +std::string format_srt(const transcription_result & result, int offset_n) { |
| 139 | + std::stringstream ss; |
| 140 | + const int n_segments = result.n_segments(); |
| 141 | + for (int i = 0; i < n_segments; ++i) { |
| 142 | + auto seg = result.get_segment(i); |
| 143 | + auto speaker = result.get_speaker ? result.get_speaker(i) : std::string(); |
| 144 | + |
| 145 | + ss << i + 1 + offset_n << "\n"; |
| 146 | + ss << ms_to_timestamp(seg.t0, true) << " --> " << ms_to_timestamp(seg.t1, true) << "\n"; |
| 147 | + ss << speaker << seg.text << "\n\n"; |
| 148 | + } |
| 149 | + return ss.str(); |
| 150 | +} |
| 151 | + |
| 152 | +std::string format_vtt(const transcription_result & result) { |
| 153 | + std::stringstream ss; |
| 154 | + ss << "WEBVTT\n\n"; |
| 155 | + |
| 156 | + const int n_segments = result.n_segments(); |
| 157 | + for (int i = 0; i < n_segments; ++i) { |
| 158 | + auto seg = result.get_segment(i); |
| 159 | + std::string speaker_tag; |
| 160 | + |
| 161 | + if (result.get_speaker) { |
| 162 | + auto speaker_id = result.get_speaker(i); |
| 163 | + if (!speaker_id.empty()) { |
| 164 | + speaker_tag = "<v Speaker" + speaker_id + ">"; |
| 165 | + } |
| 166 | + } |
| 167 | + |
| 168 | + ss << ms_to_timestamp(seg.t0) << " --> " << ms_to_timestamp(seg.t1) << "\n"; |
| 169 | + ss << speaker_tag << seg.text << "\n\n"; |
| 170 | + } |
| 171 | + return ss.str(); |
| 172 | +} |
| 173 | + |
| 174 | +std::string format_json(const transcription_result & result) { |
| 175 | + std::string text = format_text(result); |
| 176 | + json jres = json{{"text", text}}; |
| 177 | + return jres.dump(-1, ' ', false, json::error_handler_t::replace); |
| 178 | +} |
| 179 | + |
| 180 | +std::string format_verbose_json( |
| 181 | + const transcription_result & result, |
| 182 | + float temperature, |
| 183 | + float duration, |
| 184 | + bool no_timestamps, |
| 185 | + bool token_timestamps) { |
| 186 | + std::string text = format_text(result); |
| 187 | + std::string task = result.get_task ? result.get_task() : std::string("transcribe"); |
| 188 | + std::string language = result.get_language ? result.get_language() : std::string(); |
| 189 | + |
| 190 | + json jres = json{ |
| 191 | + {"task", task}, |
| 192 | + {"language", language}, |
| 193 | + {"duration", duration}, |
| 194 | + {"text", text}, |
| 195 | + {"segments", json::array()} |
| 196 | + }; |
| 197 | + |
| 198 | + // Merge language probability data into the top-level response. |
| 199 | + // Adapters return a json object whose keys are merged directly, allowing |
| 200 | + // model-specific fields (e.g. whisper's detected_language) to appear at |
| 201 | + // the top level alongside the standard language_probabilities map. |
| 202 | + if (result.get_language_probabilities) { |
| 203 | + json lang_data = result.get_language_probabilities(); |
| 204 | + for (auto & [key, val] : lang_data.items()) { |
| 205 | + jres[key] = val; |
| 206 | + } |
| 207 | + } |
| 208 | + |
| 209 | + const int n_segments = result.n_segments(); |
| 210 | + for (int i = 0; i < n_segments; ++i) { |
| 211 | + auto seg = result.get_segment(i); |
| 212 | + |
| 213 | + json segment = json{ |
| 214 | + {"id", i}, |
| 215 | + {"text", seg.text}, |
| 216 | + }; |
| 217 | + |
| 218 | + if (!no_timestamps) { |
| 219 | + segment["start"] = seg.t0 * 0.001f; // ms -> seconds |
| 220 | + segment["end"] = seg.t1 * 0.001f; |
| 221 | + } |
| 222 | + |
| 223 | + if (result.get_speaker) { |
| 224 | + auto speaker_id = result.get_speaker(i); |
| 225 | + if (!speaker_id.empty()) { |
| 226 | + segment["speaker"] = speaker_id; |
| 227 | + } |
| 228 | + } |
| 229 | + |
| 230 | + // Build word-level tokens by merging partial UTF-8 tokens |
| 231 | + std::vector<json> words; |
| 232 | + int n_tokens = (int)seg.tokens.size(); |
| 233 | + float total_logprob = 0.0f; |
| 234 | + |
| 235 | + for (int j = 0; j < n_tokens; ++j) { |
| 236 | + auto & tok = seg.tokens[j]; |
| 237 | + |
| 238 | + // Merge trailing partial UTF-8 bytes into complete words |
| 239 | + std::string word_text = tok.text; |
| 240 | + int64_t word_t1 = tok.t1; |
| 241 | + |
| 242 | + while (j + 1 < n_tokens) { |
| 243 | + int trailing = utf8_trailing_bytes_needed(word_text); |
| 244 | + if (trailing <= 0) break; |
| 245 | + |
| 246 | + ++j; |
| 247 | + auto & next_tok = seg.tokens[j]; |
| 248 | + word_text += next_tok.text; |
| 249 | + if (next_tok.t1 > word_t1) { |
| 250 | + word_t1 = next_tok.t1; |
| 251 | + } |
| 252 | + } |
| 253 | + |
| 254 | + json word = json{{"word", word_text}}; |
| 255 | + if (!no_timestamps && token_timestamps) { |
| 256 | + word["start"] = tok.t0 * 0.001f; |
| 257 | + word["end"] = word_t1 * 0.001f; |
| 258 | + } |
| 259 | + word["probability"] = tok.prob; |
| 260 | + |
| 261 | + // Approximate logprob from probability |
| 262 | + float logprob = tok.prob > 0.0f ? std::log(tok.prob + 1e-10f) : -1e10f; |
| 263 | + total_logprob += logprob; |
| 264 | + |
| 265 | + words.push_back(word); |
| 266 | + } |
| 267 | + |
| 268 | + segment["words"] = words; |
| 269 | + segment["tokens"] = json::array(); |
| 270 | + for (auto & tok : seg.tokens) { |
| 271 | + segment["tokens"].push_back(tok.id); |
| 272 | + } |
| 273 | + |
| 274 | + segment["temperature"] = temperature; |
| 275 | + int n_word_tokens = (int)seg.tokens.size(); |
| 276 | + segment["avg_logprob"] = n_word_tokens > 0 ? total_logprob / n_word_tokens : 0.0f; |
| 277 | + segment["no_speech_prob"] = seg.no_speech_prob; |
| 278 | + |
| 279 | + jres["segments"].push_back(segment); |
| 280 | + } |
| 281 | + |
| 282 | + return jres.dump(-1, ' ', false, json::error_handler_t::replace); |
| 283 | +} |
| 284 | + |
| 285 | +void setup_server_common( |
| 286 | + httplib::Server & svr, |
| 287 | + const server_params & sparams, |
| 288 | + std::atomic<server_state> & state, |
| 289 | + std::function<void(const httplib::Request &, httplib::Response &)> load_handler, |
| 290 | + std::function<void(const httplib::Request &, httplib::Response &)> inference_handler, |
| 291 | + const std::string & default_content, |
| 292 | + const std::string & server_name) { |
| 293 | + |
| 294 | + svr.set_default_headers({ |
| 295 | + {"Server", server_name}, |
| 296 | + {"Access-Control-Allow-Origin", "*"}, |
| 297 | + {"Access-Control-Allow-Headers", "content-type, authorization"} |
| 298 | + }); |
| 299 | + |
| 300 | + // Default index page |
| 301 | + svr.Get(sparams.request_path + "/", [&](const httplib::Request &, httplib::Response & res) { |
| 302 | + res.set_content(default_content, "text/html"); |
| 303 | + return false; |
| 304 | + }); |
| 305 | + |
| 306 | + // CORS preflight |
| 307 | + svr.Options(sparams.request_path + sparams.inference_path, |
| 308 | + [&](const httplib::Request &, httplib::Response &) {}); |
| 309 | + |
| 310 | + // Inference endpoint |
| 311 | + svr.Post(sparams.request_path + sparams.inference_path, inference_handler); |
| 312 | + |
| 313 | + // Model reload endpoint |
| 314 | + if (load_handler) { |
| 315 | + svr.Post(sparams.request_path + "/load", load_handler); |
| 316 | + } |
| 317 | + |
| 318 | + // Health check |
| 319 | + svr.Get(sparams.request_path + "/health", [&](const httplib::Request &, httplib::Response & res) { |
| 320 | + server_state current_state = state.load(); |
| 321 | + if (current_state == SERVER_STATE_READY) { |
| 322 | + res.set_content("{\"status\":\"ok\"}", "application/json"); |
| 323 | + } else { |
| 324 | + res.set_content("{\"status\":\"loading model\"}", "application/json"); |
| 325 | + res.status = 503; |
| 326 | + } |
| 327 | + }); |
| 328 | + |
| 329 | + // Exception handler |
| 330 | + svr.set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { |
| 331 | + const char fmt[] = "500 Internal Server Error\n%s"; |
| 332 | + char buf[BUFSIZ]; |
| 333 | + try { |
| 334 | + std::rethrow_exception(std::move(ep)); |
| 335 | + } catch (std::exception & e) { |
| 336 | + snprintf(buf, sizeof(buf), fmt, e.what()); |
| 337 | + } catch (...) { |
| 338 | + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); |
| 339 | + } |
| 340 | + res.set_content(buf, "text/plain"); |
| 341 | + res.status = 500; |
| 342 | + }); |
| 343 | + |
| 344 | + // Error handler |
| 345 | + svr.set_error_handler([](const httplib::Request & req, httplib::Response & res) { |
| 346 | + if (res.status == 400) { |
| 347 | + res.set_content("Invalid request", "text/plain"); |
| 348 | + } else if (res.status != 500) { |
| 349 | + res.set_content("File Not Found (" + req.path + ")", "text/plain"); |
| 350 | + res.status = 404; |
| 351 | + } |
| 352 | + }); |
| 353 | + |
| 354 | + svr.set_read_timeout(sparams.read_timeout); |
| 355 | + svr.set_write_timeout(sparams.write_timeout); |
| 356 | +} |
0 commit comments