Skip to content

Commit c8d0f93

Browse files
server : save requests and responses to JSON
1 parent 2233737 commit c8d0f93

8 files changed

Lines changed: 369 additions & 19 deletions

File tree

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2993,6 +2993,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
29932993
params.endpoint_slots = value;
29942994
}
29952995
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2996+
add_opt(common_arg(
2997+
{"--request-save-path"}, "PATH",
2998+
"path to save per-request prompt/response JSON files (default: disabled)",
2999+
[](common_params & params, const std::string & value) {
3000+
params.request_save_path = value;
3001+
}
3002+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REQUEST_SAVE_PATH"));
29963003
add_opt(common_arg(
29973004
{"--slot-save-path"}, "PATH",
29983005
"path to save slot kv cache (default: disabled)",

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ struct common_params {
625625

626626
bool log_json = false;
627627

628+
std::string request_save_path;
628629
std::string slot_save_path;
629630
std::string media_path; // path to directory for loading media files
630631

tools/server/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ For the full list of features, please refer to [server's changelog](https://gith
210210
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
211211
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
212212
| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
213+
| `--request-save-path PATH` | path to save per-request prompt/response JSON files (default: disabled)<br/>(env: LLAMA_ARG_REQUEST_SAVE_PATH) |
213214
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
214215
| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
215216
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
@@ -265,6 +266,12 @@ For boolean options like `--mmap` or `--kv-offload`, the environment variable is
265266
- `LLAMA_ARG_MMAP=false` means disabled, other accepted values are: `0`, `off`, `disabled`
266267
- If `LLAMA_ARG_NO_MMAP` is present (no matter the value), it means disabling mmap
267268

269+
To save one JSON file per generation request in a relative `output` directory:
270+
271+
```bash
272+
./llama-server -m model.gguf --request-save-path output
273+
```
274+
268275
Example usage of docker compose with environment variables:
269276

270277
```yml

tools/server/server-common.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#include <random>
1313
#include <sstream>
1414
#include <fstream>
15+
#include <chrono>
16+
#include <ctime>
17+
#include <filesystem>
18+
#include <iomanip>
1519

1620
json format_error_response(const std::string & message, const enum error_type type) {
1721
std::string type_str;
@@ -57,6 +61,82 @@ json format_error_response(const std::string & message, const enum error_type ty
5761
};
5862
}
5963

64+
// Convert a time_t to UTC calendar time (std::tm) using the platform's
// thread-safe gmtime variant (gmtime_s on Windows, gmtime_r elsewhere).
static std::tm server_gmtime(std::time_t time) {
    std::tm result{};
#ifdef _WIN32
    gmtime_s(&result, &time);
#else
    gmtime_r(&time, &result);
#endif
    return result;
}
73+
74+
std::string server_timestamp_utc_iso8601() {
75+
const auto now = std::chrono::system_clock::now();
76+
const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()) % 1000;
77+
const std::time_t now_time = std::chrono::system_clock::to_time_t(now);
78+
const std::tm now_tm = server_gmtime(now_time);
79+
80+
std::ostringstream oss;
81+
oss << std::put_time(&now_tm, "%Y-%m-%dT%H:%M:%S")
82+
<< '.'
83+
<< std::setw(3) << std::setfill('0') << ms.count()
84+
<< 'Z';
85+
return oss.str();
86+
}
87+
88+
std::string server_timestamp_utc_filename() {
89+
std::string timestamp = server_timestamp_utc_iso8601();
90+
for (char & ch : timestamp) {
91+
if (ch == ':') {
92+
ch = '-';
93+
}
94+
}
95+
return timestamp;
96+
}
97+
98+
bool server_save_json_artifact(const std::string & dir, const json & data, const std::string & filename_base, std::string * filepath) {
99+
if (dir.empty()) {
100+
return false;
101+
}
102+
103+
if (!fs_create_directory_with_parents(dir)) {
104+
return false;
105+
}
106+
107+
const std::filesystem::path base_dir(dir);
108+
109+
for (int suffix = 0; suffix < 10000; ++suffix) {
110+
std::string filename = filename_base;
111+
if (suffix > 0) {
112+
filename += "-" + std::to_string(suffix);
113+
}
114+
filename += ".json";
115+
116+
const std::filesystem::path output_path = base_dir / filename;
117+
if (std::filesystem::exists(output_path)) {
118+
continue;
119+
}
120+
121+
std::ofstream file(output_path, std::ios::binary | std::ios::out | std::ios::trunc);
122+
if (!file.is_open()) {
123+
return false;
124+
}
125+
126+
file << data.dump(2) << '\n';
127+
if (!file.good()) {
128+
return false;
129+
}
130+
131+
if (filepath != nullptr) {
132+
*filepath = output_path.string();
133+
}
134+
return true;
135+
}
136+
137+
return false;
138+
}
139+
60140
//
61141
// random string / id
62142
//

tools/server/server-common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ struct server_grammar_trigger {
8383
};
8484

8585
json format_error_response(const std::string & message, const enum error_type type);
86+
std::string server_timestamp_utc_iso8601();
87+
std::string server_timestamp_utc_filename();
88+
bool server_save_json_artifact(const std::string & dir, const json & data, const std::string & filename_base, std::string * filepath = nullptr);
8689

8790
//
8891
// random string / id

tools/server/server-context.cpp

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3037,6 +3037,84 @@ void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
30373037
impl->queue_tasks.on_sleeping_state(std::move(callback));
30383038
}
30393039

3040+
// Collapse per-task completion results into a single response payload.
// A lone result is returned as a bare object (not an array). For the OAI
// chat / completions formats, multiple results are merged by folding every
// "choices" entry into the first result. Any other format keeps the array.
static json completion_results_to_response_json(json arr, task_response_type res_type) {
    GGML_ASSERT(!arr.empty() && "empty completion results");

    const bool merge_oai = res_type == TASK_RESPONSE_TYPE_OAI_CHAT
                        || res_type == TASK_RESPONSE_TYPE_OAI_CMPL;

    if (arr.size() == 1) {
        return arr[0];
    }

    if (merge_oai) {
        json merged    = std::move(arr[0]);
        json & choices = merged["choices"];
        for (size_t idx = 1; idx < arr.size(); ++idx) {
            choices.push_back(std::move(arr[idx]["choices"][0]));
        }
        return merged;
    }

    return arr;
}
3057+
3058+
// Render a final completion result as the non-streaming JSON body that
// matches the requested response flavor.
static json completion_final_result_to_non_stream_json(server_task_result_cmpl_final & result) {
    const task_response_type type = result.res_type;

    if (type == TASK_RESPONSE_TYPE_NONE) {
        return result.to_json_non_oaicompat();
    }
    if (type == TASK_RESPONSE_TYPE_OAI_CMPL) {
        return result.to_json_oaicompat();
    }
    if (type == TASK_RESPONSE_TYPE_OAI_CHAT) {
        return result.to_json_oaicompat_chat();
    }
    if (type == TASK_RESPONSE_TYPE_OAI_RESP) {
        return result.to_json_oaicompat_resp();
    }
    if (type == TASK_RESPONSE_TYPE_ANTHROPIC) {
        return result.to_json_anthropic();
    }

    throw std::logic_error("Invalid task_response_type");
}
3074+
3075+
static void maybe_save_request_artifact(
3076+
const common_params & params,
3077+
const server_http_req & req,
3078+
bool stream,
3079+
const json & request_data,
3080+
const json & prompt_data,
3081+
int status,
3082+
const json & response_data) {
3083+
if (params.request_save_path.empty()) {
3084+
return;
3085+
}
3086+
3087+
const std::string filename_base = server_timestamp_utc_filename();
3088+
std::string timestamp = filename_base;
3089+
int hyphen_count = 0;
3090+
for (char & ch : timestamp) {
3091+
if (ch == '-') {
3092+
++hyphen_count;
3093+
if (hyphen_count > 2) {
3094+
ch = ':';
3095+
}
3096+
}
3097+
}
3098+
3099+
json artifact = {
3100+
{"timestamp", timestamp},
3101+
{"endpoint", req.path},
3102+
{"stream", stream},
3103+
{"status", status},
3104+
{"request", request_data},
3105+
{"prompt", prompt_data},
3106+
{"response", response_data},
3107+
};
3108+
3109+
std::string filepath;
3110+
if (!server_save_json_artifact(params.request_save_path, artifact, filename_base, &filepath)) {
3111+
SRV_WRN("failed to save request artifact in %s\n", params.request_save_path.c_str());
3112+
return;
3113+
}
3114+
3115+
SRV_DBG("saved request artifact: %s\n", filepath.c_str());
3116+
}
3117+
30403118

30413119
//
30423120
// server_routes
@@ -3053,6 +3131,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
30533131
auto res = create_response();
30543132
auto completion_id = gen_chatcmplid();
30553133
auto & rd = res->rd;
3134+
const json prompt_data = data.contains("prompt") ? data.at("prompt") : json(nullptr);
30563135

30573136
try {
30583137
std::vector<server_task> tasks;
@@ -3105,7 +3184,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31053184

31063185
rd.post_tasks(std::move(tasks));
31073186
} catch (const std::exception & e) {
3108-
res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
3187+
json error_json = format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST);
3188+
json response_json = {{"error", error_json}};
3189+
res->error(error_json);
3190+
maybe_save_request_artifact(params, req, json_value(data, "stream", false), data, prompt_data, res->status, response_json);
31093191
return res;
31103192
}
31113193

@@ -3117,29 +3199,20 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31173199
if (all_results.is_terminated) {
31183200
return res; // connection is closed
31193201
} else if (all_results.error) {
3120-
res->error(all_results.error->to_json());
3202+
json error_json = all_results.error->to_json();
3203+
json response_json = {{"error", error_json}};
3204+
res->error(error_json);
3205+
maybe_save_request_artifact(params, req, stream, data, prompt_data, res->status, response_json);
31213206
return res;
31223207
} else {
31233208
json arr = json::array();
31243209
for (auto & res : all_results.results) {
31253210
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
31263211
arr.push_back(res->to_json());
31273212
}
3128-
GGML_ASSERT(!arr.empty() && "empty results");
3129-
if (arr.size() == 1) {
3130-
// if single request, return single object instead of array
3131-
res->ok(arr[0]);
3132-
} else if (res_type == TASK_RESPONSE_TYPE_OAI_CHAT || res_type == TASK_RESPONSE_TYPE_OAI_CMPL) {
3133-
// if multiple results in OAI format, we need to re-format them
3134-
json & choices = arr[0]["choices"];
3135-
for (size_t i = 1; i < arr.size(); i++) {
3136-
choices.push_back(std::move(arr[i]["choices"][0]));
3137-
}
3138-
res->ok(arr[0]);
3139-
} else {
3140-
// multi-results, non-OAI compat
3141-
res->ok(arr);
3142-
}
3213+
json response_json = completion_results_to_response_json(std::move(arr), res_type);
3214+
res->ok(response_json);
3215+
maybe_save_request_artifact(params, req, stream, data, prompt_data, res->status, response_json);
31433216
}
31443217
} else {
31453218
// in streaming mode, the first error must be treated as non-stream response
@@ -3152,7 +3225,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31523225
}
31533226

31543227
if (first_result->is_error()) {
3155-
res->error(first_result->to_json());
3228+
json error_json = first_result->to_json();
3229+
json response_json = {{"error", error_json}};
3230+
res->error(error_json);
3231+
maybe_save_request_artifact(params, req, stream, data, prompt_data, res->status, response_json);
31563232
return res;
31573233
}
31583234

@@ -3161,6 +3237,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31613237
dynamic_cast<server_task_result_cmpl_final*> (first_result.get()) != nullptr
31623238
);
31633239

3240+
auto final_results = std::make_shared<json>(json::array());
3241+
if (auto * final_result = dynamic_cast<server_task_result_cmpl_final *>(first_result.get())) {
3242+
final_results->push_back(completion_final_result_to_non_stream_json(*final_result));
3243+
}
3244+
auto is_saved = std::make_shared<bool>(false);
3245+
31643246
// next responses are streamed
31653247
// to be sent immediately
31663248
json first_result_json = first_result->to_json();
@@ -3173,7 +3255,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31733255
}
31743256
res->status = 200;
31753257
res->content_type = "text/event-stream";
3176-
res->next = [res_this = res.get(), res_type, &req](std::string & output) -> bool {
3258+
res->next = [res_this = res.get(), res_type, &req, request_data = data, prompt_data, final_results, is_saved, this](std::string & output) -> bool {
31773259
static auto format_error = [](task_response_type res_type, const json & res_json) {
31783260
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
31793261
return format_anthropic_sse({
@@ -3202,6 +3284,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32023284

32033285
// check if there is more data
32043286
if (!rd.has_next()) {
3287+
if (!*is_saved && !final_results->empty()) {
3288+
json response_json = completion_results_to_response_json(*final_results, res_type);
3289+
maybe_save_request_artifact(params, req, true, request_data, prompt_data, 200, response_json);
3290+
*is_saved = true;
3291+
}
3292+
32053293
switch (res_type) {
32063294
case TASK_RESPONSE_TYPE_NONE:
32073295
case TASK_RESPONSE_TYPE_OAI_RESP:
@@ -3228,6 +3316,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32283316
// send the results
32293317
if (result->is_error()) {
32303318
json res_json = result->to_json();
3319+
if (!*is_saved) {
3320+
maybe_save_request_artifact(params, req, true, request_data, prompt_data, 500, json{{"error", res_json}});
3321+
*is_saved = true;
3322+
}
32313323
output = format_error(res_type, res_json);
32323324
SRV_DBG("%s", "error received during streaming, terminating stream\n");
32333325
return false; // terminate on error
@@ -3236,6 +3328,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32363328
dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
32373329
|| dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
32383330
);
3331+
if (auto * final_result = dynamic_cast<server_task_result_cmpl_final *>(result.get())) {
3332+
final_results->push_back(completion_final_result_to_non_stream_json(*final_result));
3333+
}
32393334
json res_json = result->to_json();
32403335
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
32413336
output = format_anthropic_sse(res_json);
@@ -3251,6 +3346,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32513346

32523347
} catch (const std::exception & e) {
32533348
json error_json = format_error_response(e.what(), ERROR_TYPE_SERVER);
3349+
if (!*is_saved) {
3350+
maybe_save_request_artifact(params, req, true, request_data, prompt_data, 500, json{{"error", error_json}});
3351+
*is_saved = true;
3352+
}
32543353
output = format_error(res_type, error_json);
32553354

32563355
// terminate on exception

0 commit comments

Comments
 (0)