Skip to content

Commit c8d0f93

Browse files
server : save requests and responses to JSON
1 parent 2233737 commit c8d0f93

8 files changed

Lines changed: 369 additions & 19 deletions

File tree

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2993,6 +2993,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
29932993
params.endpoint_slots = value;
29942994
}
29952995
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2996+
add_opt(common_arg(
2997+
{"--request-save-path"}, "PATH",
2998+
"path to save per-request prompt/response JSON files (default: disabled)",
2999+
[](common_params & params, const std::string & value) {
3000+
params.request_save_path = value;
3001+
}
3002+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REQUEST_SAVE_PATH"));
29963003
add_opt(common_arg(
29973004
{"--slot-save-path"}, "PATH",
29983005
"path to save slot kv cache (default: disabled)",

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ struct common_params {
625625

626626
bool log_json = false;
627627

628+
std::string request_save_path;
628629
std::string slot_save_path;
629630
std::string media_path; // path to directory for loading media files
630631

tools/server/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ For the full list of features, please refer to [server's changelog](https://gith
210210
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
211211
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
212212
| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
213+
| `--request-save-path PATH` | path to save per-request prompt/response JSON files (default: disabled)<br/>(env: LLAMA_ARG_REQUEST_SAVE_PATH) |
213214
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
214215
| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
215216
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
@@ -265,6 +266,12 @@ For boolean options like `--mmap` or `--kv-offload`, the environment variable is
265266
- `LLAMA_ARG_MMAP=false` means disabled, other accepted values are: `0`, `off`, `disabled`
266267
- If `LLAMA_ARG_NO_MMAP` is present (no matter the value), it means disabling mmap
267268

269+
To save one JSON file per generation request in a relative `output` directory:
270+
271+
```bash
272+
./llama-server -m model.gguf --request-save-path output
273+
```
274+
268275
Example usage of docker compose with environment variables:
269276

270277
```yml

tools/server/server-common.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#include <random>
1313
#include <sstream>
1414
#include <fstream>
15+
#include <chrono>
16+
#include <ctime>
17+
#include <filesystem>
18+
#include <iomanip>
1519

1620
json format_error_response(const std::string & message, const enum error_type type) {
1721
std::string type_str;
@@ -57,6 +61,82 @@ json format_error_response(const std::string & message, const enum error_type ty
5761
};
5862
}
5963

64+
// Convert a time_t to UTC calendar time (std::tm) using the platform's
// thread-safe gmtime variant (gmtime_s on Windows, gmtime_r elsewhere).
static std::tm server_gmtime(std::time_t time) {
    std::tm result{};
#ifdef _WIN32
    gmtime_s(&result, &time);
#else
    gmtime_r(&time, &result);
#endif
    return result;
}
73+
74+
std::string server_timestamp_utc_iso8601() {
75+
const auto now = std::chrono::system_clock::now();
76+
const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()) % 1000;
77+
const std::time_t now_time = std::chrono::system_clock::to_time_t(now);
78+
const std::tm now_tm = server_gmtime(now_time);
79+
80+
std::ostringstream oss;
81+
oss << std::put_time(&now_tm, "%Y-%m-%dT%H:%M:%S")
82+
<< '.'
83+
<< std::setw(3) << std::setfill('0') << ms.count()
84+
<< 'Z';
85+
return oss.str();
86+
}
87+
88+
std::string server_timestamp_utc_filename() {
89+
std::string timestamp = server_timestamp_utc_iso8601();
90+
for (char & ch : timestamp) {
91+
if (ch == ':') {
92+
ch = '-';
93+
}
94+
}
95+
return timestamp;
96+
}
97+
98+
bool server_save_json_artifact(const std::string & dir, const json & data, const std::string & filename_base, std::string * filepath) {
99+
if (dir.empty()) {
100+
return false;
101+
}
102+
103+
if (!fs_create_directory_with_parents(dir)) {
104+
return false;
105+
}
106+
107+
const std::filesystem::path base_dir(dir);
108+
109+
for (int suffix = 0; suffix < 10000; ++suffix) {
110+
std::string filename = filename_base;
111+
if (suffix > 0) {
112+
filename += "-" + std::to_string(suffix);
113+
}
114+
filename += ".json";
115+
116+
const std::filesystem::path output_path = base_dir / filename;
117+
if (std::filesystem::exists(output_path)) {
118+
continue;
119+
}
120+
121+
std::ofstream file(output_path, std::ios::binary | std::ios::out | std::ios::trunc);
122+
if (!file.is_open()) {
123+
return false;
124+
}
125+
126+
file << data.dump(2) << '\n';
127+
if (!file.good()) {
128+
return false;
129+
}
130+
131+
if (filepath != nullptr) {
132+
*filepath = output_path.string();
133+
}
134+
return true;
135+
}
136+
137+
return false;
138+
}
139+
60140
//
61141
// random string / id
62142
//

tools/server/server-common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ struct server_grammar_trigger {
8383
};
8484

8585
json format_error_response(const std::string & message, const enum error_type type);
86+
std::string server_timestamp_utc_iso8601();
87+
std::string server_timestamp_utc_filename();
88+
bool server_save_json_artifact(const std::string & dir, const json & data, const std::string & filename_base, std::string * filepath = nullptr);
8689

8790
//
8891
// random string / id

tools/server/server-context.cpp

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3037,6 +3037,84 @@ void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
30373037
impl->queue_tasks.on_sleeping_state(std::move(callback));
30383038
}
30393039

3040+
// Collapse per-task completion results into a single response payload.
// A lone result is returned as a bare object (not an array). For the OAI
// chat / completions formats, multiple results are merged by folding every
// "choices" entry into the first result. Any other format keeps the array.
static json completion_results_to_response_json(json arr, task_response_type res_type) {
    GGML_ASSERT(!arr.empty() && "empty completion results");

    const bool merge_oai = res_type == TASK_RESPONSE_TYPE_OAI_CHAT
                        || res_type == TASK_RESPONSE_TYPE_OAI_CMPL;

    if (arr.size() == 1) {
        return arr[0];
    }

    if (merge_oai) {
        json merged    = std::move(arr[0]);
        json & choices = merged["choices"];
        for (size_t idx = 1; idx < arr.size(); ++idx) {
            choices.push_back(std::move(arr[idx]["choices"][0]));
        }
        return merged;
    }

    return arr;
}
3057+
3058+
// Render a final completion result as the non-streaming JSON body that
// matches the requested response flavor.
static json completion_final_result_to_non_stream_json(server_task_result_cmpl_final & result) {
    const task_response_type type = result.res_type;

    if (type == TASK_RESPONSE_TYPE_NONE) {
        return result.to_json_non_oaicompat();
    }
    if (type == TASK_RESPONSE_TYPE_OAI_CMPL) {
        return result.to_json_oaicompat();
    }
    if (type == TASK_RESPONSE_TYPE_OAI_CHAT) {
        return result.to_json_oaicompat_chat();
    }
    if (type == TASK_RESPONSE_TYPE_OAI_RESP) {
        return result.to_json_oaicompat_resp();
    }
    if (type == TASK_RESPONSE_TYPE_ANTHROPIC) {
        return result.to_json_anthropic();
    }

    throw std::logic_error("Invalid task_response_type");
}
3074+
3075+
static void maybe_save_request_artifact(
3076+
const common_params & params,
3077+
const server_http_req & req,
3078+
bool stream,
3079+
const json & request_data,
3080+
const json & prompt_data,
3081+
int status,
3082+
const json & response_data) {
3083+
if (params.request_save_path.empty()) {
3084+
return;
3085+
}
3086+
3087+
const std::string filename_base = server_timestamp_utc_filename();
3088+
std::string timestamp = filename_base;
3089+
int hyphen_count = 0;
3090+
for (char & ch : timestamp) {
3091+
if (ch == '-') {
3092+
++hyphen_count;
3093+
if (hyphen_count > 2) {
3094+
ch = ':';
3095+
}
3096+
}
3097+
}
3098+
3099+
json artifact = {
3100+
{"timestamp", timestamp},
3101+
{"endpoint", req.path},
3102+
{"stream", stream},
3103+
{"status", status},
3104+
{"request", request_data},
3105+
{"prompt", prompt_data},
3106+
{"response", response_data},
3107+
};
3108+
3109+
std::string filepath;
3110+
if (!server_save_json_artifact(params.request_save_path, artifact, filename_base, &filepath)) {
3111+
SRV_WRN("failed to save request artifact in %s\n", params.request_save_path.c_str());
3112+
return;
3113+
}
3114+
3115+
SRV_DBG("saved request artifact: %s\n", filepath.c_str());
3116+
}
3117+
30403118

30413119
//
30423120
// server_routes
@@ -3053,6 +3131,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
30533131
auto res = create_response();
30543132
auto completion_id = gen_chatcmplid();
30553133
auto & rd = res->rd;
3134+
const json prompt_data = data.contains("prompt") ? data.at("prompt") : json(nullptr);
30563135

30573136
try {
30583137
std::vector<server_task> tasks;
@@ -3105,7 +3184,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31053184

31063185
rd.post_tasks(std::move(tasks));
31073186
} catch (const std::exception & e) {
3108-
res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
3187+
json error_json = format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST);
3188+
json response_json = {{"error", error_json}};
3189+
res->error(error_json);
3190+
maybe_save_request_artifact(params, req, json_value(data, "stream", false), data, prompt_data, res->status, response_json);
31093191
return res;
31103192
}
31113193

@@ -3117,29 +3199,20 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31173199
if (all_results.is_terminated) {
31183200
return res; // connection is closed
31193201
} else if (all_results.error) {
3120-
res->error(all_results.error->to_json());
3202+
json error_json = all_results.error->to_json();
3203+
json response_json = {{"error", error_json}};
3204+
res->error(error_json);
3205+
maybe_save_request_artifact(params, req, stream, data, prompt_data, res->status, response_json);
31213206
return res;
31223207
} else {
31233208
json arr = json::array();
31243209
for (auto & res : all_results.results) {
31253210
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
31263211
arr.push_back(res->to_json());
31273212
}
3128-
GGML_ASSERT(!arr.empty() && "empty results");
3129-
if (arr.size() == 1) {
3130-
// if single request, return single object instead of array
3131-
res->ok(arr[0]);
3132-
} else if (res_type == TASK_RESPONSE_TYPE_OAI_CHAT || res_type == TASK_RESPONSE_TYPE_OAI_CMPL) {
3133-
// if multiple results in OAI format, we need to re-format them
3134-
json & choices = arr[0]["choices"];
3135-
for (size_t i = 1; i < arr.size(); i++) {
3136-
choices.push_back(std::move(arr[i]["choices"][0]));
3137-
}
3138-
res->ok(arr[0]);
3139-
} else {
3140-
// multi-results, non-OAI compat
3141-
res->ok(arr);
3142-
}
3213+
json response_json = completion_results_to_response_json(std::move(arr), res_type);
3214+
res->ok(response_json);
3215+
maybe_save_request_artifact(params, req, stream, data, prompt_data, res->status, response_json);
31433216
}
31443217
} else {
31453218
// in streaming mode, the first error must be treated as non-stream response
@@ -3152,7 +3225,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31523225
}
31533226

31543227
if (first_result->is_error()) {
3155-
res->error(first_result->to_json());
3228+
json error_json = first_result->to_json();
3229+
json response_json = {{"error", error_json}};
3230+
res->error(error_json);
3231+
maybe_save_request_artifact(params, req, stream, data, prompt_data, res->status, response_json);
31563232
return res;
31573233
}
31583234

@@ -3161,6 +3237,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31613237
dynamic_cast<server_task_result_cmpl_final*> (first_result.get()) != nullptr
31623238
);
31633239

3240+
auto final_results = std::make_shared<json>(json::array());
3241+
if (auto * final_result = dynamic_cast<server_task_result_cmpl_final *>(first_result.get())) {
3242+
final_results->push_back(completion_final_result_to_non_stream_json(*final_result));
3243+
}
3244+
auto is_saved = std::make_shared<bool>(false);
3245+
31643246
// next responses are streamed
31653247
// to be sent immediately
31663248
json first_result_json = first_result->to_json();
@@ -3173,7 +3255,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
31733255
}
31743256
res->status = 200;
31753257
res->content_type = "text/event-stream";
3176-
res->next = [res_this = res.get(), res_type, &req](std::string & output) -> bool {
3258+
res->next = [res_this = res.get(), res_type, &req, request_data = data, prompt_data, final_results, is_saved, this](std::string & output) -> bool {
31773259
static auto format_error = [](task_response_type res_type, const json & res_json) {
31783260
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
31793261
return format_anthropic_sse({
@@ -3202,6 +3284,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32023284

32033285
// check if there is more data
32043286
if (!rd.has_next()) {
3287+
if (!*is_saved && !final_results->empty()) {
3288+
json response_json = completion_results_to_response_json(*final_results, res_type);
3289+
maybe_save_request_artifact(params, req, true, request_data, prompt_data, 200, response_json);
3290+
*is_saved = true;
3291+
}
3292+
32053293
switch (res_type) {
32063294
case TASK_RESPONSE_TYPE_NONE:
32073295
case TASK_RESPONSE_TYPE_OAI_RESP:
@@ -3228,6 +3316,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32283316
// send the results
32293317
if (result->is_error()) {
32303318
json res_json = result->to_json();
3319+
if (!*is_saved) {
3320+
maybe_save_request_artifact(params, req, true, request_data, prompt_data, 500, json{{"error", res_json}});
3321+
*is_saved = true;
3322+
}
32313323
output = format_error(res_type, res_json);
32323324
SRV_DBG("%s", "error received during streaming, terminating stream\n");
32333325
return false; // terminate on error
@@ -3236,6 +3328,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32363328
dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
32373329
|| dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
32383330
);
3331+
if (auto * final_result = dynamic_cast<server_task_result_cmpl_final *>(result.get())) {
3332+
final_results->push_back(completion_final_result_to_non_stream_json(*final_result));
3333+
}
32393334
json res_json = result->to_json();
32403335
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
32413336
output = format_anthropic_sse(res_json);
@@ -3251,6 +3346,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
32513346

32523347
} catch (const std::exception & e) {
32533348
json error_json = format_error_response(e.what(), ERROR_TYPE_SERVER);
3349+
if (!*is_saved) {
3350+
maybe_save_request_artifact(params, req, true, request_data, prompt_data, 500, json{{"error", error_json}});
3351+
*is_saved = true;
3352+
}
32543353
output = format_error(res_type, error_json);
32553354

32563355
// terminate on exception

0 commit comments

Comments
 (0)