llm-backend/ollama_client.cpp at main · HandleTlsData/llm-backend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#include "ollama_client.hpp"
#include "rag.hpp"
#include "commands.hpp"

// Ollama REST endpoints used by this client. Every *_URI is appended to one of the
// backend base URLs and handed to makeHttpRequest together with its *_METHOD verb.

// Model listing; used by isConnected() and processListModels().
#define LIST_MODELS_URI "/api/tags"
#define LIST_MODELS_METHOD "GET"

// Single-prompt completion; used by processSingleMessage().
#define SIMPLE_COMPLETION_URI "/api/generate"
#define SIMPLE_COMPLETION_METHOD "POST"

// Multi-message chat completion; used by the processChatMessage* family and
// processMessageWithCommandHandler().
#define CHAT_COMPLETION_URI "/api/chat"
#define CHAT_COMPLETION_METHOD "POST"

// Embedding generation; used by processNewEmbedding().
#define GENERATE_EMBED_URI "/api/embeddings"
#define GENERATE_EMBED_METHOD "POST"

// Model loading; same path as SIMPLE_COMPLETION_URI, used by loadModel() with a
// request body that carries only the model name.
#define LOAD_MODEL_URI "/api/generate"
#define LOAD_MODEL_METHOD "POST"

// Creates a client bound to three backend base URLs:
//   backendUrl      - used for model listing/loading and the text completion endpoints
//   imageBackendURL - used by processChatMessageWithImage()
//   embedBackendURL - used by processChatMessageWithEmbed() and processNewEmbedding()
// The default text model is "llama3.1:latest".
ollama_client::ollama_client(const std::string &backendUrl, const std::string &imageBackendURL, const std::string &embedBackendURL)
{
    // NOTE(review): accessed() is defined outside this file; presumably it records
    // the last-use time of the client - confirm against its declaration.
    accessed();

    backURL = backendUrl;
    model = "llama3.1:latest";

    backURL_img = imageBackendURL;
    backURL_emb = embedBackendURL;
}

// Nothing to release explicitly; members clean up after themselves.
ollama_client::~ollama_client() = default;

// Probes the main backend by requesting its model list.
// Returns true when the reply body is longer than one character.
bool ollama_client::isConnected()
{
    std::string probeURL = backURL;
    probeURL += LIST_MODELS_URI;

    // The content of the reply is irrelevant here; only its size is inspected.
    const std::string reply = makeHttpRequest(probeURL, {}, LIST_MODELS_METHOD);
    return reply.size() > 1;
}

bool ollama_client::loadModel()
{
    json requestBody = {
        {"model", this->model}
    };

    std::string targetURL = backURL + LOAD_MODEL_URI;
    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, LOAD_MODEL_METHOD);
    json responseBody = json::parse(responseEncoded);
    bool result = responseBody["done"];
    return result;
}

void ollama_client::pingModel()
{
    //this->processSingleMessage("Ping!");
    this->loadModel();
}

std::string ollama_client::processSingleMessage(const std::string& requestMsg)
{
    std::string responseText = {};

    this->lastRequestMessage = requestMsg;
    json requestBody = {
        {"model", this->model},
        {"prompt", this->lastRequestMessage},
        {"stream", this->supportStreaming},
        {"keep_alive", "20m"}
    };

    std::string targetURL = backURL + SIMPLE_COMPLETION_URI;

    //no streaming logic, full response returned in a single blocking request
    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, SIMPLE_COMPLETION_METHOD);

    json responseBody = json::parse(responseEncoded);
    responseText = responseBody["response"];
    this->lastResponseMessage = responseText;
    return responseText;
}

std::string ollama_client::processChatMessage(const std::string& requestMsg, const std::vector<std::pair<std::string, bool>>& chatHistory)
{
    std::string responseText = {};

    this->lastRequestMessage = requestMsg;
    json requestBody = {
        {"model", this->model},
        {"stream", this->supportStreaming},
        {"keep_alive", "20m"}
    };

    requestBody["messages"] = nlohmann::json::array();

    for (const auto& [content, role] : chatHistory) {
        nlohmann::json message;
        message["content"] = content;
        message["role"] = role ? "user" : "assistant";
        requestBody["messages"].push_back(message);
    }

    //process latest one
    {
        nlohmann::json message;
        message["content"] = requestMsg;
        message["role"] = "user";
        requestBody["messages"].push_back(message);
    }

    std::string targetURL = backURL + CHAT_COMPLETION_URI;

    //no streaming logic, full response returned in a single blocking request
    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, CHAT_COMPLETION_METHOD);
    LOG("received response: {}", responseEncoded);

    json responseBody = json::parse(responseEncoded);
    responseText = responseBody["message"]["content"];
    this->lastResponseMessage = responseText;
    return responseText;
}

std::string ollama_client::processChatMessageWithImage(const std::string &requestMsg, const std::string &imageB64)
{
    std::string responseText = {};

    this->lastRequestMessage = requestMsg;
    json requestBody = {
        {"model", "llava:latest"},
        {"stream", this->supportStreaming},
        {"keep_alive", "20m"}
    };

    requestBody["messages"] = nlohmann::json::array();

    //process latest one
    {
        nlohmann::json message;
        message["content"] = requestMsg;
        message["role"] = "user";
        message["images"].push_back(imageB64);
        requestBody["messages"].push_back(message);
    }

    std::string targetURL = this->backURL_img + CHAT_COMPLETION_URI;

    //no streaming logic, full response returned in a single blocking request
    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, CHAT_COMPLETION_METHOD);

    json responseBody = json::parse(responseEncoded);
    responseText = responseBody["message"]["content"];
    this->lastResponseMessage = responseText;
    return responseText;
}

std::string ollama_client::processChatMessageWithEmbed(const std::string &requestMsg, const std::string &embedData, const std::vector<std::pair<std::string, bool>> &chatHistory)
{
    std::string responseText = {};

    std::string prompt = "Using this data: ";
    prompt += embedData + ". Respond to this prompt: " + requestMsg;
    LOG("prompt: {}", prompt.c_str());


    this->lastRequestMessage = requestMsg;
    json requestBody = {
        {"model", this->model},
        {"stream", this->supportStreaming},
        {"keep_alive", "20m"}
    };

    requestBody["messages"] = nlohmann::json::array();

    {
        nlohmann::json message;
        message["content"] = rag::getSystemMessage();
        message["role"] = "system";
        requestBody["messages"].push_back(message);
    }

    for (const auto& [content, role] : chatHistory) {
        nlohmann::json message;
        message["content"] = content;
        message["role"] = role ? "user" : "assistant";
        requestBody["messages"].push_back(message);
    }

    {
        nlohmann::json message;
        message["content"] = prompt;
        message["role"] = "user";
        requestBody["messages"].push_back(message);
    }

    std::string targetURL = this->backURL_emb + CHAT_COMPLETION_URI;

    //no streaming logic, full response returned in a single blocking request
    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, CHAT_COMPLETION_METHOD);

    json responseBody = json::parse(responseEncoded);
    responseText = responseBody["message"]["content"];
    this->lastResponseMessage = responseText;
    return responseText;
}

std::string ollama_client::processMessageWithCommandHandler(const std::string &requestMsg)
{
    std::string responseText = {};

    this->lastRequestMessage = requestMsg;
    json requestBody = {
        {"model", this->model},
        {"stream", this->supportStreaming},
        {"keep_alive", "20m"}
    };

    requestBody["messages"] = nlohmann::json::array();
    {
        nlohmann::json message;
        std::string systemMsg = "Tools: ";

        for(const auto& x : g_cmd->listCommands())
        {
            systemMsg += "\"[" + x.first + "]extracted argument[/" + x.first + "]\" " + x.second + ";";
        }

        systemMsg += "Your task is to transform given text into commands if it fits the functionality of the tools. All the tools have at least one incoming argument. "
        "For the text given to you, you need to extract the argument and form a command. Some given texts cannot be transformed into commands - respond with \"Undefined\" on them instead. "
        "Transform text into commands only on direct orders to do something in given text, otherwise respond with \"Undefined\". "
        "Always respond with \"Undefined\" to given text that don't require the use of the tools and if you haven't found the right tool "
        "or if you are not completely sure that the tools are suitable for given text. "
        "For example, if you don't have a tool for measuring the distance between cities, don't try to solve it using similar tools and just respond with \"Undefined\".";

        message["content"] = systemMsg;
        message["role"] = "system";
        requestBody["messages"].push_back(message);
    }

    //give it some examples
    std::vector<std::pair<std::string, std::string>> toolsUsageExamples =
    {
        {"Given text: whats the weather in berlin?", "[GETWEATHER]Berlin[/GETWEATHER]"},
        {"Given text: Generate me a picture of good weather in berlin", "[GENIMG]good weather in berlin[/GENIMG]"},
        {"Given text: What's the weather like in Miami today?", "[GETWEATHER]Miami[/GETWEATHER]"},
        {"Given text: generate me image of los angeles", "[GENIMG]los angeles[/GENIMG]"},
        {"Given text: generate me image of cute dog", "[GENIMG]cute dog[/GENIMG]"},
        {"Given text: generate a picture of a powerful car", "[GENIMG]powerful car[/GENIMG]"},
        {"Given text: generate me a poem", "Undefined"},
        {"Given text: generate me a story about two pets", "Undefined"},
        {"Given text: write me a 2000 word story", "Undefined"},
        {"Given text: Tell me about the weather in different seasons in the city of Berlin", "Undefined"},
        {"Given text: Generate me hello world in C++", "Undefined"},
        {"Given text: Write me md5 hashing function in python", "Undefined"},
        {"Given text: can we do encryption with javascript?", "Undefined"},
        {"Given text: Do you think 6*6 can be solved with get weather tool?", "Undefined"},
        {"Given text: Can this be solved with python?", "Undefined"},
        {"Given text: image los angeles", "Undefined"}
    };

    for(const auto& example : toolsUsageExamples)
    {
        {
            nlohmann::json message;
            message["content"] = example.first;
            message["role"] = "user";
            requestBody["messages"].push_back(message);
        }

        {
            nlohmann::json message;
            message["content"] = example.second;
            message["role"] = "assistant";
            requestBody["messages"].push_back(message);
        }
    }

    {
        nlohmann::json message;
        message["content"] = "Given text: " + requestMsg;
        message["role"] = "user";
        requestBody["messages"].push_back(message);
    }

    std::string targetURL = backURL + CHAT_COMPLETION_URI;

    //no streaming logic, full response returned in a single blocking request
    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, CHAT_COMPLETION_METHOD);
    LOG("received response: {}", responseEncoded);

    json responseBody = json::parse(responseEncoded);
    responseText = responseBody["message"]["content"];
    this->lastResponseMessage = responseText;
    return responseText;
}

std::vector<float> ollama_client::processNewEmbedding(const std::string& embedding)
{
    std::string responseText = {};

    json requestBody = {
        {"model", "mxbai-embed-large:latest"},
        {"prompt", embedding}
    };

    std::string targetURL = this->backURL_emb + GENERATE_EMBED_URI;

    std::string responseEncoded = makeHttpRequest(targetURL, requestBody, GENERATE_EMBED_METHOD);

    json responseBody = json::parse(responseEncoded);
    // printf("\nresponse: %.50s\n", responseBody.dump(2).c_str());
    std::vector<float> newEmbs = responseBody["embedding"];
    return newEmbs;
}

std::string ollama_client::processListModels()
{
    std::string targetURL = backURL + LIST_MODELS_URI;
    std::string responseEncoded = makeHttpRequest(targetURL, {}, LIST_MODELS_METHOD);
    json responseBody = json::parse(responseEncoded);
    std::vector<nlohmann::json> models = responseBody["models"];
    std::vector<std::string> modelNames = {};
    for (const auto& model : models)
    {
        modelNames.push_back(model["name"]);
    }
    json output;
    output["models"] = modelNames;
    return output.dump();
}