Commit f18db3e

cli: add option to connect to server via http(s)

1 parent c3c1505

6 files changed, 774 additions & 107 deletions

common/arg.cpp

Lines changed: 9 additions & 2 deletions
```diff
@@ -600,9 +600,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
     }
 
-    // model is required (except for server)
+    // model is required (except for server, or when using --endpoint in CLI)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion && params.endpoint.empty()) {
         throw std::invalid_argument("error: --model is required\n");
     }
 
@@ -1398,6 +1398,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.show_timings = value;
         }
     ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
+    add_opt(common_arg(
+        {"--endpoint"}, "URL",
+        string_format("connect to a running llama-server at URL instead of loading a model locally (e.g. http://localhost:8080)"),
+        [](common_params & params, const std::string & value) {
+            params.endpoint = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_ENDPOINT"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
```

common/common.h

Lines changed: 4 additions & 0 deletions
```diff
@@ -555,6 +555,10 @@ struct common_params {
 
     bool single_turn = false; // single turn chat conversation
 
+    // remote server endpoint for CLI (e.g. "http://localhost:8080")
+    // when set, CLI connects to a running server instead of loading a model
+    std::string endpoint = ""; // NOLINT
+
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
```
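
The field defaults to empty, so local model loading remains the default path. A minimal sketch of how the CLI entry point might branch on it, assuming a `run_remote_cli` helper exported by the new cli-remote.cpp (the helper name and dispatch shape are assumptions, not the commit's actual code):

```cpp
#include "common.h"

// assumed to live in cli-remote.cpp; the real entry point may differ
int run_remote_cli(const common_params & params);

static int dispatch(common_params & params) {
    if (!params.endpoint.empty()) {
        // --endpoint given: skip local model loading entirely and
        // talk to the running llama-server over HTTP(S) instead
        return run_remote_cli(params);
    }
    // ... otherwise the existing local path: load params.model and run ...
    return 0;
}
```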

tools/cli/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,6 +1,6 @@
 set(TARGET llama-cli)
-add_executable(${TARGET} cli.cpp)
-target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT})
+add_executable(${TARGET} cli.cpp cli-remote.cpp)
+target_link_libraries(${TARGET} PRIVATE server-context cpp-httplib PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 include_directories(../server)
```
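
The new `cpp-httplib` link dependency suggests the remote path speaks plain HTTP(S) to the server. A rough sketch of what cli-remote.cpp could look like under that assumption; `/health` and `/completion` are llama-server's existing routes, but everything else here is illustrative, not the commit's implementation:

```cpp
// cli-remote.cpp -- illustrative sketch only, assuming cpp-httplib
#include "common.h"
#include "httplib.h"

#include <cstdio>
#include <string>

int run_remote_cli(const common_params & params) {
    // cpp-httplib's client accepts a scheme-qualified base URL, which
    // covers both http:// and https:// (the latter needs a TLS-enabled build)
    httplib::Client cli(params.endpoint);

    // confirm the server is reachable before sending any work
    if (auto res = cli.Get("/health"); !res || res->status != 200) {
        fprintf(stderr, "error: cannot reach server at %s\n", params.endpoint.c_str());
        return 1;
    }

    // send the prompt to the server's /completion endpoint; a real
    // implementation would build this with a JSON library rather than
    // string concatenation (no escaping is done here)
    const std::string body = "{\"prompt\": \"" + params.prompt + "\", \"n_predict\": 128}";
    auto res = cli.Post("/completion", body, "application/json");
    if (!res || res->status != 200) {
        fprintf(stderr, "error: completion request failed\n");
        return 1;
    }
    printf("%s\n", res->body.c_str());
    return 0;
}
```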
