srossitto79
diff --git a/‎common/arg.cpp‎
Lines changed: 2 additions & 1 deletion b/‎common/arg.cpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎common/common.h‎
Lines changed: 1 addition & 0 deletions b/‎common/common.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎scripts/sync_vendor.py‎
Lines changed: 1 addition & 1 deletion b/‎scripts/sync_vendor.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/llama-context.cpp‎
Lines changed: 14 additions & 0 deletions b/‎src/llama-context.cpp‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎src/llama-ext.h‎
Lines changed: 12 additions & 0 deletions b/‎src/llama-ext.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎tests/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎tests/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎tests/export-graph-ops.cpp‎
Lines changed: 169 additions & 0 deletions b/‎tests/export-graph-ops.cpp‎
Lines changed: 169 additions & 0 deletions
@@ -2671,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
+                    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
 
@@ -106,6 +106,7 @@ enum llama_example {
     LLAMA_EXAMPLE_FINETUNE_QLORA,
     LLAMA_EXAMPLE_FIT_PARAMS,
     LLAMA_EXAMPLE_RESULTS,
+    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
@@ -5,7 +5,7 @@
 import sys
 import subprocess
 
-HTTPLIB_VERSION = "refs/tags/v0.37.0"
+HTTPLIB_VERSION = "refs/tags/v0.37.1"
 
 vendor = {
     "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
 
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-mmap.h"
 #include "llama-model.h"
+#include "llama-ext.h"
 
 #include <cinttypes>
 #include <cmath>
@@ -3240,6 +3241,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
     return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
 }
 
+// Reserves a compute graph on `ctx` for the given batch shape.
+//
+// When the context has a memory object, a memory context is created from it with
+// init_full() and handed to graph_reserve(); otherwise the default-constructed
+// (presumably null) memory context pointer is passed.
+// `ctx` is dereferenced without a null check, so it must be valid.
+struct ggml_cgraph * llama_graph_reserve(
+        struct llama_context * ctx,
+        uint32_t n_tokens,
+        uint32_t n_seqs,
+        uint32_t n_outputs) {
+    llama_memory_context_ptr mctx;
+
+    // only build a memory context if this llama_context actually has memory
+    if (auto * memory = ctx->get_memory()) {
+        mctx = memory->init_full();
+    }
+
+    // mctx stays alive until the end of this function, i.e. for the whole reserve call
+    struct ggml_cgraph * gf = ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
+    return gf;
+}
+
 // llama adapter API
 
 int32_t llama_set_adapters_lora(
 
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "llama-context.h"
+#include "ggml.h"
+#include "stdint.h"
+
+// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
+//
+// n_tokens, n_seqs and n_outputs are forwarded unchanged to the context's graph_reserve().
+// ctx must not be null: the implementation dereferences it without a check.
+// NOTE(review): the result may presumably be null on failure (the export-graph-ops tool
+// checks for that) and is presumably owned by ctx, not the caller - confirm.
+LLAMA_API struct ggml_cgraph * llama_graph_reserve(
+        struct llama_context * ctx,
+        uint32_t n_tokens,
+        uint32_t n_seqs,
+        uint32_t n_outputs);
@@ -260,6 +260,7 @@ endif()
 set(LLAMA_TEST_NAME test-mtmd-c-api)
 llama_build_and_test(test-mtmd-c-api.c)
 target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
+unset(LLAMA_TEST_NAME)
 
 # GGUF model data fetcher library for tests that need real model metadata
 # Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
@@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
 
-
+# export-graph-ops is added with llama_build() rather than llama_build_and_test().
+# NOTE(review): presumably this compiles the tool without registering it as a test - confirm.
+llama_build(export-graph-ops.cpp)
+# ggml/src is added to the include path, the same way as for test-alloc above.
+target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
@@ -0,0 +1,169 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "../src/llama-ext.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <vector>
+
+// Type and layout description of one source (input) tensor of a graph node.
+// Ordered by (type, ne, nb) so it can take part in test_object comparisons.
+struct input_tensor {
+    ggml_type type;             // element type of the tensor
+    std::array<int64_t, 4> ne;  // copy of the tensor's `ne` array (presumably per-dimension sizes)
+    std::array<size_t, 4> nb;   // copy of the tensor's `nb` array (presumably per-dimension byte strides)
+
+    // Copies the first 4 entries of `ne` and of `nb`; both must point to at least 4 elements.
+    // The arrays are only read, so they are taken as pointers to const; callers that pass
+    // mutable pointers or arrays keep working.
+    input_tensor(ggml_type type, const int64_t * ne, const size_t * nb): type(type) {
+        // element-wise, typed copy: the count comes from the destination arrays instead of
+        // a hand-written byte size
+        std::copy_n(ne, this->ne.size(), this->ne.begin());
+        std::copy_n(nb, this->nb.size(), this->nb.begin());
+    }
+
+    // Lexicographic ordering over (type, ne, nb).
+    bool operator<(const input_tensor &b) const {
+        return std::tie(type, ne, nb) <
+               std::tie(b.type, b.ne, b.nb);
+    }
+
+    // Writes "<type> <ne0..ne3> <nb0..nb3> " to `out`: 9 values, each followed by a space.
+    void serialize(std::ostream& out) const {
+        out << type << ' ';
+        for (const int64_t dim : ne) {
+            out << dim << ' ';
+        }
+        for (const size_t stride : nb) {
+            out << stride << ' ';
+        }
+    }
+};
+
+// One exported graph operation: the op, its result type and shape, its raw op
+// parameters, the description of its source tensors and the node name.
+struct test_object {
+    ggml_op op;
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::vector<int32_t> op_params;
+    std::vector<input_tensor> sources;
+    std::string name;
+
+    // Ordering over everything except `name`, so two objects that differ only
+    // in their name compare as equivalent.
+    bool operator<(const test_object & other) const {
+        const auto lhs = std::tie(op, type, ne, op_params, sources);
+        const auto rhs = std::tie(other.op, other.type, other.ne, other.op_params, other.sources);
+        return lhs < rhs;
+    }
+
+    // Writes the object as a single text line:
+    //   <op> <type> <ne0..ne3> <#op_params> <op_params...> <#sources> <sources...> <name or '-'>
+    void serialize(std::ostream& out) const {
+        out << op << ' ' << type << ' ';
+        for (const int64_t dim : ne) {
+            out << dim << ' ';
+        }
+
+        // each variable-length list is prefixed with its element count
+        out << op_params.size() << ' ';
+        for (const int32_t param : op_params) {
+            out << param << ' ';
+        }
+
+        out << sources.size() << ' ';
+        for (const input_tensor & src : sources) {
+            src.serialize(out);
+        }
+
+        // an empty name is written as '-' so the last field is never blank
+        if (name.empty()) {
+            out << '-';
+        } else {
+            out << name;
+        }
+
+        out << '\n';
+    }
+};
+
+// Returns true for the node ops that extract_graph_ops() does not export.
+static bool is_skipped_graph_op(ggml_op op) {
+    switch (op) {
+        case GGML_OP_NONE:
+        case GGML_OP_VIEW:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Walks all nodes of `cgraph` and inserts a test_object for each non-skipped node
+// into `tests`; the set drops entries that compare equivalent to an existing one.
+// `label` is only used to tag the summary log line.
+static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
+    const int n_nodes  = ggml_graph_n_nodes(cgraph);
+    const int n_before = (int) tests.size();
+    int n_skipped = 0;
+
+    for (int idx = 0; idx < n_nodes; ++idx) {
+        ggml_tensor * node = ggml_graph_node(cgraph, idx);
+
+        if (is_skipped_graph_op(node->op)) {
+            ++n_skipped;
+            continue;
+        }
+
+        test_object entry;
+
+        entry.op   = node->op;
+        entry.type = node->type;
+        entry.name = node->name;
+        memcpy(&entry.ne, node->ne, 4 * sizeof(int64_t));
+
+        // op params are copied as a raw block of GGML_MAX_OP_PARAMS bytes
+        entry.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
+        memcpy(entry.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
+
+        // sources are collected up to (not including) the first null slot
+        for (size_t s = 0; s < GGML_MAX_SRC && node->src[s] != nullptr; ++s) {
+            ggml_tensor * src = node->src[s];
+            entry.sources.emplace_back(src->type, src->ne, src->nb);
+        }
+
+        tests.insert(entry);
+    }
+
+    // number of entries this graph actually added to the set
+    const int n_new = (int) tests.size() - n_before;
+    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
+            label, n_new, n_nodes, n_skipped);
+}
+
+// Exports the unique graph operations of the prompt-processing ("pp") and
+// token-generation ("tg") compute graphs to params.out_file (default "tests.txt"),
+// writing one serialized test_object per entry.
+//
+// Returns 1 when argument parsing fails and 0 on success. Every other failure
+// (context creation, graph reservation, opening or writing the output file) is
+// reported by throwing std::runtime_error, which is not caught here.
+int main(int argc, char ** argv) {
+    common_params params;
+    params.out_file = "tests.txt";
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
+        return 1;
+    }
+
+    common_init();
+
+    // Load CPU-only
+    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    params.devices = { cpu_device, nullptr };
+    params.fit_params = false;
+    params.n_gpu_layers = 0;
+
+    params.warmup = false;
+
+    auto init_result = common_init_from_params(params);
+
+    // a failed model/context load must not reach the llama_n_*() calls below,
+    // which would dereference a null context
+    llama_context * ctx = init_result ? init_result->context() : nullptr;
+    if (ctx == nullptr) {
+        throw std::runtime_error("failed to initialize llama context");
+    }
+
+    const uint32_t n_seqs  = llama_n_seq_max(ctx);
+    const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
+
+    std::set<test_object> tests;
+
+    // prompt processing: n_tokens tokens with n_tokens outputs
+    auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
+    if (!gf_pp) {
+        throw std::runtime_error("failed to reserve prompt processing graph");
+    }
+    extract_graph_ops(gf_pp, "pp", tests);
+
+    // token generation: one token and one output per sequence.
+    // gf_pp is not used past this point; llama-ext.h documents a reserved graph as
+    // valid only until the next llama_graph_reserve call.
+    auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
+    if (!gf_tg) {
+        throw std::runtime_error("failed to reserve token generation graph");
+    }
+    extract_graph_ops(gf_tg, "tg", tests);
+
+    LOG_INF("%d unique ops total\n", (int) tests.size());
+
+    std::ofstream f(params.out_file);
+
+    if (!f.is_open()) {
+        throw std::runtime_error("Unable to open output file");
+    }
+
+    for (const auto& test : tests) {
+        test.serialize(f);
+    }
+
+    // flush explicitly so that a failed write is reported instead of returning success
+    f.flush();
+    if (!f) {
+        throw std::runtime_error("Unable to write output file");
+    }
+
+    return 0;
+}