Skip to content

Commit 3a60272

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 5edf151 + 557fe2d commit 3a60272

12 files changed

Lines changed: 561 additions & 58 deletions

File tree

common/arg.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2671,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26712671
[](common_params & params, const std::string & value) {
26722672
params.out_file = value;
26732673
}
2674-
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
2674+
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
2675+
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
26752676
add_opt(common_arg(
26762677
{"-ofreq", "--output-frequency"}, "N",
26772678
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ enum llama_example {
106106
LLAMA_EXAMPLE_FINETUNE_QLORA,
107107
LLAMA_EXAMPLE_FIT_PARAMS,
108108
LLAMA_EXAMPLE_RESULTS,
109+
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
109110

110111
LLAMA_EXAMPLE_COUNT,
111112
};

scripts/sync_vendor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import sys
66
import subprocess
77

8-
HTTPLIB_VERSION = "refs/tags/v0.37.0"
8+
HTTPLIB_VERSION = "refs/tags/v0.37.1"
99

1010
vendor = {
1111
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

src/llama-context.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "llama-memory.h"
88
#include "llama-mmap.h"
99
#include "llama-model.h"
10+
#include "llama-ext.h"
1011

1112
#include <cinttypes>
1213
#include <cmath>
@@ -3240,6 +3241,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
32403241
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
32413242
}
32423243

3244+
struct ggml_cgraph * llama_graph_reserve(
3245+
struct llama_context * ctx,
3246+
uint32_t n_tokens,
3247+
uint32_t n_seqs,
3248+
uint32_t n_outputs) {
3249+
auto * memory = ctx->get_memory();
3250+
llama_memory_context_ptr mctx;
3251+
if (memory) {
3252+
mctx = memory->init_full();
3253+
}
3254+
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
3255+
}
3256+
32433257
// llama adapter API
32443258

32453259
int32_t llama_set_adapters_lora(

src/llama-ext.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#pragma once
2+
3+
#include "llama-context.h"
4+
#include "ggml.h"
5+
#include "stdint.h"
6+
7+
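// Reserve a new compute graph sized by n_tokens, n_seqs and n_outputs.
// The returned graph is valid only until the next call to llama_graph_reserve.
// Callers should check the result for NULL before using it.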
8+
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
9+
struct llama_context * ctx,
10+
uint32_t n_tokens,
11+
uint32_t n_seqs,
12+
uint32_t n_outputs);

tests/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ endif()
260260
set(LLAMA_TEST_NAME test-mtmd-c-api)
261261
llama_build_and_test(test-mtmd-c-api.c)
262262
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
263+
unset(LLAMA_TEST_NAME)
263264

264265
# GGUF model data fetcher library for tests that need real model metadata
265266
# Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
@@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
284285
llama_build_and_test(test-alloc.cpp)
285286
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
286287

287-
288+
llama_build(export-graph-ops.cpp)
289+
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

tests/export-graph-ops.cpp

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "../src/llama-ext.h"
#include "ggml.h"

#include <algorithm>
#include <array>
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
#include <tuple>
#include <vector>
13+
14+
struct input_tensor {
15+
ggml_type type;
16+
std::array<int64_t, 4> ne;
17+
std::array<size_t, 4> nb;
18+
19+
input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
20+
memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
21+
memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
22+
}
23+
24+
bool operator<(const input_tensor &b) const {
25+
return std::tie(type, ne, nb) <
26+
std::tie(b.type, b.ne, b.nb);
27+
}
28+
29+
void serialize(std::ostream& out) const {
30+
out << type << ' ';
31+
for (size_t i = 0; i < 4; i++) {
32+
out << ne[i] << ' ';
33+
}
34+
for (size_t i = 0; i < 4; i++) {
35+
out << nb[i] << ' ';
36+
}
37+
}
38+
};
39+
40+
struct test_object {
41+
ggml_op op;
42+
ggml_type type;
43+
std::array<int64_t, 4> ne;
44+
std::vector<int32_t> op_params;
45+
std::vector<input_tensor> sources;
46+
std::string name;
47+
48+
void serialize(std::ostream& out) const {
49+
out << op << ' ' << type << ' ';
50+
for (size_t i = 0; i < 4; i++) {
51+
out << ne[i] << ' ';
52+
}
53+
54+
out << op_params.size() << ' ';
55+
for (size_t i = 0; i < op_params.size(); i++) {
56+
out << op_params[i] << ' ';
57+
}
58+
59+
out << sources.size() << ' ';
60+
for (size_t s = 0; s < sources.size(); s++) {
61+
sources[s].serialize(out);
62+
}
63+
64+
if (!name.empty()) {
65+
out << name;
66+
} else {
67+
out << '-';
68+
}
69+
70+
out << '\n';
71+
}
72+
73+
bool operator<(const test_object &b) const {
74+
return std::tie(op, type, ne, op_params, sources) <
75+
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
76+
}
77+
};
78+
79+
// Walks every node of `cgraph` and inserts a test_object describing it into `tests`.
// NONE / VIEW / RESHAPE / PERMUTE / TRANSPOSE nodes are counted as skipped and not recorded.
// `label` is only used to prefix the summary log line.
static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
    const int n_nodes  = ggml_graph_n_nodes(cgraph);
    const int n_before = (int) tests.size();
    int n_skipped = 0;

    for (int idx = 0; idx < n_nodes; idx++) {
        ggml_tensor * node = ggml_graph_node(cgraph, idx);

        switch (node->op) {
            case GGML_OP_NONE:
            case GGML_OP_VIEW:
            case GGML_OP_RESHAPE:
            case GGML_OP_PERMUTE:
            case GGML_OP_TRANSPOSE:
                n_skipped++;
                continue; // next node
            default:
                break;
        }

        test_object entry;

        entry.op   = node->op;
        entry.type = node->type;
        entry.name = node->name;
        memcpy(entry.ne.data(), node->ne, entry.ne.size() * sizeof(int64_t));

        // op_params is captured as raw 32-bit words covering the whole parameter buffer
        entry.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
        memcpy(entry.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);

        // NOTE(review): collection stops at the first null source, so any source slot that
        // follows a null one is not recorded -- confirm this is intended for ops with optional sources.
        for (size_t s = 0; s < GGML_MAX_SRC && node->src[s] != nullptr; s++) {
            ggml_tensor * src = node->src[s];
            entry.sources.emplace_back(src->type, src->ne, src->nb);
        }

        // the set keeps only one entry per distinct (op, type, ne, op_params, sources)
        tests.insert(entry);
    }

    const int n_new = (int) tests.size() - n_before;
    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
            label, n_new, n_nodes, n_skipped);
}
116+
117+
int main(int argc, char ** argv) {
118+
common_params params;
119+
params.out_file = "tests.txt";
120+
121+
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
122+
return 1;
123+
}
124+
125+
common_init();
126+
127+
// Load CPU-only
128+
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
129+
params.devices = { cpu_device, nullptr };
130+
params.fit_params = false;
131+
params.n_gpu_layers = 0;
132+
133+
params.warmup = false;
134+
135+
auto init_result = common_init_from_params(params);
136+
137+
llama_context * ctx = init_result->context();
138+
139+
const uint32_t n_seqs = llama_n_seq_max(ctx);
140+
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
141+
142+
std::set<test_object> tests;
143+
144+
auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
145+
if (!gf_pp) {
146+
throw std::runtime_error("failed to reserve prompt processing graph");
147+
}
148+
extract_graph_ops(gf_pp, "pp", tests);
149+
150+
auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
151+
if (!gf_tg) {
152+
throw std::runtime_error("failed to reserve token generation graph");
153+
}
154+
extract_graph_ops(gf_tg, "tg", tests);
155+
156+
LOG_INF("%d unique ops total\n", (int) tests.size());
157+
158+
std::ofstream f(params.out_file);
159+
160+
if (!f.is_open()) {
161+
throw std::runtime_error("Unable to open output file");
162+
}
163+
164+
for (const auto& test : tests) {
165+
test.serialize(f);
166+
}
167+
168+
return 0;
169+
}

0 commit comments

Comments
 (0)