Skip to content

Commit ddce19d

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/nix/package-gguf-py.nix # .devops/nix/scope.nix # common/CMakeLists.txt # docs/backend/SYCL.md # examples/lookahead/lookahead.cpp # examples/lookup/lookup.cpp # examples/sycl/run-llama2.sh # examples/sycl/win-run-llama2.bat # examples/sycl/win-test.bat # ggml/src/ggml-hexagon/CMakeLists.txt # ggml/src/ggml-hexagon/htp/flash-attn-ops.c # ggml/src/ggml-hexagon/htp/hvx-dump.h # ggml/src/ggml-hexagon/htp/hvx-reduce.h # ggml/src/ggml-hexagon/htp/matmul-ops.c # ggml/src/ggml-hexagon/htp/softmax-ops.c # ggml/src/ggml-hexagon/htp/unary-ops.c # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # scripts/sync-ggml.last
2 parents 76b22a7 + 2634ed2 commit ddce19d

19 files changed

Lines changed: 375 additions & 74 deletions

Makefile

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -716,25 +716,25 @@ clean:
716716
rm -vrf llguidance
717717

718718
# useful tools
719-
main: tools/completion/completion.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
719+
main: tools/completion/completion.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
720720
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
721-
mainvk: tools/completion/completion.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
721+
mainvk: tools/completion/completion.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
722722
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
723-
fitparams: tools/fit-params/fit-params.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
723+
fitparams: tools/fit-params/fit-params.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
724724
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
725725
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/version.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
726726
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
727727
whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
728728
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
729-
ttsmain: tools/tts/tts.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
729+
ttsmain: tools/tts/tts.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
730730
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
731731
gguf-split: tools/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
732732
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
733-
mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp tools/mtmd/mtmd-helper.cpp tools/mtmd/clip.cpp common/debug.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
733+
mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp tools/mtmd/mtmd-helper.cpp tools/mtmd/clip.cpp common/debug.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
734734
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
735-
embedding: examples/embedding/embedding.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
735+
embedding: examples/embedding/embedding.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
736736
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
737-
embeddingvk: examples/embedding/embedding.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/chat.cpp common/preset.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
737+
embeddingvk: examples/embedding/embedding.cpp common/arg.cpp common/speculative.cpp common/ngram-cache.cpp common/ngram-map.cpp common/ngram-mod.cpp common/chat.cpp common/preset.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
738738
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
739739
ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/ttscpp.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
740740
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

common/arg.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3399,7 +3399,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
33993399
}
34003400
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
34013401
add_opt(common_arg(
3402-
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
3402+
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
34033403
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
34043404
common_speculative_type_to_str(params.speculative.type).c_str()),
34053405
[](common_params & params, const std::string & value) {
@@ -3413,6 +3413,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34133413
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
34143414
} else if (value == "ngram-map-k4v") {
34153415
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
3416+
} else if (value == "ngram-mod") {
3417+
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
34163418
} else {
34173419
throw std::invalid_argument("unknown speculative decoding type without draft model");
34183420
}

common/common.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ enum common_speculative_type {
168168
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
169169
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
170170
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
171+
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
171172
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
172173
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
173174
};
@@ -249,6 +250,8 @@ struct common_params_model {
249250
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
250251
};
251252

253+
struct common_ngram_mod;
254+
252255
struct common_params_speculative {
253256
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
254257

@@ -266,6 +269,8 @@ struct common_params_speculative {
266269
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
267270
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
268271

272+
std::shared_ptr<common_ngram_mod> ngram_mod;
273+
269274
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
270275
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
271276

common/jinja/value.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <set>
1313
#include <sstream>
1414
#include <string>
15+
#include <unordered_map>
1516
#include <vector>
1617

1718
namespace jinja {

common/ngram-map.cpp

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,21 @@
77
#include <cstdio>
88
#include <sstream>
99

10+
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
11+
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
12+
std::ostringstream oss;
13+
oss << '[';
14+
for (size_t i = 0; i < length; ++i) {
15+
if (i > 0) {
16+
oss << ", ";
17+
}
18+
oss << inp[start + i];
19+
}
20+
oss << ']';
21+
return oss.str();
22+
}
23+
24+
1025
// n-gram simple
1126
//
1227

@@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
100115
// maximum number of counted values of a ngram map value.
101116
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
102117

103-
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
104-
105118
void common_ngram_map_draft(common_ngram_map & map,
106119
const llama_tokens & inp, llama_token sampled,
107120
llama_tokens & draft) {
@@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
347360
n_accepted, curr_value.n_accepted);
348361
curr_value.n_accepted = n_accepted;
349362
}
350-
351-
// Helper functions.
352-
//
353-
354-
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
355-
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
356-
std::ostringstream oss;
357-
oss << '[';
358-
for (size_t i = 0; i < length; ++i) {
359-
if (i > 0) {
360-
oss << ", ";
361-
}
362-
oss << inp[start + i];
363-
}
364-
oss << ']';
365-
return oss.str();
366-
}
367-

common/ngram-map.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//
1212

1313
#include "llama.h"
14+
#include "common.h"
1415

1516
#include <vector>
1617

common/ngram-mod.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include "ngram-mod.h"
2+
3+
//
4+
// common_ngram_mod
5+
//
6+
7+
// Create a table for n-grams of `n` tokens with `size` slots, all of them EMPTY and none used.
//
// n    - number of tokens that idx() hashes per lookup
// size - requested number of slots; a request for 0 slots is raised to 1, because idx()
//        reduces the hash modulo entries.size() and an empty table would divide by zero
//
// The slots are constructed directly with EMPTY, so the table is written only once
// (instead of being zero-filled by resize() and then overwritten by reset()).
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size)
    : n(n),
      used(0),
      entries(size > 0 ? size : 1, EMPTY) {
}
12+
13+
size_t common_ngram_mod::idx(const entry_t * tokens) const {
14+
size_t res = 0;
15+
16+
for (size_t i = 0; i < n; ++i) {
17+
res = res*6364136223846793005ULL + tokens[i];
18+
}
19+
20+
res = res % entries.size();
21+
22+
return res;
23+
}
24+
25+
void common_ngram_mod::add(const entry_t * tokens) {
26+
const size_t i = idx(tokens);
27+
28+
if (entries[i] == EMPTY) {
29+
used++;
30+
}
31+
32+
entries[i] = tokens[n];
33+
}
34+
35+
// Look up the value stored for the n-gram tokens[0] .. tokens[n-1].
// Returns whatever the selected slot currently holds, which is EMPTY if nothing has been
// stored there since the last reset.
common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
    return entries[idx(tokens)];
}
40+
41+
// Forget all stored n-grams: every slot becomes EMPTY and the used counter returns to 0.
// The number of slots is unchanged.
void common_ngram_mod::reset() {
    // assign() with the current size rewrites every slot in place; unlike std::fill it is a
    // std::vector member, so it does not depend on <algorithm>, which is not included here
    entries.assign(entries.size(), EMPTY);
    used = 0;
}
45+
46+
size_t common_ngram_mod::get_n() const {
47+
return n;
48+
}
49+
50+
size_t common_ngram_mod::get_used() const {
51+
return used;
52+
}
53+
54+
size_t common_ngram_mod::size() const {
55+
return entries.size();
56+
}
57+
58+
size_t common_ngram_mod::size_bytes() const {
59+
return entries.size() * sizeof(entries[0]);
60+
}

common/ngram-mod.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#pragma once
2+
3+
#include <cstdint>
4+
#include <vector>
5+
#include <cstddef>
6+
7+
//
8+
// common_ngram_mod
9+
// ref: https://github.com/ggml-org/llama.cpp/pull/19164
10+
//
11+
12+
// Basic n-gram hasher: a fixed-size table of slots addressed by a hash of an n-gram.
// Each slot holds one value (the token recorded after the n-gram) or EMPTY. Slots are
// selected by hash only, so different n-grams can share a slot and overwrite each other.
struct common_ngram_mod {
    typedef int32_t entry_t;

    // marker for a slot that holds no value
    static constexpr entry_t EMPTY = -1;

    // n    - number of tokens hashed per n-gram
    // size - number of slots in the table
    common_ngram_mod(uint16_t n, size_t size);

    // slot index for the n-gram tokens[0..n-1]
    size_t idx(const entry_t * tokens) const;

    // store tokens[n] in the slot of the n-gram tokens[0..n-1] (reads n + 1 tokens)
    void add(const entry_t * tokens);

    // value in the slot of the n-gram tokens[0..n-1]; EMPTY (-1) if nothing is stored there
    entry_t get(const entry_t * tokens) const;

    // set every slot back to EMPTY and clear the used counter
    void reset();

    size_t get_n() const;    // n-gram length
    size_t get_used() const; // number of slots filled since the last reset

    size_t size() const;       // number of slots
    size_t size_bytes() const; // slot storage in bytes

private:
    size_t n; // ngram size to hash

    size_t used; // slots that currently count as occupied

    std::vector<entry_t> entries; // the slots themselves
};

0 commit comments

Comments
 (0)