
Commit d9a7bd5

gpu layer offloading disabled for phi models in clblast

1 parent 0a70cc1

4 files changed: +38 -36 lines changed


expose.cpp

Lines changed: 1 addition & 5 deletions
@@ -169,13 +169,9 @@ extern "C"
     {
         printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format);
     }
-    else if(file_format==FileFormat::GGUF_FALCON)
-    {
-        printf("\n---\nIdentified as FALCON model: (ver %d)\nAttempting to Load...\n---\n", file_format);
-    }
     else
     {
-        printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+        printf("\n---\nIdentified as GGUF model: (ver %d)\nAttempting to Load...\n---\n", file_format);
     }
     ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
     if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)

gpttype_adapter.cpp

Lines changed: 18 additions & 17 deletions
@@ -141,7 +141,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
     {
         return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
     }
-    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    else if(file_format == FileFormat::GGUF_GENERIC)
     {
         return std::string(llama_token_to_str(llama_ctx_v4, id));
     }
@@ -153,7 +153,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)

 static void TokenizeString(const std::string & str_to_tokenize, std::vector<int> & output_tokens, FileFormat file_format)
 {
-    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
     {
         if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
         {
@@ -182,9 +182,9 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab)
 {
     unsigned int eosID = 0;

-    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
     {
-        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        if(file_format == FileFormat::GGUF_GENERIC)
         {
             eosID = llama_token_eos(&(llama_ctx_v4->model));
         }
@@ -696,7 +696,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     file_format = in_file_format;
     n_threads = kcpp_params->n_threads = inputs.threads;
     n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads;
-    bool isGguf = (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON);
+    bool isGguf = (file_format == FileFormat::GGUF_GENERIC);

     n_batch = kcpp_params->n_batch = (isGguf?normalbatchsize:smallbatchsize);
     modelname = kcpp_params->model = inputs.model_filename;
@@ -712,7 +712,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     auto clamped_max_context_length = inputs.max_context_length;

     if(clamped_max_context_length>16384 &&
-    file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+    file_format != FileFormat::GGUF_GENERIC)
     {
         printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n");
         clamped_max_context_length = 16384;
@@ -748,7 +748,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         //approximate NTK aware ctx
         auto effectivenctx = kcpp_params->n_ctx;
-        if((file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) && file_format_meta.n_ctx_train > 2048)
+        if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
        {
             float factor = file_format_meta.n_ctx_train/2048;
             effectivenctx = effectivenctx/factor;
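(A quick worked example of the estimate above, using only the diff's own formula: a model with n_ctx_train = 4096 asked to run at n_ctx = 8192 gives factor = 4096/2048 = 2, so effectivenctx = 8192/2 = 4096. In other words, a model trained on a longer native context counts as proportionally "less stretched" when picking the approximate NTK-aware RoPE scaling for the same requested context.)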
@@ -781,7 +781,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

     printf("System Info: %s\n", llama_print_system_info());
 #if defined(GGML_USE_CUBLAS)
-    if(file_format!=FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+    if(file_format!=FileFormat::GGUF_GENERIC)
     {
         if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
         {
@@ -915,7 +915,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    else if(file_format==FileFormat::GGUF_GENERIC)
     {
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
@@ -932,10 +932,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         model_params.use_mmap = inputs.use_mmap;
         model_params.use_mlock = inputs.use_mlock;
         model_params.n_gpu_layers = inputs.gpulayers;
+
         #if defined(GGML_USE_CLBLAST)
-        if(file_format==FileFormat::GGUF_FALCON && model_params.n_gpu_layers>0)
+        if(file_format==FileFormat::GGUF_GENERIC && (file_format_meta.model_architecture == GGUFArch::FALCON || file_format_meta.model_architecture == GGUFArch::PHI) && model_params.n_gpu_layers>0)
         {
-            printf("\nGPU layer offload for GGUF FALCON on OpenCL is known to have issues, it has been set to 0.\n");
+            printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
             model_params.n_gpu_layers = 0;
         }
         #endif
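The condition in that last hunk is the substance of the commit: instead of keying on a dedicated FALCON file format, the CLBlast gate now consults the architecture recorded in file_format_meta, so Phi models are caught by the same check. A minimal sketch of the same predicate factored into a helper (hypothetical name, not part of the commit):

    // Hypothetical helper equivalent to the inline gate above: under
    // CLBlast, Falcon and Phi GGUF models cannot take GPU layer offload,
    // so the caller forces model_params.n_gpu_layers to 0.
    static bool ArchLacksCLOffload(FileFormat ff, const FileFormatExtraMeta & meta)
    {
        return ff == FileFormat::GGUF_GENERIC &&
               (meta.model_architecture == GGUFArch::FALCON ||
                meta.model_architecture == GGUFArch::PHI);
    }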
@@ -1642,13 +1643,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     else
     {
         bool triggersc = useSmartContext;
-        if(useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
+        if(useContextShift && (file_format == FileFormat::GGUF_GENERIC))
         {
             PurgeMissingTokens(llama_ctx_v4, current_context_tokens, embd_inp, inputs.max_length, nctx);
             triggersc = false;
         }
         ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false);
-        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        if(file_format == FileFormat::GGUF_GENERIC)
         {
             llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);
         }
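A hedged note on the eviction call above: in the llama.cpp API of this era, llama_kv_cache_seq_rm(ctx, seq_id, p0, p1) drops cached entries for seq_id whose positions fall in [p0, p1), with a negative p1 meaning "through the end of the sequence". So after ContextFastForward settles on a shared prefix of length n_past:

    // Evict the stale KV tail so decoding resumes cleanly after the
    // context shift; positions [n_past, end) are removed from sequence 0.
    llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);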
@@ -1669,7 +1670,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         //for non llama, limit to 256
         int bbs = blasbatchsize;
-        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
         {
             bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
         }
@@ -1821,7 +1822,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, kcpp_params->n_threads)==0);
     }
-    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    else if(file_format == FileFormat::GGUF_GENERIC)
     {
         evalres = (llama_decode(llama_ctx_v4, llama_batch_get_one(embd.data(), embdsize, n_past, 0))==0);
     }
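For reference, the GGUF path evaluates through llama_decode with a single-sequence batch. A hedged sketch of the call shape in llama.cpp of this vintage (the llama_batch_get_one signature changed in later upstream versions):

    // Wrap the pending tokens as one batch starting at position n_past
    // in sequence 0; llama_decode returns 0 on success.
    llama_batch batch = llama_batch_get_one(embd.data(), embdsize, n_past, 0);
    bool ok = (llama_decode(llama_ctx_v4, batch) == 0);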
@@ -1934,9 +1935,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     float * logitsPtr;
     float lowestLogit = 0;
     int btsize = banned_token_ids.size();
-    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
     {
-        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        if(file_format == FileFormat::GGUF_GENERIC)
         {
             logitsPtr = llama_get_logits(llama_ctx_v4);
         }

model_adapter.cpp

Lines changed: 10 additions & 12 deletions
@@ -255,7 +255,7 @@ void print_tok_vec(std::vector<float> &embd)
     else if(magic == 0x46554747)
     {
         fin.close();
-        fileformat = FileFormat::GGUF_LLAMA;
+        fileformat = FileFormat::GGUF_GENERIC;

         struct gguf_init_params ggufparams;
         ggufparams.no_alloc = true;
@@ -267,19 +267,8 @@ void print_tok_vec(std::vector<float> &embd)
         std::string modelarch = "";
         if (keyidx != -1) { modelarch = gguf_get_val_str(ctx, keyidx); }

-        if(modelarch=="llama")
-        {
-            fileformat = FileFormat::GGUF_LLAMA;
-        }
-        else if(modelarch=="falcon")
-        {
-            fileformat = FileFormat::GGUF_FALCON; //uses the same loader
-        }
-
-
         printf("\nThe reported GGUF Arch is: %s\n",(modelarch==""?"unknown":modelarch.c_str()));

-
         if(modelarch!="" && fileformatmeta!=nullptr)
         {
             std::string fkey = modelarch+".context_length";
@@ -289,6 +278,15 @@ void print_tok_vec(std::vector<float> &embd)
         }
         int filever = gguf_get_version(ctx);
         fileformatmeta->fileversion = filever;
+        fileformatmeta->model_architecture = GGUFArch::DEFAULT;
+        if(modelarch=="phi2")
+        {
+            fileformatmeta->model_architecture = GGUFArch::PHI;
+        }
+        else if(modelarch=="falcon")
+        {
+            fileformatmeta->model_architecture = GGUFArch::FALCON;
+        }
     }
     gguf_free(ctx);
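For context: the probe above keys off the GGUF magic 0x46554747 ("GGUF" read as little-endian ASCII) and the general.architecture metadata string. A standalone sketch of the same detection using the public gguf API the diff already calls (error handling elided; the gguf_* declarations lived in ggml.h in this era):

    #include <string>
    #include "ggml.h" // gguf_* API

    // Sketch: read general.architecture from a GGUF file, metadata only.
    static std::string probe_gguf_arch(const char * fname)
    {
        struct gguf_init_params params;
        params.no_alloc = true;  // do not allocate tensor data
        params.ctx = nullptr;    // no ggml context needed for metadata
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (ctx == nullptr) { return ""; }
        std::string arch = "";
        int keyidx = gguf_find_key(ctx, "general.architecture");
        if (keyidx != -1) { arch = gguf_get_val_str(ctx, keyidx); }
        gguf_free(ctx);
        return arch;
    }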

model_adapter.h

Lines changed: 9 additions & 2 deletions
@@ -21,7 +21,8 @@ enum FileFormat
     GGJT=3, // 3=(llama ggjt)
     GGJT_2=4, //newer llama format unshuffled
     GGJT_3=5, //using 16bit scalar
-    GGUF_LLAMA=6, //GGUF (llama newest ver)
+
+    GGUF_GENERIC=6, //GGUF (llama newest ver)

     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
@@ -47,14 +48,20 @@ enum FileFormat

     MPT_1=500, //first supported mpt version

-    GGUF_FALCON=600, //GGUF (falcon)
+};

+enum GGUFArch
+{
+    DEFAULT = 0, //used for llama and other generic gguf
+    FALCON = 1,
+    PHI = 2,
 };

 struct FileFormatExtraMeta
 {
     int n_ctx_train = 2048;
     int fileversion = 0;
+    GGUFArch model_architecture = GGUFArch::DEFAULT;
 };

 enum ModelLoadResult
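Net effect of the header change: the model architecture now travels as metadata beside the single GGUF_GENERIC format instead of minting a new FileFormat value per architecture. A minimal sketch of populating the new field, mirroring the model_adapter.cpp hunk (probe_gguf_arch is the hypothetical helper sketched earlier):

    FileFormatExtraMeta meta;                       // model_architecture defaults to GGUFArch::DEFAULT
    std::string modelarch = probe_gguf_arch(path);  // hypothetical helper from the earlier sketch
    if (modelarch == "phi2")        { meta.model_architecture = GGUFArch::PHI; }
    else if (modelarch == "falcon") { meta.model_architecture = GGUFArch::FALCON; }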
