GradientAI Auto ROPE Base calculation (LostRuins#910)

askmyteapot · LostRuins · web-flow · commit 1e72b65c38f7 · 2024-06-13T18:12:00.000+08:00
* GradientAI Auto ROPE Base calculation https://gradient.ai/blog/scaling-rotational-embeddings-for-long-context-language-models has a formula that better fits the ideal rope scaling. Tested with Llama3, checked calculation is correct for llama2. Retains logic for not scaling rope if under the trained CTX. * add in solar scaling logic Solar based models require the context values to be multiplied by 8. This is (I'm guessing) because the positions are based on a 32k context, but with a sliding window of 4k. * Update model_adapter.h adding in tensor count to identify solar models based on tensor count of 435. * Update model_adapter.cpp add in n_tensor count for solar identification * refactor and cleanup GradientAI rope scaling --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -7,6 +7,7 @@
 //No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
 //Python will ALWAYS provide the memory, we just write to it.
 
+#include <cmath>
 #include <time.h>
 #include <mutex>
 #include "model_adapter.h"
@@ -787,6 +788,19 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
     return desiredBlasBatchSize;
 }
 
+//this function applies automatic scaling to rope freq base when the desired context exceeds trained context
+static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
+{
+    if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
+    {
+        return original_rope_base;
+    }
+    float ctx_multiplier = (is_solar?8.0f:1.0f);
+	float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
+    float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
+    return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
+}
+
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
 {
     ggml_time_init();
@@ -835,28 +849,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        rope_freq_scale = 1.0f;
-        if (kcpp_params->n_ctx <= 2048) //normie mode
+        //Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
+        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx,false);
+        if(file_format==FileFormat::GGUF_GENERIC)
         {
-            rope_freq_base = 10000.0f;
+            printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
         }
         else
         {
-            //approximate NTK aware ctx
-            auto effectivenctx = kcpp_params->n_ctx;
-            if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
-            {
-                float factor = file_format_meta.n_ctx_train/2048;
-                effectivenctx = effectivenctx/factor;
-            }
-            float magic_multiplier = 8.0f;
-            float base_multiplier = effectivenctx*magic_multiplier;
-            float base_raw = 10000.0f;
-            rope_freq_base = (effectivenctx <= 2048 ? base_raw : base_multiplier);
-
+            printf("Using Automatic RoPE scaling, Pre-GGUF (scale:%.3f, base:%.1f).\n",rope_freq_scale, rope_freq_base);
         }
-
-        printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
     }
     gptj_ctx_v3.hparams.rope_freq_scale = neox_ctx_v3.hparams.rope_freq_scale = rope_freq_scale;
     gptj_ctx_v3.hparams.rope_freq_base = neox_ctx_v3.hparams.rope_freq_base = rope_freq_base;
@@ -1085,7 +1087,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         else
         {
-            //if the model modifes rope in any way, use the model values. Otherwise, use our automatic ones
+            //if the model modifes rope in any way, or uses yarn, use the model values. Otherwise, use our automatic ones
             //special exception for llama, which uses auto scale
             if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
             llamamodel->hparams.rope_freq_scale_train!=1.0f ||
@@ -1095,8 +1097,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             }
             else
             {
-                float multiplier_rope_base = llamamodel->hparams.rope_freq_base_train/10000.0f;
-                rope_freq_base *= multiplier_rope_base;
+				//Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
+                rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture==GGUFArch::ARCH_SOLAR);
                 llama_ctx_params.rope_freq_base = rope_freq_base;
                 llama_ctx_params.rope_freq_scale = rope_freq_scale;
                 printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
@@ -2467,4 +2469,4 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     concat_output_mtx.unlock();
     output.text = concat_output_reader_copy_res.c_str();
     return output;
-}
+}
diff --git a/model_adapter.cpp b/model_adapter.cpp
@@ -271,6 +271,9 @@ void print_tok_vec(std::vector<float> &embd)
 
         if(modelarch!="" && fileformatmeta!=nullptr)
         {
+            int n_tensors = gguf_get_n_tensors(ctx);
+            float freq_base_train = 0;
+
             std::string fkey = modelarch+".context_length";
             int keyidx = gguf_find_key(ctx, fkey.c_str());
             if (keyidx != -1) {
@@ -281,8 +284,14 @@ void print_tok_vec(std::vector<float> &embd)
             if (keyidx != -1) {
                 fileformatmeta->n_expert_count = gguf_get_val_u32(ctx, keyidx);
             }
+            fkey = modelarch+".rope.freq_base";
+            keyidx = gguf_find_key(ctx, fkey.c_str());
+            if (keyidx != -1) {
+                freq_base_train = gguf_get_val_f32(ctx, keyidx);
+            }
 
             int filever = gguf_get_version(ctx);
+
             fileformatmeta->fileversion = filever;
             fileformatmeta->model_architecture = GGUFArch::ARCH_DEFAULT;
             if(modelarch=="phi2")
@@ -297,7 +306,12 @@ void print_tok_vec(std::vector<float> &embd)
             {
                 fileformatmeta->model_architecture = GGUFArch::ARCH_MAMBA;
             }
+            else if(modelarch=="llama" && freq_base_train==10000.0f && n_tensors==435)
+            {
+                fileformatmeta->model_architecture = GGUFArch::ARCH_SOLAR;
+            }
         }
+
         gguf_free(ctx);
     }
 
@@ -531,4 +545,4 @@ void print_tok_vec(std::vector<float> &embd)
         //remove all tokens between start part and start of LCS in new prompt, thus avoiding shift
         //if LCS not found or mismatched, regenerate. chop new prompt and repeat from step B
     }
- }
+ }
diff --git a/model_adapter.h b/model_adapter.h
@@ -56,6 +56,7 @@ enum GGUFArch
     ARCH_FALCON = 1,
     ARCH_PHI = 2,
     ARCH_MAMBA = 3,
+    ARCH_SOLAR = 4,
 };
 
 struct FileFormatExtraMeta

Original file line number	Diff line number	Diff line change
`@@ -271,6 +271,9 @@ void print_tok_vec(std::vector<float> &embd)`
`271`	`271`
`272`	`272`	`if(modelarch!="" && fileformatmeta!=nullptr)`
`273`	`273`	`{`
	`274`	`+ int n_tensors = gguf_get_n_tensors(ctx);`
	`275`	`+ float freq_base_train = 0;`
	`276`	`+`
`274`	`277`	`std::string fkey = modelarch+".context_length";`
`275`	`278`	`int keyidx = gguf_find_key(ctx, fkey.c_str());`
`276`	`279`	`if (keyidx != -1) {`
`@@ -281,8 +284,14 @@ void print_tok_vec(std::vector<float> &embd)`
`281`	`284`	`if (keyidx != -1) {`
`282`	`285`	`fileformatmeta->n_expert_count = gguf_get_val_u32(ctx, keyidx);`
`283`	`286`	`}`
	`287`	`+ fkey = modelarch+".rope.freq_base";`
	`288`	`+ keyidx = gguf_find_key(ctx, fkey.c_str());`
	`289`	`+ if (keyidx != -1) {`
	`290`	`+ freq_base_train = gguf_get_val_f32(ctx, keyidx);`
	`291`	`+ }`
`284`	`292`
`285`	`293`	`int filever = gguf_get_version(ctx);`
	`294`	`+`
`286`	`295`	`fileformatmeta->fileversion = filever;`
`287`	`296`	`fileformatmeta->model_architecture = GGUFArch::ARCH_DEFAULT;`
`288`	`297`	`if(modelarch=="phi2")`
`@@ -297,7 +306,12 @@ void print_tok_vec(std::vector<float> &embd)`
`297`	`306`	`{`
`298`	`307`	`fileformatmeta->model_architecture = GGUFArch::ARCH_MAMBA;`
`299`	`308`	`}`
	`309`	`+ else if(modelarch=="llama" && freq_base_train==10000.0f && n_tensors==435)`
	`310`	`+ {`
	`311`	`+ fileformatmeta->model_architecture = GGUFArch::ARCH_SOLAR;`
	`312`	`+ }`
`300`	`313`	`}`
	`314`	`+`
`301`	`315`	`gguf_free(ctx);`
`302`	`316`	`}`
`303`	`317`
`@@ -531,4 +545,4 @@ void print_tok_vec(std::vector<float> &embd)`
`531`	`545`	`//remove all tokens between start part and start of LCS in new prompt, thus avoiding shift`
`532`	`546`	`//if LCS not found or mismatched, regenerate. chop new prompt and repeat from step B`
`533`	`547`	`}`
`534`		`- }`
	`548`	`+ }`