Skip to content

Commit 31eca42

Browse files
authored
chore: Update tokenizers (#823)
## Description This PR adds new tokenizer headers and bumps executorch binaries to reflect the changes. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [x] Other (chores, tests, code style improvements etc.) ### Tested on - [ ] iOS - [ ] Android ### Testing instructions Run apps for LLM, T2I, S2T, Embedding
1 parent e51547e commit 31eca42

33 files changed

+4201
-5
lines changed
1.35 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/hf_tokenizer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,4 @@ class HFTokenizer : public Tokenizer {
8080
Model::Ptr _model;
8181
};
8282

83-
} // namespace tokenizers
83+
} // namespace tokenizers

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/model.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,4 @@ class ModelConfig {
148148
Model::Ptr create() const;
149149
};
150150

151-
} // namespace tokenizers
151+
} // namespace tokenizers

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/pre_tokenizer.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ class PreTokenizerConfig {
104104
*/
105105
PRETOKENIZER_CONFIG_MEMBER(bool, add_prefix_space)
106106

107+
/**
108+
* Used by: ByteLevelPreTokenizer
109+
*/
110+
PRETOKENIZER_CONFIG_MEMBER(bool, use_regex)
111+
107112
/**
108113
* Used by RegexPreTokenizer
109114
*/
@@ -225,11 +230,13 @@ class ByteLevelPreTokenizer : public PreTokenizer {
225230
* @param add_prefix_space: Whether to add a leading space to the first word
226231
* @param pattern: A user-supplied regex to use for token splitting. If not
227232
* provided, it uses the standard GPT2 pattern.
233+
* @param use_regex: Whether to use regex for splitting. If false, only apply
234+
* byte encoding without splitting.
228235
*/
229236
ByteLevelPreTokenizer(bool add_prefix_space = true,
230-
const std::string &pattern = "");
237+
const std::string &pattern = "", bool use_regex = true);
231238
explicit ByteLevelPreTokenizer(const std::string &pattern)
232-
: ByteLevelPreTokenizer(true, pattern) {}
239+
: ByteLevelPreTokenizer(true, pattern, true) {}
233240

234241
/** Perform pre-tokenization */
235242
std::vector<std::string>
@@ -238,6 +245,7 @@ class ByteLevelPreTokenizer : public PreTokenizer {
238245
private:
239246
const std::string pattern_;
240247
const bool add_prefix_space_;
248+
const bool use_regex_;
241249

242250
}; // end class ByteLevelPreTokenizer
243251

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/truncation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,4 @@ class TruncationConfig {
8989
TruncationParams params;
9090
};
9191

92-
} // namespace tokenizers
92+
} // namespace tokenizers

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/unicode-nfc-data.h

Lines changed: 4140 additions & 0 deletions
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Unicode NFC/NFD normalization implementation
3+
* This implementation fixes the broken llama.cpp-unicode normalization
4+
* by using proper multi-codepoint decomposition sequences.
5+
*/
6+
7+
#pragma once
8+
9+
#include <cstdint>
10+
#include <string>
11+
#include <vector>
12+
13+
namespace tokenizers {
14+
15+
/**
16+
* Normalize a sequence of codepoints to NFD (Canonical Decomposition)
17+
*
18+
* @param cpts Input codepoint sequence
19+
* @return NFD-normalized codepoint sequence
20+
*/
21+
std::vector<uint32_t> unicode_normalize_nfd(const std::vector<uint32_t> &cpts);
22+
23+
/**
24+
* Normalize a sequence of codepoints to NFC (Canonical Decomposition +
25+
* Composition)
26+
*
27+
* @param cpts Input codepoint sequence
28+
* @return NFC-normalized codepoint sequence
29+
*/
30+
std::vector<uint32_t> unicode_normalize_nfc(const std::vector<uint32_t> &cpts);
31+
32+
/**
33+
* Normalize a UTF-8 string to NFD
34+
*
35+
* @param utf8 Input UTF-8 string
36+
* @return NFD-normalized UTF-8 string
37+
*/
38+
std::string unicode_normalize_nfd_utf8(const std::string &utf8);
39+
40+
/**
41+
* Normalize a UTF-8 string to NFC
42+
*
43+
* @param utf8 Input UTF-8 string
44+
* @return NFC-normalized UTF-8 string
45+
*/
46+
std::string unicode_normalize_nfc_utf8(const std::string &utf8);
47+
48+
} // namespace tokenizers

0 commit comments

Comments
 (0)