PreferredAI
diff --git a/‎.gitignore‎
Lines changed: 10 additions & 0 deletions b/‎.gitignore‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎MANIFEST.in‎
Lines changed: 1 addition & 0 deletions b/‎MANIFEST.in‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎pcatt/.libs/libtbb.so.12‎
2.33 MB b/‎pcatt/.libs/libtbb.so.12‎
2.33 MB
diff --git a/‎pcatt/benchmark_encoding.py‎
Lines changed: 0 additions & 93 deletions b/‎pcatt/benchmark_encoding.py‎
Lines changed: 0 additions & 93 deletions
diff --git a/‎pcatt/hf/__init__.py‎
Lines changed: 0 additions & 2 deletions b/‎pcatt/hf/__init__.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎pcatt/hf/greedtok.py‎
Lines changed: 56 additions & 7 deletions b/‎pcatt/hf/greedtok.py‎
Lines changed: 56 additions & 7 deletions
diff --git a/‎pcatt/slow_bpe.py‎
Lines changed: 0 additions & 77 deletions b/‎pcatt/slow_bpe.py‎
Lines changed: 0 additions & 77 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 34 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎setup.cfg‎
Lines changed: 0 additions & 2 deletions b/‎setup.cfg‎
Lines changed: 0 additions & 2 deletions
@@ -30,3 +30,13 @@
 *.exe
 *.out
 *.app
+
+#build
+.eggs/**
+build/**
+dist/**
+wheelhouse/**
+**/**.egg-info/**
+**.tar.gz
+main.py
+*.pyc
@@ -0,0 +1 @@
+recursive-include pcatt/.libs *.so*
@@ -28,10 +28,10 @@ For detailed logs, compare cpp_logs/{$data}/{$data}.log versus cpp_logs/{$data}/
 
 ###  Huggingface AutoTokenizer interface
 
-Install the v0.14.x version (for transformers >= 4):
+Install v0.15 (for transformers >= 4) on Linux-based systems:
 ```
-wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.14.1.zip"
-unzip v0.14.1.zip -d pcatt
+wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.15.zip"
+unzip v0.15.zip -d pcatt
 cd pcatt
 pip install -r requirements.txt
 pip install .
 
@@ -1,13 +1,11 @@
 from pcatt.hf.greedtok import GreedTok
 from transformers import PretrainedConfig, AutoConfig, AutoTokenizer
 
-
 class GreedTokConfig(PretrainedConfig):
     """Configuration class carrying the ``"greedtok"`` model type."""
 
     model_type = "greedtok"
 
     def __init__(self, **kwargs):
         # The previous signature had no ``self``, so creating an instance
         # raised TypeError. Accept keyword arguments and forward them to the
         # base class so the config can actually be constructed.
         super().__init__(**kwargs)
 
-
 # Register the "greedtok" model type with AutoConfig, then associate
 # GreedTokConfig with the GreedTok tokenizer class in AutoTokenizer.
 AutoConfig.register("greedtok", GreedTokConfig)
 AutoTokenizer.register(GreedTokConfig, GreedTok)
@@ -510,6 +510,55 @@ def encode(
         )
 
         return encoded_inputs["input_ids"][0]
+    
+    def batch_encode(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        padding_side: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> List[List[int]]:
+        """
+        Encode a batch of inputs and return only their token ids.
+
+        Forwards every argument to `self._call_one` (with `is_split_into_words=False`) and returns the complete
+        `"input_ids"` entry of the result, rather than a single element of it.
+
+        Args:
+            text:
+                The first sequence(s) to be encoded. Presumably a list of strings, one per batch item, even though
+                the annotation describes a single input -- TODO confirm against `_call_one`.
+            text_pair (*optional*):
+                Optional second sequence(s) to be encoded together with `text`.
+            add_special_tokens, padding, truncation, max_length, stride, padding_side, return_tensors, **kwargs:
+                Passed through unchanged to `self._call_one`.
+                NOTE(review): `padding_side` is annotated `Optional[bool]`; Hugging Face tokenizers document it as
+                a `"left"`/`"right"` string -- confirm the intended type.
+        Returns:
+            The `"input_ids"` value produced by `self._call_one`, annotated as `List[List[int]]`.
+            NOTE(review): the container type presumably follows `return_tensors` when that is set -- confirm.
+        """
+
+        encoded_inputs = self._call_one(
+            text,
+            text_pair,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=False,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        return encoded_inputs["input_ids"]
+    
+    
 
     def _init_set(self, key, current_value, value_if_key_not_exist):
         if current_value != None:
@@ -835,8 +884,8 @@ def batch_decode(
     def train_new_from_iterator(
         self,
         text_iterator,
-        vocab_size,
-        special_tokens_map=None,
+        vocab_size: int,
+        special_tokens_map: dict[str,str]|None =None,
         **kwargs,
     ):
         """
@@ -894,11 +943,11 @@ def train_new_from_iterator(
 
     def train_new_from_counts(
         self,
-        word_counts,
-        vocab_size,
-        max_token_length=None,
-        min_word_count=None,
-        special_tokens_map=None,
+        word_counts: dict[str, int],
+        vocab_size: int,
+        max_token_length: int = 20,
+        min_word_count: int = 0,
+        special_tokens_map: dict[str, str] | None = None,
         **kwargs,
     ):
         """
 
@@ -0,0 +1,34 @@
+# Build configuration: setuptools backend, with pybind11 and tbb-devel
+# installed into the build environment.
+[build-system]
+requires = [
+    "setuptools>=64",
+    "wheel",
+    "pybind11",
+    "tbb-devel",
+]
+build-backend = "setuptools.build_meta"
+
+# Distribution metadata.
+[project]
+name = "greedtok"
+version = "0.15"
+description = "Partition Cover Approach to Tokenization"
+readme = "README.md"
+license = { text = "MIT" }
+authors = [
+    { name = "JP Lim", email = "jiapeng.lim.2021@phdcs.smu.edu.sg" }
+]
+# NOTE(review): pcatt/hf/greedtok.py annotates parameters with
+# `dict[str, str] | None`, which is evaluated at definition time and needs
+# Python 3.10+ unless the module uses `from __future__ import annotations`
+# -- confirm this lower bound.
+requires-python = ">=3.8"
+
+dependencies = [
+    "transformers>=4.4",
+]
+
+[project.urls]
+Homepage = "https://github.com/PreferredAI/pcatt/"
+Repository = "https://github.com/PreferredAI/pcatt/"
+
+# Ship package data files; presumably this picks up the shared libraries
+# listed in MANIFEST.in (pcatt/.libs/*.so*) -- verify in the built wheel.
+[tool.setuptools]
+include-package-data = true
+
+# Package discovery: only `pcatt` and its subpackages, searched from the
+# repository root.
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["pcatt", "pcatt.*"]