Update docs; remove NotImplementedError stubs in greedtok.py to allow more default behaviour; drop greedy_builder extension from setup.py; release v0.14

jararap · jararap · commit ccd839c65dab · 2025-04-17T04:03:26.000Z
diff --git a/README.md b/README.md
@@ -27,11 +27,10 @@ For detailed logs, compare cpp_logs/{$data}/{$data}.log versus cpp_logs/{$data}/
 
 Install the beta version (for transformers >= 4):
 ```
-wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.14-beta8.zip"
-unzip v0.14-beta8.zip -d pcatt
+wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.14.zip"
+unzip v0.14.zip -d pcatt
 cd pcatt
 pip install -r requirements.txt
-pip install transformers
 pip install .
 ```
 
diff --git a/pcatt/hf/greedtok.py b/pcatt/hf/greedtok.py
@@ -730,64 +730,6 @@ def if_none_convert(x, value):
 
         return batch_outputs
 
-    def encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput, EncodedInput],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        raise NotImplementedError
-
-    def batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            List[TextInput],
-            List[TextInputPair],
-            List[PreTokenizedInput],
-            List[PreTokenizedInputPair],
-            List[EncodedInput],
-            List[EncodedInputPair],
-        ],
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        raise NotImplementedError
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        raise NotImplementedError("Implemented in C++ backend")
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        raise NotImplementedError("Implemented in C++ backend")
-
     def prepare_for_model(
         self,
         ids: List[int],
@@ -811,16 +753,6 @@ def prepare_for_model(
     ) -> BatchEncoding:
         NotImplementedError("Implemented in C++ backend")
 
-    def truncate_sequences(
-        self,
-        ids: List[int],
-        pair_ids: Optional[List[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> Tuple[List[int], List[int], List[int]]:
-        raise NotImplementedError("Implemented in C++ backend")
-
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         """
         Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 from setuptools import setup, Extension
 from pathlib import Path
 
-PATH_PREFIXES = [get_path(p) for p in ['data', 'platlib']]
+PATH_PREFIXES = [get_path(p) for p in ["data", "platlib"]]
 
 modules = []
 include_dirs = [
@@ -11,18 +11,18 @@
     for path in [
         f"{prefix}/include/",
         f"{prefix}/include/tbb",
-        f"{prefix}/pybind11/include"
+        f"{prefix}/pybind11/include",
     ]
 ]
 
-for code in ["greedy_builder", "greedy_encoder", "pco_tokenizer"]:
+for code in ["greedy_encoder", "pco_tokenizer"]:
     modules.append(
         Extension(
             f"pcatt.{code}",
             extra_compile_args=["-O3", "-std=c++23"],
-            define_macros=[("MAJOR_VERSION", "0"), ("MINOR_VERSION", "14-beta")],
+            define_macros=[("MAJOR_VERSION", "0"), ("MINOR_VERSION", "14")],
             include_dirs=include_dirs,
-            library_dirs=[f"{prefix}/lib/" for prefix in PATH_PREFIXES] ,
+            library_dirs=[f"{prefix}/lib/" for prefix in PATH_PREFIXES],
             libraries=["tbb"],
             sources=[f"pcatt/{code}.cpp"],
         )
@@ -33,7 +33,7 @@
 
 setup(
     name="greedtok",
-    version="0.14-beta",
+    version="0.14",
     description="Partition Cover Approach to Tokenization",
     author="JP Lim",
     author_email="jiapeng.lim.2021@phdcs.smu.edu.sg",