Skip to content

Commit ccd839c

Browse files
committed
update doc, allow more default behaviour, release v0.14
1 parent 9824a7e commit ccd839c

3 files changed

Lines changed: 8 additions & 77 deletions

File tree

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,10 @@ For detailed logs, compare cpp_logs/{$data}/{$data}.log versus cpp_logs/{$data}/
2727

2828
Install the released version (for transformers >= 4):
2929
```
30-
wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.14-beta8.zip"
31-
unzip v0.14-beta8.zip -d pcatt
30+
wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.14.zip"
31+
unzip v0.14.zip -d pcatt
3232
cd pcatt
3333
pip install -r requirements.txt
34-
pip install transformers
3534
pip install .
3635
```
3736

pcatt/hf/greedtok.py

Lines changed: 0 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -730,64 +730,6 @@ def if_none_convert(x, value):
730730

731731
return batch_outputs
732732

733-
def encode_plus(
734-
self,
735-
text: Union[TextInput, PreTokenizedInput, EncodedInput],
736-
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
737-
add_special_tokens: bool = True,
738-
padding: Union[bool, str, PaddingStrategy] = False,
739-
truncation: Union[bool, str, TruncationStrategy] = None,
740-
max_length: Optional[int] = None,
741-
stride: int = 0,
742-
is_split_into_words: bool = False,
743-
pad_to_multiple_of: Optional[int] = None,
744-
return_tensors: Optional[Union[str, TensorType]] = None,
745-
return_token_type_ids: Optional[bool] = None,
746-
return_attention_mask: Optional[bool] = None,
747-
return_overflowing_tokens: bool = False,
748-
return_special_tokens_mask: bool = False,
749-
return_offsets_mapping: bool = False,
750-
return_length: bool = False,
751-
verbose: bool = True,
752-
**kwargs,
753-
) -> BatchEncoding:
754-
raise NotImplementedError
755-
756-
def batch_encode_plus(
757-
self,
758-
batch_text_or_text_pairs: Union[
759-
List[TextInput],
760-
List[TextInputPair],
761-
List[PreTokenizedInput],
762-
List[PreTokenizedInputPair],
763-
List[EncodedInput],
764-
List[EncodedInputPair],
765-
],
766-
padding: Union[bool, str, PaddingStrategy] = False,
767-
truncation: Union[bool, str, TruncationStrategy] = None,
768-
max_length: Optional[int] = None,
769-
stride: int = 0,
770-
is_split_into_words: bool = False,
771-
pad_to_multiple_of: Optional[int] = None,
772-
return_tensors: Optional[Union[str, TensorType]] = None,
773-
return_token_type_ids: Optional[bool] = None,
774-
return_attention_mask: Optional[bool] = None,
775-
return_overflowing_tokens: bool = False,
776-
return_special_tokens_mask: bool = False,
777-
**kwargs,
778-
) -> BatchEncoding:
779-
raise NotImplementedError
780-
781-
def create_token_type_ids_from_sequences(
782-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
783-
) -> List[int]:
784-
raise NotImplementedError("Implemented in C++ backend")
785-
786-
def build_inputs_with_special_tokens(
787-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
788-
) -> List[int]:
789-
raise NotImplementedError("Implemented in C++ backend")
790-
791733
def prepare_for_model(
792734
self,
793735
ids: List[int],
@@ -811,16 +753,6 @@ def prepare_for_model(
811753
) -> BatchEncoding:
812754
NotImplementedError("Implemented in C++ backend")
813755

814-
def truncate_sequences(
815-
self,
816-
ids: List[int],
817-
pair_ids: Optional[List[int]] = None,
818-
num_tokens_to_remove: int = 0,
819-
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
820-
stride: int = 0,
821-
) -> Tuple[List[int], List[int], List[int]]:
822-
raise NotImplementedError("Implemented in C++ backend")
823-
824756
def convert_tokens_to_string(self, tokens: List[str]) -> str:
825757
"""
826758
Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we

setup.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from setuptools import setup, Extension
33
from pathlib import Path
44

5-
PATH_PREFIXES = [get_path(p) for p in ['data', 'platlib']]
5+
PATH_PREFIXES = [get_path(p) for p in ["data", "platlib"]]
66

77
modules = []
88
include_dirs = [
@@ -11,18 +11,18 @@
1111
for path in [
1212
f"{prefix}/include/",
1313
f"{prefix}/include/tbb",
14-
f"{prefix}/pybind11/include"
14+
f"{prefix}/pybind11/include",
1515
]
1616
]
1717

18-
for code in ["greedy_builder", "greedy_encoder", "pco_tokenizer"]:
18+
for code in ["greedy_encoder", "pco_tokenizer"]:
1919
modules.append(
2020
Extension(
2121
f"pcatt.{code}",
2222
extra_compile_args=["-O3", "-std=c++23"],
23-
define_macros=[("MAJOR_VERSION", "0"), ("MINOR_VERSION", "14-beta")],
23+
define_macros=[("MAJOR_VERSION", "0"), ("MINOR_VERSION", "14")],
2424
include_dirs=include_dirs,
25-
library_dirs=[f"{prefix}/lib/" for prefix in PATH_PREFIXES] ,
25+
library_dirs=[f"{prefix}/lib/" for prefix in PATH_PREFIXES],
2626
libraries=["tbb"],
2727
sources=[f"pcatt/{code}.cpp"],
2828
)
@@ -33,7 +33,7 @@
3333

3434
setup(
3535
name="greedtok",
36-
version="0.14-beta",
36+
version="0.14",
3737
description="Partition Cover Approach to Tokenization",
3838
author="JP Lim",
3939
author_email="jiapeng.lim.2021@phdcs.smu.edu.sg",

0 commit comments

Comments
 (0)