Skip to content

Commit b6244f6

Browse files
committed
update setup, bundle libtbb.so.12, small QoL fixes
1 parent 916d4f3 commit b6244f6

11 files changed

Lines changed: 121 additions & 210 deletions

File tree

.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,13 @@
3030
*.exe
3131
*.out
3232
*.app
33+
34+
#build
35+
.eggs/**
36+
build/**
37+
dist/**
38+
wheelhouse/**
39+
**/**.egg-info/**
40+
**.tar.gz
41+
main.py
42+
*.pyc

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
recursive-include pcatt/.libs *.so*

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ For detailed logs, compare cpp_logs/{$data}/{$data}.log versus cpp_logs/{$data}/
2828

2929
### Huggingface AutoTokenizer interface
3030

31-
Install the v0.14.x version (for transformers >= 4):
31+
Install v0.15 (for transformers >= 4) on Linux-based systems:
3232
```
33-
wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.14.1.zip"
34-
unzip v0.14.1.zip -d pcatt
33+
wget "https://github.com/PreferredAI/pcatt/archive/refs/tags/v0.15.zip"
34+
unzip v0.15.zip -d pcatt
3535
cd pcatt
3636
pip install -r requirements.txt
3737
pip install .

pcatt/.libs/libtbb.so.12

2.33 MB
Binary file not shown.

pcatt/benchmark_encoding.py

Lines changed: 0 additions & 93 deletions
This file was deleted.

pcatt/hf/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
from pcatt.hf.greedtok import GreedTok
22
from transformers import PretrainedConfig, AutoConfig, AutoTokenizer
33

4-
54
class GreedTokConfig(PretrainedConfig):
    """Configuration class that ties the "greedtok" model type to GreedTok.

    This module registers the class with `AutoConfig` under the name
    "greedtok" and maps it to the `GreedTok` tokenizer through
    `AutoTokenizer.register`.
    """

    # Identifier used when registering with `AutoConfig` in this module.
    model_type = "greedtok"

    def __init__(self, **kwargs):
        """Create the config, forwarding all keyword arguments to the base class.

        The previous signature, `__init__()`, did not accept `self`, so any
        attempt to instantiate the config raised `TypeError`.
        """
        super().__init__(**kwargs)
109

11-
1210
AutoConfig.register("greedtok", GreedTokConfig)
1311
AutoTokenizer.register(GreedTokConfig, GreedTok)

pcatt/hf/greedtok.py

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,55 @@ def encode(
510510
)
511511

512512
return encoded_inputs["input_ids"][0]
513+
514+
def batch_encode(
    self,
    text: Union[TextInput, PreTokenizedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
    max_length: Optional[int] = None,
    stride: int = 0,
    padding_side: Optional[str] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> List[List[int]]:
    """
    Encode `text` (optionally paired with `text_pair`) and return the ids of
    every encoded sequence.

    Unlike `encode`, which returns only the first entry of `input_ids`, this
    method returns the complete `input_ids` value produced by
    `self._call_one`.

    Args:
        text:
            The sequence(s) to encode; forwarded unchanged to `_call_one`.
            Presumably a list of strings for batched use; confirm against
            `_call_one`.
        text_pair (*optional*):
            Second sequence(s) to encode together with `text`; forwarded
            unchanged to `_call_one`.
        add_special_tokens, padding, truncation, max_length, stride,
        padding_side, return_tensors, **kwargs:
            Forwarded unchanged to `_call_one`. They presumably follow the
            usual Hugging Face tokenizer semantics; confirm against the
            installed `transformers` version.

    Returns:
        The `"input_ids"` entry of the encoding returned by `_call_one`.
        NOTE(review): when `return_tensors` is set this is presumably a
        tensor/array rather than `List[List[int]]`; confirm.
    """
    # Inputs are always treated as raw text, never as pre-split words.
    encoded_inputs = self._call_one(
        text,
        text_pair,
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        is_split_into_words=False,
        padding_side=padding_side,
        return_tensors=return_tensors,
        **kwargs,
    )

    return encoded_inputs["input_ids"]
560+
561+
513562

514563
def _init_set(self, key, current_value, value_if_key_not_exist):
515564
if current_value != None:
@@ -835,8 +884,8 @@ def batch_decode(
835884
def train_new_from_iterator(
836885
self,
837886
text_iterator,
838-
vocab_size,
839-
special_tokens_map=None,
887+
vocab_size: int,
888+
special_tokens_map: dict[str,str]|None =None,
840889
**kwargs,
841890
):
842891
"""
@@ -894,11 +943,11 @@ def train_new_from_iterator(
894943

895944
def train_new_from_counts(
896945
self,
897-
word_counts,
898-
vocab_size,
899-
max_token_length=None,
900-
min_word_count=None,
901-
special_tokens_map=None,
946+
word_counts: dict[str, int],
947+
vocab_size: int,
948+
max_token_length: int = 20,
949+
min_word_count: int = 0,
950+
special_tokens_map: dict[str, str] | None = None,
902951
**kwargs,
903952
):
904953
"""

pcatt/slow_bpe.py

Lines changed: 0 additions & 77 deletions
This file was deleted.

pyproject.toml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
[build-system]
2+
requires = [
3+
"setuptools>=64",
4+
"wheel",
5+
"pybind11",
6+
"tbb-devel",
7+
]
8+
build-backend = "setuptools.build_meta"
9+
10+
[project]
11+
name = "greedtok"
12+
version = "0.15"
13+
description = "Partition Cover Approach to Tokenization"
14+
readme = "README.md"
15+
license = { text = "MIT" }
16+
authors = [
17+
{ name = "JP Lim", email = "jiapeng.lim.2021@phdcs.smu.edu.sg" }
18+
]
19+
requires-python = ">=3.8"
20+
21+
dependencies = [
22+
"transformers>=4.4",
23+
]
24+
25+
[project.urls]
26+
Homepage = "https://github.com/PreferredAI/pcatt/"
27+
Repository = "https://github.com/PreferredAI/pcatt/"
28+
29+
[tool.setuptools]
30+
include-package-data = true
31+
32+
[tool.setuptools.packages.find]
33+
where = ["."]
34+
include = ["pcatt", "pcatt.*"]

setup.cfg

Lines changed: 0 additions & 2 deletions
This file was deleted.

0 commit comments

Comments
 (0)