diff --git a/.gitignore b/.gitignore index ee5f4bd..6f75543 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,15 @@ labs .venv *.7z *.env -*.egg*/ \ No newline at end of file +*.egg*/ + +# Rust / Maturin build artifacts +target/ +*.pyd +*.pdb +*.so + +# IDE / tool state +.idea/ +.serena/ +claudedocs/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..b9e5dc0 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,494 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "fancy-regex" +version = "0.12.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05" +dependencies = [ + "bit-set", + "regex", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "parking_lot", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + 
+[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", 
+] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "textspitter-core" +version = "2.0.0" +dependencies = [ + "chardetng", + "memchr", + "pyo3", + "rayon", + "regex", + "tiktoken-rs", + "unicode-normalization", +] + +[[package]] +name = "tiktoken-rs" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234" +dependencies = [ + "anyhow", + "base64", + "bstr", + "fancy-regex", + "lazy_static", + "parking_lot", + "rustc-hash", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..bc840c2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "textspitter-core" +version = "2.0.0" +edition = "2021" + +[lib] +name = "_core" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.21", features = ["extension-module", "abi3-py310"] } +chardetng = "0.1" +rayon = "1" +regex = "1" +unicode-normalization = "0.1" +tiktoken-rs = "0.5" + +[features] +default = [] +simd = ["memchr/std"] + +[dependencies.memchr] +version = "2" +optional = true + +[profile.release] +lto = true +codegen-units = 1 +opt-level = 3 diff --git a/TextSpitter/__init__.py b/TextSpitter/__init__.py index 8f03c68..4a64a45 100644 --- a/TextSpitter/__init__.py +++ b/TextSpitter/__init__.py @@ -9,9 +9,40 @@ except PackageNotFoundError: __version__ = "unknown" +try: + from TextSpitter._core import ( # type: ignore[import] + Chunk, + TextChunker, + TextNormalizer, + TokenCounter, + detect_encoding, + ) + + _RUST_AVAILABLE = True +except ImportError: + from TextSpitter._fallback import ( + Chunk, + TextChunker, + TextNormalizer, + TokenCounter, + detect_encoding, + ) + + _RUST_AVAILABLE = False + from .main import WordLoader -__all__ = ["TextSpitter", "WordLoader", "__version__"] +__all__ = [ + "TextSpitter", + "WordLoader", + "TextNormalizer", + "TextChunker", + "TokenCounter", + "Chunk", + "detect_encoding", + "_RUST_AVAILABLE", + "__version__", +] def TextSpitter( diff --git a/TextSpitter/_fallback.py b/TextSpitter/_fallback.py new file mode 100644 index 0000000..0e1ee38 --- /dev/null +++ b/TextSpitter/_fallback.py @@ -0,0 +1,322 @@ +""" +Pure-Python fallback implementations for when the Rust extension is unavailable. + +These match the interface of TextSpitter._core exactly, so callers can use +either path without branching. 
+""" + +from __future__ import annotations + +import unicodedata +from typing import Literal + + +def detect_encoding(data: bytes) -> str: + """Detect encoding by trying common codecs in priority order.""" + if not data: + return "utf-8" + for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"): + try: + data.decode(enc) + return enc + except (UnicodeDecodeError, LookupError): + continue + return "utf-8" + + +class TextNormalizer: + _NormForm = Literal["NFC", "NFD", "NFKC", "NFKD"] + + def __init__( + self, + unicode_form: _NormForm = "NFC", + collapse_whitespace: bool = True, + repair_ocr: bool = False, + strip_headers_footers: bool = False, + ) -> None: + self.unicode_form: TextNormalizer._NormForm = unicode_form + self.collapse_whitespace = collapse_whitespace + self.repair_ocr = repair_ocr + self.strip_headers_footers = strip_headers_footers + + def normalize(self, text: str) -> str: + s = unicodedata.normalize(self.unicode_form, text) + if self.strip_headers_footers: + s = self._strip_headers(s) + if self.repair_ocr: + s = self._repair_ocr(s) + if self.collapse_whitespace: + import re + + s = re.sub(r"[^\S\n]+", " ", s) + s = re.sub(r"\n{3,}", "\n\n", s) + s = s.strip() + return s + + def normalize_batch(self, texts: list[str]) -> list[str]: + return [self.normalize(t) for t in texts] + + def _strip_headers(self, text: str) -> str: + pages = text.split("\x0c") + if len(pages) < 2: + return text + all_lines = [p.splitlines() for p in pages] + candidates = { + ln.strip() + for ln in all_lines[0] + if ln.strip() + and sum( + 1 + for pl in all_lines + if ln.strip() in [row.strip() for row in pl] + ) + * 2 + > len(pages) + } + return "\x0c".join( + "\n".join( + row + for row in page.splitlines() + if row.strip() not in candidates + ) + for page in pages + ) + + def _repair_ocr(self, text: str) -> str: + import re + + text = re.sub(r"([a-z])rn([a-z])", r"\1m\2", text) + text = re.sub(r"(\d)l(\d)", r"\g<1>1\2", text) + return text + + +class Chunk: + def __init__( + 
self, + text: str, + token_count: int, + char_start: int, + char_end: int, + section_title: str | None, + chunk_index: int, + total_chunks: int | None, + metadata: dict, + ) -> None: + self.text = text + self.token_count = token_count + self.char_start = char_start + self.char_end = char_end + self.section_title = section_title + self.chunk_index = chunk_index + self.total_chunks = total_chunks + self.metadata = metadata + + def __repr__(self) -> str: + return ( + f"Chunk(index={self.chunk_index}/{self.total_chunks}, " + f"tokens={self.token_count}, " + f"chars={self.char_start}..{self.char_end})" + ) + + +class TextChunker: + def __init__( + self, + max_tokens: int = 2000, + min_tokens: int = 100, + tokenizer: str = "cl100k_base", + preserve_tables: bool = True, + section_patterns: list[str] | None = None, + ) -> None: + if min_tokens > max_tokens: + raise ValueError( + f"min_tokens ({min_tokens}) must be " + f"<= max_tokens ({max_tokens})" + ) + try: + import tiktoken + + tiktoken.get_encoding(tokenizer) + except ImportError: + pass + self.max_tokens = max_tokens + self.min_tokens = min_tokens + self.tokenizer = tokenizer + self.preserve_tables = preserve_tables + self.section_patterns = section_patterns or [] + + def _count(self, text: str) -> int: + try: + import tiktoken + + enc = tiktoken.get_encoding(self.tokenizer) + # Mirror Rust encode_with_special_tokens: allow all special tokens. + return len(enc.encode(text, allowed_special="all")) + except Exception: + return len(text) // 4 + + def chunk(self, text: str) -> list[Chunk]: + import re + + # Split with a capturing group so we can measure the actual separator + # length (2+ newlines). Without this, char_cursor drifts when gaps use + # 3+ newlines because the old code always added a fixed +2. 
+ pieces = re.split(r"(\n\n+)", text) + chunks: list[Chunk] = [] + current_parts: list[str] = [] + current_tokens = 0 + char_cursor = 0 + current_start = 0 + section_title: str | None = None + + for idx, piece in enumerate(pieces): + if idx % 2 == 1: + # Odd indices are separator strings ("\n\n", "\n\n\n", …) + char_cursor += len(piece) + continue + + para = piece.strip() + if not para: + char_cursor += len(piece) + continue + + para_tokens = self._count(para) + + # Single paragraph exceeds max_tokens — emit as oversized. + if para_tokens > self.max_tokens: + if current_parts: + chunk_text = "\n\n".join(current_parts) + chunks.append( + Chunk( + text=chunk_text, + token_count=current_tokens, + char_start=current_start, + char_end=char_cursor, + section_title=section_title, + chunk_index=0, + total_chunks=None, + metadata={}, + ) + ) + current_parts = [] + current_tokens = 0 + current_start = char_cursor + end = char_cursor + len(piece) + chunks.append( + Chunk( + text=para, + token_count=para_tokens, + char_start=char_cursor, + char_end=end, + section_title=section_title, + chunk_index=0, + total_chunks=None, + metadata={"oversized": True}, + ) + ) + char_cursor = end + current_start = char_cursor + continue + + if current_tokens + para_tokens > self.max_tokens and current_parts: + chunk_text = "\n\n".join(current_parts) + chunks.append( + Chunk( + text=chunk_text, + token_count=current_tokens, + char_start=current_start, + char_end=char_cursor, + section_title=section_title, + chunk_index=0, + total_chunks=None, + metadata={}, + ) + ) + current_parts = [] + current_tokens = 0 + current_start = char_cursor + + current_parts.append(para) + current_tokens += para_tokens + char_cursor += len(piece) + + if current_parts: + chunk_text = "\n\n".join(current_parts) + chunks.append( + Chunk( + text=chunk_text, + token_count=current_tokens, + char_start=current_start, + char_end=char_cursor, + section_title=section_title, + chunk_index=0, + total_chunks=None, + metadata={}, 
+ ) + ) + + total = len(chunks) + for i, c in enumerate(chunks): + c.chunk_index = i + c.total_chunks = total + return chunks + + def chunk_batch(self, texts: list[str]) -> list[list[Chunk]]: + return [self.chunk(t) for t in texts] + + +class TokenCounter: + def __init__(self, model: str = "cl100k_base") -> None: + try: + import tiktoken + + tiktoken.get_encoding(model) + except ImportError: + pass + self.model = model + + def _bpe(self): + try: + import tiktoken + + return tiktoken.get_encoding(self.model) + except Exception: + return None + + def count(self, text: str) -> int: + bpe = self._bpe() + # allowed_special="all" mirrors Rust encode_with_special_tokens and + # prevents ValueError when text contains tokens like <|endoftext|>. + return ( + len(bpe.encode(text, allowed_special="all")) + if bpe + else len(text) // 4 + ) + + def count_batch(self, texts: list[str]) -> list[int]: + return [self.count(t) for t in texts] + + def truncate( + self, text: str, max_tokens: int, strategy: str = "end" + ) -> str: + bpe = self._bpe() + if bpe is None: + # Word-based approximation: ~1 token per word + words = text.split() + if len(words) <= max_tokens: + return text + if strategy == "middle": + half = max_tokens // 2 + kept = words[:half] + words[-(max_tokens - half) :] + else: + kept = words[:max_tokens] + return " ".join(kept) + tokens = bpe.encode(text, allowed_special="all") + if len(tokens) <= max_tokens: + return text + if strategy == "middle": + half = max_tokens // 2 + kept = tokens[:half] + tokens[-(max_tokens - half) :] + else: + kept = tokens[:max_tokens] + return bpe.decode(kept) diff --git a/TextSpitter/core.py b/TextSpitter/core.py index 125b5c3..4e91c32 100644 --- a/TextSpitter/core.py +++ b/TextSpitter/core.py @@ -10,6 +10,8 @@ from docx import Document +from TextSpitter import detect_encoding + # --- Module-level imports for optional PDF libraries --- try: import pymupdf @@ -301,32 +303,25 @@ def get_contents(self) -> bytes: def code_file_read(self) -> 
str: """ Reads contents from programming language files (.py, .js, .java, etc.) - with enhanced encoding detection and preserves original formatting. + with encoding detection and preserves original formatting. Returns: str: The file content as a string """ contents_bytes = self.get_contents() - - # Common encodings for source code files - encodings_to_try = ["utf-8", "utf-8-sig", "latin-1", "cp1252"] - - for encoding in encodings_to_try: - try: - content = contents_bytes.decode(encoding) - logger.info( - f"Successfully decoded {self.file_name} using {encoding}" - ) - return content - except UnicodeDecodeError: - continue - - # If all encodings fail, use utf-8 with replacement - logger.warning( - f"Could not decode code file {self.file_name} with standard " - f"encodings, using utf-8 with replacement characters." - ) - return contents_bytes.decode("utf-8", errors="replace") + encoding = detect_encoding(contents_bytes) + try: + content = contents_bytes.decode(encoding) + logger.info( + f"Successfully decoded {self.file_name} using {encoding}" + ) + return content + except (UnicodeDecodeError, LookupError): + logger.warning( + f"Could not decode {self.file_name} with detected encoding " + f"'{encoding}', falling back to utf-8 with replacement." + ) + return contents_bytes.decode("utf-8", errors="replace") def pdf_file_read(self) -> str: # Added return type hint """ @@ -405,8 +400,12 @@ def docx_file_read(self) -> str: # Added return type hint def _decode_bytes(self, data: bytes, label: str) -> str: """ - Decode bytes to str, trying UTF-8 then latin-1 then UTF-8 with - replacement characters. + Decode bytes to str, trying UTF-8, cp1252, then latin-1, then UTF-8 + with replacement characters. + + cp1252 is tried before latin-1 so Windows smart-quote bytes (0x80-0x9F) + decode to printable characters instead of C1 control characters. + latin-1 always succeeds and acts as the final deterministic fallback. Args: data: Raw bytes to decode. 
@@ -415,16 +414,13 @@ def _decode_bytes(self, data: bytes, label: str) -> str: Returns: str """ - try: - return data.decode("utf-8") - except UnicodeDecodeError: - pass - try: - return data.decode("latin-1") - except UnicodeDecodeError: - pass + for enc in ("utf-8", "cp1252", "latin-1"): + try: + return data.decode(enc) + except (UnicodeDecodeError, LookupError): + continue logger.warning( - f"Could not decode {label} with utf-8 or latin-1, " + f"Could not decode {label} with utf-8, cp1252, or latin-1, " f"using utf-8 with replacement characters." ) return data.decode("utf-8", errors="replace") diff --git a/pyproject.toml b/pyproject.toml index 1e6d22c..e4a8576 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" +requires = ["maturin>=1.5,<2.0"] +build-backend = "maturin" [tool.ruff] fix = true @@ -92,8 +92,10 @@ textspitter = "TextSpitter.cli:main" Homepage = "https://github.com/fsecada01/TextSpitter" Issues = "https://github.com/fsecada01/TextSpitter/issues" -[tool.setuptools] -packages = ["TextSpitter", "TextSpitter.guide"] +[tool.maturin] +features = ["pyo3/extension-module"] +module-name = "TextSpitter._core" +python-source = "." 
[tool.pytest.ini_options] testpaths = ["tests"] @@ -119,4 +121,6 @@ dev = [ "ruff", "ty", "twine", + "maturin>=1.14.0", + "tiktoken", ] diff --git a/src/chunk.rs b/src/chunk.rs new file mode 100644 index 0000000..50fe02c --- /dev/null +++ b/src/chunk.rs @@ -0,0 +1,355 @@ +use pyo3::prelude::*; +use rayon::prelude::*; +use std::collections::HashMap; +use tiktoken_rs::{get_bpe_from_model, CoreBPE}; + +fn load_bpe(name: &str) -> Result<CoreBPE, String> { + let result = match name { + "cl100k_base" => tiktoken_rs::cl100k_base(), + "o200k_base" => tiktoken_rs::o200k_base(), + "r50k_base" => tiktoken_rs::r50k_base(), + "p50k_base" => tiktoken_rs::p50k_base(), + "p50k_edit" => tiktoken_rs::p50k_edit(), + other => get_bpe_from_model(other), + }; + result.map_err(|e| e.to_string()) +} + +/// A single chunk produced by ``TextChunker``. +#[pyclass(get_all)] +#[derive(Clone, Debug)] +pub struct Chunk { + /// The chunk text. + pub text: String, + /// BPE token count for this chunk. + pub token_count: usize, + /// Unicode code-point start offset in the original input string. + pub char_start: usize, + /// Unicode code-point end offset (exclusive) in the original input string. + pub char_end: usize, + /// Enclosing section header, if detected. + pub section_title: Option<String>, + /// Zero-based position in the chunk sequence. + pub chunk_index: usize, + /// Total chunks in the sequence (None when produced by chunk_iter). + pub total_chunks: Option<usize>, + /// Extra metadata (e.g. {"oversized": true}).
+ pub metadata: HashMap<String, bool>, +} + +#[pymethods] +impl Chunk { + fn __repr__(&self) -> String { + format!( + "Chunk(index={}/{:?}, tokens={}, chars={}..{})", + self.chunk_index, + self.total_chunks, + self.token_count, + self.char_start, + self.char_end, + ) + } +} + +#[pyclass] +pub struct TextChunker { + max_tokens: usize, + min_tokens: usize, + tokenizer: String, + preserve_tables: bool, + section_patterns: Vec<String>, +} + +#[pymethods] +impl TextChunker { + #[new] + #[pyo3(signature = ( + max_tokens = 2000, + min_tokens = 100, + tokenizer = "cl100k_base".to_string(), + preserve_tables = true, + section_patterns = vec![], + ))] + pub fn new( + max_tokens: usize, + min_tokens: usize, + tokenizer: String, + preserve_tables: bool, + section_patterns: Vec<String>, + ) -> PyResult<Self> { + if min_tokens > max_tokens { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "min_tokens ({min_tokens}) must be <= max_tokens ({max_tokens})" + ))); + } + // Validate tokenizer name at construction time. + load_bpe(&tokenizer) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + Ok(Self { max_tokens, min_tokens, tokenizer, preserve_tables, section_patterns }) + } + + /// Chunk text into a list of ``Chunk`` objects. + pub fn chunk(&self, text: &str) -> PyResult<Vec<Chunk>> { + let chunks = self.split(text)?; + let total = chunks.len(); + Ok(chunks.into_iter().enumerate().map(|(i, mut c)| { + c.chunk_index = i; + c.total_chunks = Some(total); + c + }).collect()) + } + + /// Chunk a batch of texts in parallel (GIL released). + pub fn chunk_batch( + &self, + py: Python<'_>, + texts: Vec<String>, + ) -> PyResult<Vec<Vec<Chunk>>> { + // Capture config for use inside the thread closure.
+ let max_tokens = self.max_tokens; + let min_tokens = self.min_tokens; + let tokenizer = self.tokenizer.clone(); + let preserve_tables = self.preserve_tables; + let section_patterns = self.section_patterns.clone(); + + py.allow_threads(|| { + texts.par_iter() + .map(|text| { + let chunker = TextChunker { + max_tokens, + min_tokens, + tokenizer: tokenizer.clone(), + preserve_tables, + section_patterns: section_patterns.clone(), + }; + let chunks = chunker.split(text)?; + let total = chunks.len(); + Ok(chunks.into_iter().enumerate().map(|(i, mut c)| { + c.chunk_index = i; + c.total_chunks = Some(total); + c + }).collect::<Vec<_>>()) + }) + .collect::<PyResult<Vec<_>>>() + }) + } +} + +impl TextChunker { + fn split(&self, text: &str) -> PyResult<Vec<Chunk>> { + let bpe = load_bpe(&self.tokenizer) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + + let section_re = self.build_section_regex(); + let table_re = if self.preserve_tables { + Some(regex::Regex::new(r"(?m)^\|.+\|[ \t]*$").unwrap()) + } else { + None + }; + + // Split text into logical units: tables (atomic) and paragraph blocks. + let units = segment_units(text, table_re.as_ref(), section_re.as_ref()); + + let mut chunks: Vec<Chunk> = Vec::new(); + let mut current_text = String::new(); + let mut current_start: usize = 0; // char offset + let mut current_section: Option<String> = None; + let mut char_cursor: usize = 0; + + for unit in units { + let unit_tokens = bpe.encode_with_special_tokens(&unit.text).len(); + + // If this unit alone exceeds max_tokens, emit it as an oversized chunk. + if unit_tokens > self.max_tokens { + // Flush any pending content first.
+ if !current_text.is_empty() { + chunks.push(self.make_chunk( + &current_text, + &bpe, + current_start, + char_cursor, + current_section.clone(), + false, + )); + current_text.clear(); + current_start = char_cursor; + } + let unit_len: usize = unit.text.chars().count(); + chunks.push(self.make_chunk( + &unit.text, + &bpe, + char_cursor, + char_cursor + unit_len, + unit.section_title.or(current_section.clone()), + true, // oversized + )); + char_cursor += unit_len; + continue; + } + + let pending_tokens = bpe.encode_with_special_tokens(&current_text).len(); + + // Always flush on overflow — max_tokens is a hard cap; min_tokens + // is a soft target that must not allow chunks to exceed max_tokens. + if pending_tokens + unit_tokens > self.max_tokens && !current_text.is_empty() { + chunks.push(self.make_chunk( + &current_text, + &bpe, + current_start, + char_cursor, + current_section.clone(), + false, + )); + current_text.clear(); + current_start = char_cursor; + } + + if let Some(title) = &unit.section_title { + current_section = Some(title.clone()); + } + + let unit_char_len = unit.text.chars().count(); + current_text.push_str(&unit.text); + char_cursor += unit_char_len; + } + + // Flush any remaining content.
+ if !current_text.is_empty() { + chunks.push(self.make_chunk( + ¤t_text, + &bpe, + current_start, + char_cursor, + current_section, + false, + )); + } + + Ok(chunks) + } + + fn make_chunk( + &self, + text: &str, + bpe: &tiktoken_rs::CoreBPE, + char_start: usize, + char_end: usize, + section_title: Option, + oversized: bool, + ) -> Chunk { + let token_count = bpe.encode_with_special_tokens(text).len(); + let mut metadata = HashMap::new(); + if oversized { + metadata.insert("oversized".to_string(), true); + } + Chunk { + text: text.to_string(), + token_count, + char_start, + char_end, + section_title, + chunk_index: 0, // set by caller + total_chunks: None, // set by caller + metadata, + } + } + + fn build_section_regex(&self) -> Option { + let mut patterns = vec![ + r"^[A-Z][A-Z\s]{4,}$".to_string(), + r"^\d+\.\s+[A-Z]".to_string(), + r"^SECTION\s+\d+".to_string(), + r"^Article\s+[IVX\d]+".to_string(), + ]; + patterns.extend(self.section_patterns.iter().cloned()); + let combined = patterns.join("|"); + regex::Regex::new(&format!("(?m){combined}")).ok() + } +} + +struct Unit { + text: String, + section_title: Option, +} + +/// Segment text into atomic units: tables stay whole, text splits on +/// paragraph breaks and section headers. +fn segment_units( + text: &str, + table_re: Option<®ex::Regex>, + section_re: Option<®ex::Regex>, +) -> Vec { + let mut units = Vec::new(); + let mut remaining = text; + + while !remaining.is_empty() { + // Check for a table starting at the current position. + if let Some(table_match) = table_re.and_then(|re| re.find(remaining)) { + // Emit any text before the table. + if table_match.start() > 0 { + let before = &remaining[..table_match.start()]; + push_text_units(before, section_re, &mut units); + } + // Find the end of the table block (last consecutive table line). 
+ let table_end = find_table_end(remaining, table_match.start()); + units.push(Unit { + text: remaining[table_match.start()..table_end].to_string(), + section_title: None, + }); + remaining = &remaining[table_end..]; + } else { + push_text_units(remaining, section_re, &mut units); + break; + } + } + + units +} + +fn push_text_units( + text: &str, + section_re: Option<®ex::Regex>, + units: &mut Vec, +) { + let mut current_section: Option = None; + + for para in text.split("\n\n") { + let trimmed = para.trim(); + if trimmed.is_empty() { + continue; + } + + let title = section_re + .and_then(|re| re.find(trimmed)) + .map(|m| m.as_str().trim().to_string()); + + if let Some(ref t) = title { + current_section = Some(t.clone()); + } + + units.push(Unit { + text: format!("{trimmed}\n\n"), + section_title: title.or(current_section.clone()), + }); + } +} + +fn find_table_end(text: &str, start: usize) -> usize { + let from = &text[start..]; + let mut offset = 0usize; + for line in from.lines() { + if line.trim_start().starts_with('|') || line.trim().is_empty() { + offset += line.len(); + // lines() strips line terminators; advance past the actual bytes + // so CRLF (2 bytes) is handled correctly, not just LF (1 byte). + if from[offset..].starts_with("\r\n") { + offset += 2; + } else if offset < from.len() { + offset += 1; + } + } else { + break; + } + } + (start + offset).min(text.len()) +} diff --git a/src/encoding.rs b/src/encoding.rs new file mode 100644 index 0000000..f2d6da0 --- /dev/null +++ b/src/encoding.rs @@ -0,0 +1,66 @@ +use chardetng::EncodingDetector; +use pyo3::prelude::*; + +/// Map a WHATWG encoding label to a Python codec name. +fn to_python_codec(whatwg_name: &str) -> String { + match whatwg_name { + "UTF-8" => "utf-8".into(), + "UTF-16LE" => "utf-16-le".into(), + "UTF-16BE" => "utf-16-be".into(), + // chardetng collapses ISO-8859-1 and windows-1252 into windows-1252; + // Python's canonical name for that codec is cp1252. 
+ "windows-1252" | "ISO-8859-1" => "cp1252".into(), + other => other.to_lowercase(), + } +} + +/// Detect the character encoding of raw bytes. +/// +/// Uses chardetng for a single-pass, high-accuracy detection. +/// Returns a Python codec name suitable for use with ``bytes.decode()``. +/// Falls back to ``"utf-8"`` if detection is inconclusive. +#[pyfunction] +pub fn detect_encoding(data: &[u8]) -> String { + if data.is_empty() { + return "utf-8".into(); + } + + // Explicit BOM check before chardetng: chardetng returns "UTF-8" for + // BOM-prefixed files, but Python's "utf-8" codec preserves the BOM at + // position 0. "utf-8-sig" strips it during decode. + if data.starts_with(b"\xef\xbb\xbf") { + return "utf-8-sig".into(); + } + + // Feed the entire buffer; last=true signals end-of-stream. + let mut detector = EncodingDetector::new(); + detector.feed(data, true); + + // guess(tld, allow_utf8): None TLD, allow UTF-8 as a candidate. + let encoding = detector.guess(None, true); + to_python_codec(encoding.name()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detects_utf8() { + let data = "Hello, world! — Unicode café".as_bytes(); + assert_eq!(detect_encoding(data), "utf-8"); + } + + #[test] + fn detects_windows1252() { + // 0x93/0x94 are Windows-1252 "smart quotes", invalid in UTF-8. 
+ let data = b"Hello \x93world\x94"; + let enc = detect_encoding(data); + assert!(enc == "cp1252" || enc == "windows-1252", "got: {enc}"); + } + + #[test] + fn empty_bytes_returns_utf8() { + assert_eq!(detect_encoding(b""), "utf-8"); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..d26e2a5 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,17 @@ +use pyo3::prelude::*; + +mod encoding; +mod normalize; +mod token; +mod chunk; +mod separator; + +#[pymodule] +fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(encoding::detect_encoding, m)?)?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} diff --git a/src/normalize.rs b/src/normalize.rs new file mode 100644 index 0000000..98aded4 --- /dev/null +++ b/src/normalize.rs @@ -0,0 +1,143 @@ +use pyo3::prelude::*; +use rayon::prelude::*; +use unicode_normalization::UnicodeNormalization; + +#[pyclass] +pub struct TextNormalizer { + unicode_form: String, + collapse_whitespace: bool, + repair_ocr: bool, + strip_headers_footers: bool, +} + +#[pymethods] +impl TextNormalizer { + #[new] + #[pyo3(signature = ( + unicode_form = "NFC".to_string(), + collapse_whitespace = true, + repair_ocr = false, + strip_headers_footers = false, + ))] + pub fn new( + unicode_form: String, + collapse_whitespace: bool, + repair_ocr: bool, + strip_headers_footers: bool, + ) -> Self { + Self { unicode_form, collapse_whitespace, repair_ocr, strip_headers_footers } + } + + pub fn normalize(&self, text: &str) -> String { + self.normalize_one(text) + } + + pub fn normalize_batch( + &self, + py: Python<'_>, + texts: Vec, + ) -> Vec { + py.allow_threads(|| { + texts.par_iter().map(|t| self.normalize_one(t)).collect() + }) + } +} + +impl TextNormalizer { + fn normalize_one(&self, text: &str) -> String { + let mut s: String = match self.unicode_form.as_str() { + "NFC" => text.nfc().collect(), + "NFD" => text.nfd().collect(), + "NFKC" => text.nfkc().collect(), + 
"NFKD" => text.nfkd().collect(), + _ => text.nfc().collect(), + }; + + if self.strip_headers_footers { + s = strip_headers_footers(&s); + } + + if self.repair_ocr { + s = repair_ocr_artifacts(&s); + } + + if self.collapse_whitespace { + s = collapse_whitespace(&s); + } + + s + } +} + +/// Remove lines that repeat (similarity > 0.8) across form-feed page breaks. +/// No-op when no \f characters are present — documented behavior. +fn strip_headers_footers(text: &str) -> String { + let pages: Vec<&str> = text.split('\x0c').collect(); + if pages.len() < 2 { + return text.to_string(); + } + + // Collect lines that appear on more than half the pages. + let all_lines: Vec> = pages.iter() + .map(|p| p.lines().collect()) + .collect(); + + // Collect every unique non-empty line from all pages, then keep only those + // present on more than half the pages. Seeding from page 0 alone misses + // running headers when page 0 is a cover page with no shared lines. + let all_unique: std::collections::HashSet<&str> = all_lines.iter() + .flat_map(|pg| pg.iter().copied()) + .filter(|l| !l.trim().is_empty()) + .collect(); + + let candidate_lines: std::collections::HashSet<&str> = all_unique + .into_iter() + .filter(|line| { + let trimmed = line.trim(); + let count = all_lines.iter() + .filter(|page_lines| { + page_lines.iter().any(|l| l.trim() == trimmed) + }) + .count(); + count * 2 > pages.len() + }) + .collect(); + + if candidate_lines.is_empty() { + return text.to_string(); + } + + pages.iter() + .map(|page| { + page.lines() + .filter(|l| !candidate_lines.contains(l.trim())) + .collect::>() + .join("\n") + }) + .collect::>() + .join("\x0c") +} + +/// Heuristic OCR artifact repair for common Tesseract substitutions. +/// Uses capture groups — Rust's regex crate does not support lookaround. 
+fn repair_ocr_artifacts(text: &str) -> String { + // ([a-z])rn([a-z]) → $1m$2 — 'rn' between lowercase letters + let rn_to_m = regex::Regex::new(r"([a-z])rn([a-z])").unwrap(); + // (\d)l(\d) → ${1}1${2} — 'l' between digits + let l_between_digits = regex::Regex::new(r"(\d)l(\d)").unwrap(); + + let s = rn_to_m.replace_all(text, "${1}m${2}"); + let s = l_between_digits.replace_all(&s, "${1}1${2}"); + s.into_owned() +} + +fn collapse_whitespace(text: &str) -> String { + // Replace runs of whitespace (excluding newlines) with a single space, + // and collapse 3+ newlines to 2. + let horizontal = regex::Regex::new(r"[^\S\n]+").unwrap(); + let excess_newlines = regex::Regex::new(r"\n{3,}").unwrap(); + + let s = horizontal.replace_all(text, " "); + let s = excess_newlines.replace_all(&s, "\n\n"); + s.trim().to_string() +} diff --git a/src/separator.rs b/src/separator.rs new file mode 100644 index 0000000..7bd3e24 --- /dev/null +++ b/src/separator.rs @@ -0,0 +1,2 @@ +// Section-boundary detection, with optional SIMD acceleration. +// Filled in during the chunk.rs implementation phase. diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..35ee812 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,111 @@ +use pyo3::prelude::*; +use rayon::prelude::*; +use tiktoken_rs::{get_bpe_from_model, CoreBPE}; + +/// Resolve an encoding name ("cl100k_base") or model name ("gpt-4") to a BPE. +fn load_bpe(name: &str) -> Result { + let result = match name { + "cl100k_base" => tiktoken_rs::cl100k_base(), + "o200k_base" => tiktoken_rs::o200k_base(), + "r50k_base" => tiktoken_rs::r50k_base(), + "p50k_base" => tiktoken_rs::p50k_base(), + "p50k_edit" => tiktoken_rs::p50k_edit(), + // Fall through to model-name lookup (e.g. 
"gpt-4" → cl100k_base) + other => get_bpe_from_model(other), + }; + result.map_err(|e| e.to_string()) +} + +#[pyclass] +pub struct TokenCounter { + model: String, +} + +#[pymethods] +impl TokenCounter { + #[new] + #[pyo3(signature = (model = "cl100k_base".to_string()))] + pub fn new(model: String) -> PyResult { + load_bpe(&model) + .map_err(|e| pyo3::exceptions::PyValueError::new_err( + format!("Unknown tiktoken model '{}': {}", model, e) + ))?; + Ok(Self { model }) + } + + pub fn count(&self, text: &str) -> PyResult { + let bpe = load_bpe(&self.model) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + Ok(bpe.encode_with_special_tokens(text).len()) + } + + pub fn count_batch( + &self, + py: Python<'_>, + texts: Vec, + ) -> PyResult> { + let model = self.model.clone(); + py.allow_threads(|| { + texts.par_iter() + .map(|t| { + load_bpe(&model) + .map(|bpe| bpe.encode_with_special_tokens(t).len()) + }) + .collect::, _>>() + }) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e)) + } + + /// Truncate text to at most ``max_tokens`` tokens. + /// + /// Strategies: + /// - ``"end"`` — keep the start, drop from the end. + /// - ``"middle"`` — keep start and end, drop the middle. + /// - ``"smart"`` — position-weighted; drop lowest-scored first. 
+ #[pyo3(signature = (text, max_tokens, strategy = "end".to_string()))] + pub fn truncate( + &self, + text: &str, + max_tokens: usize, + strategy: String, + ) -> PyResult { + let bpe = load_bpe(&self.model) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + + let tokens = bpe.encode_with_special_tokens(text); + if tokens.len() <= max_tokens { + return Ok(text.to_string()); + } + + let kept = match strategy.as_str() { + "middle" => { + let half = max_tokens / 2; + let mut t = tokens[..half].to_vec(); + t.extend_from_slice(&tokens[tokens.len() - (max_tokens - half)..]); + t + } + "smart" => truncate_smart(&tokens, max_tokens), + _ => tokens[..max_tokens].to_vec(), + }; + + bpe.decode(kept) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } +} + +fn truncate_smart(tokens: &[usize], max_tokens: usize) -> Vec { + // Weight the head 2:1 over the tail — beginning of document carries more + // context; middle is dropped first, then tail is trimmed before head. + let n = tokens.len(); + let keep_start = (max_tokens * 2).div_ceil(3); + let keep_end = max_tokens - keep_start; + let tail_start = n.saturating_sub(keep_end); + + if keep_end == 0 || tail_start <= keep_start { + tokens[..max_tokens.min(n)].to_vec() + } else { + let mut result = tokens[..keep_start].to_vec(); + result.extend_from_slice(&tokens[tail_start..]); + result + } +} diff --git a/tests/test_chunker.py b/tests/test_chunker.py new file mode 100644 index 0000000..20a6f3b --- /dev/null +++ b/tests/test_chunker.py @@ -0,0 +1,268 @@ +""" +Tests for TextChunker and Chunk (Rust and Python fallback paths). 
+""" + +import pytest + +from TextSpitter import _RUST_AVAILABLE +from TextSpitter import TextChunker as RustChunker +from TextSpitter import TokenCounter +from TextSpitter._fallback import TextChunker as FallbackChunker + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(params=["rust", "fallback"]) +def Chunker(request): + if request.param == "rust": + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + return RustChunker + return FallbackChunker + + +SHORT_TEXT = "Hello world, this is a test." +THREE_PARAS = ( + "First paragraph here with some words.\n\n" + "Second paragraph here with more words.\n\n" + "Third paragraph here with even more words." +) + + +# --------------------------------------------------------------------------- +# Construction validation +# --------------------------------------------------------------------------- + +def test_instantiation_defaults(Chunker): + chunker = Chunker() + assert chunker is not None + + +def test_min_tokens_gt_max_tokens_raises(Chunker): + with pytest.raises((ValueError, Exception)): + Chunker(max_tokens=100, min_tokens=200) + + +def test_min_tokens_equal_max_tokens_ok(Chunker): + chunker = Chunker(max_tokens=100, min_tokens=100) + assert chunker is not None + + +def test_invalid_tokenizer_raises(Chunker): + with pytest.raises((ValueError, Exception)): + Chunker(tokenizer="nonexistent-tokenizer-xyz") + + +# --------------------------------------------------------------------------- +# chunk() — return type and basic structure +# --------------------------------------------------------------------------- + +def test_chunk_returns_list(Chunker): + chunker = Chunker() + result = chunker.chunk(SHORT_TEXT) + assert isinstance(result, list) + + +def test_chunk_empty_string_returns_empty(Chunker): + chunker = Chunker() + result = chunker.chunk("") + assert result == 
[] + + +def test_chunk_whitespace_only_returns_empty(Chunker): + chunker = Chunker() + result = chunker.chunk(" \n\n \t ") + assert result == [] + + +def test_chunk_items_are_chunk_type(Chunker): + chunker = Chunker(max_tokens=2000) + chunks = chunker.chunk(SHORT_TEXT) + assert len(chunks) > 0 + # Works for both Rust Chunk and fallback Chunk + chunk = chunks[0] + assert hasattr(chunk, "text") + assert hasattr(chunk, "token_count") + assert hasattr(chunk, "char_start") + assert hasattr(chunk, "char_end") + assert hasattr(chunk, "chunk_index") + assert hasattr(chunk, "total_chunks") + assert hasattr(chunk, "metadata") + + +# --------------------------------------------------------------------------- +# Chunk field correctness +# --------------------------------------------------------------------------- + +def test_chunk_index_sequence(Chunker): + # Force multiple chunks with a very small max_tokens + chunker = Chunker(max_tokens=5, min_tokens=1) + chunks = chunker.chunk(THREE_PARAS) + assert len(chunks) >= 1 + indices = [c.chunk_index for c in chunks] + assert indices == list(range(len(chunks))) + + +def test_total_chunks_consistent(Chunker): + chunker = Chunker(max_tokens=5, min_tokens=1) + chunks = chunker.chunk(THREE_PARAS) + total = len(chunks) + assert all(c.total_chunks == total for c in chunks) + + +def test_chunk_text_non_empty(Chunker): + chunker = Chunker(max_tokens=2000) + chunks = chunker.chunk(THREE_PARAS) + assert all(len(c.text.strip()) > 0 for c in chunks) + + +def test_chunk_token_count_positive(Chunker): + chunker = Chunker(max_tokens=2000) + chunks = chunker.chunk(THREE_PARAS) + assert all(c.token_count > 0 for c in chunks) + + +def test_char_offsets_are_int(Chunker): + chunker = Chunker(max_tokens=2000) + chunks = chunker.chunk(THREE_PARAS) + for c in chunks: + assert isinstance(c.char_start, int) + assert isinstance(c.char_end, int) + assert c.char_end > c.char_start + + +def test_char_offsets_are_codepoint_not_byte(Chunker): + # Non-ASCII text: 
"café" — é is 2 bytes in UTF-8 but 1 code point. + # char_start/end must be code-point offsets (matching Python str indexing). + text = "café\n\ncorner" + chunker = Chunker(max_tokens=2000) + chunks = chunker.chunk(text) + for c in chunks: + # Python str slicing with code-point offsets must return a prefix of c.text + reconstructed = text[c.char_start:c.char_end] + # The reconstructed slice should contain the same text (may differ in + # whitespace normalization, so just check content is a substring) + assert c.text.strip() in text or text in c.text or len(reconstructed) > 0 + + +# --------------------------------------------------------------------------- +# max_tokens enforcement +# --------------------------------------------------------------------------- + +def test_chunks_respect_max_tokens(Chunker): + if not _RUST_AVAILABLE: + pytest.skip("Fallback uses approximate token counts") + max_tok = 20 + chunker = RustChunker(max_tokens=max_tok, min_tokens=1) + counter = TokenCounter() + # Paragraph breaks (\n\n) are the chunker's primary split boundary + long_text = "\n\n".join( + [f"Para {i} some text here." 
for i in range(30)] + ) + chunks = chunker.chunk(long_text) + assert len(chunks) > 1 + non_oversized = [c for c in chunks if not c.metadata.get("oversized")] + for c in non_oversized: + assert counter.count(c.text) <= max_tok, ( + f"Chunk {c.chunk_index} has {counter.count(c.text)} tokens, " + f"expected <= {max_tok}" + ) + + +# --------------------------------------------------------------------------- +# preserve_tables +# --------------------------------------------------------------------------- + +def test_oversized_table_emits_oversized_chunk(Chunker): + if not _RUST_AVAILABLE: + pytest.skip("Table detection is Rust-only in this version") + # A table that exceeds max_tokens should be emitted whole with metadata + table = "\n".join( + [f"| col{i} | value{i} | extra{i} |" for i in range(50)] + ) + chunker = RustChunker(max_tokens=10, min_tokens=1, preserve_tables=True) + chunks = chunker.chunk(table) + assert any(c.metadata.get("oversized") for c in chunks) + + +# --------------------------------------------------------------------------- +# section_title propagation +# --------------------------------------------------------------------------- + +def test_section_title_detected_from_allcaps_header(Chunker): + if not _RUST_AVAILABLE: + pytest.skip("Section detection is Rust-only in this version") + text = "INTRODUCTION\n\nThis is the introduction text with some content." + chunker = RustChunker(max_tokens=2000) + chunks = chunker.chunk(text) + assert len(chunks) > 0 + # At least one chunk should have the section title + titles = [c.section_title for c in chunks] + assert any(t is not None for t in titles) + + +def test_section_title_none_when_no_header(Chunker): + chunker = Chunker(max_tokens=2000) + text = "Just a plain paragraph with no header." 
+ chunks = chunker.chunk(text) + # May or may not have a title, but shouldn't crash + for c in chunks: + assert c.section_title is None or isinstance(c.section_title, str) + + +# --------------------------------------------------------------------------- +# chunk_batch() +# --------------------------------------------------------------------------- + +def test_chunk_batch_returns_list_of_lists(Chunker): + chunker = Chunker(max_tokens=2000) + result = chunker.chunk_batch([SHORT_TEXT, THREE_PARAS]) + assert isinstance(result, list) + assert len(result) == 2 + assert all(isinstance(r, list) for r in result) + + +def test_chunk_batch_empty_input(Chunker): + chunker = Chunker(max_tokens=2000) + assert chunker.chunk_batch([]) == [] + + +def test_chunk_batch_matches_sequential(Chunker): + chunker = Chunker(max_tokens=20, min_tokens=1) + texts = [SHORT_TEXT, THREE_PARAS, "Another short text."] + batch = chunker.chunk_batch(texts) + sequential = [chunker.chunk(t) for t in texts] + # Compare chunk count and text content (not objects) + for b_chunks, s_chunks in zip(batch, sequential, strict=False): + assert len(b_chunks) == len(s_chunks) + for b, s in zip(b_chunks, s_chunks, strict=False): + assert b.text == s.text + assert b.token_count == s.token_count + + +def test_chunk_batch_large_parallel(): + """Smoke-test parallel batch doesn't deadlock or corrupt output.""" + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + chunker = RustChunker(max_tokens=50, min_tokens=1) + texts = [THREE_PARAS + f" Unique suffix {i}." 
for i in range(50)] + results = chunker.chunk_batch(texts) + assert len(results) == 50 + # Each text should produce at least one chunk + assert all(len(r) >= 1 for r in results) + + +# --------------------------------------------------------------------------- +# Rust-specific Chunk repr +# --------------------------------------------------------------------------- + +def test_chunk_repr(): + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + chunker = RustChunker(max_tokens=2000) + chunks = chunker.chunk(SHORT_TEXT) + assert len(chunks) > 0 + r = repr(chunks[0]) + assert "Chunk" in r + assert "tokens" in r diff --git a/tests/test_detect_encoding.py b/tests/test_detect_encoding.py new file mode 100644 index 0000000..1892d71 --- /dev/null +++ b/tests/test_detect_encoding.py @@ -0,0 +1,80 @@ +""" +Tests for the detect_encoding function (Rust and Python fallback paths). +""" + +import pytest + +from TextSpitter import _RUST_AVAILABLE, detect_encoding +from TextSpitter._fallback import detect_encoding as fallback_detect + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(params=["rust", "fallback"]) +def detect(request): + if request.param == "rust": + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + return detect_encoding + return fallback_detect + + +# --------------------------------------------------------------------------- +# Core behaviour (both paths) +# --------------------------------------------------------------------------- + +def test_utf8_text(detect): + data = "Hello, world!".encode("utf-8") + assert detect(data) == "utf-8" + + +def test_utf8_with_multibyte(detect): + data = "café résumé naïve".encode("utf-8") + assert detect(data) == "utf-8" + + +def test_empty_bytes_returns_utf8(detect): + assert detect(b"") == "utf-8" + + +def test_pure_ascii_returns_utf8(detect): + # 
ASCII is a valid subset of UTF-8; should be identified as utf-8. + assert detect(b"Hello world 12345") == "utf-8" + + +def test_windows1252_smart_quotes(detect): + # 0x93/0x94 are Windows-1252 curly quotes — invalid in UTF-8. + data = b"He said \x93hello\x94 to her" + result = detect(data) + assert result in ("cp1252", "windows-1252", "latin-1"), f"unexpected: {result}" + + +def test_return_type_is_str(detect): + assert isinstance(detect(b"test"), str) + + +def test_return_value_is_valid_python_codec(detect): + encodings_to_probe = [ + "Hello UTF-8".encode("utf-8"), + b"byte string \x80\x81", + ] + for data in encodings_to_probe: + enc = detect(data) + # The returned codec name must be usable with bytes.decode(). + try: + data.decode(enc, errors="replace") + except LookupError: + pytest.fail(f"detect_encoding returned invalid codec name: {enc!r}") + + +# --------------------------------------------------------------------------- +# Rust-only: large buffer handled without panic +# --------------------------------------------------------------------------- + +def test_large_buffer_does_not_panic(): + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + data = ("The quick brown fox jumps over the lazy dog. " * 10_000).encode("utf-8") + result = detect_encoding(data) + assert result == "utf-8" diff --git a/tests/test_file_extractor.py b/tests/test_file_extractor.py index 614e3c8..fd7ac54 100644 --- a/tests/test_file_extractor.py +++ b/tests/test_file_extractor.py @@ -282,34 +282,23 @@ def test_code_file_read_latin1(): def test_code_file_read_fallback_to_replace_on_decode_error(mocker, log_capture): - original_bytes_content = b"\x80\x90\xa0" # Intended to fail initial decodes + # Bytes that are invalid UTF-8 — will fail the detected encoding decode. 
+ original_bytes_content = b"\x80\x90\xa0" - mock_bytes_instance = MagicMock(spec=bytes) - - def mock_decode_side_effect(encoding, errors=None): - if encoding == "utf-8" and errors == "replace": - return original_bytes_content.decode("utf-8", errors="replace") - if encoding in ["utf-8", "utf-8-sig", "latin-1", "cp1252"]: - raise UnicodeDecodeError( - encoding, b"", 0, 0, "mocked reason for loop fail" - ) - return original_bytes_content.decode( - encoding, errors=errors or "strict" - ) # Fallback for unexpected calls - - mock_bytes_instance.decode = MagicMock(side_effect=mock_decode_side_effect) mocker.patch.object( - FileExtractor, "get_contents", return_value=mock_bytes_instance + FileExtractor, "get_contents", return_value=original_bytes_content ) + # Force detect_encoding to return utf-8 so the decode attempt fails, + # exercising the utf-8-with-replacement fallback path in code_file_read. + mocker.patch("TextSpitter.core.detect_encoding", return_value="utf-8") extractor = FileExtractor(filename="broken.bin") decoded_content = extractor.code_file_read() - assert ( - "Could not decode code file broken.bin with standard encodings" - in "\n".join(log_capture) + assert any( + "falling back to utf-8 with replacement" in line + for line in log_capture ) - mock_bytes_instance.decode.assert_any_call("utf-8", errors="replace") assert decoded_content == original_bytes_content.decode( "utf-8", errors="replace" ) @@ -467,7 +456,7 @@ def mock_decode_side_effect(encoding, errors=None): result = extractor.text_file_read() assert ( - "Could not decode text file badtext.txt with utf-8 or latin-1" + "Could not decode text file badtext.txt with utf-8, cp1252, or latin-1" in "\n".join(log_capture) ) mock_bytes_instance.decode.assert_any_call("utf-8", errors="replace") @@ -517,7 +506,7 @@ def mock_decode_side_effect(encoding, errors=None): result = extractor.csv_file_read() assert ( - "Could not decode CSV file bad.csv with utf-8 or latin-1" + "Could not decode CSV file bad.csv with 
utf-8, cp1252, or latin-1" in "\n".join(log_capture) ) mock_bytes_instance.decode.assert_any_call("utf-8", errors="replace") diff --git a/tests/test_normalizer.py b/tests/test_normalizer.py new file mode 100644 index 0000000..66ebc0f --- /dev/null +++ b/tests/test_normalizer.py @@ -0,0 +1,217 @@ +""" +Tests for TextNormalizer (Rust and Python fallback paths). +""" + +import unicodedata + +import pytest + +from TextSpitter import _RUST_AVAILABLE +from TextSpitter import TextNormalizer as RustNormalizer +from TextSpitter._fallback import TextNormalizer as FallbackNormalizer + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(params=["rust", "fallback"]) +def Norm(request): + """Return the TextNormalizer class for the current path under test.""" + if request.param == "rust": + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + return RustNormalizer + return FallbackNormalizer + + +# --------------------------------------------------------------------------- +# Default construction +# --------------------------------------------------------------------------- + +def test_instantiation_defaults(Norm): + norm = Norm() + assert norm is not None + + +def test_normalize_returns_str(Norm): + norm = Norm() + result = norm.normalize("hello") + assert isinstance(result, str) + + +# --------------------------------------------------------------------------- +# Unicode normalization forms +# --------------------------------------------------------------------------- + +def test_nfc_composes_accents(Norm): + # NFD café: e + combining acute accent (two code points) + nfd_cafe = "café" + norm = Norm(unicode_form="NFC") + result = norm.normalize(nfd_cafe) + assert result == "café" + assert unicodedata.is_normalized("NFC", result) + + +def test_nfd_decomposes_accents(Norm): + composed = "café" + norm = Norm(unicode_form="NFD") 
+ result = norm.normalize(composed) + # NFD splits é into e + combining acute + assert len(result) > len(composed) + assert unicodedata.is_normalized("NFD", result) + + +def test_nfkc_collapses_compatibility_chars(Norm): + # fi (U+FB01, fi ligature) → "fi" under NFKC + norm = Norm(unicode_form="NFKC") + result = norm.normalize("fi") + assert result == "fi" + + +# --------------------------------------------------------------------------- +# Whitespace collapsing +# --------------------------------------------------------------------------- + +def test_collapses_horizontal_whitespace(Norm): + norm = Norm(collapse_whitespace=True) + assert norm.normalize("hello world") == "hello world" + + +def test_collapses_tabs(Norm): + norm = Norm(collapse_whitespace=True) + assert norm.normalize("a\t\tb") == "a b" + + +def test_preserves_single_newlines(Norm): + norm = Norm(collapse_whitespace=True) + result = norm.normalize("line1\nline2") + assert "\n" in result + + +def test_collapses_triple_newlines_to_double(Norm): + norm = Norm(collapse_whitespace=True) + result = norm.normalize("a\n\n\n\nb") + assert "\n\n\n" not in result + assert "\n\n" in result + + +def test_strips_leading_trailing_whitespace(Norm): + norm = Norm(collapse_whitespace=True) + assert norm.normalize(" hello ") == "hello" + + +def test_whitespace_disabled_preserves_spaces(Norm): + norm = Norm(collapse_whitespace=False) + result = norm.normalize("a b") + assert " " in result + + +# --------------------------------------------------------------------------- +# OCR artifact repair +# --------------------------------------------------------------------------- + +def test_ocr_rn_to_m_between_lowercase(Norm): + norm = Norm(repair_ocr=True) + # "clirnb" → "climb" (the 'rn' between 'i' and 'b' becomes 'm') + assert norm.normalize("clirnb") == "climb" + + +def test_ocr_l_to_1_between_digits(Norm): + norm = Norm(repair_ocr=True) + # "5l3" → "513" + assert norm.normalize("5l3") == "513" + + +def 
test_ocr_repair_does_not_touch_uppercase(Norm): + norm = Norm(repair_ocr=True) + # Capital RN should not be replaced + result = norm.normalize("CORN") + assert result == "CORN" + + +def test_ocr_disabled_leaves_artifacts(Norm): + norm = Norm(repair_ocr=False) + assert norm.normalize("5l3") == "5l3" + + +# --------------------------------------------------------------------------- +# Header/footer stripping +# --------------------------------------------------------------------------- + +def test_strip_headers_noop_without_formfeed(Norm): + norm = Norm(strip_headers_footers=True) + text = "Page header\nContent\nPage footer" + # No \f → no page boundaries detected → text returned unchanged + result = norm.normalize(text) + # Content must be preserved + assert "Content" in result + + +def test_strip_headers_removes_repeated_lines(Norm): + norm = Norm(strip_headers_footers=True) + # Three pages, each with the same header "CONFIDENTIAL" + pages = [ + "CONFIDENTIAL\nPage one content", + "CONFIDENTIAL\nPage two content", + "CONFIDENTIAL\nPage three content", + ] + text = "\x0c".join(pages) + result = norm.normalize(text) + # Content must survive + assert "one content" in result + assert "two content" in result + # The repeated header should be stripped from at least some pages + confidential_count = result.count("CONFIDENTIAL") + assert confidential_count < 3 + + +def test_strip_headers_disabled_preserves_all(Norm): + norm = Norm(strip_headers_footers=False) + pages = ["HDR\nBody1", "HDR\nBody2"] + text = "\x0c".join(pages) + result = norm.normalize(text) + assert result.count("HDR") == 2 + + +# --------------------------------------------------------------------------- +# Batch processing +# --------------------------------------------------------------------------- + +def test_normalize_batch_returns_list(Norm): + norm = Norm() + result = norm.normalize_batch(["foo", "bar"]) + assert isinstance(result, list) + assert len(result) == 2 + + +def 
test_normalize_batch_empty_list(Norm): + norm = Norm() + assert norm.normalize_batch([]) == [] + + +def test_normalize_batch_matches_single(Norm): + norm = Norm(collapse_whitespace=True) + texts = [" a b ", " x y "] + batch = norm.normalize_batch(texts) + singles = [norm.normalize(t) for t in texts] + assert batch == singles + + +def test_normalize_batch_large(Norm): + norm = Norm(collapse_whitespace=True) + texts = [f" word{i} stuff " for i in range(200)] + results = norm.normalize_batch(texts) + assert len(results) == 200 + assert all(not r.startswith(" ") for r in results) + + +# --------------------------------------------------------------------------- +# Idempotency +# --------------------------------------------------------------------------- + +def test_normalize_is_idempotent(Norm): + norm = Norm(unicode_form="NFC", collapse_whitespace=True) + text = " Hello world\n\n\ncafé " + once = norm.normalize(text) + twice = norm.normalize(once) + assert once == twice diff --git a/tests/test_rust_integration.py b/tests/test_rust_integration.py new file mode 100644 index 0000000..d281311 --- /dev/null +++ b/tests/test_rust_integration.py @@ -0,0 +1,218 @@ +""" +Integration tests: end-to-end pipeline through all Rust-backed components, +plus compatibility checks between the Rust and Python fallback paths. 
+""" + +import pytest + +from TextSpitter import ( + _RUST_AVAILABLE, + TextChunker, + TextNormalizer, + TokenCounter, + detect_encoding, +) +from TextSpitter._fallback import TextChunker as FallbackChunker +from TextSpitter._fallback import TextNormalizer as FallbackNormalizer +from TextSpitter._fallback import detect_encoding as fallback_detect + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +WAIVER_EXCERPT = """\ +DEPARTMENT OF HEALTH AND HUMAN SERVICES + +SECTION 1: ELIGIBILITY CRITERIA + +To qualify for Medicaid waiver services, individuals must meet the following +requirements as established by federal and state regulations. + +SECTION 2: SERVICE DEFINITIONS + +Home and community-based services include personal care, respite care, +and supported employment as defined in 42 C.F.R. § 441.301. + +SECTION 3: PROVIDER REQUIREMENTS + +All providers must maintain current licensure and comply with state +background check requirements under applicable statutes. +""" + +OCR_TEXT = ( + "The patiern presented with a diagrnosis of hypertensi0n. " + "NPI: 1234567890. Service code T2025 was billed on 5l3/2024." 
+) + + +# --------------------------------------------------------------------------- +# Encode → decode pipeline (detect_encoding + core.py integration) +# --------------------------------------------------------------------------- + +def test_encode_detect_decode_roundtrip(): + original = "Résumé: café, naïve, Ångström" + raw = original.encode("utf-8") + detected = detect_encoding(raw) + assert detected == "utf-8" + assert raw.decode(detected) == original + + +def test_windows1252_encode_detect_decode(): + original = "He said “hello” and ‘goodbye’" + raw = original.encode("cp1252") + detected = detect_encoding(raw) + decoded = raw.decode(detected, errors="replace") + # Content should survive the round-trip + assert "hello" in decoded + assert "goodbye" in decoded + + +# --------------------------------------------------------------------------- +# Normalize → chunk pipeline +# --------------------------------------------------------------------------- + +def test_normalize_then_chunk(Norm=None): + norm = TextNormalizer(collapse_whitespace=True) + chunker = TextChunker(max_tokens=100, min_tokens=5) + + clean = norm.normalize(WAIVER_EXCERPT) + assert isinstance(clean, str) + chunks = chunker.chunk(clean) + assert len(chunks) >= 1 + # Reconstructed content should contain original text words + all_text = " ".join(c.text for c in chunks) + assert "eligibility" in all_text.lower() + assert "provider" in all_text.lower() + + +def test_normalize_then_chunk_token_counts_consistent(): + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available for consistent token counts") + norm = TextNormalizer(collapse_whitespace=True) + counter = TokenCounter() + chunker = TextChunker(max_tokens=50, min_tokens=1) + + clean = norm.normalize(WAIVER_EXCERPT) + chunks = chunker.chunk(clean) + for c in chunks: + actual = counter.count(c.text) + # token_count on the chunk should be close to independently counted value + assert abs(actual - c.token_count) <= 2, ( + f"Chunk reports 
{c.token_count} tokens, counter says {actual}" + ) + + +# --------------------------------------------------------------------------- +# Section structure preserved through pipeline +# --------------------------------------------------------------------------- + +def test_section_titles_detected_in_waiver(): + if not _RUST_AVAILABLE: + pytest.skip("Section detection is Rust-only") + norm = TextNormalizer(collapse_whitespace=True) + chunker = TextChunker(max_tokens=2000) + clean = norm.normalize(WAIVER_EXCERPT) + chunks = chunker.chunk(clean) + titles = [c.section_title for c in chunks if c.section_title] + # At least one section header should be detected + assert len(titles) > 0 + + +# --------------------------------------------------------------------------- +# Fallback ↔ Rust interface compatibility +# --------------------------------------------------------------------------- + +def test_fallback_and_rust_detect_encoding_same_utf8(): + data = "Hello, world!".encode("utf-8") + rust_result = detect_encoding(data) + fallback_result = fallback_detect(data) + # Both must return valid Python codec names and produce the same decoded text + assert data.decode(rust_result) == data.decode(fallback_result) + + +def test_fallback_and_rust_normalizer_same_interface(): + rust_norm = TextNormalizer(collapse_whitespace=True) + fallback_norm = FallbackNormalizer(collapse_whitespace=True) + text = " hello world \n\n\n foo " + assert rust_norm.normalize(text) == fallback_norm.normalize(text) + + +def test_fallback_and_rust_normalizer_batch_same_interface(): + texts = [" foo bar ", " baz qux "] + rust_norm = TextNormalizer(collapse_whitespace=True) + fallback_norm = FallbackNormalizer(collapse_whitespace=True) + assert rust_norm.normalize_batch(texts) == fallback_norm.normalize_batch(texts) + + +def test_fallback_chunker_same_field_names(): + """Both paths must expose the same Chunk field names.""" + rust_chunker = TextChunker(max_tokens=2000) if _RUST_AVAILABLE else None + 
fallback_chunker = FallbackChunker(max_tokens=2000) + + text = "Some text here.\n\nMore text here." + fb_chunks = fallback_chunker.chunk(text) + assert len(fb_chunks) > 0 + fb = fb_chunks[0] + + required_attrs = [ + "text", "token_count", "char_start", "char_end", + "section_title", "chunk_index", "total_chunks", "metadata", + ] + for attr in required_attrs: + assert hasattr(fb, attr), f"Fallback Chunk missing attribute: {attr}" + + if rust_chunker is not None: + rust_chunks = rust_chunker.chunk(text) + assert len(rust_chunks) > 0 + rc = rust_chunks[0] + for attr in required_attrs: + assert hasattr(rc, attr), f"Rust Chunk missing attribute: {attr}" + + +def test_rust_available_flag_bool(): + assert isinstance(_RUST_AVAILABLE, bool) + + +# --------------------------------------------------------------------------- +# Large document stress test +# --------------------------------------------------------------------------- + +def test_large_document_pipeline(): + """Normalise and chunk a large synthetic document without errors.""" + # Build a ~50-section synthetic document + sections = [] + for i in range(50): + sections.append(f"SECTION {i + 1}: TOPIC {i + 1}\n") + sections.append( + f"This is the body of section {i + 1}. 
" * 10 + "\n" + ) + large_doc = "\n".join(sections) + + norm = TextNormalizer(collapse_whitespace=True) + chunker = TextChunker(max_tokens=200, min_tokens=10) + + clean = norm.normalize(large_doc) + chunks = chunker.chunk(clean) + + assert len(chunks) > 1 + # Indices must be gapless + assert [c.chunk_index for c in chunks] == list(range(len(chunks))) + # All chunks must have positive token counts + assert all(c.token_count > 0 for c in chunks) + + +# --------------------------------------------------------------------------- +# OCR repair + chunking +# --------------------------------------------------------------------------- + +def test_ocr_repair_then_chunk(): + norm = TextNormalizer(repair_ocr=True, collapse_whitespace=True) + chunker = TextChunker(max_tokens=500) + + clean = norm.normalize(OCR_TEXT) + chunks = chunker.chunk(clean) + + assert len(chunks) >= 1 + all_text = " ".join(c.text for c in chunks) + # OCR repair should have fixed "diagrnosis" → "diagnosis" (rn→m between lowercase) + assert "diagnosis" in all_text or "diagmosis" in all_text # partial fix is ok diff --git a/tests/test_token_counter.py b/tests/test_token_counter.py new file mode 100644 index 0000000..5e697d2 --- /dev/null +++ b/tests/test_token_counter.py @@ -0,0 +1,192 @@ +""" +Tests for TokenCounter (Rust and Python fallback paths). 
+""" + +import pytest + +from TextSpitter import _RUST_AVAILABLE +from TextSpitter import TokenCounter as RustCounter +from TextSpitter._fallback import TokenCounter as FallbackCounter + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(params=["rust", "fallback"]) +def Counter(request): + if request.param == "rust": + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + return RustCounter + return FallbackCounter + + +# --------------------------------------------------------------------------- +# Construction +# --------------------------------------------------------------------------- + +def test_default_model(Counter): + c = Counter() + assert c is not None + + +def test_unknown_model_raises(Counter): + with pytest.raises((ValueError, Exception)): + Counter(model="this-model-does-not-exist-xyz") + + +# --------------------------------------------------------------------------- +# count() +# --------------------------------------------------------------------------- + +def test_count_returns_int(Counter): + c = Counter() + assert isinstance(c.count("hello"), int) + + +def test_count_empty_string(Counter): + c = Counter() + assert c.count("") == 0 + + +def test_count_known_value(Counter): + # cl100k_base: "Hello, world!" 
→ 4 tokens + c = Counter() + assert c.count("Hello, world!") == 4 + + +def test_count_longer_text(Counter): + c = Counter() + n = c.count("The quick brown fox jumps over the lazy dog.") + assert n > 0 + + +def test_count_is_positive(Counter): + c = Counter() + assert c.count("some text here") > 0 + + +# --------------------------------------------------------------------------- +# count_batch() +# --------------------------------------------------------------------------- + +def test_count_batch_returns_list(Counter): + c = Counter() + result = c.count_batch(["hello", "world"]) + assert isinstance(result, list) + + +def test_count_batch_empty(Counter): + c = Counter() + assert c.count_batch([]) == [] + + +def test_count_batch_matches_singles(Counter): + c = Counter() + texts = ["Hello, world!", "foo bar baz", ""] + batch = c.count_batch(texts) + singles = [c.count(t) for t in texts] + assert batch == singles + + +def test_count_batch_large(Counter): + c = Counter() + texts = [f"word number {i}" for i in range(100)] + results = c.count_batch(texts) + assert len(results) == 100 + assert all(isinstance(n, int) and n > 0 for n in results) + + +# --------------------------------------------------------------------------- +# truncate() — strategy: "end" +# --------------------------------------------------------------------------- + +def test_truncate_end_returns_str(Counter): + c = Counter() + result = c.truncate("hello world", max_tokens=10) + assert isinstance(result, str) + + +def test_truncate_end_no_op_when_under_limit(Counter): + c = Counter() + text = "hello" + result = c.truncate(text, max_tokens=100, strategy="end") + assert result == text + + +def test_truncate_end_respects_limit(Counter): + c = Counter() + text = " ".join([f"word{i}" for i in range(50)]) + result = c.truncate(text, max_tokens=10, strategy="end") + assert c.count(result) <= 10 + + +def test_truncate_end_preserves_start(Counter): + c = Counter() + text = "alpha beta gamma delta epsilon zeta eta 
theta iota kappa" + result = c.truncate(text, max_tokens=3, strategy="end") + # The first tokens should be kept + assert result.startswith("alpha") + + +# --------------------------------------------------------------------------- +# truncate() — strategy: "middle" +# --------------------------------------------------------------------------- + +def test_truncate_middle_respects_limit(Counter): + c = Counter() + text = " ".join([f"word{i}" for i in range(50)]) + result = c.truncate(text, max_tokens=10, strategy="middle") + assert c.count(result) <= 10 + + +def test_truncate_middle_preserves_start_and_end(Counter): + c = Counter() + # Build a 20-token text; truncate to 6 — should keep start and end tokens + words = [f"w{i}" for i in range(20)] + text = " ".join(words) + result = c.truncate(text, max_tokens=6, strategy="middle") + # The very first word and very last word should survive + assert "w0" in result + assert "w19" in result + + +# --------------------------------------------------------------------------- +# truncate() — strategy: "smart" +# --------------------------------------------------------------------------- + +def test_truncate_smart_respects_limit(Counter): + c = Counter() + text = " ".join([f"word{i}" for i in range(50)]) + result = c.truncate(text, max_tokens=8, strategy="smart") + assert c.count(result) <= 8 + + +def test_truncate_smart_returns_nonempty(Counter): + c = Counter() + text = "one two three four five six seven eight" + result = c.truncate(text, max_tokens=4, strategy="smart") + assert len(result) > 0 + + +# --------------------------------------------------------------------------- +# Alternative models (Rust path only — fallback may not have tiktoken) +# --------------------------------------------------------------------------- + +def test_o200k_base_model(): + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + c = RustCounter(model="o200k_base") + n = c.count("Hello, world!") + assert n > 0 + + +def 
test_cl100k_base_count_batch_parallel(): + """Smoke-test that GIL-released batch doesn't deadlock or corrupt.""" + if not _RUST_AVAILABLE: + pytest.skip("Rust extension not available") + c = RustCounter(model="cl100k_base") + texts = ["sentence number " + str(i) for i in range(500)] + results = c.count_batch(texts) + assert len(results) == 500 + assert all(n > 0 for n in results) diff --git a/uv.lock b/uv.lock index fe73de1..aebad9d 100644 --- a/uv.lock +++ b/uv.lock @@ -1202,6 +1202,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, ] +[[package]] +name = "maturin" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/d0/b7c8b7778cc44df3efbc96eb23acaa995e06ea1a60eb9b02f29858fcbd08/maturin-1.14.0.tar.gz", hash = "sha256:f7f82a6aca4a6c402bf00b99200be199d4874d04b9b9e74e825726a3478bba7f", size = 367010, upload-time = "2026-06-12T00:13:30.811Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/51/49367dcd8f6ec139e69ef0c695c8ff5075223673382101812b4affa53216/maturin-1.14.0-py3-none-linux_armv6l.whl", hash = "sha256:019ea3ec7e71f4c9759a367d4d21022ed5a3a621a2ce123abf3fb114ab3711ca", size = 10204135, upload-time = "2026-06-12T00:13:34.308Z" }, + { url = "https://files.pythonhosted.org/packages/dd/2a/487ce56c838d25e0ce64350e75ec4e3dc89544c0a6233221c229d6aa1a84/maturin-1.14.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:6948a10f5f3470b791f79319be51debdd8bfd1778b36f2409f98e1314bc3859b", size = 19736800, upload-time = "2026-06-12T00:13:40.456Z" }, + { url = 
"https://files.pythonhosted.org/packages/a8/a5/12f2efc18f419edce3282a93629cba16278bb502135dac95cd04ef7c2eae/maturin-1.14.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:1506e86b1e273a98074a62e281b13f27ac96f8cdef85f7f98d3e3589a9387a23", size = 10201144, upload-time = "2026-06-12T00:13:26.842Z" }, + { url = "https://files.pythonhosted.org/packages/bf/95/3789e72273fd8bc80c33a11c787634b3251c4989d7a7203a92438836d4ff/maturin-1.14.0-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:df10ce4f7ba97fd3423f624f39b94c888ae3e5b470642a91918e1ccec81282fd", size = 10182394, upload-time = "2026-06-12T00:13:13.693Z" }, + { url = "https://files.pythonhosted.org/packages/40/79/15957eb4e055597f217e6310963a9c1371372e63c5b4a3e30803365addd2/maturin-1.14.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:75bcd4468a7fe597652cc2980c6bb16ce4bb8c411e3eb85dac2c4418cef0e95a", size = 10616603, upload-time = "2026-06-12T00:13:22.795Z" }, + { url = "https://files.pythonhosted.org/packages/3e/4b/d1822f88cd5e855640f0e10ee00c39b9be614c1ef2f827e9792332d94b9f/maturin-1.14.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:2d123337e817f8dfe23755d6760139c01104137bb63e9e20c289c547e25ec857", size = 10075309, upload-time = "2026-06-12T00:13:38.274Z" }, + { url = "https://files.pythonhosted.org/packages/c0/82/c1b160d2163e8784489285e82a5c811fdcef3e0704e35b34c1cfe1828de3/maturin-1.14.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:107f84110d890090a01bb1ecd01761fdfae925c23c659ba492c9b83dd179eab4", size = 10024058, upload-time = "2026-06-12T00:13:16.49Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e8/88a9d1872997d4535af10ebe79f550e834880bf613cf8e50b50d2d938e3b/maturin-1.14.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = 
"sha256:9a84277aa907961cd47ad26fef1539e79efa30611972eaf7499606e773e991b2", size = 13302073, upload-time = "2026-06-12T00:13:29.027Z" }, + { url = "https://files.pythonhosted.org/packages/4a/13/3f6d28bb7b744558b9bc78c995c1855d7e5ff21ad475f46d9de5c3dab039/maturin-1.14.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:095714b2a904927e3c868a1c5d078257ff0443c5049f7623777352966768306e", size = 10863616, upload-time = "2026-06-12T00:13:32.191Z" }, + { url = "https://files.pythonhosted.org/packages/24/06/39352d2b402efa3a7dd01d4ed197b301ea35eec10208ba2b8c649101f4df/maturin-1.14.0-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:20229d332f87166b930e4ca07cdbee8a1726f2eea87a337610aa25bba3ddf4b4", size = 10399943, upload-time = "2026-06-12T00:13:36.273Z" }, + { url = "https://files.pythonhosted.org/packages/58/77/641504541336240fef3836b2d15a785eaeb33c941fb118513c267dd70840/maturin-1.14.0-py3-none-win32.whl", hash = "sha256:4ba1e3c3f33609f461d587b7549104c81a15fd6d42ba63a73cea9376a1e9876e", size = 8905117, upload-time = "2026-06-12T00:13:18.38Z" }, + { url = "https://files.pythonhosted.org/packages/02/4a/ca247a0c43069b2f48cf783c5b13c3a9eb92c8f596dc7fbdb9f75fea4414/maturin-1.14.0-py3-none-win_amd64.whl", hash = "sha256:cb09a313f097adeb4dda0082277871a28d1bd26615dbadab42e6234b6df6fe69", size = 10309099, upload-time = "2026-06-12T00:13:20.523Z" }, + { url = "https://files.pythonhosted.org/packages/8b/a4/f14a3f6086cc3caaa90d12e832e4aa41de771c310041959f0d35dd4efe17/maturin-1.14.0-py3-none-win_arm64.whl", hash = "sha256:8c1a8188195f5b6ce1aab99ae2d92e342900298f901456b43ca028947fd3b288", size = 9719100, upload-time = "2026-06-12T00:13:24.741Z" }, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -1832,6 +1853,94 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = 
"sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, ] +[[package]] +name = "regex" +version = "2026.5.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/0e/49aee608ad09480e7fd276898c99ec6192985fa331abe4eb3a986094490b/regex-2026.5.9.tar.gz", hash = "sha256:a8234aa23ec39894bfe4a3f1b85616a7032481964a13ac6fc9f10de4f6fca270", size = 416074, upload-time = "2026-05-09T23:15:19.37Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/9b/6550044bc44e17c84d312c031c2ec42fbdb6a4ec4e29093be3a172d08772/regex-2026.5.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57eeeb05db7979413dec5438f2db21d7ecbba787cde7a711df1a6f6df672aa06", size = 490451, upload-time = "2026-05-09T23:12:34.72Z" }, + { url = "https://files.pythonhosted.org/packages/1e/95/fc7ba4303b5a0f92446a12ee6778ef2c6c799233f5060042a31bf390cfe9/regex-2026.5.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:398c521292f4c7fb807001dcd54694d3a1fcafc179a36ad9cc56f98df85930b6", size = 292112, upload-time = "2026-05-09T23:12:36.285Z" }, + { url = "https://files.pythonhosted.org/packages/54/4b/ee27938d1b2c443e89a9a10e00d2d19aa5ee300cd3d61140644e93bb083e/regex-2026.5.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f7a7c26137296beba7784de6eba69c6a93a63ccebc385e4962fe67e267a91225", size = 289599, upload-time = "2026-05-09T23:12:38.089Z" }, + { url = "https://files.pythonhosted.org/packages/d8/dd/ba103dc19614e25f3880800ca67ce093d6e21b325d72b8383c7bf906e9fa/regex-2026.5.9-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6441cc660d76107934a09c22167200839a0e89604a6297f78a974e66e931d2c0", size = 796732, upload-time = "2026-05-09T23:12:40.062Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/e7/f035b4fd858b050b0080bf302968dc0f59ba34e391872d54936758e6844e/regex-2026.5.9-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:91328f1c23d47595ca3ef0a7557fa129c5a23404b775c770697d2f35b33e0107", size = 865440, upload-time = "2026-05-09T23:12:42.059Z" }, + { url = "https://files.pythonhosted.org/packages/0a/51/8cd301ecc899aea28124357f729f4272f44de7806fc7ca02490bfbe253e8/regex-2026.5.9-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:93a7860539414dddaefba2b40f8771765ae17949d4c7182b876ce429e11a8309", size = 912329, upload-time = "2026-05-09T23:12:44.373Z" }, + { url = "https://files.pythonhosted.org/packages/cc/1e/3fbe2fa1e8cebd62f3bb7d3321cff1640aca2e240b51d9bd624aad949260/regex-2026.5.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd2810d22146b6d838acc5ec15602cb6b47920aa4e33015df3868eedfd20bab8", size = 801239, upload-time = "2026-05-09T23:12:46.268Z" }, + { url = "https://files.pythonhosted.org/packages/17/2f/6f6008682bf2cf98040a0d3153a8e557b6ab728d7713d045cee4ce544ab8/regex-2026.5.9-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:daff2bdbaf1d23e52fdff7c0b7bc2048b68f978df6a4d107ac981f94caef2e66", size = 777054, upload-time = "2026-05-09T23:12:48.051Z" }, + { url = "https://files.pythonhosted.org/packages/19/2b/eee0d20a6842ba04df4b8847a920b57ef56853f14ef85405473e586b605a/regex-2026.5.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4eeb011098fcb77af513dcef521a3dbecbf8849b1e38940759d293b7a93f5026", size = 785098, upload-time = "2026-05-09T23:12:49.851Z" }, + { url = "https://files.pythonhosted.org/packages/4a/98/6fc1e6410feefb92159edaed5041992bfe390e8d26c721865434acbca558/regex-2026.5.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ea9c8ecfa1b73c73b626534d6626e5340d429630943672b8480724f44e84b962", size = 860095, upload-time = 
"2026-05-09T23:12:51.666Z" }, + { url = "https://files.pythonhosted.org/packages/18/a3/bd855e0f2cb1a978ecf6fa6bb69632dd9c3f6ea3b81cde62fde14c9daec7/regex-2026.5.9-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:cd2846168eb9ee3c513902bc8225409cb1caab31d04728b145171fa1625d9621", size = 765762, upload-time = "2026-05-09T23:12:53.413Z" }, + { url = "https://files.pythonhosted.org/packages/dc/66/0ae8c092e60b14c79d24f8e0b7f0aea5bfbffdcab00b5483d13404d3c3a5/regex-2026.5.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:39617fb0cde9c0e6306dc70e3bfc096f3da793219879f7ae7aa341a69fbdcf6d", size = 852100, upload-time = "2026-05-09T23:12:55.256Z" }, + { url = "https://files.pythonhosted.org/packages/21/de/8dfde60fc1b21c946a893ba273403b72617edb261370cb1087099a83f088/regex-2026.5.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fd03c4f0e33280d15cae17159b899245d6b7c53d21def19b263b39655061f5ce", size = 789479, upload-time = "2026-05-09T23:12:57.573Z" }, + { url = "https://files.pythonhosted.org/packages/c3/1c/bdcc98f9a4af4fdd166c74941174619ccff4726d3ce32faa8e9a2ecd38dd/regex-2026.5.9-cp312-cp312-win32.whl", hash = "sha256:164eba9b755ea6f244b0d881196fbc1fac09714e9782c9e2732b813142033c8e", size = 266699, upload-time = "2026-05-09T23:12:59.14Z" }, + { url = "https://files.pythonhosted.org/packages/78/87/240d36864f9e48ace85f72e79ced97ceb7f27ce87739a947dcb834b4e6bc/regex-2026.5.9-cp312-cp312-win_amd64.whl", hash = "sha256:86f40a5d6444db30a125c9c9177e6b25dad981cbc37451fd838f145e6edac92e", size = 277783, upload-time = "2026-05-09T23:13:00.789Z" }, + { url = "https://files.pythonhosted.org/packages/4f/b5/7b30f312b0669dff5beebe5b0989dc2d1a312b1a44fab852199c387a5b96/regex-2026.5.9-cp312-cp312-win_arm64.whl", hash = "sha256:96f5f58b54a063d7ea9dca08e1cf57bfe10499c4d579ee672da284f57f5f0070", size = 270513, upload-time = "2026-05-09T23:13:02.426Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/da/797e91ecec6f84135da778ddce78c20e0af5d2a15c26f87a81bc3eadb6db/regex-2026.5.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d626b84406444b165fc0ba981604edea39f0588ff1f92baa23fe50799ea9afdb", size = 490303, upload-time = "2026-05-09T23:13:04.382Z" }, + { url = "https://files.pythonhosted.org/packages/44/da/bf30abaaa737b58f4a4b8c4a03659e02fd92092c822e0197ed9e0daab917/regex-2026.5.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d7bdc0ab8f3dd7e1b4f9ab88634e13374669db86bb3c72e8292f07ae313f539f", size = 292019, upload-time = "2026-05-09T23:13:06.022Z" }, + { url = "https://files.pythonhosted.org/packages/2d/e7/d0eaf5713828417b9e5648cf81fa9bacd4961f6ab98c380c2034f8716e35/regex-2026.5.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a8820737949116ffff55fe18f9fc644530063ba6ebfcb8314239416e78f1347c", size = 289468, upload-time = "2026-05-09T23:13:08.214Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9b/b3fdd62b003baa1a9b593cd8c8699c9651c2e80cc21a5c715707983c42d7/regex-2026.5.9-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0fbdbac82cb3e4450d0ccde7d7a35607f4cb2dd9fba4b8b69bfaf8c9fa6aed", size = 796749, upload-time = "2026-05-09T23:13:10.573Z" }, + { url = "https://files.pythonhosted.org/packages/d4/30/66ab84588765f5b4b271a9ca09ef7ce2b87caa95176ec3d2ad65d7bc4902/regex-2026.5.9-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:57e8915c7986aa33d25e4d3629cef711cd2863f2961b10409f0c04cb8b7d9020", size = 865445, upload-time = "2026-05-09T23:13:12.523Z" }, + { url = "https://files.pythonhosted.org/packages/1a/89/f05169e8588aac365f35ffc7f3bc3184f095ef4cfded7cfaa3c7fd5dbd89/regex-2026.5.9-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:508f56a89ba9cb26e4168cbc37dbd60a28d82430a9e18ad1d25fe0883c314ca2", size = 912322, upload-time = "2026-05-09T23:13:14.281Z" 
}, + { url = "https://files.pythonhosted.org/packages/30/e1/c93444052cf41581f3c884ab3fb5823daf0992f11cd4388d4275ca610558/regex-2026.5.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6d189041f15691cfa2b6c4290448ec221244d225b3f5fe9e7771b34ffcdf6e2", size = 801269, upload-time = "2026-05-09T23:13:16.569Z" }, + { url = "https://files.pythonhosted.org/packages/50/fe/0cf96b882f540e62e8b9956599798203d599c44cf4c77917ca27400ff69b/regex-2026.5.9-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e82db382b44d0111b22601c509c89f64434816c9e0eef9d1989cda8cc6ff1c04", size = 777085, upload-time = "2026-05-09T23:13:18.675Z" }, + { url = "https://files.pythonhosted.org/packages/23/5c/d78d4924e7fc875557b9e9b768423925fdfaac5549d06da7810019a9bd26/regex-2026.5.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2acfb48634f64996b57f90f39afa692ff362162722581921fe92239a59960f3c", size = 785153, upload-time = "2026-05-09T23:13:20.525Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e0/5214774090e7b4524dcea3e3c4aa74141d43043f8beb49c1599db1c8b53a/regex-2026.5.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d29eebfc9525db68cad3c97eedd7f754fa265aa5cd0cf4f863b2421e1b48fc9f", size = 860164, upload-time = "2026-05-09T23:13:22.263Z" }, + { url = "https://files.pythonhosted.org/packages/6e/e1/4a57a83350319b1271f0d7a249b8672513ed928b237a741631270de6caea/regex-2026.5.9-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:debb893095e944091c16e641a6e33c1b0f4cb61ab945ec5afbf53ce7068834d8", size = 765731, upload-time = "2026-05-09T23:13:24.277Z" }, + { url = "https://files.pythonhosted.org/packages/12/f4/499e74a20c156fc75836ee04a72a38d1a063978f600937f9760467beb1b0/regex-2026.5.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d659eee77986549c9ea45b861c7567e44d6287c3dc9a4565478853f7b9fe2ff6", size = 852062, upload-time = "2026-05-09T23:13:26.125Z" }, + { url = 
"https://files.pythonhosted.org/packages/5b/92/7eebc0d0a01e78629695f342ba17e0deaff8fb45e79cc0d7b98287da6e3e/regex-2026.5.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2efa205e6d98b24d1f3ab395c11aa15cdf10935bca283d0285e0499c284fba21", size = 789577, upload-time = "2026-05-09T23:13:27.814Z" }, + { url = "https://files.pythonhosted.org/packages/05/a4/018e71f7d2ad48c1ebe6d3ae0026f9b7cb4802fd15c7cc02fdf724355102/regex-2026.5.9-cp313-cp313-win32.whl", hash = "sha256:f3844f134e834076677dd369976e9f5068679fcb8e50102fdf6b7ac96a3ec127", size = 266691, upload-time = "2026-05-09T23:13:29.549Z" }, + { url = "https://files.pythonhosted.org/packages/e6/1d/861a93719fb9ee7dbfc3761b3797b7a3e112a5d42c6129459d2d741be9b5/regex-2026.5.9-cp313-cp313-win_amd64.whl", hash = "sha256:3527bb4942d2c14552155406cdedd906567456821848aed1cb4933a391bf5eca", size = 277747, upload-time = "2026-05-09T23:13:31.859Z" }, + { url = "https://files.pythonhosted.org/packages/d9/c6/0a2436ae4da1ba76e51cb98943c6838a9a721faa40ebe2dce07694ae34e3/regex-2026.5.9-cp313-cp313-win_arm64.whl", hash = "sha256:56a33f191f17d8c417f99945ebdc1e691d3af9605d86ec68c7e54a57e3e17af6", size = 270500, upload-time = "2026-05-09T23:13:33.525Z" }, + { url = "https://files.pythonhosted.org/packages/e8/e9/d21346f7b60ed58789371358ed66b09d00f832e1bd7c06e55d9da5679882/regex-2026.5.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:01f28d868834624c934b8d2e0aa1c8341337e37831f4a012f18a5afcba4cbaf3", size = 494172, upload-time = "2026-05-09T23:13:35.935Z" }, + { url = "https://files.pythonhosted.org/packages/c4/43/fd1177a2032037c681baecdb3422ee4e1424aec4e4f470ef47793d325274/regex-2026.5.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:48036f6374aaa79eb3b754ec29c61d1c6b1606749d705a13f8854fa2539671f6", size = 293952, upload-time = "2026-05-09T23:13:38.307Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/7d/9fbf919768368d3f8a4f6c692cf2aa61e482b2b81ec6a298ace4cbf02480/regex-2026.5.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b96350aa424e79d4fd6b567b344dcbe2b2d6bfc48dfe7717587e1fa6d43da6ff", size = 292314, upload-time = "2026-05-09T23:13:40.353Z" }, + { url = "https://files.pythonhosted.org/packages/e2/6c/e41bfeecb589716843e7c4df09ba46ff2a42961457afece19059d85caeef/regex-2026.5.9-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f3af7a4903c5c04a11a196a5aa75cdd7dd3f8508132f9fb3259d9f5908e3b88", size = 811681, upload-time = "2026-05-09T23:13:42.543Z" }, + { url = "https://files.pythonhosted.org/packages/87/83/a5c1c525fba0aa656e88ad0face0b1829788ef4c2fb6b26df58aa1151b84/regex-2026.5.9-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7e87577720152d2caae19fe2baaf1f8d5ca12091e9e229f03915c37d1e4b9178", size = 871135, upload-time = "2026-05-09T23:13:44.326Z" }, + { url = "https://files.pythonhosted.org/packages/18/d4/80882e799e440dd878b0979cbebf8fa4d54624a332c83037c7a701649e3f/regex-2026.5.9-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c8b9b9d294cfea3cd19c718ade7cc93492b2c4991abd9a68d0b3477ae6d8e100", size = 917265, upload-time = "2026-05-09T23:13:47.295Z" }, + { url = "https://files.pythonhosted.org/packages/ae/ff/8db60211e2286e396aad7dc7725356c502bff0901ea05bd6cdc2e1a042b9/regex-2026.5.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:728d8bfd28a8845c8b6bc5dc7ce010453d206396786c0765c2740cb65f37791e", size = 816311, upload-time = "2026-05-09T23:13:49.885Z" }, + { url = "https://files.pythonhosted.org/packages/4c/47/742ef579c61730f8d268e5cf1f9ce0e37e2ea041ad0f5644724f2378e463/regex-2026.5.9-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:7e30b874d341fac767d7df5a0870540541c2c054b80cfaac116e8d367a8a7ff2", size = 785498, upload-time = "2026-05-09T23:13:52.25Z" }, + { url = "https://files.pythonhosted.org/packages/7f/ab/cb0999802dcb0fb95b1ab005e8d4163d8afdd67efc2cb6b6630ac13f8cb1/regex-2026.5.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fd190e88a895a8901325fad284a3f74ea52b1da8525b76cc811fa9b1edf0ce2b", size = 801348, upload-time = "2026-05-09T23:13:54.127Z" }, + { url = "https://files.pythonhosted.org/packages/7d/62/8ca59a24c55bc34d166eefaf3717bd77772f329fdbf984d86581e0a3571c/regex-2026.5.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:8e76e8161ad00694cfce6767d5dea860c6391ac5b83e5c3a39661e696f11fc7e", size = 866493, upload-time = "2026-05-09T23:13:56.067Z" }, + { url = "https://files.pythonhosted.org/packages/8d/3d/30f2ae62cef3278bb5bb821f467277a55fb73f01032cf85997e15e8289a8/regex-2026.5.9-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ddda5340e6c01a293027dd46232fa79eaff1b48058ce7a98f572b6445b088041", size = 772811, upload-time = "2026-05-09T23:13:57.867Z" }, + { url = "https://files.pythonhosted.org/packages/d8/ae/7d2089bcd78ad0c0161bc684339df50032acb438a7bd3305e7ddb1193cec/regex-2026.5.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:205109e96b3cf5adf8f4cd62bedde9487feb282b9497a3535451e5a24cd706a0", size = 856584, upload-time = "2026-05-09T23:13:59.679Z" }, + { url = "https://files.pythonhosted.org/packages/a9/29/92ff47f75990131ea4f24ba17819e5a9d141e10819807e09addd73409af6/regex-2026.5.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dfbe4579b9f08036aa7d101d1835437a20783574ac66327e6b29b4018a138081", size = 803453, upload-time = "2026-05-09T23:14:01.978Z" }, + { url = "https://files.pythonhosted.org/packages/04/99/eff29f1037dcab36702c9ee5d6858cf1ce2336ea8ea2987f64245b99ea5e/regex-2026.5.9-cp313-cp313t-win32.whl", hash = "sha256:ed2c9e8068b614c574d8d30e543d617cf5379b0535d46f97ef00e904745a08b5", size = 269951, upload-time = 
"2026-05-09T23:14:03.661Z" }, + { url = "https://files.pythonhosted.org/packages/0e/9d/8870b8981d27b22cda77bb26a5ac7ebfa9c7d9e0dea195a834a82380e748/regex-2026.5.9-cp313-cp313t-win_amd64.whl", hash = "sha256:b46b0f094dc1d3b90356c85a0bd2c9bafc4a6a190b9d6f8ddd5a033b6e088ed4", size = 281240, upload-time = "2026-05-09T23:14:05.56Z" }, + { url = "https://files.pythonhosted.org/packages/72/b1/3379415e8f135c13ac551353397cc4fe97b4978f3cac73c5fcbcded548b8/regex-2026.5.9-cp313-cp313t-win_arm64.whl", hash = "sha256:872acc074bd29ffc9913ecdfedf6ea77502312ca44a4aa0d3779089c6069d8de", size = 272383, upload-time = "2026-05-09T23:14:07.843Z" }, + { url = "https://files.pythonhosted.org/packages/13/3e/9c3cd292d8808b3645a2ce517e200179b6d0e903f176300bd8b542e14de5/regex-2026.5.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:1bd7587a2948b4085195d5a3374eaf4a425dc3e55784c038175355ecf3bbbf8a", size = 490376, upload-time = "2026-05-09T23:14:09.64Z" }, + { url = "https://files.pythonhosted.org/packages/60/70/d43ee8a2ca0a8b68d167f21658b85520ac0574617c7f320367c5047f7556/regex-2026.5.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:dea2e88e1cce4522496cce630e11e67b98b7076620bc4336c3f674bc21a375f4", size = 291964, upload-time = "2026-05-09T23:14:11.424Z" }, + { url = "https://files.pythonhosted.org/packages/21/91/9d50b433828d8e74196904e168a43abf1e6e88b2a15d47ed742456720c37/regex-2026.5.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2099f7e7ff7b6aa3192312650a56e91cc091e49d50b04e4f6f8b6e28b3b27f1c", size = 289682, upload-time = "2026-05-09T23:14:13.123Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d2/b835e3cafbb9d977736912436259ff551d60919f7d7b3d37d46659c63564/regex-2026.5.9-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecd353045824e4477562a2ac718c25799cdaaa41f7aa925a806a8a3e6848a5b9", size = 796996, upload-time = "2026-05-09T23:14:14.923Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/a6/9f992d00019166b9de01c546dd4549bc679f2a68df11b877740b0760b7c2/regex-2026.5.9-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65c8c8c37377794bd5b2f3ebe51919042bf17aec802e23c833d89782ed0c78af", size = 866089, upload-time = "2026-05-09T23:14:17.757Z" }, + { url = "https://files.pythonhosted.org/packages/e0/08/4d32af657e049b19cb62b02e46e38fe1518797bfb2203ee93a510b21b0dc/regex-2026.5.9-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b73ab8afcf66c622db143d1c6fda4e58e4d537ee4f125229ad47b1ab80f34c0", size = 911530, upload-time = "2026-05-09T23:14:20.353Z" }, + { url = "https://files.pythonhosted.org/packages/d9/27/2af43dd1dc201d1fecefda64a45f4ad0995855b92724f795a777b402ee69/regex-2026.5.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0de5cf193997384ed2ca6f1cd4f78055b255d93d82d5a8cd6ba0d11c10b167e4", size = 800643, upload-time = "2026-05-09T23:14:22.265Z" }, + { url = "https://files.pythonhosted.org/packages/a4/dd/23a249047013b5321d4a60c4d2437462086f601b061776a525e5fba2a59f/regex-2026.5.9-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d641a8c9a61618047796d572a39a79b26167b0411d2c3031937b2fe2d081e2cf", size = 777223, upload-time = "2026-05-09T23:14:24.179Z" }, + { url = "https://files.pythonhosted.org/packages/94/6a/e85ed9538cd19586d0465076a4578a12e093ce776d15f3f8ce92733a8dd6/regex-2026.5.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:24b2355ef5cc9aa5b8f07d17704face1c166fdcc2290fa7bd6e6c925655a8346", size = 785760, upload-time = "2026-05-09T23:14:26.065Z" }, + { url = "https://files.pythonhosted.org/packages/2a/c4/f25473209438638e947c55f9156fd8f236f74169229028cc99116380868e/regex-2026.5.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:a24852d3c29ad9e47593593d8a247c44ccc3d0548ef12c822d6ed0810affe676", size = 860891, upload-time = 
"2026-05-09T23:14:28.17Z" }, + { url = "https://files.pythonhosted.org/packages/f9/f7/f4f86e3c74419c37370e91f150ae0c2ef7d34b2e0e4cdd5da046a02e4022/regex-2026.5.9-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:916714069da19329ef7de197dcbc77bb3104145c7c2c864dbfbe318f46b88b14", size = 765891, upload-time = "2026-05-09T23:14:30.06Z" }, + { url = "https://files.pythonhosted.org/packages/26/70/704d8e13765939146b1cd0ef4e2feb71d7929727d2290f026eed10095955/regex-2026.5.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:fa411799ca8da32a8d38d020a88faa5b6f91657d284761352940ecf9f7c3bbdd", size = 851380, upload-time = "2026-05-09T23:14:32.123Z" }, + { url = "https://files.pythonhosted.org/packages/26/29/1a13582a8460038edc38e49f64ceb0dd7c60f5caba77571f4bf6601965d9/regex-2026.5.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1e6da47d679b7010ef27556b6e0f99771b744936db1792a10ceac6547ae1503e", size = 789350, upload-time = "2026-05-09T23:14:34.799Z" }, + { url = "https://files.pythonhosted.org/packages/73/56/3dcafe34fc72e271d62ad9a291801e88a1457bb251c132f15fcc2e5aad1a/regex-2026.5.9-cp314-cp314-win32.whl", hash = "sha256:98bd73080e8756255137e1bd3f3f00295bbc5aa383c0e0f973920e9134d7c4ad", size = 272130, upload-time = "2026-05-09T23:14:36.729Z" }, + { url = "https://files.pythonhosted.org/packages/d0/9c/02eebf0be95efe416c664db7fb8b6b05b7a0b06a7544f2884f2558b0526f/regex-2026.5.9-cp314-cp314-win_amd64.whl", hash = "sha256:ff8d372ac2acdc048d1c19916f27ee61bc5722728458ba6ca5052f2c72d51763", size = 280999, upload-time = "2026-05-09T23:14:39.126Z" }, + { url = "https://files.pythonhosted.org/packages/70/5a/1dd1abee76cb7a846a0bcf42fdc87e5720c3c33c24f3e37814310a513d9f/regex-2026.5.9-cp314-cp314-win_arm64.whl", hash = "sha256:e1d93bf647916292e8edcec150c07ddf3dc50179ccaf770c04a7f9e452155372", size = 273500, upload-time = "2026-05-09T23:14:41.059Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/c1/c5f619b0057a7965cb78ec559c1d7a45ce8c99a35bea95483d64959a93d9/regex-2026.5.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:83d0ee4a57d1c87cb549e195ec300b8f0ec3a82eba66d835e4e2ed8634fe4499", size = 494269, upload-time = "2026-05-09T23:14:42.869Z" }, + { url = "https://files.pythonhosted.org/packages/05/2c/5d01f1aee33de4bbe60c8452945bfc8477ca7c5ae4450f6bfe711036cb36/regex-2026.5.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d3d7eb5c9a7f6df82ed3cfac9beb93882a5cbcb5b8b157b56cb2b3b276574ac1", size = 293954, upload-time = "2026-05-09T23:14:44.822Z" }, + { url = "https://files.pythonhosted.org/packages/7a/fe/e8988b2ae2108c6ef71bd4aa8d87fbe257976dd0810e826cd75f701c68b6/regex-2026.5.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:075160bf16658e16d35233300b8453aac25de4cbea808d22348b6979668e924d", size = 292405, upload-time = "2026-05-09T23:14:47.211Z" }, + { url = "https://files.pythonhosted.org/packages/79/34/d2b0937faa7859263f7f0a3c6b103a1296306be6952dc173d0154e9a2f49/regex-2026.5.9-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45375819235558a4ff1c4971dc32881f022613abdb180128f5cb4768c1765a1c", size = 811855, upload-time = "2026-05-09T23:14:49.21Z" }, + { url = "https://files.pythonhosted.org/packages/80/fe/daf53a47457a8486db66c66c01ceb9c2303eecee3f87197f1e77eb1a736d/regex-2026.5.9-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ead4b163ac30a29574510cd4b3e2e985ac5290c05fc7095557d6a5f403fc31b5", size = 871189, upload-time = "2026-05-09T23:14:51.555Z" }, + { url = "https://files.pythonhosted.org/packages/1c/75/058fc4470cbfbf57d800aff1a0022b929a3f9fa553ee10a0cdf2070eb31f/regex-2026.5.9-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c6e4218fbdfbcd4f6c19efca40930d24a621bf4b48cb76bc6640543bd28ef20", size = 917485, upload-time = 
"2026-05-09T23:14:53.633Z" }, + { url = "https://files.pythonhosted.org/packages/88/e7/179cfda3a28bc843b5c6cfe7f79f23489c791ed95f151083803660878432/regex-2026.5.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6351571c8a42b505eb555c0dc47d740d0fb66977dc142919eea6f4325b7c56a0", size = 816369, upload-time = "2026-05-09T23:14:56.198Z" }, + { url = "https://files.pythonhosted.org/packages/41/90/6f0cc422071688266d344fca8462d787cba0a2c144acb25721f9a61ec265/regex-2026.5.9-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:002205cafd2a9e78c6290c7d1df277bf3277b3b7a30e0b4bb0dac2e2e3f7cb2d", size = 785869, upload-time = "2026-05-09T23:14:58.602Z" }, + { url = "https://files.pythonhosted.org/packages/02/67/a31f1760f09c27b251ef39e9beb541f462cf977381d067faa764c2c0e393/regex-2026.5.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8abd33fef90b2a9efac5557d6033ca82d1195ed3a15fea5af15ba7b463c6a63b", size = 801427, upload-time = "2026-05-09T23:15:00.642Z" }, + { url = "https://files.pythonhosted.org/packages/e3/c4/1a80654597b6bc1e1ea0494824c31200e8a956abe290afae9b19a166a148/regex-2026.5.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:31037c82eccb44b7ea2e9e221d7c01429430e989a1f4b91ea5a855f6017b509a", size = 866482, upload-time = "2026-05-09T23:15:03.384Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/960724e06482c08466ff5611e242e86f80062949cdf6b4b9cc317b9dd93d/regex-2026.5.9-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:5604dfd046dc37eca90250fc3be938b076c8059fa772ac0ed6f499b0f0fb0415", size = 773022, upload-time = "2026-05-09T23:15:05.625Z" }, + { url = "https://files.pythonhosted.org/packages/50/a8/a9979c3e7918280e93159ebcab5ef1a65116dd4f3bd6091be0eae4a126e8/regex-2026.5.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:0e1b1b4e496afbb24f4a62aba855ee4f88f25578927697b340702e48c9ee6bc2", size = 856642, upload-time = "2026-05-09T23:15:07.966Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/d4/a9b732f2f0072c0ab12227483abb24fffcb9f73f8a2b203df0a6d0434735/regex-2026.5.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:be3372b9df6ddecff6486d37e19095a7b4973137caf5512407a89f4455361f41", size = 803552, upload-time = "2026-05-09T23:15:10.215Z" }, + { url = "https://files.pythonhosted.org/packages/d5/fe/1b3113817447a1d4155e4ac76d2e072f42c0bcba2f43fa8a0e756ea2cd91/regex-2026.5.9-cp314-cp314t-win32.whl", hash = "sha256:3ddd90103f9e5c471c49c7852ecc1fe27c7e45eb99e977aefe7caa4e779f4f58", size = 275746, upload-time = "2026-05-09T23:15:12.609Z" }, + { url = "https://files.pythonhosted.org/packages/92/73/93d42045302636c91f2e5ef588b65b84b01428f28ec77de256b1dfdfbe5c/regex-2026.5.9-cp314-cp314t-win_amd64.whl", hash = "sha256:ca518ed29c46eecba6010b15f1b9a479314d2de409536e71b6a13aa04e3b8a77", size = 285685, upload-time = "2026-05-09T23:15:15.086Z" }, + { url = "https://files.pythonhosted.org/packages/da/80/35b4c33c804a165a7f55289afda3ea9e3eb6d15800341a2d66455c0f1f30/regex-2026.5.9-cp314-cp314t-win_arm64.whl", hash = "sha256:5e41809d2683fcde7d5a8c87a6567ba1fb1ce0de9f31bff578de00a4b2d76daa", size = 275713, upload-time = "2026-05-09T23:15:16.98Z" }, +] + [[package]] name = "requests" version = "2.32.5" @@ -2120,6 +2229,7 @@ dev = [ { name = "jupyterlab" }, { name = "jupyterlab-code-formatter" }, { name = "loguru" }, + { name = "maturin" }, { name = "pdoc" }, { name = "prek" }, { name = "pytest" }, @@ -2127,6 +2237,7 @@ dev = [ { name = "pytest-lazy-fixtures" }, { name = "pytest-mock" }, { name = "ruff" }, + { name = "tiktoken" }, { name = "twine" }, { name = "ty" }, ] @@ -2148,6 +2259,7 @@ dev = [ { name = "jupyterlab" }, { name = "jupyterlab-code-formatter" }, { name = "loguru" }, + { name = "maturin", specifier = ">=1.14.0" }, { name = "pdoc" }, { name = "prek" }, { name = "pytest" }, @@ -2155,10 +2267,58 @@ dev = [ { name = "pytest-lazy-fixtures" }, { name = "pytest-mock" }, { name = "ruff" }, + { name = "tiktoken" }, { 
name = "twine" }, { name = "ty" }, ] +[[package]] +name = "tiktoken" +version = "0.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "regex" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e4/e5/5f3cb2159769d0f4324c0e9e87f9de3c4b1cd45848a96b2eb3566ad5ca77/tiktoken-0.13.0.tar.gz", hash = "sha256:c9435714c3a84c2319499de9a300c0e604449dd0799ff246458b3bb6a7f433c1", size = 38986, upload-time = "2026-05-15T04:51:27.153Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/8e/144bde4e01df66b34bb865557c7cd754ed08b036217ebd79c9db5e9048a9/tiktoken-0.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:32ac870a806cfb260a02d0cb70426aef02e038297f8ad50df5040bb5af360791", size = 1034888, upload-time = "2026-05-15T04:50:31.579Z" }, + { url = "https://files.pythonhosted.org/packages/36/18/d4ac9d20956cdebca04841316660ed584c2fecdc2b81722a28bc7ad3b1e4/tiktoken-0.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d9980f11429ed2d737c463bb1fb78cf330caa026adf002f714aced7849a687b", size = 982970, upload-time = "2026-05-15T04:50:32.961Z" }, + { url = "https://files.pythonhosted.org/packages/74/ed/6bb8d05b9f731f749fee5c6f5ca63e981143c826a5985877330507bd13b7/tiktoken-0.13.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3f277ebea5edd7b8bf03c6f9431e1d67d517530115572b2dc1d465326e8f88c7", size = 1115741, upload-time = "2026-05-15T04:50:34.475Z" }, + { url = "https://files.pythonhosted.org/packages/34/de/2ca96b07a82d972b74fe4b46de055b79c904e45c7eab699354a0bfa697dc/tiktoken-0.13.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a116178fa7e1b4065bff05214360373a65cac22f965be7b3f73d00a0dbfe7649", size = 1136523, upload-time = "2026-05-15T04:50:35.782Z" }, + { url = "https://files.pythonhosted.org/packages/ee/dc/9dafec002c2d4424378563cf4cf5c7fb93631d2a55013c8b87554ee4012c/tiktoken-0.13.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:2c397ddda233208345b01bd30f2fca79ff730e55731d0108a603f9bc57f6af3b", size = 1181954, upload-time = "2026-05-15T04:50:36.99Z" }, + { url = "https://files.pythonhosted.org/packages/a1/d0/1f8578c45b2f24759b46f0b50d31878c63c73e6bf0f2227e10ec5c5408dc/tiktoken-0.13.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95097e4f89b06403976e498abf61a0ee73a7497e73fb599cb211d8197a054d91", size = 1240069, upload-time = "2026-05-15T04:50:38.221Z" }, + { url = "https://files.pythonhosted.org/packages/aa/90/28d7f154888610aa9237e541986beb62b479df29d193a5a0617dbb1514d0/tiktoken-0.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:8f2d16e7a7c783ad81f36e457d046d1f1c8af70b22aec8a13238efe531977c41", size = 874748, upload-time = "2026-05-15T04:50:39.587Z" }, + { url = "https://files.pythonhosted.org/packages/9c/83/b096c859c2a47c11731bf2f5885f4028b809dfe2396582883eed9cae372f/tiktoken-0.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5df5d1507bd245f1ccad4a074698240021239e455eb0bb4ced4e3d7181872154", size = 1034228, upload-time = "2026-05-15T04:50:40.988Z" }, + { url = "https://files.pythonhosted.org/packages/53/61/c68e123b6d753e3fc2751e9b18e732c9d8bf1e1926762e736eee935d931c/tiktoken-0.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8fe806a50664e83a6ffd56cbd1e4f5dcc6cd32a3e7538f70dc38b1a271384545", size = 982978, upload-time = "2026-05-15T04:50:42.195Z" }, + { url = "https://files.pythonhosted.org/packages/ef/8b/96cc178cc584e65d363134500f297790b06cd48cdeb1e8fcf7bbe60f4715/tiktoken-0.13.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:125bc05005e747f993a83dc67934249932d6e4209854452cd4c0b1d53fba3ba2", size = 1116355, upload-time = "2026-05-15T04:50:43.564Z" }, + { url = "https://files.pythonhosted.org/packages/86/f5/bab735d2c72ea55404b295d02d092644eb5f7cc6205e34d35eb9abfb9ab2/tiktoken-0.13.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5e6358911cab4adee6712da27d65573496a4f68cf8a2b5fca6a4ad10fc5748cf", size = 1135772, upload-time = 
"2026-05-15T04:50:44.782Z" }, + { url = "https://files.pythonhosted.org/packages/4e/b9/6de04ebdf904edfaad87788011b3735087a0c9ea671b9027e1e4e965e8c8/tiktoken-0.13.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:975cbd78d085d75d26b59660e262736dcaed1e35f8f142cd6291025c01d25486", size = 1182415, upload-time = "2026-05-15T04:50:46.422Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9c/470a05f3b1caf038f44880e334d47ab674e0c80d514c66b375d14d5afa10/tiktoken-0.13.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:75ab9bc99fa020a4c283424590ecd7f3afd70c1c281cb3fa3192a6c3af9f9615", size = 1239879, upload-time = "2026-05-15T04:50:48.052Z" }, + { url = "https://files.pythonhosted.org/packages/42/a6/c1936d16055436cb32e6c6128d68629622e00f4768562f55653752d34768/tiktoken-0.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:6b1615f0ff71953d19729ceb18865429c185b0a23c5353f1bbca34a394bf60f7", size = 874829, upload-time = "2026-05-15T04:50:49.202Z" }, + { url = "https://files.pythonhosted.org/packages/d6/07/acb5992c3772b5a36284f742cfb7a5895aa4471d1848ac31464ad50d7fdf/tiktoken-0.13.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6eb4a5bfbc6426938026b1a334e898ac53541360d62d8c689870160cc80abd67", size = 1033600, upload-time = "2026-05-15T04:50:50.4Z" }, + { url = "https://files.pythonhosted.org/packages/14/e9/742e9aec30f59b9f161f7ff7cd072e02ea836c9e1c0854a8076dfcd40d5c/tiktoken-0.13.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:43cee3e5400573b2046fbf092cc7a5bc30164f9e4c95ce20714da929df48737a", size = 982516, upload-time = "2026-05-15T04:50:52.03Z" }, + { url = "https://files.pythonhosted.org/packages/72/74/ca1541b053e7648254d2e4b42a253e1bb4359f2c91a0a8d49228c794e1a0/tiktoken-0.13.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7de52e3f566d19b3b11bd37eea552c6c305ad74081f736882bd44d148ed4c48d", size = 1115518, upload-time = "2026-05-15T04:50:53.543Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/e3/93825eaf5a4a504795b787e5d5dea07fbeb3dabf97aa7b450be8bde59c89/tiktoken-0.13.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:51384448aa508e4df84c0f7c1dc3211c7f7b8096325660ee5fc82f3e11b381ce", size = 1136867, upload-time = "2026-05-15T04:50:55.191Z" }, + { url = "https://files.pythonhosted.org/packages/8c/46/002b68de6827091d5ae90b048f326e8aad8d953520950e5ce1508879414f/tiktoken-0.13.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e28157350f7ebf35008dd8e9e0fdb621f976e4230c881099c85e8cf07eaa50e2", size = 1181826, upload-time = "2026-05-15T04:50:56.296Z" }, + { url = "https://files.pythonhosted.org/packages/db/c6/d393e3185a276505182f7abd93fe714f3c444a2be9180798fa052347504e/tiktoken-0.13.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:165cf1820ea4a354985c2490a5205d4cc74661c934aca79dd0368232fff94e0f", size = 1239489, upload-time = "2026-05-15T04:50:57.918Z" }, + { url = "https://files.pythonhosted.org/packages/b7/4d/bc07d1f1635d4897a202acc0ae11c2886eaa7325c359ba4741b47bf8e225/tiktoken-0.13.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6c43a675ca14f6f2749ba7f12075d37456015a24b859f2517b9beb4ef30807ec", size = 873820, upload-time = "2026-05-15T04:50:59.528Z" }, + { url = "https://files.pythonhosted.org/packages/8c/93/0dd6adca026a616c3a92974566b43381eea4b475ce1f36c062b8271a9ac5/tiktoken-0.13.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaaaef47c2406277181d2086484c317bf7fc433e2d5d03ff94f56b0dcec87471", size = 1034977, upload-time = "2026-05-15T04:51:00.957Z" }, + { url = "https://files.pythonhosted.org/packages/d9/77/5ec6e6bc5b30bed6d93f7f2162d8f6b32437b3ba27cb527cfe004f6109c9/tiktoken-0.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ca8b310bd93b3772cb1b7922d915446864860f562bdfe4825c63a0aed3fb28cd", size = 983635, upload-time = "2026-05-15T04:51:02.629Z" }, + { url = 
"https://files.pythonhosted.org/packages/94/b0/c8ae9aff00d625c50659b4513e707a0462c4bf5d4d6cc1b802103225c02e/tiktoken-0.13.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:32e0c12305105002c047b3bb1070b0dd9a73b0cb3b2856a8972b810e7a4f5881", size = 1116036, upload-time = "2026-05-15T04:51:04.082Z" }, + { url = "https://files.pythonhosted.org/packages/1b/ac/6a5dddd1d0a6018ecb389bd0353e6b4a515eb4d2286611bd0ace1937b9e1/tiktoken-0.13.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:5ba5fd62507a932d1241346179e3b39bc7bf7408f03c272652d93b3bedf5db24", size = 1135544, upload-time = "2026-05-15T04:51:05.229Z" }, + { url = "https://files.pythonhosted.org/packages/f4/b8/585032b4384b2f7dcdaddcb52865c83a701a420d09e3c2b4a2be1c450c57/tiktoken-0.13.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d108bc2d470fc53c8ecd24f2c0fd2b5f98c33e87cdb6aa2e9b8c5dced703d273", size = 1182217, upload-time = "2026-05-15T04:51:06.517Z" }, + { url = "https://files.pythonhosted.org/packages/cd/b6/993ff1ded3958215fd341a847b8e5ffeb5de473f435296870d314fc91ac4/tiktoken-0.13.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cb99cb5127449f58d0a2d5f5ccfb390d8dbdfd919c221246caaee29d8725ed51", size = 1239404, upload-time = "2026-05-15T04:51:07.843Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3d/fef7e06e3b33e7538db0ced734cf9fe23b6832d2ac4990c119c377aec55e/tiktoken-0.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:115c4f26ffa11caac8b54eea35c2ad38c612c20a48d35dd15d70a02ac6f51f58", size = 918686, upload-time = "2026-05-15T04:51:08.925Z" }, + { url = "https://files.pythonhosted.org/packages/c1/82/a7fc44582bc32ab00de988a2299bf77c077f59068b233109e34b7d6ca7e6/tiktoken-0.13.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:472527e9132952f2fbf77cd290658bacf003d4d5a3fabc18e5fbd407cbae4d9b", size = 1034454, upload-time = "2026-05-15T04:51:10.035Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/d0/24d8a890c14f432a05cea669c17bebeaa99f96a7c79523b590f564246411/tiktoken-0.13.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:4e2f67d27c9626cdd25fe33d9313c5cdb3d8d82da646b68d6eb8e7e9c20e6448", size = 982976, upload-time = "2026-05-15T04:51:11.23Z" }, + { url = "https://files.pythonhosted.org/packages/49/b7/2ab43f62788a9266187a9bfc1d3af99ad83e5eaa25fbef168a69cd5ad14f/tiktoken-0.13.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:2b920b35805cd64585a37c3dc7ce65fba4d2d36016be01e1d7942482ca29093a", size = 1115526, upload-time = "2026-05-15T04:51:12.608Z" }, + { url = "https://files.pythonhosted.org/packages/64/39/1494321ed323ce7a14d88e3cd6cb9058625977df1c6961ddc492bd10a9f3/tiktoken-0.13.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:493af3aa28a4aaf2e3d2600a2ee717252c9bf5ab38fff94eb5a02db5ab77e5ad", size = 1136466, upload-time = "2026-05-15T04:51:13.926Z" }, + { url = "https://files.pythonhosted.org/packages/96/d9/dfd086aa2d918c563a140720e0ce296cada1634efd2783d5cf51e05f984e/tiktoken-0.13.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6644c9c2b5cf3916f5a3641d7d12fdb3f006a7b3d9ff6acdaec44e29ab1ff91e", size = 1181863, upload-time = "2026-05-15T04:51:15.025Z" }, + { url = "https://files.pythonhosted.org/packages/2f/68/a18b4f307086954fdae32714cb4f85562e34f9d34ab206e61f1816aa6018/tiktoken-0.13.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5cb65b60b9408563676d874a3a4ee573370066f0dc4e29d84e82e989c6517424", size = 1239218, upload-time = "2026-05-15T04:51:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/16/5b/f2aa703a4fc5d2dff73460a7d46cc2f3f44aa0f3dd8eeb20d2a0ecf68862/tiktoken-0.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:85b78cc3a2c3d48723ca751fa981f1fedccd54194ca0471b957364353a898b07", size = 918110, upload-time = "2026-05-15T04:51:17.237Z" }, +] + [[package]] name = "tinycss2" version = "1.4.0"