Skip to content

Commit adfc335

Browse files
committed
add test suite
1 parent fbc6c17 commit adfc335

25 files changed

+80908
-28
lines changed

setup.py

Lines changed: 0 additions & 23 deletions
This file was deleted.
2.88 KB
Binary file not shown.
19.2 KB
Binary file not shown.
34.9 KB
Binary file not shown.
71.8 KB
Binary file not shown.
37.2 KB
Binary file not shown.
48.7 KB
Binary file not shown.

tests/conftest.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""Shared fixtures and helpers for the text-preprocessing test suite."""
2+
3+
import os
from typing import Optional

import pytest

from text_preprocessing import Tokens
from text_preprocessing import Token as PreprocessorToken
8+
9+
# ---------------------------------------------------------------------------
# Fixture file paths
# ---------------------------------------------------------------------------

# Root of the plain-text corpus fixtures, resolved relative to this file so
# tests work regardless of the working directory pytest is invoked from.
PLAIN = os.path.join(os.path.dirname(__file__), "fixtures", "plain")
# Individual corpus files exercised by the test suite.
HAMLET = os.path.join(PLAIN, "hamlet.txt")
MOBY_DICK = os.path.join(PLAIN, "moby_dick.txt")
MONTAIGNE = os.path.join(PLAIN, "montaigne.txt")
GERMINAL = os.path.join(PLAIN, "germinal.txt")
18+
19+
20+
# ---------------------------------------------------------------------------
21+
# File-based fixtures
22+
# ---------------------------------------------------------------------------
23+
24+
25+
@pytest.fixture
def stopwords_file(tmp_path):
    """Write a four-word stopwords file (the, and, of, a) and return its path."""
    contents = "the\nand\nof\na\n"
    target = tmp_path / "stopwords.txt"
    target.write_text(contents, encoding="utf-8")
    return str(target)
31+
32+
33+
@pytest.fixture
def lemma_file(tmp_path):
    """Write a tab-separated word→lemma mapping file and return its path.

    Keys are lowercase because the lemmatizer is case-sensitive.
    """
    mapping = "running\trun\nflies\tfly\nwent\tgo\n"
    target = tmp_path / "lemmas.txt"
    target.write_text(mapping, encoding="utf-8")
    return str(target)
39+
40+
41+
# ---------------------------------------------------------------------------
42+
# Token construction helpers
43+
# ---------------------------------------------------------------------------
44+
45+
46+
def make_token(text: str, start: int = 0, end: Optional[int] = None) -> PreprocessorToken:
    """Create a PreprocessorToken with byte-position metadata.

    Args:
        text: The token's surface form.
        start: Byte offset at which the token begins.
        end: Byte offset at which the token ends; when omitted, it is
            derived as ``start + len(text)``.

    Returns:
        A PreprocessorToken whose ``ext`` dict records the token text and
        its start/end byte positions.
    """
    # `end` was annotated `int = None` (implicit Optional, rejected by PEP 484
    # type checkers); the annotation now matches the actual default.
    if end is None:
        end = start + len(text)
    return PreprocessorToken(text, ext={"token": text, "start_byte": start, "end_byte": end})
51+
52+
53+
def make_tokens(words: list[str], metadata: Optional[dict] = None) -> Tokens:
    """Build a Tokens container from a list of words, auto-assigning byte positions.

    Words are laid out as if separated by single spaces, so each word starts
    one byte after the previous word ends.

    Args:
        words: Token strings in document order.
        metadata: Optional container-level metadata. When omitted, a default
            dict with a dummy filename and the overall byte span is supplied.

    Returns:
        A Tokens object wrapping the constructed token list.
    """
    # `metadata` was annotated `dict = None` (implicit Optional, rejected by
    # PEP 484 type checkers); the annotation now matches the actual default.
    byte = 0
    token_list = []
    for w in words:
        token_list.append(make_token(w, byte, byte + len(w)))
        byte += len(w) + 1  # +1 for implicit space separator
    if metadata is None:
        metadata = {
            "filename": "test.txt",
            "start_byte": 0,
            # max(..., 0) guards the empty-words case, where byte is still 0.
            "end_byte": max(byte - 1, 0),
        }
    return Tokens(token_list, metadata)

tests/fixtures/philo/data/toms.db

1.25 MB
Binary file not shown.
497 KB
Binary file not shown.

0 commit comments

Comments
 (0)