|
| 1 | +"""Shared fixtures and helpers for the text-preprocessing test suite.""" |
| 2 | + |
import os
from typing import Optional

import pytest

from text_preprocessing import Tokens
from text_preprocessing import Token as PreprocessorToken
| 8 | + |
| 9 | +# --------------------------------------------------------------------------- |
| 10 | +# Fixture file paths |
| 11 | +# --------------------------------------------------------------------------- |
| 12 | + |
# Root directory holding plain-text fixture files, resolved relative to this module.
PLAIN = os.path.join(os.path.dirname(__file__), "fixtures", "plain")
# Individual plain-text corpus fixtures used across the test suite.
HAMLET = os.path.join(PLAIN, "hamlet.txt")
MOBY_DICK = os.path.join(PLAIN, "moby_dick.txt")
MONTAIGNE = os.path.join(PLAIN, "montaigne.txt")
GERMINAL = os.path.join(PLAIN, "germinal.txt")
| 18 | + |
| 19 | + |
| 20 | +# --------------------------------------------------------------------------- |
| 21 | +# File-based fixtures |
| 22 | +# --------------------------------------------------------------------------- |
| 23 | + |
| 24 | + |
@pytest.fixture
def stopwords_file(tmp_path):
    """Write a minimal stopwords file (the, and, of, a) and return its path."""
    words = ["the", "and", "of", "a"]
    target = tmp_path / "stopwords.txt"
    target.write_text("\n".join(words) + "\n", encoding="utf-8")
    return str(target)
| 31 | + |
| 32 | + |
@pytest.fixture
def lemma_file(tmp_path):
    """Write a tab-separated word→lemma file and return its path.

    Keys are lowercase on purpose: the lemmatizer under test is case-sensitive.
    """
    pairs = [("running", "run"), ("flies", "fly"), ("went", "go")]
    target = tmp_path / "lemmas.txt"
    content = "".join(f"{word}\t{lemma}\n" for word, lemma in pairs)
    target.write_text(content, encoding="utf-8")
    return str(target)
| 39 | + |
| 40 | + |
| 41 | +# --------------------------------------------------------------------------- |
| 42 | +# Token construction helpers |
| 43 | +# --------------------------------------------------------------------------- |
| 44 | + |
| 45 | + |
def make_token(text: str, start: int = 0, end: Optional[int] = None) -> PreprocessorToken:
    """Create a PreprocessorToken with byte-position metadata.

    Args:
        text: The token's surface form.
        start: Starting byte offset of the token.
        end: Ending byte offset; defaults to ``start + len(text)``.

    Returns:
        A PreprocessorToken whose ``ext`` dict carries the token text and
        its byte span.
    """
    if end is None:
        end = start + len(text)
    return PreprocessorToken(text, ext={"token": text, "start_byte": start, "end_byte": end})
| 51 | + |
| 52 | + |
def make_tokens(words: list[str], metadata: Optional[dict] = None) -> Tokens:
    """Build a Tokens container from a list of words, auto-assigning byte positions.

    Words are laid out as if separated by single spaces: each token starts
    one byte after the previous token ends.

    Args:
        words: Token strings, in order.
        metadata: Optional document metadata; when omitted, a minimal dict
            with a dummy filename and the overall byte span is synthesized.

    Returns:
        A Tokens container wrapping the constructed token list.
    """
    byte = 0
    token_list = []
    for w in words:
        token_list.append(make_token(w, byte, byte + len(w)))
        byte += len(w) + 1  # +1 for implicit space separator
    if metadata is None:
        metadata = {
            "filename": "test.txt",
            "start_byte": 0,
            # byte overshoots by one trailing separator; clamp at 0 for empty input
            "end_byte": max(byte - 1, 0),
        }
    return Tokens(token_list, metadata)
0 commit comments