Skip to content

Commit 1399ef5

Browse files
committed
bug fix
1 parent 21501ad commit 1399ef5

File tree

2 files changed

+5
-7
lines changed

2 files changed

+5
-7
lines changed

text_preprocessing/preprocessor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@ def process_texts(
143143
fetched_texts = self.text_fetcher(
144144
texts, do_nlp=self.do_nlp, keep_all=keep_all, progress=progress, post_func=self.post_func
145145
)
146-
if self.text_fetcher.text_object_type in ("para", "sent"):
146+
if self.text_fetcher.text_object_type in ("para", "sent") and self.do_nlp is True:
147147
fetched_texts = self.nlp.pipe(
148148
((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts),
149149
as_tuples=True,

text_preprocessing/spacy_helpers.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
"""Helper functions for Spacy"""
22

33
import os
4-
import pickle
54
import re
65
import sys
76
import unicodedata
@@ -10,14 +9,14 @@
109
from typing import Any, Dict, Iterable, List, Optional, Union
1110
from xml.sax.saxutils import unescape as unescape_xml
1211

12+
import pickle
1313
import spacy
1414
from spacy.language import Language
1515
from spacy.tokens import Doc, Token
1616
from Stemmer import Stemmer
1717
from thinc.api import prefer_gpu, set_gpu_allocator
1818
from unidecode import unidecode
1919

20-
2120
# Updated as of 8/23/2022
2221
SPACY_LANGUAGE_MODEL_MAP: Dict[str, List[str]] = {
2322
"catalan": ["ca_core_news_sm", "ca_core_news_md", "ca_core_news_lg", "ca_core_news_trf"],
@@ -71,8 +70,6 @@ class PreprocessorToken(str):
7170
7271
"""
7372

74-
ext: dict[str, Any]
75-
7673
def __new__(cls, value, pos_="", ent_type_="", ext={}):
7774
return str.__new__(cls, value)
7875

@@ -83,6 +80,7 @@ def __init__(
8380
ent_type_: str = "",
8481
ext: dict[str, Any] | None = None,
8582
):
83+
super().__init__()
8684
self.text = text or ""
8785
self.ext = ext or {}
8886
if self.ext is not None:
@@ -149,8 +147,8 @@ def __get_tokens(self, doc: Doc):
149147
yield PreprocessorToken(token.text, token.pos_, token.ent_type_, token._.ext)
150148
elif self.keep_all is True:
151149
yield PreprocessorToken("", token.pos_, token.ent_type_, token._.ext)
152-
if token.whitespace_ and index < max_index: # remove trailing whitespace
153-
yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
150+
if token.whitespace_ and index < max_index: # remove trailing whitespace
151+
yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
154152

155153
def __iter__(self) -> Iterable[PreprocessorToken]:
156154
for token in self.tokens:

0 commit comments

Comments
 (0)