JournalTopicModel/TopicModel/TextProcessor.py at master · arch1904/JournalTopicModel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import nltk
import spacy

import re

spacy.load('en_core_web_sm')  # OR spacy.load('en_core_web_sm')
# Instructions to install the spaCy English model
"""
python -m spacy download en
OR
python3 -m pip install --user https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
"""

from spacy.lang.en import English

parser = English()  # English parser

if "stopwords" not in dir(nltk.corpus):
    nltk.download('stopwords')  # Download NLTK stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))


def remove_non_ascii(text):
    """ Clean up text to support characters
    Args:
        text: string of text
    Returns: cleaned text
    """
    if isinstance(text, float):
        return ""
    return re.sub(r'[^\x00-\x7F]+', ' ', text)


def tokenize(text):
    """ Remove stopwords and spaces from text to get tokens for model
    Args:
        text: string of text
    Returns: List of tokens
    """
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


def prepare_text_for_lda(text):
    """ Prepare text before training LDA model
    Args:
        text: string of text
    Returns: cleaned tokens
    """
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    return tokens


def apply_tokenization(title, abstract):
    """ Tokenize text for title + abstract
    Args:
        title: title of the paper
        abstract: abstract of the paper
    Returns: cleaned tokens
    """
    word = ""
    word += remove_non_ascii(title)
    word += remove_non_ascii(abstract)
    return prepare_text_for_lda(word)