|
1 | 1 | from .POSTagger import POSTagger |
2 | 2 | import nltk |
| 3 | +from nltk import UnigramTagger |
3 | 4 | from nltk.corpus import cess_esp |
4 | 5 |
|
5 | 6 |
|
6 | | -def get_tagger(): |
| 7 | +def get_tagger() -> UnigramTagger: |
7 | 8 | patterns = [ |
8 | 9 | (r".*é$", "VBD"), # past verb |
9 | 10 | (r".*ó$", "VBD"), # past verb |
@@ -36,11 +37,40 @@ class POSTaggerNltk(POSTagger): |
def __init__(self):
    """Request the NLTK resources used by this tagger.

    Calls ``nltk.download`` for the ``cess_esp`` corpus and for the
    ``universal_tagset`` resource every time an instance is created.
    """
    for resource_name in ('cess_esp', 'universal_tagset'):
        nltk.download(resource_name)
39 | | - |
| 40 | + |
def tag_sentences_with_pos(self, tokenized_sentences: list) -> list:
    """Tag each sentence in *tokenized_sentences* with part-of-speech labels.

    Args:
        tokenized_sentences: Sentences to tag. Every item is handed to
            ``nltk.word_tokenize``, so the items are presumably plain
            strings (verify against callers).

    Returns:
        A list with one entry per input sentence, in input order; each
        entry is whatever the tagger's ``tag`` call returned for the
        tokens of that sentence.
    """
    # The tagger is requested once per call and reused for every sentence.
    tagger = self.get_tagger()

    tagged = []
    for text in tokenized_sentences:
        tokens = nltk.word_tokenize(text)
        tagged.append(tagger.tag(tokens))
    return tagged
| 48 | + |
@staticmethod
def get_tagger() -> nltk.UnigramTagger:
    """Build the backoff chain of taggers used for Spanish POS tagging.

    The chain is, from first consulted to last:

    1. a ``nltk.UnigramTagger`` trained on the ``cess_esp`` tagged sentences,
    2. a ``nltk.RegexpTagger`` driven by the word-ending rules below,
    3. a ``nltk.DefaultTagger`` that assigns ``"NOUN"``.

    Returns:
        The unigram tagger at the head of the chain.
    """
    # Word-ending rules. The more specific endings (e.g. "oso") are listed
    # before the generic ones (e.g. "o") that would also match them.
    suffix_rules = [
        (r".*é$", "VBD"),  # past verb
        (r".*ó$", "VBD"),  # past verb
        (r".*rán$", "VBD"),  # "-rán" is a future-tense ending, yet tagged VBD
        (r".*ando$", "VBG"),  # gerund
        (r".*iendo$", "VBG"),  # gerund
        (r".*endo$", "VBG"),  # gerund
        (r".*osa$", "ADJ"),  # adjective
        (r".*oso$", "ADJ"),  # adjective
        (r".*o$", "NOUN"),  # noun masculine singular
        (r".*os$", "NOUN"),  # noun masculine plural
        (r".*a$", "NOUN"),  # noun feminine singular
        (r".*as$", "NOUN"),  # noun feminine plural
    ]

    # Last resort when neither the corpus nor any suffix rule applies.
    fallback = nltk.DefaultTagger("NOUN")
    suffix_tagger = nltk.RegexpTagger(suffix_rules, backoff=fallback)

    # NOTE(review): NLTK documents the universal tagset under the name
    # "universal"; confirm that 'universal_tagset' is accepted here.
    training_sentences = [
        [(word, tag) for (word, tag) in sentence]
        for sentence in cess_esp.tagged_sents(tagset='universal_tagset')
    ]

    return nltk.UnigramTagger(training_sentences, backoff=suffix_tagger)
0 commit comments