Skip to content

Commit 7e1de7a

Browse files
committed
Splitting "dajžto".
1 parent 89e110b commit 7e1de7a

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

udapi/block/ud/cs/addmwt.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,22 @@ def multiword_analysis(self, node):
158158
'main': 0,
159159
'shape': 'subtree',
160160
}
161+
# dajžto = dajž + to
162+
if subtokens[1] == 'to':
163+
if token_from_subtokens != node.form:
164+
logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form))
165+
return None
166+
node.misc['AddMwt'] = ''
167+
return {
168+
'form': subtokens[0] + ' ' + subtokens[1],
169+
'lemma': '* ten',
170+
'upos': '* DET',
171+
'xpos': '* PDNS4----------',
172+
'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem',
173+
'deprel': '* obj',
174+
'main': 0,
175+
'shape': 'subtree',
176+
}
161177
# Contractions of prepositions and pronouns almost could be processed
162178
# regardless of AddMwt instructions by the annotator, but we still
163179
# require it to be on the safe side. For example, both 'přědeň' and

0 commit comments

Comments
 (0)