|
61 | 61 | from types import ModuleType |
62 | 62 |
|
63 | 63 | from islenska.basics import Ksnid |
64 | | -from reynir import TOK, Greynir, Sentence, TokenList, _Job, correct_spaces |
| 64 | +from reynir import TOK, Greynir, Paragraph, Sentence, TokenList, _Job, correct_spaces |
65 | 65 | from reynir.binparser import BIN_Grammar, BIN_Parser, VariantHandler |
66 | 66 | from reynir.bintokenizer import StringIterable |
67 | 67 | from reynir.fastparser import ffi # type: ignore |
68 | 68 | from reynir.fastparser import Fast_Parser |
69 | 69 | from reynir.incparser import ICELANDIC_RATIO |
70 | 70 | from reynir.reducer import Reducer |
71 | | -from reynir.reynir import Job, ProgressFunc |
| 71 | +from reynir.reynir import Job, ProgressFunc, DEFAULT_MAX_SENT_TOKENS |
72 | 72 | from tokenizer import Abbreviations, Tok |
73 | 73 |
|
74 | 74 | from .settings import Settings |
75 | 75 | from .annotation import Annotation |
76 | 76 | from .errfinder import ErrorDetectionToken, ErrorFinder |
77 | | -from .errtokenizer import CorrectionPipeline, CorrectToken |
| 77 | +from .errtokenizer import CorrectionPipeline, CorrectToken, settings_or_default |
78 | 78 | from .pattern import PatternMatcher |
79 | 79 |
|
80 | 80 | # Style mark from BÍN: |
@@ -478,3 +478,88 @@ def parse_all_tokens(self, tokens: Iterable[Tok], *, progress_func: ProgressFunc |
478 | 478 | ambiguity=job.ambiguity, |
479 | 479 | parse_time=job.parse_time, |
480 | 480 | ) |
| 481 | + |
| 482 | + |
def check_single(
    sentence_text: str, rc: Optional[GreynirCorrect] = None, **options: Any
) -> Optional[AnnotatedSentence]:
    """Check and annotate one sentence supplied as plain text.

    The `max_sent_tokens` option, if present, is removed from `options`
    and forwarded to the parser; otherwise DEFAULT_MAX_SENT_TOKENS is used.
    When no `rc` instance is passed in, a GreynirCorrect object is created
    from the default settings, and the remaining options are handed to
    both the correction pipeline and the GreynirCorrect constructor.

    The result is None if no sentence was parsed."""
    token_limit = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    checker = rc
    if checker is None:
        # No checker supplied by the caller: build one with default settings
        default_settings = settings_or_default()
        checker = GreynirCorrect(
            default_settings,
            CorrectionPipeline("", default_settings, **options),
            **options,
        )
    parsed = checker.parse_single(sentence_text, max_sent_tokens=token_limit)
    return cast(AnnotatedSentence, parsed)
| 494 | + |
| 495 | + |
def check_tokens(
    tokens: Iterable[CorrectToken], rc: Optional[GreynirCorrect] = None, **options: Any
) -> Optional[Sentence]:
    """Check and annotate one sentence supplied as a list of tokens.

    The `max_sent_tokens` option, if present, is removed from `options`
    and forwarded to the parser; otherwise DEFAULT_MAX_SENT_TOKENS is used.
    When no `rc` instance is passed in, a GreynirCorrect object is created
    from the default settings, and the remaining options are handed to
    both the correction pipeline and the GreynirCorrect constructor.

    The result is None if no sentence was parsed."""
    token_limit = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    checker = rc
    if checker is None:
        # No checker supplied by the caller: build one with default settings
        default_settings = settings_or_default()
        checker = GreynirCorrect(
            default_settings,
            CorrectionPipeline("", default_settings, **options),
            **options,
        )
    return checker.parse_tokens(tokens, max_sent_tokens=token_limit)
| 507 | + |
| 508 | + |
def check(
    text: str, rc: Optional[GreynirCorrect] = None, **options: Any
) -> Iterable[Paragraph]:
    """Generate the checked paragraphs of the given text.

    The `split_paragraphs` (default False) and `max_sent_tokens`
    (default DEFAULT_MAX_SENT_TOKENS) options are removed from `options`
    and passed on to the parse job. When no `rc` instance is passed in,
    a GreynirCorrect object is created from the default settings, with
    the remaining options handed to both the correction pipeline and
    the GreynirCorrect constructor.

    This is a generator function: no work is done until the caller
    starts iterating over the result."""
    # Extract the job-level options before the rest are forwarded
    token_limit = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    do_split = options.pop("split_paragraphs", False)
    checker = rc
    if checker is None:
        # No checker supplied by the caller: build one with default settings
        default_settings = settings_or_default()
        checker = GreynirCorrect(
            default_settings,
            CorrectionPipeline("", default_settings, **options),
            **options,
        )
    # Submit an asynchronous (on-demand) parse job and relay its paragraphs
    parse_job = checker.submit(
        text,
        parse=True,
        split_paragraphs=do_split,
        max_sent_tokens=token_limit,
    )
    yield from parse_job.paragraphs()
| 529 | + |
| 530 | + |
def check_with_stats(
    text: str,
    *,
    settings: Optional[Settings] = None,
    split_paragraphs: bool = False,
    progress_func: ProgressFunc = None,
    **options: Any,
) -> CheckResult:
    """Check the given text and return the annotated sentences
    together with statistics about the parse.

    Args:
        text: The text to check.
        settings: Settings to use; passed through settings_or_default(),
            which presumably substitutes the defaults when None is given.
        split_paragraphs: Forwarded to the parse job.
        progress_func: Forwarded to the parse job.
        **options: The `max_sent_tokens` option (default
            DEFAULT_MAX_SENT_TOKENS) is removed and forwarded to the
            parse job; all remaining options are handed to both the
            correction pipeline and the GreynirCorrect constructor.

    Returns:
        A CheckResult with the checked sentences and the job's
        num_tokens, num_sentences, num_parsed, ambiguity and
        parse_time values."""
    settings = settings_or_default(settings)
    # Note: split_paragraphs is an explicit keyword-only parameter, so it
    # can never occur in **options. It must not be popped from there, as
    # that would silently replace the caller's value with False.
    max_sent_tokens = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    pipeline = CorrectionPipeline("", settings, **options)
    rc = GreynirCorrect(settings, pipeline, **options)
    # This is an asynchronous (on-demand) parse job
    job = rc.submit(
        text,
        parse=True,
        split_paragraphs=split_paragraphs,
        progress_func=progress_func,
        max_sent_tokens=max_sent_tokens,
    )
    # Enumerating through the job's paragraphs and sentences causes them
    # to be parsed and their statistics collected, so this must happen
    # before the statistics are read from the job below
    sentences = [
        cast(AnnotatedSentence, sent) for pg in job.paragraphs() for sent in pg
    ]
    return CheckResult(
        sentences=sentences,
        num_tokens=job.num_tokens,
        num_sentences=job.num_sentences,
        num_parsed=job.num_parsed,
        ambiguity=job.ambiguity,
        parse_time=job.parse_time,
    )
0 commit comments