Re-introduced legacy APIs; fixed bugs; improved capitalization in suggestions

vthorsteinsson · vthorsteinsson · commit 842c5339efb5 · 2025-05-05T22:21:16.000Z
diff --git a/src/reynir_correct/__init__.py b/src/reynir_correct/__init__.py
@@ -38,13 +38,26 @@
 from .annotation import Annotation
 
 # Grammar checking
-from .checker import AnnotatedSentence, GreynirCorrect
+from .checker import (
+    AnnotatedSentence,
+    GreynirCorrect,
+    check,
+    check_single,
+    check_with_stats,
+    check_tokens,
+)
 
 # Token-level correction
 from .errtokenizer import Correct_TOK, CorrectionPipeline, CorrectToken, tokenize
 from .readability import FleschKincaidFeedback, FleschKincaidScorer, RareWordsFinder
 from .settings import Settings
-from .wrappers import CorrectedSentence, CorrectionResult, GreynirCorrectAPI, ParseResultStats, check_errors
+from .wrappers import (
+    CorrectedSentence,
+    CorrectionResult,
+    GreynirCorrectAPI,
+    ParseResultStats,
+    check_errors,
+)
 
 __author__ = "Miðeind ehf"
 __copyright__ = "© 2025 Miðeind ehf."
@@ -70,6 +83,10 @@
     "GreynirCorrectAPI",
     "CorrectionResult",
     "CorrectedSentence",
+    "check",
+    "check_single",
+    "check_with_stats",
+    "check_tokens",
     "check_errors",
     "AnnotatedSentence",
     "Annotation",
diff --git a/src/reynir_correct/annotation.py b/src/reynir_correct/annotation.py
@@ -85,13 +85,14 @@ def __str__(self) -> str:
             orig_sugg = f" | '{self._original}' -> '{self._suggest}'"
         else:
             orig_sugg = ""
-        return "{0:03}-{1:03}: {2:6} {3}{4} | {5}".format(
+        sugg_list = f" | {self._suggestlist}" if self._suggestlist else ""
+        return "{0:03}-{1:03}: {2:20} {3}{4}{5}".format(
             self._start,
             self._end,
             self._code,
             self._text,
             orig_sugg,
-            self._suggestlist,
+            sugg_list,
         )
 
     @property
diff --git a/src/reynir_correct/checker.py b/src/reynir_correct/checker.py
@@ -61,20 +61,20 @@
 from types import ModuleType
 
 from islenska.basics import Ksnid
-from reynir import TOK, Greynir, Sentence, TokenList, _Job, correct_spaces
+from reynir import TOK, Greynir, Paragraph, Sentence, TokenList, _Job, correct_spaces
 from reynir.binparser import BIN_Grammar, BIN_Parser, VariantHandler
 from reynir.bintokenizer import StringIterable
 from reynir.fastparser import ffi  # type: ignore
 from reynir.fastparser import Fast_Parser
 from reynir.incparser import ICELANDIC_RATIO
 from reynir.reducer import Reducer
-from reynir.reynir import Job, ProgressFunc
+from reynir.reynir import Job, ProgressFunc, DEFAULT_MAX_SENT_TOKENS
 from tokenizer import Abbreviations, Tok
 
 from .settings import Settings
 from .annotation import Annotation
 from .errfinder import ErrorDetectionToken, ErrorFinder
-from .errtokenizer import CorrectionPipeline, CorrectToken
+from .errtokenizer import CorrectionPipeline, CorrectToken, settings_or_default
 from .pattern import PatternMatcher
 
 # Style mark from BÍN:
@@ -478,3 +478,88 @@ def parse_all_tokens(self, tokens: Iterable[Tok], *, progress_func: ProgressFunc
             ambiguity=job.ambiguity,
             parse_time=job.parse_time,
         )
+
+
+def check_single(
+    sentence_text: str, rc: Optional[GreynirCorrect] = None, **options: Any
+) -> Optional[AnnotatedSentence]:
+    """Check and annotate a single plain-text sentence, using rc or a default checker"""
+    # Returns None if no sentence was parsed; other options only configure the default checker
+    max_sent_tokens = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
+    if rc is None:
+        settings = settings_or_default()
+        pipeline = CorrectionPipeline("", settings, **options)
+        rc = GreynirCorrect(settings, pipeline, **options)
+    return cast(AnnotatedSentence, rc.parse_single(sentence_text, max_sent_tokens=max_sent_tokens))
+
+
+def check_tokens(
+    tokens: Iterable[CorrectToken], rc: Optional[GreynirCorrect] = None, **options: Any
+) -> Optional[Sentence]:
+    """Check and annotate a single sentence given as tokens, using rc or a default checker"""
+    # Returns None if no sentence was parsed; other options only configure the default checker
+    max_sent_tokens = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
+    if rc is None:
+        settings = settings_or_default()
+        pipeline = CorrectionPipeline("", settings, **options)
+        rc = GreynirCorrect(settings, pipeline, **options)
+    return rc.parse_tokens(tokens, max_sent_tokens=max_sent_tokens)
+
+
+def check(
+    text: str, rc: Optional[GreynirCorrect] = None, **options: Any
+) -> Iterable[Paragraph]:
+    """Yield the checked paragraphs of the given text, each being a
+    generator of checked sentences with annotations. Uses rc if given,
+    else a checker built from the cached default settings and options."""
+    split_paragraphs = options.pop("split_paragraphs", False)
+    max_sent_tokens = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
+    if rc is None:
+        settings = settings_or_default()
+        pipeline = CorrectionPipeline("", settings, **options)
+        rc = GreynirCorrect(settings, pipeline, **options)
+    # This is an on-demand parse job: paragraphs are produced as they are iterated
+    job = rc.submit(
+        text,
+        parse=True,
+        split_paragraphs=split_paragraphs,
+        max_sent_tokens=max_sent_tokens,
+    )
+    yield from job.paragraphs()
+
+
+def check_with_stats(
+    text: str,
+    *,
+    settings: Optional[Settings] = None,
+    split_paragraphs: bool = False,
+    progress_func: ProgressFunc = None,
+    **options: Any,
+) -> CheckResult:
+    """Return a CheckResult holding the checked sentences of the text
+    as well as parse statistics (token and sentence counts, ambiguity,
+    parse time), using the given settings or the cached defaults."""
+    settings = settings_or_default(settings)
+    # NB: split_paragraphs is a named parameter, so it can never be in options
+    max_sent_tokens = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
+    pipeline = CorrectionPipeline("", settings, **options)
+    rc = GreynirCorrect(settings, pipeline, **options)
+    # This is an asynchronous (on-demand) parse job
+    job = rc.submit(
+        text,
+        parse=True,
+        split_paragraphs=split_paragraphs,
+        progress_func=progress_func,
+        max_sent_tokens=max_sent_tokens,
+    )
+    # Enumerating through the job's paragraphs and sentences causes them
+    # to be parsed and their statistics collected
+    sentences = [cast(AnnotatedSentence, sent) for pg in job.paragraphs() for sent in pg]
+    return CheckResult(
+        sentences=sentences,
+        num_tokens=job.num_tokens,
+        num_sentences=job.num_sentences,
+        num_parsed=job.num_parsed,
+        ambiguity=job.ambiguity,
+        parse_time=job.parse_time,
+    )
diff --git a/src/reynir_correct/errtokenizer.py b/src/reynir_correct/errtokenizer.py
@@ -217,6 +217,9 @@ class TemplateDict(TypedDict):
 
 _ErrorClass = TypeVar("_ErrorClass", bound=ErrorType)
 
+# Cached settings for simple (legacy) API
+_cached_settings: Optional[Settings] = None
+
 
 def load_config(tov_config_path: Optional[str] = None) -> Settings:
     """Load the default configuration file and return a Settings object. Optionally load
@@ -239,10 +242,19 @@ def register_error_class(cls: _ErrorClass) -> _ErrorClass:
 
 def emulate_case(s: str, *, template: str) -> str:
     """Return the string s but emulating the case of the template
-    (lower/upper/capitalized)"""
+    (lower/upper/capitalized), word by word for multi-word templates ('Hesturinn Skjóni')"""
+    s_list = s.split()
+    if len(s_list) > 1:
+        template_list = template.split()
+        if len(s_list) == len(template_list):
+            # Same word count: emulate the case of each template word separately
+            return " ".join(
+                emulate_case(word, template=template_word) for word, template_word in zip(s_list, template_list)
+            )
     if template.isupper():
         return s.upper()
     if template and template[0].isupper():
+        # The template starts with an uppercase letter: capitalize s as a whole
         return s.capitalize()
     return s
 
@@ -2677,7 +2689,6 @@ def late_fix_merges(
 
 
 def create_template_dict(
-    settings: Settings,
     explanation: str,
     explanation_w_sugg: str,
     error_warning: Type[ToneOfVoiceWarning] | Type[TabooWarning],
@@ -2700,15 +2711,12 @@ def check_wording(
     """Annotate words to be flagged, with warnings. Here we check for both taboo words and
     tone of voice issues as determined by an additional config, if given."""
     taboo_data = create_template_dict(
-        settings,
         "Óheppilegt eða óviðurkvæmilegt orð",
         "Óheppilegt eða óviðurkvæmilegt orð, skárra væri t.d. ",
         TabooWarning,
         settings.taboo_words.DICT,
     )
-
     tone_of_voice_data = create_template_dict(
-        settings,
         "Orðið er ekki í samræmi við raddblæ okkar",
         "Orðið er ekki í samræmi við raddblæ okkar, í staðinn gætirðu notað",
         ToneOfVoiceWarning,
@@ -3058,7 +3066,6 @@ def check_spelling(self, stream: TokenIterator) -> TokenIterator:
         err_codes = {"T001/w", "T001", "V001/w", "V001"}
         if not only_ci and all(code not in ignore_rules for code in err_codes):
             ct_stream = check_wording(ct_stream, self.settings, self._db, self._suggest_not_correct)
-
         # Check context-independent style errors, indicated in BÍN
         ct_stream = check_style(ct_stream, self._db, ignore_rules)
         return ct_stream
@@ -3079,23 +3086,27 @@ def final_correct(self, stream: TokenIterator) -> TokenIterator:
             self._suppress_suggestions,
             self.settings,
         )
-
         ct_stream = late_fix_merges(ct_stream, self._ignore_wordlist, self._ignore_rules)
         return ct_stream
 
-_cached_settings: Optional[Settings] = None
+
+def settings_or_default(settings: Optional[Settings] = None) -> Settings:
+    """Return the given settings, or else a lazily created, module-level cached default"""
+    if settings is not None:
+        # An explicitly provided settings object always takes precedence
+        return settings
+    global _cached_settings
+    if _cached_settings is None:
+        # First use: load the default configuration and cache it for later calls
+        _cached_settings = load_config()
+    return _cached_settings
+
 
 def tokenize(
     text_or_gen: StringIterable, *, settings: Optional[Settings] = None, **options: Any
 ) -> Iterator[CorrectToken]:
     """Tokenize text using the correction pipeline,
     overriding a part of the default tokenization pipeline"""
-    if settings is None:
-        global _cached_settings
-        settings = _cached_settings
-        if settings is None:
-            # Create a new settings object if none is provided
-            settings = load_config()
-            _cached_settings = settings
+    settings = settings_or_default(settings)
     pipeline = CorrectionPipeline(text_or_gen, settings, **options)
     return cast(Iterator[CorrectToken], pipeline.tokenize())
diff --git a/src/reynir_correct/main.py b/src/reynir_correct/main.py
@@ -166,7 +166,7 @@
 def from_args(args: argparse.Namespace) -> Dict[str, Union[str, bool]]:
     """Fill options with information from args"""
     format = args.format
-    if args.json:
+    if args.json or args.grammar:  # The --grammar option implies --json
         format = "json"
     elif args.csv:
         format = "csv"
diff --git a/test.py b/test.py
@@ -1,17 +1,22 @@
-# type: ignore
+
 
 import sys
+from typing import cast
 import reynir_correct as rc
+from reynir_correct.checker import AnnotatedSentence
 
 
+"""
 from reynir_correct import check_single
 sent = check_single("Páli, vini mínum, langaði að horfa á sjónnvarpið.")
-for annotation in sent.annotations:
-    print("{0}".format(annotation))
+if sent:
+    for annotation in sent.annotations:
+        print("{0}".format(annotation))
 
 sys.exit(0)
+"""
 
-def display_annotations(sent):
+def display_annotations(sent: rc.AnnotatedSentence):
     print("\nSetning:")
     print(sent.text)
     print("\nNiðurstaða tókunar:")
@@ -37,7 +42,7 @@ def display_annotations(sent):
 print("\nUpphaflegur texti: '{0}'".format(txt))
 for pg in rc.check(txt, split_paragraphs=True):
     for sent in pg:
-        display_annotations(sent)
+        display_annotations(cast(AnnotatedSentence, sent))
     print("---")
 
 sys.exit(0)
@@ -51,7 +56,7 @@ def display_annotations(sent):
     c = Corrector(db)  # type: Corrector
 
 
-def test(c, word):
+def test(c: Corrector, word: str) -> None:
     t0 = time.time()
     result = list(c.subs(word))
     valid = [r for r in result if r in c]