Skip to content

Commit 0930f53

Browse files
committed
Update comment and revert change to gibberish.py #2402
Signed-off-by: Jono Yang <jyang@nexb.com>
1 parent 4524ffc commit 0930f53

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ WORKDIR /scancode-toolkit
3838
COPY . /scancode-toolkit
3939

4040
# Initial configuration using ./configure, scancode-reindex-licenses to build
41-
# the base license index and scancode-reindex-package-patterns to build the
42-
# package patterns cache
41+
# the base license index, scancode-reindex-package-patterns to build the package
42+
# patterns cache, and scancode-train-gibberish-model to train the Markov chain
43+
# model used for gibberish detection.
4344
RUN ./configure \
4445
&& ./venv/bin/scancode-reindex-licenses \
4546
&& ./venv/bin/scancode-reindex-package-patterns \

src/textcode/gibberish.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
1010
#
1111

12-
import io
1312
import math
1413
import pickle
1514
from pathlib import Path
@@ -32,11 +31,11 @@ def __init__(self):
3231
self.train()
3332

3433
def persist_model(self):
35-
with io.open(model_path, mode='wb') as f:
34+
with open(model_path, mode='wb') as f:
3635
pickle.dump(vars(self), f)
3736

3837
def load_persisted_model(self):
39-
with io.open(model_path, mode='rb') as f:
38+
with open(model_path, mode='rb') as f:
4039
persisted_model = pickle.load(f)
4140
for key, value in persisted_model.items():
4241
setattr(self, key, value)
@@ -75,7 +74,7 @@ def train(self, bigfile=big_file_path, goodfile=good_file_path,
7574

7675
# Count transitions from big text file, taken
7776
# from http://norvig.com/spell-correct.html
78-
for line in io.open(bigfile, encoding='utf-8'):
77+
for line in open(bigfile, encoding='utf-8'):
7978
for a, b in self.ngram(2, line):
8079
counts[pos[a]][pos[b]] += 1
8180

@@ -91,8 +90,8 @@ def train(self, bigfile=big_file_path, goodfile=good_file_path,
9190

9291
# Find the probability of generating a few arbitrarily choosen good and
9392
# bad phrases.
94-
good_probs = [self.avg_transition_prob(l, counts) for l in io.open(goodfile, encoding='utf-8')]
95-
bad_probs = [self.avg_transition_prob(l, counts) for l in io.open(badfile, encoding='utf-8')]
93+
good_probs = [self.avg_transition_prob(l, counts) for l in open(goodfile, encoding='utf-8')]
94+
bad_probs = [self.avg_transition_prob(l, counts) for l in open(badfile, encoding='utf-8')]
9695

9796
# Assert that we actually are capable of detecting the junk.
9897
assert min(good_probs) > max(bad_probs)

0 commit comments

Comments (0)