Skip to content

Commit 0930f53

Browse files
committed
Update comment and revert change to gibberish.py #2402
Signed-off-by: Jono Yang <jyang@nexb.com>
1 parent 4524ffc commit 0930f53

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ WORKDIR /scancode-toolkit
3838
COPY . /scancode-toolkit
3939

4040
# Initial configuration using ./configure, scancode-reindex-licenses to build
41-
# the base license index and scancode-reindex-package-patterns to build the
42-
# package patterns cache
41+
# the base license index, scancode-reindex-package-patterns to build the package
42+
# patterns cache, and scancode-train-gibberish-model to train the Markov chain
43+
# model used for gibberish detection.
4344
RUN ./configure \
4445
&& ./venv/bin/scancode-reindex-licenses \
4546
&& ./venv/bin/scancode-reindex-package-patterns \

src/textcode/gibberish.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
1010
#
1111

12-
import io
1312
import math
1413
import pickle
1514
from pathlib import Path
@@ -32,11 +31,11 @@ def __init__(self):
3231
self.train()
3332

3433
def persist_model(self):
35-
with io.open(model_path, mode='wb') as f:
34+
with open(model_path, mode='wb') as f:
3635
pickle.dump(vars(self), f)
3736

3837
def load_persisted_model(self):
39-
with io.open(model_path, mode='rb') as f:
38+
with open(model_path, mode='rb') as f:
4039
persisted_model = pickle.load(f)
4140
for key, value in persisted_model.items():
4241
setattr(self, key, value)
@@ -75,7 +74,7 @@ def train(self, bigfile=big_file_path, goodfile=good_file_path,
7574

7675
# Count transitions from big text file, taken
7776
# from http://norvig.com/spell-correct.html
78-
for line in io.open(bigfile, encoding='utf-8'):
77+
for line in open(bigfile, encoding='utf-8'):
7978
for a, b in self.ngram(2, line):
8079
counts[pos[a]][pos[b]] += 1
8180

@@ -91,8 +90,8 @@ def train(self, bigfile=big_file_path, goodfile=good_file_path,
9190

9291
# Find the probability of generating a few arbitrarily choosen good and
9392
# bad phrases.
94-
good_probs = [self.avg_transition_prob(l, counts) for l in io.open(goodfile, encoding='utf-8')]
95-
bad_probs = [self.avg_transition_prob(l, counts) for l in io.open(badfile, encoding='utf-8')]
93+
good_probs = [self.avg_transition_prob(l, counts) for l in open(goodfile, encoding='utf-8')]
94+
bad_probs = [self.avg_transition_prob(l, counts) for l in open(badfile, encoding='utf-8')]
9695

9796
# Assert that we actually are capable of detecting the junk.
9897
assert min(good_probs) > max(bad_probs)

0 commit comments

Comments (0)