99# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
1010#
1111
12+ import io
1213import math
1314import pickle
1415from pathlib import Path
@@ -31,11 +32,11 @@ def __init__(self):
3132 self .train ()
3233
3334 def persist_model (self ):
34- with open (model_path , 'wb' ) as f :
35+ with io . open (model_path , mode = 'wb' ) as f :
3536 pickle .dump (vars (self ), f )
3637
3738 def load_persisted_model (self ):
38- with open (model_path , 'rb' ) as f :
39+ with io . open (model_path , mode = 'rb' ) as f :
3940 persisted_model = pickle .load (f )
4041 for key , value in persisted_model .items ():
4142 setattr (self , key , value )
@@ -74,7 +75,7 @@ def train(self, bigfile=big_file_path, goodfile=good_file_path,
7475
7576 # Count transitions from big text file, taken
7677 # from http://norvig.com/spell-correct.html
77- for line in open (bigfile ):
78+ for line in io . open (bigfile , encoding = 'utf-8' ):
7879 for a , b in self .ngram (2 , line ):
7980 counts [pos [a ]][pos [b ]] += 1
8081
@@ -90,8 +91,8 @@ def train(self, bigfile=big_file_path, goodfile=good_file_path,
9091
9192 # Find the probability of generating a few arbitrarily chosen good and
9293 # bad phrases.
93- good_probs = [self .avg_transition_prob (l , counts ) for l in open (goodfile )]
94- bad_probs = [self .avg_transition_prob (l , counts ) for l in open (badfile )]
94+ good_probs = [self .avg_transition_prob (l , counts ) for l in io . open (goodfile , encoding = 'utf-8' )]
95+ bad_probs = [self .avg_transition_prob (l , counts ) for l in io . open (badfile , encoding = 'utf-8' )]
9596
9697 # Assert that we actually are capable of detecting the junk.
9798 assert min (good_probs ) > max (bad_probs )
0 commit comments