diff --git a/bin/demeuk.py b/bin/demeuk.py
index 6cfd179..3a6540a 100755
--- a/bin/demeuk.py
+++ b/bin/demeuk.py
@@ -148,9 +148,14 @@
from glob import glob
from html import unescape
from inspect import cleandoc
-from locale import LC_ALL, setlocale
+from locale import LC_ALL, setlocale, getlocale
from math import ceil
-from multiprocessing import cpu_count, Pool
+from multiprocessing import cpu_count
+from os import name as os_name
+if os_name == 'nt':
+ from multiprocessing.pool import ThreadPool as Pool
+else:
+ from multiprocessing import Pool
from os import linesep, access, path, R_OK, F_OK, W_OK
from re import compile as re_compile
from re import search
@@ -647,9 +652,7 @@ def check_empty_line(line):
Returns:
true of line is empty or only contains whitespace chars
"""
- if line == '':
- return True
- elif line.isspace():
+ if line == '' or line.isspace():
return True
return False
@@ -1375,7 +1378,7 @@ def chunkify(filename, size=CHUNK_SIZE):
fh.readline()
while True:
- lines = [line.rstrip(b'\n') for line in fh.readlines(size)]
+ lines = [line.rstrip(linesep.encode()) for line in fh.readlines(size)]
yield lines
if len(lines) == 0:
break
@@ -1388,6 +1391,16 @@ def stderr_print(*args, **kwargs):
print(*args, **kwargs)
+def init_worker(config_data):
+ global config
+ config = config_data
+
+ try:
+ signal(SIGINT, SIG_IGN)
+ except ValueError:
+ pass # signal() only works in the main thread; ThreadPool workers are threads
+
+
def main():
#
# Config parser
@@ -1507,10 +1520,9 @@ def main():
if arguments.get('--input-encoding'):
config['input_encoding'] = arguments.get('--input-encoding').split(',')
+ setlocale(LC_ALL, 'en_US.UTF-8')
if arguments.get('--output-encoding'):
setlocale(LC_ALL, arguments.get('--output-encoding'))
- else:
- setlocale(LC_ALL, 'en_US.UTF-8')
if arguments.get('--punctuation'):
config['punctuation'] = arguments.get('--punctuation')
@@ -1741,13 +1753,14 @@ def main():
stderr_print('Main: done chunking file.')
stderr_print('Main: processing started.')
+ encoding = getlocale()[1]
if output_file:
- p_output_file = open(output_file, 'w')
+ p_output_file = open(output_file, 'w', encoding=encoding, newline='')
else:
p_output_file = stdout
if log_file:
- p_log_file = open(log_file, 'a')
+ p_log_file = open(log_file, 'a', encoding=encoding, newline='')
else:
p_log_file = stderr
@@ -1764,9 +1777,6 @@ def write_results_and_log(async_result):
write_results(async_result['results'])
write_log(async_result['log'])
- def init_worker():
- signal(SIGINT, SIG_IGN)
-
def process_jobs(chunk_start):
# Cut file in to chunks and process each trunk multi-threaded
while True:
@@ -1791,7 +1801,7 @@ def process_jobs(chunk_start):
sleep(1)
write_log(f'Running demeuk - {version}{linesep}')
- with Pool(a_threads, init_worker) as pool:
+ with Pool(a_threads, init_worker, initargs=(config,)) as pool:
jobs = []
# chunk_start will be the started value of the combined output lines
chunk_start = 0
diff --git a/requirements.txt b/requirements.txt
index 7ac56f4..cfcfae2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
docopt
-chardet
+chardet==5.2.0
nltk
ftfy
unidecode
diff --git a/tests/conftest.py b/tests/conftest.py
index 1d46472..c327e9c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -43,7 +43,7 @@
file.write('line'.encode('utf-8') + b'\x09' + f'entry{linesep}'.encode('utf-8'))
file.write('line2'.encode('utf-8') + b'\x09\x09' + f'entry2{linesep}'.encode('utf-8'))
-with open('testdata/input5', 'w') as file:
+with open('testdata/input5', 'w', encoding='utf-8') as file:
file.write(f'line1{linesep}')
file.write(f'line2{linesep}')
file.write(f'line3{linesep}')
@@ -51,7 +51,7 @@
file.write(f'email@example.com;line5{linesep}')
file.write(f'test:email@example.com:line6{linesep}')
-with open('testdata/input6', 'w') as file:
+with open('testdata/input6', 'w', encoding='utf-8') as file:
file.write(f'I\'Afrique_ADJ occidental_ADJ\t1927\t2\t2{linesep}')
file.write(f'I\'Allemagne )\t2009\t1\t1{linesep}')
file.write(f'I\'ain _VERB_\t2009\t2\t2{linesep}')
@@ -64,7 +64,7 @@
file.write(b'\x4C\x41\x4E\x43\x49\x41\x20\x41\x55\x52\x45\x4C\x49\x41\x20\x42\x32\x30\x20\x43\x4F\x55\x50\xC3\x83\xC2\x89\x20\x47\x54\x0A') # noqa: E501
-with open('testdata/input8', 'w') as file:
+with open('testdata/input8', 'w', encoding='utf-8') as file:
file.write(f'test@example.com:password1{linesep}')
file.write(f'test@sub.example.com:password2{linesep}')
file.write(f'test@example.ugur:password3{linesep}')
@@ -76,12 +76,12 @@
# Russian
file.write(f'!!!ееместной%%@!{linesep}'.encode('WINDOWS-1251'))
-with open('testdata/input10', 'w') as file:
+with open('testdata/input10', 'w', encoding='utf-8') as file:
file.write(f'cijfer/Aa{linesep}')
file.write(f'3M/Aa{linesep}')
file.write(f'VERYVERYVERYVERYVERYVERYLONGLINE?{linesep}')
-with open('testdata/input11', 'w') as file:
+with open('testdata/input11', 'w', encoding='utf-8') as file:
file.write(f'cijfer/Aa{linesep}')
file.write(f'3M-test/Aa{linesep}')
file.write(f'St. Maarten{linesep}')
@@ -92,39 +92,39 @@
with open('testdata/input12', 'wb') as file:
file.write(b'\x57\x65\x73\x74\x2D\x46\x72\x79\x73\x6C\xC2\x89\x6E' + f'{linesep}'.encode('utf-8'))
-with open('testdata/input13', 'w') as file:
+with open('testdata/input13', 'w', encoding='utf-8') as file:
file.write(f'field1:field2:field3:field4:field5:field6:field7{linesep}')
file.write(f'onefield{linesep}')
-with open('testdata/input14', 'w') as file:
+with open('testdata/input14', 'w', encoding='utf-8') as file:
file.write(f'field1:field2:field3:field4:field5:field6:field7{linesep}')
file.write(f'onefield{linesep}')
-with open('testdata/input15', 'w') as file:
+with open('testdata/input15', 'w', encoding='utf-8') as file:
file.write(f'$HEX[5045d141524f4c]{linesep}')
file.write(f'$HEX[51574552545955494f50c5]{linesep}')
file.write(f'$HEX[5a73f3666932303030]{linesep}')
file.write(f'$HEX[617261f16173]{linesep}')
-with open('testdata/input16', 'w') as file:
+with open('testdata/input16', 'w', encoding='utf-8') as file:
file.write(f'İSMAİL{linesep}')
file.write(f'İSTANBUL{linesep}')
file.write(f'şifreyok{linesep}')
file.write(f'>{linesep}')
file.write(f'α{linesep}')
-with open('testdata/input17', 'w') as file:
+with open('testdata/input17', 'w', encoding='utf-8') as file:
file.write(f'İSMAİL{linesep}')
file.write(f'İSTANBUL{linesep}')
file.write(f'şifreyok{linesep}')
file.write(f'>{linesep}')
file.write(f'α{linesep}')
-with open('testdata/input18', 'w') as file:
+with open('testdata/input18', 'w', encoding='utf-8') as file:
file.write(f'field1:field2:field3:field4:field5:field6:field7{linesep}')
file.write(f'onefield{linesep}')
-with open('testdata/input19', 'w') as file:
+with open('testdata/input19', 'w', encoding='utf-8') as file:
file.write(f'line01{linesep}')
file.write(f'line02{linesep}')
file.write(f'line03{linesep}')
@@ -136,24 +136,24 @@
file.write(f'line09{linesep}')
file.write(f'line10{linesep}')
-with open('testdata/input20', 'w') as file:
+with open('testdata/input20', 'w', encoding='utf-8') as file:
file.write(f'Eselsbru"cke{linesep}')
file.write(f'Fremdscha"men{linesep}')
file.write(f'KA"SEHOCH{linesep}')
-with open('testdata/input21', 'w') as file:
+with open('testdata/input21', 'w', encoding='utf-8') as file:
file.write(f'user;password{linesep}')
file.write(f'user2:password2{linesep}')
file.write(f'user3----password3{linesep}')
-with open('testdata/input22', 'w') as file:
+with open('testdata/input22', 'w', encoding='utf-8') as file:
file.write(f'line1@example{linesep}')
file.write(f'line2@example.com{linesep}')
file.write(f'line3@ex-ample.com{linesep}')
file.write(f'line4@ex.ample.com{linesep}')
file.write(f'test@example.com:line5{linesep}')
-with open('testdata/input23', 'w') as file:
+with open('testdata/input23', 'w', encoding='utf-8') as file:
file.write(f'line1@example.com:baabe00a81fc405af4ab9b0f99615498{linesep}')
file.write(f'line2@example.com:$h$7/uhfibmxg83yq6y1rh5y9wjee13kh.{linesep}')
file.write(f'line3@example.com:$6$/fasjdfsadj$safjasdfasjdfasdjf/asdfsadfasdfasdfas/fadsfasdfa{linesep}')
@@ -167,46 +167,46 @@
file.write(f'$H$8abc{linesep}')
file.write(f'$pizza$like{linesep}')
-with open('testdata/input24', 'w') as file:
+with open('testdata/input24', 'w', encoding='utf-8') as file:
file.write(f'line1@example.com,angus{linesep}')
file.write(f'line2@example.com:snow{linesep}')
file.write(f'line3@example.com:julia{linesep}')
-with open('testdata/input25', 'w') as file:
+with open('testdata/input25', 'w', encoding='utf-8') as file:
file.write(f'laténight{linesep}')
file.write(f'thestrokes{linesep}')
-with open('testdata/input26', 'w') as file:
+with open('testdata/input26', 'w', encoding='utf-8') as file:
file.write(f'polopaç{linesep}')
file.write(f'mündster{linesep}')
-with open('testdata/input27', 'w') as file:
+with open('testdata/input27', 'w', encoding='utf-8') as file:
file.write(f'rip-it.up{linesep}')
file.write(f'orange juice{linesep}')
-with open('testdata/input28', 'w') as file:
+with open('testdata/input28', 'w', encoding='utf-8') as file:
file.write(f'stand_by_me{linesep}')
file.write(f'the clash{linesep}')
-with open('testdata/input29', 'w') as file:
+with open('testdata/input29', 'w', encoding='utf-8') as file:
file.write(f'stand_by_me{linesep}')
file.write(f'the clash{linesep}')
-with open('testdata/input31', 'w') as file:
+with open('testdata/input31', 'w', encoding='utf-8') as file:
file.write(f'{linesep}')
-with open('testdata/input32', 'w') as file:
+with open('testdata/input32', 'w', encoding='utf-8') as file:
file.write(f'$1$2$3$4{linesep}')
file.write(f'$1$money$1${linesep}')
file.write(f'$1$ilovepizza{linesep}')
file.write(f'$1$1+l0l$aaaaaaaaaaaa./{linesep}')
file.write(f'$4$4$4pizza{linesep}')
-with open('testdata/input33', 'w') as file:
+with open('testdata/input33', 'w', encoding='utf-8') as file:
file.write(f'invalidstring�{linesep}')
file.write(f'jungejunge{linesep}')
-with open('testdata/input34', 'w') as file:
+with open('testdata/input34', 'w', encoding='utf-8') as file:
file.write(f'P@ssw0rd.1{linesep}')
file.write(f'bar@example.com{linesep}')
file.write(f'cr@ssT0rd{linesep}')
@@ -216,7 +216,7 @@
file.write(f'p@ssW0rd.me@Home{linesep}')
file.write(f'w@ssB0rd.we{linesep}')
-with open('testdata/input35', 'w') as file:
+with open('testdata/input35', 'w', encoding='utf-8') as file:
file.write(f'Avocado{linesep}')
file.write(f'Banana\\r\\n{linesep}')
file.write(f'Coconut\\n{linesep}')
@@ -228,7 +228,7 @@
file.write(f'Icaco
{linesep}')
file.write(f'
Jambul{linesep}')
-with open('testdata/input36', 'w') as file:
+with open('testdata/input36', 'w', encoding='utf-8') as file:
file.write(f'angleball\\r{linesep}')
file.write(f'badminton\\n{linesep}')
file.write(f'crossminton
{linesep}')
@@ -247,13 +247,13 @@
file.write(f'
tchoukball{linesep}')
file.write(f'vigoro{linesep}')
-with open('testdata/input37', 'w') as file:
+with open('testdata/input37', 'w', encoding='utf-8') as file:
file.write(f'$HEX[e]tiredofwaiting{linesep}')
file.write(f'$hex[6C6F73696E67746F756368]{linesep}')
file.write(f'$HEX[6C657469746B69636B696E]123!{linesep}')
file.write(f'$HEX[eee]{linesep}')
-with open('testdata/input38', 'w') as file:
+with open('testdata/input38', 'w', encoding='utf-8') as file:
file.write(f'112345678{linesep}')
file.write(f'#firstlovesong{linesep}')
file.write(f'/secondlovesong{linesep}')
@@ -261,7 +261,7 @@
file.write(f'\tcaliforniastars{linesep}')
file.write(f'lastlovesong{linesep}')
-with open('testdata/input39', 'w') as file:
+with open('testdata/input39', 'w', encoding='utf-8') as file:
file.write(f'112345678{linesep}')
file.write(f'#firstlovesong{linesep}')
file.write(f'/secondlovesong{linesep}')
@@ -269,7 +269,7 @@
file.write(f'\tcaliforniastars{linesep}')
file.write(f'lastlovesong{linesep}')
-with open('testdata/input40', 'w') as file:
+with open('testdata/input40', 'w', encoding='utf-8') as file:
file.write(f'112345678{linesep}')
file.write(f'#firstlovesong{linesep}')
file.write(f'/secondlovesong{linesep}')
@@ -277,21 +277,21 @@
file.write(f'\tcaliforniastars{linesep}')
file.write(f'lastlovesong{linesep}')
-with open('testdata/input41', 'w') as file:
+with open('testdata/input41', 'w', encoding='utf-8') as file:
file.write(f'dummy{linesep}')
file.write(f'2C:C5:D3:70:78:2c{linesep}')
-with open('testdata/input42', 'w') as file:
+with open('testdata/input42', 'w', encoding='utf-8') as file:
file.write(f'dummy{linesep}')
file.write(f'd4662e44-00f1-4ef6-857e-76e3c61604cd{linesep}')
file.write(f'D4662E44-00F1-4EF6-857E-76E3C61604CD{linesep}')
-with open('testdata/input43', 'w') as file:
+with open('testdata/input43', 'w', encoding='utf-8') as file:
file.write(f'dummy{linesep}')
file.write(f'test.jpg{linesep}')
file.write(f'hello@whatsapp.com{linesep}')
-with open('testdata/input44', 'w') as file:
+with open('testdata/input44', 'w', encoding='utf-8') as file:
file.write(f'3 doors down{linesep}')
with open('testdata/input45', 'wb') as file:
@@ -321,7 +321,7 @@
file.write(f'{linesep}'.encode('utf-8'))
-with open('testdata/input46', 'w') as file:
+with open('testdata/input46', 'w', encoding='utf-8') as file:
file.write(f'abc{linesep}')
file.write(f'abcd{linesep}')
file.write(f'a{linesep}')
@@ -329,15 +329,15 @@
file.write(f'aBc{linesep}')
file.write(f'123{linesep}')
-with open('testdata/input47', 'w') as file:
+with open('testdata/input47', 'w', encoding='utf-8') as file:
file.write(f'alpha{linesep}')
file.write(f'alpha123{linesep}')
file.write(f'alpha1234!{linesep}')
-with open('testdata/input48', 'w') as file:
+with open('testdata/input48', 'w', encoding='utf-8') as file:
file.write(f'3 Doors Down{linesep}')
-with open('testdata/input49', 'w') as file:
+with open('testdata/input49', 'w', encoding='utf-8') as file:
# no digits
file.write(f'nodigits{linesep}')
# digit at start
@@ -351,7 +351,7 @@
# multiple digits
file.write(f'pw123!{linesep}')
-with open('testdata/input50', 'w') as file:
+with open('testdata/input50', 'w', encoding='utf-8') as file:
# no uppercase
file.write(f'noupper{linesep}')
# uppercase at start
@@ -365,7 +365,7 @@
# multiple uppercase
file.write(f'ThisIsUpperCase!!!{linesep}')
-with open('testdata/input51', 'w') as file:
+with open('testdata/input51', 'w', encoding='utf-8') as file:
# no special
file.write(f'NoSpecialsHere{linesep}')
# special at start
@@ -382,22 +382,22 @@
# a combination of U+1F64C (raising hands) and U+1F3FD (skin tone)
file.write(f'8bytesemoji*4🙌🏽🙌🏽🙌🏽🙌🏽{linesep}')
-with open('testdata/input52', 'w') as file:
+with open('testdata/input52', 'w', encoding='utf-8') as file:
file.write(f'three doors down {linesep}')
file.write(f'amsterdam {linesep}')
file.write(f'ROTTERDAM {linesep}')
file.write(f'Cookie Monster {linesep}')
-with open('testdata/input53', 'w') as file:
+with open('testdata/input53', 'w', encoding='utf-8') as file:
file.write(f'three_down {linesep}')
file.write(f'_amsterdam {linesep}')
file.write(f'ROTTERDAM_ {linesep}')
file.write(f'Cookie Monster {linesep}')
-with open('testdata/input54', 'w') as file:
+with open('testdata/input54', 'w', encoding='utf-8') as file:
file.write(f'Golf Trip{linesep}')
file.write(f'Sequences{linesep}')
-with open('testdata/input55', 'w') as file:
+with open('testdata/input55', 'w', encoding='utf-8') as file:
file.write(f'здраво пријатељу{linesep}')
file.write(f'жута банана{linesep}')
diff --git a/tests/test_app.py b/tests/test_app.py
index 4302f26..7b007d9 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -1,5 +1,6 @@
import sys
from subprocess import PIPE, run
+from os import name, linesep
from unittest.mock import patch
from pytest import raises, mark
@@ -27,7 +28,7 @@ def test_demeuk():
assert line_num_log1 == 5
assert line_num_output1 == 9
assert line_num_input1 == (line_num_output1 + line_num_log1 - 1)
- with open('testdata/output1') as file:
+ with open('testdata/output1', encoding='utf-8') as file:
filecontent = file.read()
assert 'Password123!@"\n' in filecontent
assert 'ǓǝǪǼȧɠ\n' in filecontent
@@ -61,7 +62,7 @@ def test_newline():
assert line_num_output1 == 8
assert line_num_input1 == line_num_output1
- with open('testdata/output3') as file:
+ with open('testdata/output3', encoding='utf-8') as file:
filecontent = file.read()
for x in range(7):
assert f'line{x}\n' in filecontent
@@ -74,7 +75,7 @@ def test_tabchar():
line_num_output1 = calculate_line_numbers('testdata/output4')
assert line_num_output1 == 2
- with open('testdata/output4') as file:
+ with open('testdata/output4', encoding='utf-8') as file:
filecontent = file.read()
assert 'line:entry\n' in filecontent
assert 'line2:entry2\n' in filecontent
@@ -86,7 +87,7 @@ def test_split_email():
main()
line_num_output = calculate_line_numbers('testdata/output5')
assert line_num_output == 6
- with open('testdata/output5') as file:
+ with open('testdata/output5', encoding='utf-8') as file:
filecontent = file.read()
assert 'line1\n' in filecontent
assert 'email@example.com' not in filecontent
@@ -102,7 +103,7 @@ def test_googlengram():
main()
line_num_output = calculate_line_numbers('testdata/output6')
assert line_num_output == 4
- with open('testdata/output6') as f:
+ with open('testdata/output6', encoding='utf-8') as f:
filecontent = f.read()
assert 'I\'ain\n' in filecontent
assert 'I\'Afrique occidental\n' in filecontent
@@ -117,7 +118,7 @@ def test_coupe():
line_num_output = calculate_line_numbers('testdata/output7')
assert line_num_output == 2
- with open('testdata/output7') as f:
+ with open('testdata/output7', encoding='utf-8') as f:
filecontent = f.read()
assert 'coupÉ' in filecontent
assert 'LANCIA AURELIA B20 COUPÉ GT\n' in filecontent
@@ -130,7 +131,7 @@ def test_split():
line_num_output = calculate_line_numbers('testdata/output8')
assert line_num_output == 4
- with open('testdata/output8') as f:
+ with open('testdata/output8', encoding='utf-8') as f:
filecontent = f.read()
assert 'example.com' not in filecontent
assert 'sub.example.com' not in filecontent
@@ -152,7 +153,7 @@ def test_input_encoding():
main()
line_num_output = calculate_line_numbers('testdata/output9')
assert line_num_output == 2
- with open('testdata/output9') as f:
+ with open('testdata/output9', encoding='utf-8') as f:
filecontent = f.read()
assert '16THEBEST!!!\n' in filecontent
assert '!!!ееместной%%@!\n' in filecontent
@@ -170,7 +171,7 @@ def test_delimiter():
main()
line_num_output = calculate_line_numbers('testdata/output10')
assert line_num_output == 1
- with open('testdata/output10') as f:
+ with open('testdata/output10', encoding='utf-8') as f:
filecontent = f.read()
assert 'cijfer\n' in filecontent
assert '3M\n' not in filecontent
@@ -189,7 +190,7 @@ def test_language_processing():
main()
line_num_output = calculate_line_numbers('testdata/output11')
assert line_num_output == 21
- with open('testdata/output11') as f:
+ with open('testdata/output11', encoding='utf-8') as f:
filecontent = f.read()
assert 'cijfer\n' in filecontent
assert 'cijfer\n' in filecontent
@@ -213,10 +214,10 @@ def test_fries():
'-l', 'testdata/log12', '--encode', '--check-controlchar']
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/log12') as f:
+ with open('testdata/log12', encoding='utf-8') as f:
filecontent = f.read()
assert 'West-Frysl' in filecontent
- with open('testdata/output12') as f:
+ with open('testdata/output12', encoding='utf-8') as f:
filecontent = f.read()
assert 'West-Frysl‰n' not in filecontent
@@ -228,7 +229,7 @@ def test_cut_fields():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output13') as f:
+ with open('testdata/output13', encoding='utf-8') as f:
filecontent = f.read()
assert 'field5:field6:field7\n' in filecontent
assert 'field4' not in filecontent
@@ -241,7 +242,7 @@ def test_cut_fields_single():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output14') as f:
+ with open('testdata/output14', encoding='utf-8') as f:
filecontent = f.read()
assert 'field5\n' in filecontent
assert 'field4' not in filecontent
@@ -254,7 +255,7 @@ def test_unhex():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output15') as f:
+ with open('testdata/output15', encoding='utf-8') as f:
filecontent = f.read()
assert 'PEÑAROL\n' in filecontent
assert 'QWERTYUIOPÅ\n' in filecontent
@@ -270,7 +271,7 @@ def test_unhtml():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output16') as f:
+ with open('testdata/output16', encoding='utf-8') as f:
filecontent = f.read()
assert 'İSMAİL\n' in filecontent
assert 'İSTANBUL\n' in filecontent
@@ -286,7 +287,7 @@ def test_unhtml_named():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output17') as f:
+ with open('testdata/output17', encoding='utf-8') as f:
filecontent = f.read()
assert 'İSMAİL\n' in filecontent
assert 'İSTANBUL\n' in filecontent
@@ -302,7 +303,7 @@ def test_verbose():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/log18') as f:
+ with open('testdata/log18', encoding='utf-8') as f:
filecontent = f.read()
assert 'Clean_cut; ' in filecontent
@@ -327,7 +328,7 @@ def test_clean_add_umlaut():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output20') as f:
+ with open('testdata/output20', encoding='utf-8') as f:
filecontent = f.read()
assert 'Eselsbrücke' in filecontent
assert 'Fremdschämen' in filecontent
@@ -341,7 +342,7 @@ def test_clean_add_umlaut():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output20.2') as f:
+ with open('testdata/output20.2', encoding='utf-8') as f:
filecontent = f.read()
assert 'Eselsbrücke' in filecontent
assert 'Fremdschämen' in filecontent
@@ -357,7 +358,7 @@ def test_multiple_delimiters():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output21') as f:
+ with open('testdata/output21', encoding='utf-8') as f:
filecontent = f.read()
assert 'password\n' in filecontent
assert 'password2\n' in filecontent
@@ -373,7 +374,7 @@ def test_check_email():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output22') as f:
+ with open('testdata/output22', encoding='utf-8') as f:
filecontent = f.read()
assert 'line1' in filecontent
assert 'line2' not in filecontent
@@ -389,7 +390,7 @@ def test_check_hash():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output23') as f:
+ with open('testdata/output23', encoding='utf-8') as f:
filecontent = f.read()
assert 'baabe00a81fc405af4ab9b0f99615498' not in filecontent
assert '$h$7/uhfibmxg83yq6y1rh5y9wjee13kh.' not in filecontent
@@ -412,7 +413,7 @@ def test_check_bug_comma_d():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output24') as f:
+ with open('testdata/output24', encoding='utf-8') as f:
filecontent = f.read()
assert 'line1' not in filecontent
assert 'angus' in filecontent
@@ -427,7 +428,7 @@ def test_check_non_ascii():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output25') as f:
+ with open('testdata/output25', encoding='utf-8') as f:
filecontent = f.read()
assert 'laténight' not in filecontent
assert 'thestrokes' in filecontent
@@ -440,7 +441,7 @@ def test_clean_non_ascii():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output26') as f:
+ with open('testdata/output26', encoding='utf-8') as f:
filecontent = f.read()
assert 'polopaç' not in filecontent
@@ -456,7 +457,7 @@ def test_remove_punctuation():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output27') as f:
+ with open('testdata/output27', encoding='utf-8') as f:
filecontent = f.read()
assert 'ripitup' in filecontent
@@ -470,7 +471,7 @@ def test_remove_different_punctuation():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output28') as f:
+ with open('testdata/output28', encoding='utf-8') as f:
filecontent = f.read()
assert 'standbyme' in filecontent
@@ -484,7 +485,7 @@ def test_add_without_punctuation():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output29') as f:
+ with open('testdata/output29', encoding='utf-8') as f:
filecontent = f.read()
assert 'stand_by_me' in filecontent
@@ -500,7 +501,7 @@ def test_glob():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output30') as f:
+ with open('testdata/output30', encoding='utf-8') as f:
assert len(f.readlines()) > 100
@@ -511,7 +512,7 @@ def test_bug_html_control():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output31') as f:
+ with open('testdata/output31', encoding='utf-8') as f:
filecontent = f.read()
assert '\x0c\x0c' not in filecontent
@@ -523,7 +524,7 @@ def test_bug_dollar_line():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output32') as f:
+ with open('testdata/output32', encoding='utf-8') as f:
filecontent = f.read()
assert '$1$2$3$4' in filecontent
assert '$1$money$1$' in filecontent
@@ -539,7 +540,7 @@ def test_check_replacement_character():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output33') as f:
+ with open('testdata/output33', encoding='utf-8') as f:
filecontent = f.read()
assert 'invalidstring�' not in filecontent
assert 'jungejunge' in filecontent
@@ -552,7 +553,7 @@ def test_email_detection():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output34') as f:
+ with open('testdata/output34', encoding='utf-8') as f:
filecontent = f.read()
assert 'bar@example.com' not in filecontent
assert 'foo@example.com' not in filecontent
@@ -570,7 +571,7 @@ def test_newline_replacement():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output35') as f:
+ with open('testdata/output35', encoding='utf-8') as f:
filecontent = f.read()
assert 'Avocado\n' in filecontent
assert '\nBanana\\r\\n\n' in filecontent
@@ -591,7 +592,7 @@ def test_trim():
]
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output36') as f:
+ with open('testdata/output36', encoding='utf-8') as f:
filecontent = f.read()
assert 'angleball\n' in filecontent
assert '\nbadminton\n' in filecontent
@@ -620,7 +621,7 @@ def test_invalid_unhex():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output37') as f:
+ with open('testdata/output37', encoding='utf-8') as f:
filecontent = f.read()
# Invalid hex string, leaving at as is.
assert '$HEX[e]tiredofwaiting\n' in filecontent
@@ -640,7 +641,7 @@ def test_skip():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output38') as f:
+ with open('testdata/output38', encoding='utf-8') as f:
filecontent = f.read()
assert '112345678' not in filecontent
@@ -654,7 +655,7 @@ def test_check_starting_with():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output39') as f:
+ with open('testdata/output39', encoding='utf-8') as f:
filecontent = f.read()
assert 'firstlovesong' not in filecontent
@@ -671,7 +672,7 @@ def test_check_empty_line():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output40') as f:
+ with open('testdata/output40', encoding='utf-8') as f:
filecontent = f.read()
assert '\n\n' not in filecontent
@@ -685,7 +686,7 @@ def test_check_mac_address():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output41') as f:
+ with open('testdata/output41', encoding='utf-8') as f:
filecontent = f.read()
assert '2C:C5:D3:70:78:2c' not in filecontent
@@ -700,7 +701,7 @@ def test_check_uuid():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output42') as f:
+ with open('testdata/output42', encoding='utf-8') as f:
filecontent = f.read()
assert 'd4662e44-00f1-4ef6-857e-76e3c61604cd' not in filecontent
@@ -716,7 +717,7 @@ def test_check_ending_with():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output43') as f:
+ with open('testdata/output43', encoding='utf-8') as f:
filecontent = f.read()
assert 'test.jpg' not in filecontent
@@ -732,7 +733,7 @@ def test_check_title_case():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output44') as f:
+ with open('testdata/output44', encoding='utf-8') as f:
filecontent = f.read()
assert '3 Doors Down' in filecontent
@@ -746,7 +747,7 @@ def test_leak_full():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output45') as f:
+ with open('testdata/output45', encoding='utf-8') as f:
filecontent = f.read()
# Test for mojibake
@@ -783,7 +784,7 @@ def test_check_regex():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output46') as f:
+ with open('testdata/output46', encoding='utf-8') as f:
filecontent = f.read()
assert 'abc' in filecontent
@@ -803,7 +804,7 @@ def test_check_multiple_regexes():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output47') as f:
+ with open('testdata/output47', encoding='utf-8') as f:
filecontent = f.read()
assert 'alpha\n' not in filecontent
@@ -813,11 +814,18 @@ def test_check_multiple_regexes():
def test_stdin_stdout():
comlist = ['bin/demeuk.py']
- script = b'input\nlines\n'
+ # On Windows, scripts can't be executed via the shebang, so manually add python3 in front
+ if name == 'nt':
+ comlist.insert(0, 'python3')
+ script = f'input{linesep}lines{linesep}'.encode()
res = run(comlist, input=script,
stdout=PIPE, stderr=PIPE)
assert res.returncode == 0
- assert res.stdout == b'input\nlines\n'
+ if name == 'nt':
+ # On Windows, when stdout is used, each line ending gains an extra \r (\r\r\n); this does not affect terminal output
+ assert res.stdout == b'input\r\r\nlines\r\r\n'
+ else:
+ assert res.stdout == script
assert res.stderr == b''
@@ -829,7 +837,7 @@ def test_check_lowercase():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output48') as f:
+ with open('testdata/output48', encoding='utf-8') as f:
filecontent = f.read()
assert '3 doors down' in filecontent
@@ -847,7 +855,7 @@ def _run_demeuk(file_name, *extra_args):
with patch.object(sys, 'argv', testargs):
main()
- with open(f'testdata/{file_name}.out') as f:
+ with open(f'testdata/{file_name}.out', encoding='utf-8') as f:
return f.read()
@@ -933,7 +941,7 @@ def test_add_first_upper():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output52') as f:
+ with open('testdata/output52', encoding='utf-8') as f:
filecontent = f.read()
assert 'three doors down' in filecontent
@@ -954,7 +962,7 @@ def test_add_title_case():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output52') as f:
+ with open('testdata/output52', encoding='utf-8') as f:
filecontent = f.read()
assert 'three doors down' in filecontent
@@ -975,7 +983,7 @@ def test_check_contains():
with patch.object(sys, 'argv', testargs):
main()
- with open('testdata/output53') as f:
+ with open('testdata/output53', encoding='utf-8') as f:
filecontent = f.read()
assert 'three_down' not in filecontent