From 4818dc391a32dc0ec2947e6a769da7a26353a5bc Mon Sep 17 00:00:00 2001 From: jessevz Date: Sat, 12 Apr 2025 15:17:00 +0200 Subject: [PATCH 1/6] Added windows support --- bin/demeuk.py | 27 +++++------ tests/conftest.py | 88 +++++++++++++++++------------------ tests/test_app.py | 116 +++++++++++++++++++++++++--------------------- 3 files changed, 120 insertions(+), 111 deletions(-) diff --git a/bin/demeuk.py b/bin/demeuk.py index 3381db0..00f4462 100755 --- a/bin/demeuk.py +++ b/bin/demeuk.py @@ -145,7 +145,7 @@ from glob import glob from html import unescape from inspect import cleandoc -from locale import LC_ALL, setlocale +from locale import LC_ALL, setlocale, getlocale from math import ceil from multiprocessing import cpu_count, Pool from os import linesep, access, path, R_OK, F_OK, W_OK @@ -643,9 +643,7 @@ def check_empty_line(line): Returns: true of line is empty or only contains whitespace chars """ - if line == '': - return True - elif line.isspace(): + if line == '' or line.isspace(): return True return False @@ -1340,7 +1338,7 @@ def chunkify(filename, size=CHUNK_SIZE): fh.readline() while True: - lines = [line.rstrip(b'\n') for line in fh.readlines(size)] + lines = [line.rstrip(linesep.encode()) for line in fh.readlines(size)] yield lines if len(lines) == 0: break @@ -1353,6 +1351,12 @@ def stderr_print(*args, **kwargs): print(*args, **kwargs) +def init_worker(config_data): + global config + config = config_data + + signal(SIGINT, SIG_IGN) + def main(): # # Config parser @@ -1471,10 +1475,9 @@ def main(): if arguments.get('--input-encoding'): config['input_encoding'] = arguments.get('--input-encoding').split(',') + setlocale(LC_ALL, 'en_US.UTF-8') if arguments.get('--output-encoding'): setlocale(LC_ALL, arguments.get('--output-encoding')) - else: - setlocale(LC_ALL, 'en_US.UTF-8') if arguments.get('--punctuation'): config['punctuation'] = arguments.get('--punctuation') @@ -1702,13 +1705,14 @@ def main(): stderr_print('Main: done chunking file.') 
stderr_print('Main: processing started.') + encoding = getlocale()[1] if output_file: - p_output_file = open(output_file, 'w') + p_output_file = open(output_file, 'w', encoding=encoding, newline='') else: p_output_file = stdout if log_file: - p_log_file = open(log_file, 'a') + p_log_file = open(log_file, 'a', encoding=encoding, newline='') else: p_log_file = stderr @@ -1725,9 +1729,6 @@ def write_results_and_log(async_result): write_results(async_result['results']) write_log(async_result['log']) - def init_worker(): - signal(SIGINT, SIG_IGN) - def process_jobs(chunk_start): # Cut file in to chunks and process each trunk multi-threaded while True: @@ -1752,7 +1753,7 @@ def process_jobs(chunk_start): sleep(1) write_log(f'Running demeuk - {version}{linesep}') - with Pool(a_threads, init_worker) as pool: + with Pool(a_threads, init_worker, initargs=(config,)) as pool: jobs = [] # chunk_start will be the started value of the combined output lines chunk_start = 0 diff --git a/tests/conftest.py b/tests/conftest.py index 52b75c6..29cc324 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,7 +43,7 @@ file.write('line'.encode('utf-8') + b'\x09' + f'entry{linesep}'.encode('utf-8')) file.write('line2'.encode('utf-8') + b'\x09\x09' + f'entry2{linesep}'.encode('utf-8')) -with open('testdata/input5', 'w') as file: +with open('testdata/input5', 'w', encoding='utf-8') as file: file.write(f'line1{linesep}') file.write(f'line2{linesep}') file.write(f'line3{linesep}') @@ -51,7 +51,7 @@ file.write(f'email@example.com;line5{linesep}') file.write(f'test:email@example.com:line6{linesep}') -with open('testdata/input6', 'w') as file: +with open('testdata/input6', 'w', encoding='utf-8') as file: file.write(f'I\'Afrique_ADJ occidental_ADJ\t1927\t2\t2{linesep}') file.write(f'I\'Allemagne )\t2009\t1\t1{linesep}') file.write(f'I\'ain _VERB_\t2009\t2\t2{linesep}') @@ -64,7 +64,7 @@ 
file.write(b'\x4C\x41\x4E\x43\x49\x41\x20\x41\x55\x52\x45\x4C\x49\x41\x20\x42\x32\x30\x20\x43\x4F\x55\x50\xC3\x83\xC2\x89\x20\x47\x54\x0A') # noqa: E501 -with open('testdata/input8', 'w') as file: +with open('testdata/input8', 'w', encoding='utf-8') as file: file.write(f'test@example.com:password1{linesep}') file.write(f'test@sub.example.com:password2{linesep}') file.write(f'test@example.ugur:password3{linesep}') @@ -76,12 +76,12 @@ # Russian file.write(f'!!!ееместной%%@!{linesep}'.encode('WINDOWS-1251')) -with open('testdata/input10', 'w') as file: +with open('testdata/input10', 'w', encoding='utf-8') as file: file.write(f'cijfer/Aa{linesep}') file.write(f'3M/Aa{linesep}') file.write(f'VERYVERYVERYVERYVERYVERYLONGLINE?{linesep}') -with open('testdata/input11', 'w') as file: +with open('testdata/input11', 'w', encoding='utf-8') as file: file.write(f'cijfer/Aa{linesep}') file.write(f'3M-test/Aa{linesep}') file.write(f'St. Maarten{linesep}') @@ -92,39 +92,39 @@ with open('testdata/input12', 'wb') as file: file.write(b'\x57\x65\x73\x74\x2D\x46\x72\x79\x73\x6C\xC2\x89\x6E' + f'{linesep}'.encode('utf-8')) -with open('testdata/input13', 'w') as file: +with open('testdata/input13', 'w', encoding='utf-8') as file: file.write(f'field1:field2:field3:field4:field5:field6:field7{linesep}') file.write(f'onefield{linesep}') -with open('testdata/input14', 'w') as file: +with open('testdata/input14', 'w', encoding='utf-8') as file: file.write(f'field1:field2:field3:field4:field5:field6:field7{linesep}') file.write(f'onefield{linesep}') -with open('testdata/input15', 'w') as file: +with open('testdata/input15', 'w', encoding='utf-8') as file: file.write(f'$HEX[5045d141524f4c]{linesep}') file.write(f'$HEX[51574552545955494f50c5]{linesep}') file.write(f'$HEX[5a73f3666932303030]{linesep}') file.write(f'$HEX[617261f16173]{linesep}') -with open('testdata/input16', 'w') as file: +with open('testdata/input16', 'w', encoding='utf-8') as file: file.write(f'İSMAİL{linesep}') 
file.write(f'İSTANBUL{linesep}') file.write(f'şifreyok{linesep}') file.write(f'>{linesep}') file.write(f'α{linesep}') -with open('testdata/input17', 'w') as file: +with open('testdata/input17', 'w', encoding='utf-8') as file: file.write(f'İSMAİL{linesep}') file.write(f'İSTANBUL{linesep}') file.write(f'şifreyok{linesep}') file.write(f'>{linesep}') file.write(f'α{linesep}') -with open('testdata/input18', 'w') as file: +with open('testdata/input18', 'w', encoding='utf-8') as file: file.write(f'field1:field2:field3:field4:field5:field6:field7{linesep}') file.write(f'onefield{linesep}') -with open('testdata/input19', 'w') as file: +with open('testdata/input19', 'w', encoding='utf-8') as file: file.write(f'line01{linesep}') file.write(f'line02{linesep}') file.write(f'line03{linesep}') @@ -136,24 +136,24 @@ file.write(f'line09{linesep}') file.write(f'line10{linesep}') -with open('testdata/input20', 'w') as file: +with open('testdata/input20', 'w', encoding='utf-8') as file: file.write(f'Eselsbru"cke{linesep}') file.write(f'Fremdscha"men{linesep}') file.write(f'KA"SEHOCH{linesep}') -with open('testdata/input21', 'w') as file: +with open('testdata/input21', 'w', encoding='utf-8') as file: file.write(f'user;password{linesep}') file.write(f'user2:password2{linesep}') file.write(f'user3----password3{linesep}') -with open('testdata/input22', 'w') as file: +with open('testdata/input22', 'w', encoding='utf-8') as file: file.write(f'line1@example{linesep}') file.write(f'line2@example.com{linesep}') file.write(f'line3@ex-ample.com{linesep}') file.write(f'line4@ex.ample.com{linesep}') file.write(f'test@example.com:line5{linesep}') -with open('testdata/input23', 'w') as file: +with open('testdata/input23', 'w', encoding='utf-8') as file: file.write(f'line1@example.com:baabe00a81fc405af4ab9b0f99615498{linesep}') file.write(f'line2@example.com:$h$7/uhfibmxg83yq6y1rh5y9wjee13kh.{linesep}') 
file.write(f'line3@example.com:$6$/fasjdfsadj$safjasdfasjdfasdjf/asdfsadfasdfasdfas/fadsfasdfa{linesep}') @@ -167,46 +167,46 @@ file.write(f'$H$8abc{linesep}') file.write(f'$pizza$like{linesep}') -with open('testdata/input24', 'w') as file: +with open('testdata/input24', 'w', encoding='utf-8') as file: file.write(f'line1@example.com,angus{linesep}') file.write(f'line2@example.com:snow{linesep}') file.write(f'line3@example.com:julia{linesep}') -with open('testdata/input25', 'w') as file: +with open('testdata/input25', 'w', encoding='utf-8') as file: file.write(f'laténight{linesep}') file.write(f'thestrokes{linesep}') -with open('testdata/input26', 'w') as file: +with open('testdata/input26', 'w', encoding='utf-8') as file: file.write(f'polopaç{linesep}') file.write(f'mündster{linesep}') -with open('testdata/input27', 'w') as file: +with open('testdata/input27', 'w', encoding='utf-8') as file: file.write(f'rip-it.up{linesep}') file.write(f'orange juice{linesep}') -with open('testdata/input28', 'w') as file: +with open('testdata/input28', 'w', encoding='utf-8') as file: file.write(f'stand_by_me{linesep}') file.write(f'the clash{linesep}') -with open('testdata/input29', 'w') as file: +with open('testdata/input29', 'w', encoding='utf-8') as file: file.write(f'stand_by_me{linesep}') file.write(f'the clash{linesep}') -with open('testdata/input31', 'w') as file: +with open('testdata/input31', 'w', encoding='utf-8') as file: file.write(f' {linesep}') -with open('testdata/input32', 'w') as file: +with open('testdata/input32', 'w', encoding='utf-8') as file: file.write(f'$1$2$3$4{linesep}') file.write(f'$1$money$1${linesep}') file.write(f'$1$ilovepizza{linesep}') file.write(f'$1$1+l0l$aaaaaaaaaaaa./{linesep}') file.write(f'$4$4$4pizza{linesep}') -with open('testdata/input33', 'w') as file: +with open('testdata/input33', 'w', encoding='utf-8') as file: file.write(f'invalidstring�{linesep}') file.write(f'jungejunge{linesep}') -with open('testdata/input34', 'w') as file: +with 
open('testdata/input34', 'w', encoding='utf-8') as file: file.write(f'P@ssw0rd.1{linesep}') file.write(f'bar@example.com{linesep}') file.write(f'cr@ssT0rd{linesep}') @@ -216,7 +216,7 @@ file.write(f'p@ssW0rd.me@Home{linesep}') file.write(f'w@ssB0rd.we{linesep}') -with open('testdata/input35', 'w') as file: +with open('testdata/input35', 'w', encoding='utf-8') as file: file.write(f'Avocado{linesep}') file.write(f'Banana\\r\\n{linesep}') file.write(f'Coconut\\n{linesep}') @@ -228,7 +228,7 @@ file.write(f'Icaco {linesep}') file.write(f' Jambul{linesep}') -with open('testdata/input36', 'w') as file: +with open('testdata/input36', 'w', encoding='utf-8') as file: file.write(f'angleball\\r{linesep}') file.write(f'badminton\\n{linesep}') file.write(f'crossminton
{linesep}') @@ -247,13 +247,13 @@ file.write(f'

tchoukball{linesep}') file.write(f'vigoro{linesep}') -with open('testdata/input37', 'w') as file: +with open('testdata/input37', 'w', encoding='utf-8') as file: file.write(f'$HEX[e]tiredofwaiting{linesep}') file.write(f'$hex[6C6F73696E67746F756368]{linesep}') file.write(f'$HEX[6C657469746B69636B696E]123!{linesep}') file.write(f'$HEX[eee]{linesep}') -with open('testdata/input38', 'w') as file: +with open('testdata/input38', 'w', encoding='utf-8') as file: file.write(f'112345678{linesep}') file.write(f'#firstlovesong{linesep}') file.write(f'/secondlovesong{linesep}') @@ -261,7 +261,7 @@ file.write(f'\tcaliforniastars{linesep}') file.write(f'lastlovesong{linesep}') -with open('testdata/input39', 'w') as file: +with open('testdata/input39', 'w', encoding='utf-8') as file: file.write(f'112345678{linesep}') file.write(f'#firstlovesong{linesep}') file.write(f'/secondlovesong{linesep}') @@ -269,7 +269,7 @@ file.write(f'\tcaliforniastars{linesep}') file.write(f'lastlovesong{linesep}') -with open('testdata/input40', 'w') as file: +with open('testdata/input40', 'w', encoding='utf-8') as file: file.write(f'112345678{linesep}') file.write(f'#firstlovesong{linesep}') file.write(f'/secondlovesong{linesep}') @@ -277,21 +277,21 @@ file.write(f'\tcaliforniastars{linesep}') file.write(f'lastlovesong{linesep}') -with open('testdata/input41', 'w') as file: +with open('testdata/input41', 'w', encoding='utf-8') as file: file.write(f'dummy{linesep}') file.write(f'2C:C5:D3:70:78:2c{linesep}') -with open('testdata/input42', 'w') as file: +with open('testdata/input42', 'w', encoding='utf-8') as file: file.write(f'dummy{linesep}') file.write(f'd4662e44-00f1-4ef6-857e-76e3c61604cd{linesep}') file.write(f'D4662E44-00F1-4EF6-857E-76E3C61604CD{linesep}') -with open('testdata/input43', 'w') as file: +with open('testdata/input43', 'w', encoding='utf-8') as file: file.write(f'dummy{linesep}') file.write(f'test.jpg{linesep}') file.write(f'hello@whatsapp.com{linesep}') -with open('testdata/input44', 
'w') as file: +with open('testdata/input44', 'w', encoding='utf-8') as file: file.write(f'3 doors down{linesep}') with open('testdata/input45', 'wb') as file: @@ -321,7 +321,7 @@ file.write(f'{linesep}'.encode('utf-8')) -with open('testdata/input46', 'w') as file: +with open('testdata/input46', 'w', encoding='utf-8') as file: file.write(f'abc{linesep}') file.write(f'abcd{linesep}') file.write(f'a{linesep}') @@ -329,15 +329,15 @@ file.write(f'aBc{linesep}') file.write(f'123{linesep}') -with open('testdata/input47', 'w') as file: +with open('testdata/input47', 'w', encoding='utf-8') as file: file.write(f'alpha{linesep}') file.write(f'alpha123{linesep}') file.write(f'alpha1234!{linesep}') -with open('testdata/input48', 'w') as file: +with open('testdata/input48', 'w', encoding='utf-8') as file: file.write(f'3 Doors Down{linesep}') -with open('testdata/input49', 'w') as file: +with open('testdata/input49', 'w', encoding='utf-8') as file: # no digits file.write(f'nodigits{linesep}') # digit at start @@ -351,7 +351,7 @@ # multiple digits file.write(f'pw123!{linesep}') -with open('testdata/input50', 'w') as file: +with open('testdata/input50', 'w', encoding='utf-8') as file: # no uppercase file.write(f'noupper{linesep}') # uppercase at start @@ -365,7 +365,7 @@ # multiple uppercase file.write(f'ThisIsUpperCase!!!{linesep}') -with open('testdata/input51', 'w') as file: +with open('testdata/input51', 'w', encoding='utf-8') as file: # no special file.write(f'NoSpecialsHere{linesep}') # special at start @@ -382,13 +382,13 @@ # a combination of U+1F64C (raising hands) and U+1F3FD (skin tone) file.write(f'8bytesemoji*4🙌🏽🙌🏽🙌🏽🙌🏽{linesep}') -with open('testdata/input52', 'w') as file: +with open('testdata/input52', 'w', encoding='utf-8') as file: file.write(f'three doors down {linesep}') file.write(f'amsterdam {linesep}') file.write(f'ROTTERDAM {linesep}') file.write(f'Cookie Monster {linesep}') -with open('testdata/input53', 'w') as file: +with open('testdata/input53', 'w', 
encoding='utf-8') as file: file.write(f'three_down {linesep}') file.write(f'_amsterdam {linesep}') file.write(f'ROTTERDAM_ {linesep}') diff --git a/tests/test_app.py b/tests/test_app.py index f57ab7f..6cfcf6c 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1,5 +1,6 @@ import sys from subprocess import PIPE, run +from os import name, linesep from unittest.mock import patch from pytest import raises @@ -27,7 +28,7 @@ def test_demeuk(): assert line_num_log1 == 5 assert line_num_output1 == 9 assert line_num_input1 == (line_num_output1 + line_num_log1 - 1) - with open('testdata/output1') as file: + with open('testdata/output1', encoding='utf-8') as file: filecontent = file.read() assert 'Password123!@"\n' in filecontent assert 'ǓǝǪǼȧɠ\n' in filecontent @@ -61,7 +62,7 @@ def test_newline(): assert line_num_output1 == 8 assert line_num_input1 == line_num_output1 - with open('testdata/output3') as file: + with open('testdata/output3', encoding='utf-8') as file: filecontent = file.read() for x in range(7): assert f'line{x}\n' in filecontent @@ -74,7 +75,7 @@ def test_tabchar(): line_num_output1 = calculate_line_numbers('testdata/output4') assert line_num_output1 == 2 - with open('testdata/output4') as file: + with open('testdata/output4', encoding='utf-8') as file: filecontent = file.read() assert 'line:entry\n' in filecontent assert 'line2:entry2\n' in filecontent @@ -86,7 +87,7 @@ def test_split_email(): main() line_num_output = calculate_line_numbers('testdata/output5') assert line_num_output == 6 - with open('testdata/output5') as file: + with open('testdata/output5', encoding='utf-8') as file: filecontent = file.read() assert 'line1\n' in filecontent assert 'email@example.com' not in filecontent @@ -102,7 +103,7 @@ def test_googlengram(): main() line_num_output = calculate_line_numbers('testdata/output6') assert line_num_output == 4 - with open('testdata/output6') as f: + with open('testdata/output6', encoding='utf-8') as f: filecontent = f.read() assert 
'I\'ain\n' in filecontent assert 'I\'Afrique occidental\n' in filecontent @@ -117,7 +118,7 @@ def test_coupe(): line_num_output = calculate_line_numbers('testdata/output7') assert line_num_output == 2 - with open('testdata/output7') as f: + with open('testdata/output7', encoding='utf-8') as f: filecontent = f.read() assert 'coupÉ' in filecontent assert 'LANCIA AURELIA B20 COUPÉ GT\n' in filecontent @@ -130,7 +131,7 @@ def test_split(): line_num_output = calculate_line_numbers('testdata/output8') assert line_num_output == 4 - with open('testdata/output8') as f: + with open('testdata/output8', encoding='utf-8') as f: filecontent = f.read() assert 'example.com' not in filecontent assert 'sub.example.com' not in filecontent @@ -152,7 +153,7 @@ def test_input_encoding(): main() line_num_output = calculate_line_numbers('testdata/output9') assert line_num_output == 2 - with open('testdata/output9') as f: + with open('testdata/output9', encoding='utf-8') as f: filecontent = f.read() assert '16THEBEST!!!\n' in filecontent assert '!!!ееместной%%@!\n' in filecontent @@ -170,7 +171,7 @@ def test_delimiter(): main() line_num_output = calculate_line_numbers('testdata/output10') assert line_num_output == 1 - with open('testdata/output10') as f: + with open('testdata/output10', encoding='utf-8') as f: filecontent = f.read() assert 'cijfer\n' in filecontent assert '3M\n' not in filecontent @@ -189,7 +190,7 @@ def test_language_processing(): main() line_num_output = calculate_line_numbers('testdata/output11') assert line_num_output == 29 - with open('testdata/output11') as f: + with open('testdata/output11', encoding='utf-8') as f: filecontent = f.read() assert 'cijfer\n' in filecontent assert 'cijfer\n' in filecontent @@ -213,10 +214,10 @@ def test_fries(): '-l', 'testdata/log12', '--encode', '--check-controlchar'] with patch.object(sys, 'argv', testargs): main() - with open('testdata/log12') as f: + with open('testdata/log12', encoding='utf-8') as f: filecontent = f.read() assert 
'West-Frysl' in filecontent - with open('testdata/output12') as f: + with open('testdata/output12', encoding='utf-8') as f: filecontent = f.read() assert 'West-Frysl‰n' not in filecontent @@ -228,7 +229,7 @@ def test_cut_fields(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output13') as f: + with open('testdata/output13', encoding='utf-8') as f: filecontent = f.read() assert 'field5:field6:field7\n' in filecontent assert 'field4' not in filecontent @@ -241,7 +242,7 @@ def test_cut_fields_single(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output14') as f: + with open('testdata/output14', encoding='utf-8') as f: filecontent = f.read() assert 'field5\n' in filecontent assert 'field4' not in filecontent @@ -254,7 +255,7 @@ def test_unhex(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output15') as f: + with open('testdata/output15', encoding='utf-8') as f: filecontent = f.read() assert 'PEÑAROL\n' in filecontent assert 'QWERTYUIOPÅ\n' in filecontent @@ -270,7 +271,7 @@ def test_unhtml(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output16') as f: + with open('testdata/output16', encoding='utf-8') as f: filecontent = f.read() assert 'İSMAİL\n' in filecontent assert 'İSTANBUL\n' in filecontent @@ -286,7 +287,7 @@ def test_unhtml_named(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output17') as f: + with open('testdata/output17', encoding='utf-8') as f: filecontent = f.read() assert 'İSMAİL\n' in filecontent assert 'İSTANBUL\n' in filecontent @@ -302,7 +303,7 @@ def test_verbose(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/log18') as f: + with open('testdata/log18', encoding='utf-8') as f: filecontent = f.read() assert 'Clean_cut; ' in filecontent @@ -327,7 +328,7 @@ def test_clean_add_umlaut(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output20') as f: + with 
open('testdata/output20', encoding='utf-8') as f: filecontent = f.read() assert 'Eselsbrücke' in filecontent assert 'Fremdschämen' in filecontent @@ -341,7 +342,7 @@ def test_clean_add_umlaut(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output20.2') as f: + with open('testdata/output20.2', encoding='utf-8') as f: filecontent = f.read() assert 'Eselsbrücke' in filecontent assert 'Fremdschämen' in filecontent @@ -357,7 +358,7 @@ def test_multiple_delimiters(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output21') as f: + with open('testdata/output21', encoding='utf-8') as f: filecontent = f.read() assert 'password\n' in filecontent assert 'password2\n' in filecontent @@ -373,7 +374,7 @@ def test_check_email(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output22') as f: + with open('testdata/output22', encoding='utf-8') as f: filecontent = f.read() assert 'line1' in filecontent assert 'line2' not in filecontent @@ -389,7 +390,7 @@ def test_check_hash(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output23') as f: + with open('testdata/output23', encoding='utf-8') as f: filecontent = f.read() assert 'baabe00a81fc405af4ab9b0f99615498' not in filecontent assert '$h$7/uhfibmxg83yq6y1rh5y9wjee13kh.' 
not in filecontent @@ -412,7 +413,7 @@ def test_check_bug_comma_d(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output24') as f: + with open('testdata/output24', encoding='utf-8') as f: filecontent = f.read() assert 'line1' not in filecontent assert 'angus' in filecontent @@ -427,7 +428,7 @@ def test_check_non_ascii(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output25') as f: + with open('testdata/output25', encoding='utf-8') as f: filecontent = f.read() assert 'laténight' not in filecontent assert 'thestrokes' in filecontent @@ -440,7 +441,7 @@ def test_clean_non_ascii(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output26') as f: + with open('testdata/output26', encoding='utf-8') as f: filecontent = f.read() assert 'polopaç' not in filecontent @@ -456,7 +457,7 @@ def test_remove_punctuation(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output27') as f: + with open('testdata/output27', encoding='utf-8') as f: filecontent = f.read() assert 'ripitup' in filecontent @@ -470,7 +471,7 @@ def test_remove_different_punctuation(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output28') as f: + with open('testdata/output28', encoding='utf-8') as f: filecontent = f.read() assert 'standbyme' in filecontent @@ -484,7 +485,7 @@ def test_add_without_punctuation(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output29') as f: + with open('testdata/output29', encoding='utf-8') as f: filecontent = f.read() assert 'stand_by_me' in filecontent @@ -500,7 +501,7 @@ def test_glob(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output30') as f: + with open('testdata/output30', encoding='utf-8') as f: assert len(f.readlines()) > 100 @@ -511,7 +512,7 @@ def test_bug_html_control(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output31') as f: + with 
open('testdata/output31', encoding='utf-8') as f: filecontent = f.read() assert '\x0c\x0c' not in filecontent @@ -523,7 +524,7 @@ def test_bug_dollar_line(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output32') as f: + with open('testdata/output32', encoding='utf-8') as f: filecontent = f.read() assert '$1$2$3$4' in filecontent assert '$1$money$1$' in filecontent @@ -539,7 +540,7 @@ def test_check_replacement_character(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output33') as f: + with open('testdata/output33', encoding='utf-8') as f: filecontent = f.read() assert 'invalidstring�' not in filecontent assert 'jungejunge' in filecontent @@ -552,7 +553,7 @@ def test_email_detection(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output34') as f: + with open('testdata/output34', encoding='utf-8') as f: filecontent = f.read() assert 'bar@example.com' not in filecontent assert 'foo@example.com' not in filecontent @@ -570,7 +571,7 @@ def test_newline_replacement(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output35') as f: + with open('testdata/output35', encoding='utf-8') as f: filecontent = f.read() assert 'Avocado\n' in filecontent assert '\nBanana\\r\\n\n' in filecontent @@ -591,7 +592,7 @@ def test_trim(): ] with patch.object(sys, 'argv', testargs): main() - with open('testdata/output36') as f: + with open('testdata/output36', encoding='utf-8') as f: filecontent = f.read() assert 'angleball\n' in filecontent assert '\nbadminton\n' in filecontent @@ -620,7 +621,7 @@ def test_invalid_unhex(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output37') as f: + with open('testdata/output37', encoding='utf-8') as f: filecontent = f.read() # Invalid hex string, leaving at as is. 
assert '$HEX[e]tiredofwaiting\n' in filecontent @@ -640,7 +641,7 @@ def test_skip(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output38') as f: + with open('testdata/output38', encoding='utf-8') as f: filecontent = f.read() assert '112345678' not in filecontent @@ -654,7 +655,7 @@ def test_check_starting_with(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output39') as f: + with open('testdata/output39', encoding='utf-8') as f: filecontent = f.read() assert 'firstlovesong' not in filecontent @@ -671,7 +672,7 @@ def test_check_empty_line(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output40') as f: + with open('testdata/output40', encoding='utf-8') as f: filecontent = f.read() assert '\n\n' not in filecontent @@ -685,7 +686,7 @@ def test_check_mac_address(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output41') as f: + with open('testdata/output41', encoding='utf-8') as f: filecontent = f.read() assert '2C:C5:D3:70:78:2c' not in filecontent @@ -700,7 +701,7 @@ def test_check_uuid(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output42') as f: + with open('testdata/output42', encoding='utf-8') as f: filecontent = f.read() assert 'd4662e44-00f1-4ef6-857e-76e3c61604cd' not in filecontent @@ -716,7 +717,7 @@ def test_check_ending_with(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output43') as f: + with open('testdata/output43', encoding='utf-8') as f: filecontent = f.read() assert 'test.jpg' not in filecontent @@ -732,7 +733,7 @@ def test_check_title_case(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output44') as f: + with open('testdata/output44', encoding='utf-8') as f: filecontent = f.read() assert '3 Doors Down' in filecontent @@ -746,7 +747,7 @@ def test_leak_full(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output45') as f: + with 
open('testdata/output45', encoding='utf-8') as f: filecontent = f.read() # Test for mojibake @@ -783,7 +784,7 @@ def test_check_regex(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output46') as f: + with open('testdata/output46', encoding='utf-8') as f: filecontent = f.read() assert 'abc' in filecontent @@ -803,7 +804,7 @@ def test_check_multiple_regexes(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output47') as f: + with open('testdata/output47', encoding='utf-8') as f: filecontent = f.read() assert 'alpha\n' not in filecontent @@ -813,11 +814,18 @@ def test_check_multiple_regexes(): def test_stdin_stdout(): comlist = ['bin/demeuk.py'] - script = b'input\nlines\n' + #On Windows scripts cant be executed with the shebang so manually add python3 in front + if name == 'nt': + comlist.insert(0, 'python3') + script = f'input{linesep}lines{linesep}'.encode() res = run(comlist, input=script, stdout=PIPE, stderr=PIPE) assert res.returncode == 0 - assert res.stdout == b'input\nlines\n' + if name == 'nt': + #On Windows when stdout is used, there is a trailing \r in new lines, but this does not effect terminal output + assert res.stdout == b'input\r\r\nlines\r\r\n' + else: + assert res.stdout == script assert res.stderr == b'' @@ -829,7 +837,7 @@ def test_check_lowercase(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output48') as f: + with open('testdata/output48', encoding='utf-8') as f: filecontent = f.read() assert '3 doors down' in filecontent @@ -847,7 +855,7 @@ def _run_demeuk(file_name, *extra_args): with patch.object(sys, 'argv', testargs): main() - with open(f'testdata/{file_name}.out') as f: + with open(f'testdata/{file_name}.out', encoding='utf-8') as f: return f.read() @@ -933,7 +941,7 @@ def test_add_first_upper(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output52') as f: + with open('testdata/output52', encoding='utf-8') as f: filecontent = f.read() 
assert 'three doors down' in filecontent @@ -954,7 +962,7 @@ def test_add_title_case(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output52') as f: + with open('testdata/output52', encoding='utf-8') as f: filecontent = f.read() assert 'three doors down' in filecontent @@ -975,7 +983,7 @@ def test_check_contains(): with patch.object(sys, 'argv', testargs): main() - with open('testdata/output53') as f: + with open('testdata/output53', encoding='utf-8') as f: filecontent = f.read() assert 'three_down' not in filecontent From 6c6d078f7890ac1954c7b0eab3b6c7c32f5d76d7 Mon Sep 17 00:00:00 2001 From: jessevz Date: Sat, 12 Apr 2025 15:36:03 +0200 Subject: [PATCH 2/6] Fixed flake8 suggestions --- tests/test_app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_app.py b/tests/test_app.py index 6cfcf6c..e6b0ba2 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -814,7 +814,7 @@ def test_check_multiple_regexes(): def test_stdin_stdout(): comlist = ['bin/demeuk.py'] - #On Windows scripts cant be executed with the shebang so manually add python3 in front + # On Windows scripts can't be executed with the shebang so manually add python3 in front if name == 'nt': comlist.insert(0, 'python3') script = f'input{linesep}lines{linesep}'.encode() @@ -822,7 +822,7 @@ def test_stdin_stdout(): stdout=PIPE, stderr=PIPE) assert res.returncode == 0 if name == 'nt': - #On Windows when stdout is used, there is a trailing \r in new lines, but this does not effect terminal output + # On Windows when stdout is used, there is a trailing \r in new lines, but this does not affect terminal output assert res.stdout == b'input\r\r\nlines\r\r\n' else: assert res.stdout == script From a90a35be54813ddcc5532884afb48f965a51e81f Mon Sep 17 00:00:00 2001 From: jessevz Date: Sat, 12 Apr 2025 15:38:43 +0200 Subject: [PATCH 3/6] Fixed even more flake8 suggestion --- bin/demeuk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/demeuk.py
b/bin/demeuk.py index 00f4462..0ac9e5e 100755 --- a/bin/demeuk.py +++ b/bin/demeuk.py @@ -1357,6 +1357,7 @@ def init_worker(config_data): signal(SIGINT, SIG_IGN) + def main(): # # Config parser From c037c1746a81de33d8c3e3f357bcc0be9f65f625 Mon Sep 17 00:00:00 2001 From: jessevz Date: Thu, 2 Apr 2026 21:01:07 +0200 Subject: [PATCH 4/6] Fix windows speed by using threadpools --- bin/demeuk.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/demeuk.py b/bin/demeuk.py index 0ac9e5e..c07a6eb 100755 --- a/bin/demeuk.py +++ b/bin/demeuk.py @@ -147,7 +147,12 @@ from inspect import cleandoc from locale import LC_ALL, setlocale, getlocale from math import ceil -from multiprocessing import cpu_count, Pool +from multiprocessing import cpu_count +from os import name as os_name +if os_name == 'nt': + from multiprocessing.pool import ThreadPool as Pool +else: + from multiprocessing import Pool from os import linesep, access, path, R_OK, F_OK, W_OK from re import compile as re_compile from re import search @@ -1355,7 +1360,10 @@ def init_worker(config_data): global config config = config_data - signal(SIGINT, SIG_IGN) + try: + signal(SIGINT, SIG_IGN) + except ValueError: + pass # signal() only works in the main thread; ThreadPool workers are threads def main(): From 3821dcd425f123fd915296fe6227cf3f3d50e739 Mon Sep 17 00:00:00 2001 From: jessevz Date: Thu, 2 Apr 2026 21:07:08 +0200 Subject: [PATCH 5/6] Fixed new tests for windows after merge --- tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 002a40a..c327e9c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -394,10 +394,10 @@ file.write(f'ROTTERDAM_ {linesep}') file.write(f'Cookie Monster {linesep}') -with open('testdata/input54', 'w') as file: +with open('testdata/input54', 'w', encoding='utf-8') as file: file.write(f'Golf Trip{linesep}') file.write(f'Sequences{linesep}') -with open('testdata/input55', 
'w') as file: +with open('testdata/input55', 'w', encoding='utf-8') as file: file.write(f'здраво пријатељу{linesep}') file.write(f'жута банана{linesep}') From 052eca91194a9e538f49a3dd9c00fd82c20e59bd Mon Sep 17 00:00:00 2001 From: jessevz Date: Wed, 8 Apr 2026 18:30:32 +0200 Subject: [PATCH 6/6] Pinned chardet version; tests fail with chardet 7.4.1 because of incorrect encoding detection --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7ac56f4..cfcfae2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ docopt -chardet +chardet==5.2.0 nltk ftfy unidecode