Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 24 additions & 14 deletions bin/demeuk.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,14 @@
from glob import glob
from html import unescape
from inspect import cleandoc
from locale import LC_ALL, setlocale
from locale import LC_ALL, setlocale, getlocale
from math import ceil
from multiprocessing import cpu_count, Pool
from multiprocessing import cpu_count
from os import name as os_name
if os_name == 'nt':
from multiprocessing.pool import ThreadPool as Pool
else:
from multiprocessing import Pool
from os import linesep, access, path, R_OK, F_OK, W_OK
from re import compile as re_compile
from re import search
Expand Down Expand Up @@ -647,9 +652,7 @@ def check_empty_line(line):
Returns:
True if the line is empty or only contains whitespace chars
"""
if line == '':
return True
elif line.isspace():
if line == '' or line.isspace():
return True
return False

Expand Down Expand Up @@ -1375,7 +1378,7 @@ def chunkify(filename, size=CHUNK_SIZE):
fh.readline()

while True:
lines = [line.rstrip(b'\n') for line in fh.readlines(size)]
lines = [line.rstrip(linesep.encode()) for line in fh.readlines(size)]
yield lines
if len(lines) == 0:
break
Expand All @@ -1388,6 +1391,16 @@ def stderr_print(*args, **kwargs):
print(*args, **kwargs)


def init_worker(config_data):
    """Initialise a pool worker before it starts processing chunks.

    Binds *config_data* to the module-level ``config`` global so the worker
    sees the same configuration as the caller, then tries to make the worker
    ignore SIGINT.

    Args:
        config_data: configuration object passed in through the pool's
            ``initargs``; it is bound as-is (not copied) to ``config``.
    """
    global config
    config = config_data

    # Best-effort: installing a handler is only permitted in the main thread.
    try:
        signal(SIGINT, SIG_IGN)
    except ValueError:
        pass  # signal() only works in the main thread; ThreadPool workers are threads


def main():
#
# Config parser
Expand Down Expand Up @@ -1507,10 +1520,9 @@ def main():
if arguments.get('--input-encoding'):
config['input_encoding'] = arguments.get('--input-encoding').split(',')

setlocale(LC_ALL, 'en_US.UTF-8')
if arguments.get('--output-encoding'):
setlocale(LC_ALL, arguments.get('--output-encoding'))
else:
setlocale(LC_ALL, 'en_US.UTF-8')

if arguments.get('--punctuation'):
config['punctuation'] = arguments.get('--punctuation')
Expand Down Expand Up @@ -1741,13 +1753,14 @@ def main():
stderr_print('Main: done chunking file.')
stderr_print('Main: processing started.')

encoding = getlocale()[1]
if output_file:
p_output_file = open(output_file, 'w')
p_output_file = open(output_file, 'w', encoding=encoding, newline='')
else:
p_output_file = stdout

if log_file:
p_log_file = open(log_file, 'a')
p_log_file = open(log_file, 'a', encoding=encoding, newline='')
else:
p_log_file = stderr

Expand All @@ -1764,9 +1777,6 @@ def write_results_and_log(async_result):
write_results(async_result['results'])
write_log(async_result['log'])

def init_worker():
signal(SIGINT, SIG_IGN)

def process_jobs(chunk_start):
# Cut file into chunks and process each chunk multi-threaded
while True:
Expand All @@ -1791,7 +1801,7 @@ def process_jobs(chunk_start):
sleep(1)

write_log(f'Running demeuk - {version}{linesep}')
with Pool(a_threads, init_worker) as pool:
with Pool(a_threads, init_worker, initargs=(config,)) as pool:
jobs = []
# chunk_start will be the starting value of the combined output lines
chunk_start = 0
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
docopt
chardet
chardet==5.2.0
nltk
ftfy
unidecode
Expand Down
Loading
Loading