From 581c18dd2b6f6ecac6ce720a4fc607e51f817c33 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 13 Oct 2024 15:38:52 +0200 Subject: [PATCH 1/9] add support for processing datatet TEI files --- software_mentions_client/client.py | 48 +++++++++++++----------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index d79e4aa..1b411cf 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -33,6 +33,7 @@ endpoint_tei = 'service/annotateSoftwareTEI' endpoint_datastet_pdf = 'service/annotateDatasetPDF' +endpoint_datastet_tei = 'service/processDatasetTEI' endpoint_datastet_txt = 'service/annotateDatasetSentence' # default logging settings @@ -56,7 +57,7 @@ def __init__(self, config_path='./config.json', use_datastet=False): self._init_lmdb(use_datastet=use_datastet) if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len(self.config['bucket_name']) > 0: - self.s3 = S3.S3(self.config) + self.s3 = software_mentions_client.S3.S3(self.config) self.mongo_db_software = None self.mongo_db_dataset = None @@ -169,11 +170,6 @@ def annotate_directory(self, directory, force=False, use_datastet=False): for root, directories, filenames in os.walk(directory): for filename in filenames: if filename.endswith(".pdf") or filename.endswith(".PDF") or filename.endswith(".pdf.gz") or filename.endswith(".xml"): - - # always skip XML files when using datastet service currently - if filename.endswith(".xml") and use_datastet: - continue - if filename.endswith(".pdf"): filename_json = filename.replace(".pdf", "."+target+".json") elif filename.endswith(".pdf.gz"): @@ -189,14 +185,13 @@ def annotate_directory(self, directory, force=False, use_datastet=False): elif filename.endswith(".xml"): filename_json = filename.replace(".xml", "."+target+".json") - # prioretize TEI XML because better quality and faster - if not use_datastet: - filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml")) - filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml")) - if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): - # we have a TEI file, so if the current filename is not this TEI, we skip - if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): - continue + # prioritize TEI XML because better quality and faster + filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml")) + filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml")) + if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): + # we have a TEI file, so if the current filename is not this TEI, we skip + if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): + continue # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF # because processing PDF allows bounding box coordinates in the results for software mentions @@ -435,25 +430,24 @@ def reprocess_failed(self, directory, use_datastet=False): continue # prioretize TEI XML because better quality and faster - if not use_datastet: - filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml")) - filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml")) - if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): - # we have a TEI file, so if the current filename is not this TEI, we skip - if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): - continue + filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml")) + filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml")) + if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): + # we have a TEI file, so if the current filename is not this TEI, we skip + if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): + continue # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF # because processing PDF allows bounding box coordinates in the results for software mentions # which is an added value # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in # PDF coordinates - if not use_datastet: - filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml")) - filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf")) - if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf): - if filename.endswith(".grobid.tei.xml"): - continue + + filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml")) + filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf")) + if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf): + if filename.endswith(".grobid.tei.xml"): + continue sha1 = getSHA1(os.path.join(root,filename)) From 5d79f8afe07a3b86086b4bc28871f5d43c1126fc Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 13 Oct 2024 15:48:44 +0200 Subject: [PATCH 2/9] apply the correct URL when using datastet --- software_mentions_client/client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index 1b411cf..ed10f30 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -616,7 +616,10 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): url += endpoint_datastet_pdf elif file_in.endswith('.tei.xml'): the_file = {'input': open(file_in, 'rb')} - url += endpoint_tei + if not use_datastet: + url += endpoint_tei + else: + url += endpoint_datastet_tei elif file_in.endswith('.xml'): the_file = {'input': open(file_in, 'rb')} # check if we have an XML file or a TEI file to select the best endpoint From 719928bcb8c092b874f16c7ddf172ec4826c00a0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 20 Nov 2024 17:43:54 +0000 Subject: [PATCH 3/9] add also time/document in the runtime --- software_mentions_client/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index ed10f30..cefbad4 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -251,7 +251,7 @@ def annotate_directory(self, directory, force=False, use_datastet=False): out_files = [] full_records = [] runtime = round(time.time() - start_time, 3) - sys.stdout.write("\rtotal process: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s ") + sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s / {str(round(runtime/nb_total, 2))} s/files") sys.stdout.flush() # last batch @@ -259,7 +259,7 @@ def annotate_directory(self, directory, force=False, use_datastet=False): self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet) nb_total += len(pdf_files) runtime = round(time.time() - start_time, 3) - sys.stdout.write("\rtotal process: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s ") + sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s / {str(round(runtime/nb_total, 2))} s/files") sys.stdout.flush() """ From f9f5dc40bc5e48ff71e823b721a8ba0be3081c6d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 16 Jan 2025 08:23:38 +0900 Subject: [PATCH 4/9] update client --- software_mentions_client/client.py | 609 +++++++++++++++-------------- 1 file changed, 317 insertions(+), 292 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index cefbad4..38bba4a 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -24,7 +24,7 @@ import logging.handlers import pkgutil -map_size = 100 * 1024 * 1024 * 1024 +map_size = 100 * 1024 * 1024 * 1024 # default endpoint endpoint_pdf = 'service/annotateSoftwarePDF' @@ -39,6 +39,7 @@ # default logging settings logging.basicConfig(filename='client.log', filemode='w', level=logging.DEBUG) + class software_mentions_client(object): """ Python client for using the Softcite software mention service. @@ -46,7 +47,7 @@ class software_mentions_client(object): def __init__(self, config_path='./config.json', use_datastet=False): self.config = None - + # standard lmdb environment for keeping track of the status of processing self.env_software = None @@ -56,7 +57,8 @@ def __init__(self, config_path='./config.json', use_datastet=False): self._load_config(config_path) self._init_lmdb(use_datastet=use_datastet) - if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len(self.config['bucket_name']) > 0: + if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len( + self.config['bucket_name']) > 0: self.s3 = software_mentions_client.S3.S3(self.config) self.mongo_db_software = None @@ -68,7 +70,7 @@ def __init__(self, config_path='./config.json', use_datastet=False): blacktext = pkgutil.get_data(__name__, "resources/covid_blacklist.txt").decode() blacktext_lines = blacktext.split("\n") - #with open ("resources/covid_blacklist.txt", "r") as blackfile: + # with open ("resources/covid_blacklist.txt", "r") as blackfile: for line in blacktext_lines: line = line.replace(" ", "").strip() if not line.startswith("#"): @@ -78,7 +80,7 @@ def __init__(self, config_path='./config.json', use_datastet=False): self.scorched_earth = False logs_filename = "client.log" - if "log_file" in self.config: + if "log_file" in self.config: logs_filename = self.config['log_file'] logs_level = logging.DEBUG @@ -110,13 +112,13 @@ def service_isalive(self, use_datastet=False): # test if Softcite software mention recognizer is up and running... the_url = "" the_name = "" - + if use_datastet: - if "dataset_mention_url" in self.config and len(self.config["dataset_mention_url"])>0: + if "dataset_mention_url" in self.config and len(self.config["dataset_mention_url"]) > 0: the_url = self.config["dataset_mention_url"] the_name = "dataset" else: - if "software_mention_url" in self.config and len(self.config["software_mention_url"])>0: + if "software_mention_url" in self.config and len(self.config["software_mention_url"]) > 0: the_url = self.config["software_mention_url"] the_name = "software" @@ -128,13 +130,14 @@ def service_isalive(self, use_datastet=False): r = requests.get(the_url) if r.status_code != 200: - logging.error(the_name + ' mention extraction server does not appear up and running ' + str(r.status_code)) + logging.error( + the_name + ' mention extraction server does not appear up and running ' + str(r.status_code)) else: logging.info(the_name + " mention extraction is up and running") return True - except: - logging.error(the_name + ' mention extraction server does not appear up and running: ' + - 'test call to service, please check and re-start a server.') + except: + logging.error(the_name + ' mention extraction server does not appear up and running: ' + + 'test call to service, please check and re-start a server.') return False def _init_lmdb(self, use_datastet=False): @@ -146,8 +149,8 @@ def _init_lmdb(self, use_datastet=False): envFilePath = os.path.join(self.config["data_path"], 'entries_software') self.env_software = lmdb.open(envFilePath, map_size=map_size) - #envFilePath = os.path.join(self.config["data_path"], 'fail_software') - #self.env_fail_software = lmdb.open(envFilePath, map_size=map_size) + # envFilePath = os.path.join(self.config["data_path"], 'fail_software') + # self.env_fail_software = lmdb.open(envFilePath, map_size=map_size) def annotate_directory(self, directory, force=False, use_datastet=False): ''' @@ -169,99 +172,111 @@ def annotate_directory(self, directory, force=False, use_datastet=False): for root, directories, filenames in os.walk(directory): for filename in filenames: - if filename.endswith(".pdf") or filename.endswith(".PDF") or filename.endswith(".pdf.gz") or filename.endswith(".xml"): - if filename.endswith(".pdf"): - filename_json = filename.replace(".pdf", "."+target+".json") - elif filename.endswith(".pdf.gz"): - filename_json = filename.replace(".pdf.gz", "."+target+".json") - elif filename.endswith(".PDF"): - filename_json = filename.replace(".PDF", "."+target+".json") - elif filename.endswith(".pub2tei.tei.xml"): - filename_json = filename.replace(".pub2tei.tei.xml", "."+target+".json") - elif filename.endswith(".latex.tei.xml"): - filename_json = filename.replace(".latex.tei.xml", "."+target+".json") - elif filename.endswith(".grobid.tei.xml"): - filename_json = filename.replace(".grobid.tei.xml", "."+target+".json") - elif filename.endswith(".xml"): - filename_json = filename.replace(".xml", "."+target+".json") - - # prioritize TEI XML because better quality and faster - filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml")) - filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml")) - if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): - # we have a TEI file, so if the current filename is not this TEI, we skip - if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): - continue - # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF - # because processing PDF allows bounding box coordinates in the results for software mentions - # which is an added value - # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in - # PDF coordinates - filename_tei3 = os.path.join(root, filename_json.replace("."+target+".json", ".grobid.tei.xml")) - filename_pdf = os.path.join(root, filename_json.replace("."+target+".json", ".pdf")) - if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf): - if filename.endswith(".grobid.tei.xml"): - continue + filename_json = self.compute_output_filename(filename, target) + if not filename_json: + continue - sha1 = getSHA1(os.path.join(root,filename)) + # prioritize TEI XML because better quality and faster + filename_tei1 = os.path.join(root, filename_json.replace("." + target + ".json", ".pub2tei.tei.xml")) + filename_tei2 = os.path.join(root, filename_json.replace("." + target + ".json", ".latex.tei.xml")) + if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): + # we have a TEI file, so if the current filename is not this TEI, we skip + if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): + continue - # if the json file already exists and not force, we skip - if os.path.isfile(os.path.join(root, filename_json)) and not force: - # check that this id is considered in the lmdb keeping track of the process + # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF + # because processing PDF allows bounding box coordinates in the results for software mentions + # which is an added value + # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in + # PDF coordinates + filename_tei3 = os.path.join(root, filename_json.replace("." + target + ".json", ".grobid.tei.xml")) + filename_pdf = os.path.join(root, filename_json.replace("." + target + ".json", ".pdf")) + if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf): + if filename.endswith(".grobid.tei.xml"): + continue - if use_datastet: - with self.env_dataset.begin() as txn: - status = txn.get(sha1.encode(encoding='UTF-8')) - if status is None: - with self.env_dataset.begin(write=True) as txn2: - txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) - continue - else: - with self.env_software.begin() as txn: - status = txn.get(sha1.encode(encoding='UTF-8')) - if status is None: - with self.env_software.begin(write=True) as txn2: - txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) - continue + sha1 = getSHA1(os.path.join(root, filename)) + + # if the json file already exists and not force, we skip + if os.path.isfile(os.path.join(root, filename_json)) and not force: + # check that this id is considered in the lmdb keeping track of the process - # if identifier already processed successfully in the local lmdb, we skip - # the hash of the fulltext file is used as unique identifier for the document (SHA1) if use_datastet: with self.env_dataset.begin() as txn: status = txn.get(sha1.encode(encoding='UTF-8')) - if status is not None and not force: - continue + if status is None: + with self.env_dataset.begin(write=True) as txn2: + txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) + continue else: with self.env_software.begin() as txn: status = txn.get(sha1.encode(encoding='UTF-8')) - if status is not None and not force: - continue + if status is None: + with self.env_software.begin(write=True) as txn2: + txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) + continue - pdf_files.append(os.path.join(root,filename)) - out_files.append(os.path.join(root, filename_json)) - record = {} - record["id"] = sha1 - full_records.append(record) - - if len(pdf_files) == self.config["batch_size"]: - self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet) - nb_total += len(pdf_files) - pdf_files = [] - out_files = [] - full_records = [] - runtime = round(time.time() - start_time, 3) - sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s / {str(round(runtime/nb_total, 2))} s/files") - sys.stdout.flush() + # if identifier already processed successfully in the local lmdb, we skip + # the hash of the fulltext file is used as unique identifier for the document (SHA1) + if use_datastet: + with self.env_dataset.begin() as txn: + status = txn.get(sha1.encode(encoding='UTF-8')) + if status is not None and not force: + continue + else: + with self.env_software.begin() as txn: + status = txn.get(sha1.encode(encoding='UTF-8')) + if status is not None and not force: + continue + + pdf_files.append(os.path.join(root, filename)) + out_files.append(os.path.join(root, filename_json)) + record = {} + record["id"] = sha1 + full_records.append(record) + + if len(pdf_files) == self.config["batch_size"]: + self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet) + nb_total += len(pdf_files) + pdf_files = [] + out_files = [] + full_records = [] + runtime = round(time.time() - start_time, 3) + sys.stdout.write( + f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total / runtime, 2))} files/s / {str(round(runtime / nb_total, 2))} s/files") + sys.stdout.flush() # last batch if len(pdf_files) > 0: self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet) nb_total += len(pdf_files) runtime = round(time.time() - start_time, 3) - sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s / {str(round(runtime/nb_total, 2))} s/files") + sys.stdout.write( + f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total / runtime, 2))} files/s / {str(round(runtime / nb_total, 2))} s/files") sys.stdout.flush() + @staticmethod + def compute_output_filename(filename, target): + if filename.endswith(".pdf"): + filename_json = filename.replace(".pdf", "." + target + ".json") + elif filename.endswith(".pdf.gz"): + filename_json = filename.replace(".pdf.gz", "." + target + ".json") + elif filename.endswith(".PDF"): + filename_json = filename.replace(".PDF", "." + target + ".json") + elif filename.endswith(".pub2tei.tei.xml"): + filename_json = filename.replace(".pub2tei.tei.xml", "." + target + ".json") + elif filename.endswith(".latex.tei.xml"): + filename_json = filename.replace(".latex.tei.xml", "." + target + ".json") + elif filename.endswith(".grobid.tei.xml"): + filename_json = filename.replace(".grobid.tei.xml", "." + target + ".json") + elif filename.endswith(".xml"): + filename_json = filename.replace(".xml", "." + target + ".json") + else: + filename_json = None + + return filename_json + """ def annotate_collection(self, data_path, force=False, use_datastet=False): ''' @@ -336,10 +351,11 @@ def annotate_collection(self, data_path, force=False, use_datastet=False): def annotate_batch(self, pdf_files, out_files=None, full_records=None, use_datastet=False): # process a provided list of PDF with ThreadPoolExecutor(max_workers=self.config["concurrency"]) as executor: - #with ProcessPoolExecutor(max_workers=self.config["concurrency"]) as executor: + # with ProcessPoolExecutor(max_workers=self.config["concurrency"]) as executor: # note: ProcessPoolExecutor will not work due to env objects that can't be serailized (e.g. LMDB variables) # client is not cpu bounded but io bounded, so normally it's still okay with threads and GIL - executor.map(self.annotate, pdf_files, out_files, full_records, [use_datastet]*len(pdf_files), timeout=self.config["timeout"]) + executor.map(self.annotate, pdf_files, out_files, full_records, [use_datastet] * len(pdf_files), + timeout=self.config["timeout"]) def reprocess_failed(self, directory, use_datastet=False): """ @@ -404,97 +420,77 @@ def reprocess_failed(self, directory, use_datastet=False): for root, directories, filenames in os.walk(directory): for filename in filenames: - if filename.endswith(".pdf") or filename.endswith(".PDF") or filename.endswith(".pdf.gz") or filename.endswith(".xml"): + filename_json = self.compute_output_filename(filename, target) + if not filename_json: + continue - # always skip XML files when using datastet service currently - if filename.endswith(".xml") and use_datastet: - continue + # if the json file already exists, we skip + if os.path.isfile(os.path.join(root, filename_json)): + continue - if filename.endswith(".pdf"): - filename_json = filename.replace(".pdf", "."+target+".json") - elif filename.endswith(".pdf.gz"): - filename_json = filename.replace(".pdf.gz", "."+target+".json") - elif filename.endswith(".PDF"): - filename_json = filename.replace(".PDF", "."+target+".json") - elif filename.endswith(".pub2tei.tei.xml"): - filename_json = filename.replace(".pub2tei.tei.xml", "."+target+".json") - elif filename.endswith(".latex.tei.xml"): - filename_json = filename.replace(".latex.tei.xml", "."+target+".json") - elif filename.endswith(".grobid.tei.xml"): - filename_json = filename.replace(".grobid.tei.xml", "."+target+".json") - elif filename.endswith(".xml"): - filename_json = filename.replace(".xml", "."+target+".json") - - # if the json file already exists, we skip - if os.path.isfile(os.path.join(root, filename_json)): + # prioritize TEI XML because better quality and faster + filename_tei1 = os.path.join( + root, filename_json.replace("." + target + ".json", ".pub2tei.tei.xml") + ) + filename_tei2 = os.path.join(root, filename_json.replace("." + target + ".json", ".latex.tei.xml")) + if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): + # we have a TEI file, so if the current filename is not this TEI, we skip + if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): continue - # prioretize TEI XML because better quality and faster - filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml")) - filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml")) - if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2): - # we have a TEI file, so if the current filename is not this TEI, we skip - if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"): - continue - - # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF - # because processing PDF allows bounding box coordinates in the results for software mentions - # which is an added value - # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in - # PDF coordinates + # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF + # because processing PDF allows bounding box coordinates in the results for software mentions + # which is an added value + # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in + # PDF coordinates - filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml")) - filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf")) - if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf): - if filename.endswith(".grobid.tei.xml"): - continue - - sha1 = getSHA1(os.path.join(root,filename)) + filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml")) + filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf")) + if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf): + if filename.endswith(".grobid.tei.xml"): + continue - pdf_files.append(os.path.join(root,filename)) - out_files.append(os.path.join(root, filename_json)) + sha1 = getSHA1(os.path.join(root, filename)) - if filename.endswith(".pdf"): - json_file = os.path.join(root, filename.replace(".pdf", ".json")) - elif filename.endswith(".xml"): - json_file = os.path.join(root, filename.replace(".xml", ".json")) - elif filename.endswith(".pdf.gz"): - json_file = os.path.join(root, filename.replace(".pdf.gz", ".json")) - elif filename.endswith(".PDF"): - json_file = os.path.join(root, filename.replace(".PDF", ".json")) + pdf_files.append(os.path.join(root, filename)) + json_file = os.path.join(root, filename_json) + out_files.append(json_file) - if os.path.isfile(json_file): - with open(json_file) as f: - full_record = json.load(f) - if full_record["id"] == sha1: - full_records.append(full_record) - else: - record = {} - record["id"] = sha1 - full_records.append(record) + if os.path.isfile(json_file): + with open(json_file) as f: + full_record = json.load(f) + if full_record["id"] == sha1: + full_records.append(full_record) else: record = {} record["id"] = sha1 full_records.append(record) - i += 1 - - if i == self.config["batch_size"]: - self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet) - nb_total += len(pdf_files) - pdf_files = [] - out_files = [] - full_records = [] - i = 0 - runtime = round(time.time() - start_time, 3) - sys.stdout.write("\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s ") - sys.stdout.flush() + else: + record = {} + record["id"] = sha1 + full_records.append(record) + i += 1 + + if i == self.config["batch_size"]: + self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet) + nb_total += len(pdf_files) + pdf_files = [] + out_files = [] + full_records = [] + i = 0 + runtime = round(time.time() - start_time, 3) + sys.stdout.write("\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str( + runtime) + " s - " + str(round(nb_total / runtime, 2)) + " files/s ") + sys.stdout.flush() # last batch for every cases if len(pdf_files) > 0: self.annotate_batch(pdf_files, out_files, full_records) nb_total += len(pdf_files) runtime = round(time.time() - start_time, 3) - sys.stdout.write("\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s ") + sys.stdout.write( + "\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str( + round(nb_total / runtime, 2)) + " files/s ") sys.stdout.flush() logging.info("re-processed: " + str(nb_total) + " entries") @@ -524,7 +520,7 @@ def reset(self, use_datastet=False): self._init_lmdb(use_datastet=False) def load_mongo(self, directory): - if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0: + if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0: mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"])) self.mongo_db_software = mongo_client[self.config["mongo_db_software"]] self.mongo_db_dataset = mongo_client[self.config["mongo_db_dataset"]] @@ -534,15 +530,15 @@ def load_mongo(self, directory): failed = 0 for root, directories, filenames in os.walk(directory): - for filename in filenames: + for filename in filenames: if filename.endswith(".software.json") or filename.endswith(".dataset.json"): - print(os.path.join(root,filename)) + print(os.path.join(root, filename)) - the_json = open(os.path.join(root,filename)).read() + the_json = open(os.path.join(root, filename)).read() try: jsonObject = json.loads(the_json) except: - print("the json parsing of the following file failed: ", os.path.join(root,filename)) + print("the json parsing of the following file failed: ", os.path.join(root, filename)) continue local_id = None @@ -564,14 +560,15 @@ def load_mongo(self, directory): # possibly clean original file path if "original_file_path" in jsonObject: if jsonObject["original_file_path"].startswith('../biblio-glutton-harvester/'): - jsonObject["original_file_path"] = jsonObject["original_file_path"].replace('../biblio-glutton-harvester/', '') - + jsonObject["original_file_path"] = jsonObject["original_file_path"].replace( + '../biblio-glutton-harvester/', '') + # update metadata via biblio-glutton (this is to be done for mongo upload from file only) - if "biblio_glutton_url" in self.config and len(self.config["biblio_glutton_url"].strip())>0: - if 'metadata' in jsonObject and 'doi' in jsonObject['metadata']: + if "biblio_glutton_url" in self.config and len(self.config["biblio_glutton_url"].strip()) > 0: + if 'metadata' in jsonObject and 'doi' in jsonObject['metadata']: try: glutton_metadata = self.biblio_glutton_lookup(doi=jsonObject['metadata']['doi']) - except: + except: print("the call to biblio-glutton failed for", jsonObject['metadata']['doi']) failed += 1 continue @@ -630,10 +627,10 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): except: logging.exception("input file appears invalid: " + file_in) return - + jsonObject = None try: - response = requests.post(url, files=the_file, data = {'disambiguate': 1}, timeout=self.config["timeout"]) + response = requests.post(url, files=the_file, data={'disambiguate': 1}, timeout=self.config["timeout"]) if response.status_code == 503: logging.info('service overloaded, sleep ' + str(self.config['sleep_time']) + ' seconds') time.sleep(self.config['sleep_time']) @@ -650,7 +647,8 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): # note: in case the recognizer has found no software in the document, it will still return # a json object as result, without mentions, but with MD5 and page information else: - logging.error('Unexpected Error: [HTTP {0}]: Content: {1}'.format(response.status_code, response.content)) + logging.error( + 'Unexpected Error: [HTTP {0}]: Content: {1}'.format(response.status_code, response.content)) except requests.exceptions.Timeout: logging.exception("The request to the annotation service has timeout") @@ -665,7 +663,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): # add file, DOI, date and version info in the JSON, if available if full_record is not None: jsonObject['id'] = full_record['id'] - #if len(full_record) > 1: + # if len(full_record) > 1: jsonObject['metadata'] = full_record; jsonObject['original_file_path'] = file_in jsonObject['file_name'] = os.path.basename(file_in) @@ -683,7 +681,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): new_mentions.append(mention) jsonObject['mentions'] = new_mentions - if file_out is not None: + if file_out is not None: # we write the json result into a file together with the processed pdf with open(file_out, "w", encoding="utf-8") as json_file: json_file.write(json.dumps(jsonObject)) @@ -698,7 +696,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): elif jsonObject is not None: # we have no software mention in the document, we still write an empty result file # along with the PDF/medtadata files to easily keep track of the processing for this doc - if file_out is not None: + if file_out is not None: # force empty explicit no mentions jsonObject['mentions'] = [] with open(file_out, "w", encoding="utf-8") as json_file: @@ -710,7 +708,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): if self.env_dataset is not None and full_record is not None: with self.env_dataset.begin(write=True) as txn: if jsonObject is not None: - txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) + txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) else: # the process failed txn.put(full_record['id'].encode(encoding='UTF-8'), "False".encode(encoding='UTF-8')) @@ -718,7 +716,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): if self.env_software is not None and full_record is not None: with self.env_software.begin(write=True) as txn: if jsonObject is not None: - txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) + txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) else: # the process failed txn.put(full_record['id'].encode(encoding='UTF-8'), "False".encode(encoding='UTF-8')) @@ -726,7 +724,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False): if self.scorched_earth and jsonObject is not None: # processed is done, remove local document file try: - os.remove(file_in) + os.remove(file_in) except: logging.exception("Error while deleting file " + file_in) @@ -738,9 +736,9 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d """ nb_total = 0 nb_fail = 0 - nb_success = 0 + nb_success = 0 - if self.env_software is not None: + if self.env_software is not None: with self.env_software.begin() as txn: cursor = txn.cursor() for key, value in cursor: @@ -751,7 +749,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d else: nb_fail += 1 else: - if self.env_dataset is not None: + if self.env_dataset is not None: with self.env_dataset.begin() as txn: cursor = txn.cursor() for key, value in cursor: @@ -773,50 +771,51 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d if full_diagnostic_mongo: # check mongodb access - if mongodb is not used or available, we don't go further if self.mongo_db_software is None: - if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0: + if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0: mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"])) self.mongo_db_software = mongo_client[self.config["mongo_db_software"]] if self.mongo_db_software is None: - print("MongoDB server is not available for more advanced statistics") + print("MongoDB server is not available for more advanced statistics") return print("MongoDB - number of documents: ", self.mongo_db_software.documents.count_documents({})) print("MongoDB - number of software mentions: ", self.mongo_db_software.annotations.count_documents({})) - result = self.mongo_db_software.annotations.find( {"software-name": {"$exists": True}} ) + result = self.mongo_db_software.annotations.find({"software-name": {"$exists": True}}) print("\t * with software name:", result.count()) - - result = self.mongo_db_software.annotations.find( {"version": {"$exists": True}} ) + + result = self.mongo_db_software.annotations.find({"version": {"$exists": True}}) print("\t * with version:", result.count()) - result = self.mongo_db_software.annotations.find( {"publisher": {"$exists": True}} ) + result = self.mongo_db_software.annotations.find({"publisher": {"$exists": True}}) print("\t * with publisher:", result.count()) - result = self.mongo_db_software.annotations.find( {"url": {"$exists": True}} ) - print("\t * with url:", result.count()) + result = self.mongo_db_software.annotations.find({"url": {"$exists": True}}) + print("\t * with url:", result.count()) - results = self.mongo_db_software.annotations.find( {"references": {"$exists": True}} ) + results = self.mongo_db_software.annotations.find({"references": {"$exists": True}}) nb_ref = 0 has_ref = 0 for result in results: has_ref += 1 the_references = result.get("references") nb_ref += len(the_references) - - print("\t * with at least one reference", nb_ref) - print("\t * total references", nb_ref) - print("MongoDB - number of bibliographical references: ", self.mongo_db_software.references.count_documents({})) + print("\t * with at least one reference", nb_ref) + print("\t * total references", nb_ref) + + print("MongoDB - number of bibliographical references: ", + self.mongo_db_software.references.count_documents({})) - result = self.mongo_db_software.references.find( {"tei": {"$regex": "DOI"}} ) - print("\t * with DOI:", result.count()) + result = self.mongo_db_software.references.find({"tei": {"$regex": "DOI"}}) + print("\t * with DOI:", result.count()) - result = self.mongo_db_software.references.find( {"tei": {"$regex": "PMID"}} ) - print("\t * with PMID:", result.count()) + result = self.mongo_db_software.references.find({"tei": {"$regex": "PMID"}}) + print("\t * with PMID:", result.count()) - result = self.mongo_db_software.references.find( {"tei": {"$regex": "PMC"}} ) - print("\t * with PMC ID:", result.count()) + result = self.mongo_db_software.references.find({"tei": {"$regex": "PMC"}}) + print("\t * with PMC ID:", result.count()) print("---") elif full_diagnostic_files: # in this mode, we go through the produced json files to retrieve information @@ -857,7 +856,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d nb_dataset_publisher = 0 nb_dataset_url = 0 nb_dataset_version = 0 - + nb_dataset_mention_with_ref = 0 nb_dataset_documents = 0 @@ -865,16 +864,16 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d nbDatasetFiles = 0 for root, directories, filenames in os.walk(directory): - - for filename in filenames: + + for filename in filenames: if filename.endswith(".software.json"): nb_documents += 1 - #print(os.path.join(root,filename)) - the_json = open(os.path.join(root,filename)).read() + # print(os.path.join(root,filename)) + the_json = open(os.path.join(root, filename)).read() try: jsonObject = json.loads(the_json) except: - print("the json parsing of the following file failed: ", os.path.join(root,filename)) + print("the json parsing of the following file failed: ", os.path.join(root, filename)) continue nbSoftwareFiles += 1 @@ -883,7 +882,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d sys.stdout.write("\rFiles visited: %i" % nbSoftwareFiles) sys.stdout.flush() - if "mentions" in jsonObject and len(jsonObject["mentions"])>0: + if "mentions" in jsonObject and len(jsonObject["mentions"]) > 0: nb_documents_with_software += 1 for mention in jsonObject["mentions"]: @@ -911,7 +910,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d nb_ref_marker += len(mention["references"]) nb_software_mention_with_ref += 1 - if "references" in jsonObject and len(jsonObject["references"])>0: + if "references" in jsonObject and len(jsonObject["references"]) > 0: nb_ref += len(jsonObject["references"]) # like with mongodb queries, we can use simple matching to count PID in full references @@ -927,12 +926,12 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d elif filename.endswith(".dataset.json"): nb_dataset_documents += 1 - #print(os.path.join(root,filename)) - the_json = open(os.path.join(root,filename)).read() + # print(os.path.join(root,filename)) + the_json = open(os.path.join(root, filename)).read() try: jsonObject = json.loads(the_json) except: - print("the json parsing of the following file failed: ", os.path.join(root,filename)) + print("the json parsing of the following file failed: ", os.path.join(root, filename)) continue nbDatasetFiles += 1 @@ -941,7 +940,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d sys.stdout.write("\rFiles visited: %i" % nbDatasetFiles) sys.stdout.flush() - if "mentions" in jsonObject and len(jsonObject["mentions"])>0: + if "mentions" in jsonObject and len(jsonObject["mentions"]) > 0: nb_documents_with_dataset += 1 for mention in jsonObject["mentions"]: @@ -950,7 +949,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d nb_dataset_implicit += 1 elif mention["type"] == "dataset-name": nb_dataset_name += 1 - + if mention["type"] == "data-device" or "data-device" in mention: nb_data_device += 1 @@ -965,8 +964,8 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d nb_dataset_ref_marker += len(mention["references"]) nb_dataset_mention_with_ref += 1 - if "references" in jsonObject and len(jsonObject["references"])>0: - + if "references" in jsonObject and len(jsonObject["references"]) > 0: + nb_dataset_ref += len(jsonObject["references"]) # like with mongodb queries, we can use simple matching to count PID in full references @@ -982,9 +981,10 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d # report results if nb_documents > 0: - print("\n\n--- SOFTWARE MENTIONS ---") + print("\n\n--- SOFTWARE MENTIONS ---") print("JSON files - number of documents: ", nb_documents) - print("JSON files - number of documents with at least one software mention: ", nb_documents_with_software) + print("JSON files - number of documents with at least one software mention: ", + nb_documents_with_software) print("JSON files - number of software mentions: ", nb_software) nb_standalone = nb_software - (nb_environment + nb_component + nb_implicit) print("\t -> subtype standalone:", nb_standalone) @@ -994,33 +994,33 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d print("\t * with software name:", nb_software) print("\t * with version:", nb_version) print("\t * with publisher:", nb_publisher) - print("\t * with url:", nb_url) - print("\t * with programming language:", nb_language) - print("\t * mentions with at least one reference", nb_software_mention_with_ref) - print("---") + print("\t * with url:", nb_url) + print("\t * with programming language:", nb_language) + print("\t * mentions with at least one reference", nb_software_mention_with_ref) + print("---") print("JSON files - number of bibliographical reference markers: ", nb_ref_marker) print("JSON files - number of bibliographical references: ", nb_ref) - print("\t * with DOI:", nb_ref_with_doi) - print("\t * with PMID:", nb_ref_with_pmid) - print("\t * with PMC ID:", nb_ref_with_pmcid) - print("---") + print("\t * with DOI:", nb_ref_with_doi) + print("\t * with PMID:", nb_ref_with_pmid) + print("\t * with PMC ID:", nb_ref_with_pmcid) + print("---") if nb_dataset_documents > 0: - print("\n\n--- DATASET MENTIONS ---") + print("\n\n--- DATASET MENTIONS ---") print("JSON files - number of documents: ", nb_dataset_documents) print("JSON files - number of documents with at least one dataset mention: ", nb_documents_with_dataset) print("JSON files - number of named dataset mentions: ", nb_dataset_name) print("JSON files - number of implicit dataset mentions: ", nb_dataset_implicit) print("JSON files - number of data device mentions: ", nb_data_device) - print("\t * with url:", nb_dataset_url) - print("\t * mentions with at least one reference", nb_dataset_mention_with_ref) - print("---") + print("\t * with url:", nb_dataset_url) + print("\t * mentions with at least one reference", nb_dataset_mention_with_ref) + print("---") print("JSON files - number of bibliographical reference markers: ", nb_dataset_ref_marker) print("JSON files - number of bibliographical references: ", nb_dataset_ref) - print("\t * with DOI:", nb_dataset_ref_with_doi) - print("\t * with PMID:", nb_dataset_ref_with_pmid) - print("\t * with PMC ID:", nb_dataset_ref_with_pmcid) - print("---") + print("\t * with DOI:", nb_dataset_ref_with_doi) + print("\t * with PMID:", nb_dataset_ref_with_pmid) + print("\t * with PMC ID:", nb_dataset_ref_with_pmcid) + print("---") def _insert_mongo(self, jsonObject, target="software"): if not "id" in jsonObject: @@ -1029,23 +1029,27 @@ def _insert_mongo(self, jsonObject, target="software"): local_mongo_db = None if target == "software": if self.mongo_db_software is None and "mongo_db_software" in self.config: - if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0: + if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0: try: - mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]), serverSelectionTimeoutMS=1000) + mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]), + serverSelectionTimeoutMS=1000) mongo_client.server_info() self.mongo_db_software = mongo_client[self.config["mongo_db_software"]] except: - print("Fail to connect to the MongoDb server:", self.config["mongo_host"]+":"+self.config["mongo_port"]) + print("Fail to connect to the MongoDb server:", + self.config["mongo_host"] + ":" + self.config["mongo_port"]) local_mongo_db = self.mongo_db_software elif target == "dataset": if self.mongo_db_dataset is None and "mongo_db_dataset" in self.config: - if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0: + if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0: try: - mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]), serverSelectionTimeoutMS=1000) + mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]), + serverSelectionTimeoutMS=1000) mongo_client.server_info() self.mongo_db_dataset = mongo_client[self.config["mongo_db_dataset"]] except: - print("Fail to connect to the MongoDb server:", self.config["mongo_host"]+":"+self.config["mongo_port"]) + print("Fail to connect to the MongoDb server:", + self.config["mongo_host"] + ":" + self.config["mongo_port"]) local_mongo_db = self.mongo_db_dataset if local_mongo_db == None: @@ -1053,15 +1057,15 @@ def _insert_mongo(self, jsonObject, target="software"): return # check if the article/annotations are not already present - if local_mongo_db.documents.count_documents({ 'id': jsonObject['id'] }, limit = 1) != 0: + if local_mongo_db.documents.count_documents({'id': jsonObject['id']}, limit=1) != 0: # if yes we replace this object, its annotations and references - result = local_mongo_db.documents.find_one({ 'id': jsonObject['id'] }) + result = local_mongo_db.documents.find_one({'id': jsonObject['id']}) _id = result['_id'] - local_mongo_db.annotations.delete_many( {'document': _id} ) - local_mongo_db.references.delete_many( {'document': _id} ) - result = local_mongo_db.documents.delete_one({ 'id': jsonObject['id'] }) - #print ("result:", type(result), "-- deleted count:", result.deleted_count) - + local_mongo_db.annotations.delete_many({'document': _id}) + local_mongo_db.references.delete_many({'document': _id}) + result = local_mongo_db.documents.delete_one({'id': jsonObject['id']}) + # print ("result:", type(result), "-- deleted count:", result.deleted_count) + # clean json jsonObject = _clean_json(jsonObject) @@ -1072,7 +1076,7 @@ def _insert_mongo(self, jsonObject, target="software"): if 'references' in jsonObjectDocument: del jsonObjectDocument['references'] inserted_doc_id = local_mongo_db.documents.insert_one(jsonObjectDocument).inserted_id - + local_ref_map = {} if 'references' in jsonObject: for reference in jsonObject['references']: @@ -1090,7 +1094,6 @@ def _insert_mongo(self, jsonObject, target="software"): reference["reference_id"] = local_ref_map[str(reference["refKey"])] inserted_mention_id = local_mongo_db.annotations.insert_one(mention).inserted_id - def biblio_glutton_lookup(self, doi=None, pmcid=None, pmid=None, istex_id=None, istex_ark=None): """ Lookup on biblio_glutton with the provided strong identifiers, return the full agregated biblio_glutton record @@ -1101,42 +1104,47 @@ def biblio_glutton_lookup(self, doi=None, pmcid=None, pmid=None, istex_id=None, jsonResult = None if "biblio_glutton_url" in self.config and len(self.config["biblio_glutton_url"].strip()) > 0: - biblio_glutton_url = self.config["biblio_glutton_url"]+"/service/lookup?" + biblio_glutton_url = self.config["biblio_glutton_url"] + "/service/lookup?" - if doi is not None and len(doi)>0: + if doi is not None and len(doi) > 0: response = requests.get(biblio_glutton_url, params={'doi': doi}, verify=False, timeout=5) success = (response.status_code == 200) if success: jsonResult = response.json() - if not success and pmid is not None and len(pmid)>0: + if not success and pmid is not None and len(pmid) > 0: response = requests.get(biblio_glutton_url + "pmid=" + pmid, verify=False, timeout=5) success = (response.status_code == 200) if success: - jsonResult = response.json() + jsonResult = response.json() - if not success and pmcid is not None and len(pmcid)>0: - response = requests.get(biblio_glutton_url + "pmc=" + pmcid, verify=False, timeout=5) + if not success and pmcid is not None and len(pmcid) > 0: + response = requests.get(biblio_glutton_url + "pmc=" + pmcid, verify=False, timeout=5) success = (response.status_code == 200) if success: jsonResult = response.json() - if not success and istex_id is not None and len(istex_id)>0: + if not success and istex_id is not None and len(istex_id) > 0: response = requests.get(biblio_glutton_url + "istexid=" + istex_id, verify=False, timeout=5) success = (response.status_code == 200) if success: jsonResult = response.json() - if not success and doi is not None and len(doi)>0 and "crossref_base" in self.config and len(self.config["crossref_base"].strip())>0: + if not success and doi is not None and len(doi) > 0 and "crossref_base" in self.config and len( + self.config["crossref_base"].strip()) > 0: # let's call crossref as fallback for possible X-months gap in biblio-glutton # https://api.crossref.org/works/10.1037/0003-066X.59.1.29 - if "crossref_email" in self.config and len(self.config["crossref_email"].strip())>0: - user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0 (mailto:'+self.config["crossref_email"]+')'} + if "crossref_email" in self.config and len(self.config["crossref_email"].strip()) > 0: + user_agent = { + 'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0 (mailto:' + + self.config["crossref_email"] + ')'} else: - user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0'} + user_agent = { + 'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0'} try: - logging.info("calling... " + self.config["crossref_base"]+"/works/"+doi) - response = requests.get(self.config["crossref_base"]+"/works/"+doi, headers=user_agent, verify=False, timeout=5) + logging.info("calling... " + self.config["crossref_base"] + "/works/" + doi) + response = requests.get(self.config["crossref_base"] + "/works/" + doi, headers=user_agent, + verify=False, timeout=5) if response.status_code == 200: jsonResult = response.json()['message'] # filter out references and re-set doi, in case there are obtained via crossref @@ -1147,9 +1155,10 @@ def biblio_glutton_lookup(self, doi=None, pmcid=None, pmid=None, istex_id=None, jsonResult = None except: logging.exception("Could not connect to CrossRef") - + return jsonResult + def generateStoragePath(identifier): ''' Convert a file name into a path with file prefix as directory paths: @@ -1157,9 +1166,11 @@ def generateStoragePath(identifier): ''' return os.path.join(identifier[:2], identifier[2:4], identifier[4:6], identifier[6:8], "") + def _deserialize_pickle(serialized): return pickle.loads(serialized) + def _clean_json(d): # clean recursively a json for insertion in MongoDB, basically remove keys starting with $ if not isinstance(d, (dict, list)): @@ -1167,20 +1178,23 @@ def _clean_json(d): if isinstance(d, list): return [_clean_json(v) for v in d] return {k: _clean_json(v) for k, v in d.items() - if not k.startswith("$") } + if not k.startswith("$")} + def _is_tei(the_file): # based on the header content of the file, check if we have a TEI XML file with open(the_file) as f: n = 5 - while n>=0: + while n >= 0: first_line = f.readline().strip('\n') if " Date: Sun, 8 Jun 2025 08:10:43 +0200 Subject: [PATCH 5/9] use custom LMDB path if provided --- software_mentions_client/client.py | 62 +++++++++++++++++------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index 38bba4a..14164ee 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -2,27 +2,25 @@ Run the software mention recognizer service on PDF or XML fulltext file collections ''' -import gzip -import sys -import os -import shutil -import json -import pickle -import lmdb import argparse -import time -import datetime -import software_mentions_client.S3 -import concurrent.futures -import requests -import pymongo -from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed +import gzip import hashlib -import copyreg -import types +import json import logging import logging.handlers +import os +import pickle import pkgutil +import shutil +import sys +import time +from concurrent.futures import ThreadPoolExecutor + +import lmdb +import pymongo +import requests + +import software_mentions_client.S3 map_size = 100 * 1024 * 1024 * 1024 @@ -40,12 +38,17 @@ logging.basicConfig(filename='client.log', filemode='w', level=logging.DEBUG) -class software_mentions_client(object): +class SoftwareMentionsClient(object): """ Python client for using the Softcite software mention service. """ - def __init__(self, config_path='./config.json', use_datastet=False): + def __init__( + self, + config_path='./config.json', + use_datastet=False, + data_path=None + ): self.config = None # standard lmdb environment for keeping track of the status of processing @@ -140,14 +143,15 @@ def service_isalive(self, use_datastet=False): 'test call to service, please check and re-start a server.') return False - def _init_lmdb(self, use_datastet=False): + def _init_lmdb(self, use_datastet=False, data_path=None): # open in write mode + root_data_path = self.config["data_path"] if not data_path else data_path if use_datastet: - envFilePath = os.path.join(self.config["data_path"], 'entries_dataset') - self.env_dataset = lmdb.open(envFilePath, map_size=map_size) + env_file_path = os.path.join(root_data_path, 'entries_dataset') + self.env_dataset = lmdb.open(env_file_path, map_size=map_size) else: - envFilePath = os.path.join(self.config["data_path"], 'entries_software') - self.env_software = lmdb.open(envFilePath, map_size=map_size) + env_file_path = os.path.join(root_data_path, 'entries_software') + self.env_software = lmdb.open(env_file_path, map_size=map_size) # envFilePath = os.path.join(self.config["data_path"], 'fail_software') # self.env_fail_software = lmdb.open(envFilePath, map_size=map_size) @@ -1217,7 +1221,10 @@ def getSHA1(the_file): help="a single PDF or XML input file to be processed by the Softcite software mention recognizer") parser.add_argument("--file-out", default=None, help="path to a single output the software mentions in JSON format, extracted from the PDF file-in") - # parser.add_argument("--data-path", default=None, help="path to the resource files created/harvested by biblio-glutton-harvester") + parser.add_argument( + "--data-path", + default=None, + help="path to the directory containing the LMDB database.") parser.add_argument("--config", default="./config.json", help="path to the config file, default is ./config.json") parser.add_argument("--reprocess", action="store_true", help="reprocessed failed PDF or XML fulltexts") parser.add_argument("--reset", action="store_true", @@ -1240,7 +1247,7 @@ def getSHA1(the_file): args = parser.parse_args() - # data_path = args.data_path + data_path = args.data_path config_path = args.config reprocess = args.reprocess reset = args.reset @@ -1253,7 +1260,10 @@ def getSHA1(the_file): scorched_earth = args.scorched_earth use_datastet = args.datastet - client = software_mentions_client(config_path=config_path, use_datastet=use_datastet) + client = SoftwareMentionsClient( + config_path=config_path, + use_datastet=use_datastet, + data_path=data_path) if not load_mongo and not full_diagnostic_mongo and not full_diagnostic_files and not client.service_isalive( use_datastet=use_datastet): From 570a2e11f1b24efaeae12f738a9d10559cd2121d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 8 Jun 2025 11:18:11 +0200 Subject: [PATCH 6/9] use custom LMDB path also when resetting --- software_mentions_client/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index 14164ee..7069cff 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -499,16 +499,18 @@ def reprocess_failed(self, directory, use_datastet=False): logging.info("re-processed: " + str(nb_total) + " entries") - def reset(self, use_datastet=False): + def reset(self, use_datastet=False, data_path=None): """ Remove the local lmdb keeping track of the state of advancement of the annotation and of the failed entries """ + root_data_path = self.config["data_path"] if data_path is None else data_path + if use_datastet: # close environments self.env_dataset.close() - envFilePath = os.path.join(self.config["data_path"], 'entries_dataset') + envFilePath = os.path.join(root_data_path, 'entries_dataset') shutil.rmtree(envFilePath) # re-init the environments @@ -517,7 +519,7 @@ def reset(self, use_datastet=False): # close environments self.env_software.close() - envFilePath = os.path.join(self.config["data_path"], 'entries_software') + envFilePath = os.path.join(root_data_path, 'entries_software') shutil.rmtree(envFilePath) # re-init the environments From a1b0fb1ef8e7fe8dd5714f5e46662c80c8d3b74e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 12 Jun 2025 12:00:10 +0200 Subject: [PATCH 7/9] fix --- software_mentions_client/client.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index 7069cff..4b18408 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -58,7 +58,11 @@ def __init__( self.env_dataset = None self._load_config(config_path) - self._init_lmdb(use_datastet=use_datastet) + + self.root_data_path = self._init_lmdb( + use_datastet=use_datastet, + data_path=data_path + ) if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len( self.config['bucket_name']) > 0: @@ -98,6 +102,7 @@ def __init__( logs_level = logging.CRITICAL else: logs_level = logging.NOTSET + logging.basicConfig(filename=logs_filename, filemode='w', level=logs_level) print("logs are written in " + logs_filename) @@ -143,9 +148,11 @@ def service_isalive(self, use_datastet=False): 'test call to service, please check and re-start a server.') return False - def _init_lmdb(self, use_datastet=False, data_path=None): + def _init_lmdb(self, use_datastet=False, data_path=None) -> str: # open in write mode root_data_path = self.config["data_path"] if not data_path else data_path + + os.makedirs(root_data_path, exist_ok=True) if use_datastet: env_file_path = os.path.join(root_data_path, 'entries_dataset') self.env_dataset = lmdb.open(env_file_path, map_size=map_size) @@ -156,6 +163,8 @@ def _init_lmdb(self, use_datastet=False, data_path=None): # envFilePath = os.path.join(self.config["data_path"], 'fail_software') # self.env_fail_software = lmdb.open(envFilePath, map_size=map_size) + return root_data_path + def annotate_directory(self, directory, force=False, use_datastet=False): ''' recursive directory walk for processing in parallel all PDF and XML documents @@ -510,20 +519,20 @@ def reset(self, use_datastet=False, data_path=None): # close environments self.env_dataset.close() - envFilePath = os.path.join(root_data_path, 'entries_dataset') + envFilePath = os.path.join(self.root_data_path, 'entries_dataset') shutil.rmtree(envFilePath) # re-init the environments - self._init_lmdb(use_datastet=True) + self._init_lmdb(use_datastet=True, data_path=root_data_path) else: # close environments self.env_software.close() - envFilePath = os.path.join(root_data_path, 'entries_software') + envFilePath = os.path.join(self.root_data_path, 'entries_software') shutil.rmtree(envFilePath) # re-init the environments - self._init_lmdb(use_datastet=False) + self._init_lmdb(use_datastet=False, data_path=root_data_path) def load_mongo(self, directory): if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0: From 55e3aad961cb6f6ba0ab069ddf561af04e04c264 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 16 Jun 2025 09:21:20 +0200 Subject: [PATCH 8/9] fix again the db paths --- software_mentions_client/client.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py index 4b18408..6dc6732 100644 --- a/software_mentions_client/client.py +++ b/software_mentions_client/client.py @@ -508,13 +508,11 @@ def reprocess_failed(self, directory, use_datastet=False): logging.info("re-processed: " + str(nb_total) + " entries") - def reset(self, use_datastet=False, data_path=None): + def reset(self, use_datastet=False): """ Remove the local lmdb keeping track of the state of advancement of the annotation and of the failed entries """ - root_data_path = self.config["data_path"] if data_path is None else data_path - if use_datastet: # close environments self.env_dataset.close() @@ -523,7 +521,7 @@ def reset(self, use_datastet=False, data_path=None): shutil.rmtree(envFilePath) # re-init the environments - self._init_lmdb(use_datastet=True, data_path=root_data_path) + self._init_lmdb(use_datastet=True, data_path=self.root_data_path) else: # close environments self.env_software.close() @@ -532,7 +530,7 @@ def reset(self, use_datastet=False, data_path=None): shutil.rmtree(envFilePath) # re-init the environments - self._init_lmdb(use_datastet=False, data_path=root_data_path) + self._init_lmdb(use_datastet=False, data_path=self.root_data_path) def load_mongo(self, directory): if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0: From 948e9579d938e9a7197d31c904e01fc052d9cbad Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Oct 2025 12:42:48 +0100 Subject: [PATCH 9/9] update documentation --- Readme.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Readme.md b/Readme.md index 5ab23a5..32788f3 100644 --- a/Readme.md +++ b/Readme.md @@ -55,7 +55,7 @@ Finally install the project in editable state ``` usage: client.py [-h] [--repo-in REPO_IN] [--file-in FILE_IN] [--file-out FILE_OUT] [--config CONFIG] [--reprocess] [--reset] [--load] [--diagnostic-mongo] [--diagnostic-files] - [--scorched-earth] [--datastet] + [--scorched-earth] [--datastet] [--data-path DATA_PATH] Softcite software mention recognizer client @@ -68,6 +68,7 @@ optional arguments: --file-out FILE_OUT path to a single output the software mentions in JSON format, extracted from the PDF file-in --config CONFIG path to the config file, default is ./config.json + --data-path DATA_PATH path to the directory containing the LMDB database. --reprocess reprocessed failed PDF or XML fulltexts --reset ignore previous processing states and re-init the annotation process from the beginning @@ -137,7 +138,7 @@ Anntations will be added along the PDF and XML files, with extension `*.software -rw-rw-r-- 1 lopez lopez 485 Aug 8 03:41 0100a44b-6f3f-4cf7-86f9-8ef5e8401567.software.json ``` -### Reprocess failed PDF or XML fulltexts +### Reprocess failed PDF or XML fulltext Just add `--reprocess` to the command line, the processing will be limited to the PDF and XML files that failed when processing them: @@ -175,7 +176,7 @@ python3 -m software_mentions_client.client --diagnostic-mongo ## Configuration -By default, the concurreny of the parallelized calls to a service is `8`. This parameter can be changed in the configuration file `config.json`. +By default, the concurrency of the parallelized calls to a service is `8`. This parameter can be changed in the configuration file `config.json`. Other important configuration parameter are the URL of the Software mention recognition web service `software_mention_url`, the optional URL of a DataStet server if used `dataset_mention_url`, the MongoDb instance information if you wish to load the produced annotations in MongoDB.