From 581c18dd2b6f6ecac6ce720a4fc607e51f817c33 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Sun, 13 Oct 2024 15:38:52 +0200
Subject: [PATCH 1/9] add support for processing datastet TEI files

---
 software_mentions_client/client.py | 48 +++++++++++++-----------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index d79e4aa..1b411cf 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -33,6 +33,7 @@
 endpoint_tei = 'service/annotateSoftwareTEI'
 
 endpoint_datastet_pdf = 'service/annotateDatasetPDF'
+endpoint_datastet_tei = 'service/processDatasetTEI'
 endpoint_datastet_txt = 'service/annotateDatasetSentence'
 
 # default logging settings
@@ -56,7 +57,7 @@ def __init__(self, config_path='./config.json', use_datastet=False):
         self._init_lmdb(use_datastet=use_datastet)
 
         if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len(self.config['bucket_name']) > 0:
-            self.s3 = S3.S3(self.config)
+            self.s3 = software_mentions_client.S3.S3(self.config)
 
         self.mongo_db_software = None
         self.mongo_db_dataset = None
@@ -169,11 +170,6 @@ def annotate_directory(self, directory, force=False, use_datastet=False):
         for root, directories, filenames in os.walk(directory):
             for filename in filenames:
                 if filename.endswith(".pdf") or filename.endswith(".PDF") or filename.endswith(".pdf.gz") or filename.endswith(".xml"):
-
-                    # always skip XML files when using datastet service currently
-                    if filename.endswith(".xml") and use_datastet:
-                        continue
-
                     if filename.endswith(".pdf"):
                         filename_json = filename.replace(".pdf", "."+target+".json")
                     elif filename.endswith(".pdf.gz"):
@@ -189,14 +185,13 @@ def annotate_directory(self, directory, force=False, use_datastet=False):
                     elif filename.endswith(".xml"):
                         filename_json = filename.replace(".xml", "."+target+".json")
 
-                    # prioretize TEI XML because better quality and faster
-                    if not use_datastet:
-                        filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml"))
-                        filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml"))
-                        if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
-                            # we have a TEI file, so if the current filename is not this TEI, we skip
-                            if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
-                                continue
+                    # prioritize TEI XML because better quality and faster
+                    filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml"))
+                    filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml"))
+                    if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
+                        # we have a TEI file, so if the current filename is not this TEI, we skip
+                        if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
+                            continue
 
                     # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF
                     # because processing PDF allows bounding box coordinates in the results for software mentions
@@ -435,25 +430,24 @@ def reprocess_failed(self, directory, use_datastet=False):
                         continue
 
                     # prioretize TEI XML because better quality and faster
-                    if not use_datastet:
-                        filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml"))
-                        filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml"))
-                        if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
-                            # we have a TEI file, so if the current filename is not this TEI, we skip
-                            if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
-                                continue
+                    filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml"))
+                    filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml"))
+                    if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
+                        # we have a TEI file, so if the current filename is not this TEI, we skip
+                        if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
+                            continue
 
                     # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF
                     # because processing PDF allows bounding box coordinates in the results for software mentions
                     # which is an added value
                     # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in
                     # PDF coordinates
-                    if not use_datastet:
-                        filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml"))
-                        filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf"))
-                        if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf):
-                            if filename.endswith(".grobid.tei.xml"):
-                                continue
+
+                    filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml"))
+                    filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf"))
+                    if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf):
+                        if filename.endswith(".grobid.tei.xml"):
+                            continue
 
                     sha1 = getSHA1(os.path.join(root,filename))
 

From 5d79f8afe07a3b86086b4bc28871f5d43c1126fc Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Sun, 13 Oct 2024 15:48:44 +0200
Subject: [PATCH 2/9] apply the correct URL when using datastet

---
 software_mentions_client/client.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index 1b411cf..ed10f30 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -616,7 +616,10 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
                     url += endpoint_datastet_pdf
             elif file_in.endswith('.tei.xml'):
                 the_file = {'input': open(file_in, 'rb')}
-                url += endpoint_tei
+                if not use_datastet:
+                    url += endpoint_tei
+                else:
+                    url += endpoint_datastet_tei
             elif file_in.endswith('.xml'):
                 the_file = {'input': open(file_in, 'rb')}
                 # check if we have an XML file or a TEI file to select the best endpoint

From 719928bcb8c092b874f16c7ddf172ec4826c00a0 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Wed, 20 Nov 2024 17:43:54 +0000
Subject: [PATCH 3/9] also report time per document in the runtime output

---
 software_mentions_client/client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index ed10f30..cefbad4 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -251,7 +251,7 @@ def annotate_directory(self, directory, force=False, use_datastet=False):
                         out_files = []
                         full_records = []
                         runtime = round(time.time() - start_time, 3)
-                        sys.stdout.write("\rtotal process: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s  ")
+                        sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s  / {str(round(runtime/nb_total, 2))} s/files")
                         sys.stdout.flush()
 
         # last batch
@@ -259,7 +259,7 @@ def annotate_directory(self, directory, force=False, use_datastet=False):
             self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet)
             nb_total += len(pdf_files)
             runtime = round(time.time() - start_time, 3)
-            sys.stdout.write("\rtotal process: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s  ")
+            sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s  / {str(round(runtime/nb_total, 2))} s/files")
             sys.stdout.flush()
 
     """

From f9f5dc40bc5e48ff71e823b721a8ba0be3081c6d Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 16 Jan 2025 08:23:38 +0900
Subject: [PATCH 4/9] update client

---
 software_mentions_client/client.py | 609 +++++++++++++++--------------
 1 file changed, 317 insertions(+), 292 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index cefbad4..38bba4a 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -24,7 +24,7 @@
 import logging.handlers
 import pkgutil
 
-map_size = 100 * 1024 * 1024 * 1024 
+map_size = 100 * 1024 * 1024 * 1024
 
 # default endpoint
 endpoint_pdf = 'service/annotateSoftwarePDF'
@@ -39,6 +39,7 @@
 # default logging settings
 logging.basicConfig(filename='client.log', filemode='w', level=logging.DEBUG)
 
+
 class software_mentions_client(object):
     """
     Python client for using the Softcite software mention service. 
@@ -46,7 +47,7 @@ class software_mentions_client(object):
 
     def __init__(self, config_path='./config.json', use_datastet=False):
         self.config = None
-        
+
         # standard lmdb environment for keeping track of the status of processing
         self.env_software = None
 
@@ -56,7 +57,8 @@ def __init__(self, config_path='./config.json', use_datastet=False):
         self._load_config(config_path)
         self._init_lmdb(use_datastet=use_datastet)
 
-        if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len(self.config['bucket_name']) > 0:
+        if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len(
+                self.config['bucket_name']) > 0:
             self.s3 = software_mentions_client.S3.S3(self.config)
 
         self.mongo_db_software = None
@@ -68,7 +70,7 @@ def __init__(self, config_path='./config.json', use_datastet=False):
         blacktext = pkgutil.get_data(__name__, "resources/covid_blacklist.txt").decode()
         blacktext_lines = blacktext.split("\n")
 
-        #with open ("resources/covid_blacklist.txt", "r") as blackfile:
+        # with open ("resources/covid_blacklist.txt", "r") as blackfile:
         for line in blacktext_lines:
             line = line.replace(" ", "").strip()
             if not line.startswith("#"):
@@ -78,7 +80,7 @@ def __init__(self, config_path='./config.json', use_datastet=False):
         self.scorched_earth = False
 
         logs_filename = "client.log"
-        if "log_file" in self.config: 
+        if "log_file" in self.config:
             logs_filename = self.config['log_file']
 
         logs_level = logging.DEBUG
@@ -110,13 +112,13 @@ def service_isalive(self, use_datastet=False):
         # test if Softcite software mention recognizer is up and running...
         the_url = ""
         the_name = ""
-            
+
         if use_datastet:
-            if "dataset_mention_url" in self.config and len(self.config["dataset_mention_url"])>0:
+            if "dataset_mention_url" in self.config and len(self.config["dataset_mention_url"]) > 0:
                 the_url = self.config["dataset_mention_url"]
                 the_name = "dataset"
         else:
-            if "software_mention_url" in self.config and len(self.config["software_mention_url"])>0:
+            if "software_mention_url" in self.config and len(self.config["software_mention_url"]) > 0:
                 the_url = self.config["software_mention_url"]
                 the_name = "software"
 
@@ -128,13 +130,14 @@ def service_isalive(self, use_datastet=False):
             r = requests.get(the_url)
 
             if r.status_code != 200:
-                logging.error(the_name + ' mention extraction server does not appear up and running ' + str(r.status_code))
+                logging.error(
+                    the_name + ' mention extraction server does not appear up and running ' + str(r.status_code))
             else:
                 logging.info(the_name + " mention extraction is up and running")
                 return True
-        except: 
-            logging.error(the_name + ' mention extraction server does not appear up and running: ' + 
-                'test call to service, please check and re-start a server.')
+        except:
+            logging.error(the_name + ' mention extraction server does not appear up and running: ' +
+                          'test call to service, please check and re-start a server.')
         return False
 
     def _init_lmdb(self, use_datastet=False):
@@ -146,8 +149,8 @@ def _init_lmdb(self, use_datastet=False):
             envFilePath = os.path.join(self.config["data_path"], 'entries_software')
             self.env_software = lmdb.open(envFilePath, map_size=map_size)
 
-        #envFilePath = os.path.join(self.config["data_path"], 'fail_software')
-        #self.env_fail_software = lmdb.open(envFilePath, map_size=map_size)
+        # envFilePath = os.path.join(self.config["data_path"], 'fail_software')
+        # self.env_fail_software = lmdb.open(envFilePath, map_size=map_size)
 
     def annotate_directory(self, directory, force=False, use_datastet=False):
         '''
@@ -169,99 +172,111 @@ def annotate_directory(self, directory, force=False, use_datastet=False):
 
         for root, directories, filenames in os.walk(directory):
             for filename in filenames:
-                if filename.endswith(".pdf") or filename.endswith(".PDF") or filename.endswith(".pdf.gz") or filename.endswith(".xml"):
-                    if filename.endswith(".pdf"):
-                        filename_json = filename.replace(".pdf", "."+target+".json")
-                    elif filename.endswith(".pdf.gz"):
-                        filename_json = filename.replace(".pdf.gz", "."+target+".json")
-                    elif filename.endswith(".PDF"):
-                        filename_json = filename.replace(".PDF", "."+target+".json")
-                    elif filename.endswith(".pub2tei.tei.xml"):
-                        filename_json = filename.replace(".pub2tei.tei.xml", "."+target+".json")
-                    elif filename.endswith(".latex.tei.xml"):
-                        filename_json = filename.replace(".latex.tei.xml", "."+target+".json")
-                    elif filename.endswith(".grobid.tei.xml"):
-                        filename_json = filename.replace(".grobid.tei.xml", "."+target+".json")
-                    elif filename.endswith(".xml"):
-                        filename_json = filename.replace(".xml", "."+target+".json")
-
-                    # prioritize TEI XML because better quality and faster
-                    filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml"))
-                    filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml"))
-                    if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
-                        # we have a TEI file, so if the current filename is not this TEI, we skip
-                        if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
-                            continue
 
-                    # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF
-                    # because processing PDF allows bounding box coordinates in the results for software mentions
-                    # which is an added value
-                    # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in
-                    # PDF coordinates
-                    filename_tei3 = os.path.join(root, filename_json.replace("."+target+".json", ".grobid.tei.xml"))
-                    filename_pdf = os.path.join(root, filename_json.replace("."+target+".json", ".pdf"))
-                    if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf):
-                        if filename.endswith(".grobid.tei.xml"):
-                            continue
+                filename_json = self.compute_output_filename(filename, target)
+                if not filename_json:
+                    continue
 
-                    sha1 = getSHA1(os.path.join(root,filename))
+                # prioritize TEI XML because better quality and faster
+                filename_tei1 = os.path.join(root, filename_json.replace("." + target + ".json", ".pub2tei.tei.xml"))
+                filename_tei2 = os.path.join(root, filename_json.replace("." + target + ".json", ".latex.tei.xml"))
+                if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
+                    # we have a TEI file, so if the current filename is not this TEI, we skip
+                    if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
+                        continue
 
-                    # if the json file already exists and not force, we skip 
-                    if os.path.isfile(os.path.join(root, filename_json)) and not force:
-                        # check that this id is considered in the lmdb keeping track of the process
+                # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF
+                # because processing PDF allows bounding box coordinates in the results for software mentions
+                # which is an added value
+                # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in
+                # PDF coordinates
+                filename_tei3 = os.path.join(root, filename_json.replace("." + target + ".json", ".grobid.tei.xml"))
+                filename_pdf = os.path.join(root, filename_json.replace("." + target + ".json", ".pdf"))
+                if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf):
+                    if filename.endswith(".grobid.tei.xml"):
+                        continue
 
-                        if use_datastet:
-                            with self.env_dataset.begin() as txn:
-                                status = txn.get(sha1.encode(encoding='UTF-8'))
-                            if status is None:
-                                with self.env_dataset.begin(write=True) as txn2:
-                                    txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) 
-                            continue
-                        else:
-                            with self.env_software.begin() as txn:
-                                status = txn.get(sha1.encode(encoding='UTF-8'))
-                            if status is None:
-                                with self.env_software.begin(write=True) as txn2:
-                                    txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) 
-                            continue
+                sha1 = getSHA1(os.path.join(root, filename))
+
+                # if the json file already exists and not force, we skip
+                if os.path.isfile(os.path.join(root, filename_json)) and not force:
+                    # check that this id is considered in the lmdb keeping track of the process
 
-                    # if identifier already processed successfully in the local lmdb, we skip
-                    # the hash of the fulltext file is used as unique identifier for the document (SHA1)
                     if use_datastet:
                         with self.env_dataset.begin() as txn:
                             status = txn.get(sha1.encode(encoding='UTF-8'))
-                            if status is not None and not force:
-                                continue
+                        if status is None:
+                            with self.env_dataset.begin(write=True) as txn2:
+                                txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8'))
+                        continue
                     else:
                         with self.env_software.begin() as txn:
                             status = txn.get(sha1.encode(encoding='UTF-8'))
-                            if status is not None and not force:
-                                continue
+                        if status is None:
+                            with self.env_software.begin(write=True) as txn2:
+                                txn2.put(sha1.encode(encoding='UTF-8'), "True".encode(encoding='UTF-8'))
+                        continue
 
-                    pdf_files.append(os.path.join(root,filename))
-                    out_files.append(os.path.join(root, filename_json))
-                    record = {}
-                    record["id"] = sha1
-                    full_records.append(record)
-                    
-                    if len(pdf_files) == self.config["batch_size"]:
-                        self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet)
-                        nb_total += len(pdf_files)
-                        pdf_files = []
-                        out_files = []
-                        full_records = []
-                        runtime = round(time.time() - start_time, 3)
-                        sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s  / {str(round(runtime/nb_total, 2))} s/files")
-                        sys.stdout.flush()
+                # if identifier already processed successfully in the local lmdb, we skip
+                # the hash of the fulltext file is used as unique identifier for the document (SHA1)
+                if use_datastet:
+                    with self.env_dataset.begin() as txn:
+                        status = txn.get(sha1.encode(encoding='UTF-8'))
+                        if status is not None and not force:
+                            continue
+                else:
+                    with self.env_software.begin() as txn:
+                        status = txn.get(sha1.encode(encoding='UTF-8'))
+                        if status is not None and not force:
+                            continue
+
+                pdf_files.append(os.path.join(root, filename))
+                out_files.append(os.path.join(root, filename_json))
+                record = {}
+                record["id"] = sha1
+                full_records.append(record)
+
+                if len(pdf_files) == self.config["batch_size"]:
+                    self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet)
+                    nb_total += len(pdf_files)
+                    pdf_files = []
+                    out_files = []
+                    full_records = []
+                    runtime = round(time.time() - start_time, 3)
+                    sys.stdout.write(
+                        f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total / runtime, 2))} files/s  / {str(round(runtime / nb_total, 2))} s/files")
+                    sys.stdout.flush()
 
         # last batch
         if len(pdf_files) > 0:
             self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet)
             nb_total += len(pdf_files)
             runtime = round(time.time() - start_time, 3)
-            sys.stdout.write(f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total/runtime, 2))} files/s  / {str(round(runtime/nb_total, 2))} s/files")
+            sys.stdout.write(
+                f"\rtotal process: {str(nb_total)} - accumulated runtime: {str(runtime)}s - {str(round(nb_total / runtime, 2))} files/s  / {str(round(runtime / nb_total, 2))} s/files")
             sys.stdout.flush()
 
+    @staticmethod
+    def compute_output_filename(filename, target):
+        if filename.endswith(".pdf"):
+            filename_json = filename.replace(".pdf", "." + target + ".json")
+        elif filename.endswith(".pdf.gz"):
+            filename_json = filename.replace(".pdf.gz", "." + target + ".json")
+        elif filename.endswith(".PDF"):
+            filename_json = filename.replace(".PDF", "." + target + ".json")
+        elif filename.endswith(".pub2tei.tei.xml"):
+            filename_json = filename.replace(".pub2tei.tei.xml", "." + target + ".json")
+        elif filename.endswith(".latex.tei.xml"):
+            filename_json = filename.replace(".latex.tei.xml", "." + target + ".json")
+        elif filename.endswith(".grobid.tei.xml"):
+            filename_json = filename.replace(".grobid.tei.xml", "." + target + ".json")
+        elif filename.endswith(".xml"):
+            filename_json = filename.replace(".xml", "." + target + ".json")
+        else:
+            filename_json = None
+
+        return filename_json
+
     """
     def annotate_collection(self, data_path, force=False, use_datastet=False):
         '''
@@ -336,10 +351,11 @@ def annotate_collection(self, data_path, force=False, use_datastet=False):
     def annotate_batch(self, pdf_files, out_files=None, full_records=None, use_datastet=False):
         # process a provided list of PDF
         with ThreadPoolExecutor(max_workers=self.config["concurrency"]) as executor:
-            #with ProcessPoolExecutor(max_workers=self.config["concurrency"]) as executor:
+            # with ProcessPoolExecutor(max_workers=self.config["concurrency"]) as executor:
             # note: ProcessPoolExecutor will not work due to env objects that can't be serailized (e.g. LMDB variables)
             # client is not cpu bounded but io bounded, so normally it's still okay with threads and GIL
-            executor.map(self.annotate, pdf_files, out_files, full_records, [use_datastet]*len(pdf_files), timeout=self.config["timeout"])
+            executor.map(self.annotate, pdf_files, out_files, full_records, [use_datastet] * len(pdf_files),
+                         timeout=self.config["timeout"])
 
     def reprocess_failed(self, directory, use_datastet=False):
         """
@@ -404,97 +420,77 @@ def reprocess_failed(self, directory, use_datastet=False):
 
         for root, directories, filenames in os.walk(directory):
             for filename in filenames:
-                if filename.endswith(".pdf") or filename.endswith(".PDF") or filename.endswith(".pdf.gz") or filename.endswith(".xml"):
+                filename_json = self.compute_output_filename(filename, target)
+                if not filename_json:
+                    continue
 
-                    # always skip XML files when using datastet service currently
-                    if filename.endswith(".xml") and use_datastet:
-                        continue
+                # if the json file already exists, we skip
+                if os.path.isfile(os.path.join(root, filename_json)):
+                    continue
 
-                    if filename.endswith(".pdf"):
-                        filename_json = filename.replace(".pdf", "."+target+".json")
-                    elif filename.endswith(".pdf.gz"):
-                        filename_json = filename.replace(".pdf.gz", "."+target+".json")
-                    elif filename.endswith(".PDF"):
-                        filename_json = filename.replace(".PDF", "."+target+".json")
-                    elif filename.endswith(".pub2tei.tei.xml"):
-                        filename_json = filename.replace(".pub2tei.tei.xml", "."+target+".json")
-                    elif filename.endswith(".latex.tei.xml"):
-                        filename_json = filename.replace(".latex.tei.xml", "."+target+".json")
-                    elif filename.endswith(".grobid.tei.xml"):
-                        filename_json = filename.replace(".grobid.tei.xml", "."+target+".json")
-                    elif filename.endswith(".xml"):
-                        filename_json = filename.replace(".xml", "."+target+".json")
-
-                    # if the json file already exists, we skip 
-                    if os.path.isfile(os.path.join(root, filename_json)):
+                # prioritize TEI XML because better quality and faster
+                filename_tei1 = os.path.join(
+                    root, filename_json.replace("." + target + ".json", ".pub2tei.tei.xml")
+                )
+                filename_tei2 = os.path.join(root, filename_json.replace("." + target + ".json", ".latex.tei.xml"))
+                if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
+                    # we have a TEI file, so if the current filename is not this TEI, we skip
+                    if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
                         continue
 
-                    # prioretize TEI XML because better quality and faster
-                    filename_tei1 = os.path.join(root, filename_json.replace("."+target+".json", ".pub2tei.tei.xml"))
-                    filename_tei2 = os.path.join(root, filename_json.replace("."+target+".json", ".latex.tei.xml"))
-                    if os.path.isfile(filename_tei1) or os.path.isfile(filename_tei2):
-                        # we have a TEI file, so if the current filename is not this TEI, we skip
-                        if not filename.endswith(".pub2tei.tei.xml") and not filename.endswith(".latex.tei.xml"):
-                            continue
-
-                    # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF
-                    # because processing PDF allows bounding box coordinates in the results for software mentions
-                    # which is an added value
-                    # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in
-                    # PDF coordinates
+                # if Grobid TEI and PDF are both present, we skip the Grobid output to process from PDF
+                # because processing PDF allows bounding box coordinates in the results for software mentions
+                # which is an added value
+                # we could however prefer Grobid output and skip PDF if speed is the concern or no interest in
+                # PDF coordinates
 
-                    filename_tei3 = os.path.join(root, filename_json.replace(".software.json", ".grobid.tei.xml"))
-                    filename_pdf = os.path.join(root, filename_json.replace(".software.json", ".pdf"))
-                    if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf):
-                        if filename.endswith(".grobid.tei.xml"):
-                            continue
-
-                    sha1 = getSHA1(os.path.join(root,filename))
+                filename_tei3 = os.path.join(root, filename_json.replace("." + target + ".json", ".grobid.tei.xml"))
+                filename_pdf = os.path.join(root, filename_json.replace("." + target + ".json", ".pdf"))
+                if os.path.isfile(filename_tei3) and os.path.isfile(filename_pdf):
+                    if filename.endswith(".grobid.tei.xml"):
+                        continue
 
-                    pdf_files.append(os.path.join(root,filename))
-                    out_files.append(os.path.join(root, filename_json))
+                sha1 = getSHA1(os.path.join(root, filename))
 
-                    if filename.endswith(".pdf"):
-                        json_file = os.path.join(root, filename.replace(".pdf", ".json"))
-                    elif filename.endswith(".xml"):
-                        json_file = os.path.join(root, filename.replace(".xml", ".json"))
-                    elif filename.endswith(".pdf.gz"):
-                        json_file = os.path.join(root, filename.replace(".pdf.gz", ".json"))
-                    elif filename.endswith(".PDF"):
-                        json_file = os.path.join(root, filename.replace(".PDF", ".json"))
+                pdf_files.append(os.path.join(root, filename))
+                out_files.append(os.path.join(root, filename_json))
+                json_file = os.path.join(root, filename_json.replace("." + target + ".json", ".json"))  # metadata record, not the output file (which is known to be absent here)
 
-                    if os.path.isfile(json_file):
-                        with open(json_file) as f:
-                            full_record = json.load(f)
-                        if full_record["id"] == sha1:
-                            full_records.append(full_record)
-                        else:
-                            record = {}
-                            record["id"] = sha1
-                            full_records.append(record)
+                if os.path.isfile(json_file):
+                    with open(json_file) as f:
+                        full_record = json.load(f)
+                    if full_record["id"] == sha1:
+                        full_records.append(full_record)
                     else:
                         record = {}
                         record["id"] = sha1
                         full_records.append(record)
-                    i += 1
-
-                if i == self.config["batch_size"]:
-                    self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet)
-                    nb_total += len(pdf_files)
-                    pdf_files = []
-                    out_files = []
-                    full_records = []
-                    i = 0
-                    runtime = round(time.time() - start_time, 3)
-                    sys.stdout.write("\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s  ")
-                    sys.stdout.flush()
+                else:
+                    record = {}
+                    record["id"] = sha1
+                    full_records.append(record)
+                i += 1
+
+                if i == self.config["batch_size"]:
+                    self.annotate_batch(pdf_files, out_files, full_records, use_datastet=use_datastet)
+                    nb_total += len(pdf_files)
+                    pdf_files = []
+                    out_files = []
+                    full_records = []
+                    i = 0
+                    runtime = round(time.time() - start_time, 3)
+                    sys.stdout.write("\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(
+                        runtime) + " s - " + str(round(nb_total / runtime, 2)) + " files/s  ")
+                    sys.stdout.flush()
 
         # last batch for every cases
         if len(pdf_files) > 0:
             self.annotate_batch(pdf_files, out_files, full_records)
             nb_total += len(pdf_files)
             runtime = round(time.time() - start_time, 3)
-            sys.stdout.write("\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(round(nb_total/runtime, 2)) + " files/s  ")
+            sys.stdout.write(
+                "\rtotal reprocess: " + str(nb_total) + " - accumulated runtime: " + str(runtime) + " s - " + str(
+                    round(nb_total / runtime, 2)) + " files/s  ")
             sys.stdout.flush()
 
         logging.info("re-processed: " + str(nb_total) + " entries")
@@ -524,7 +520,7 @@ def reset(self, use_datastet=False):
             self._init_lmdb(use_datastet=False)
 
     def load_mongo(self, directory):
-        if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0:
+        if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0:
             mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]))
             self.mongo_db_software = mongo_client[self.config["mongo_db_software"]]
             self.mongo_db_dataset = mongo_client[self.config["mongo_db_dataset"]]
@@ -534,15 +530,15 @@ def load_mongo(self, directory):
 
         failed = 0
         for root, directories, filenames in os.walk(directory):
-            for filename in filenames: 
+            for filename in filenames:
                 if filename.endswith(".software.json") or filename.endswith(".dataset.json"):
-                    print(os.path.join(root,filename))
+                    print(os.path.join(root, filename))
 
-                    the_json = open(os.path.join(root,filename)).read()
+                    the_json = open(os.path.join(root, filename)).read()
                     try:
                         jsonObject = json.loads(the_json)
                     except:
-                        print("the json parsing of the following file failed: ", os.path.join(root,filename))
+                        print("the json parsing of the following file failed: ", os.path.join(root, filename))
                         continue
 
                     local_id = None
@@ -564,14 +560,15 @@ def load_mongo(self, directory):
                     # possibly clean original file path
                     if "original_file_path" in jsonObject:
                         if jsonObject["original_file_path"].startswith('../biblio-glutton-harvester/'):
-                            jsonObject["original_file_path"] = jsonObject["original_file_path"].replace('../biblio-glutton-harvester/', '')
-                    
+                            jsonObject["original_file_path"] = jsonObject["original_file_path"].replace(
+                                '../biblio-glutton-harvester/', '')
+
                     # update metadata via biblio-glutton (this is to be done for mongo upload from file only)
-                    if "biblio_glutton_url" in self.config and len(self.config["biblio_glutton_url"].strip())>0:
-                        if 'metadata' in jsonObject and 'doi' in jsonObject['metadata']: 
+                    if "biblio_glutton_url" in self.config and len(self.config["biblio_glutton_url"].strip()) > 0:
+                        if 'metadata' in jsonObject and 'doi' in jsonObject['metadata']:
                             try:
                                 glutton_metadata = self.biblio_glutton_lookup(doi=jsonObject['metadata']['doi'])
-                            except: 
+                            except:
                                 print("the call to biblio-glutton failed for", jsonObject['metadata']['doi'])
                                 failed += 1
                                 continue
@@ -630,10 +627,10 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
         except:
             logging.exception("input file appears invalid: " + file_in)
             return
-        
+
         jsonObject = None
         try:
-            response = requests.post(url, files=the_file, data = {'disambiguate': 1}, timeout=self.config["timeout"])
+            response = requests.post(url, files=the_file, data={'disambiguate': 1}, timeout=self.config["timeout"])
             if response.status_code == 503:
                 logging.info('service overloaded, sleep ' + str(self.config['sleep_time']) + ' seconds')
                 time.sleep(self.config['sleep_time'])
@@ -650,7 +647,8 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
                 # note: in case the recognizer has found no software in the document, it will still return
                 # a json object as result, without mentions, but with MD5 and page information
             else:
-                logging.error('Unexpected Error: [HTTP {0}]: Content: {1}'.format(response.status_code, response.content))
+                logging.error(
+                    'Unexpected Error: [HTTP {0}]: Content: {1}'.format(response.status_code, response.content))
 
         except requests.exceptions.Timeout:
             logging.exception("The request to the annotation service has timeout")
@@ -665,7 +663,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
             # add file, DOI, date and version info in the JSON, if available
             if full_record is not None:
                 jsonObject['id'] = full_record['id']
-                #if len(full_record) > 1:
+                # if len(full_record) > 1:
                 jsonObject['metadata'] = full_record;
             jsonObject['original_file_path'] = file_in
             jsonObject['file_name'] = os.path.basename(file_in)
@@ -683,7 +681,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
                                 new_mentions.append(mention)
                     jsonObject['mentions'] = new_mentions
 
-            if file_out is not None: 
+            if file_out is not None:
                 # we write the json result into a file together with the processed pdf
                 with open(file_out, "w", encoding="utf-8") as json_file:
                     json_file.write(json.dumps(jsonObject))
@@ -698,7 +696,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
         elif jsonObject is not None:
             # we have no software mention in the document, we still write an empty result file
             # along with the PDF/medtadata files to easily keep track of the processing for this doc
-            if file_out is not None: 
+            if file_out is not None:
                 # force empty explicit no mentions
                 jsonObject['mentions'] = []
                 with open(file_out, "w", encoding="utf-8") as json_file:
@@ -710,7 +708,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
             if self.env_dataset is not None and full_record is not None:
                 with self.env_dataset.begin(write=True) as txn:
                     if jsonObject is not None:
-                        txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) 
+                        txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8'))
                     else:
                         # the process failed
                         txn.put(full_record['id'].encode(encoding='UTF-8'), "False".encode(encoding='UTF-8'))
@@ -718,7 +716,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
             if self.env_software is not None and full_record is not None:
                 with self.env_software.begin(write=True) as txn:
                     if jsonObject is not None:
-                        txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8')) 
+                        txn.put(full_record['id'].encode(encoding='UTF-8'), "True".encode(encoding='UTF-8'))
                     else:
                         # the process failed
                         txn.put(full_record['id'].encode(encoding='UTF-8'), "False".encode(encoding='UTF-8'))
@@ -726,7 +724,7 @@ def annotate(self, file_in, file_out, full_record, use_datastet=False):
         if self.scorched_earth and jsonObject is not None:
             # processed is done, remove local document file
             try:
-                os.remove(file_in) 
+                os.remove(file_in)
             except:
                 logging.exception("Error while deleting file " + file_in)
 
@@ -738,9 +736,9 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
         """
         nb_total = 0
         nb_fail = 0
-        nb_success = 0  
+        nb_success = 0
 
-        if self.env_software is not None: 
+        if self.env_software is not None:
             with self.env_software.begin() as txn:
                 cursor = txn.cursor()
                 for key, value in cursor:
@@ -751,7 +749,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                     else:
                         nb_fail += 1
         else:
-            if self.env_dataset is not None: 
+            if self.env_dataset is not None:
                 with self.env_dataset.begin() as txn:
                     cursor = txn.cursor()
                     for key, value in cursor:
@@ -773,50 +771,51 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
         if full_diagnostic_mongo:
             # check mongodb access - if mongodb is not used or available, we don't go further
             if self.mongo_db_software is None:
-                if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0:
+                if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0:
                     mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]))
                     self.mongo_db_software = mongo_client[self.config["mongo_db_software"]]
 
             if self.mongo_db_software is None:
-                print("MongoDB server is not available for more advanced statistics")    
+                print("MongoDB server is not available for more advanced statistics")
                 return
 
             print("MongoDB - number of documents: ", self.mongo_db_software.documents.count_documents({}))
             print("MongoDB - number of software mentions: ", self.mongo_db_software.annotations.count_documents({}))
 
-            result = self.mongo_db_software.annotations.find( {"software-name": {"$exists": True}} )
+            result = self.mongo_db_software.annotations.find({"software-name": {"$exists": True}})
             print("\t  * with software name:", result.count())
- 
-            result = self.mongo_db_software.annotations.find( {"version": {"$exists": True}} )
+
+            result = self.mongo_db_software.annotations.find({"version": {"$exists": True}})
             print("\t  * with version:", result.count())
 
-            result = self.mongo_db_software.annotations.find( {"publisher": {"$exists": True}} )
+            result = self.mongo_db_software.annotations.find({"publisher": {"$exists": True}})
             print("\t  * with publisher:", result.count())
 
-            result = self.mongo_db_software.annotations.find( {"url": {"$exists": True}} )
-            print("\t  * with url:", result.count())    
+            result = self.mongo_db_software.annotations.find({"url": {"$exists": True}})
+            print("\t  * with url:", result.count())
 
-            results = self.mongo_db_software.annotations.find( {"references": {"$exists": True}} )
+            results = self.mongo_db_software.annotations.find({"references": {"$exists": True}})
             nb_ref = 0
             has_ref = 0
             for result in results:
                 has_ref += 1
                 the_references = result.get("references")
                 nb_ref += len(the_references)
-                    
-            print("\t  * with at least one reference", nb_ref) 
-            print("\t  * total references", nb_ref) 
 
-            print("MongoDB - number of bibliographical references: ", self.mongo_db_software.references.count_documents({}))
+            print("\t  * with at least one reference", has_ref)
+            print("\t  * total references", nb_ref)
+
+            print("MongoDB - number of bibliographical references: ",
+                  self.mongo_db_software.references.count_documents({}))
 
-            result = self.mongo_db_software.references.find( {"tei": {"$regex": "DOI"}} )
-            print("\t  * with DOI:", result.count())  
+            result = self.mongo_db_software.references.find({"tei": {"$regex": "DOI"}})
+            print("\t  * with DOI:", result.count())
 
-            result = self.mongo_db_software.references.find( {"tei": {"$regex": "PMID"}} )
-            print("\t  * with PMID:", result.count())  
+            result = self.mongo_db_software.references.find({"tei": {"$regex": "PMID"}})
+            print("\t  * with PMID:", result.count())
 
-            result = self.mongo_db_software.references.find( {"tei": {"$regex": "PMC"}} )
-            print("\t  * with PMC ID:", result.count())  
+            result = self.mongo_db_software.references.find({"tei": {"$regex": "PMC"}})
+            print("\t  * with PMC ID:", result.count())
             print("---")
         elif full_diagnostic_files:
             # in this mode, we go through the produced json files to retrieve information
@@ -857,7 +856,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
             nb_dataset_publisher = 0
             nb_dataset_url = 0
             nb_dataset_version = 0
-           
+
             nb_dataset_mention_with_ref = 0
 
             nb_dataset_documents = 0
@@ -865,16 +864,16 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
             nbDatasetFiles = 0
 
             for root, directories, filenames in os.walk(directory):
-                
-                for filename in filenames: 
+
+                for filename in filenames:
                     if filename.endswith(".software.json"):
                         nb_documents += 1
-                        #print(os.path.join(root,filename))
-                        the_json = open(os.path.join(root,filename)).read()
+                        # print(os.path.join(root,filename))
+                        the_json = open(os.path.join(root, filename)).read()
                         try:
                             jsonObject = json.loads(the_json)
                         except:
-                            print("the json parsing of the following file failed: ", os.path.join(root,filename))
+                            print("the json parsing of the following file failed: ", os.path.join(root, filename))
                             continue
 
                         nbSoftwareFiles += 1
@@ -883,7 +882,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                             sys.stdout.write("\rFiles visited: %i" % nbSoftwareFiles)
                             sys.stdout.flush()
 
-                        if "mentions" in jsonObject and len(jsonObject["mentions"])>0:
+                        if "mentions" in jsonObject and len(jsonObject["mentions"]) > 0:
                             nb_documents_with_software += 1
 
                         for mention in jsonObject["mentions"]:
@@ -911,7 +910,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                                 nb_ref_marker += len(mention["references"])
                                 nb_software_mention_with_ref += 1
 
-                        if "references" in jsonObject and len(jsonObject["references"])>0:
+                        if "references" in jsonObject and len(jsonObject["references"]) > 0:
                             nb_ref += len(jsonObject["references"])
 
                             # like with mongodb queries, we can use simple matching to count PID in full references
@@ -927,12 +926,12 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
 
                     elif filename.endswith(".dataset.json"):
                         nb_dataset_documents += 1
-                        #print(os.path.join(root,filename))
-                        the_json = open(os.path.join(root,filename)).read()
+                        # print(os.path.join(root,filename))
+                        the_json = open(os.path.join(root, filename)).read()
                         try:
                             jsonObject = json.loads(the_json)
                         except:
-                            print("the json parsing of the following file failed: ", os.path.join(root,filename))
+                            print("the json parsing of the following file failed: ", os.path.join(root, filename))
                             continue
 
                         nbDatasetFiles += 1
@@ -941,7 +940,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                             sys.stdout.write("\rFiles visited: %i" % nbDatasetFiles)
                             sys.stdout.flush()
 
-                        if "mentions" in jsonObject and len(jsonObject["mentions"])>0:
+                        if "mentions" in jsonObject and len(jsonObject["mentions"]) > 0:
                             nb_documents_with_dataset += 1
 
                         for mention in jsonObject["mentions"]:
@@ -950,7 +949,7 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                                     nb_dataset_implicit += 1
                                 elif mention["type"] == "dataset-name":
                                     nb_dataset_name += 1
-                            
+
                             if mention["type"] == "data-device" or "data-device" in mention:
                                 nb_data_device += 1
 
@@ -965,8 +964,8 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                                 nb_dataset_ref_marker += len(mention["references"])
                                 nb_dataset_mention_with_ref += 1
 
-                        if "references" in jsonObject and len(jsonObject["references"])>0:
-                            
+                        if "references" in jsonObject and len(jsonObject["references"]) > 0:
+
                             nb_dataset_ref += len(jsonObject["references"])
 
                             # like with mongodb queries, we can use simple matching to count PID in full references
@@ -982,9 +981,10 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
 
             # report results
             if nb_documents > 0:
-                print("\n\n--- SOFTWARE MENTIONS ---") 
+                print("\n\n--- SOFTWARE MENTIONS ---")
                 print("JSON files - number of documents: ", nb_documents)
-                print("JSON files - number of documents with at least one software mention: ", nb_documents_with_software)
+                print("JSON files - number of documents with at least one software mention: ",
+                      nb_documents_with_software)
                 print("JSON files - number of software mentions: ", nb_software)
                 nb_standalone = nb_software - (nb_environment + nb_component + nb_implicit)
                 print("\t     -> subtype standalone:", nb_standalone)
@@ -994,33 +994,33 @@ def diagnostic(self, full_diagnostic_mongo=False, full_diagnostic_files=False, d
                 print("\t     * with software name:", nb_software)
                 print("\t     * with version:", nb_version)
                 print("\t     * with publisher:", nb_publisher)
-                print("\t     * with url:", nb_url) 
-                print("\t     * with programming language:", nb_language) 
-                print("\t     * mentions with at least one reference", nb_software_mention_with_ref) 
-                print("---") 
+                print("\t     * with url:", nb_url)
+                print("\t     * with programming language:", nb_language)
+                print("\t     * mentions with at least one reference", nb_software_mention_with_ref)
+                print("---")
                 print("JSON files - number of bibliographical reference markers: ", nb_ref_marker)
                 print("JSON files - number of bibliographical references: ", nb_ref)
-                print("\t      * with DOI:", nb_ref_with_doi)  
-                print("\t      * with PMID:", nb_ref_with_pmid)  
-                print("\t      * with PMC ID:", nb_ref_with_pmcid)  
-                print("---")              
+                print("\t      * with DOI:", nb_ref_with_doi)
+                print("\t      * with PMID:", nb_ref_with_pmid)
+                print("\t      * with PMC ID:", nb_ref_with_pmcid)
+                print("---")
 
             if nb_dataset_documents > 0:
-                print("\n\n--- DATASET MENTIONS ---") 
+                print("\n\n--- DATASET MENTIONS ---")
                 print("JSON files - number of documents: ", nb_dataset_documents)
                 print("JSON files - number of documents with at least one dataset mention: ", nb_documents_with_dataset)
                 print("JSON files - number of named dataset mentions: ", nb_dataset_name)
                 print("JSON files - number of implicit dataset mentions: ", nb_dataset_implicit)
                 print("JSON files - number of data device mentions: ", nb_data_device)
-                print("\t     * with url:", nb_dataset_url) 
-                print("\t     * mentions with at least one reference", nb_dataset_mention_with_ref) 
-                print("---") 
+                print("\t     * with url:", nb_dataset_url)
+                print("\t     * mentions with at least one reference", nb_dataset_mention_with_ref)
+                print("---")
                 print("JSON files - number of bibliographical reference markers: ", nb_dataset_ref_marker)
                 print("JSON files - number of bibliographical references: ", nb_dataset_ref)
-                print("\t      * with DOI:", nb_dataset_ref_with_doi)  
-                print("\t      * with PMID:", nb_dataset_ref_with_pmid)  
-                print("\t      * with PMC ID:", nb_dataset_ref_with_pmcid)  
-                print("---") 
+                print("\t      * with DOI:", nb_dataset_ref_with_doi)
+                print("\t      * with PMID:", nb_dataset_ref_with_pmid)
+                print("\t      * with PMC ID:", nb_dataset_ref_with_pmcid)
+                print("---")
 
     def _insert_mongo(self, jsonObject, target="software"):
         if not "id" in jsonObject:
@@ -1029,23 +1029,27 @@ def _insert_mongo(self, jsonObject, target="software"):
         local_mongo_db = None
         if target == "software":
             if self.mongo_db_software is None and "mongo_db_software" in self.config:
-                if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0:
+                if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0:
                     try:
-                        mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]), serverSelectionTimeoutMS=1000)
+                        mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]),
+                                                           serverSelectionTimeoutMS=1000)
                         mongo_client.server_info()
                         self.mongo_db_software = mongo_client[self.config["mongo_db_software"]]
                     except:
-                        print("Fail to connect to the MongoDb server:", self.config["mongo_host"]+":"+self.config["mongo_port"])
+                        print("Fail to connect to the MongoDb server:",
+                              self.config["mongo_host"] + ":" + self.config["mongo_port"])
             local_mongo_db = self.mongo_db_software
         elif target == "dataset":
             if self.mongo_db_dataset is None and "mongo_db_dataset" in self.config:
-                if "mongo_host" in self.config and len(self.config["mongo_host"].strip())>0:
+                if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0:
                     try:
-                        mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]), serverSelectionTimeoutMS=1000)
+                        mongo_client = pymongo.MongoClient(self.config["mongo_host"], int(self.config["mongo_port"]),
+                                                           serverSelectionTimeoutMS=1000)
                         mongo_client.server_info()
                         self.mongo_db_dataset = mongo_client[self.config["mongo_db_dataset"]]
                     except:
-                        print("Fail to connect to the MongoDb server:", self.config["mongo_host"]+":"+self.config["mongo_port"])
+                        print("Fail to connect to the MongoDb server:",
+                              self.config["mongo_host"] + ":" + self.config["mongo_port"])
             local_mongo_db = self.mongo_db_dataset
 
         if local_mongo_db == None:
@@ -1053,15 +1057,15 @@ def _insert_mongo(self, jsonObject, target="software"):
             return
 
         # check if the article/annotations are not already present
-        if local_mongo_db.documents.count_documents({ 'id': jsonObject['id'] }, limit = 1) != 0:
+        if local_mongo_db.documents.count_documents({'id': jsonObject['id']}, limit=1) != 0:
             # if yes we replace this object, its annotations and references
-            result = local_mongo_db.documents.find_one({ 'id': jsonObject['id'] })
+            result = local_mongo_db.documents.find_one({'id': jsonObject['id']})
             _id = result['_id']
-            local_mongo_db.annotations.delete_many( {'document': _id} )
-            local_mongo_db.references.delete_many( {'document': _id} )
-            result = local_mongo_db.documents.delete_one({ 'id': jsonObject['id'] })
-            #print ("result:", type(result), "-- deleted count:", result.deleted_count)
-        
+            local_mongo_db.annotations.delete_many({'document': _id})
+            local_mongo_db.references.delete_many({'document': _id})
+            result = local_mongo_db.documents.delete_one({'id': jsonObject['id']})
+            # print ("result:", type(result), "-- deleted count:", result.deleted_count)
+
         # clean json
         jsonObject = _clean_json(jsonObject)
 
@@ -1072,7 +1076,7 @@ def _insert_mongo(self, jsonObject, target="software"):
         if 'references' in jsonObjectDocument:
             del jsonObjectDocument['references']
         inserted_doc_id = local_mongo_db.documents.insert_one(jsonObjectDocument).inserted_id
-        
+
         local_ref_map = {}
         if 'references' in jsonObject:
             for reference in jsonObject['references']:
@@ -1090,7 +1094,6 @@ def _insert_mongo(self, jsonObject, target="software"):
                             reference["reference_id"] = local_ref_map[str(reference["refKey"])]
                 inserted_mention_id = local_mongo_db.annotations.insert_one(mention).inserted_id
 
-
     def biblio_glutton_lookup(self, doi=None, pmcid=None, pmid=None, istex_id=None, istex_ark=None):
         """
         Lookup on biblio_glutton with the provided strong identifiers, return the full agregated biblio_glutton record
@@ -1101,42 +1104,47 @@ def biblio_glutton_lookup(self, doi=None, pmcid=None, pmid=None, istex_id=None,
         jsonResult = None
 
         if "biblio_glutton_url" in self.config and len(self.config["biblio_glutton_url"].strip()) > 0:
-            biblio_glutton_url = self.config["biblio_glutton_url"]+"/service/lookup?"
+            biblio_glutton_url = self.config["biblio_glutton_url"] + "/service/lookup?"
 
-            if doi is not None and len(doi)>0:
+            if doi is not None and len(doi) > 0:
                 response = requests.get(biblio_glutton_url, params={'doi': doi}, verify=False, timeout=5)
                 success = (response.status_code == 200)
                 if success:
                     jsonResult = response.json()
 
-            if not success and pmid is not None and len(pmid)>0:
+            if not success and pmid is not None and len(pmid) > 0:
                 response = requests.get(biblio_glutton_url + "pmid=" + pmid, verify=False, timeout=5)
                 success = (response.status_code == 200)
                 if success:
-                    jsonResult = response.json()     
+                    jsonResult = response.json()
 
-            if not success and pmcid is not None and len(pmcid)>0:
-                response = requests.get(biblio_glutton_url + "pmc=" + pmcid, verify=False, timeout=5)  
+            if not success and pmcid is not None and len(pmcid) > 0:
+                response = requests.get(biblio_glutton_url + "pmc=" + pmcid, verify=False, timeout=5)
                 success = (response.status_code == 200)
                 if success:
                     jsonResult = response.json()
 
-            if not success and istex_id is not None and len(istex_id)>0:
+            if not success and istex_id is not None and len(istex_id) > 0:
                 response = requests.get(biblio_glutton_url + "istexid=" + istex_id, verify=False, timeout=5)
                 success = (response.status_code == 200)
                 if success:
                     jsonResult = response.json()
 
-        if not success and doi is not None and len(doi)>0 and "crossref_base" in self.config and len(self.config["crossref_base"].strip())>0:
+        if not success and doi is not None and len(doi) > 0 and "crossref_base" in self.config and len(
+                self.config["crossref_base"].strip()) > 0:
             # let's call crossref as fallback for possible X-months gap in biblio-glutton
             # https://api.crossref.org/works/10.1037/0003-066X.59.1.29
-            if "crossref_email" in self.config and len(self.config["crossref_email"].strip())>0:
-                user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0 (mailto:'+self.config["crossref_email"]+')'}
+            if "crossref_email" in self.config and len(self.config["crossref_email"].strip()) > 0:
+                user_agent = {
+                    'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0 (mailto:' +
+                                  self.config["crossref_email"] + ')'}
             else:
-                user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0'}
+                user_agent = {
+                    'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0'}
             try:
-                logging.info("calling... " + self.config["crossref_base"]+"/works/"+doi)
-                response = requests.get(self.config["crossref_base"]+"/works/"+doi, headers=user_agent, verify=False, timeout=5)
+                logging.info("calling... " + self.config["crossref_base"] + "/works/" + doi)
+                response = requests.get(self.config["crossref_base"] + "/works/" + doi, headers=user_agent,
+                                        verify=False, timeout=5)
                 if response.status_code == 200:
                     jsonResult = response.json()['message']
                     # filter out references and re-set doi, in case there are obtained via crossref
@@ -1147,9 +1155,10 @@ def biblio_glutton_lookup(self, doi=None, pmcid=None, pmid=None, istex_id=None,
                     jsonResult = None
             except:
                 logging.exception("Could not connect to CrossRef")
-        
+
         return jsonResult
 
+
 def generateStoragePath(identifier):
     '''
     Convert a file name into a path with file prefix as directory paths:
@@ -1157,9 +1166,11 @@ def generateStoragePath(identifier):
     '''
     return os.path.join(identifier[:2], identifier[2:4], identifier[4:6], identifier[6:8], "")
 
+
 def _deserialize_pickle(serialized):
     return pickle.loads(serialized)
 
+
 def _clean_json(d):
     # clean recursively a json for insertion in MongoDB, basically remove keys starting with $
     if not isinstance(d, (dict, list)):
@@ -1167,20 +1178,23 @@ def _clean_json(d):
     if isinstance(d, list):
         return [_clean_json(v) for v in d]
     return {k: _clean_json(v) for k, v in d.items()
-            if not k.startswith("$") }
+            if not k.startswith("$")}
+
 
 def _is_tei(the_file):
     # based on the header content of the file, check if we have a TEI XML file
     with open(the_file) as f:
         n = 5
-        while n>=0:
+        while n >= 0:
             first_line = f.readline().strip('\n')
             if "<TEI " in first_line or "<tei " in first_line or "<teiCorpus " in first_line:
                 return True
             n -= 1
     return False
 
-BUF_SIZE = 65536    
+
+BUF_SIZE = 65536
+
 
 def getSHA1(the_file):
     sha1 = hashlib.sha1()
@@ -1194,29 +1208,39 @@ def getSHA1(the_file):
 
     return sha1.hexdigest()
 
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description = "Softcite software mention recognizer client")
-    parser.add_argument("--repo-in", default=None, help="path to a directory of PDF or XML fulltext files to be processed by the Softcite software mention recognizer")  
-    parser.add_argument("--file-in", default=None, help="a single PDF or XML input file to be processed by the Softcite software mention recognizer") 
-    parser.add_argument("--file-out", default=None, help="path to a single output the software mentions in JSON format, extracted from the PDF file-in") 
-    #parser.add_argument("--data-path", default=None, help="path to the resource files created/harvested by biblio-glutton-harvester") 
-    parser.add_argument("--config", default="./config.json", help="path to the config file, default is ./config.json") 
-    parser.add_argument("--reprocess", action="store_true", help="reprocessed failed PDF or XML fulltexts") 
-    parser.add_argument("--reset", action="store_true", help="ignore previous processing states and re-init the annotation process from the beginning") 
-    parser.add_argument("--load", action="store_true", help="load json files into the MongoDB instance, the --repo-in or --data-path parameter must indicate the path "
-        +"to the directory of resulting json files to be loaded, --dump must indicate the path to the json dump file of document metadata") 
-    parser.add_argument("--diagnostic-mongo", action="store_true", help="perform a full count of annotations and diagnostic using MongoDB "  
-        +"regarding the harvesting and annotation process") 
-    parser.add_argument("--diagnostic-files", action="store_true", help="perform a full count of annotations and diagnostic using repository files "  
-        +"regarding the harvesting and annotation process") 
-    parser.add_argument("--scorched-earth", action="store_true", help="remove the PDF or XML fulltext files file after their sucessful processing in order to save storage space" 
-        +", careful with this!") 
-    parser.add_argument("--datastet", action="store_true", help="call the DataStet service instead of the software mention extraction service. " +
-        "It requires a DataStet server running instead of the Softcite server, and indicating the Datastet server url in the config file") 
+    parser = argparse.ArgumentParser(description="Softcite software mention recognizer client")
+    parser.add_argument("--repo-in", default=None,
+                        help="path to a directory of PDF or XML fulltext files to be processed by the Softcite software mention recognizer")
+    parser.add_argument("--file-in", default=None,
+                        help="a single PDF or XML input file to be processed by the Softcite software mention recognizer")
+    parser.add_argument("--file-out", default=None,
+                        help="path to a single output the software mentions in JSON format, extracted from the PDF file-in")
+    # parser.add_argument("--data-path", default=None, help="path to the resource files created/harvested by biblio-glutton-harvester")
+    parser.add_argument("--config", default="./config.json", help="path to the config file, default is ./config.json")
+    parser.add_argument("--reprocess", action="store_true", help="reprocessed failed PDF or XML fulltexts")
+    parser.add_argument("--reset", action="store_true",
+                        help="ignore previous processing states and re-init the annotation process from the beginning")
+    parser.add_argument("--load", action="store_true",
+                        help="load json files into the MongoDB instance, the --repo-in or --data-path parameter must indicate the path "
+                             + "to the directory of resulting json files to be loaded, --dump must indicate the path to the json dump file of document metadata")
+    parser.add_argument("--diagnostic-mongo", action="store_true",
+                        help="perform a full count of annotations and diagnostic using MongoDB "
+                             + "regarding the harvesting and annotation process")
+    parser.add_argument("--diagnostic-files", action="store_true",
+                        help="perform a full count of annotations and diagnostic using repository files "
+                             + "regarding the harvesting and annotation process")
+    parser.add_argument("--scorched-earth", action="store_true",
+                        help="remove the PDF or XML fulltext files file after their sucessful processing in order to save storage space"
+                             + ", careful with this!")
+    parser.add_argument("--datastet", action="store_true",
+                        help="call the DataStet service instead of the software mention extraction service. " +
+                             "It requires a DataStet server running instead of the Softcite server, and indicating the Datastet server url in the config file")
 
     args = parser.parse_args()
 
-    #data_path = args.data_path
+    # data_path = args.data_path
     config_path = args.config
     reprocess = args.reprocess
     reset = args.reset
@@ -1231,7 +1255,8 @@ def getSHA1(the_file):
 
     client = software_mentions_client(config_path=config_path, use_datastet=use_datastet)
 
-    if not load_mongo and not full_diagnostic_mongo and not full_diagnostic_files and not client.service_isalive(use_datastet=use_datastet):
+    if not load_mongo and not full_diagnostic_mongo and not full_diagnostic_files and not client.service_isalive(
+            use_datastet=use_datastet):
         sys.exit("mention extraction service not available, leaving...")
 
     force = False
@@ -1260,12 +1285,12 @@ def getSHA1(the_file):
 
     elif reprocess:
         client.reprocess_failed(repo_in, use_datastet=use_datastet)
-    elif repo_in is not None and not full_diagnostic_files: 
+    elif repo_in is not None and not full_diagnostic_files:
         client.annotate_directory(repo_in, force, use_datastet=use_datastet)
     elif file_in is not None:
         client.annotate(file_in, file_out, None, use_datastet=use_datastet)
-    #elif data_path is not None: 
+    # elif data_path is not None:
     #    client.annotate_collection(data_path, force, use_datastet=use_datastet)
 
-    client.diagnostic(full_diagnostic_mongo=full_diagnostic_mongo, full_diagnostic_files=full_diagnostic_files, directory=repo_in)
-    
+    client.diagnostic(full_diagnostic_mongo=full_diagnostic_mongo, full_diagnostic_files=full_diagnostic_files,
+                      directory=repo_in)

From 78b4a937b2d7f784d4f1b9f3b00423ad37b62ecb Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Sun, 8 Jun 2025 08:10:43 +0200
Subject: [PATCH 5/9] use custom LMDB path if provided

---
 software_mentions_client/client.py | 62 +++++++++++++++++-------------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index 38bba4a..14164ee 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -2,27 +2,25 @@
 Run the software mention recognizer service on PDF or XML fulltext file collections
 '''
 
-import gzip
-import sys
-import os
-import shutil
-import json
-import pickle
-import lmdb
 import argparse
-import time
-import datetime
-import software_mentions_client.S3
-import concurrent.futures
-import requests
-import pymongo
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+import gzip
 import hashlib
-import copyreg
-import types
+import json
 import logging
 import logging.handlers
+import os
+import pickle
 import pkgutil
+import shutil
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import lmdb
+import pymongo
+import requests
+
+import software_mentions_client.S3
 
 map_size = 100 * 1024 * 1024 * 1024
 
@@ -40,12 +38,17 @@
 logging.basicConfig(filename='client.log', filemode='w', level=logging.DEBUG)
 
 
-class software_mentions_client(object):
+class SoftwareMentionsClient(object):
     """
     Python client for using the Softcite software mention service. 
     """
 
-    def __init__(self, config_path='./config.json', use_datastet=False):
+    def __init__(
+            self,
+            config_path='./config.json',
+            use_datastet=False,
+            data_path=None
+    ):
         self.config = None
 
         # standard lmdb environment for keeping track of the status of processing
@@ -140,14 +143,15 @@ def service_isalive(self, use_datastet=False):
                           'test call to service, please check and re-start a server.')
         return False
 
-    def _init_lmdb(self, use_datastet=False):
+    def _init_lmdb(self, use_datastet=False, data_path=None):
         # open in write mode
+        root_data_path = self.config["data_path"] if not data_path else data_path
         if use_datastet:
-            envFilePath = os.path.join(self.config["data_path"], 'entries_dataset')
-            self.env_dataset = lmdb.open(envFilePath, map_size=map_size)
+            env_file_path = os.path.join(root_data_path, 'entries_dataset')
+            self.env_dataset = lmdb.open(env_file_path, map_size=map_size)
         else:
-            envFilePath = os.path.join(self.config["data_path"], 'entries_software')
-            self.env_software = lmdb.open(envFilePath, map_size=map_size)
+            env_file_path = os.path.join(root_data_path, 'entries_software')
+            self.env_software = lmdb.open(env_file_path, map_size=map_size)
 
         # envFilePath = os.path.join(self.config["data_path"], 'fail_software')
         # self.env_fail_software = lmdb.open(envFilePath, map_size=map_size)
@@ -1217,7 +1221,10 @@ def getSHA1(the_file):
                         help="a single PDF or XML input file to be processed by the Softcite software mention recognizer")
     parser.add_argument("--file-out", default=None,
                         help="path to a single output the software mentions in JSON format, extracted from the PDF file-in")
-    # parser.add_argument("--data-path", default=None, help="path to the resource files created/harvested by biblio-glutton-harvester")
+    parser.add_argument(
+        "--data-path",
+        default=None,
+        help="path to the directory containing the LMDB database.")
     parser.add_argument("--config", default="./config.json", help="path to the config file, default is ./config.json")
     parser.add_argument("--reprocess", action="store_true", help="reprocessed failed PDF or XML fulltexts")
     parser.add_argument("--reset", action="store_true",
@@ -1240,7 +1247,7 @@ def getSHA1(the_file):
 
     args = parser.parse_args()
 
-    # data_path = args.data_path
+    data_path = args.data_path
     config_path = args.config
     reprocess = args.reprocess
     reset = args.reset
@@ -1253,7 +1260,10 @@ def getSHA1(the_file):
     scorched_earth = args.scorched_earth
     use_datastet = args.datastet
 
-    client = software_mentions_client(config_path=config_path, use_datastet=use_datastet)
+    client = SoftwareMentionsClient(
+        config_path=config_path,
+        use_datastet=use_datastet,
+        data_path=data_path)
 
     if not load_mongo and not full_diagnostic_mongo and not full_diagnostic_files and not client.service_isalive(
             use_datastet=use_datastet):

From 570a2e11f1b24efaeae12f738a9d10559cd2121d Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Sun, 8 Jun 2025 11:18:11 +0200
Subject: [PATCH 6/9] use custom LMDB path also when resetting

---
 software_mentions_client/client.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index 14164ee..7069cff 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -499,16 +499,18 @@ def reprocess_failed(self, directory, use_datastet=False):
 
         logging.info("re-processed: " + str(nb_total) + " entries")
 
-    def reset(self, use_datastet=False):
+    def reset(self, use_datastet=False, data_path=None):
         """
         Remove the local lmdb keeping track of the state of advancement of the annotation and
         of the failed entries
         """
+        root_data_path = self.config["data_path"] if data_path is None else data_path
+
         if use_datastet:
             # close environments
             self.env_dataset.close()
 
-            envFilePath = os.path.join(self.config["data_path"], 'entries_dataset')
+            envFilePath = os.path.join(root_data_path, 'entries_dataset')
             shutil.rmtree(envFilePath)
 
             # re-init the environments
@@ -517,7 +519,7 @@ def reset(self, use_datastet=False):
             # close environments
             self.env_software.close()
 
-            envFilePath = os.path.join(self.config["data_path"], 'entries_software')
+            envFilePath = os.path.join(root_data_path, 'entries_software')
             shutil.rmtree(envFilePath)
 
             # re-init the environments

From a1b0fb1ef8e7fe8dd5714f5e46662c80c8d3b74e Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 12 Jun 2025 12:00:10 +0200
Subject: [PATCH 7/9] fix: create LMDB directory and store resolved data path

---
 software_mentions_client/client.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index 7069cff..4b18408 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -58,7 +58,11 @@ def __init__(
         self.env_dataset = None
 
         self._load_config(config_path)
-        self._init_lmdb(use_datastet=use_datastet)
+
+        self.root_data_path = self._init_lmdb(
+            use_datastet=use_datastet,
+            data_path=data_path
+        )
 
         if 'bucket_name' in self.config and self.config['bucket_name'] is not None and len(
                 self.config['bucket_name']) > 0:
@@ -98,6 +102,7 @@ def __init__(
                 logs_level = logging.CRITICAL
             else:
                 logs_level = logging.NOTSET
+
         logging.basicConfig(filename=logs_filename, filemode='w', level=logs_level)
         print("logs are written in " + logs_filename)
 
@@ -143,9 +148,11 @@ def service_isalive(self, use_datastet=False):
                           'test call to service, please check and re-start a server.')
         return False
 
-    def _init_lmdb(self, use_datastet=False, data_path=None):
+    def _init_lmdb(self, use_datastet=False, data_path=None) -> str:
         # open in write mode
         root_data_path = self.config["data_path"] if not data_path else data_path
+
+        os.makedirs(root_data_path, exist_ok=True)
         if use_datastet:
             env_file_path = os.path.join(root_data_path, 'entries_dataset')
             self.env_dataset = lmdb.open(env_file_path, map_size=map_size)
@@ -156,6 +163,8 @@ def _init_lmdb(self, use_datastet=False, data_path=None):
         # envFilePath = os.path.join(self.config["data_path"], 'fail_software')
         # self.env_fail_software = lmdb.open(envFilePath, map_size=map_size)
 
+        return root_data_path
+
     def annotate_directory(self, directory, force=False, use_datastet=False):
         '''
         recursive directory walk for processing in parallel all PDF and XML documents
@@ -510,20 +519,20 @@ def reset(self, use_datastet=False, data_path=None):
             # close environments
             self.env_dataset.close()
 
-            envFilePath = os.path.join(root_data_path, 'entries_dataset')
+            envFilePath = os.path.join(self.root_data_path, 'entries_dataset')
             shutil.rmtree(envFilePath)
 
             # re-init the environments
-            self._init_lmdb(use_datastet=True)
+            self._init_lmdb(use_datastet=True, data_path=root_data_path)
         else:
             # close environments
             self.env_software.close()
 
-            envFilePath = os.path.join(root_data_path, 'entries_software')
+            envFilePath = os.path.join(self.root_data_path, 'entries_software')
             shutil.rmtree(envFilePath)
 
             # re-init the environments
-            self._init_lmdb(use_datastet=False)
+            self._init_lmdb(use_datastet=False, data_path=root_data_path)
 
     def load_mongo(self, directory):
         if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0:

From 55e3aad961cb6f6ba0ab069ddf561af04e04c264 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Mon, 16 Jun 2025 09:21:20 +0200
Subject: [PATCH 8/9] fix the db paths again: reset() reuses the stored root data path

---
 software_mentions_client/client.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/software_mentions_client/client.py b/software_mentions_client/client.py
index 4b18408..6dc6732 100644
--- a/software_mentions_client/client.py
+++ b/software_mentions_client/client.py
@@ -508,13 +508,11 @@ def reprocess_failed(self, directory, use_datastet=False):
 
         logging.info("re-processed: " + str(nb_total) + " entries")
 
-    def reset(self, use_datastet=False, data_path=None):
+    def reset(self, use_datastet=False):
         """
         Remove the local lmdb keeping track of the state of advancement of the annotation and
         of the failed entries
         """
-        root_data_path = self.config["data_path"] if data_path is None else data_path
-
         if use_datastet:
             # close environments
             self.env_dataset.close()
@@ -523,7 +521,7 @@ def reset(self, use_datastet=False, data_path=None):
             shutil.rmtree(envFilePath)
 
             # re-init the environments
-            self._init_lmdb(use_datastet=True, data_path=root_data_path)
+            self._init_lmdb(use_datastet=True, data_path=self.root_data_path)
         else:
             # close environments
             self.env_software.close()
@@ -532,7 +530,7 @@ def reset(self, use_datastet=False, data_path=None):
             shutil.rmtree(envFilePath)
 
             # re-init the environments
-            self._init_lmdb(use_datastet=False, data_path=root_data_path)
+            self._init_lmdb(use_datastet=False, data_path=self.root_data_path)
 
     def load_mongo(self, directory):
         if "mongo_host" in self.config and len(self.config["mongo_host"].strip()) > 0:

From 948e9579d938e9a7197d31c904e01fc052d9cbad Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 2 Oct 2025 12:42:48 +0100
Subject: [PATCH 9/9] update documentation

---
 Readme.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Readme.md b/Readme.md
index 5ab23a5..32788f3 100644
--- a/Readme.md
+++ b/Readme.md
@@ -55,7 +55,7 @@ Finally install the project in editable state
 ```
 usage: client.py [-h] [--repo-in REPO_IN] [--file-in FILE_IN] [--file-out FILE_OUT] [--config CONFIG]
                  [--reprocess] [--reset] [--load] [--diagnostic-mongo] [--diagnostic-files]
-                 [--scorched-earth] [--datastet]
+                 [--scorched-earth] [--datastet] [--data-path DATA_PATH]
 
 Softcite software mention recognizer client
 
@@ -68,6 +68,7 @@ optional arguments:
   --file-out FILE_OUT  path to a single output the software mentions in JSON format, extracted from
                        the PDF file-in
   --config CONFIG      path to the config file, default is ./config.json
+  --data-path DATA_PATH path to the directory containing the LMDB database.
   --reprocess          reprocessed failed PDF or XML fulltexts
   --reset              ignore previous processing states and re-init the annotation process from the
                        beginning
@@ -137,7 +138,7 @@ Anntations will be added along the PDF and XML files, with extension `*.software
 -rw-rw-r-- 1 lopez lopez  485 Aug  8 03:41 0100a44b-6f3f-4cf7-86f9-8ef5e8401567.software.json
 ```
 
-### Reprocess failed PDF or XML fulltexts
+### Reprocess failed PDF or XML fulltext
 
 Just add `--reprocess` to the command line, the processing will be limited to the PDF and XML files that failed when processing them: 
 
@@ -175,7 +176,7 @@ python3 -m software_mentions_client.client --diagnostic-mongo
 
 ## Configuration
 
-By default, the concurreny of the parallelized calls to a service is `8`. This parameter can be changed in the configuration file `config.json`.
+By default, the concurrency of the parallelized calls to a service is `8`. This parameter can be changed in the configuration file `config.json`.
 
 Other important configuration parameter are the URL of the Software mention recognition web service `software_mention_url`, the optional URL of a DataStet server if used `dataset_mention_url`, the MongoDb instance information if you wish to load the produced annotations in MongoDB.