diff --git a/extractor_info.json b/extractor_info.json index f70f5e2..1a690f4 100755 --- a/extractor_info.json +++ b/extractor_info.json @@ -1,7 +1,7 @@ { "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld", "name": "pdf2text-extractor", - "version": "0.12.0", + "version": "0.13.0", "description": "Extracts text from pdf files. Creates an xml, json and csv file and uploads to Clowder dataset. Uses Grobid service and AllenAI s2orc-doc2json", "author": "Mathew, Minu ; Lo, Kyle and Wang, Lucy Lu and Neumann, Mark and Kinney, Rodney and Weld, Daniel", "contributors": [], diff --git a/pdf2text.py b/pdf2text.py index fdc2cfa..9f295b2 100644 --- a/pdf2text.py +++ b/pdf2text.py @@ -73,9 +73,17 @@ def process_message(self, connector, host, secret_key, resource, parameters): pass if len(xml_surface_tags) > 0: - log.info("Extracting pdf dimensions from xml file") - page_width = xml_surface_tags[0]['lrx'] - page_height = xml_surface_tags[0]['lry'] + log.info("Extracting pdf page dimensions from xml file") + page_dimensions = {} + for surface in xml_surface_tags: + page_num = surface.get('n', str(len(page_dimensions) + 1)) + page_dimensions[str(page_num)] = { + "width": float(surface.get('lrx', page_width)), + "height": float(surface.get('lry', page_height)) + } + else: + log.error("No page dimensions found in xml file. Falling back to default dimensions.") + page_dimensions = {"1": {"width": page_width, "height": page_height}} # clean existing duplicate files_in_dataset = pyclowder.datasets.get_file_list(connector, host, secret_key, dataset_id) @@ -97,7 +105,6 @@ def process_message(self, connector, host, secret_key, resource, parameters): {"file_id": json_fileid, "filename": output_json_file, "description": "JSON output file form Grobid"}, {"file_id": csv_fileid, "filename": output_csv_file, "description": "CSV output file with extracted text, section, and coordinates"} ] - page_dimensions = {"width": page_width, "height": page_height} content = {"extractor": "pdf2text-extractor", "extracted_files": extracted_files, "page_dimensions": page_dimensions} context = "http://clowder.ncsa.illinois.edu/contexts/metadata.jsonld" #created_at = datetime.now().strftime("%a %d %B %H:%M:%S UTC %Y")