Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion extractor_info.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"name": "pdf2text-extractor",
"version": "0.12.0",
"version": "0.13.0",
"description": "Extracts text from pdf files. Creates an xml, json and csv file and uploads to Clowder dataset. Uses Grobid service and AllenAI s2orc-doc2json",
"author": "Mathew, Minu <minum@illinois.edu>; Lo, Kyle and Wang, Lucy Lu and Neumann, Mark and Kinney, Rodney and Weld, Daniel",
"contributors": [],
Expand Down
15 changes: 11 additions & 4 deletions pdf2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,17 @@ def process_message(self, connector, host, secret_key, resource, parameters):
pass

if len(xml_surface_tags) > 0:
log.info("Extracting pdf dimensions from xml file")
page_width = xml_surface_tags[0]['lrx']
page_height = xml_surface_tags[0]['lry']
log.info("Extracting pdf page dimensions from xml file")
page_dimensions = {}
for surface in xml_surface_tags:
page_num = surface.get('n', str(len(page_dimensions) + 1))
page_dimensions[str(page_num)] = {
"width": float(surface.get('lrx', page_width)),
"height": float(surface.get('lry', page_height))
}
else:
log.error("No page dimensions found in xml file. Falling back to default dimensions.")
page_dimensions = {"1": {"width": page_width, "height": page_height}}

# clean existing duplicate
files_in_dataset = pyclowder.datasets.get_file_list(connector, host, secret_key, dataset_id)
Expand All @@ -97,7 +105,6 @@ def process_message(self, connector, host, secret_key, resource, parameters):
{"file_id": json_fileid, "filename": output_json_file, "description": "JSON output file form Grobid"},
{"file_id": csv_fileid, "filename": output_csv_file, "description": "CSV output file with extracted text, section, and coordinates"}
]
page_dimensions = {"width": page_width, "height": page_height}
content = {"extractor": "pdf2text-extractor", "extracted_files": extracted_files, "page_dimensions": page_dimensions}
context = "http://clowder.ncsa.illinois.edu/contexts/metadata.jsonld"
#created_at = datetime.now().strftime("%a %d %B %H:%M:%S UTC %Y")
Expand Down