Skip to content

Commit 433f17a

Browse files
committed
vlm-based graph data extraction added
1 parent 188f07d commit 433f17a

File tree

14 files changed

+952
-1
lines changed

14 files changed

+952
-1
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,12 @@ cython_debug/
175175

176176
# IDE
177177
.vscode/
178+
.claude
179+
CLAUDE.md
178180

179181
# Remove example directory primarily
180182
examples/db/10.*
181183
tests example/
182184

183-
applications
185+
applications
186+
vlm_test

src/comproscanner/article_processors/elsevier_processor.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from ..utils.error_handler import ValueErrorHandler, KeyboardInterruptHandler
3838
from ..utils.logger import setup_logger
3939
from ..utils.common_functions import return_error_message, write_timeout_file
40+
from ..utils.figure_extractor import FigureExtractor
4041

4142
# Load environment variables from .env file
4243
load_dotenv()
@@ -79,6 +80,7 @@ def __init__(
7980
is_sql_db: bool = False,
8081
is_save_xml: bool = False,
8182
rag_config: RAGConfig = RAGConfig(),
83+
caption_keywords: dict = None,
8284
):
8385
keyword_message = return_error_message("main_property_keyword")
8486
property_keywords_message = return_error_message("property_keywords")
@@ -111,6 +113,7 @@ def __init__(
111113
self.is_sql_db = is_sql_db
112114
self.is_save_xml = is_save_xml
113115
self.rag_config = rag_config
116+
self.caption_keywords = caption_keywords
114117
# Takes from config file
115118
self.timeout_file = self.all_paths.TIMEOUT_DOI_LOG_FILENAME
116119
self.article_related_keywords = ArticleRelatedKeywords()
@@ -459,6 +462,104 @@ def _remove_elements(element_names, req_sections):
459462
else:
460463
return None, modified_sections
461464

465+
def _extract_and_save_figures(self, root, doi: str):
    """
    Extract figures from Elsevier XML whose captions match self.caption_keywords.
    Downloads images from the Elsevier API and saves them to
    results/extracted_data/{keyword}/related_figures/{doi_}/{caption_id}.jpg
    alongside info.json.

    The method is best-effort: it returns None in every case, and any
    exception raised while parsing or downloading is logged as a warning
    instead of being propagated to the caller.

    Args:
        root: lxml root element of the parsed Elsevier XML.
        doi (str): Article DOI.
    """
    # Figure extraction is opt-in: nothing to do without caption keywords.
    if not self.caption_keywords:
        return
    base_path = f"results/extracted_data/{self.keyword}/related_figures"
    try:
        # Build ref -> URL map from the <object> elements that carry the
        # down-sampled standard images.
        objects = root.xpath(
            './/*[local-name()="object"][@category="standard"]'
            '[@type="IMAGE-DOWNSAMPLED"]'
        )
        ref_to_url = {}
        for obj in objects:
            ref = obj.get("ref")
            url = obj.text.strip() if obj.text else None
            if ref and url:
                ref_to_url[ref] = url

        # local-name() is used throughout so the namespace prefix (e.g. ce:)
        # does not have to be registered.
        for figure in root.xpath('.//*[local-name()="figure"]'):
            # Caption text from all <simple-para> descendants inside <caption>
            caption_elements = figure.xpath(
                './/*[local-name()="caption"]'
                '//*[local-name()="simple-para"]'
            )
            caption_text = " ".join(
                "".join(el.itertext()) for el in caption_elements
            ).strip()
            if not caption_text:
                # Fall back to any text found directly inside <caption>
                caption_els = figure.xpath('.//*[local-name()="caption"]')
                caption_text = " ".join(
                    "".join(el.itertext()) for el in caption_els
                ).strip()

            if not FigureExtractor.keyword_matches_caption(
                caption_text, self.caption_keywords
            ):
                continue

            # Image reference locator (e.g., "gr1"): first non-empty
            # "locator" attribute among the figure's <link> elements.
            locator = next(
                (
                    value
                    for value in (
                        link_el.get("locator")
                        for link_el in figure.xpath('.//*[local-name()="link"]')
                    )
                    if value
                ),
                None,
            )
            # Treat an empty attribute like a missing one; otherwise an empty
            # string would become the caption id used for info.json and the
            # saved image.
            if not locator:
                locator = figure.get("id") or "unknown"

            # NOTE(review): caption_id comes straight from the XML and is
            # handed to FigureExtractor as an identifier for the saved file;
            # confirm FigureExtractor sanitises it before building a path.
            caption_id = locator

            # Always save caption to info.json, even if the download fails
            FigureExtractor.update_info_json(doi, caption_id, caption_text, base_path)

            # Download image if URL is available
            url = ref_to_url.get(locator)
            if not url:
                logger.warning(
                    f"No image URL found for Elsevier figure '{caption_id}' in {doi}"
                )
                continue

            try:
                img_headers = {
                    "X-ELS-APIKey": self.api_key,
                    "Accept": "*/*",
                }
                resp = requests.get(url, headers=img_headers, timeout=30)
                if resp.status_code == 200:
                    saved = FigureExtractor.save_figure_from_bytes(
                        resp.content, doi, caption_id, base_path
                    )
                    if saved:
                        logger.info(
                            f"Saved Elsevier figure '{caption_id}' for {doi}"
                        )
                else:
                    logger.warning(
                        f"Failed to download Elsevier figure '{caption_id}' "
                        f"for {doi}: HTTP {resp.status_code}"
                    )
            except Exception as e:
                # A failed download must not stop the remaining figures.
                logger.warning(
                    f"Error downloading Elsevier figure '{caption_id}' "
                    f"for {doi}: {e}"
                )
    except Exception as e:
        # Boundary handler: figure extraction must never abort article processing.
        logger.warning(f"Error extracting figures from Elsevier XML for {doi}: {e}")
562+
462563
def _extract_paragraphs(self, element):
463564
"""
464565
Extract paragraphs from the sections of the article.
@@ -888,6 +989,7 @@ def _process_articles(self):
888989
if root is None:
889990
logger.error(f"Failed to parse XML...skipping {row["doi"]}...")
890991
continue
992+
self._extract_and_save_figures(root, row["doi"])
891993
if self.is_save_xml:
892994
logger.info("Saving XML for DOI: ", row["doi"])
893995
self._save_xml(response, row["doi"])

0 commit comments

Comments
 (0)