|
37 | 37 | from ..utils.error_handler import ValueErrorHandler, KeyboardInterruptHandler |
38 | 38 | from ..utils.logger import setup_logger |
39 | 39 | from ..utils.common_functions import return_error_message, write_timeout_file |
| 40 | +from ..utils.figure_extractor import FigureExtractor |
40 | 41 |
|
41 | 42 | # Load environment variables from .env file |
42 | 43 | load_dotenv() |
@@ -79,6 +80,7 @@ def __init__( |
79 | 80 | is_sql_db: bool = False, |
80 | 81 | is_save_xml: bool = False, |
81 | 82 | rag_config: RAGConfig = RAGConfig(), |
| 83 | + caption_keywords: dict = None, |
82 | 84 | ): |
83 | 85 | keyword_message = return_error_message("main_property_keyword") |
84 | 86 | property_keywords_message = return_error_message("property_keywords") |
@@ -111,6 +113,7 @@ def __init__( |
111 | 113 | self.is_sql_db = is_sql_db |
112 | 114 | self.is_save_xml = is_save_xml |
113 | 115 | self.rag_config = rag_config |
| 116 | + self.caption_keywords = caption_keywords |
114 | 117 | # Takes from config file |
115 | 118 | self.timeout_file = self.all_paths.TIMEOUT_DOI_LOG_FILENAME |
116 | 119 | self.article_related_keywords = ArticleRelatedKeywords() |
@@ -459,6 +462,104 @@ def _remove_elements(element_names, req_sections): |
459 | 462 | else: |
460 | 463 | return None, modified_sections |
461 | 464 |
|
def _extract_and_save_figures(self, root, doi: str):
    """
    Extract figures from Elsevier XML whose captions match self.caption_keywords.

    For every matching figure the caption is recorded through
    FigureExtractor.update_info_json and, when a downsampled image URL is
    listed for the figure, the image is requested (authenticated with
    self.api_key) and handed to FigureExtractor.save_figure_from_bytes.
    Output goes under
    results/extracted_data/{keyword}/related_figures
    (the layout below that directory is decided by FigureExtractor).

    This is a best-effort step: every failure is logged as a warning and
    never propagated to the caller. Nothing is done when
    self.caption_keywords is empty or None.

    Args:
        root: lxml root element of the parsed Elsevier XML.
        doi (str): Article DOI.

    Returns:
        None
    """
    if not self.caption_keywords:
        return
    base_path = f"results/extracted_data/{self.keyword}/related_figures"
    try:
        # Build ref -> URL map from the standard, downsampled <object> entries.
        # Entries without a ref attribute or without URL text are ignored.
        ref_to_url = {}
        for obj in root.xpath(
            './/*[local-name()="object"][@category="standard"]'
            '[@type="IMAGE-DOWNSAMPLED"]'
        ):
            ref = obj.get("ref")
            url = obj.text.strip() if obj.text else None
            if ref and url:
                ref_to_url[ref] = url

        # local-name() is used throughout so the lookup does not depend on
        # the namespace prefix bound in a particular document.
        for figure in root.xpath('.//*[local-name()="figure"]'):
            # Preferred caption source: <simple-para> descendants of <caption>.
            caption_elements = figure.xpath(
                './/*[local-name()="caption"]'
                '//*[local-name()="simple-para"]'
            )
            caption_text = " ".join(
                "".join(el.itertext()) for el in caption_elements
            ).strip()
            if not caption_text:
                # Fallback: any text found inside <caption>.
                caption_els = figure.xpath('.//*[local-name()="caption"]')
                caption_text = " ".join(
                    "".join(el.itertext()) for el in caption_els
                ).strip()

            if not FigureExtractor.keyword_matches_caption(
                caption_text, self.caption_keywords
            ):
                continue

            # First non-empty image locator among the figure's links
            # (e.g. "gr1").
            locator = next(
                (
                    link_el.get("locator")
                    for link_el in figure.xpath('.//*[local-name()="link"]')
                    if link_el.get("locator")
                ),
                None,
            )
            # Fall back on any *empty* value, not only on None: an empty
            # locator or id must never become the figure identifier, since
            # it is used as the info.json key and as the image name.
            caption_id = locator or figure.get("id") or "unknown"

            # Always save caption to info.json, even if the download fails.
            FigureExtractor.update_info_json(doi, caption_id, caption_text, base_path)

            # Download image if a URL is available for this identifier.
            url = ref_to_url.get(caption_id)
            if not url:
                logger.warning(
                    f"No image URL found for Elsevier figure '{caption_id}' in {doi}"
                )
                continue

            try:
                img_headers = {
                    "X-ELS-APIKey": self.api_key,
                    "Accept": "*/*",
                }
                resp = requests.get(url, headers=img_headers, timeout=30)
                if resp.status_code == 200:
                    saved = FigureExtractor.save_figure_from_bytes(
                        resp.content, doi, caption_id, base_path
                    )
                    if saved:
                        logger.info(
                            f"Saved Elsevier figure '{caption_id}' for {doi}"
                        )
                else:
                    logger.warning(
                        f"Failed to download Elsevier figure '{caption_id}' "
                        f"for {doi}: HTTP {resp.status_code}"
                    )
            except Exception as e:
                # A failed download must not stop the remaining figures.
                logger.warning(
                    f"Error downloading Elsevier figure '{caption_id}' "
                    f"for {doi}: {e}"
                )
    except Exception as e:
        # Figure extraction is optional; never let it abort article processing.
        logger.warning(f"Error extracting figures from Elsevier XML for {doi}: {e}")

462 | 563 | def _extract_paragraphs(self, element): |
463 | 564 | """ |
464 | 565 | Extract paragraphs from the sections of the article. |
@@ -888,6 +989,7 @@ def _process_articles(self): |
888 | 989 | if root is None: |
889 | 990 | logger.error(f"Failed to parse XML...skipping {row["doi"]}...") |
890 | 991 | continue |
| 992 | + self._extract_and_save_figures(root, row["doi"]) |
891 | 993 | if self.is_save_xml: |
892 | 994 | logger.info("Saving XML for DOI: ", row["doi"]) |
893 | 995 | self._save_xml(response, row["doi"]) |
|
0 commit comments