Merge pull request #475 from amahuli03/title-generation-font-size

sahilds1 · web-flow · commit 13a0a21d5279 · 2026-03-31T18:27:25.000-04:00
refactor: file upload uses font size and more lenient regex to extract titles
diff --git a/server/api/views/uploadFile/test_title.py b/server/api/views/uploadFile/test_title.py
@@ -4,60 +4,75 @@
 from . import title
 
 
+def make_page_dict(blocks):
+    """Helper to build a get_text("dict") return value from a simple list of blocks.
+    Each block is a list of (text, font_size) tuples; all of its spans are placed on one line.
+    """
+    dict_blocks = []
+    for spans in blocks:
+        dict_blocks.append({
+            "type": 0,
+            "lines": [{
+                "spans": [{"text": text, "size": size} for text, size in spans]
+            }]
+        })
+    return {"blocks": dict_blocks}
+
+
+def make_mock_doc(pages_data, metadata=None):
+    """Build a mock fitz.Document.
+    pages_data: list of block lists, one per page. Each block is a list of (text, size) tuples.
+    """
+    doc = MagicMock()
+    doc.metadata = metadata or {"title": None}
+    doc.__len__ = lambda self: len(pages_data)
+
+    mock_pages = []
+    for page_blocks in pages_data:
+        page = MagicMock()
+        page.get_text.return_value = make_page_dict(page_blocks)
+        mock_pages.append(page)
+
+    doc.__getitem__ = lambda self, idx: mock_pages[idx]
+    return doc
+
+
 class TestGenerateTitle(unittest.TestCase):
     def test_prefers_metadata_title_if_valid(self):
         doc = MagicMock()
         doc.metadata = {"title": "A Study Regarding The Efficacy of Drugs"}
         self.assertEqual(
             "A Study Regarding The Efficacy of Drugs", title.generate_title(doc))
 
-    def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self):
-        doc = MagicMock()
-        doc.metadata = {"title": ""}
-        doc[0].get_text = MagicMock()
-
-        foo_block = [None] * 7
-        foo_block[4] = "foo"
-        foo_block[6] = 0
-
-        title_block = [None] * 7
-        title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
-        title_block[6] = 0
-
-        bar_block = [None] * 7
-        bar_block[4] = "bar"
-        bar_block[6] = 0
-        doc[0].get_text.return_value = [foo_block, title_block, bar_block]
-
+    def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
+        doc = make_mock_doc(
+            pages_data=[[
+                [("foo", 10.0)],
+                [("Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia", 18.0)],
+                [("bar", 10.0)],
+            ]],
+            metadata={"title": ""},
+        )
         expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
         self.assertEqual(expected_title, title.generate_title(doc))
 
-    def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self):
-        doc = MagicMock()
-        doc.metadata = {"title": "abcd1234"}
-        doc[0].get_text = MagicMock()
-
-        foo_block = [None] * 7
-        foo_block[4] = "foo"
-        foo_block[6] = 0
-
-        title_block = [None] * 7
-        title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
-        title_block[6] = 0
-
-        bar_block = [None] * 7
-        bar_block[4] = "bar"
-        bar_block[6] = 0
-        doc[0].get_text.return_value = [foo_block, title_block, bar_block]
-
+    def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
+        doc = make_mock_doc(
+            pages_data=[[
+                [("foo", 10.0)],
+                [("Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia", 18.0)],
+                [("bar", 10.0)],
+            ]],
+            metadata={"title": "abcd1234"},
+        )
         expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
         self.assertEqual(expected_title, title.generate_title(doc))
 
     @patch("api.views.uploadFile.title.openAIServices.openAI")
     def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
-        doc = MagicMock()
-        doc.metadata = {"title": None}
-        doc[0].get_text.return_value = []
+        doc = make_mock_doc(
+            pages_data=[[]]  # no blocks at all
+        )
 
         mock_openAI.return_value = "A Study Regarding The Efficacy of Drugs"
 
@@ -68,9 +83,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
 
     @patch("api.views.uploadFile.title.openAIServices.openAI")
     def test_strips_quotes_from_openai_title(self, mock_openAI):
-        doc = MagicMock()
-        doc.metadata = {"title": None}
-        doc[0].get_text.return_value = []
+        doc = make_mock_doc(pages_data=[[]])
 
         mock_openAI.return_value = '"Updated CANMAT/ISBD Guidelines for Treating Mixed Features in Bipolar Disorder"'
 
@@ -80,13 +93,63 @@ def test_strips_quotes_from_openai_title(self, mock_openAI):
 
     @patch("api.views.uploadFile.title.openAIServices.openAI")
     def test_truncates_long_openai_title(self, mock_openAI):
-        doc = MagicMock()
-        doc.metadata = {"title": None}
-        doc[0].get_text.return_value = []
+        doc = make_mock_doc(pages_data=[[]])
 
         mock_openAI.return_value = "A" * 300
 
         result = title.generate_title(doc)
 
         # Ensure the title is truncated to fit the UploadFile model's title field (max_length=255), since OpenAI responses may exceed this limit
         self.assertLessEqual(len(result), 255)
+
+    def test_font_size_joins_adjacent_spans_in_same_block(self):
+        """A title split across multiple spans in the same block should be joined."""
+        doc = make_mock_doc(
+            pages_data=[[
+                [("Author Name", 10.0)],
+                [("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)],
+                [("Some journal info", 10.0)],
+            ]],
+        )
+        result = title.extract_title_by_font_size(doc)
+        self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
+
+    def test_font_size_ignores_short_spans(self):
+        """Superscript markers and other tiny spans should be filtered out."""
+        doc = make_mock_doc(
+            pages_data=[[
+                [("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)],
+                [("Author Name et al.", 10.0)],
+            ]],
+        )
+        # The "*" span is < 2 chars, so it should be ignored; title is just the real text
+        result = title.extract_title_by_font_size(doc)
+        self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
+
+    def test_font_size_returns_none_when_no_regex_match(self):
+        """If the largest-font text doesn't match the title regex, return None."""
+        doc = make_mock_doc(
+            pages_data=[[
+                # Only 2 words — regex requires at least 3
+                [("Psychiatry Research", 18.0)],
+                [("Author Name et al.", 10.0)],
+            ]],
+        )
+        result = title.extract_title_by_font_size(doc)
+        self.assertIsNone(result)
+
+    def test_font_size_finds_title_on_later_page(self):
+        """Title on page 2 should still be found if it has the largest font."""
+        doc = make_mock_doc(
+            pages_data=[
+                [  # page 1: cover page with smaller text
+                    [("Some preamble text here", 12.0)],
+                ],
+                [  # page 2: actual title in larger font
+                    [("Advances in Mood Disorder Pharmacotherapy", 18.0)],
+                    [("Author Name et al.", 10.0)],
+                ],
+            ],
+        )
+        result = title.extract_title_by_font_size(doc)
+        self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
diff --git a/server/api/views/uploadFile/title.py b/server/api/views/uploadFile/title.py
@@ -6,44 +6,89 @@
 
 
 # regular expression to match common research white paper titles. Created by Chat-gpt
-# requires at least 3 words, no dates, no version numbers.
+# requires at least 3 words, no version numbers.
 title_regex = re.compile(
-    r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
+    r"^(?=(?:\b\w+\b[^A-Za-z0-9]*){3,})(?!.*\bv\d+\b)[A-Za-z0-9].+[A-Za-z\)?!]$", re.IGNORECASE)
 
 
 def generate_title(pdf: fitz.Document) -> str | None:
     document_metadata_title = pdf.metadata["title"]
     if document_metadata_title is not None and document_metadata_title != "":
         if title_regex.match(document_metadata_title):
-            print("suitable title was found in metadata")
             return document_metadata_title.strip()
-        else:
-            print("metadata title did not match regex")
 
-    print("Looking for title in first page text")
-    first_page = pdf[0]
-    first_page_blocks = first_page.get_text("blocks")
-    text_blocks = [
-        block[4].strip().replace("\n", " ")
-        for block in first_page_blocks
-        if block[6] == 0  # only include text blocks.
-    ]
-
-    # For some reason, extracted PDF text has extra spaces. Collapse them here.
-    regex = r"\s{2,}"
-    text_blocks = [re.sub(regex, " ", text) for text in text_blocks]
-
-    if len(text_blocks) != 0:
-        for text in text_blocks:
-            if title_regex.match(text):
-                return text
-
-    print(
-        "no suitable title found in first page text. Using GPT-4 to summarize the PDF")
+    font_title = extract_title_by_font_size(pdf)
+    if font_title:
+        return font_title
+
     gpt_title = summarize_pdf(pdf)
     return gpt_title or None
 
 
+def extract_title_by_font_size(pdf: fitz.Document, max_pages: int = 3) -> str | None:
+    """
+    Extract the title by finding the largest font size across the first `max_pages`
+    pages and returning the longest title_regex match at that size, or None.
+    """
+    pages_to_scan = min(max_pages, len(pdf))
+
+    # First pass: collect usable spans (>= 2 chars, size >= 6.0) and find the max font size.
+    all_spans = []
+    max_font_size = 0.0
+
+    for page_idx in range(pages_to_scan):
+        page_dict = pdf[page_idx].get_text("dict")
+        for block in page_dict["blocks"]:
+            if block.get("type") != 0:
+                continue
+            for line in block["lines"]:
+                for span in line["spans"]:
+                    text = span["text"].strip()
+                    size = span["size"]
+                    if len(text) < 2 or size < 6.0:
+                        continue
+                    all_spans.append({"text": text, "size": size})
+                    if size > max_font_size:
+                        max_font_size = size
+
+    if max_font_size == 0.0:
+        return None
+
+    # Second pass: gather contiguous runs of spans at the max font size.
+    # Runs continue across line, block, and page boundaries so multi-block
+    # titles (e.g., "BIPOLAR DISORDER IN PRIMARY CARE:" in one block and
+    # "DIAGNOSIS AND MANAGEMENT" in the next) are joined into one candidate.
+    # Only a kept span of another size ends a run; filtered-out spans do not.
+    candidates = []
+    current_run = []
+
+    for span in all_spans:
+        if span["size"] == max_font_size:
+            current_run.append(span["text"])
+        else:
+            if current_run:
+                candidates.append(" ".join(current_run))
+                current_run = []
+
+    if current_run:
+        candidates.append(" ".join(current_run))
+
+    # Collapse extra whitespace, validate against title regex, and pick the longest match.
+    # Longest wins because real titles are typically longer than section headers
+    # (e.g., "About the Author") that may share the same max font size.
+    best = None
+    for candidate in candidates:
+        cleaned = re.sub(r"\s{2,}", " ", candidate).strip()
+        if title_regex.match(cleaned):
+            if best is None or len(cleaned) > len(best):
+                best = cleaned
+
+    if best:
+        return best[:255]
+
+    return None
+
+
 def summarize_pdf(pdf: fitz.Document) -> str:
     """
     Summarize a PDF document using OpenAI's GPT-4 model.