Skip to content

Commit 13a0a21

Browse files
authored
Merge pull request #475 from amahuli03/title-generation-font-size
refactor: file upload uses font size and more lenient regex to extract titles
2 parents 75c1a14 + edf1eb6 commit 13a0a21

2 files changed

Lines changed: 178 additions & 70 deletions

File tree

server/api/views/uploadFile/test_title.py

Lines changed: 108 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -4,60 +4,75 @@
44
from . import title
55

66

7+
def make_page_dict(blocks: list[list[tuple[str, float]]]) -> dict:
    """Build a fake ``get_text("dict")`` return value from a simple list of blocks.

    Args:
        blocks: one entry per block; each block is a list of
            ``(text, font_size)`` tuples, one tuple per span.

    Returns:
        ``{"blocks": [...]}`` where every block has ``"type": 0`` and a single
        line holding all of that block's spans as
        ``{"text": ..., "size": ...}`` dicts.
    """
    return {
        "blocks": [
            {
                "type": 0,
                # All spans of a block are placed on one line; the tests only
                # need span order and font size, not real line breaks.
                "lines": [{
                    "spans": [{"text": text, "size": size} for text, size in spans]
                }],
            }
            for spans in blocks
        ]
    }
20+
21+
22+
def make_mock_doc(pages_data, metadata=None):
    """Create a MagicMock that stands in for a fitz.Document.

    pages_data: one list of blocks per page; every block is a list of
        (text, size) tuples, as accepted by make_page_dict.
    metadata: value for ``doc.metadata``; a falsy value is replaced by
        ``{"title": None}``.

    The returned mock supports ``len(doc)`` and ``doc[idx]``; each page's
    ``get_text`` returns the dict built from that page's blocks.
    """
    def _build_page(page_blocks):
        # One mock page whose get_text() always yields the prepared dict.
        mock_page = MagicMock()
        mock_page.get_text.return_value = make_page_dict(page_blocks)
        return mock_page

    pages = [_build_page(page_blocks) for page_blocks in pages_data]

    document = MagicMock()
    document.metadata = metadata if metadata else {"title": None}
    # MagicMock lets dunder methods be set on the instance; the functions
    # receive the mock itself as the first argument.
    document.__len__ = lambda self: len(pages_data)
    document.__getitem__ = lambda self, index: pages[index]
    return document
38+
39+
740
class TestGenerateTitle(unittest.TestCase):
841
def test_prefers_metadata_title_if_valid(self):
942
doc = MagicMock()
1043
doc.metadata = {"title": "A Study Regarding The Efficacy of Drugs"}
1144
self.assertEqual(
1245
"A Study Regarding The Efficacy of Drugs", title.generate_title(doc))
1346

14-
def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self):
15-
doc = MagicMock()
16-
doc.metadata = {"title": ""}
17-
doc[0].get_text = MagicMock()
18-
19-
foo_block = [None] * 7
20-
foo_block[4] = "foo"
21-
foo_block[6] = 0
22-
23-
title_block = [None] * 7
24-
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
25-
title_block[6] = 0
26-
27-
bar_block = [None] * 7
28-
bar_block[4] = "bar"
29-
bar_block[6] = 0
30-
doc[0].get_text.return_value = [foo_block, title_block, bar_block]
31-
47+
def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
48+
doc = make_mock_doc(
49+
pages_data=[[
50+
[("foo", 10.0)],
51+
[("Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia", 18.0)],
52+
[("bar", 10.0)],
53+
]],
54+
metadata={"title": ""},
55+
)
3256
expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
3357
self.assertEqual(expected_title, title.generate_title(doc))
3458

35-
def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self):
36-
doc = MagicMock()
37-
doc.metadata = {"title": "abcd1234"}
38-
doc[0].get_text = MagicMock()
39-
40-
foo_block = [None] * 7
41-
foo_block[4] = "foo"
42-
foo_block[6] = 0
43-
44-
title_block = [None] * 7
45-
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
46-
title_block[6] = 0
47-
48-
bar_block = [None] * 7
49-
bar_block[4] = "bar"
50-
bar_block[6] = 0
51-
doc[0].get_text.return_value = [foo_block, title_block, bar_block]
52-
59+
def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
60+
doc = make_mock_doc(
61+
pages_data=[[
62+
[("foo", 10.0)],
63+
[("Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia", 18.0)],
64+
[("bar", 10.0)],
65+
]],
66+
metadata={"title": "abcd1234"},
67+
)
5368
expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
5469
self.assertEqual(expected_title, title.generate_title(doc))
5570

5671
@patch("api.views.uploadFile.title.openAIServices.openAI")
5772
def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
58-
doc = MagicMock()
59-
doc.metadata = {"title": None}
60-
doc[0].get_text.return_value = []
73+
doc = make_mock_doc(
74+
pages_data=[[]] # no blocks at all
75+
)
6176

6277
mock_openAI.return_value = "A Study Regarding The Efficacy of Drugs"
6378

@@ -68,9 +83,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
6883

6984
@patch("api.views.uploadFile.title.openAIServices.openAI")
7085
def test_strips_quotes_from_openai_title(self, mock_openAI):
71-
doc = MagicMock()
72-
doc.metadata = {"title": None}
73-
doc[0].get_text.return_value = []
86+
doc = make_mock_doc(pages_data=[[]])
7487

7588
mock_openAI.return_value = '"Updated CANMAT/ISBD Guidelines for Treating Mixed Features in Bipolar Disorder"'
7689

@@ -80,13 +93,63 @@ def test_strips_quotes_from_openai_title(self, mock_openAI):
8093

8194
@patch("api.views.uploadFile.title.openAIServices.openAI")
8295
def test_truncates_long_openai_title(self, mock_openAI):
83-
doc = MagicMock()
84-
doc.metadata = {"title": None}
85-
doc[0].get_text.return_value = []
96+
doc = make_mock_doc(pages_data=[[]])
8697

8798
mock_openAI.return_value = "A" * 300
8899

89100
result = title.generate_title(doc)
90101

91102
# Ensure the title is truncated to fit the UploadFile model's title field (max_length=255), since OpenAI responses may exceed this limit
92103
self.assertLessEqual(len(result), 255)
104+
105+
def test_font_size_joins_adjacent_spans_in_same_block(self):
106+
"""A title split across multiple spans in the same block should be joined."""
107+
doc = make_mock_doc(
108+
pages_data=[[
109+
[("Author Name", 10.0)],
110+
[("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)],
111+
[("Some journal info", 10.0)],
112+
]],
113+
)
114+
result = title.extract_title_by_font_size(doc)
115+
self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
116+
117+
def test_font_size_ignores_short_spans(self):
118+
"""Superscript markers and other tiny spans should be filtered out."""
119+
doc = make_mock_doc(
120+
pages_data=[[
121+
[("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)],
122+
[("Author Name et al.", 10.0)],
123+
]],
124+
)
125+
# The "*" span is < 2 chars, so it should be ignored; title is just the real text
126+
result = title.extract_title_by_font_size(doc)
127+
self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
128+
129+
def test_font_size_returns_none_when_no_regex_match(self):
130+
"""If the largest-font text doesn't match the title regex, return None."""
131+
doc = make_mock_doc(
132+
pages_data=[[
133+
# Only 2 words — regex requires at least 3
134+
[("Psychiatry Research", 18.0)],
135+
[("Author Name et al.", 10.0)],
136+
]],
137+
)
138+
result = title.extract_title_by_font_size(doc)
139+
self.assertIsNone(result)
140+
141+
def test_font_size_finds_title_on_later_page(self):
142+
"""Title on page 2 should still be found if it has the largest font."""
143+
doc = make_mock_doc(
144+
pages_data=[
145+
[ # page 1: cover page with smaller text
146+
[("Some preamble text here", 12.0)],
147+
],
148+
[ # page 2: actual title in larger font
149+
[("Advances in Mood Disorder Pharmacotherapy", 18.0)],
150+
[("Author Name et al.", 10.0)],
151+
],
152+
],
153+
)
154+
result = title.extract_title_by_font_size(doc)
155+
self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")

server/api/views/uploadFile/title.py

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,44 +6,89 @@
66

77

88
# Regular expression that matches common research white-paper titles (generated with ChatGPT).
9-
# requires at least 3 words, no dates, no version numbers.
9+
# requires at least 3 words, no version numbers.
1010
title_regex = re.compile(
11-
r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
11+
r"^(?=(?:\b\w+\b[^A-Za-z0-9]*){3,})(?!.*\bv\d+\b)[A-Za-z0-9].+[A-Za-z\)?!]$", re.IGNORECASE)
1212

1313

1414
def generate_title(pdf: fitz.Document) -> str | None:
    """
    Choose a title for a PDF, trying progressively more expensive sources.

    Order of preference:
      1. the "title" entry of the document metadata, when it matches
         ``title_regex``;
      2. the result of ``extract_title_by_font_size``;
      3. the result of ``summarize_pdf``.

    Returns the first non-empty title found, or None if every source is empty.
    """
    # Tolerate a missing metadata dict or a missing/None "title" entry instead
    # of raising; all of those simply mean "no metadata title".
    metadata = pdf.metadata or {}
    metadata_title = (metadata.get("title") or "").strip()

    # Match against the stripped text: title_regex is anchored at both ends,
    # so surrounding whitespace would otherwise reject a good title.
    if metadata_title and title_regex.match(metadata_title):
        # Same 255-character cap that extract_title_by_font_size applies.
        return metadata_title[:255]

    font_title = extract_title_by_font_size(pdf)
    if font_title:
        return font_title

    gpt_title = summarize_pdf(pdf)
    return gpt_title or None
4526

4627

28+
def extract_title_by_font_size(pdf: fitz.Document, max_pages: int = 3) -> str | None:
    """
    Extract the title by finding the largest font size across the first few
    pages and collecting contiguous runs of text at that size.

    Args:
        pdf: document to inspect; only ``len(pdf)``, ``pdf[i]`` and
            ``page.get_text("dict")`` are used.
        max_pages: upper bound on how many leading pages are scanned.

    Returns:
        The longest run of largest-font text that matches ``title_regex``,
        truncated to 255 characters, or None when no run qualifies.
    """
    min_span_chars = 2      # drops lone markers such as "*" or a footnote digit
    min_font_size = 6.0     # spans smaller than this are never considered
    max_title_length = 255

    pages_to_scan = min(max_pages, len(pdf))

    # First pass: collect the usable spans of every page (kept per page) and
    # track the largest font size seen anywhere in the scanned pages.
    spans_by_page = []
    max_font_size = 0.0

    for page_idx in range(pages_to_scan):
        page_dict = pdf[page_idx].get_text("dict")
        page_spans = []
        for block in page_dict.get("blocks", []):
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text = span["text"].strip()
                    size = span["size"]
                    if len(text) < min_span_chars or size < min_font_size:
                        continue
                    page_spans.append((text, size))
                    max_font_size = max(max_font_size, size)
        spans_by_page.append(page_spans)

    if max_font_size == 0.0:
        return None

    # Second pass: gather contiguous runs of spans at the max font size.
    # Runs continue across block boundaries so multi-block titles (e.g.,
    # "BIPOLAR DISORDER IN PRIMARY CARE:" in one block and "DIAGNOSIS AND
    # MANAGEMENT" in the next) are joined into a single candidate.
    # A run ends when a non-max-size span interrupts it or when the page ends,
    # so a heading closing one page is never glued to one opening the next.
    candidates = []
    for page_spans in spans_by_page:
        current_run = []
        for text, size in page_spans:
            if size == max_font_size:
                current_run.append(text)
            elif current_run:
                candidates.append(" ".join(current_run))
                current_run = []
        if current_run:
            candidates.append(" ".join(current_run))

    # Normalize whitespace, validate against the title regex, and pick the
    # longest match. Longest wins because real titles are typically longer
    # than section headers (e.g., "About the Author") that may share the same
    # max font size.
    best = None
    for candidate in candidates:
        # split()/join also turns a lone tab or newline into a single space.
        cleaned = " ".join(candidate.split())
        if title_regex.match(cleaned) and (best is None or len(cleaned) > len(best)):
            best = cleaned

    return best[:max_title_length] if best else None
90+
91+
4792
def summarize_pdf(pdf: fitz.Document) -> str:
4893
"""
4994
Summarize a PDF document using OpenAI's GPT-4 model.

0 commit comments

Comments
 (0)