Skip to content

Commit edf1eb6

Browse files
committed
Update tests for font-size-based title extraction
Refactor test helpers to use get_text("dict") structure instead of get_text("blocks"). Add tests for multi-span joining, short span filtering, regex rejection, and multi-page title detection.
1 parent 2a822f6 commit edf1eb6

1 file changed

Lines changed: 108 additions & 45 deletions

File tree

server/api/views/uploadFile/test_title.py

Lines changed: 108 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -4,60 +4,75 @@
44
from . import title
55

66

7+
def make_page_dict(blocks):
    """Build a stand-in for the value returned by ``page.get_text("dict")``.

    Args:
        blocks: List of blocks. Each block is a list of ``(text, font_size)``
            tuples, one tuple per span.

    Returns:
        A dict of the form ``{"blocks": [...]}``. Every input block becomes
        one entry with ``"type": 0`` and a single line that holds all of the
        block's spans as ``{"text": ..., "size": ...}`` dicts, in input order.
    """
    return {
        "blocks": [
            {
                "type": 0,
                # All spans of a block are placed on one line; the helper
                # does not model multi-line blocks.
                "lines": [
                    {
                        "spans": [
                            {"text": span_text, "size": span_size}
                            for span_text, span_size in block_spans
                        ]
                    }
                ],
            }
            for block_spans in blocks
        ]
    }
20+
21+
22+
def make_mock_doc(pages_data, metadata=None):
    """Build a mock ``fitz.Document`` backed by canned page data.

    Args:
        pages_data: One entry per page. Each entry is a list of blocks, and
            each block is a list of ``(text, size)`` tuples, i.e. the input
            accepted by ``make_page_dict``.
        metadata: Value for ``doc.metadata``. Any falsy value (``None`` or an
            empty dict) is replaced by ``{"title": None}``.

    Returns:
        A ``MagicMock`` that supports ``len(doc)``, ``doc[idx]`` and
        ``for page in doc``. Each page is a ``MagicMock`` whose ``get_text``
        returns the dict built for that page, regardless of the arguments
        it is called with.
    """
    doc = MagicMock()
    doc.metadata = metadata or {"title": None}

    mock_pages = []
    for page_blocks in pages_data:
        page = MagicMock()
        page.get_text.return_value = make_page_dict(page_blocks)
        mock_pages.append(page)

    # Functions assigned to a mock's magic methods must accept ``self`` first.
    doc.__len__ = lambda self: len(mock_pages)
    doc.__getitem__ = lambda self, idx: mock_pages[idx]
    # MagicMock's default __iter__ yields nothing, so without this override
    # ``for page in doc`` would see zero pages even though ``len(doc)`` and
    # ``doc[i]`` report them.
    doc.__iter__ = lambda self: iter(mock_pages)
    return doc
38+
39+
740
class TestGenerateTitle(unittest.TestCase):
841
def test_prefers_metadata_title_if_valid(self):
    # The metadata title is expected to come back unchanged.
    metadata_title = "A Study Regarding The Efficacy of Drugs"
    mock_doc = MagicMock()
    mock_doc.metadata = {"title": metadata_title}

    generated = title.generate_title(mock_doc)

    self.assertEqual(metadata_title, generated)
1346

14-
def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self):
15-
doc = MagicMock()
16-
doc.metadata = {"title": ""}
17-
doc[0].get_text = MagicMock()
18-
19-
foo_block = [None] * 7
20-
foo_block[4] = "foo"
21-
foo_block[6] = 0
22-
23-
title_block = [None] * 7
24-
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
25-
title_block[6] = 0
26-
27-
bar_block = [None] * 7
28-
bar_block[4] = "bar"
29-
bar_block[6] = 0
30-
doc[0].get_text.return_value = [foo_block, title_block, bar_block]
31-
47+
def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
    # With an empty metadata title, the text of the block with the largest
    # font size (18.0 vs. 10.0) is the expected result.
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": ""})

    generated = title.generate_title(doc)

    self.assertEqual(expected_title, generated)
3458

35-
def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self):
36-
doc = MagicMock()
37-
doc.metadata = {"title": "abcd1234"}
38-
doc[0].get_text = MagicMock()
39-
40-
foo_block = [None] * 7
41-
foo_block[4] = "foo"
42-
foo_block[6] = 0
43-
44-
title_block = [None] * 7
45-
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
46-
title_block[6] = 0
47-
48-
bar_block = [None] * 7
49-
bar_block[4] = "bar"
50-
bar_block[6] = 0
51-
doc[0].get_text.return_value = [foo_block, title_block, bar_block]
52-
59+
def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
    # The metadata title "abcd1234" is expected to be rejected, so the text
    # of the block with the largest font size (18.0 vs. 10.0) is returned.
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": "abcd1234"})

    generated = title.generate_title(doc)

    self.assertEqual(expected_title, generated)
5570

5671
@patch("api.views.uploadFile.title.openAIServices.openAI")
5772
def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
58-
doc = MagicMock()
59-
doc.metadata = {"title": None}
60-
doc[0].get_text.return_value = []
73+
doc = make_mock_doc(
74+
pages_data=[[]] # no blocks at all
75+
)
6176

6277
mock_response = MagicMock()
6378
mock_response.choices = [MagicMock()]
@@ -70,9 +85,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
7085

7186
@patch("api.views.uploadFile.title.openAIServices.openAI")
7287
def test_strips_quotes_from_openai_title(self, mock_openAI):
73-
doc = MagicMock()
74-
doc.metadata = {"title": None}
75-
doc[0].get_text.return_value = []
88+
doc = make_mock_doc(pages_data=[[]])
7689

7790
mock_response = MagicMock()
7891
mock_response.choices = [MagicMock()]
@@ -85,9 +98,7 @@ def test_strips_quotes_from_openai_title(self, mock_openAI):
8598

8699
@patch("api.views.uploadFile.title.openAIServices.openAI")
87100
def test_truncates_long_openai_title(self, mock_openAI):
88-
doc = MagicMock()
89-
doc.metadata = {"title": None}
90-
doc[0].get_text.return_value = []
101+
doc = make_mock_doc(pages_data=[[]])
91102

92103
mock_response = MagicMock()
93104
mock_response.choices = [MagicMock()]
@@ -98,3 +109,55 @@ def test_truncates_long_openai_title(self, mock_openAI):
98109

99110
# Ensure the title is truncated to fit the UploadFile model's title field (max_length=255), since OpenAI responses may exceed this limit
100111
self.assertLessEqual(len(result), 255)
112+
113+
def test_font_size_joins_adjacent_spans_in_same_block(self):
    """Title text spread over several spans of one block is joined into one string."""
    title_block = [
        ("Advances in Mood Disorder", 18.0),
        ("Pharmacotherapy", 18.0),
    ]
    only_page = [
        [("Author Name", 10.0)],
        title_block,
        [("Some journal info", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[only_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")
124+
125+
def test_font_size_ignores_short_spans(self):
    """Tiny spans such as superscript markers must not end up in the title."""
    title_block = [
        ("Advances in Mood Disorder Pharmacotherapy", 18.0),
        # One-character span at title size; shorter than 2 chars, so the
        # extractor is expected to drop it.
        ("*", 18.0),
    ]
    only_page = [
        title_block,
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[only_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")
136+
137+
def test_font_size_returns_none_when_no_regex_match(self):
    """Largest-font text that fails the title regex yields ``None``."""
    only_page = [
        # Two words only; this test's premise is that the title regex
        # needs at least three.
        [("Psychiatry Research", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[only_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertIsNone(extracted)
148+
149+
def test_font_size_finds_title_on_later_page(self):
    """The largest-font text is picked up even when it sits on page 2."""
    # Page 1: cover page whose text is smaller than the title.
    cover_page = [
        [("Some preamble text here", 12.0)],
    ]
    # Page 2: the actual title in the largest font, followed by authors.
    title_page = [
        [("Advances in Mood Disorder Pharmacotherapy", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[cover_page, title_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")

0 commit comments

Comments
 (0)