44from . import title
55
66
def make_page_dict(blocks):
    """Build a stand-in for the value returned by ``page.get_text("dict")``.

    ``blocks`` is a list of blocks; each block is a list of
    ``(text, font_size)`` tuples, one tuple per span. Every block becomes a
    ``"type": 0`` entry holding exactly one line that contains all its spans.
    """
    def as_block(spans):
        # One dict per span, carrying only the "text" and "size" keys.
        span_dicts = []
        for text, size in spans:
            span_dicts.append({"text": text, "size": size})
        return {"type": 0, "lines": [{"spans": span_dicts}]}

    return {"blocks": [as_block(spans) for spans in blocks]}
20+
21+
def make_mock_doc(pages_data, metadata=None):
    """Build a mock ``fitz.Document`` backed by in-memory page data.

    Args:
        pages_data: list of pages; each page is a list of blocks, and each
            block is a list of ``(text, size)`` tuples (the input shape
            accepted by ``make_page_dict``).
        metadata: value for ``doc.metadata``. A falsy value (``None`` or an
            empty dict) is replaced by ``{"title": None}``.

    Returns:
        A ``MagicMock`` supporting ``len(doc)``, ``doc[idx]``, iteration over
        its pages, and ``doc.page_count``. Each page's ``get_text`` returns the
        dict built by ``make_page_dict``, whatever arguments it is called with.
    """
    doc = MagicMock()
    doc.metadata = metadata or {"title": None}

    mock_pages = []
    for page_blocks in pages_data:
        page = MagicMock()
        page.get_text.return_value = make_page_dict(page_blocks)
        mock_pages.append(page)

    # Functions assigned to magic methods on a MagicMock receive the mock as
    # their first argument, hence the explicit `self` parameter.
    # len() is derived from mock_pages so it always agrees with indexing.
    doc.__len__ = lambda self: len(mock_pages)
    doc.__getitem__ = lambda self, idx: mock_pages[idx]
    # MagicMock's default __iter__ yields nothing, so without this override
    # `for page in doc` would silently see an empty document.
    doc.__iter__ = lambda self: iter(mock_pages)
    # Without an explicit value this attribute would be an auto-created
    # MagicMock instead of the number of pages.
    doc.page_count = len(mock_pages)
    return doc
38+
39+
740class TestGenerateTitle (unittest .TestCase ):
def test_prefers_metadata_title_if_valid(self):
    # The document exposes only a metadata title; generate_title is expected
    # to hand that exact string back.
    metadata_title = "A Study Regarding The Efficacy of Drugs"
    doc = MagicMock()
    doc.metadata = {"title": metadata_title}

    self.assertEqual(metadata_title, title.generate_title(doc))
1346
14- def test_falls_back_to_first_page_text_if_metadata_title_is_empty (self ):
15- doc = MagicMock ()
16- doc .metadata = {"title" : "" }
17- doc [0 ].get_text = MagicMock ()
18-
19- foo_block = [None ] * 7
20- foo_block [4 ] = "foo"
21- foo_block [6 ] = 0
22-
23- title_block = [None ] * 7
24- title_block [4 ] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
25- title_block [6 ] = 0
26-
27- bar_block = [None ] * 7
28- bar_block [4 ] = "bar"
29- bar_block [6 ] = 0
30- doc [0 ].get_text .return_value = [foo_block , title_block , bar_block ]
31-
def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
    # With an empty metadata title, generate_title is expected to return the
    # text of the single 18pt block sitting between two 10pt filler blocks.
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": ""})

    self.assertEqual(expected_title, title.generate_title(doc))
3458
35- def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex (self ):
36- doc = MagicMock ()
37- doc .metadata = {"title" : "abcd1234" }
38- doc [0 ].get_text = MagicMock ()
39-
40- foo_block = [None ] * 7
41- foo_block [4 ] = "foo"
42- foo_block [6 ] = 0
43-
44- title_block = [None ] * 7
45- title_block [4 ] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
46- title_block [6 ] = 0
47-
48- bar_block = [None ] * 7
49- bar_block [4 ] = "bar"
50- bar_block [6 ] = 0
51- doc [0 ].get_text .return_value = [foo_block , title_block , bar_block ]
52-
def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
    # The metadata title "abcd1234" is expected to be rejected, so the result
    # should be the text of the single 18pt block on the page.
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": "abcd1234"})

    self.assertEqual(expected_title, title.generate_title(doc))
5570
5671 @patch ("api.views.uploadFile.title.openAIServices.openAI" )
5772 def test_falls_back_to_chatgpt_if_no_title_found (self , mock_openAI ):
58- doc = MagicMock ()
59- doc . metadata = { "title" : None }
60- doc [ 0 ]. get_text . return_value = []
73+ doc = make_mock_doc (
74+ pages_data = [[]] # no blocks at all
75+ )
6176
6277 mock_response = MagicMock ()
6378 mock_response .choices = [MagicMock ()]
@@ -70,9 +85,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
7085
7186 @patch ("api.views.uploadFile.title.openAIServices.openAI" )
7287 def test_strips_quotes_from_openai_title (self , mock_openAI ):
73- doc = MagicMock ()
74- doc .metadata = {"title" : None }
75- doc [0 ].get_text .return_value = []
88+ doc = make_mock_doc (pages_data = [[]])
7689
7790 mock_response = MagicMock ()
7891 mock_response .choices = [MagicMock ()]
@@ -85,9 +98,7 @@ def test_strips_quotes_from_openai_title(self, mock_openAI):
8598
8699 @patch ("api.views.uploadFile.title.openAIServices.openAI" )
87100 def test_truncates_long_openai_title (self , mock_openAI ):
88- doc = MagicMock ()
89- doc .metadata = {"title" : None }
90- doc [0 ].get_text .return_value = []
101+ doc = make_mock_doc (pages_data = [[]])
91102
92103 mock_response = MagicMock ()
93104 mock_response .choices = [MagicMock ()]
@@ -98,3 +109,55 @@ def test_truncates_long_openai_title(self, mock_openAI):
98109
99110 # Ensure the title is truncated to fit the UploadFile model's title field (max_length=255), since OpenAI responses may exceed this limit
100111 self .assertLessEqual (len (result ), 255 )
112+
def test_font_size_joins_adjacent_spans_in_same_block(self):
    """Title text split over several spans of one block comes back joined."""
    title_spans = [("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)]
    page = [
        [("Author Name", 10.0)],
        title_spans,
        [("Some journal info", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[page])

    result = title.extract_title_by_font_size(doc)

    self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
124+
def test_font_size_ignores_short_spans(self):
    """A one-character span next to the title must not appear in the result."""
    # The "*" span shares the title's 18pt size but is shorter than 2
    # characters, so only the real title text is expected back.
    title_block = [("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)]
    author_block = [("Author Name et al.", 10.0)]
    doc = make_mock_doc(pages_data=[[title_block, author_block]])

    result = title.extract_title_by_font_size(doc)

    self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
136+
def test_font_size_returns_none_when_no_regex_match(self):
    """Largest-font text that fails the title regex yields None."""
    # "Psychiatry Research" has only 2 words; the regex requires at least 3.
    largest_block = [("Psychiatry Research", 18.0)]
    author_block = [("Author Name et al.", 10.0)]
    doc = make_mock_doc(pages_data=[[largest_block, author_block]])

    result = title.extract_title_by_font_size(doc)

    self.assertIsNone(result)
148+
def test_font_size_finds_title_on_later_page(self):
    """The largest-font text is found even when it sits on the second page."""
    # Page 1: cover page whose only text is smaller (12pt).
    cover_page = [
        [("Some preamble text here", 12.0)],
    ]
    # Page 2: the actual title in the largest font (18pt), then the authors.
    title_page = [
        [("Advances in Mood Disorder Pharmacotherapy", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[cover_page, title_page])

    result = title.extract_title_by_font_size(doc)

    self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy")
0 commit comments