44from . import title
55
66
def make_page_dict(blocks):
    """Build the value a page's get_text("dict") call should return.

    ``blocks`` is a list of blocks, and each block is a list of
    (text, font_size) tuples, one tuple per span. Every block is emitted
    with "type" 0 and a single line that holds all of its spans.
    """
    def _as_block(spans):
        # One line per block; the spans keep their original order.
        span_dicts = [{"text": text, "size": size} for text, size in spans]
        return {"type": 0, "lines": [{"spans": span_dicts}]}

    return {"blocks": [_as_block(spans) for spans in blocks]}
20+
21+
def make_mock_doc(pages_data, metadata=None):
    """Build a mock fitz.Document backed by synthetic page data.

    Args:
        pages_data: List of pages. Each page is a list of blocks and each
            block is a list of (text, size) tuples.
        metadata: Optional metadata dict. ``None`` (or any falsy value)
            falls back to ``{"title": None}``.

    Returns:
        A MagicMock that supports ``len(doc)``, ``doc[idx]`` and
        ``for page in doc``. Each page's ``get_text`` returns the structure
        produced by ``make_page_dict`` for that page's blocks.
    """
    doc = MagicMock()
    doc.metadata = metadata or {"title": None}

    mock_pages = []
    for page_blocks in pages_data:
        page = MagicMock()
        page.get_text.return_value = make_page_dict(page_blocks)
        mock_pages.append(page)

    # Length, indexing and iteration all read the same list so the three
    # access styles can never disagree about how many pages exist.
    doc.__len__ = lambda self: len(mock_pages)
    doc.__getitem__ = lambda self, idx: mock_pages[idx]
    # MagicMock's default __iter__ yields nothing, which would make a
    # `for page in doc` loop silently see an empty document.
    doc.__iter__ = lambda self: iter(mock_pages)
    return doc
38+
39+
740class TestGenerateTitle (unittest .TestCase ):
def test_prefers_metadata_title_if_valid(self):
    """generate_title should return the document's metadata title here."""
    metadata_title = "A Study Regarding The Efficacy of Drugs"
    doc = MagicMock()
    doc.metadata = {"title": metadata_title}

    self.assertEqual(metadata_title, title.generate_title(doc))
1346
14- def test_falls_back_to_first_page_text_if_metadata_title_is_empty (self ):
15- doc = MagicMock ()
16- doc .metadata = {"title" : "" }
17- doc [0 ].get_text = MagicMock ()
18-
19- foo_block = [None ] * 7
20- foo_block [4 ] = "foo"
21- foo_block [6 ] = 0
22-
23- title_block = [None ] * 7
24- title_block [4 ] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
25- title_block [6 ] = 0
26-
27- bar_block = [None ] * 7
28- bar_block [4 ] = "bar"
29- bar_block [6 ] = 0
30- doc [0 ].get_text .return_value = [foo_block , title_block , bar_block ]
31-
def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
    """With an empty metadata title, expect the text of the 18.0-size block."""
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": ""})

    self.assertEqual(expected_title, title.generate_title(doc))
3458
35- def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex (self ):
36- doc = MagicMock ()
37- doc .metadata = {"title" : "abcd1234" }
38- doc [0 ].get_text = MagicMock ()
39-
40- foo_block = [None ] * 7
41- foo_block [4 ] = "foo"
42- foo_block [6 ] = 0
43-
44- title_block = [None ] * 7
45- title_block [4 ] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
46- title_block [6 ] = 0
47-
48- bar_block = [None ] * 7
49- bar_block [4 ] = "bar"
50- bar_block [6 ] = 0
51- doc [0 ].get_text .return_value = [foo_block , title_block , bar_block ]
52-
def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
    """With metadata title "abcd1234", expect the text of the 18.0-size block."""
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": "abcd1234"})

    self.assertEqual(expected_title, title.generate_title(doc))
5570
5671 @patch ("api.views.uploadFile.title.openAIServices.openAI" )
5772 def test_falls_back_to_chatgpt_if_no_title_found (self , mock_openAI ):
58- doc = MagicMock ()
59- doc . metadata = { "title" : None }
60- doc [ 0 ]. get_text . return_value = []
73+ doc = make_mock_doc (
74+ pages_data = [[]] # no blocks at all
75+ )
6176
6277 mock_openAI .return_value = "A Study Regarding The Efficacy of Drugs"
6378
@@ -68,9 +83,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
6883
6984 @patch ("api.views.uploadFile.title.openAIServices.openAI" )
7085 def test_strips_quotes_from_openai_title (self , mock_openAI ):
71- doc = MagicMock ()
72- doc .metadata = {"title" : None }
73- doc [0 ].get_text .return_value = []
86+ doc = make_mock_doc (pages_data = [[]])
7487
7588 mock_openAI .return_value = '"Updated CANMAT/ISBD Guidelines for Treating Mixed Features in Bipolar Disorder"'
7689
@@ -80,13 +93,63 @@ def test_strips_quotes_from_openai_title(self, mock_openAI):
8093
@patch("api.views.uploadFile.title.openAIServices.openAI")
def test_truncates_long_openai_title(self, mock_openAI):
    """A 300-character OpenAI response must come back at most 255 characters long."""
    mock_openAI.return_value = "A" * 300
    # A single page with no blocks and the default metadata ({"title": None}).
    doc = make_mock_doc(pages_data=[[]])

    generated = title.generate_title(doc)

    # The UploadFile model's title field has max_length=255, and OpenAI
    # responses may exceed that limit, so the title has to be truncated.
    self.assertLessEqual(len(generated), 255)
104+
def test_font_size_joins_adjacent_spans_in_same_block(self):
    """Spans of one block that together form the title are expected to be joined."""
    title_spans = [("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)]
    page = [
        [("Author Name", 10.0)],
        title_spans,
        [("Some journal info", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")
116+
def test_font_size_ignores_short_spans(self):
    """Tiny spans such as superscript markers are expected to be filtered out."""
    # The "*" span is shorter than 2 characters, so the expected title is
    # only the real text that shares its block.
    title_block = [("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)]
    author_block = [("Author Name et al.", 10.0)]
    doc = make_mock_doc(pages_data=[[title_block, author_block]])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")
128+
def test_font_size_returns_none_when_no_regex_match(self):
    """Expect None when the largest-font text does not match the title regex."""
    # Only 2 words in the largest-font block; the regex requires at least 3.
    largest_block = [("Psychiatry Research", 18.0)]
    author_block = [("Author Name et al.", 10.0)]
    doc = make_mock_doc(pages_data=[[largest_block, author_block]])

    extracted = title.extract_title_by_font_size(doc)

    self.assertIsNone(extracted)
140+
def test_font_size_finds_title_on_later_page(self):
    """A title on page 2 is expected to be found when it has the largest font."""
    # Page 1: cover page with smaller text.
    cover_page = [
        [("Some preamble text here", 12.0)],
    ]
    # Page 2: the actual title in a larger font, followed by the authors.
    title_page = [
        [("Advances in Mood Disorder Pharmacotherapy", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[cover_page, title_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")
0 commit comments