Skip to content

Commit bb3f0a2

Browse files
Adicionar chave text em Abstract.data (#1072)
* Initial plan
* Add text property to Abstract.data with sections and non-sections support
  Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
* Refactor text property to avoid unnecessary list conversion
  Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
1 parent e9fa179 commit bb3f0a2

2 files changed

Lines changed: 290 additions & 0 deletions

File tree

packtools/sps/models/v2/abstract.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,8 @@ def list_items(self):
147147
@property
148148
def kwds(self):
149149
parent = self.node.getparent()
150+
if parent is None:
151+
return
150152
lang = self.lang
151153
for kwd_group in parent.xpath(f'kwd-group[@xml:lang="{lang}"]'):
152154
for kwd in kwd_group.xpath("kwd"):
@@ -162,6 +164,31 @@ def kwds(self):
162164
def abstract_type(self):
    """Return the value of the node's ``abstract-type`` attribute."""
    type_attr = self.node.get("abstract-type")
    return type_attr
164166

167+
@property
def text(self):
    """
    Build a single plain-text string from the abstract content.

    - If the abstract node has direct ``sec`` children, the ``plain_text``
      of each section's ``title`` and then of its ``p`` entry is used, in
      the order produced by ``self.sections``.
    - Otherwise, the ``plain_text`` of every item produced by ``self.p``
      is used.

    Missing or empty values are skipped; the remaining pieces are joined
    with a single space. An abstract with no usable content yields ``""``.
    """
    def _plain_text_of(entry):
        # A missing/empty entry contributes nothing to the result.
        return entry.get("plain_text") if entry else None

    # The node itself decides which branch applies, not the parsed sections.
    if self.node.xpath("sec"):
        candidates = (
            _plain_text_of(section.get(key))
            for section in self.sections
            for key in ("title", "p")
        )
    else:
        candidates = (paragraph.get("plain_text") for paragraph in self.p)

    return " ".join(piece for piece in candidates if piece)
191+
165192
@property
166193
def data(self):
167194
if self.lang:
@@ -178,6 +205,7 @@ def data(self):
178205
"sections": list(self.sections),
179206
"list_items": list(self.list_items),
180207
"kwds": list(self.kwds),
208+
"text": self.text,
181209
}
182210

183211

Lines changed: 262 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,262 @@
1+
from unittest import TestCase
2+
from lxml import etree
3+
4+
from packtools.sps.models.v2.abstract import Abstract
5+
6+
7+
class AbstractTextWithSectionsTest(TestCase):
    """Exercise ``Abstract.text`` on an abstract composed of ``<sec>`` elements."""

    # Section titles and paragraphs, in document order, separated by spaces.
    _EXPECTED = "Objective To examine the effectiveness of day hospital attendance in prolonging independent living for elderly people. Design Systematic review of 12 controlled clinical trials (available by January 1997) comparing day hospital care with comprehensive care (five trials), domiciliary care (four trials), or no comprehensive care (three trials)."

    def setUp(self):
        # Fixture: an abstract title followed by two titled sections.
        source = """
        <abstract xml:lang="en">
            <title>Abstract</title>
            <sec>
                <title>Objective</title>
                <p>To examine the effectiveness of day hospital attendance in prolonging independent living for elderly people.</p>
            </sec>
            <sec>
                <title>Design</title>
                <p>Systematic review of 12 controlled clinical trials (available by January 1997) comparing day hospital care with comprehensive care (five trials), domiciliary care (four trials), or no comprehensive care (three trials).</p>
            </sec>
        </abstract>
        """
        self.node = etree.fromstring(source)
        self.abstract = Abstract(
            self.node,
            lang="en",
            tags_to_keep=None,
            tags_to_keep_with_content=None,
            tags_to_remove_with_content=None,
            tags_to_convert_to_html=None,
        )

    def test_text_property_with_sections(self):
        """``text`` contains each section's title followed by its paragraph."""
        self.assertEqual(self.abstract.text, self._EXPECTED)

    def test_data_contains_text_key(self):
        """``data`` exposes a ``text`` entry and its value is a string."""
        payload = self.abstract.data
        self.assertIn("text", payload)
        self.assertIsInstance(payload["text"], str)

    def test_data_text_value_with_sections(self):
        """``data["text"]`` carries the same value as the ``text`` property."""
        self.assertEqual(self.abstract.data["text"], self._EXPECTED)
46+
47+
48+
class AbstractTextWithoutSectionsTest(TestCase):
    """Exercise ``Abstract.text`` on an abstract made only of ``<p>`` elements."""

    # Only the paragraphs are expected; the abstract title is not included.
    _EXPECTED = "To examine the effectiveness of day hospital attendance in prolonging independent living for elderly people. Systematic review of 12 controlled clinical trials (available by January 1997) comparing day hospital care with comprehensive care (five trials), domiciliary care (four trials), or no comprehensive care (three trials)."

    def setUp(self):
        # Fixture: an abstract title followed by two plain paragraphs.
        source = """
        <abstract xml:lang="en">
            <title>Abstract</title>
            <p>To examine the effectiveness of day hospital attendance in prolonging independent living for elderly people.</p>
            <p>Systematic review of 12 controlled clinical trials (available by January 1997) comparing day hospital care with comprehensive care (five trials), domiciliary care (four trials), or no comprehensive care (three trials).</p>
        </abstract>
        """
        self.node = etree.fromstring(source)
        self.abstract = Abstract(
            self.node,
            lang="en",
            tags_to_keep=None,
            tags_to_keep_with_content=None,
            tags_to_remove_with_content=None,
            tags_to_convert_to_html=None,
        )

    def test_text_property_without_sections(self):
        """``text`` is the paragraphs joined by a space when there are no sections."""
        self.assertEqual(self.abstract.text, self._EXPECTED)

    def test_data_text_value_without_sections(self):
        """``data["text"]`` carries the same value as the ``text`` property."""
        self.assertEqual(self.abstract.data["text"], self._EXPECTED)
75+
76+
77+
class AbstractTextWithInlineTagsTest(TestCase):
    """Exercise ``Abstract.text`` when paragraphs contain inline markup."""

    def setUp(self):
        # Fixture: two sections whose paragraphs use <italic> and <bold>.
        source = """
        <abstract xml:lang="en">
            <sec>
                <title>Objective</title>
                <p>To examine the <italic>effectiveness</italic> of day hospital attendance.</p>
            </sec>
            <sec>
                <title>Design</title>
                <p>Systematic review of <bold>12 controlled</bold> clinical trials.</p>
            </sec>
        </abstract>
        """
        self.node = etree.fromstring(source)
        self.abstract = Abstract(
            self.node,
            lang="en",
            tags_to_keep=None,
            tags_to_keep_with_content=None,
            tags_to_remove_with_content=None,
            tags_to_convert_to_html=None,
        )

    def test_text_property_strips_inline_formatting(self):
        """Inline tags are dropped from ``text`` while their content is kept."""
        self.assertEqual(
            self.abstract.text,
            "Objective To examine the effectiveness of day hospital attendance. Design Systematic review of 12 controlled clinical trials.",
        )
104+
105+
106+
class AbstractTextEmptyTest(TestCase):
    """Exercise ``Abstract.text`` on abstracts that carry no usable content."""

    @staticmethod
    def _parse_abstract(source):
        # Every case here uses the same English abstract with default tag handling.
        return Abstract(
            etree.fromstring(source),
            lang="en",
            tags_to_keep=None,
            tags_to_keep_with_content=None,
            tags_to_remove_with_content=None,
            tags_to_convert_to_html=None,
        )

    def test_empty_abstract_with_sections(self):
        """A section with empty title and paragraph yields an empty string."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="en">
                <sec>
                    <title></title>
                    <p></p>
                </sec>
            </abstract>
            """
        )
        self.assertEqual(parsed.text, "")

    def test_empty_abstract_without_sections(self):
        """A lone empty paragraph yields an empty string."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="en">
                <p></p>
            </abstract>
            """
        )
        self.assertEqual(parsed.text, "")

    def test_abstract_with_only_title_no_sections(self):
        """An abstract holding only its title yields an empty string."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="en">
                <title>Abstract</title>
            </abstract>
            """
        )
        self.assertEqual(parsed.text, "")
156+
157+
158+
class AbstractTextMultipleParagraphsTest(TestCase):
    """
    Exercise ``Abstract.text`` on sectioned abstracts where a section has
    both title and paragraph, only a paragraph, or only a title.
    """

    @staticmethod
    def _parse_abstract(source):
        # Every case here uses the same English abstract with default tag handling.
        return Abstract(
            etree.fromstring(source),
            lang="en",
            tags_to_keep=None,
            tags_to_keep_with_content=None,
            tags_to_remove_with_content=None,
            tags_to_convert_to_html=None,
        )

    def test_single_paragraph_per_section(self):
        """Two sections, each with one title and one paragraph."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="en">
                <sec>
                    <title>First</title>
                    <p>First paragraph.</p>
                </sec>
                <sec>
                    <title>Second</title>
                    <p>Second paragraph.</p>
                </sec>
            </abstract>
            """
        )
        self.assertEqual(
            parsed.text, "First First paragraph. Second Second paragraph."
        )

    def test_section_without_title(self):
        """A section lacking a title contributes only its paragraph."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="en">
                <sec>
                    <p>Only paragraph.</p>
                </sec>
            </abstract>
            """
        )
        self.assertEqual(parsed.text, "Only paragraph.")

    def test_section_without_paragraph(self):
        """A section lacking a paragraph contributes only its title."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="en">
                <sec>
                    <title>Only Title</title>
                </sec>
            </abstract>
            """
        )
        self.assertEqual(parsed.text, "Only Title")
219+
220+
221+
class AbstractTextLanguageTest(TestCase):
    """Exercise ``Abstract.text`` on abstracts written in languages other than English."""

    @staticmethod
    def _parse_abstract(source, lang):
        # Same default tag handling for every case; only the language varies.
        return Abstract(
            etree.fromstring(source),
            lang=lang,
            tags_to_keep=None,
            tags_to_keep_with_content=None,
            tags_to_remove_with_content=None,
            tags_to_convert_to_html=None,
        )

    def test_portuguese_abstract_with_sections(self):
        """Portuguese abstract with sections: titles and paragraphs are joined."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="pt">
                <sec>
                    <title>Objetivo</title>
                    <p>Avaliar o efeito de intervenção educativa domiciliar.</p>
                </sec>
                <sec>
                    <title>Método</title>
                    <p>Ensaio Clínico Randomizado.</p>
                </sec>
            </abstract>
            """,
            "pt",
        )
        self.assertEqual(
            parsed.text,
            "Objetivo Avaliar o efeito de intervenção educativa domiciliar. Método Ensaio Clínico Randomizado.",
        )

    def test_spanish_abstract_without_sections(self):
        """Spanish abstract without sections: paragraphs are joined."""
        parsed = self._parse_abstract(
            """
            <abstract xml:lang="es">
                <p>Evaluar el efecto de intervenciones de atención domiciliaria.</p>
                <p>Ensayo Clínico Aleatorizado.</p>
            </abstract>
            """,
            "es",
        )
        self.assertEqual(
            parsed.text,
            "Evaluar el efecto de intervenciones de atención domiciliaria. Ensayo Clínico Aleatorizado.",
        )

0 commit comments

Comments
 (0)