Skip to content

Commit 6eb0a55

Browse files
Retornar no front, metadados (por ora apenas título de artigo) com a "formatação" mantida (#231)
* Cria uma função para obter metadados (títulos dos artigos) mantendo os estilos (tags xml sps) * Inclui no front a chave `display-format` * Faz a conversão de italic para i e bold para b * Troca hífen por `underscore`
1 parent 69f34ec commit 6eb0a55

4 files changed

Lines changed: 189 additions & 0 deletions

File tree

documentstore/domain.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,66 @@ def assets_from_remote_xml(
263263
return xml, get_static_assets(xml)
264264

265265

266+
def display_format(
    data: bytes,
) -> dict:
    """
    Extract article titles from an XML document, keeping inline markup.

    ``data`` is parsed with ``DEFAULT_XMLPARSER`` and three kinds of title
    are collected: the ``article-title`` under ``article-meta``, each
    ``trans-title`` under a ``trans-title-group`` of ``article-meta``, and
    the ``article-title`` under the ``front-stub`` of each ``sub-article``.
    A title is stored under the ``xml:lang`` value of the element that
    carries its language (the root, the ``trans-title-group`` or the
    ``sub-article`` respectively). Titles whose serialised content is empty,
    or whose language attribute is missing, are skipped.

    Before a title is serialised its ``xref`` descendants are removed and
    ``bold``/``italic`` descendants are renamed to ``b``/``i``.

    Child content a title element may hold (list kept from the original
    docstring; presumably the JATS content model for title elements):

    (#PCDATA | email | ext-link | uri | inline-supplementary-material |
    related-article | related-object | bold | fixed-case | italic |
    monospace | overline | roman | sans-serif | sc | strike | underline |
    ruby | alternatives | inline-graphic | inline-media | private-char |
    chem-struct | inline-formula | tex-math | mml:math | abbrev | index-term |
    index-term-range-end | milestone-end | milestone-start | named-content |
    styled-content | fn | target | xref | sub | sup | break)*

    Returns a dict shaped like ``{"article_title": {lang: markup}}``; it is
    empty when no title qualifies.
    """
    xml_lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    # Each entry: (output key, xpath of the node holding xml:lang,
    #              xpath of the title nodes relative to that node)
    title_sources = (
        ("article_title", ".", ".//article-meta//article-title"),
        ("article_title", ".//article-meta//trans-title-group", ".//trans-title"),
        ("article_title", ".//sub-article", ".//front-stub//article-title"),
    )

    collected = {}
    tree = etree.parse(BytesIO(data), DEFAULT_XMLPARSER)

    for label, lang_xpath, content_xpath in title_sources:
        for lang_node in tree.findall(lang_xpath):
            lang = lang_node.get(xml_lang_attr)
            for title_node in lang_node.findall(content_xpath):
                # The node is cleaned up in place before being serialised.
                _display_format_remove_xref(title_node)
                _display_format_convert_bold_and_italic(title_node)
                content = _display_format_get_content(title_node)
                if not (content and lang):
                    continue
                _display_format_update_output(
                    collected, label, lang, content)

    return collected
299+
300+
301+
def _display_format_update_output(output, label, lang, content):
302+
output[label] = output.setdefault(label, {})
303+
output[label][lang] = content
304+
305+
306+
def _display_format_get_content(node):
307+
content = etree.tostring(node, encoding='utf-8').decode("utf-8")
308+
content = content[content.find(">")+1:]
309+
content = content[:content.rfind("</")]
310+
return content
311+
312+
313+
def _display_format_remove_xref(node):
314+
for xref in node.findall(".//xref"):
315+
p = xref.getparent()
316+
p.remove(xref)
317+
318+
319+
def _display_format_convert_bold_and_italic(node):
320+
for tag in ("bold", "italic"):
321+
for found in node.findall(".//{}".format(tag)):
322+
found.tag = tag[0]
323+
324+
325+
266326
class Document:
267327
_timestamp_pattern = (
268328
r"^[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z)?$"

documentstore/services.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from .interfaces import Session
1212
from .domain import Document, DocumentsBundle, Journal, utcnow
13+
from .domain import display_format
1314
from .exceptions import DoesNotExist, AlreadyExists, VersionAlreadySet
1415

1516
__all__ = ["get_handlers"]
@@ -257,6 +258,7 @@ def __call__(self, xml_data: bytes) -> dict:
257258
return {
258259
**clea_article.data_full,
259260
"aff_contrib_full": clea_join.aff_contrib_full(clea_article),
261+
"display_format": display_format(xml_data),
260262
}
261263

262264

tests/test_domain.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1974,3 +1974,106 @@ def test_sleep_increases_exponentially(self):
19741974

19751975
calls = [mock.call(1.2 ** i) for i in range(1, 3)]
19761976
retry_gracefully._sleep.assert_has_calls(calls)
1977+
1978+
1979+
class MetadataWithStylesForArticleWithTransTitlesTests(unittest.TestCase):
    """
    ``domain.display_format`` on an article whose translated titles are
    given in ``trans-title-group`` elements.
    """

    def setUp(self):
        # Article in "pt" with "en" and "es" translated titles; every title
        # embeds a MathML inline formula.
        fragments = [
            '<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.1" specific-use="sps-1.9" xml:lang="pt">',
            '<front>',
            '<article-meta>',
            '''<title-group>
<article-title>Uma Reflexão de Professores sobre Demonstrações Relativas à Irracionalidade de <inline-formula><mml:math display="inline" id="m1"><mml:mrow><mml:msqrt><mml:mn>2</mml:mn></mml:msqrt></mml:mrow></mml:math></inline-formula> </article-title>
<trans-title-group xml:lang="en">
<trans-title>Teachers' Considerations on the Irrationality Proof of <inline-formula><mml:math display="inline" id="m2"><mml:mrow><mml:msqrt><mml:mn>2</mml:mn></mml:msqrt></mml:mrow></mml:math></inline-formula> </trans-title>
</trans-title-group>
<trans-title-group xml:lang="es">
<trans-title>Español <inline-formula><mml:math display="inline" id="m2"><mml:mrow><mml:msqrt><mml:mn>2</mml:mn></mml:msqrt></mml:mrow></mml:math></inline-formula> </trans-title>
</trans-title-group>
</title-group>''',
            '</article-meta>',
            '</front>',
            '</article>',
        ]
        self.xml = "".join(fragments).encode("utf-8")

    def test_display_format(self):
        """Each title is returned per language with its formula markup."""
        title_pt = (
            'Uma Reflexão de Professores sobre Demonstrações '
            'Relativas à Irracionalidade de '
            '<inline-formula><mml:math display="inline" id="m1">'
            '<mml:mrow><mml:msqrt><mml:mn>2</mml:mn></mml:msqrt>'
            '</mml:mrow></mml:math></inline-formula> ')
        title_en = (
            """Teachers' Considerations on the Irrationality Proof """
            """of <inline-formula><mml:math display="inline" """
            """id="m2">"""
            """<mml:mrow><mml:msqrt><mml:mn>2</mml:mn></mml:msqrt>"""
            """</mml:mrow></mml:math></inline-formula> """)
        title_es = (
            """Español <inline-formula><mml:math display="inline" """
            """id="m2"><mml:mrow><mml:msqrt><mml:mn>2</mml:mn>"""
            """</mml:msqrt></mml:mrow></mml:math></inline-formula> """
        )
        expected = {
            "article_title": {
                "pt": title_pt,
                "en": title_en,
                "es": title_es,
            }
        }
        actual = domain.display_format(self.xml)
        self.assertEqual(expected, actual)
2024+
2025+
2026+
class MetadataWithStylesForArticleWithSubarticlesTests(unittest.TestCase):
    """
    ``domain.display_format`` on an article whose translations are given as
    ``sub-article`` elements.
    """

    def setUp(self):
        # Article in "en" with "pt" and "es" sub-articles; every title ends
        # with a footnote xref and the translations use bold/italic.
        fragments = [
            '<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.1" specific-use="sps-1.9" xml:lang="en">',
            '<front>',
            '<article-meta>',
            '''
<title-group>
<article-title>Heparin solution in the prevention of occlusions in Hickman<sup>®</sup> catheters a randomized clinical trial<xref ref-type="fn" rid="fn1">*</xref></article-title>
</title-group>
''',
            '</article-meta>',
            '</front>',
            '''
<sub-article article-type="translation" id="s1" xml:lang="pt">
<front-stub>
<title-group>
<article-title>Solução de <bold>heparina</bold> na prevenção de oclusão do Cateter de Hickman<sup>®</sup> ensaio clínico randomizado<xref ref-type="fn" rid="fn2">*</xref></article-title>
</title-group>
</front-stub>
</sub-article>
<sub-article article-type="translation" id="s2" xml:lang="es">
<front-stub>
<title-group>
<article-title>Solución <italic>de heparina para prevenir</italic> oclusiones en catéteres de Hickman<sup>®</sup> un ensayo clínico aleatorizado<xref ref-type="fn" rid="fn3">*</xref></article-title>
</title-group>
</front-stub>
</sub-article>
''',
            '</article>',
        ]
        self.xml = "".join(fragments).encode("utf-8")

    def test_display_format_removes_xref(self):
        """xref is dropped; bold/italic come back as b/i; sup is kept."""
        title_en = (
            """Heparin solution in the prevention of occlusions """
            """in Hickman<sup>®</sup> catheters a randomized """
            """clinical trial"""
        )
        title_pt = (
            """Solução de <b>heparina</b> na prevenção de oclusão do """
            """Cateter de Hickman<sup>®</sup> ensaio clínico """
            """randomizado""")
        title_es = (
            """Solución <i>de heparina para prevenir</i> oclusiones en """
            """catéteres de Hickman<sup>®</sup> un ensayo clínico """
            """aleatorizado""")
        expected = {
            "article_title": {
                "en": title_en,
                "pt": title_pt,
                "es": title_es,
            }
        }
        actual = domain.display_format(self.xml)
        self.assertDictEqual(expected, actual)

tests/test_services.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,3 +1207,27 @@ def test_swollows_VersionAlreadySet_exception_for_assets(self):
12071207
assets=assets,
12081208
)
12091209
)
1210+
1211+
1212+
class FetchDocumentFrontTest(CommandTestMixin, unittest.TestCase):
    """
    Checks the ``display_format`` entry returned by the
    ``sanitize_document_front`` command.
    """

    def setUp(self):
        self.services, self.session = make_services()
        self.command = self.services["sanitize_document_front"]
        # The XML fixture sits in the same directory as this test module.
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        fixture_path = os.path.join(tests_dir, "0034-8910-rsp-48-2-0347.xml")
        with open(fixture_path, "rb") as fixture:
            self.data = fixture.read()

    def test_call_returns_display_format(self):
        """The command output carries the titles keyed by language."""
        expected = {
            'article_title': {
                "en": "Proposal for a telehealth concept in the translational research model",
                "pt": "Proposta conceitual de telessaúde no modelo da pesquisa translacional",
            }
        }
        front = self.command(self.data)
        self.assertEqual(expected, front['display_format'])

0 commit comments

Comments
 (0)