Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions markup_doc/issue_proc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from lxml import etree
from urllib.parse import urlparse
from packtools.sps.pid_provider.xml_sps_lib import get_xml_with_pre
import os


class Asset:
    """Thin wrapper that exposes a Wagtail image as a packageable asset.

    Attributes are copied from ``wagtail_image.file``:

    * ``file`` -- the stored file object (per the original note it offers
      ``.path``, the absolute path on disk).
    * ``original_href`` -- the file's name inside the storage backend.
    """

    def __init__(self, wagtail_image):
        stored_file = wagtail_image.file
        # Keep the file object itself so callers can read ``.path`` later.
        self.file = stored_file
        # Name of the file as registered in the storage.
        self.original_href = stored_file.name


class XmlIssueProc:
    """Derive journal/issue identifiers and image assets from a stored record.

    ``registro`` must provide ``text_xml`` (parsed with ``get_xml_with_pre``)
    and ``content_body`` (iterated by :meth:`find_asset`).

    After construction the instance exposes:

    * ``xmltree`` -- the parsed XML tree.
    * ``journal_proc`` -- object with an ``acron`` attribute.
    * ``issue_folder`` -- folder name computed by
      :meth:`_extract_issue_folder` (without lot).
    """

    def __init__(self, registro):
        self.registro = registro
        self.xmltree = self._extract_xml_tree()
        self.journal_proc = self._extract_journal_proc()
        self.issue_folder = self._extract_issue_folder()

    def _extract_xml_tree(self):
        """Parse ``registro.text_xml`` and return the resulting XML tree."""
        return get_xml_with_pre(self.registro.text_xml).xmltree

    def _extract_journal_proc(self):
        """Return a minimal journal object carrying the ``acron`` attribute.

        The acronym is read from ``journal-id[@journal-id-type='publisher-id']``
        and falls back to ``"journal"`` when the element is absent or blank.
        """
        acron = self._text(".//journal-id[@journal-id-type='publisher-id']")
        # A throwaway class is kept (rather than an instance) so that
        # ``journal_proc.acron`` behaves exactly as before for callers.
        return type("JournalProc", (), {"acron": acron or "journal"})

    def _text(self, path):
        """Return the stripped text found at ``path`` or ``""`` when missing."""
        return (self.xmltree.findtext(path) or "").strip()

    def _get_issn(self):
        """Return the electronic ISSN, else the print ISSN, else ``None``."""
        for pub_type in ("epub", "ppub"):
            issn = self._text(f".//issn[@pub-type='{pub_type}']")
            if issn:
                return issn
        return None

    @staticmethod
    def _split_issue_label(issue):
        """Split a lower-cased ``<issue>`` label into ``(fused, number)``.

        ``fused`` is a suffix that belongs glued to the volume
        (``"s2"`` for a volume supplement, ``"nspe1"`` for a special number).
        ``number`` is the issue number, optionally carrying its own
        supplement (``"4"`` or ``"4s2"``). Either element may be ``""``.
        """
        if not issue:
            return "", ""
        if issue.startswith("suppl"):
            # Volume supplement, e.g. "suppl 2" -> "s2".
            return f"s{issue.replace('suppl', '').strip()}", ""
        if "suppl" in issue:
            # Number supplement, e.g. "4 suppl 2" -> "4s2".
            tokens = issue.split()
            sup_num = "".join(tokens[1:]).replace("suppl", "").strip()
            return "", f"{tokens[0]}s{sup_num}"
        if issue.startswith("spe"):
            # Special number, e.g. "spe1" -> "nspe1".
            return f"nspe{issue.replace('spe', '').strip()}", ""
        # Regular number.
        return "", issue

    def _extract_issue_folder(self, lot=None):
        """Build the issue folder name ``ISSN-ACRON-<label>[-<lot>]``.

        ``<label>`` is a single fused token: ``v10n4`` (regular number),
        ``v10s2`` (volume supplement), ``v10n4s2`` (number supplement) or
        ``v10nspe1`` (special number). Missing pieces are simply omitted, so
        a record without ``<volume>`` no longer has the issue suffix glued
        onto the acronym.

        Args:
            lot: optional batch number for continuous publication. When it is
                truthy and the collection year is present, ``<lot>`` is the
                zero-padded lot followed by the last two digits of the year
                (e.g. ``0124``).

        Returns:
            The folder name; empty parts are skipped.
        """
        vol = self._text(".//volume")
        issue = self._text(".//issue").lower()
        year = self._text(".//pub-date[@date-type='collection']/year")

        parts = [p for p in (self._get_issn() or "", self.journal_proc.acron or "") if p]

        fused, number = self._split_issue_label(issue)
        label = (f"v{vol}" if vol else "") + fused + (f"n{number}" if number else "")
        if label:
            parts.append(label)

        # Continuous-publication folder carries the lot + 2-digit year.
        if lot and year:
            # int() lets callers pass the lot as a numeric string as well.
            parts.append(f"{int(lot):02d}{year[-2:]}")

        return "-".join(parts)

    def build_pkg_name(self, lang=None):
        """Build the package name ``ISSN-ACRON-VOL-NUM-ARTID[-LANG]``.

        Volume and number are separate hyphen-delimited parts; a volume
        supplement (``10s2``) or special number (``10nspe1``) stays fused to
        the volume and a number supplement stays fused to the number
        (``4s2``). The article id is the first non-blank value among
        ``elocation-id``, ``fpage`` and
        ``article-id[@specific-use='scielo-v2']``; ``"na"`` is used when none
        is available.

        Args:
            lang: language code appended only when given (translations).

        Returns:
            The package name; empty parts are skipped so a missing ISSN does
            not produce a leading hyphen.
        """
        vol = self._text(".//volume")
        issue = self._text(".//issue").lower()

        parts = [p for p in (self._get_issn() or "", self.journal_proc.acron or "") if p]

        fused, number = self._split_issue_label(issue)
        volume_part = vol + fused
        if volume_part:
            parts.append(volume_part)
        if number:
            parts.append(number)

        # Article id: first non-blank candidate, "na" as the last resort.
        candidates = (
            ".//elocation-id",
            ".//fpage",
            ".//article-id[@specific-use='scielo-v2']",
        )
        article_id = next(
            (text for text in map(self._text, candidates) if text), "na"
        )
        parts.append(article_id)

        # Language only for translations.
        if lang:
            parts.append(lang)

        return "-".join(parts)

    def find_asset(self, basename, name):
        """Return the StreamField images matching ``basename`` as ``Asset``s.

        An image matches when ``basename`` equals either the basename of its
        file in storage or the basename of the URL of its ``"original"``
        rendition. The rendition is only requested when the storage name
        does not match.

        Args:
            basename: file name (with extension) referenced by the XML.
            name: ``basename`` without extension; accepted for interface
                compatibility, currently not used for matching.

        Returns:
            A list of ``Asset`` objects (possibly empty).
        """
        assets = []
        for block in self.registro.content_body or ():
            if block.block_type != "image" or not block.value:
                continue
            wagtail_image = block.value.get("image")
            if not wagtail_image:
                continue
            if self._image_matches(wagtail_image, basename):
                assets.append(Asset(wagtail_image))
        return assets

    @staticmethod
    def _image_matches(wagtail_image, basename):
        """Tell whether ``basename`` names ``wagtail_image`` (storage or rendition URL)."""
        # Actual name in storage (e.g. foto1.abcd1234.jpg).
        if basename == os.path.basename(wagtail_image.file.name):
            return True
        # Fall back to the file name exposed by the "original" rendition URL.
        original_url = wagtail_image.get_rendition("original").url
        return basename == os.path.basename(urlparse(original_url).path)
183 changes: 183 additions & 0 deletions markup_doc/pkg_zip_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
from zipfile import ZipFile, ZIP_DEFLATED
import os, sys

from packtools.sps.models.v2.article_assets import ArticleAssets
from packtools.sps.models.article_and_subarticles import ArticleAndSubArticles

class PkgZipBuilder:
    """Assemble a zip package (assets, PDF renditions and XML) for one article.

    Attributes:
        xml_with_pre: object providing ``sps_pkg_name``, ``xmltree`` and
            ``tostring(pretty_print=...)``.
        sps_pkg_name: base name used for the zip and for every file in it.
        components: maps each packaged file name (or the original reference
            of a file that could not be packaged) to a dict describing it;
            failures are reported under the ``"failures"`` key.
        texts: language summary (``xml_langs``, ``pdf_langs``,
            ``html_langs``) filled by :meth:`build_sps_package`.
    """

    def __init__(self, xml_with_pre):
        self.xml_with_pre = xml_with_pre
        self.sps_pkg_name = xml_with_pre.sps_pkg_name
        self.components = {}
        self.texts = {}

    def build_sps_package(
        self,
        output_folder,
        renditions,
        translations,
        main_paragraphs_lang,
        issue_proc,
    ):
        """Create ``<output_folder>/<sps_pkg_name>.zip`` and return its path.

        The package name follows the SPS pattern
        (ISSN-ACRON-VOL-NUM-SUPPL-ARTICLE). Per-file problems do not abort
        the build; they are recorded in ``self.components``.

        Args:
            output_folder: existing directory where the zip is written.
            renditions: iterable of objects with ``lang``, ``file.path`` and
                ``original_href`` (PDF renditions).
            translations: mapping whose keys are the translation languages.
            main_paragraphs_lang: language of the main text, if any.
            issue_proc: object providing ``find_asset(basename, name)``.

        Returns:
            The path of the generated zip file.
        """
        sps_pkg_zip_path = os.path.join(output_folder, f"{self.sps_pkg_name}.zip")

        with ZipFile(sps_pkg_zip_path, "w", compression=ZIP_DEFLATED) as zf:
            # Assets go first: adding them rewrites the hrefs in the XML tree,
            # and the XML must be serialized only after that.
            self._build_sps_package_add_assets(zf, issue_proc)

            # Add renditions (pdf) to the zip.
            result = self._build_sps_package_add_renditions(
                zf, renditions, translations, main_paragraphs_lang
            )
            self.texts.update(result)

            # Add the XML to the zip.
            self._build_sps_package_add_xml(zf)

        return sps_pkg_zip_path

    def _register_failure(self, key, **extra):
        """Record the exception being handled as a failure of ``key``.

        Must be called from inside an ``except`` block. ``extra`` items are
        stored next to the ``"failures"`` traceback text.
        """
        import traceback

        self.components[key] = {**extra, "failures": traceback.format_exc()}

    @staticmethod
    def _rendition_key(rendition):
        """Return the best available identifier to report a rendition failure."""
        for attr in ("original_name", "original_href"):
            value = getattr(rendition, attr, None)
            if value:
                return value
        return repr(rendition)

    def _build_sps_package_add_renditions(
        self, zf, renditions, translations, main_paragraphs_lang
    ):
        """Add the PDF renditions to ``zf`` and summarize the text languages.

        A rendition with ``lang`` is stored as ``<pkg>-<lang>.pdf``; one
        without it is stored as ``<pkg>.pdf`` and attributed to the first
        language found in the XML (when there is one). A rendition that
        cannot be added is reported in ``self.components`` and is left out
        of ``pdf_langs``.

        Returns:
            ``{"xml_langs": [...], "pdf_langs": [...], "html_langs": [...]}``.
        """
        xml = ArticleAndSubArticles(self.xml_with_pre.xmltree)
        xml_langs = [item.get("lang") for item in xml.data if item.get("lang")]
        # Language assumed for a PDF that does not declare its own.
        default_lang = xml_langs[0] if xml_langs else None

        pdf_langs = []
        for rendition in renditions or ():
            try:
                lang = rendition.lang
                if lang:
                    sps_filename = f"{self.sps_pkg_name}-{lang}.pdf"
                else:
                    sps_filename = f"{self.sps_pkg_name}.pdf"

                zf.write(rendition.file.path, arcname=sps_filename)

                self.components[sps_filename] = {
                    "lang": lang,
                    "legacy_uri": rendition.original_href,
                    "component_type": "rendition",
                }
            except Exception:
                # Best effort: one bad PDF must not prevent the package.
                self._register_failure(self._rendition_key(rendition))
            else:
                pdf_lang = lang or default_lang
                if pdf_lang:
                    pdf_langs.append(pdf_lang)

        html_langs = list(translations.keys()) if translations else []
        if main_paragraphs_lang:
            html_langs.append(main_paragraphs_lang)

        return {
            "xml_langs": xml_langs,
            "pdf_langs": pdf_langs,
            "html_langs": html_langs,
        }

    def _build_sps_package_add_assets(self, zf, issue_proc):
        """Add to ``zf`` every asset referenced by the XML and rename its href.

        For each distinct ``xlink:href`` the file is looked up with
        ``issue_proc.find_asset``; the first candidate successfully written
        is used. References with no candidate are reported as
        ``"Not found"``. Finally the hrefs that were packaged are replaced in
        the XML tree by their package file names.
        """
        replacements = {}
        xml_assets = ArticleAssets(self.xml_with_pre.xmltree)
        for xml_graphic in xml_assets.items:
            try:
                href = xml_graphic.xlink_href
                if replacements.get(href):
                    # Same file referenced again: already packaged.
                    continue

                basename = os.path.basename(href)
                name, _ext = os.path.splitext(basename)

                found = False

                # Look the image up in the context of the issue.
                for asset in issue_proc.find_asset(basename, name):
                    found = True
                    self._build_sps_package_add_asset(
                        zf,
                        asset,
                        xml_graphic,
                        replacements,
                    )
                    if href in replacements:
                        # Written once; more copies would only duplicate
                        # the same entry name inside the zip.
                        break

                if not found:
                    self.components[href] = {
                        "failures": "Not found",
                    }

            except Exception:
                self._register_failure(
                    getattr(xml_graphic, "xlink_href", None) or repr(xml_graphic)
                )
        xml_assets.replace_names(replacements)

    def _build_sps_package_add_asset(
        self,
        zf,
        asset,
        xml_graphic,
        replacements,
    ):
        """Write one asset into ``zf`` under its package file name.

        ``replacements[xml_graphic.xlink_href]`` is set only after the file
        was written, so a failed write never makes the XML point to a file
        that is missing from the package. Failures are reported in
        ``self.components`` under the original href.
        """
        try:
            # File name in the SPS pattern.
            sps_filename = xml_graphic.name_canonical(self.sps_pkg_name)

            # Add the file to the zip.
            zf.write(asset.file.path, arcname=sps_filename)

            # Only now register the swap from the original href to the SPS name.
            replacements[xml_graphic.xlink_href] = sps_filename

            component_type = (
                "supplementary-material"
                if xml_graphic.is_supplementary_material
                else "asset"
            )
            self.components[sps_filename] = {
                "xml_elem_id": xml_graphic.id,
                "legacy_uri": asset.original_href,
                "component_type": component_type,
            }
        except Exception:
            self._register_failure(
                getattr(xml_graphic, "xlink_href", None) or repr(xml_graphic)
            )

    def _build_sps_package_add_xml(self, zf):
        """Serialize the XML into ``zf`` as ``<sps_pkg_name>.xml``.

        A serialization or write error is reported in ``self.components``
        (with ``component_type`` ``"xml"``) instead of being raised.
        """
        sps_xml_name = f"{self.sps_pkg_name}.xml"
        try:
            zf.writestr(sps_xml_name, self.xml_with_pre.tostring(pretty_print=True))
        except Exception:
            self._register_failure(sps_xml_name, component_type="xml")
        else:
            self.components[sps_xml_name] = {"component_type": "xml"}
4 changes: 4 additions & 0 deletions markup_doc/static/css/article.css

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions markup_doc/static/css/bootstrap.min.css

Large diffs are not rendered by default.

Loading
Loading