Skip to content

Commit a79333d

Browse files
committed
re-add html to markdown function, fix tests
1 parent 0315940 commit a79333d

2 files changed

Lines changed: 150 additions & 2 deletions

File tree

src/plone/restapi/renderer/markdown.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
from zope.interface import Interface
99

1010
import json
11+
import re
1112
import yaml
1213

1314
EXCLUDED_FROM_FRONTMATTER = {"blocks", "blocks_layout"}
15+
BODY_FIELDS = {"text", "description"}
1416

1517

1618
@implementer(IRenderer)
@@ -39,6 +41,13 @@ def __call__(self, data):
3941
body_parts.extend(
4042
self._render_blocks(value, data.get("blocks_layout", []))
4143
)
44+
elif key in BODY_FIELDS:
45+
# These go into the body, not frontmatter
46+
if key == "text" and isinstance(value, dict):
47+
# RichText field structure
48+
body_parts.append(self._render_richtext(value))
49+
elif value:
50+
body_parts.append(str(value))
4251
else:
4352
frontmatter[key] = value
4453

@@ -54,11 +63,147 @@ def __call__(self, data):
5463
parts.append("---")
5564
parts.append("")
5665

66+
# Add title as H1 if present
67+
# TODO: this makes content objects with the blocks behavior have two titles
68+
# because of the title block converter
69+
if "title" in data and data["title"]:
70+
parts.append(f"# {data['title']}")
71+
parts.append("")
72+
5773
if body_parts:
5874
parts.extend(body_parts)
5975

6076
return "\n".join(parts)
6177

78+
def _html_to_markdown(self, html):
79+
"""Convert HTML to GitHub Flavored Markdown.
80+
81+
This is a basic implementation. For production use, consider using
82+
a library like 'markdownify' or 'html2text'.
83+
"""
84+
if not html:
85+
return ""
86+
87+
text = html
88+
89+
# Convert headings
90+
text = re.sub(
91+
r"<h1[^>]*>(.*?)</h1>", r"# \1", text, flags=re.IGNORECASE | re.DOTALL
92+
)
93+
text = re.sub(
94+
r"<h2[^>]*>(.*?)</h2>", r"## \1", text, flags=re.IGNORECASE | re.DOTALL
95+
)
96+
text = re.sub(
97+
r"<h3[^>]*>(.*?)</h3>", r"### \1", text, flags=re.IGNORECASE | re.DOTALL
98+
)
99+
text = re.sub(
100+
r"<h4[^>]*>(.*?)</h4>", r"#### \1", text, flags=re.IGNORECASE | re.DOTALL
101+
)
102+
text = re.sub(
103+
r"<h5[^>]*>(.*?)</h5>", r"##### \1", text, flags=re.IGNORECASE | re.DOTALL
104+
)
105+
text = re.sub(
106+
r"<h6[^>]*>(.*?)</h6>", r"###### \1", text, flags=re.IGNORECASE | re.DOTALL
107+
)
108+
109+
# Convert bold and italic
110+
text = re.sub(
111+
r"<strong[^>]*>(.*?)</strong>",
112+
r"**\1**",
113+
text,
114+
flags=re.IGNORECASE | re.DOTALL,
115+
)
116+
text = re.sub(
117+
r"<b[^>]*>(.*?)</b>", r"**\1**", text, flags=re.IGNORECASE | re.DOTALL
118+
)
119+
text = re.sub(
120+
r"<em[^>]*>(.*?)</em>", r"*\1*", text, flags=re.IGNORECASE | re.DOTALL
121+
)
122+
text = re.sub(
123+
r"<i[^>]*>(.*?)</i>", r"*\1*", text, flags=re.IGNORECASE | re.DOTALL
124+
)
125+
126+
# Convert links
127+
text = re.sub(
128+
r'<a[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)</a>',
129+
r"[\2](\1)",
130+
text,
131+
flags=re.IGNORECASE | re.DOTALL,
132+
)
133+
134+
# Convert images
135+
text = re.sub(
136+
r'<img[^>]*src=["\']([^"\']+)["\'][^>]*alt=["\']([^"\']*)["\'][^>]*>',
137+
r"![\2](\1)",
138+
text,
139+
flags=re.IGNORECASE,
140+
)
141+
text = re.sub(
142+
r'<img[^>]*alt=["\']([^"\']*)["\'][^>]*src=["\']([^"\']+)["\'][^>]*>',
143+
r"![\1](\2)",
144+
text,
145+
flags=re.IGNORECASE,
146+
)
147+
text = re.sub(
148+
r'<img[^>]*src=["\']([^"\']+)["\'][^>]*>',
149+
r"![](\1)",
150+
text,
151+
flags=re.IGNORECASE,
152+
)
153+
154+
# Convert lists
155+
text = re.sub(r"<ul[^>]*>", "", text, flags=re.IGNORECASE)
156+
text = re.sub(r"</ul>", "\n", text, flags=re.IGNORECASE)
157+
text = re.sub(r"<ol[^>]*>", "", text, flags=re.IGNORECASE)
158+
text = re.sub(r"</ol>", "\n", text, flags=re.IGNORECASE)
159+
text = re.sub(
160+
r"<li[^>]*>(.*?)</li>", r"- \1", text, flags=re.IGNORECASE | re.DOTALL
161+
)
162+
163+
# Convert paragraphs
164+
text = re.sub(
165+
r"<p[^>]*>(.*?)</p>", r"\1\n\n", text, flags=re.IGNORECASE | re.DOTALL
166+
)
167+
168+
# Convert line breaks
169+
text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
170+
171+
# Convert code
172+
text = re.sub(
173+
r"<code[^>]*>(.*?)</code>", r"`\1`", text, flags=re.IGNORECASE | re.DOTALL
174+
)
175+
text = re.sub(
176+
r"<pre[^>]*>(.*?)</pre>",
177+
r"```\n\1\n```",
178+
text,
179+
flags=re.IGNORECASE | re.DOTALL,
180+
)
181+
182+
# Remove remaining HTML tags
183+
text = re.sub(r"<[^>]+>", "", text)
184+
185+
# Clean up whitespace
186+
text = re.sub(r"\n{3,}", "\n\n", text)
187+
text = text.strip()
188+
189+
return text
190+
191+
def _render_richtext(self, richtext_data):
192+
"""Convert a RichText field to Markdown."""
193+
if not isinstance(richtext_data, dict):
194+
return str(richtext_data)
195+
196+
content = richtext_data.get("data", "")
197+
content_type = richtext_data.get("content-type", "text/plain")
198+
199+
if content_type == "text/html":
200+
return self._html_to_markdown(content)
201+
elif content_type == "text/plain":
202+
return content
203+
else:
204+
# Unknown content type, return as-is
205+
return content
206+
62207
def _render_blocks(self, blocks: dict, blocks_layout: list) -> list[str]:
63208
"""Convert Volto blocks to Markdown.
64209

src/plone/restapi/tests/test_renderer_markdown.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Tests for Markdown renderer."""
22

3+
from simplejson.compat import b
4+
35
from plone.app.testing import login
46
from plone.app.testing import setRoles
57
from plone.app.testing import SITE_OWNER_NAME
@@ -67,8 +69,8 @@ def test_markdown_basic_rendering(self):
6769

6870
# Check for YAML frontmatter
6971
self.assertIn("---", content)
70-
self.assertIn("@id:", content)
71-
self.assertIn("@type: Document", content)
72+
self.assertIn("'@id':", content)
73+
self.assertIn("'@type': Document", content)
7274
self.assertIn("title: Test Document", content)
7375

7476
# Check for title as H1
@@ -100,6 +102,7 @@ def test_default_to_json(self):
100102
)
101103
response = requests.get(
102104
url,
105+
headers={"Accept": "*/*"},
103106
auth=(SITE_OWNER_NAME, SITE_OWNER_PASSWORD),
104107
)
105108

0 commit comments

Comments
 (0)