88from zope .interface import Interface
99
1010import json
11+ import re
1112import yaml
1213
1314EXCLUDED_FROM_FRONTMATTER = {"blocks" , "blocks_layout" }
15+ BODY_FIELDS = {"text" , "description" }
1416
1517
1618@implementer (IRenderer )
@@ -39,6 +41,13 @@ def __call__(self, data):
3941 body_parts .extend (
4042 self ._render_blocks (value , data .get ("blocks_layout" , []))
4143 )
44+ elif key in BODY_FIELDS :
45+ # These go into the body, not frontmatter
46+ if key == "text" and isinstance (value , dict ):
47+ # RichText field structure
48+ body_parts .append (self ._render_richtext (value ))
49+ elif value :
50+ body_parts .append (str (value ))
4251 else :
4352 frontmatter [key ] = value
4453
@@ -54,11 +63,147 @@ def __call__(self, data):
5463 parts .append ("---" )
5564 parts .append ("" )
5665
66+ # Add title as H1 if present
67+ # TODO: this makes content objects with the blocks behavior have two titles
68+ # because of the title block converter
69+ if "title" in data and data ["title" ]:
70+ parts .append (f"# { data ['title' ]} " )
71+ parts .append ("" )
72+
5773 if body_parts :
5874 parts .extend (body_parts )
5975
6076 return "\n " .join (parts )
6177
78+ def _html_to_markdown (self , html ):
79+ """Convert HTML to GitHub Flavored Markdown.
80+
81+ This is a basic implementation. For production use, consider using
82+ a library like 'markdownify' or 'html2text'.
83+ """
84+ if not html :
85+ return ""
86+
87+ text = html
88+
89+ # Convert headings
90+ text = re .sub (
91+ r"<h1[^>]*>(.*?)</h1>" , r"# \1" , text , flags = re .IGNORECASE | re .DOTALL
92+ )
93+ text = re .sub (
94+ r"<h2[^>]*>(.*?)</h2>" , r"## \1" , text , flags = re .IGNORECASE | re .DOTALL
95+ )
96+ text = re .sub (
97+ r"<h3[^>]*>(.*?)</h3>" , r"### \1" , text , flags = re .IGNORECASE | re .DOTALL
98+ )
99+ text = re .sub (
100+ r"<h4[^>]*>(.*?)</h4>" , r"#### \1" , text , flags = re .IGNORECASE | re .DOTALL
101+ )
102+ text = re .sub (
103+ r"<h5[^>]*>(.*?)</h5>" , r"##### \1" , text , flags = re .IGNORECASE | re .DOTALL
104+ )
105+ text = re .sub (
106+ r"<h6[^>]*>(.*?)</h6>" , r"###### \1" , text , flags = re .IGNORECASE | re .DOTALL
107+ )
108+
109+ # Convert bold and italic
110+ text = re .sub (
111+ r"<strong[^>]*>(.*?)</strong>" ,
112+ r"**\1**" ,
113+ text ,
114+ flags = re .IGNORECASE | re .DOTALL ,
115+ )
116+ text = re .sub (
117+ r"<b[^>]*>(.*?)</b>" , r"**\1**" , text , flags = re .IGNORECASE | re .DOTALL
118+ )
119+ text = re .sub (
120+ r"<em[^>]*>(.*?)</em>" , r"*\1*" , text , flags = re .IGNORECASE | re .DOTALL
121+ )
122+ text = re .sub (
123+ r"<i[^>]*>(.*?)</i>" , r"*\1*" , text , flags = re .IGNORECASE | re .DOTALL
124+ )
125+
126+ # Convert links
127+ text = re .sub (
128+ r'<a[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)</a>' ,
129+ r"[\2](\1)" ,
130+ text ,
131+ flags = re .IGNORECASE | re .DOTALL ,
132+ )
133+
134+ # Convert images
135+ text = re .sub (
136+ r'<img[^>]*src=["\']([^"\']+)["\'][^>]*alt=["\']([^"\']*)["\'][^>]*>' ,
137+ r"" ,
138+ text ,
139+ flags = re .IGNORECASE ,
140+ )
141+ text = re .sub (
142+ r'<img[^>]*alt=["\']([^"\']*)["\'][^>]*src=["\']([^"\']+)["\'][^>]*>' ,
143+ r"" ,
144+ text ,
145+ flags = re .IGNORECASE ,
146+ )
147+ text = re .sub (
148+ r'<img[^>]*src=["\']([^"\']+)["\'][^>]*>' ,
149+ r"" ,
150+ text ,
151+ flags = re .IGNORECASE ,
152+ )
153+
154+ # Convert lists
155+ text = re .sub (r"<ul[^>]*>" , "" , text , flags = re .IGNORECASE )
156+ text = re .sub (r"</ul>" , "\n " , text , flags = re .IGNORECASE )
157+ text = re .sub (r"<ol[^>]*>" , "" , text , flags = re .IGNORECASE )
158+ text = re .sub (r"</ol>" , "\n " , text , flags = re .IGNORECASE )
159+ text = re .sub (
160+ r"<li[^>]*>(.*?)</li>" , r"- \1" , text , flags = re .IGNORECASE | re .DOTALL
161+ )
162+
163+ # Convert paragraphs
164+ text = re .sub (
165+ r"<p[^>]*>(.*?)</p>" , r"\1\n\n" , text , flags = re .IGNORECASE | re .DOTALL
166+ )
167+
168+ # Convert line breaks
169+ text = re .sub (r"<br\s*/?>" , "\n " , text , flags = re .IGNORECASE )
170+
171+ # Convert code
172+ text = re .sub (
173+ r"<code[^>]*>(.*?)</code>" , r"`\1`" , text , flags = re .IGNORECASE | re .DOTALL
174+ )
175+ text = re .sub (
176+ r"<pre[^>]*>(.*?)</pre>" ,
177+ r"```\n\1\n```" ,
178+ text ,
179+ flags = re .IGNORECASE | re .DOTALL ,
180+ )
181+
182+ # Remove remaining HTML tags
183+ text = re .sub (r"<[^>]+>" , "" , text )
184+
185+ # Clean up whitespace
186+ text = re .sub (r"\n{3,}" , "\n \n " , text )
187+ text = text .strip ()
188+
189+ return text
190+
191+ def _render_richtext (self , richtext_data ):
192+ """Convert a RichText field to Markdown."""
193+ if not isinstance (richtext_data , dict ):
194+ return str (richtext_data )
195+
196+ content = richtext_data .get ("data" , "" )
197+ content_type = richtext_data .get ("content-type" , "text/plain" )
198+
199+ if content_type == "text/html" :
200+ return self ._html_to_markdown (content )
201+ elif content_type == "text/plain" :
202+ return content
203+ else :
204+ # Unknown content type, return as-is
205+ return content
206+
62207 def _render_blocks (self , blocks : dict , blocks_layout : list ) -> list [str ]:
63208 """Convert Volto blocks to Markdown.
64209
0 commit comments