Skip to content

Commit b7c3742

Browse files
committed
wip
1 parent 89494f0 commit b7c3742

3 files changed

Lines changed: 256 additions & 133 deletions

File tree

src/parxy_core/models/models.py

Lines changed: 14 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -165,17 +165,16 @@ def contentmd(
165165
) -> str:
166166
"""Get the document content formatted as content-md.
167167
168-
Generates a content-md string: YAML frontmatter followed by Markdown.
169-
Per the spec, all heading levels are shifted up by one so the document
170-
title occupies the sole h1, and images use ``<figure>`` blocks.
168+
Delegates to :class:`~parxy_core.services.ContentMdService`.
171169
172170
Parameters
173171
----------
174172
title : str, optional
175-
Document title. Falls back to metadata.title, then filename.
173+
Document title. Falls back to metadata.title, a heading inferred
174+
from the first page, filename, then 'Untitled'.
176175
description : str, optional
177-
Short summary (~200 characters). Required by the spec; omitted from
178-
frontmatter when not provided.
176+
Short summary (~200 characters). Falls back to a doc-abstract block,
177+
then the longest TextBlock across the first two pages.
179178
date : str, optional
180179
Creation/publication date in ISO 8601. Falls back to metadata dates.
181180
license : str, optional
@@ -188,133 +187,16 @@ def contentmd(
188187
str
189188
The document content formatted as content-md.
190189
"""
191-
def _guess_title_from_first_page() -> Optional[str]:
192-
if not self.pages:
193-
return None
194-
first_page = self.pages[0]
195-
if not first_page.blocks:
196-
return None
197-
# Prefer an explicit doc-title block, then the highest-ranking heading
198-
doc_title = next(
199-
(
200-
b
201-
for b in first_page.blocks
202-
if isinstance(b, TextBlock)
203-
and b.role == 'doc-title'
204-
and b.text.strip()
205-
),
206-
None,
207-
)
208-
if doc_title:
209-
return _normalize(doc_title.text)
210-
headings = [
211-
b
212-
for b in first_page.blocks
213-
if isinstance(b, TextBlock)
214-
and b.role == 'heading'
215-
and b.text.strip()
216-
]
217-
if not headings:
218-
return None
219-
return _normalize(min(headings, key=lambda b: b.level or 1).text)
220-
221-
resolved_title = (
222-
title
223-
or (self.metadata.title if self.metadata else None)
224-
or _guess_title_from_first_page()
225-
or self.filename
226-
or 'Untitled'
190+
from parxy_core.services.contentmd_service import ContentMdService
191+
192+
return ContentMdService.render(
193+
self,
194+
title=title,
195+
description=description,
196+
date=date,
197+
license=license,
198+
author=author,
227199
)
228-
resolved_date = date or (
229-
(self.metadata.created_at or self.metadata.updated_at)
230-
if self.metadata
231-
else None
232-
)
233-
resolved_author = author or (self.metadata.author if self.metadata else None)
234-
235-
def _infer_description() -> Optional[str]:
236-
first_two_pages = self.pages[:2]
237-
blocks = [
238-
b
239-
for page in first_two_pages
240-
if page.blocks
241-
for b in page.blocks
242-
if isinstance(b, TextBlock) and b.text.strip()
243-
]
244-
abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
245-
if abstract:
246-
return _normalize(abstract.text)
247-
text_blocks = [b for b in blocks if b.role != 'doc-title']
248-
if not text_blocks:
249-
return None
250-
return _normalize(max(text_blocks, key=lambda b: len(b.text)).text)
251-
252-
resolved_description = description or _infer_description()
253-
254-
def _normalize(text: str) -> str:
255-
"""Collapse runs of whitespace to a single space and strip."""
256-
return ' '.join(text.split())
257-
258-
def _yaml_str(v: str) -> str:
259-
return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"'
260-
261-
fm = ['---', f'title: {_yaml_str(resolved_title)}']
262-
if resolved_description:
263-
fm.append(f'description: {_yaml_str(resolved_description)}')
264-
if resolved_date:
265-
fm.append(f'date: {_yaml_str(resolved_date)}')
266-
if license:
267-
fm.append(f'license: {_yaml_str(license)}')
268-
if resolved_author:
269-
fm.append(f'author: {_yaml_str(resolved_author)}')
270-
fm.append('---')
271-
frontmatter = '\n'.join(fm)
272-
273-
if not self.pages:
274-
return f'{frontmatter}\n\n# {resolved_title}\n'
275-
276-
parts = [f'# {resolved_title}']
277-
278-
for page in self.pages:
279-
if not page.blocks:
280-
if page.text.strip():
281-
parts.append(_normalize(page.text))
282-
continue
283-
284-
for block in page.blocks:
285-
role = (block.role or 'generic').lower()
286-
287-
if isinstance(block, TextBlock):
288-
if role == 'doc-title':
289-
# Already rendered as the top-level # heading — skip
290-
pass
291-
elif role == 'heading':
292-
# Shift all heading levels by +1 so h1 content becomes h2
293-
shifted = min((block.level or 1) + 1, 6)
294-
parts.append(f'{"#" * shifted} {_normalize(block.text)}')
295-
elif role in ('list', 'listitem'):
296-
for line in block.text.splitlines():
297-
if line.strip():
298-
parts.append(f'- {_normalize(line)}')
299-
elif role == 'doc-abstract':
300-
lang_attr = f' lang="{self.language}"' if self.language else ''
301-
parts.append(
302-
f'<abstract{lang_attr}>\n{_normalize(block.text)}\n</abstract>'
303-
)
304-
else:
305-
normalized = _normalize(block.text)
306-
if normalized:
307-
parts.append(normalized)
308-
309-
elif isinstance(block, ImageBlock):
310-
alt = block.alt_text or ''
311-
parts.append(f'<figure>\n{alt}\n</figure>')
312-
313-
elif isinstance(block, TableBlock):
314-
if block.text.strip():
315-
parts.append(block.text.strip())
316-
317-
return f'{frontmatter}\n\n' + '\n\n'.join(parts) + '\n'
318200

319201
def markdown(self) -> str:
320202
"""Get the document content formatted as Markdown.
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Services module for parxy_core."""
22

3+
from parxy_core.services.contentmd_service import ContentMdService
34
from parxy_core.services.pdf_service import PdfService
45

5-
__all__ = ['PdfService']
6+
__all__ = ['ContentMdService', 'PdfService']

0 commit comments

Comments
 (0)