Skip to content

Commit 5d7fd9c

Browse files
committed
wip
1 parent be15068 commit 5d7fd9c

2 files changed

Lines changed: 59 additions & 21 deletions

File tree

src/parxy_core/services/contentmd_service.py

Lines changed: 27 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,20 @@ class ContentMdService:
2020
# Private helpers
2121
# ------------------------------------------------------------------
2222

23+
# Roles that provide structure or navigation rather than readable body text
24+
_STRUCTURAL_ROLES: frozenset[str] = frozenset(
25+
{
26+
'heading',
27+
'doc-title',
28+
'doc-subtitle',
29+
'doc-abstract',
30+
'doc-toc',
31+
'doc-pageheader',
32+
'doc-pagefooter',
33+
'caption',
34+
}
35+
)
36+
2337
@staticmethod
2438
def _normalize(text: str) -> str:
2539
"""Collapse any run of whitespace to a single space and strip."""
@@ -71,8 +85,10 @@ def _guess_title(document: Document) -> Optional[str]:
7185
def _infer_description(document: Document) -> Optional[str]:
7286
"""Infer a description from document content.
7387
74-
Uses the ``doc-abstract`` block when present, otherwise the longest
75-
:class:`TextBlock` across the first two pages.
88+
Uses the ``doc-abstract`` block when present. Otherwise concatenates
89+
the first five body :class:`TextBlock` objects (non-structural, across
90+
the first two pages), normalises whitespace, and returns at most 200
91+
characters.
7692
"""
7793
from parxy_core.models.models import TextBlock
7894

@@ -88,12 +104,16 @@ def _infer_description(document: Document) -> Optional[str]:
88104
if abstract:
89105
return ContentMdService._normalize(abstract.text)
90106

91-
text_blocks = [b for b in blocks if b.role != 'doc-title']
92-
if not text_blocks:
107+
body_blocks = [
108+
b
109+
for b in blocks
110+
if (b.role or 'generic') not in ContentMdService._STRUCTURAL_ROLES
111+
]
112+
if not body_blocks:
93113
return None
94-
return ContentMdService._normalize(
95-
max(text_blocks, key=lambda b: len(b.text)).text
96-
)
114+
115+
combined = ' '.join(b.text for b in body_blocks[:5])
116+
return ContentMdService._normalize(combined)[:200]
97117

98118
@staticmethod
99119
def _build_frontmatter(

tests/services/test_contentmd_service.py

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -171,27 +171,45 @@ def test_description_from_doc_abstract_block(self):
171171
result = ContentMdService.render(doc)
172172
assert 'description: "Abstract content here."' in result
173173

174-
def test_description_from_longest_textblock_when_no_abstract(self):
175-
blocks = [
176-
make_text_block('Short.', role='paragraph'),
177-
make_text_block(
178-
'This is a considerably longer paragraph block.', role='paragraph'
179-
),
180-
]
174+
def test_description_from_first_five_body_blocks(self):
175+
blocks = [make_text_block(f'Sentence {i}.', role='paragraph') for i in range(7)]
181176
doc = make_doc(pages=[make_page(text='', blocks=blocks)])
182177
result = ContentMdService.render(doc)
183-
assert 'description: "This is a considerably longer paragraph block."' in result
178+
# Only the first five contribute; the sixth and seventh are ignored
179+
assert 'Sentence 5' not in result.split('---\n')[1].split('\n')[0]
180+
assert 'Sentence 0' in result
184181

185-
def test_description_excludes_doc_title_from_longest_candidate(self):
182+
def test_description_excludes_structural_roles(self):
186183
blocks = [
187-
make_text_block(
188-
'This is a very long doc-title block text.', role='doc-title'
189-
),
190-
make_text_block('Shorter paragraph.', role='paragraph'),
184+
make_text_block('Table of contents text.', role='doc-toc'),
185+
make_text_block('Page header text.', role='doc-pageheader'),
186+
make_text_block('A heading block.', role='heading'),
187+
make_text_block('Body content.', role='paragraph'),
191188
]
192189
doc = make_doc(pages=[make_page(text='', blocks=blocks)])
193190
result = ContentMdService.render(doc)
194-
assert 'description: "Shorter paragraph."' in result
191+
assert 'description: "Body content."' in result
192+
193+
def test_description_truncated_to_200_chars(self):
194+
long_text = 'word ' * 60 # well over 200 chars
195+
blocks = [make_text_block(long_text, role='paragraph')]
196+
doc = make_doc(pages=[make_page(text='', blocks=blocks)])
197+
result = ContentMdService.render(doc)
198+
fm_end = result.index('---\n', 4)
199+
frontmatter = result[:fm_end]
200+
desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:'))
201+
# Strip the YAML quoting to measure the actual value length
202+
value = desc_line[len('description: "'):-1]
203+
assert len(value) <= 200
204+
205+
def test_description_contains_no_newlines(self):
206+
blocks = [make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')]
207+
doc = make_doc(pages=[make_page(text='', blocks=blocks)])
208+
result = ContentMdService.render(doc)
209+
fm_end = result.index('---\n', 4)
210+
frontmatter = result[:fm_end]
211+
desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:'))
212+
assert '\n' not in desc_line
195213

196214
def test_description_searches_first_two_pages(self):
197215
page1 = make_page(number=1, text='', blocks=[make_text_block('Page 1 text.')])

0 commit comments

Comments
 (0)