Skip to content

Commit 2e330d7

Browse files
committed
feat: removed subtitle/heading centering, now italicizes terms
1 parent 25d3ed1 commit 2e330d7

9 files changed

Lines changed: 134 additions & 92 deletions

File tree

src/text2markdown/async_text2markdown.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ async def text2markdown_async(
1313
link_xrefs: bool = True,
1414
strike_junk: bool = True,
1515
block_quotes: bool = True,
16+
escape_lists: bool = True,
1617
italicize_refs: bool = True,
18+
italicize_terms: bool = True,
1719
enrichment_model: str = "kanon-2-enricher",
1820
isaacus_client: isaacus.AsyncIsaacus | None = None,
1921
) -> str:
@@ -22,14 +24,18 @@ async def text2markdown_async(
2224
Args:
2325
text (str | ILGSDocument): Input to be converted into Markdown. If an Isaacus Legal Graph Schema (ILGS) Document is supplied, this function will convert the Document's text into Markdown without needing to enrich it first with an Isaacus enrichment model.
2426
25-
link_cross_references (bool, optional): Whether to link cross-references in the input text to their targets, for example, linking "as mentioned in Section 2.1" to the relevant section.
27+
link_xrefs (bool, optional): Whether to link cross-references in the input text to their targets, for example, linking "as mentioned in Section 2.1" to the relevant section.
2628
2729
strike_junk (bool, optional): Whether to strike out junk text.
2830
2931
block_quotes (bool, optional): Whether to transform non-inline quotes into Markdown block quotes.
3032
33+
escape_lists (bool, optional): Whether to escape list-like lines (lines starting with "-", "*", "+", or numbered lists). This leads to nicer rendering at the cost of cleaner Markdown source code.
34+
3135
italicize_refs (bool, optional): Whether to italicize the names of any referenced documents, for example, "as mentioned in *Smith v. Jones*".
3236
37+
italicize_terms (bool, optional): Whether to italicize the names of any defined terms.
38+
3339
enrichment_model (str, optional): The name of the Isaacus enrichment model to use for converting the input text into Markdown. Defaults to the latest and most advanced Isaacus enrichment model, currently `kanon-2-enricher`.
3440
3541
isaacus_client (isaacus.AsyncIsaacus, optional): An Isaacus API client to use for enriching the input text with an Isaacus enrichment model if the input is not already an Isaacus Legal Graph Schema (ILGS) Document. If `None`, a new instance will be created instead where necessary.
@@ -55,7 +61,9 @@ async def text2markdown_async(
5561
link_xrefs=link_xrefs,
5662
strike_junk=strike_junk,
5763
block_quotes=block_quotes,
64+
escape_lists=escape_lists,
5865
italicize_refs=italicize_refs,
66+
italicize_terms=italicize_terms,
5967
enrichment_model=enrichment_model,
6068
isaacus_client=None,
6169
)

src/text2markdown/text2markdown.py

Lines changed: 73 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,40 +17,39 @@
1717
re.compile(r"^\s{0,3}\d+\)\s+"), # Ordered lists with parentheses: 1) 2)
1818
]
1919

20+
_AnnotationKind = Literal[
21+
"heading",
22+
"xref", # Cross referencing another annotation
23+
"junk",
24+
"quote",
25+
"ext_ref", # External references
26+
"src_ref", # Pointed to by a xref
27+
"terms", # Defined terms
28+
]
29+
2030

2131
@dataclass
2232
class _Annotation:
2333
start: int # Annotation starting index
2434
end: int # Annotation ending index
25-
kind: Literal[
26-
"heading",
27-
"subtitle",
28-
"xref", # Cross referencing another annotation
29-
"junk",
30-
"quote",
31-
"ext_ref", # External references
32-
"src_ref", # Pointed to by a xref
33-
]
35+
kind: _AnnotationKind
3436
force_blank_line: bool = False
35-
level: int | None = None # kind=="heading" only
36-
start_id: int | None = None # kind=="xref" or "src_ref" only
37+
level: int | None = None # not `None` for `kind==heading` only
38+
start_id: str | None = None # not `None` for `kind==xref` or `src_ref` only
3739

3840
_static_tags = { # Markdown tags to attach to each `_Annotation` kind
39-
"subtitle": ("""<p style="text-align: center;">""", "</p>"),
4041
"junk": ("~~", "~~"),
4142
"quote": ("> ", None),
42-
"ext_ref": ("*", "*"), # Italicise external references
43+
"ext_ref": ("*", "*"),
44+
"terms": ("*", "*"),
4345
}
4446

4547
@property
4648
def tags(self) -> tuple[str, str | None]:
4749
"""Returns the markdown/html tags that need to be added at the `start` and `end` index of this `_Annotation`, respectively."""
4850
match self.kind:
4951
case "heading":
50-
if self.level == 1:
51-
return ("""<h1 style="text-align: center;">""", "</h1>")
52-
else:
53-
return (f"\n{'#' * min(6, self.level)} ", None)
52+
return (f"\n{'#' * min(6, self.level)} ", None)
5453

5554
case "xref":
5655
return ("[", f"](#{self.start_id.replace(':', '-')})")
@@ -127,8 +126,8 @@ def _filter_events(events: list[_Event]) -> list[_Event]:
127126
priority = {
128127
"junk": 0, # Lower value = lower priority
129128
"ext_ref": 1,
130-
"subtitle": 2,
131-
"xref": 3,
129+
"terms": 1,
130+
"xref": 2,
132131
}
133132
active: list[_Annotation] = [] # stack of active annotations
134133
filtered_events: list[_Event] = []
@@ -175,6 +174,26 @@ def _filter_events(events: list[_Event]) -> list[_Event]:
175174
return filtered_events
176175

177176

177+
def _merge_annotations(anns: list[_Annotation], kinds: set[_AnnotationKind]) -> Iterable[_Annotation]:
    """Collapse duplicate annotations that would emit the same markup over the same span.

    Annotations whose `kind` is in `kinds` are considered interchangeable. When several
    of them share identical `start` and `end` indices, only the first one (in sorted
    order) is yielded and the rest are dropped. Annotations whose `kind` is not in
    `kinds` are always yielded, even if they share a span with another annotation.

    Args:
        anns: The annotations to deduplicate. May be empty.
        kinds: The annotation kinds that should be merged with one another when they
            cover exactly the same span.

    Yields:
        The surviving annotations, ordered by `(start, end)`, with annotations whose
        kind is not in `kinds` placed before mergeable ones covering the same span.
    """
    # Sorting by span, then by "is mergeable", makes every group of mergeable
    # annotations over the same span contiguous, so one pass is enough. `sorted` is
    # stable, so the survivor of each group is the one that came first in `anns`.
    ordered = sorted(anns, key=lambda a: (a.start, a.end, a.kind in kinds))

    # Span of the most recently yielded mergeable annotation, if any.
    last_merged_span: tuple[int, int] | None = None

    for ann in ordered:
        if ann.kind in kinds:
            span = (ann.start, ann.end)
            if span == last_merged_span:
                # Same span as an already-emitted mergeable annotation: drop it so the
                # markup is not applied twice.
                continue
            last_merged_span = span

        yield ann
195+
196+
178197
# ==== END HELPER FUNCTIONS ====
179198

180199

@@ -184,7 +203,9 @@ def text2markdown(
184203
link_xrefs: bool = True,
185204
strike_junk: bool = True,
186205
block_quotes: bool = True,
206+
escape_lists: bool = True,
187207
italicize_refs: bool = True,
208+
italicize_terms: bool = True,
188209
enrichment_model: str = "kanon-2-enricher",
189210
isaacus_client: isaacus.Isaacus | None = None,
190211
) -> str:
@@ -199,8 +220,12 @@ def text2markdown(
199220
200221
block_quotes (bool, optional): Whether to transform non-inline quotes into Markdown block quotes.
201222
223+
escape_lists (bool, optional): Whether to escape list-like lines (lines starting with "-", "*", "+", or numbered lists). This leads to nicer rendering at the cost of cleaner Markdown source code.
224+
202225
italicize_refs (bool, optional): Whether to italicize the names of any referenced documents, for example, "as mentioned in *Smith v. Jones*".
203226
227+
italicize_terms (bool, optional): Whether to italicize any terms defined in the document.
228+
204229
enrichment_model (str, optional): The name of the Isaacus enrichment model to use for converting the input text into Markdown. Defaults to the latest and most advanced Isaacus enrichment model, currently `kanon-2-enricher`.
205230
206231
isaacus_client (isaacus.Isaacus, optional): An Isaacus API client to use for enriching the input text with an Isaacus enrichment model if the input is not already an Isaacus Legal Graph Schema (ILGS) Document. If `None`, a new instance will be created instead where necessary.
@@ -222,7 +247,7 @@ def text2markdown(
222247
# Idea: Gather all annotations to queue, build a hierarchy of events ordered by index,
223248
# then perform the necessary plain text -> markdown transformations
224249
# as we iterate over the input text
225-
ann_queue: set[_Annotation] = set()
250+
anns: set[_Annotation] = set()
226251
headings = deque(sorted([h for h in doc.headings if h.decode(text).strip()], key=lambda span: span.start))
227252
segs = sorted(doc.segments, key=lambda s: (s.span.start, -s.span.end))
228253
num_segs = len(segs)
@@ -245,12 +270,7 @@ def text2markdown(
245270
# Check for title; level 1 heading "#" is reserved for the title heading
246271
if (title := doc.title) and headings and headings[0].start <= title.start < headings[0].end:
247272
h = headings.popleft()
248-
ann_queue.add(_Annotation(h.start, h.end, kind="heading", level=1))
249-
250-
# Extract subtitle
251-
if (subtitle := doc.subtitle) and headings and headings[0].start <= subtitle.start < headings[0].end:
252-
h = headings.popleft()
253-
ann_queue.add(_Annotation(h.start, h.end, kind="subtitle"))
273+
anns.add(_Annotation(h.start, h.end, kind="heading", level=1))
254274

255275
id_to_seg: dict[str | None, Segment | None] = {None: None}
256276
has_heading: set[tuple[int, int]] = set()
@@ -267,7 +287,7 @@ def text2markdown(
267287
while headings and headings[0].start < span_start:
268288
h = headings.popleft()
269289
# Default segmentless headings' level
270-
ann_queue.add(_Annotation(h.start, h.end, kind="heading", level=curr_level))
290+
anns.add(_Annotation(h.start, h.end, kind="heading", level=curr_level))
271291

272292
annotations: list[tuple[int, int, int]] = []
273293
# annotate headings in segment
@@ -291,22 +311,23 @@ def text2markdown(
291311
if (curr.span.start, curr.span.end) not in has_heading:
292312
ann_level -= 1
293313
curr = id_to_seg[curr.parent]
294-
ann_queue.update(
314+
anns.update(
295315
_annotate_each_line(_Annotation(ann_start, ann_end, kind="heading", level=max(2, ann_level)), text)
296316
)
297317

298318
has_heading.add((seg.span.start, seg.span.end))
299319

300320
# Add any remaining headings which come after the last segment
301321
for heading in headings:
302-
ann_queue.add(_Annotation(heading.start, heading.end, kind="heading", level=2))
322+
anns.add(_Annotation(heading.start, heading.end, kind="heading", level=2))
303323

304324
# We've annotated all headings, now gather annotations for the optional parameters.
305325
optional_annotators = {
306326
"xref": (doc.crossreferences, link_xrefs),
307327
"junk": (doc.junk, strike_junk),
308328
"quote": (doc.quotes, block_quotes),
309329
"ext_ref": (doc.external_documents, italicize_refs),
330+
"terms": (doc.terms, italicize_terms),
310331
}
311332
for kind, (annotators, asked_to_implement) in optional_annotators.items():
312333
if not asked_to_implement:
@@ -317,26 +338,24 @@ def text2markdown(
317338
case "xref":
318339
start_id = ann.start # references' start segment id
319340
# Add annotations for the text itself (indicated by ann.span)
320-
ann_queue.update(
341+
anns.update(
321342
_annotate_each_line(
322343
_Annotation(ann.span.start, ann.span.end, kind=kind, start_id=start_id), text
323344
)
324345
)
325346

326347
# need to add in annotations for the source reference as well, for anchoring
327348
start_seg_span = id_to_seg[start_id].span
328-
ann_queue.add(
329-
_Annotation(start_seg_span.start, start_seg_span.end, kind="src_ref", start_id=start_id)
330-
)
349+
anns.add(_Annotation(start_seg_span.start, start_seg_span.end, kind="src_ref", start_id=start_id))
331350

332351
case "junk":
333-
ann_queue.update(_annotate_each_line(_Annotation(ann.start, ann.end, kind=kind), text))
352+
anns.update(_annotate_each_line(_Annotation(ann.start, ann.end, kind=kind), text))
334353

335354
case "quote":
336355
if ann.span.start > 0 and text[ann.span.start - 1] != "\n":
337356
# Only annotate block quotes; must be preceded with '\n' char
338357
continue
339-
ann_queue.update(
358+
anns.update(
340359
_annotate_each_line(
341360
_Annotation(ann.span.start, ann.span.end, kind=kind), text, add_newlines=True
342361
)
@@ -345,16 +364,31 @@ def text2markdown(
345364
case "ext_ref":
346365
# Each external reference has an array of mentions we want to annotate.
347366
for mention in ann.mentions:
348-
ann_queue.update(_annotate_each_line(_Annotation(mention.start, mention.end, kind=kind), text))
367+
anns.update(_annotate_each_line(_Annotation(mention.start, mention.end, kind=kind), text))
368+
369+
case "terms":
370+
anns.update(_annotate_each_line(_Annotation(ann.name.start, ann.name.end, kind=kind), text))
371+
372+
# ext_ref and terms both use italics, ensure they are merged to avoid duplication
373+
anns = _merge_annotations(list(anns), kinds={"ext_ref", "terms"})
349374

350375
events: list[_Event] = []
351-
for ann in ann_queue:
376+
for ann in anns:
352377
events.append(_Event(ann.start, "start", ann))
353378
# Don't need end events for some annotation types
354379
if ann.kind != "src_ref":
355380
events.append(_Event(ann.end, "end", ann))
356381

357-
kind_priority = {"heading": 6, "quote": 5, "ext_ref": 4, "junk": 3, "xref": 2, "subtitle": 1, "src_ref": 0}
382+
kind_priority = {
383+
"heading": 6,
384+
"quote": 5,
385+
"ext_ref": 4,
386+
"terms": 4,
387+
"junk": 3,
388+
"xref": 2,
389+
"subtitle": 1,
390+
"src_ref": 0,
391+
}
358392
zero_length_annotations = {"src_ref"}
359393

360394
def event_sort_key(e: _Event):
@@ -410,7 +444,7 @@ def event_sort_key(e: _Event):
410444
prev_is_blank = not line.strip()
411445

412446
# prevent markdown list rendering
413-
if _is_list_line(line) and line.lstrip() == line:
447+
if _is_list_line(line) and line.lstrip() == line and escape_lists:
414448
line = f"&#8203;{line}"
415449

416450
# Convert leading tabs/whitespace to html indent flags

tests/test-out/test2.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<h1 style="text-align: center;">2025 VCE Business Management external assessment report</h1>
1+
# 2025 VCE Business Management external assessment report
22

33
This report provides sample answers, or an indication of what answers may have included. Unless otherwise stated, these are not intended to be exemplary or complete responses.
44

@@ -30,7 +30,7 @@ Average
3030

3131
This question required students to outline one clear difference between the terms ‘retirement’ and ‘redundancy’. There are several differences that were considered acceptable, including:
3232

33-
&emsp;• Retirement is voluntary and initiated by the employee, while redundancy is normally involuntary and is initiated by the business.
33+
&emsp;*Retirement* is voluntary and initiated by the employee, while *redundancy* is normally involuntary and is initiated by the business.
3434

3535
&emsp;• Retirement will often mean that the employee leaves the workforce entirely, while redundancy may mean that the employee seeks employment elsewhere.
3636

tests/test-out/test3.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
The three main types of rock, classified by how they form, are igneous, sedimentary, and metamorphic. These types are defined by their formation processes: cooling magma (igneous), accumulation of sediments (sedimentary), or heat and pressure transformation (metamorphic), as discussed in this article from AMNH.
22

3-
## Igneous Rock
3+
## *Igneous Rock*
44

55
Formed from the cooling and solidification of molten rock (magma or lava).
66

tests/test-out/test4.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
<h1 style="text-align: center;">THE OMNIBUS EXAMPLE INSTRUMENT</h1>
1+
# THE OMNIBUS EXAMPLE INSTRUMENT
22

3-
<p style="text-align: center;">Subtitle: Cross-Domain Act, Agreement, and Notice</p>
3+
## Subtitle: Cross-Domain Act, Agreement, and Notice
44

55
~~PDF | HTML | DOCX | TXT | PRINT~~
66

@@ -38,7 +38,7 @@
3838

3939
## <a id="seg-19"></a>Part I — Front Matter
4040

41-
Document Title: “The Omnibus Example Instrument” (the “Instrument”)
41+
Document Title: “The Omnibus Example Instrument” (the “*Instrument*”)
4242

4343
Document Subtitle: “Cross-Domain Act, Agreement, and Notice”
4444

@@ -58,17 +58,17 @@ This document was created on 01 January 2024, signed on 31 January 2024, takes e
5858

5959
For the purposes of this Instrument:
6060

61-
“Agreement” means this Instrument as binding between the Parties.
61+
*Agreement*” means this Instrument as binding between the Parties.
6262

63-
“Confidential Information” means any information marked “confidential” or that a reasonable person would treat as confidential.
63+
*Confidential Information*” means any information marked “confidential” or that a reasonable person would treat as confidential.
6464

65-
“Effective Date” means 01 February 2024.
65+
*Effective Date*” means 01 February 2024.
6666

67-
“Services” means the managed data processing and hosting services described in [Schedule 1](#seg-67).
67+
*Services*” means the managed data processing and hosting services described in [Schedule 1](#seg-67).
6868

69-
“Force Majeure Event” means an event beyond a party’s reasonable control (including acts of God, war, strikes, or Internet backbone failure).
69+
*Force Majeure Event*” means an event beyond a party’s reasonable control (including acts of God, war, strikes, or Internet backbone failure).
7070

71-
law means any statute, regulation, or decision of a competent authority (including, without limitation, the *Privacy Act 1988 (Cth)* and *Regulation (EU) 2016/679 (GDPR)).*
71+
*law* means any statute, regulation, or decision of a competent authority (including, without limitation, the *Privacy Act 1988 (Cth)* and *Regulation (EU) 2016/679 (GDPR)).*
7272

7373
References to Sections, Subsections, Paragraphs, Clauses, Items, Schedules, Annexes, Appendices, Exhibits, and Figures are to those of this Instrument unless otherwise stated.
7474

@@ -172,9 +172,9 @@ Description: A block diagram showing user → app → API → database.
172172

173173
The Parties:
174174

175-
The Licensor (corporate): Acme Data Pty Ltd (ACN 123 456 789), registered in Australia on 15 August 2010; principal place of business: Level 10, 200 Collins Street, Melbourne VIC 3000, Australia; website https://acmedata.example
175+
The *Licensor* (corporate): Acme Data Pty Ltd (ACN 123 456 789), registered in Australia on 15 August 2010; principal place of business: Level 10, 200 Collins Street, Melbourne VIC 3000, Australia; website https://acmedata.example
176176

177-
The Licensee (corporate): Beta Health, Inc., incorporated in Delaware on 22 May 2012; principal place of business: 500 Market St, San Francisco, CA 94105, USA; website https://betahealth.example
177+
The *Licensee* (corporate): Beta Health, Inc., incorporated in Delaware on 22 May 2012; principal place of business: 500 Market St, San Francisco, CA 94105, USA; website https://betahealth.example
178178

179179
The Government (politic, empowered authority): Department of Government Services (Victoria).
180180

0 commit comments

Comments
 (0)