Skip to content

Commit b2dbccf

Browse files
Merge pull request #99 from fcasalen/feature/ordered_list_item_with_value_attr
chg: handling list items with value attribute
2 parents: a117fb9 + 5d43159 · commit b2dbccf

2 files changed

Lines changed: 23 additions & 3 deletions

File tree

src/inscriptis/model/tag/list_tag.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
"""Handle the <li>, <ol>, <ul> tags."""
2+
from typing import Dict
23

34
from inscriptis.model.html_document_state import HtmlDocumentState
45

@@ -11,9 +12,16 @@ def get_bullet(state: HtmlDocumentState) -> str:
1112
return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN]
1213

1314

14-
def li_start_handler(state: HtmlDocumentState, _: dict) -> None:
15+
def li_start_handler(state: HtmlDocumentState, tag: Dict) -> None:
1516
"""Handle the <li> tag."""
1617
bullet = state.li_counter[-1] if state.li_counter else "* "
18+
if tag.get('value'):
19+
if tag.get('value').isdigit():
20+
bullet = int(tag.get('value'))
21+
if not state.li_counter:
22+
state.li_counter.append(bullet)
23+
else:
24+
bullet = tag.get('value')
1725
if isinstance(bullet, int):
1826
state.li_counter[-1] += 1
1927
state.tags[-1].list_bullet = f"{bullet}. "
@@ -23,7 +31,7 @@ def li_start_handler(state: HtmlDocumentState, _: dict) -> None:
2331
state.tags[-1].write("")
2432

2533

26-
def ul_start_handler(state: HtmlDocumentState, _: dict) -> None:
34+
def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None:
2735
"""Handle the <ul> tag."""
2836
state.li_counter.append(get_bullet(state))
2937

@@ -33,7 +41,7 @@ def ul_end_handler(state: HtmlDocumentState) -> None:
3341
state.li_counter.pop()
3442

3543

36-
def ol_start_handler(state: HtmlDocumentState, _: dict) -> None:
44+
def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None:
3745
"""Handle the <ol> tag."""
3846
state.li_counter.append(1)
3947

tests/test_list_div.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,3 +27,15 @@ def test_divs():
2727

2828
html = "<body>Thomas <ul><li> a <div>Anton</div>Maria</ul></body>"
2929
assert get_text(html, config) == "Thomas\n * a\n Anton\n Maria"
30+
31+
html = "<body>Thomas <ol><li> a <div>Anton</div>Maria</ol></body>"
32+
assert get_text(html, config) == "Thomas\n 1. a\n Anton\n Maria"
33+
34+
html = """<body>Thomas <ol><li value="2"> a <div>Anton</div>Maria</ol></body>"""
35+
assert get_text(html, config) == "Thomas\n 2. a\n Anton\n Maria"
36+
37+
html = """<body>Thomas <ol><li value="2"> a <ol><li><div>Anton</div></li></ol>Maria</ol></body>"""
38+
assert get_text(html, config) == "Thomas\n 2. a\n 1. Anton\n Maria"
39+
40+
html = """<body>Thomas <ol><li value="2"> a <ol><li value="10"><div>Anton</div></li></ol>Maria</ol></body>"""
41+
assert get_text(html, config) == "Thomas\n 2. a\n 10. Anton\n Maria"

0 commit comments

Comments
 (0)