Skip to content

Commit 84913cd

Browse files
chg: add support for <li> value attributes.
1 parent 5bd02c3 commit 84913cd

6 files changed

Lines changed: 63 additions & 29 deletions

File tree

src/inscriptis/model/tag/list_tag.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
"""Handle the <li>, <ol>, <ul> tags."""
2-
from typing import Dict
32

43
from inscriptis.model.html_document_state import HtmlDocumentState
54

@@ -12,16 +11,14 @@ def get_bullet(state: HtmlDocumentState) -> str:
1211
return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN]
1312

1413

15-
def li_start_handler(state: HtmlDocumentState, tag: Dict) -> None:
14+
def li_start_handler(state: HtmlDocumentState, tag: dict) -> None:
1615
"""Handle the <li> tag."""
1716
bullet = state.li_counter[-1] if state.li_counter else "* "
18-
if tag.get('value'):
19-
if tag.get('value').isdigit():
20-
bullet = int(tag.get('value'))
21-
if not state.li_counter:
22-
state.li_counter.append(bullet)
23-
else:
24-
bullet = tag.get('value')
17+
# Value can only be used for numerical bullets.
18+
if "value" in tag and tag["value"].isdigit() and isinstance(bullet, int):
19+
bullet = int(tag["value"])
20+
state.li_counter[-1] = bullet
21+
2522
if isinstance(bullet, int):
2623
state.li_counter[-1] += 1
2724
state.tags[-1].list_bullet = f"{bullet}. "
@@ -31,7 +28,7 @@ def li_start_handler(state: HtmlDocumentState, tag: Dict) -> None:
3128
state.tags[-1].write("")
3229

3330

34-
def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None:
31+
def ul_start_handler(state: HtmlDocumentState, _: dict) -> None:
3532
"""Handle the <ul> tag."""
3633
state.li_counter.append(get_bullet(state))
3734

@@ -41,7 +38,7 @@ def ul_end_handler(state: HtmlDocumentState) -> None:
4138
state.li_counter.pop()
4239

4340

44-
def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None:
41+
def ol_start_handler(state: HtmlDocumentState, _: dict) -> None:
4542
"""Handle the <ol> tag."""
4643
state.li_counter.append(1)
4744

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<ol>
2+
<li value="100">Coffee</li>
3+
<li>Tea</li>
4+
<li>Milk</li>
5+
<li value="7">Water</li>
6+
<li>Juice</li>
7+
<li>Beer</li>
8+
</ol>
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
100. Coffee
2+
101. Tea
3+
102. Milk
4+
7. Water
5+
8. Juice
6+
9. Beer

tests/html/enumeration-value.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
100. Coffee
2-
101. Tea
3-
102. Milk
4-
103. Water
5-
104. Juice
6-
105. Beer
1+
100. Coffee
2+
101. Tea
3+
102. Milk
4+
103. Water
5+
104. Juice
6+
105. Beer

tests/test_list_div.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,3 @@ def test_divs():
2727

2828
html = "<body>Thomas <ul><li> a <div>Anton</div>Maria</ul></body>"
2929
assert get_text(html, config) == "Thomas\n * a\n Anton\n Maria"
30-
31-
html = "<body>Thomas <ol><li> a <div>Anton</div>Maria</ol></body>"
32-
assert get_text(html, config) == "Thomas\n 1. a\n Anton\n Maria"
33-
34-
html = """<body>Thomas <ol><li value="2"> a <div>Anton</div>Maria</ol></body>"""
35-
assert get_text(html, config) == "Thomas\n 2. a\n Anton\n Maria"
36-
37-
html = """<body>Thomas <ol><li value="2"> a <ol><li><div>Anton</div></li></ol>Maria</ol></body>"""
38-
assert get_text(html, config) == "Thomas\n 2. a\n 1. Anton\n Maria"
39-
40-
html = """<body>Thomas <ol><li value="2"> a <ol><li value="10"><div>Anton</div></li></ol>Maria</ol></body>"""
41-
assert get_text(html, config) == "Thomas\n 2. a\n 10. Anton\n Maria"

tests/test_list_value.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python
2+
3+
"""Test list value in ordered and unordered lists.
4+
"""
5+
6+
from inscriptis import get_text
7+
from inscriptis.css_profiles import CSS_PROFILES
8+
from inscriptis.model.config import ParserConfig
9+
10+
config = ParserConfig(css=CSS_PROFILES["strict"])
11+
12+
13+
def test_value():
14+
html = "<body>Thomas <ol><li> a <div>Anton</div>Maria</ol></body>"
15+
assert get_text(html, config) == "Thomas\n 1. a\n Anton\n Maria"
16+
17+
html = """<body>Thomas <ol><li value="2"> a <div>Anton</div>Maria</ol></body>"""
18+
assert get_text(html, config) == "Thomas\n 2. a\n Anton\n Maria"
19+
20+
html = """<body>Thomas <ol><li value="2"> a <ol><li><div>Anton</div></li></ol>Maria</ol></body>"""
21+
assert get_text(html, config) == "Thomas\n 2. a\n 1. Anton\n Maria"
22+
23+
html = """<body>Thomas <ol><li value="2"> a <ol><li value="10"><div>Anton</div></li></ol>Maria</ol></body>"""
24+
assert get_text(html, config) == "Thomas\n 2. a\n 10. Anton\n Maria"
25+
26+
27+
def test_value_without_ol():
28+
"""Behavior if the <ol> tag is missing."""
29+
html = """<body>Thomas <li value="2">Maria</li><li>Ana</li></body>"""
30+
assert get_text(html, config) == "Thomas\n* Maria\n* Ana"
31+
32+
html = """<body>Thomas <li value="2">Maria</li><li>Ana</li></ul></body>"""
33+
assert get_text(html, config) == "Thomas\n* Maria\n* Ana"
34+
35+

0 commit comments

Comments
 (0)