Skip to content

Commit 0f601d0

Browse files
Merge pull request #101 from weblyzard/feature/improved-ci
Feature/improved ci
2 parents 70826c9 + e8e2828 commit 0f601d0

27 files changed

Lines changed: 151 additions & 170 deletions

.github/workflows/python-package.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
fail-fast: false
1313
matrix:
14-
python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ]
14+
python-version: [ '3.10', '3.11', '3.12', '3.13' ]
1515

1616
steps:
1717
- uses: actions/checkout@v3
@@ -22,7 +22,8 @@ jobs:
2222
- name: Install build environment
2323
run: |
2424
python -m pip install --upgrade pip
25-
python -m pip install tox setuptools pytest pytest-cov codecov
26-
- name: Build and test with tox.
25+
python -m pip install uv
26+
- name: Build and test with uv.
2727
run: |
28-
tox -vv -e flake8
28+
uv run ruff check
29+
uv build

benchmarking/run_benchmarking.py

Lines changed: 22 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
#!/usr/bin/env python3
2-
"""
3-
Runs a benchmarking suite to compare speed
4-
and output of different implementations.
5-
"""
2+
"""Run a benchmarking suite to compare speed and output of different implementations."""
63

74
import argparse
85
import operator
@@ -53,23 +50,16 @@
5350

5451

5552
class AbstractHtmlConverter:
56-
"""
57-
An abstract HTML convert class.
58-
"""
53+
"""An abstract HTML converter class."""
5954

6055
def get_text(self, html):
61-
"""
62-
Returns:
63-
a text representation of the given HTML snippet.
64-
"""
56+
"""Return a text representation of the given HTML snippet."""
6557
raise NotImplementedError
6658

6759
def benchmark(self, html):
68-
"""
69-
Benchmarks the classes HTML to text converter.
60+
"""Benchmark the class's HTML to text converter.
7061
71-
Returns:
72-
A tuple of the required time and the obtained text representation.
62+
Return a tuple of the required time and the obtained text representation.
7363
"""
7464
start_time = time()
7565
for _ in range(TRIES):
@@ -78,9 +68,7 @@ def benchmark(self, html):
7868

7969

8070
class BeautifulSoupHtmlConverter(AbstractHtmlConverter):
81-
"""
82-
Converts HTML to text using BeautifulSoup.
83-
"""
71+
"""Converts HTML to text using BeautifulSoup."""
8472

8573
name = "BeautifulSoup"
8674

@@ -100,9 +88,7 @@ def get_text(self, html):
10088

10189

10290
class JustextConverter(AbstractHtmlConverter):
103-
"""
104-
Converts HTML to text using Justtext.
105-
"""
91+
"""Converts HTML to text using Justtext."""
10692

10793
name = "Justtext"
10894

@@ -116,9 +102,7 @@ def get_text(self, html):
116102

117103

118104
class Html2TextConverter(AbstractHtmlConverter):
119-
"""
120-
Converts HTML to text using Html2Text.
121-
"""
105+
"""Converts HTML to text using Html2Text."""
122106

123107
name = "Html2Text"
124108

@@ -133,9 +117,7 @@ def get_text(self, html):
133117

134118

135119
class LynxConverter(AbstractHtmlConverter):
136-
"""
137-
Converts HTML to text using lynx.
138-
"""
120+
"""Converts HTML to text using lynx."""
139121

140122
name = "Lynx"
141123

@@ -166,9 +148,7 @@ def kill_lynx(pid):
166148

167149

168150
class LinksConverter(AbstractHtmlConverter):
169-
"""
170-
Converts HTML to text using links.
171-
"""
151+
"""Converts HTML to text using links."""
172152

173153
name = "Links"
174154

@@ -199,9 +179,7 @@ def kill_links(pid):
199179

200180

201181
class InscriptisHtmlConverter(AbstractHtmlConverter):
202-
"""
203-
Converts HTML to text using Inscriptis.
204-
"""
182+
"""Converts HTML to text using Inscriptis."""
205183

206184
name = "Inscriptis"
207185

@@ -217,18 +195,14 @@ def __init__(self):
217195

218196

219197
def save_to_file(algorithm, url, data, benchmarking_results_dir):
220-
"""
221-
Saves a benchmarking result to the given file.
222-
"""
198+
"""Save the benchmarking result to the given file."""
223199
result_file = os.path.join(benchmarking_results_dir, f"{algorithm}_{url}.txt")
224200
with open(result_file, "w") as output_file:
225201
output_file.write(data)
226202

227203

228204
def get_speed_table(times):
229-
"""
230-
Provides the table which compares the conversion speed.
231-
"""
205+
"""Provide the table which compares the conversion speed."""
232206
fastest = min((value for _, value in times.items()))
233207
longest_key = max(len(key) for key, _ in times.items())
234208
longest_value = max(len(str(value)) for _, value in times.items())
@@ -251,9 +225,7 @@ def get_speed_table(times):
251225

252226

253227
def get_fname(url) -> str:
254-
"""
255-
Transforms a URL to a file name.
256-
"""
228+
"""Transform a URL to a file name."""
257229
trash = (("http://", ""), ("https://", ""), ("/", "-"), (":", "-"), ("%", ""))
258230

259231
for key, value in trash:
@@ -272,9 +244,7 @@ def get_fname(url) -> str:
272244

273245

274246
def parse_args():
275-
"""
276-
Parse optional benchmarking arguments.
277-
"""
247+
"""Parse optional benchmarking arguments."""
278248
parser = argparse.ArgumentParser(description="Inscriptis benchmarking suite")
279249
parser.add_argument(
280250
"converter",
@@ -306,11 +276,11 @@ def parse_args():
306276

307277

308278
def _setup_benchmarking_directories(args):
309-
"""
310-
Setup the benchmarking result and caching directories.
279+
"""Set up the benchmarking result and caching directories.
311280
312281
Args:
313282
args: command line arguments that provide the directory names.
283+
314284
"""
315285
if not os.path.exists(args.benchmarking_results):
316286
os.makedirs(args.benchmarking_results)
@@ -319,16 +289,17 @@ def _setup_benchmarking_directories(args):
319289

320290

321291
def _fetch_url(url, cache_dir):
322-
"""
323-
Fetch the given URL either from the cache or from the Web.
292+
"""Fetch the given URL either from the cache or from the Web.
324293
325294
URLs that are not yet cached are added to the cache.
326295
327296
Args:
328297
url: the URL to fetch.
298+
cache_dir: the cache directory.
329299
330300
Returns:
331301
A tuple of the cache file name and the URL's content.
302+
332303
"""
333304
source_name = get_fname(url)
334305
source_cache_path = os.path.join(cache_dir, source_name)
@@ -349,14 +320,13 @@ def _fetch_url(url, cache_dir):
349320

350321

351322
def benchmark(args, source_list):
352-
"""
353-
Run the benchmark.
323+
"""Run the benchmark.
354324
355325
Args:
356326
args: command line arguments
357327
source_list: a list of URLs to benchmark.
358-
"""
359328
329+
"""
360330
_setup_benchmarking_directories(args)
361331

362332
output = []

examples/custom-html-handling.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
#!/usr/bin/env python3
22

3-
"""
4-
Custom HTML tag handling example.
3+
"""Custom HTML tag handling example.
54
65
Add a custom HTML handler for the bold <b> tag which encloses
76
bold text with "**".
87
98
Example:
109
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
10+
1111
"""
1212

1313
from lxml.html import fromstring

publish.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ case "$1" in
1919
# cleanup dist
2020
rm -rf ./dist
2121

22-
# build and publish packages
23-
poetry publish --build
22+
# build with hatchling and publish to PyPI
23+
uv build
24+
uv publish
2425
;;
2526
docker)
2627
echo "Publishing ${IMAGE_NAME} in version ${VERSION}"

pyproject.toml

Lines changed: 58 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ version = "2.7.0"
44
description = "inscriptis - HTML to text converter."
55
license = "Apache-2.0"
66
readme = "README.rst"
7-
requires-python = ">=3.9,<3.14"
7+
requires-python = ">=3.10,<3.15"
88

99
authors = [
1010
{ name = "Albert Weichselbraun", email = "albert.weichselbraun@fhgr.ch" },
@@ -20,12 +20,13 @@ classifiers = [
2020
"Topic :: Text Processing :: Markup :: HTML",
2121
"Topic :: Utilities",
2222
"Programming Language :: Python :: 3",
23-
"Programming Language :: Python :: 3.9",
2423
"Programming Language :: Python :: 3.10",
2524
"Programming Language :: Python :: 3.11",
2625
"Programming Language :: Python :: 3.12",
2726
"Programming Language :: Python :: 3.13",
27+
"Programming Language :: Python :: 3.14",
2828
]
29+
2930

3031
dependencies = [
3132
"requests>=2.32.3,<3.0.0",
@@ -49,14 +50,15 @@ web-service = [
4950

5051
[dependency-groups]
5152
dev = [
52-
"pytest>=8.3.5",
53+
"pytest>=9.0.1",
5354
"fastapi>=0.115.11,<1.0.0",
54-
"ruff>=0.11.12",
55+
"ruff>=0.14.5",
5556
"httpx>=0.28.1",
56-
"uvicorn>=0.34.2",
57-
"ty>=0.0.1a7",
58-
"pytest-cov>=6.1.1",
59-
"safety>=3.5.1",
57+
"uvicorn>=0.38.0",
58+
"ty>=0.0.1a26",
59+
"pytest-cov>=7.0.0",
60+
"safety>=3.7.0",
61+
"tox>=4.23.0",
6062
]
6163
[build-system]
6264
requires = ["hatchling"]
@@ -77,32 +79,56 @@ quote-style = "double"
7779

7880
[tool.ruff.lint]
7981
select = [
80-
# pycodestyle
81-
"E",
82-
# Pyflakes
83-
"F",
84-
# pyupgrade
85-
"UP",
86-
# flake8-builtins
87-
"A",
88-
# flake8-bugbear
89-
"B",
90-
# flake8-comprehensions
91-
"C4",
92-
# flake8-errmsg
93-
"EM",
94-
# flake8-quotes
95-
"Q",
96-
# flake8-pyi
97-
"PYI",
98-
# flake8-simplify
99-
"SIM",
100-
# isort
101-
"I",
102-
"RSE", "RET", "SLOT", "TID", "TC", "C90", "N", "PERF", "E", "W",
103-
"UP", "FURB", "RUF", "TRY", "YTT"
82+
"A", # flake8-builtins
83+
"B", # flake8-bugbear
84+
"COM", # flake8-commas - trailing commas
85+
"BLE", # flake8-blind-except - avoid blind `except Exception` handlers
86+
"D", # pydocstyle - docstring conventions
87+
"C4", # flake8-comprehensions
88+
"E", # pycodestyle
89+
"EM", # flake8-errmsg
90+
"F", # Pyflakes
91+
"FA", # flake8-future-annotations - use modern annotations
92+
"ICN", # flake8-import-conventions - standard import aliases
93+
"PIE", # flake8-pie
94+
"PLE", # Pylint - errors
95+
"PLW", # Pylint - warnings
96+
"PTH", # flake8-use-pathlib - prefer pathlib over os.path
97+
"PYI", # flake8-pyi
98+
"Q", # flake8-quotes
99+
"N", # pep8-naming
100+
"SIM", # flake8-simplify
101+
"I", # isort
102+
"RET", # flake8-return
103+
"RSE", "SLOT", "TID", "TC", "C90", "PERF", "E", "W",
104+
"FURB", "RUF", "TRY", "YTT",
105+
"TCH", # flake8-type-checking - optimize type checking imports
106+
"S", # flake8-bandit (security) — replaces dlint/bandit
107+
"UP", # pyupgrade
104108
]
105109

110+
ignore = [
111+
"D102", # missing docstring in public method
112+
"D105", # missing docstring in magic method
113+
"D107", # missing docstring in __init__
114+
"D203", # incorrect-blank-line-before-class
115+
"D213", # multi-line-summary-second-line
116+
]
117+
118+
[tool.ruff.lint.per-file-ignores]
119+
"tests/**/*.py" = [
120+
"S101", # allow asserts
121+
"D", # no docstring checks
122+
"S310", # allow URLs
123+
"PTH", # allow os.path (pathlib is not enforced here)
124+
]
125+
"benchmarking/*.py" = [
126+
"S310", # allow URLs
127+
"S603", # allow subprocess calls (skip the untrusted-input check)
128+
"PTH", # allow os.path (pathlib is not enforced here)
129+
]
130+
131+
106132
[tool.ty.src]
107133
root="./src"
108134

src/inscriptis/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def _get_html_tree(html_content: str) -> HtmlElement | None:
8383
8484
Returns:
8585
The corresponding HTML parse tree.
86+
8687
"""
8788
html_content = html_content.strip()
8889
if not html_content:
@@ -107,6 +108,7 @@ def get_text(html_content: str, config: ParserConfig | None = None) -> str:
107108
108109
Returns:
109110
The text representation of the HTML content.
111+
110112
"""
111113
html_tree = _get_html_tree(html_content)
112114
return Inscriptis(html_tree, config).get_text() if html_tree is not None else ""
@@ -128,6 +130,7 @@ def get_annotated_text(html_content: str, config: ParserConfig | None = None) ->
128130
129131
Returns:
130132
A dictionary of text (key: 'text') and annotations (key: 'label')
133+
131134
"""
132135
html_tree = _get_html_tree(html_content)
133136
if html_tree is None:

0 commit comments

Comments
 (0)