Commit 140931d

Rustem committed: async httpx concurrent impl
1 parent: a2f319e

6 files changed: 284 additions & 17 deletions

.github/workflows/publish.yml

Lines changed: 6 additions & 2 deletions
@@ -23,8 +23,12 @@ jobs:
       - name: Install package
         run: |
           python -m pip install --upgrade pip
-          pip install -e .
-
+          pip install -e .[dev]
+
+      - name: Run tests
+        run: |
+          python -m pytest tests/ -v
+
       - name: Test basic functionality
         run: |
           textnano --help || echo "CLI not fully configured yet"

pyproject.toml

Lines changed: 5 additions & 2 deletions
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "textnano"
-version = "0.1.0"
+version = "0.2.0"
 description = "A minimal text dataset builder inspired by lazynlp. Perfect for ML students who just want clean text datasets quickly."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -27,7 +27,10 @@ classifiers = [
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
 
-dependencies = [] # Zero dependencies as advertised!
+dependencies = [
+    "httpx>=0.27.0",
+    "protego>=0.3.0",
+]
 
 [project.optional-dependencies]
 dev = [

setup.py

Lines changed: 3 additions & 2 deletions
@@ -12,7 +12,7 @@
 
 setup(
     name="textnano",
-    version="0.1.0",
+    version="0.2.0",
     description="Minimal text dataset builder - Zero dependencies, single file, perfect for ML students",
     long_description=long_description,
     long_description_content_type="text/markdown",
@@ -22,7 +22,8 @@
     packages=find_packages(),
     python_requires=">=3.10",
     install_requires=[
-        # Zero dependencies - pure Python stdlib!
+        "httpx>=0.27.0",
+        "protego>=0.3.0",
     ],
     extras_require={
         "dev": [

textnano/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -4,26 +4,30 @@
 A single-file library to build text datasets from web URLs.
 Perfect for ML students who just want clean text quickly.
 
-Dependencies: ZERO (pure Python stdlib)
+Dependencies: httpx, protego (for async parallel crawling with robots.txt support)
 """
 
 from textnano.core import (
     download_text,
+    download_text_async,
     clean_html,
     text_fingerprint,
     is_duplicate,
     download_and_clean,
+    download_and_clean_async,
     estimate_dataset_size,
     merge_datasets,
 )
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 __all__ = [
     "download_text",
+    "download_text_async",
     "clean_html",
     "text_fingerprint",
     "is_duplicate",
     "download_and_clean",
+    "download_and_clean_async",
     "estimate_dataset_size",
     "merge_datasets",
 ]
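
The new download_text_async and download_and_clean_async are exported from textnano/core.py, whose diff is not reproduced in this view. Purely as an illustration of the httpx-based shape such a function takes (the actual signature and error handling in core.py are not shown here):

    import httpx

    async def download_text_async(url: str, timeout: int = 30) -> str | None:
        """Fetch a URL and return its body as text, or None on failure.

        Illustrative sketch only; not the core.py implementation.
        """
        try:
            async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
                resp = await client.get(url)
                resp.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
                return resp.text
        except httpx.HTTPError:
            return None

The str | None return annotation relies on Python 3.10+ union syntax, consistent with the package's requires-python = ">=3.10".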

textnano/cli.py

Lines changed: 14 additions & 4 deletions
@@ -2,8 +2,9 @@
 """CLI for textnano with extractor subcommands."""
 
 import sys
+import asyncio
 import argparse
-from textnano.core import download_and_clean, is_duplicate
+from textnano.core import download_and_clean_async, is_duplicate
 from textnano.utils import estimate_dataset_size, merge_datasets
 from textnano.extractors import extract_wikipedia_urls, extract_reddit_urls, extract_gutenberg_urls
 
@@ -29,6 +30,12 @@ def main():
                              help='Additional file extensions to exclude')
     urls_parser.add_argument('--no-default-excludes', action='store_true',
                              help='Disable default exclusion lists')
+    urls_parser.add_argument('--max-concurrent', '-c', type=int, default=10,
+                             help='Maximum concurrent requests (default: 10)')
+    urls_parser.add_argument('--no-robots', action='store_true',
+                             help='Ignore robots.txt (not recommended)')
+    urls_parser.add_argument('--timeout', '-t', type=int, default=30,
+                             help='Request timeout in seconds (default: 30)')
 
     # wikipedia command
     wiki_parser = subparsers.add_parser('wikipedia', help='Extract URLs from Wikipedia dump')
@@ -65,14 +72,17 @@ def main():
 
     # Handle commands
     if args.command == 'urls':
-        stats = download_and_clean(
+        stats = asyncio.run(download_and_clean_async(
             args.url_file,
             args.output_dir,
             max_urls=args.max_urls,
             exclude_domains=args.exclude_domains,
             exclude_extensions=args.exclude_extensions,
-            use_default_excludes=not args.no_default_excludes
-        )
+            use_default_excludes=not args.no_default_excludes,
+            max_concurrent=args.max_concurrent,
+            respect_robots=not args.no_robots,
+            timeout=args.timeout
+        ))
         dataset_stats = estimate_dataset_size(args.output_dir)
         print(f"\nDataset: {dataset_stats['files']} files, "
               f"{dataset_stats['words']:,} words, "
