22"""CLI for textnano with extractor subcommands."""
33
44import sys
5+ import asyncio
56import argparse
6- from textnano .core import download_and_clean , is_duplicate
7+ from textnano .core import download_and_clean_async , is_duplicate
78from textnano .utils import estimate_dataset_size , merge_datasets
89from textnano .extractors import extract_wikipedia_urls , extract_reddit_urls , extract_gutenberg_urls
910
@@ -29,6 +30,12 @@ def main():
                              help='Additional file extensions to exclude')
     urls_parser.add_argument('--no-default-excludes', action='store_true',
                              help='Disable default exclusion lists')
+    urls_parser.add_argument('--max-concurrent', '-c', type=int, default=10,
+                             help='Maximum concurrent requests (default: 10)')
+    urls_parser.add_argument('--no-robots', action='store_true',
+                             help='Ignore robots.txt (not recommended)')
+    urls_parser.add_argument('--timeout', '-t', type=int, default=30,
+                             help='Request timeout in seconds (default: 30)')
 
     # wikipedia command
     wiki_parser = subparsers.add_parser('wikipedia', help='Extract URLs from Wikipedia dump')
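(With these flags the urls subcommand might be invoked as, e.g., `textnano urls urls.txt output/ --max-concurrent 20 --timeout 15`; this assumes the package installs a `textnano` console script, and the positional url_file/output_dir arguments are inferred from the handler in the next hunk, not shown here.)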
@@ -65,14 +72,17 @@ def main():
 
     # Handle commands
     if args.command == 'urls':
-        stats = download_and_clean(
+        stats = asyncio.run(download_and_clean_async(
             args.url_file,
             args.output_dir,
             max_urls=args.max_urls,
             exclude_domains=args.exclude_domains,
             exclude_extensions=args.exclude_extensions,
-            use_default_excludes=not args.no_default_excludes
-        )
+            use_default_excludes=not args.no_default_excludes,
+            max_concurrent=args.max_concurrent,
+            respect_robots=not args.no_robots,
+            timeout=args.timeout
+        ))
         dataset_stats = estimate_dataset_size(args.output_dir)
         print(f"\nDataset: {dataset_stats['files']} files, "
               f"{dataset_stats['words']:,} words, "
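The diff only touches the CLI; the download_and_clean_async coroutine it calls lives in textnano.core and is not shown here. A minimal sketch of how the new max_concurrent and timeout parameters could bound a batch download, assuming a stdlib-only implementation (fetch_all, fetch_one, and _get are illustrative names, not textnano's API):

# Illustrative only: textnano.core's real download_and_clean_async is not
# in this diff. This shows one way a semaphore can cap concurrent fetches.
import asyncio
import urllib.request

async def fetch_all(urls, max_concurrent=10, timeout=30):
    sem = asyncio.Semaphore(max_concurrent)  # at most N requests in flight

    def _get(url):
        # urllib is blocking, so it runs in a worker thread below
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")

    async def fetch_one(url):
        async with sem:  # wait for a free slot before fetching
            try:
                return url, await asyncio.to_thread(_get, url)
            except Exception:
                return url, None  # a real impl would record the failure

    # respect_robots would gate fetch_one here, e.g. via urllib.robotparser
    return await asyncio.gather(*(fetch_one(u) for u in urls))

# Mirrors the CLI handler: asyncio.run() drives the whole batch.
pages = asyncio.run(fetch_all(["https://example.com"], max_concurrent=5))

A semaphore keeps the pipeline full as fast URLs complete, rather than processing the list in fixed-size chunks where one slow response stalls its whole batch.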