22"""CLI for textnano with extractor subcommands."""
33
44import sys
5+ import asyncio
56import argparse
6- from textnano .core import download_and_clean , is_duplicate
7+ from textnano .core import download_and_clean_async , is_duplicate
78from textnano .utils import estimate_dataset_size , merge_datasets
89from textnano .extractors import extract_wikipedia_urls , extract_reddit_urls , extract_gutenberg_urls
910
@@ -29,6 +30,12 @@ def main():
                              help='Additional file extensions to exclude')
     urls_parser.add_argument('--no-default-excludes', action='store_true',
                              help='Disable default exclusion lists')
+    urls_parser.add_argument('--max-concurrent', '-c', type=int, default=10,
+                             help='Maximum concurrent requests (default: 10)')
+    urls_parser.add_argument('--no-robots', action='store_true',
+                             help='Ignore robots.txt (not recommended)')
+    urls_parser.add_argument('--timeout', '-t', type=int, default=30,
+                             help='Request timeout in seconds (default: 30)')
 
     # wikipedia command
     wiki_parser = subparsers.add_parser('wikipedia', help='Extract URLs from Wikipedia dump')
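(With these flags the urls subcommand might be invoked as, e.g., `textnano urls urls.txt output/ --max-concurrent 20 --timeout 15`; this assumes the package installs a `textnano` console script, and the positional url_file/output_dir arguments are inferred from the handler in the next hunk, not shown here.)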
@@ -65,14 +72,17 @@ def main():
 
     # Handle commands
     if args.command == 'urls':
-        stats = download_and_clean(
+        stats = asyncio.run(download_and_clean_async(
             args.url_file,
             args.output_dir,
             max_urls=args.max_urls,
             exclude_domains=args.exclude_domains,
             exclude_extensions=args.exclude_extensions,
-            use_default_excludes=not args.no_default_excludes
-        )
+            use_default_excludes=not args.no_default_excludes,
+            max_concurrent=args.max_concurrent,
+            respect_robots=not args.no_robots,
+            timeout=args.timeout
+        ))
         dataset_stats = estimate_dataset_size(args.output_dir)
         print(f"\nDataset: {dataset_stats['files']} files, "
               f"{dataset_stats['words']:,} words, "
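The diff only touches the CLI; the download_and_clean_async coroutine it calls lives in textnano.core and is not shown here. A minimal sketch of how the new max_concurrent and timeout parameters could bound a batch download, assuming a stdlib-only implementation (fetch_all, fetch_one, and _get are illustrative names, not textnano's API):

# Illustrative only: textnano.core's real download_and_clean_async is not
# in this diff. This shows one way a semaphore can cap concurrent fetches.
import asyncio
import urllib.request

async def fetch_all(urls, max_concurrent=10, timeout=30):
    sem = asyncio.Semaphore(max_concurrent)  # at most N requests in flight

    def _get(url):
        # urllib is blocking, so it runs in a worker thread below
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")

    async def fetch_one(url):
        async with sem:  # wait for a free slot before fetching
            try:
                return url, await asyncio.to_thread(_get, url)
            except Exception:
                return url, None  # a real impl would record the failure

    # respect_robots would gate fetch_one here, e.g. via urllib.robotparser
    return await asyncio.gather(*(fetch_one(u) for u in urls))

# Mirrors the CLI handler: asyncio.run() drives the whole batch.
pages = asyncio.run(fetch_all(["https://example.com"], max_concurrent=5))

A semaphore keeps the pipeline full as fast URLs complete, rather than processing the list in fixed-size chunks where one slow response stalls its whole batch.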