Skip to content

Commit a2f319e

Browse files
author
Rustem
committed
Refactor CLI to use argparse properly and add test step to workflow
- Fix __main__.py to import from cli.py instead of core.py - Remove unnecessary manual argv validation in merge command (argparse handles this) - Add pytest test step to GitHub workflow before basic functionality tests
1 parent 58f0849 commit a2f319e

5 files changed

Lines changed: 16 additions & 134 deletions

File tree

.github/workflows/test-publish.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,12 @@ jobs:
2525
- name: Install package
2626
run: |
2727
python -m pip install --upgrade pip
28-
pip install -e .
29-
28+
pip install -e .[dev]
29+
30+
- name: Run tests
31+
run: |
32+
python -m pytest tests/ -v
33+
3034
- name: Test basic functionality
3135
run: |
3236
textnano --help || echo "CLI not fully configured yet"

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ textnano merge output_*/ final_dataset/
236236
| Feature | textnano | lazynlp | beautifulsoup + requests |
237237
|---------|----------|---------|--------------------------|
238238
| **Files** | 1 | 5 | Your custom code |
239-
| **LOC** | ~200 | ~800 | 100-200 (yours) |
240239
| **Dependencies** | 0 | 2 | 2+ |
241240
| **Learning time** | 5 min | 30 min | 1 hour |
242241
| **Deduplication** | ✅ | ✅ | ❌ (you implement) |

textnano/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Entry point for running textnano as a module: python -m textnano
33
"""
44

5-
from textnano.core import main
5+
from textnano.cli import main
66

77
if __name__ == '__main__':
88
main()

textnano/cli.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,6 @@ def main():
9595
print(f"Avg/file: {stats['avg_words_per_file']} words")
9696

9797
elif args.command == 'merge':
98-
if len(args.dirs) < 2:
99-
print("Usage: textnano merge <dir1> <dir2> ... <output_dir>")
100-
sys.exit(1)
10198
output = args.dirs[-1]
10299
inputs = args.dirs[:-1]
103100
merge_datasets(*inputs, output_dir=output, is_duplicate_func=is_duplicate)

textnano/core.py

Lines changed: 9 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -38,39 +38,34 @@
3838
# DOWNLOAD
3939
# =============================================================================
4040

41-
def download_text(url: str, timeout: int = 30) -> Optional[str]:
41+
def download_text(url: str, timeout: int = 30) -> str:
4242
"""Download and extract text from a URL.
4343
4444
Returns:
45-
str or None: Cleaned text content, or None if failed
45+
str: Cleaned text content, or empty string if failed
4646
"""
4747
try:
48-
# Download
4948
headers = {'User-Agent': 'Mozilla/5.0'}
5049
req = urllib.request.Request(url, headers=headers)
5150

52-
# Create SSL context that doesn't verify certificates
5351
context = ssl.create_default_context()
5452
context.check_hostname = False
5553
context.verify_mode = ssl.CERT_NONE
5654

5755
with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
5856
content = response.read().decode('utf-8', errors='ignore')
5957

60-
# Basic HTML cleaning
61-
text = clean_html(content)
62-
63-
return text if text.strip() else None
58+
return clean_html(content)
6459

6560
except urllib.error.HTTPError as e:
6661
logging.error(f"HTTP error {e.code} for {url}: {e.reason}")
67-
return None
62+
return ""
6863
except urllib.error.URLError as e:
6964
logging.error(f"URL error for {url}: {e.reason}")
70-
return None
65+
return ""
7166
except Exception as e:
7267
logging.error(f"Unexpected error for {url}: {type(e).__name__}: {e}")
73-
return None
68+
return ""
7469

7570

7671
# =============================================================================
@@ -86,21 +81,12 @@ def clean_html(html_content: str) -> str:
8681
Returns:
8782
str: Clean text
8883
"""
89-
# Unescape HTML entities
90-
text = html.unescape(html_content)
91-
92-
# Remove script and style tags
93-
text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
84+
text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
9485
text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
95-
96-
# Remove HTML tags
9786
text = re.sub(r'<[^>]+>', '', text)
98-
99-
# Normalize whitespace
87+
text = html.unescape(text)
10088
text = re.sub(r'\s+', ' ', text)
10189
text = re.sub(r'\n\s*\n', '\n\n', text)
102-
103-
# Remove leading/trailing whitespace
10490
text = text.strip()
10591

10692
return text
@@ -120,7 +106,7 @@ def text_fingerprint(text: str, n: int = 8) -> str:
120106
Returns:
121107
str: MD5 hash of first N words
122108
"""
123-
words = text.lower().split()[:n]
109+
words = text.lower().split(maxsplit=n)[:n]
124110
fingerprint_text = ' '.join(words)
125111
return hashlib.md5(fingerprint_text.encode()).hexdigest()
126112

@@ -261,107 +247,3 @@ def download_and_clean(url_file: str, output_dir: str, min_words: int = 50, max_
261247
print_stats(stats)
262248

263249
return stats
264-
265-
266-
# =============================================================================
267-
# CLI
268-
# =============================================================================
269-
270-
def main():
271-
"""Command-line interface."""
272-
import sys
273-
import argparse
274-
275-
# Check for simple commands (backward compatibility)
276-
if len(sys.argv) >= 2 and sys.argv[1] == 'stats':
277-
if len(sys.argv) < 3:
278-
print("Usage: textnano stats <dir>")
279-
sys.exit(1)
280-
stats = estimate_dataset_size(sys.argv[2])
281-
print(f"Files: {stats['files']}")
282-
print(f"Words: {stats['words']:,}")
283-
print(f"Size: {stats['mb']:.1f} MB")
284-
print(f"Avg/file: {stats['avg_words_per_file']} words")
285-
return
286-
287-
if len(sys.argv) >= 2 and sys.argv[1] == 'merge':
288-
if len(sys.argv) < 4:
289-
print("Usage: textnano merge <dir1> <dir2> ... <output_dir>")
290-
sys.exit(1)
291-
output = sys.argv[-1]
292-
inputs = sys.argv[2:-1]
293-
merge_datasets(*inputs, output_dir=output, is_duplicate_func=is_duplicate)
294-
return
295-
296-
# Parse arguments
297-
parser = argparse.ArgumentParser(
298-
description='textnano - Minimal text dataset builder',
299-
formatter_class=argparse.RawDescriptionHelpFormatter
300-
)
301-
parser.add_argument('url_file', help='File with URLs (one per line)')
302-
parser.add_argument('output_dir', help='Output directory')
303-
parser.add_argument('max_urls', nargs='?', type=int, default=None,
304-
help='Maximum URLs to process')
305-
parser.add_argument('--exclude-domains', '-ed', nargs='+',
306-
help='Additional domains to exclude (adds to defaults)')
307-
parser.add_argument('--exclude-extensions', '-ee', nargs='+',
308-
help='Additional file extensions to exclude (adds to defaults)')
309-
parser.add_argument('--no-default-excludes', action='store_true',
310-
help='Disable default exclusion lists (only use custom excludes)')
311-
312-
args = parser.parse_args()
313-
314-
# Download command
315-
stats = download_and_clean(
316-
args.url_file,
317-
args.output_dir,
318-
max_urls=args.max_urls,
319-
exclude_domains=args.exclude_domains,
320-
exclude_extensions=args.exclude_extensions,
321-
use_default_excludes=not args.no_default_excludes
322-
)
323-
324-
# Show dataset stats
325-
dataset_stats = estimate_dataset_size(args.output_dir)
326-
print(f"\nDataset: {dataset_stats['files']} files, "
327-
f"{dataset_stats['words']:,} words, "
328-
f"{dataset_stats['mb']:.1f} MB")
329-
330-
331-
if __name__ == '__main__':
332-
main()
333-
334-
335-
# =============================================================================
336-
# USAGE EXAMPLES (copy these to test)
337-
# =============================================================================
338-
339-
"""
340-
# Example 1: Basic usage
341-
python textnano.py urls.txt dataset/
342-
343-
# Example 2: Limit to 100 URLs
344-
python textnano.py urls.txt dataset/ 100
345-
346-
# Example 3: In Python
347-
import textnano
348-
349-
textnano.download_and_clean('urls.txt', 'output/')
350-
stats = textnano.estimate_dataset_size('output/')
351-
print(f"Got {stats['words']:,} words")
352-
353-
# Example 4: Create sample URLs file
354-
cat > urls.txt << EOF
355-
https://en.wikipedia.org/wiki/Machine_learning
356-
https://en.wikipedia.org/wiki/Deep_learning
357-
https://en.wikipedia.org/wiki/Natural_language_processing
358-
https://en.wikipedia.org/wiki/Computer_vision
359-
https://www.gutenberg.org/files/1342/1342-h/1342-h.htm
360-
EOF
361-
362-
# Example 5: Get stats
363-
python textnano.py stats dataset/
364-
365-
# Example 6: Merge datasets
366-
python textnano.py merge dataset1/ dataset2/ merged/
367-
"""

0 commit comments

Comments
 (0)