# =============================================================================
# DOWNLOAD
# =============================================================================

def download_text(url: str, timeout: int = 30, *, verify_ssl: bool = False) -> str:
    """Download and extract text from a URL.

    Args:
        url: HTTP(S) URL to fetch.
        timeout: Socket timeout in seconds (default 30).
        verify_ssl: When True, enforce TLS certificate verification.
            Defaults to False to preserve the original permissive behavior.

    Returns:
        str: Cleaned text content, or empty string if failed.
    """
    try:
        # Some servers reject urllib's default User-Agent.
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib.request.Request(url, headers=headers)

        context = ssl.create_default_context()
        if not verify_ssl:
            # SECURITY NOTE: certificate verification is disabled by default,
            # which allows man-in-the-middle attacks. Pass verify_ssl=True
            # when downloading over untrusted networks.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE

        with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
            # Tolerate non-UTF-8 pages rather than failing the whole download.
            content = response.read().decode('utf-8', errors='ignore')

        return clean_html(content)

    except urllib.error.HTTPError as e:
        logging.error(f"HTTP error {e.code} for {url}: {e.reason}")
        return ""
    except urllib.error.URLError as e:
        logging.error(f"URL error for {url}: {e.reason}")
        return ""
    except Exception as e:
        # Broad catch is deliberate: one bad URL must not abort a batch run.
        logging.error(f"Unexpected error for {url}: {type(e).__name__}: {e}")
        return ""
7469
7570
7671# =============================================================================
def clean_html(html_content: str) -> str:
    """Strip HTML markup and return plain whitespace-normalized text.

    Args:
        html_content: Raw HTML markup.

    Returns:
        str: Clean text.
    """
    # Drop script/style blocks entirely — their contents are not prose.
    text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Remove remaining tags BEFORE unescaping entities, so encoded text such
    # as "&lt;b&gt;" is not mistaken for markup and stripped.
    text = re.sub(r'<[^>]+>', '', text)
    text = html.unescape(text)
    # Collapse every whitespace run (including newlines) to a single space.
    # NOTE: the former follow-up r'\n\s*\n' substitution was dead code — no
    # newlines can survive this pass — so it has been removed.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def text_fingerprint(text: str, n: int = 8) -> str:
    """Hash the first *n* words of *text* (case-insensitive) for dedup.

    Args:
        text: Input text.
        n: Number of leading words to include (default 8).

    Returns:
        str: MD5 hex digest of the first N lowercased words.
    """
    # maxsplit=n avoids splitting the full text; slice drops the remainder.
    leading = text.lower().split(maxsplit=n)[:n]
    return hashlib.md5(' '.join(leading).encode()).hexdigest()
126112
@@ -261,107 +247,3 @@ def download_and_clean(url_file: str, output_dir: str, min_words: int = 50, max_
261247 print_stats (stats )
262248
263249 return stats
264-
265-
# =============================================================================
# CLI
# =============================================================================

def main():
    """Command-line interface."""
    import sys
    import argparse

    argv = sys.argv

    # Lightweight subcommands kept outside argparse for backward compatibility.
    if len(argv) >= 2 and argv[1] == 'stats':
        if len(argv) < 3:
            print("Usage: textnano stats <dir>")
            sys.exit(1)
        dir_stats = estimate_dataset_size(argv[2])
        print(f"Files: {dir_stats['files']}")
        print(f"Words: {dir_stats['words']:,}")
        print(f"Size: {dir_stats['mb']:.1f} MB")
        print(f"Avg/file: {dir_stats['avg_words_per_file']} words")
        return

    if len(argv) >= 2 and argv[1] == 'merge':
        if len(argv) < 4:
            print("Usage: textnano merge <dir1> <dir2> ... <output_dir>")
            sys.exit(1)
        # Last positional argument is the destination; the rest are sources.
        merge_datasets(*argv[2:-1], output_dir=argv[-1], is_duplicate_func=is_duplicate)
        return

    arg_parser = argparse.ArgumentParser(
        description='textnano - Minimal text dataset builder',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    arg_parser.add_argument('url_file', help='File with URLs (one per line)')
    arg_parser.add_argument('output_dir', help='Output directory')
    arg_parser.add_argument('max_urls', nargs='?', type=int, default=None,
                            help='Maximum URLs to process')
    arg_parser.add_argument('--exclude-domains', '-ed', nargs='+',
                            help='Additional domains to exclude (adds to defaults)')
    arg_parser.add_argument('--exclude-extensions', '-ee', nargs='+',
                            help='Additional file extensions to exclude (adds to defaults)')
    arg_parser.add_argument('--no-default-excludes', action='store_true',
                            help='Disable default exclusion lists (only use custom excludes)')
    opts = arg_parser.parse_args()

    # Default command: download and clean every URL in the file.
    download_and_clean(
        opts.url_file,
        opts.output_dir,
        max_urls=opts.max_urls,
        exclude_domains=opts.exclude_domains,
        exclude_extensions=opts.exclude_extensions,
        use_default_excludes=not opts.no_default_excludes
    )

    # Summarize what ended up on disk.
    ds = estimate_dataset_size(opts.output_dir)
    print(f"\nDataset: {ds['files']} files, "
          f"{ds['words']:,} words, "
          f"{ds['mb']:.1f} MB")


if __name__ == '__main__':
    main()
333-
334-
335- # =============================================================================
336- # USAGE EXAMPLES (copy these to test)
337- # =============================================================================
338-
339- """
340- # Example 1: Basic usage
341- python textnano.py urls.txt dataset/
342-
343- # Example 2: Limit to 100 URLs
344- python textnano.py urls.txt dataset/ 100
345-
346- # Example 3: In Python
347- import textnano
348-
349- textnano.download_and_clean('urls.txt', 'output/')
350- stats = textnano.estimate_dataset_size('output/')
351- print(f"Got {stats['words']:,} words")
352-
353- # Example 4: Create sample URLs file
354- cat > urls.txt << EOF
355- https://en.wikipedia.org/wiki/Machine_learning
356- https://en.wikipedia.org/wiki/Deep_learning
357- https://en.wikipedia.org/wiki/Natural_language_processing
358- https://en.wikipedia.org/wiki/Computer_vision
359- https://www.gutenberg.org/files/1342/1342-h/1342-h.htm
360- EOF
361-
362- # Example 5: Get stats
363- python textnano.py stats dataset/
364-
365- # Example 6: Merge datasets
366- python textnano.py merge dataset1/ dataset2/ merged/
367- """