55from .scrape import _validate_scrape
66import multiprocessing as mp
77from functools import partial
8+ from tqdm import tqdm
89
910logger = logging .getLogger (__name__ )
1011
def _process_file(f):
    """Read a single article file and validate its HTML.

    Args:
        f: Path of the file to read.

    Returns:
        A ``(f, html)`` tuple. ``html`` is the text read from ``f`` when the
        file could be read and ``_validate_scrape`` accepted it; otherwise it
        is ``None``. The path is always returned so the caller can tell which
        file a result belongs to.
    """
    logger.info("Processing article %s...", f)
    try:
        # The context manager closes the handle even if read() raises; the
        # previous bare open(f).read() left that to the garbage collector.
        with open(f) as fh:
            html = fh.read()
    except Exception as e:
        # Deliberately best-effort: an unreadable file is logged and
        # reported as a None result instead of aborting the caller.
        logger.warning("Failed to read file %s: %s", f, e)
        return f, None

    if not _validate_scrape(html):
        logger.warning("Invalid HTML for %s", f)
        return f, None

    return f, html
25-
26-
2712def _process_file_with_source (args ):
2813 """Helper function to read, validate, and identify source for a single file."""
2914 f , source_configs = args
30- logger .info ("Processing article %s..." % f )
3115 try :
3216 html = open (f ).read ()
3317 except Exception as e :
@@ -65,18 +49,18 @@ def _parse_article(args):
6549 # Fallback to original source identification
6650 source = manager .identify_source (html )
6751 if source is None :
68- logger .warning ("Could not identify source for %s" % f )
52+ logger .info ("Could not identify source for %s" % f )
6953 return f , None
7054
7155 article = source .parse_article (html , pmid , metadata_dir = metadata_dir , ** kwargs )
7256 return f , article
7357 except Exception as e :
74- logger .warning ("Error parsing article %s: %s" % (f , str (e )))
58+ logger .info ("Error parsing article %s: %s" % (f , str (e )))
7559 return f , None
7660
7761
7862def add_articles (db , files , commit = True , table_dir = None , limit = None ,
79- pmid_filenames = False , metadata_dir = None , force_ingest = True , num_workers = None , ** kwargs ):
63+ pmid_filenames = False , metadata_dir = None , force_ingest = True , num_workers = None , use_readability = None , ** kwargs ):
8064 ''' Process articles and add their data to the DB.
8165 Args:
8266 files: The path to the article(s) to process. Can be a single
@@ -100,10 +84,13 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
10084 force_ingest: Ingest even if no source is identified.
10185 num_workers: Number of worker processes to use when processing in parallel.
10286 If None (default), uses the number of CPUs available on the system.
87+ use_readability: When True, use readability.py for HTML cleaning if available.
88+ When False, use fallback HTML processing by default. If None (default),
89+ uses the value from config.USE_READABILITY.
10390 kwargs: Additional keyword arguments to pass to parse_article.
10491 '''
10592
106- manager = sources .SourceManager (table_dir )
93+ manager = sources .SourceManager (table_dir , use_readability = use_readability if use_readability is not None else config . USE_READABILITY )
10794
10895 # Prepare source configurations for parallel processing
10996 source_configs = {name : source .identifiers for name , source in manager .sources .items ()}
@@ -123,11 +110,11 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
123110 # Process files in parallel to extract HTML content and identify sources
124111 process_args = [(f , source_configs ) for f in files ]
125112 with mp .Pool (processes = num_workers ) as pool :
126- file_html_source_tuples = pool .map (_process_file_with_source , process_args )
113+ file_html_source_tuples = list ( tqdm ( pool .imap_unordered (_process_file_with_source , process_args ), total = len ( process_args ), desc = "Processing files" ) )
127114 else :
128115 # Process files sequentially
129116 file_html_source_tuples = []
130- for f in files :
117+ for f in tqdm ( files , desc = "Processing files" ) :
131118 result = _process_file_with_source ((f , source_configs ))
132119 file_html_source_tuples .append (result )
133120
@@ -142,7 +129,7 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
142129 # Filter out articles that already exist in the database
143130 files_to_process = []
144131 missing_sources = []
145-
132+
146133 for f , html , source_name in valid_files :
147134 pmid = path .splitext (path .basename (f ))[0 ] if pmid_filenames else None
148135
@@ -160,16 +147,16 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
160147 if num_workers is not None and num_workers != 1 and parse_args :
161148 # Parse articles in parallel
162149 with mp .Pool (processes = num_workers ) as pool :
163- parsed_articles = pool .map (_parse_article , parse_args )
150+ parsed_articles = list ( tqdm ( pool .imap_unordered (_parse_article , parse_args ), total = len ( parse_args ), desc = "Parsing articles" ) )
164151 else :
165152 # Parse articles sequentially
166153 parsed_articles = []
167- for args in parse_args :
154+ for args in tqdm ( parse_args , desc = "Parsing articles" ) :
168155 parsed_articles .append (_parse_article (args ))
169156
170157 # Add successfully parsed articles to database
171158 for i , (f , article ) in enumerate (parsed_articles ):
172- if article is None :
159+ if article in [ None , False ] :
173160 missing_sources .append (f )
174161 continue
175162
0 commit comments