|
10 | 10 |
|
11 | 11 | logger = logging.getLogger(__name__) |
12 | 12 |
|
| 13 | + |
def _article_quality_score(article):
    """Score a parsed article for same-PMID duplicate resolution.

    Scores are tuples compared lexicographically; higher is better. Records
    that actually contain extracted activations and usable tables rank above
    placeholder/blocked pages.

    Args:
        article: Parsed article object, or None. Its ``tables`` and
            ``missing_source`` attributes are read with ``getattr`` and
            treated as empty / False when absent or falsy. Each table's
            ``activations`` attribute is read the same way.

    Returns:
        A 4-tuple ``(n_activations, n_tables, -missing_source, has_tables)``
        of ints. ``None`` yields ``(-1, -1, -1, -1)``, which compares lower
        than the score of any real article.
    """
    if article is None:
        return (-1, -1, -1, -1)

    tables = getattr(article, "tables", None) or []
    n_tables = len(tables)
    n_activations = sum(
        len(getattr(table, "activations", None) or []) for table in tables
    )
    missing_source = 1 if getattr(article, "missing_source", False) else 0

    # Tuple order is the tie-break order:
    #   1) most activations
    #   2) most tables
    #   3) prefer parses that are not flagged missing_source (negated so
    #      that "not missing" compares higher)
    #   4) presence of any table at all -- fully determined by n_tables, so
    #      it never decides a comparison; kept so the tuple shape is stable.
    return (n_activations, n_tables, -missing_source, int(n_tables > 0))
| 34 | + |
| 35 | + |
13 | 36 | def _process_file_with_source(args): |
14 | 37 | """Helper function to read, validate, and identify source for a file.""" |
15 | 38 | f, source_configs = args |
@@ -50,8 +73,12 @@ def _parse_article(args): |
50 | 73 | # Fallback to original source identification |
51 | 74 | source = manager.identify_source(html) |
52 | 75 | if source is None: |
53 | | - logger.info("Could not identify source for %s", f) |
54 | | - return f, None |
| 76 | + if force_ingest and getattr(manager, "default_source", None) is not None: |
| 77 | + logger.info("Could not identify source for %s; using DefaultSource fallback", f) |
| 78 | + source = manager.default_source |
| 79 | + else: |
| 80 | + logger.info("Could not identify source for %s", f) |
| 81 | + return f, None |
55 | 82 |
|
56 | 83 | article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs) |
57 | 84 | if not article: |
@@ -163,23 +190,44 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None, |
163 | 190 | for args in tqdm(parse_args, desc="Parsing articles"): |
164 | 191 | parsed_articles.append(_parse_article(args)) |
165 | 192 |
|
166 | | - # Add successfully parsed articles to database |
| 193 | + # Add successfully parsed articles to database. |
| 194 | + # When pmid_filenames=True we can see duplicate PMID files from different |
| 195 | + # source folders (e.g., one blocked/challenge page + one valid page). Keep |
| 196 | + # the best parsed candidate per PMID within this run. |
167 | 197 | missing_sources = [] |
168 | | - for i, (f, article) in enumerate(parsed_articles): |
169 | | - if article is None: |
170 | | - missing_sources.append(f) |
171 | | - continue |
172 | | - |
| 198 | + if pmid_filenames: |
| 199 | + best_by_pmid = {} |
| 200 | + for f, article in parsed_articles: |
| 201 | + pmid = path.splitext(path.basename(f))[0] |
| 202 | + if article is None: |
| 203 | + missing_sources.append(f) |
| 204 | + continue |
| 205 | + |
| 206 | + score = _article_quality_score(article) |
| 207 | + existing = best_by_pmid.get(pmid) |
| 208 | + if existing is None or score > existing[2]: |
| 209 | + best_by_pmid[pmid] = (f, article, score) |
| 210 | + |
| 211 | + selected_articles = [(f, article) for (f, article, _) in best_by_pmid.values()] |
| 212 | + else: |
| 213 | + selected_articles = [] |
| 214 | + for f, article in parsed_articles: |
| 215 | + if article is None: |
| 216 | + missing_sources.append(f) |
| 217 | + continue |
| 218 | + selected_articles.append((f, article)) |
| 219 | + |
| 220 | + for i, (f, article) in enumerate(selected_articles): |
173 | 221 | if get_config('SAVE_ARTICLES_WITHOUT_ACTIVATIONS') or article.tables: |
174 | 222 | pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None |
175 | 223 | if pmid and db.article_exists(pmid): |
176 | 224 | if get_config('OVERWRITE_EXISTING_ROWS'): |
177 | 225 | db.delete_article(pmid) |
178 | 226 | else: |
179 | 227 | continue |
180 | | - |
| 228 | + |
181 | 229 | db.add(article) |
182 | | - if commit and (i % 100 == 0 or i == len(parsed_articles) - 1): |
| 230 | + if commit and (i % 100 == 0 or i == len(selected_articles) - 1): |
183 | 231 | db.save() |
184 | 232 |
|
185 | 233 | db.save() |
|
0 commit comments