Skip to content

Commit 6b67507

Browse files
committed
capture all categories instead of just first
1 parent ff8258e commit 6b67507

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -243,13 +243,12 @@ def extract_record_metadata(args, record):
243243
authors = record.findall(".//{http://arxiv.org/OAI/arXiv/}author")
244244
metadata["author_count"] = len(authors) if authors else 0
245245

246-
# Extract category (primary category from categories field)
246+
# Extract categories
247247
categories_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}categories")
248248
if categories_elem is not None and categories_elem.text:
249-
# Take first category as primary
250-
metadata["category"] = categories_elem.text.strip().split()[0]
249+
metadata["categories"] = categories_elem.text.strip().split()
251250
else:
252-
metadata["category"] = "Unknown"
251+
metadata["categories"] = False
253252

254253
# Set identifier
255254
metadata["identifer"] = identifer
@@ -387,16 +386,17 @@ def query_arxiv(args, session):
387386

388387
if args.show_added and metadata["added_on"]:
389388
cc_articles_added.append(metadata["added_on"])
390-
391389
identifer = metadata["identifer"]
392390

393391
# Count by author count and identifier
394392
author_count = metadata["author_count"]
395393
author_counts[identifer][author_count] += 1
396394

397395
# Count by category and identifier
398-
category = metadata["category"]
399-
category_counts[identifer][category] += 1
396+
categories = metadata["categories"]
397+
if metadata["categories"]:
398+
for category in categories:
399+
category_counts[identifer][category] += 1
400400

401401
# Count by identifier
402402
tool_counts[identifer] += 1

0 commit comments

Comments
 (0)