Skip to content

Commit 84f5b3d

Browse files
committed
add handling of subsumed categories
1 parent fe2f386 commit 84f5b3d

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Fetch arXiv articles that use a CC legal tool using the OAI-PMH API.
44
OAI-PMH: Open Archives Initiative Protocol for Metadata Harvesting.
55
"""
6+
67
# Standard library
78
import argparse
89
import csv
@@ -61,6 +62,26 @@
6162
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
6263
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
6364
QUARTER = os.path.basename(PATHS["data_quarter"])
65+
SUBSUMED_CATEGORIES = {
66+
# https://arxiv.org/archive/alg-geom
67+
# "The alg-geom archive has been subsumed into Algebraic Geometry
68+
# (math.AG)."
69+
"alg-geom": "math.AG",
70+
# https://arxiv.org/archive/chao-dyn
71+
# "The chao-dyn archive has been subsumed into Chaotic Dynamics (nlin.CD)."
72+
"chao-dyn": "nlin.CD",
73+
# https://arxiv.org/archive/dg-ga
74+
# "The dg-ga archive has been subsumed into Differential Geometry
75+
# (math.DG)."
76+
"dg-ga": "math.DG",
77+
# https://arxiv.org/archive/solv-int
78+
# "The solv-int archive has been subsumed into Exactly Solvable and
79+
# Integrable Systems (nlin.SI)."
80+
"solv-int": "nlin.SI",
81+
# https://arxiv.org/archive/q-alg
82+
# "The q-alg archive has been subsumed into Quantum Algebra (math.QA)."
83+
"q-alg": "math.QA",
84+
}
6485

6586

6687
# parsing arguments function
@@ -247,6 +268,8 @@ def extract_record_metadata(args, record):
247268
categories_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}categories")
248269
if categories_elem is not None and categories_elem.text:
249270
metadata["categories"] = categories_elem.text.strip().split()
271+
for index, code in enumerate(metadata["categories"]):
272+
metadata["categories"][index] = SUBSUMED_CATEGORIES.get(code, code)
250273
else:
251274
metadata["categories"] = False
252275

0 commit comments

Comments
 (0)