Skip to content

Commit 6961aaa

Browse files
committed
Merge branch 'arxiv-minimal-fix' into arxiv-fetch
2 parents d4fe493 + 24bd796 commit 6961aaa

File tree

1 file changed

+59
-203
lines changed

1 file changed

+59
-203
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 59 additions & 203 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import traceback
1414
from collections import Counter, defaultdict
1515
from copy import copy
16-
from datetime import datetime, timezone
1716
from operator import itemgetter
1817

1918
# Third-party
@@ -36,167 +35,7 @@
3635
# Constants
3736
BASE_URL = "https://oaipmh.arxiv.org/oai"
3837
# Defaults should result in quick operation (not complete operation)
39-
# ArXiv Categories - manually curated from ArXiv official taxonomy
40-
# Source: https://arxiv.org/category_taxonomy
41-
CATEGORIES = {
42-
# Computer Science
43-
"cs.AI": "Artificial Intelligence",
44-
"cs.AR": "Hardware Architecture",
45-
"cs.CC": "Computational Complexity",
46-
"cs.CE": "Computational Engineering, Finance, and Science",
47-
"cs.CG": "Computational Geometry",
48-
"cs.CL": "Computation and Language",
49-
"cs.CR": "Cryptography and Security",
50-
"cs.CV": "Computer Vision and Pattern Recognition",
51-
"cs.CY": "Computers and Society",
52-
"cs.DB": "Databases",
53-
"cs.DC": "Distributed, Parallel, and Cluster Computing",
54-
"cs.DL": "Digital Libraries",
55-
"cs.DM": "Discrete Mathematics",
56-
"cs.DS": "Data Structures and Algorithms",
57-
"cs.ET": "Emerging Technologies",
58-
"cs.FL": "Formal Languages and Automata Theory",
59-
"cs.GL": "General Literature",
60-
"cs.GR": "Graphics",
61-
"cs.GT": "Computer Science and Game Theory",
62-
"cs.HC": "Human-Computer Interaction",
63-
"cs.IR": "Information Retrieval",
64-
"cs.IT": "Information Theory",
65-
"cs.LG": "Machine Learning",
66-
"cs.LO": "Logic in Computer Science",
67-
"cs.MA": "Multiagent Systems",
68-
"cs.MM": "Multimedia",
69-
"cs.MS": "Mathematical Software",
70-
"cs.NA": "Numerical Analysis",
71-
"cs.NE": "Neural and Evolutionary Computing",
72-
"cs.NI": "Networking and Internet Architecture",
73-
"cs.OH": "Other Computer Science",
74-
"cs.OS": "Operating Systems",
75-
"cs.PF": "Performance",
76-
"cs.PL": "Programming Languages",
77-
"cs.RO": "Robotics",
78-
"cs.SC": "Symbolic Computation",
79-
"cs.SD": "Sound",
80-
"cs.SE": "Software Engineering",
81-
"cs.SI": "Social and Information Networks",
82-
"cs.SY": "Systems and Control",
83-
# Mathematics
84-
"math.AC": "Commutative Algebra",
85-
"math.AG": "Algebraic Geometry",
86-
"math.AP": "Analysis of PDEs",
87-
"math.AT": "Algebraic Topology",
88-
"math.CA": "Classical Analysis and ODEs",
89-
"math.CO": "Combinatorics",
90-
"math.CT": "Category Theory",
91-
"math.CV": "Complex Variables",
92-
"math.DG": "Differential Geometry",
93-
"math.DS": "Dynamical Systems",
94-
"math.FA": "Functional Analysis",
95-
"math.GM": "General Mathematics",
96-
"math.GN": "General Topology",
97-
"math.GR": "Group Theory",
98-
"math.GT": "Geometric Topology",
99-
"math.HO": "History and Overview",
100-
"math.IT": "Information Theory",
101-
"math.KT": "K-Theory and Homology",
102-
"math.LO": "Logic",
103-
"math.MG": "Metric Geometry",
104-
"math.MP": "Mathematical Physics",
105-
"math.NA": "Numerical Analysis",
106-
"math.NT": "Number Theory",
107-
"math.OA": "Operator Algebras",
108-
"math.OC": "Optimization and Control",
109-
"math.PR": "Probability",
110-
"math.QA": "Quantum Algebra",
111-
"math.RA": "Rings and Algebras",
112-
"math.RT": "Representation Theory",
113-
"math.SG": "Symplectic Geometry",
114-
"math.SP": "Spectral Theory",
115-
"math.ST": "Statistics Theory",
116-
# Physics
117-
"physics.acc-ph": "Accelerator Physics",
118-
"physics.ao-ph": "Atmospheric and Oceanic Physics",
119-
"physics.app-ph": "Applied Physics",
120-
"physics.atm-clus": "Atomic and Molecular Clusters",
121-
"physics.atom-ph": "Atomic Physics",
122-
"physics.bio-ph": "Biological Physics",
123-
"physics.chem-ph": "Chemical Physics",
124-
"physics.class-ph": "Classical Physics",
125-
"physics.comp-ph": "Computational Physics",
126-
"physics.data-an": "Data Analysis, Statistics and Probability",
127-
"physics.ed-ph": "Physics Education",
128-
"physics.flu-dyn": "Fluid Dynamics",
129-
"physics.gen-ph": "General Physics",
130-
"physics.geo-ph": "Geophysics",
131-
"physics.hist-ph": "History and Philosophy of Physics",
132-
"physics.ins-det": "Instrumentation and Detectors",
133-
"physics.med-ph": "Medical Physics",
134-
"physics.optics": "Optics",
135-
"physics.plasm-ph": "Plasma Physics",
136-
"physics.pop-ph": "Popular Physics",
137-
"physics.soc-ph": "Physics and Society",
138-
"physics.space-ph": "Space Physics",
139-
# Statistics
140-
"stat.AP": "Applications",
141-
"stat.CO": "Computation",
142-
"stat.ME": "Methodology",
143-
"stat.ML": "Machine Learning",
144-
"stat.OT": "Other Statistics",
145-
"stat.TH": "Statistics Theory",
146-
# Quantitative Biology
147-
"q-bio.BM": "Biomolecules",
148-
"q-bio.CB": "Cell Behavior",
149-
"q-bio.GN": "Genomics",
150-
"q-bio.MN": "Molecular Networks",
151-
"q-bio.NC": "Neurons and Cognition",
152-
"q-bio.OT": "Other Quantitative Biology",
153-
"q-bio.PE": "Populations and Evolution",
154-
"q-bio.QM": "Quantitative Methods",
155-
"q-bio.SC": "Subcellular Processes",
156-
"q-bio.TO": "Tissues and Organs",
157-
# Economics
158-
"econ.EM": "Econometrics",
159-
"econ.GN": "General Economics",
160-
"econ.TH": "Theoretical Economics",
161-
# Electrical Engineering
162-
"eess.AS": "Audio and Speech Processing",
163-
"eess.IV": "Image and Video Processing",
164-
"eess.SP": "Signal Processing",
165-
"eess.SY": "Systems and Control",
166-
# High Energy Physics
167-
"hep-ex": "High Energy Physics - Experiment",
168-
"hep-lat": "High Energy Physics - Lattice",
169-
"hep-ph": "High Energy Physics - Phenomenology",
170-
"hep-th": "High Energy Physics - Theory",
171-
# Other Physics
172-
"astro-ph": "Astrophysics",
173-
"astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
174-
"astro-ph.EP": "Earth and Planetary Astrophysics",
175-
"astro-ph.GA": "Astrophysics of Galaxies",
176-
"astro-ph.HE": "High Energy Astrophysical Phenomena",
177-
"astro-ph.IM": "Instrumentation and Methods for Astrophysics",
178-
"astro-ph.SR": "Solar and Stellar Astrophysics",
179-
"cond-mat.dis-nn": "Disordered Systems and Neural Networks",
180-
"cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
181-
"cond-mat.mtrl-sci": "Materials Science",
182-
"cond-mat.other": "Other Condensed Matter",
183-
"cond-mat.quant-gas": "Quantum Gases",
184-
"cond-mat.soft": "Soft Condensed Matter",
185-
"cond-mat.stat-mech": "Statistical Mechanics",
186-
"cond-mat.str-el": "Strongly Correlated Electrons",
187-
"cond-mat.supr-con": "Superconductivity",
188-
"gr-qc": "General Relativity and Quantum Cosmology",
189-
"nlin.AO": "Adaptation and Self-Organizing Systems",
190-
"nlin.CD": "Chaotic Dynamics",
191-
"nlin.CG": "Cellular Automata and Lattice Gases",
192-
"nlin.PS": "Pattern Formation and Solitons",
193-
"nlin.SI": "Exactly Solvable and Integrable Systems",
194-
"nucl-ex": "Nuclear Experiment",
195-
"nucl-th": "Nuclear Theory",
196-
"quant-ph": "Quantum Physics",
197-
}
198-
DEFAULT_FETCH_LIMIT = 1000
199-
DEFAULT_YEARS_BACK = 5
38+
DEFAULT_FETCH_LIMIT = 4500 # Fetch 3 batches of 1,500 articles each
20039
# CSV file paths
20140
FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
20241
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
@@ -250,40 +89,13 @@ def parse_arguments():
25089
default=DEFAULT_FETCH_LIMIT,
25190
help=(
25291
"Limit number of fetched articles (default:"
253-
f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to remove limit."
92+
f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to fetch all articles"
93+
" (remove limit)."
25494
),
25595
)
256-
parser.add_argument(
257-
"--years-back",
258-
type=int,
259-
default=DEFAULT_YEARS_BACK,
260-
help=(
261-
"Number of years back from current year to fetch (default:"
262-
f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify 2008-02-05"
263-
" (first date a CC licensed article was added)."
264-
),
265-
)
266-
26796
args = parser.parse_args()
26897
if not args.enable_save and args.enable_git:
26998
parser.error("--enable-git requires --enable-save")
270-
# Restrict args.years_back to earliest datetime and initialize
271-
# args.from_date
272-
#
273-
# Survey of records indicated the first CC licenced article was added on
274-
# 2008-02-05
275-
earliest_date = datetime(2008, 2, 5, tzinfo=timezone.utc)
276-
this_year = datetime.now(timezone.utc).year
277-
if args.years_back == -1:
278-
arg_date = earliest_date
279-
else:
280-
start_year = this_year - args.years_back
281-
arg_date = datetime(start_year, 1, 1, tzinfo=timezone.utc)
282-
if arg_date < earliest_date:
283-
arg_date = earliest_date
284-
args.from_date = arg_date.strftime("%Y-%m-%d")
285-
args.years_back = this_year - arg_date.year
286-
28799
return args
288100

289101

@@ -334,6 +146,45 @@ def get_license_mapping():
334146
)
335147

336148

149+
def query_category_mapping(args, session):
    """
    Query to establish mapping of category codes and names.

    Sends an OAI-PMH ``ListSets`` request to BASE_URL and rebuilds the
    module-level CATEGORY_MAPPING dict, keyed by category code (in the form
    used in article records, e.g. ``math.AC``) with the set name as value.
    The resulting dict is sorted by category code.

    Also see https://arxiv.org/category_taxonomy

    Args:
        args: Parsed command line arguments (not used by this function;
            kept for a signature consistent with the other query functions).
        session: Session object used to perform the HTTP GET request.

    Raises:
        shared.QuantifyingException: If the request fails or the server
            responds with an HTTP error status.
    """
    global CATEGORY_MAPPING

    params = {"verb": "ListSets"}
    try:
        response = session.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1) from e
    except requests.RequestException as e:
        raise shared.QuantifyingException(f"Request Exception: {e}", 1) from e

    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    root = etree.fromstring(response.content)
    mapping = {}
    for set_ in root.findall(f".//{oai_ns}set"):
        # Look children up by tag instead of unpacking getchildren():
        # getchildren() is deprecated and a <set> element may also carry
        # optional <setDescription> children, which would break a
        # two-element unpack.
        spec = set_.findtext(f"{oai_ns}setSpec")
        name = set_.findtext(f"{oai_ns}setName")
        if not spec or not name:
            # Skip incomplete entries; consumers that look up a missing code
            # can fall back to the code itself
            continue
        # Ensure category code (key) matches code used in articles
        spec_list = spec.split(":")
        if len(spec_list) > 1:
            # Remove parent category and replace colon with period
            # 3 part examples:
            #   math:math:AC        => math.AC
            #   physics:astro-ph:CO => astro-ph.CO
            # 2 part examples
            #   physics:astro-ph    => astro-ph
            #   physics:quant-ph    => quant-ph
            spec_text = ".".join(spec_list[1:])
        else:
            spec_text = spec
        mapping[spec_text] = name
    # Assign once, after parsing succeeded, so the global is never left
    # partially populated
    CATEGORY_MAPPING = dict(sorted(mapping.items()))
186+
187+
337188
def extract_record_license(record):
338189
"""
339190
Extract CC license information from OAI-PMH XML record.
@@ -436,9 +287,12 @@ def query_arxiv(args, session):
436287
"""
437288
Query ArXiv OAI-PMH API and return information about CC licensed articles.
438289
"""
290+
if args.limit == -1:
291+
count_desc = "all"
292+
else:
293+
count_desc = f"a maximum of {args.limit}"
439294
LOGGER.info(
440-
f"Querying articles from {args.from_date} onwards ({args.years_back}"
441-
" years back)"
295+
f"Fetching {count_desc} articles starting form add date 2008-02-05"
442296
)
443297

444298
# Data structures for counting
@@ -457,19 +311,23 @@ def query_arxiv(args, session):
457311
# resumption token)
458312
proceed = True
459313
while proceed:
314+
if args.limit > 0 and args.limit <= total_fetched:
315+
proceed = False
316+
break
317+
460318
if resumption_token:
461319
# Continue with resumption token
462-
query_params = {
320+
params = {
463321
"verb": "ListRecords",
464322
"resumptionToken": resumption_token,
465323
}
466324
verb = "resuming"
467325
else:
468326
# Initial request with date range
469-
query_params = {
327+
params = {
470328
"verb": "ListRecords",
471329
"metadataPrefix": "arXiv",
472-
"from": args.from_date,
330+
"from": "2008-02-05", # First addition of CC licensed articles
473331
}
474332
verb = "starting"
475333

@@ -481,7 +339,7 @@ def query_arxiv(args, session):
481339

482340
try:
483341
# Build OAI-PMH request URL
484-
response = session.get(BASE_URL, params=query_params, timeout=60)
342+
response = session.get(BASE_URL, params=params, timeout=60)
485343
response.raise_for_status()
486344
except requests.HTTPError as e:
487345
raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
@@ -614,7 +472,7 @@ def write_data(args, data):
614472
rows = []
615473
for license_name, categories in data["category_counts"].items():
616474
for code, count in categories.items():
617-
label = CATEGORIES.get(code, code)
475+
label = CATEGORY_MAPPING.get(code, code)
618476
rows.append(
619477
{
620478
"TOOL_IDENTIFIER": license_name,
@@ -658,12 +516,9 @@ def write_provence(args, cc_articles_found):
658516
provenance_data = {
659517
"api_description": desc,
660518
"api_endpoint": BASE_URL,
661-
"arguments": {
662-
"from_date": args.from_date,
663-
"limit": args.limit,
664-
"years_back": args.years_back,
665-
},
666519
"cc_articles_found": cc_articles_found,
520+
"fetch_limit": args.limit,
521+
"from_add_date": "2008-02-05",
667522
"quarter": QUARTER,
668523
"script": os.path.basename(__file__),
669524
}
@@ -687,6 +542,7 @@ def main():
687542
initialize_all_data_files(args)
688543
get_license_mapping()
689544
session = shared.get_session()
545+
query_category_mapping(args, session)
690546
data, cc_articles_found = query_arxiv(args, session)
691547
write_data(args, data)
692548
write_provence(args, cc_articles_found)

0 commit comments

Comments
 (0)