1313import traceback
1414from collections import Counter , defaultdict
1515from copy import copy
16- from datetime import datetime , timezone
1716from operator import itemgetter
1817
1918# Third-party
3635# Constants
3736BASE_URL = "https://oaipmh.arxiv.org/oai"
3837# Defaults should result in quick operation (not complete operation)
39- # ArXiv Categories - manually curated from ArXiv official taxonomy
40- # Source: https://arxiv.org/category_taxonomy
41- CATEGORIES = {
42- # Computer Science
43- "cs.AI" : "Artificial Intelligence" ,
44- "cs.AR" : "Hardware Architecture" ,
45- "cs.CC" : "Computational Complexity" ,
46- "cs.CE" : "Computational Engineering, Finance, and Science" ,
47- "cs.CG" : "Computational Geometry" ,
48- "cs.CL" : "Computation and Language" ,
49- "cs.CR" : "Cryptography and Security" ,
50- "cs.CV" : "Computer Vision and Pattern Recognition" ,
51- "cs.CY" : "Computers and Society" ,
52- "cs.DB" : "Databases" ,
53- "cs.DC" : "Distributed, Parallel, and Cluster Computing" ,
54- "cs.DL" : "Digital Libraries" ,
55- "cs.DM" : "Discrete Mathematics" ,
56- "cs.DS" : "Data Structures and Algorithms" ,
57- "cs.ET" : "Emerging Technologies" ,
58- "cs.FL" : "Formal Languages and Automata Theory" ,
59- "cs.GL" : "General Literature" ,
60- "cs.GR" : "Graphics" ,
61- "cs.GT" : "Computer Science and Game Theory" ,
62- "cs.HC" : "Human-Computer Interaction" ,
63- "cs.IR" : "Information Retrieval" ,
64- "cs.IT" : "Information Theory" ,
65- "cs.LG" : "Machine Learning" ,
66- "cs.LO" : "Logic in Computer Science" ,
67- "cs.MA" : "Multiagent Systems" ,
68- "cs.MM" : "Multimedia" ,
69- "cs.MS" : "Mathematical Software" ,
70- "cs.NA" : "Numerical Analysis" ,
71- "cs.NE" : "Neural and Evolutionary Computing" ,
72- "cs.NI" : "Networking and Internet Architecture" ,
73- "cs.OH" : "Other Computer Science" ,
74- "cs.OS" : "Operating Systems" ,
75- "cs.PF" : "Performance" ,
76- "cs.PL" : "Programming Languages" ,
77- "cs.RO" : "Robotics" ,
78- "cs.SC" : "Symbolic Computation" ,
79- "cs.SD" : "Sound" ,
80- "cs.SE" : "Software Engineering" ,
81- "cs.SI" : "Social and Information Networks" ,
82- "cs.SY" : "Systems and Control" ,
83- # Mathematics
84- "math.AC" : "Commutative Algebra" ,
85- "math.AG" : "Algebraic Geometry" ,
86- "math.AP" : "Analysis of PDEs" ,
87- "math.AT" : "Algebraic Topology" ,
88- "math.CA" : "Classical Analysis and ODEs" ,
89- "math.CO" : "Combinatorics" ,
90- "math.CT" : "Category Theory" ,
91- "math.CV" : "Complex Variables" ,
92- "math.DG" : "Differential Geometry" ,
93- "math.DS" : "Dynamical Systems" ,
94- "math.FA" : "Functional Analysis" ,
95- "math.GM" : "General Mathematics" ,
96- "math.GN" : "General Topology" ,
97- "math.GR" : "Group Theory" ,
98- "math.GT" : "Geometric Topology" ,
99- "math.HO" : "History and Overview" ,
100- "math.IT" : "Information Theory" ,
101- "math.KT" : "K-Theory and Homology" ,
102- "math.LO" : "Logic" ,
103- "math.MG" : "Metric Geometry" ,
104- "math.MP" : "Mathematical Physics" ,
105- "math.NA" : "Numerical Analysis" ,
106- "math.NT" : "Number Theory" ,
107- "math.OA" : "Operator Algebras" ,
108- "math.OC" : "Optimization and Control" ,
109- "math.PR" : "Probability" ,
110- "math.QA" : "Quantum Algebra" ,
111- "math.RA" : "Rings and Algebras" ,
112- "math.RT" : "Representation Theory" ,
113- "math.SG" : "Symplectic Geometry" ,
114- "math.SP" : "Spectral Theory" ,
115- "math.ST" : "Statistics Theory" ,
116- # Physics
117- "physics.acc-ph" : "Accelerator Physics" ,
118- "physics.ao-ph" : "Atmospheric and Oceanic Physics" ,
119- "physics.app-ph" : "Applied Physics" ,
120- "physics.atm-clus" : "Atomic and Molecular Clusters" ,
121- "physics.atom-ph" : "Atomic Physics" ,
122- "physics.bio-ph" : "Biological Physics" ,
123- "physics.chem-ph" : "Chemical Physics" ,
124- "physics.class-ph" : "Classical Physics" ,
125- "physics.comp-ph" : "Computational Physics" ,
126- "physics.data-an" : "Data Analysis, Statistics and Probability" ,
127- "physics.ed-ph" : "Physics Education" ,
128- "physics.flu-dyn" : "Fluid Dynamics" ,
129- "physics.gen-ph" : "General Physics" ,
130- "physics.geo-ph" : "Geophysics" ,
131- "physics.hist-ph" : "History and Philosophy of Physics" ,
132- "physics.ins-det" : "Instrumentation and Detectors" ,
133- "physics.med-ph" : "Medical Physics" ,
134- "physics.optics" : "Optics" ,
135- "physics.plasm-ph" : "Plasma Physics" ,
136- "physics.pop-ph" : "Popular Physics" ,
137- "physics.soc-ph" : "Physics and Society" ,
138- "physics.space-ph" : "Space Physics" ,
139- # Statistics
140- "stat.AP" : "Applications" ,
141- "stat.CO" : "Computation" ,
142- "stat.ME" : "Methodology" ,
143- "stat.ML" : "Machine Learning" ,
144- "stat.OT" : "Other Statistics" ,
145- "stat.TH" : "Statistics Theory" ,
146- # Quantitative Biology
147- "q-bio.BM" : "Biomolecules" ,
148- "q-bio.CB" : "Cell Behavior" ,
149- "q-bio.GN" : "Genomics" ,
150- "q-bio.MN" : "Molecular Networks" ,
151- "q-bio.NC" : "Neurons and Cognition" ,
152- "q-bio.OT" : "Other Quantitative Biology" ,
153- "q-bio.PE" : "Populations and Evolution" ,
154- "q-bio.QM" : "Quantitative Methods" ,
155- "q-bio.SC" : "Subcellular Processes" ,
156- "q-bio.TO" : "Tissues and Organs" ,
157- # Economics
158- "econ.EM" : "Econometrics" ,
159- "econ.GN" : "General Economics" ,
160- "econ.TH" : "Theoretical Economics" ,
161- # Electrical Engineering
162- "eess.AS" : "Audio and Speech Processing" ,
163- "eess.IV" : "Image and Video Processing" ,
164- "eess.SP" : "Signal Processing" ,
165- "eess.SY" : "Systems and Control" ,
166- # High Energy Physics
167- "hep-ex" : "High Energy Physics - Experiment" ,
168- "hep-lat" : "High Energy Physics - Lattice" ,
169- "hep-ph" : "High Energy Physics - Phenomenology" ,
170- "hep-th" : "High Energy Physics - Theory" ,
171- # Other Physics
172- "astro-ph" : "Astrophysics" ,
173- "astro-ph.CO" : "Cosmology and Nongalactic Astrophysics" ,
174- "astro-ph.EP" : "Earth and Planetary Astrophysics" ,
175- "astro-ph.GA" : "Astrophysics of Galaxies" ,
176- "astro-ph.HE" : "High Energy Astrophysical Phenomena" ,
177- "astro-ph.IM" : "Instrumentation and Methods for Astrophysics" ,
178- "astro-ph.SR" : "Solar and Stellar Astrophysics" ,
179- "cond-mat.dis-nn" : "Disordered Systems and Neural Networks" ,
180- "cond-mat.mes-hall" : "Mesoscale and Nanoscale Physics" ,
181- "cond-mat.mtrl-sci" : "Materials Science" ,
182- "cond-mat.other" : "Other Condensed Matter" ,
183- "cond-mat.quant-gas" : "Quantum Gases" ,
184- "cond-mat.soft" : "Soft Condensed Matter" ,
185- "cond-mat.stat-mech" : "Statistical Mechanics" ,
186- "cond-mat.str-el" : "Strongly Correlated Electrons" ,
187- "cond-mat.supr-con" : "Superconductivity" ,
188- "gr-qc" : "General Relativity and Quantum Cosmology" ,
189- "nlin.AO" : "Adaptation and Self-Organizing Systems" ,
190- "nlin.CD" : "Chaotic Dynamics" ,
191- "nlin.CG" : "Cellular Automata and Lattice Gases" ,
192- "nlin.PS" : "Pattern Formation and Solitons" ,
193- "nlin.SI" : "Exactly Solvable and Integrable Systems" ,
194- "nucl-ex" : "Nuclear Experiment" ,
195- "nucl-th" : "Nuclear Theory" ,
196- "quant-ph" : "Quantum Physics" ,
197- }
198- DEFAULT_FETCH_LIMIT = 1000
199- DEFAULT_YEARS_BACK = 5
38+ DEFAULT_FETCH_LIMIT = 4500 # Fetch 3 batches of 1,500 articles each
20039# CSV file paths
20140FILE_ARXIV_AUTHOR_BUCKET = shared .path_join (
20241 PATHS ["data_1-fetch" ], "arxiv_4_count_by_author_bucket.csv"
@@ -250,40 +89,13 @@ def parse_arguments():
25089 default = DEFAULT_FETCH_LIMIT ,
25190 help = (
25291 "Limit number of fetched articles (default:"
253- f" { DEFAULT_FETCH_LIMIT } ). Use a value of -1 to remove limit."
92+ f" { DEFAULT_FETCH_LIMIT } ). Use a value of -1 to fetch all articles"
93+ " (remove limit)."
25494 ),
25595 )
256- parser .add_argument (
257- "--years-back" ,
258- type = int ,
259- default = DEFAULT_YEARS_BACK ,
260- help = (
261- "Number of years back from current year to fetch (default:"
262- f" { DEFAULT_YEARS_BACK } ). Use a value of -1 to specify 2008-02-05"
263- " (first date a CC licensed article was added)."
264- ),
265- )
266-
26796 args = parser .parse_args ()
26897 if not args .enable_save and args .enable_git :
26998 parser .error ("--enable-git requires --enable-save" )
270- # Restrict args.years_back to earliest datetime and initialize
271- # args.from_date
272- #
273- # Survey of records indicated the first CC licenced article was added on
274- # 2008-02-05
275- earliest_date = datetime (2008 , 2 , 5 , tzinfo = timezone .utc )
276- this_year = datetime .now (timezone .utc ).year
277- if args .years_back == - 1 :
278- arg_date = earliest_date
279- else :
280- start_year = this_year - args .years_back
281- arg_date = datetime (start_year , 1 , 1 , tzinfo = timezone .utc )
282- if arg_date < earliest_date :
283- arg_date = earliest_date
284- args .from_date = arg_date .strftime ("%Y-%m-%d" )
285- args .years_back = this_year - arg_date .year
286-
28799 return args
288100
289101
@@ -334,6 +146,45 @@ def get_license_mapping():
334146 )
335147
336148
def query_category_mapping(args, session):
    """
    Query to establish mapping of category codes and names.

    Sends an OAI-PMH ``ListSets`` request to BASE_URL and stores the result
    in the module-level CATEGORY_MAPPING dictionary (category code => name),
    sorted by category code.

    Also see https://arxiv.org/category_taxonomy

    Args:
        args: Parsed command-line arguments (currently unused; kept so the
            signature matches the other query functions).
        session: Session object used to perform the HTTP GET request.

    Raises:
        shared.QuantifyingException: If the HTTP request fails.
    """
    global CATEGORY_MAPPING

    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    params = {"verb": "ListSets"}
    try:
        response = session.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1) from e
    except requests.RequestException as e:
        raise shared.QuantifyingException(f"Request Exception: {e}", 1) from e

    root = etree.fromstring(response.content)
    # Build into a local dictionary so that the global is only replaced once
    # the whole response has been parsed successfully
    mapping = {}
    for set_ in root.findall(f".//{oai_ns}set"):
        # Look children up by tag instead of by position: a set may also
        # contain optional setDescription elements, which would break
        # positional unpacking of the children
        spec = (set_.findtext(f"{oai_ns}setSpec") or "").strip()
        if not spec:
            # Without a setSpec there is no category code to use as key
            continue
        name = (set_.findtext(f"{oai_ns}setName") or "").strip()
        # Ensure category code (key) matches code used in articles
        spec_list = spec.split(":")
        if len(spec_list) > 1:
            # Remove parent category and replace colon with period
            #   3 part examples:
            #       math:math:AC        => math.AC
            #       physics:astro-ph:CO => astro-ph.CO
            #   2 part examples
            #       physics:astro-ph    => astro-ph
            #       physics:quant-ph    => quant-ph
            code = ".".join(spec_list[1:])
        else:
            code = spec
        # Fall back to the code itself when the set has no usable name
        mapping[code] = name or code
    CATEGORY_MAPPING = dict(sorted(mapping.items()))
186+
187+
337188def extract_record_license (record ):
338189 """
339190 Extract CC license information from OAI-PMH XML record.
@@ -436,9 +287,12 @@ def query_arxiv(args, session):
436287 """
437288 Query ArXiv OAI-PMH API and return information about CC licensed articles.
438289 """
290+ if args .limit == - 1 :
291+ count_desc = "all"
292+ else :
293+ count_desc = f"a maximum of { args .limit } "
439294 LOGGER .info (
440- f"Querying articles from { args .from_date } onwards ({ args .years_back } "
441- " years back)"
295+ f"Fetching { count_desc } articles starting form add date 2008-02-05"
442296 )
443297
444298 # Data structures for counting
@@ -457,19 +311,23 @@ def query_arxiv(args, session):
457311 # resumption token)
458312 proceed = True
459313 while proceed :
314+ if args .limit > 0 and args .limit <= total_fetched :
315+ proceed = False
316+ break
317+
460318 if resumption_token :
461319 # Continue with resumption token
462- query_params = {
320+ params = {
463321 "verb" : "ListRecords" ,
464322 "resumptionToken" : resumption_token ,
465323 }
466324 verb = "resuming"
467325 else :
468326 # Initial request with date range
469- query_params = {
327+ params = {
470328 "verb" : "ListRecords" ,
471329 "metadataPrefix" : "arXiv" ,
472- "from" : args . from_date ,
330+ "from" : "2008-02-05" , # First addition of CC licensed articles
473331 }
474332 verb = "starting"
475333
@@ -481,7 +339,7 @@ def query_arxiv(args, session):
481339
482340 try :
483341 # Build OAI-PMH request URL
484- response = session .get (BASE_URL , params = query_params , timeout = 60 )
342+ response = session .get (BASE_URL , params = params , timeout = 60 )
485343 response .raise_for_status ()
486344 except requests .HTTPError as e :
487345 raise shared .QuantifyingException (f"HTTP Error: { e } " , 1 )
@@ -614,7 +472,7 @@ def write_data(args, data):
614472 rows = []
615473 for license_name , categories in data ["category_counts" ].items ():
616474 for code , count in categories .items ():
617- label = CATEGORIES .get (code , code )
475+ label = CATEGORY_MAPPING .get (code , code )
618476 rows .append (
619477 {
620478 "TOOL_IDENTIFIER" : license_name ,
@@ -658,12 +516,9 @@ def write_provence(args, cc_articles_found):
658516 provenance_data = {
659517 "api_description" : desc ,
660518 "api_endpoint" : BASE_URL ,
661- "arguments" : {
662- "from_date" : args .from_date ,
663- "limit" : args .limit ,
664- "years_back" : args .years_back ,
665- },
666519 "cc_articles_found" : cc_articles_found ,
520+ "fetch_limit" : args .limit ,
521+ "from_add_date" : "2008-02-05" ,
667522 "quarter" : QUARTER ,
668523 "script" : os .path .basename (__file__ ),
669524 }
@@ -687,6 +542,7 @@ def main():
687542 initialize_all_data_files (args )
688543 get_license_mapping ()
689544 session = shared .get_session ()
545+ query_category_mapping (args , session )
690546 data , cc_articles_found = query_arxiv (args , session )
691547 write_data (args , data )
692548 write_provence (args , cc_articles_found )
0 commit comments