Skip to content

Commit 7448af1

Browse files
committed
Made requested changes
1 parent 4c642a0 commit 7448af1

2 files changed

Lines changed: 23 additions & 16 deletions

File tree

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,12 @@
1919
from requests.adapters import HTTPAdapter
2020
from urllib3.util.retry import Retry
2121

22-
# Add parent directory so shared can be imported
23-
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
2422
# First-party/Local
2523
import shared # noqa: E402
24+
from shared import STATUS_FORCELIST, USER_AGENT
25+
26+
# Add parent directory so shared can be imported
27+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
2628

2729
# Setup
2830
LOGGER, PATHS = shared.setup(__file__)
@@ -33,15 +35,6 @@
3335
QUARTER = os.path.basename(PATHS["data_quarter"])
3436
WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
3537
WIKIPEDIA_MATRIX_URL = "https://meta.wikimedia.org/w/api.php"
36-
WIKIPEDIA_RETRY_STATUS_FORCELIST = [
37-
408, # Request Timeout
38-
422, # Unprocessable Content (Validation failed, or endpoint spammed)
39-
429, # Too Many Requests
40-
500, # Internal Server Error
41-
502, # Bad Gateway
42-
503, # Service Unavailable
43-
504, # Gateway Timeout
44-
]
4538

4639

4740
def parse_arguments():
@@ -70,13 +63,11 @@ def get_requests_session():
7063
max_retries = Retry(
7164
total=5,
7265
backoff_factor=10,
73-
status_forcelist=WIKIPEDIA_RETRY_STATUS_FORCELIST,
66+
status_forcelist=STATUS_FORCELIST,
7467
)
7568
session = requests.Session()
7669
session.mount("https://", HTTPAdapter(max_retries=max_retries))
77-
session.headers.update(
78-
{"User-Agent": "quantifying-wikipedia-fetch/1.0 (contact@example.com)"}
79-
)
70+
session.headers.update({"User-Agent": USER_AGENT})
8071
return session
8172

8273

@@ -136,7 +127,9 @@ def query_wikipedia_languages(session):
136127
stats = data["query"]["statistics"]
137128

138129
article_count = stats.get("articles", 0)
139-
130+
if article_count == 0:
131+
LOGGER.info(f"Skipping {language_name} with 0 articles")
132+
continue
140133
tool_data.append(
141134
{
142135
"LANGUAGE_CODE": site["code"],

scripts/shared.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@
77
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
88
from pandas import PeriodIndex
99

10+
USER_AGENT = (
11+
"QuantifyingTheCommons/1.0 "
12+
"(https://github.com/creativecommons/quantifying)"
13+
)
14+
STATUS_FORCELIST = [
15+
408, # Request Timeout
16+
422, # Unprocessable Content (Validation failed, or endpoint spammed)
17+
429, # Too Many Requests
18+
500, # Internal Server Error
19+
502, # Bad Gateway
20+
503, # Service Unavailable
21+
504, # Gateway Timeout
22+
]
23+
1024

1125
class QuantifyingException(Exception):
1226
def __init__(self, message, exit_code=None):

0 commit comments

Comments
 (0)