|
19 | 19 | from requests.adapters import HTTPAdapter |
20 | 20 | from urllib3.util.retry import Retry |
21 | 21 |
|
22 | 22 | # Add parent directory so shared can be imported |
23 | 23 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) |
24 | 24 | # First-party/Local |
25 | 25 | import shared  # noqa: E402 |
 | 26 | +from shared import STATUS_FORCELIST, USER_AGENT  # noqa: E402 |
 | 27 | + |
26 | 28 |
|
27 | 29 | # Setup |
28 | 30 | LOGGER, PATHS = shared.setup(__file__) |
|
33 | 35 | QUARTER = os.path.basename(PATHS["data_quarter"]) |
34 | 36 | WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php" |
35 | 37 | WIKIPEDIA_MATRIX_URL = "https://meta.wikimedia.org/w/api.php" |
36 | | -WIKIPEDIA_RETRY_STATUS_FORCELIST = [ |
37 | | - 408, # Request Timeout |
38 | | - 422, # Unprocessable Content (Validation failed, or endpoint spammed) |
39 | | - 429, # Too Many Requests |
40 | | - 500, # Internal Server Error |
41 | | - 502, # Bad Gateway |
42 | | - 503, # Service Unavailable |
43 | | - 504, # Gateway Timeout |
44 | | -] |
45 | 38 |
|
46 | 39 |
|
47 | 40 | def parse_arguments(): |
@@ -70,13 +63,11 @@ def get_requests_session(): |
70 | 63 | max_retries = Retry( |
71 | 64 | total=5, |
72 | 65 | backoff_factor=10, |
73 | | - status_forcelist=WIKIPEDIA_RETRY_STATUS_FORCELIST, |
| 66 | + status_forcelist=STATUS_FORCELIST, |
74 | 67 | ) |
75 | 68 | session = requests.Session() |
76 | 69 | session.mount("https://", HTTPAdapter(max_retries=max_retries)) |
77 | | - session.headers.update( |
78 | | - {"User-Agent": "quantifying-wikipedia-fetch/1.0 (contact@example.com)"} |
79 | | - ) |
| 70 | + session.headers.update({"User-Agent": USER_AGENT}) |
80 | 71 | return session |
81 | 72 |
|
82 | 73 |
|
@@ -136,7 +127,9 @@ def query_wikipedia_languages(session): |
136 | 127 | stats = data["query"]["statistics"] |
137 | 128 |
|
138 | 129 | article_count = stats.get("articles", 0) |
139 | | - |
| 130 | + if article_count == 0: |
| 131 | + LOGGER.info(f"Skipping {language_name} with 0 articles") |
| 132 | + continue |
140 | 133 | tool_data.append( |
141 | 134 | { |
142 | 135 | "LANGUAGE_CODE": site["code"], |
|
0 commit comments