Skip to content

Commit d928671

Browse files
committed
Included only legal tools
1 parent 35498c0 commit d928671

1 file changed

Lines changed: 68 additions & 14 deletions

File tree

scripts/1-fetch/wikicommons_fetch.py

Lines changed: 68 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,48 @@ def fetch_category_totals(category, session):
136136
raise shared.QuantifyingException(message)
137137

138138

139+
# Helper function to check if a category
140+
# name represents a valid CC license tool
141+
def is_valid_license_tool(category_name):
    """
    Return True if a category name is an accepted Creative Commons tool.

    Accepted names are:
    - License categories that start with 'CC-' and contain BY and/or SA
      (e.g., CC-BY-4.0, CC-BY-SA-3.0).
    - The CC0 Public Domain Dedication, named 'CC0', 'CC0-<suffix>' or
      'CC-Zero'.

    Rejected names are:
    - Any 'CC-' category with an NC (NonCommercial) or ND (NoDerivs)
      element, e.g., CC-BY-NC-4.0, CC-BY-ND-2.0, CC-BY-NC-SA-4.0.
    - Marker/subcategory names containing 'migrated' or 'Retired'.
    - Everything else (including the root category, which matches none of
      the accepted patterns).

    Args:
        category_name: Category title as a string.

    Returns:
        bool: True for an accepted license tool, False otherwise.
    """
    if category_name.startswith("CC-") and any(
        element in category_name for element in ("BY", "SA")
    ):
        # Markers/subcategories that look like licenses but are not tools
        if "migrated" in category_name or "Retired" in category_name:
            return False

        # Compare whole name elements (split on '-', '_' and spaces) so
        # that NC/ND are only matched as license elements and never as
        # part of a longer word.
        elements = category_name.replace("_", "-").replace(" ", "-").split("-")
        if "NC" in elements or "ND" in elements:
            return False
        return True

    # CC0 Public Domain Dedication (often just "CC0")
    if category_name in ("CC0", "CC-Zero") or category_name.startswith(
        "CC0-"
    ):
        return True

    # Anything else, the root category included, is not a license tool
    return False
179+
180+
139181
def recursive_collect_data(session, limit=None):
140182
"""Recursively traverse WikiCommons categories and collect data."""
141183

@@ -149,27 +191,39 @@ def traverse(category, path, depth=0):
149191
return
150192
visited.add(category)
151193

152-
# Get counts for the current category itself
153-
contents = fetch_category_totals(category, session)
154-
155-
results.append(
156-
{
157-
"LICENSE_TYPE": path,
158-
"FILE_COUNT": contents["FILE_COUNT"],
159-
"PAGE_COUNT": contents["PAGE_COUNT"],
160-
}
161-
)
162-
163-
# Get subcategories
194+
# Only fetch and collect data for valid license tools
195+
if is_valid_license_tool(category):
196+
try:
197+
# Get counts for the current category
198+
contents = fetch_category_totals(category, session)
199+
200+
results.append(
201+
{
202+
# Use the specific license category name
203+
# as the LICENSE_TYPE
204+
"LICENSE_TYPE": category,
205+
"FILE_COUNT": contents["FILE_COUNT"],
206+
"PAGE_COUNT": contents["PAGE_COUNT"],
207+
}
208+
)
209+
except shared.QuantifyingException as e:
210+
# Log the specific license category failure
211+
LOGGER.error(
212+
f"Failed to process valid license category {category}: {e}"
213+
)
214+
215+
# Get subcategories (check subcategories,
216+
# as a valid license might be nested under a non-license category)
164217
subcats = get_subcategories(category, session)
165-
count = len(subcats)
166218

167219
# Logging label
168220
label = "categories" if depth == 0 else "subcategories"
169-
LOGGER.info(f"Fetched {count} {label} for {category}.")
221+
LOGGER.info(f"Fetched {len(subcats)} {label} for {category}.")
170222

171223
# Recursively traverse subcategories
172224
for sub in subcats:
225+
# Use the subcategory name as the 'path' for traversal,
226+
# but use the category name for the final result.
173227
traverse(sub, f"{path}/{sub}", depth + 1)
174228
time.sleep(0.05) # time to sleep
175229

0 commit comments

Comments
 (0)