@@ -136,6 +136,48 @@ def fetch_category_totals(category, session):
136136 raise shared .QuantifyingException (message )
137137
138138
139+ # Helper function to check if a category
140+ # name represents a valid CC license tool
def is_valid_license_tool(category_name):
    """
    Return True if a category name denotes an accepted CC license tool.

    Accepted names are:
    - "CC0", "CC-Zero", or any name starting with "CC0-" (the CC0 Public
      Domain Dedication).
    - Names starting with "CC-" that contain "BY" or "SA"
      (e.g., "CC-BY-4.0", "CC-BY-SA-3.0").

    Rejected names are:
    - "CC-" names containing "migrated" or "Retired"; these look like
      licenses but are markers/subcategories.
    - "CC-" names with an NC (NonCommercial) or ND (NoDerivs) component,
      e.g. "CC-BY-NC-4.0" or "CC-BY-ND-2.0" (marked 'Not OK' in policy).
    - Everything else.

    All comparisons are case-sensitive.

    Args:
        category_name: Category name as a string.

    Returns:
        bool: True for an accepted license tool category, False otherwise.
    """
    # CC0 Public Domain Dedication (often just "CC0")
    if category_name in ("CC0", "CC-Zero") or category_name.startswith("CC0-"):
        return True

    # Official license categories start with "CC-" and carry BY and/or SA
    if not category_name.startswith("CC-"):
        return False
    if not any(marker in category_name for marker in ("BY", "SA")):
        return False

    # Specific exceptions that look like licenses but are
    # markers/subcategories
    if "migrated" in category_name or "Retired" in category_name:
        return False

    # Exclude NC/ND licenses. Whole components are compared (rather than
    # substrings) so that a name merely containing the letters "NC" or "ND"
    # inside a longer word is not rejected by accident.
    components = category_name.replace("_", "-").replace(" ", "-").split("-")
    if any(component in ("NC", "ND") for component in components):
        return False

    # No special case is needed for other names: anything that did not
    # match an accepted pattern above has already returned False.
    return True
179+
180+
139181def recursive_collect_data (session , limit = None ):
140182 """Recursively traverse WikiCommons categories and collect data."""
141183
@@ -149,27 +191,39 @@ def traverse(category, path, depth=0):
149191 return
150192 visited .add (category )
151193
152- # Get counts for the current category itself
153- contents = fetch_category_totals (category , session )
154-
155- results .append (
156- {
157- "LICENSE_TYPE" : path ,
158- "FILE_COUNT" : contents ["FILE_COUNT" ],
159- "PAGE_COUNT" : contents ["PAGE_COUNT" ],
160- }
161- )
162-
163- # Get subcategories
194+ # Only fetch and collect data for valid license tools
195+ if is_valid_license_tool (category ):
196+ try :
197+ # Get counts for the current category
198+ contents = fetch_category_totals (category , session )
199+
200+ results .append (
201+ {
202+ # Use the specific license category name
203+ # as the LICENSE_TYPE
204+ "LICENSE_TYPE" : category ,
205+ "FILE_COUNT" : contents ["FILE_COUNT" ],
206+ "PAGE_COUNT" : contents ["PAGE_COUNT" ],
207+ }
208+ )
209+ except shared .QuantifyingException as e :
210+ # Log the specific license category failure
211+ LOGGER .error (
212+ f"Failed to process valid license category { category } : { e } "
213+ )
214+
215+ # Get subcategories (check subcategories,
216+ # as a valid license might be nested under a non-license category)
164217 subcats = get_subcategories (category , session )
165- count = len (subcats )
166218
167219 # Logging label
168220 label = "categories" if depth == 0 else "subcategories"
169- LOGGER .info (f"Fetched { count } { label } for { category } ." )
221+ LOGGER .info (f"Fetched { len ( subcats ) } { label } for { category } ." )
170222
171223 # Recursively traverse subcategories
172224 for sub in subcats :
225+ # Use the subcategory name as the 'path' for traversal,
226+ # but use the category name for the final result.
173227 traverse (sub , f"{ path } /{ sub } " , depth + 1 )
174228 time .sleep (0.05 ) # time to sleep
175229
0 commit comments