@@ -242,11 +242,11 @@ def is_multi_language(raw_language):
242242def normalize_language (raw_language ):
243243 raw = str (raw_language ).strip ()
244244 if not raw :
245- return "Undetermined "
245+ return "UNKNOWN "
246246
247247 # 1st: check multi-language
248248 if is_multi_language (raw ):
249- return "Multiple languages "
249+ return "MULTIPLE LANGUAGES "
250250
251251 # Prep for subsequent checks by stripping noise and normalizing
252252 cleaned = normalize_key (strip_noise (raw ))
@@ -280,7 +280,7 @@ def normalize_language(raw_language):
280280 except ValueError :
281281 pass
282282
283- return "Undetermined "
283+ return "UNKNOWN "
284284
285285
286286def query_internet_archive (args , session , license_mapping ):
@@ -319,14 +319,12 @@ def query_internet_archive(args, session, license_mapping):
319319 license_counter [(normalized_url )] += 1
320320
321321 # Extract and normalize language
322- raw_language = result .get ("language" , "Undetermined " )
322+ raw_language = result .get ("language" , "UNKNOWN " )
323323 if isinstance (raw_language , list ):
324- raw_language = (
325- raw_language [0 ] if raw_language else "Undetermined"
326- )
324+ raw_language = raw_language [0 ] if raw_language else "UNKNOWN"
327325
328326 normalized_lang = normalize_language (raw_language )
329- if normalized_lang == "Undetermined " :
327+ if normalized_lang == "UNKNOWN " :
330328 unmapped_language_counter [raw_language ] += 1
331329
332330 language_counter [(normalized_url , normalized_lang )] += 1
@@ -383,48 +381,24 @@ def query_internet_archive(args, session, license_mapping):
383381 return license_counter , language_counter
384382
385383
386- def write_csv (file_path , header , rows ):
387- with open (file_path , "w" , encoding = "utf-8" , newline = "\n " ) as file_obj :
388- writer = csv .writer (file_obj , dialect = "unix" )
389- writer .writerow (header )
390- for row in rows :
391- writer .writerow (row )
392- LOGGER .info (f"Wrote { len (rows )} rows to { file_path } " )
393-
394-
395384def write_all (args , license_counter , language_counter ):
396- if not args .enable_save :
397- return args
398-
399- os .makedirs (PATHS ["data_phase" ], exist_ok = True )
400-
401385 # Sort license data by license name
402- sorted_license_rows = sorted (
403- [(license , count ) for license , count in license_counter .items ()],
404- key = lambda x : x [0 ],
405- )
386+ sorted_license_rows = [
387+ {"LICENSE" : key , "COUNT" : license_counter [key ]}
388+ for key in sorted (license_counter .keys ())
389+ ]
390+ shared .rows_to_csv (args , FILE1_COUNT , HEADER1 , sorted_license_rows )
406391
407392 # Sort language data by license then language
408- sorted_language_rows = sorted (
409- [
410- (license , language , count )
411- for (license , language ), count in language_counter .items ()
412- ],
413- key = lambda x : (x [0 ], x [1 ]),
414- )
415-
416- write_csv (
417- FILE1_COUNT ,
418- HEADER1 ,
419- sorted_license_rows ,
420- )
421- write_csv (
422- FILE2_LANGUAGE ,
423- HEADER2 ,
424- sorted_language_rows ,
425- )
426-
427- return args
393+ sorted_language_rows = [
394+ {
395+ "LICENSE" : key_tuple [0 ],
396+ "LANGUAGE" : key_tuple [1 ],
397+ "COUNT" : language_counter [key_tuple ],
398+ }
399+ for key_tuple in sorted (language_counter .keys ())
400+ ]
401+ shared .rows_to_csv (args , FILE2_LANGUAGE , HEADER2 , sorted_language_rows )
428402
429403
430404def main ():
@@ -442,8 +416,7 @@ def main():
442416 args , session , license_mapping
443417 )
444418
445- if args .enable_save :
446- write_all (args , license_data , language_data )
419+ write_all (args , license_data , language_data )
447420
448421 if args .enable_git :
449422 args = shared .git_add_and_commit (
0 commit comments