Skip to content

Commit 0557efd

Browse files
committed
use new shared.rows_to_csv function
1 parent 1076d47 commit 0557efd

File tree

1 file changed

+21
-48
lines changed

1 file changed

+21
-48
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 21 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -242,11 +242,11 @@ def is_multi_language(raw_language):
242242
def normalize_language(raw_language):
243243
raw = str(raw_language).strip()
244244
if not raw:
245-
return "Undetermined"
245+
return "UNKNOWN"
246246

247247
# 1st: check multi-language
248248
if is_multi_language(raw):
249-
return "Multiple languages"
249+
return "MULTIPLE LANGUAGES"
250250

251251
# Prep for subsequent checks by stripping noise and normalizing
252252
cleaned = normalize_key(strip_noise(raw))
@@ -280,7 +280,7 @@ def normalize_language(raw_language):
280280
except ValueError:
281281
pass
282282

283-
return "Undetermined"
283+
return "UNKNOWN"
284284

285285

286286
def query_internet_archive(args, session, license_mapping):
@@ -319,14 +319,12 @@ def query_internet_archive(args, session, license_mapping):
319319
license_counter[(normalized_url)] += 1
320320

321321
# Extract and normalize language
322-
raw_language = result.get("language", "Undetermined")
322+
raw_language = result.get("language", "UNKNOWN")
323323
if isinstance(raw_language, list):
324-
raw_language = (
325-
raw_language[0] if raw_language else "Undetermined"
326-
)
324+
raw_language = raw_language[0] if raw_language else "UNKNOWN"
327325

328326
normalized_lang = normalize_language(raw_language)
329-
if normalized_lang == "Undetermined":
327+
if normalized_lang == "UNKNOWN":
330328
unmapped_language_counter[raw_language] += 1
331329

332330
language_counter[(normalized_url, normalized_lang)] += 1
@@ -383,48 +381,24 @@ def query_internet_archive(args, session, license_mapping):
383381
return license_counter, language_counter
384382

385383

386-
def write_csv(file_path, header, rows):
387-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
388-
writer = csv.writer(file_obj, dialect="unix")
389-
writer.writerow(header)
390-
for row in rows:
391-
writer.writerow(row)
392-
LOGGER.info(f"Wrote {len(rows)} rows to {file_path}")
393-
394-
395384
def write_all(args, license_counter, language_counter):
396-
if not args.enable_save:
397-
return args
398-
399-
os.makedirs(PATHS["data_phase"], exist_ok=True)
400-
401385
# Sort license data by license name
402-
sorted_license_rows = sorted(
403-
[(license, count) for license, count in license_counter.items()],
404-
key=lambda x: x[0],
405-
)
386+
sorted_license_rows = [
387+
{"LICENSE": key, "COUNT": license_counter[key]}
388+
for key in sorted(license_counter.keys())
389+
]
390+
shared.rows_to_csv(args, FILE1_COUNT, HEADER1, sorted_license_rows)
406391

407392
# Sort language data by license then language
408-
sorted_language_rows = sorted(
409-
[
410-
(license, language, count)
411-
for (license, language), count in language_counter.items()
412-
],
413-
key=lambda x: (x[0], x[1]),
414-
)
415-
416-
write_csv(
417-
FILE1_COUNT,
418-
HEADER1,
419-
sorted_license_rows,
420-
)
421-
write_csv(
422-
FILE2_LANGUAGE,
423-
HEADER2,
424-
sorted_language_rows,
425-
)
426-
427-
return args
393+
sorted_language_rows = [
394+
{
395+
"LICENSE": key_tuple[0],
396+
"LANGUAGE": key_tuple[1],
397+
"COUNT": language_counter[key_tuple],
398+
}
399+
for key_tuple in sorted(language_counter.keys())
400+
]
401+
shared.rows_to_csv(args, FILE2_LANGUAGE, HEADER2, sorted_language_rows)
428402

429403

430404
def main():
@@ -442,8 +416,7 @@ def main():
442416
args, session, license_mapping
443417
)
444418

445-
if args.enable_save:
446-
write_all(args, license_data, language_data)
419+
write_all(args, license_data, language_data)
447420

448421
if args.enable_git:
449422
args = shared.git_add_and_commit(

0 commit comments

Comments
 (0)