-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathfilter_cities.py
More file actions
101 lines (79 loc) · 3.44 KB
/
Copy pathfilter_cities.py
File metadata and controls
101 lines (79 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Download GeoNames cities15000 dataset, filter by population, and generate a sorted CSV.
"""
import csv
import os
import tempfile
import urllib.request
import zipfile
# Minimum population passed to process_cities() when this file runs as a script.
POPULATION_THRESH = 200_000

# Destination of the generated CSV; the script entry point joins this onto the
# directory that contains this file.
OUTPUT_PATH = "../data/cities.csv"

# Archive fetched by the script entry point.
DATA_URL = "https://download.geonames.org/export/dump/cities15000.zip"
def process_cities(input_txt, output_csv, population_threshold=15000):
    """
    Extract relevant fields from cities15000.txt and write a filtered, sorted CSV.

    Each tab-separated input row contributes the columns at index 1 (name),
    14 (population), 8 (country code), 17 (timezone), 4 (latitude) and
    5 (longitude). Rows whose population is below the threshold are dropped.
    Rows that are blank, too short, or whose population is not an integer are
    skipped individually so that one malformed line cannot abort the run.

    Any other failure (for example an unreadable input file or an unwritable
    output path) is caught and reported on stdout; nothing is raised to the
    caller and nothing is returned.

    Parameters:
    - input_txt (str): Path to the cities15000.txt file.
    - output_csv (str): Path to the output CSV file.
    - population_threshold (int): Minimum population for a city to be included. Defaults to 15000.
    """
    try:
        cities = []

        # Read the .txt file
        with open(input_txt, 'r', encoding='utf-8') as txt_file:
            reader = csv.reader(txt_file, delimiter='\t')

            # Process rows and filter
            for row in reader:
                try:
                    # Normalize the typographic apostrophe to a plain ASCII one.
                    city_name = row[1].replace("’", "'")

                    # FIX: Handle the "Mianzhu, Deyang, Sichuan" anomaly
                    # by splitting at the comma and taking the first element
                    # (See PR #80)
                    city_name = city_name.split(',')[0].strip()

                    population = int(row[14])
                    country_code = row[8]
                    timezone = row[17]
                    latitude = row[4]
                    longitude = row[5]
                except (ValueError, IndexError):
                    # ValueError: population is not an integer.
                    # IndexError: blank or truncated row (csv.reader yields a
                    # short list). Previously this escaped to the outer
                    # handler and no CSV was written at all.
                    continue

                if population >= population_threshold:
                    cities.append([city_name, population, country_code, timezone, latitude, longitude])

        # Sort cities by city_name (the sort key must match the normalization in city.c)
        cities.sort(key=lambda x: x[0].strip().lower())

        # Write sorted data to the output CSV
        with open(output_csv, 'w', encoding='utf-8', newline='') as csv_file:
            writer = csv.writer(csv_file)
            # Write header
            writer.writerow(["city_name", "population", "country_code", "timezone", "latitude", "longitude"])
            # Write sorted rows
            writer.writerows(cities)

        print(f"Filtered and sorted CSV created successfully at {output_csv}")
    except Exception as e:
        # Best-effort script behaviour: report the problem instead of raising.
        print(f"An error occurred: {e}")
def download_and_extract_zip(url, target_dir):
    """
    Download a ZIP file from `url` and extract its contents into `target_dir`.

    The archive is saved as "cities15000.zip" inside `target_dir` and every
    member is extracted there. The returned path is taken from the archive's
    own member list rather than from a directory scan, so a .txt file that
    already exists in `target_dir` is never mistaken for the extracted data
    and the result does not depend on directory listing order.

    Returns the path to the first top-level .txt member (in archive order).

    Raises:
    - FileNotFoundError: If the archive has no top-level .txt member.
    """
    zip_path = os.path.join(target_dir, "cities15000.zip")

    print(f"Downloading {url}...")
    urllib.request.urlretrieve(url, zip_path)

    print("Extracting...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
        member_names = zip_ref.namelist()

    # Only top-level members are considered, matching the previous
    # non-recursive "*.txt" lookup in target_dir.
    txt_members = [
        name for name in member_names
        if name.endswith(".txt") and "/" not in name
    ]
    if not txt_members:
        raise FileNotFoundError("No .txt file found in the ZIP archive")

    return os.path.join(target_dir, txt_members[0])
if __name__ == "__main__":
    # Anchor the output location to this file's directory, not the current
    # working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    destination = os.path.join(here, OUTPUT_PATH)

    # The downloaded archive and its extracted contents live in a throwaway
    # directory that is removed when the with-block exits.
    with tempfile.TemporaryDirectory() as workdir:
        process_cities(
            download_and_extract_zip(DATA_URL, workdir),
            destination,
            POPULATION_THRESH,
        )