Skip to content

Commit dbca23a

Browse files
authored
Merge pull request #83 from johnseekins/inspections
add very basic inspections collection via @johnseekins
2 parents 4c89f3e + a0008d7 commit dbca23a

21 files changed

Lines changed: 639 additions & 192 deletions

.config/mise.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[tools]
2-
python = "3.13.3"
2+
python = "3.14.1"
33
node = "latest"
44
lefthook = "latest"
55
yamllint = "latest"

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ __pycache__/
99
**/.xlsx#
1010
*.parquet
1111
**/.parquet
12+
output/

enrichers/__init__.py

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,13 @@
55
"""
66

77
import copy
8-
import requests
98
from schemas import enrich_resp_schema
10-
import time
11-
from utils import (
12-
default_headers,
13-
session,
14-
)
159

1610

1711
class Enrichment(object):
1812
_required_keys = [
1913
"facility_name",
2014
]
21-
# in seconds
22-
_wait_time: float = 1
2315

2416
def __init__(self, **kwargs):
2517
self.resp_info = copy.deepcopy(enrich_resp_schema)
@@ -32,28 +24,6 @@ def search(self) -> dict:
3224
"""Child objects should implement this"""
3325
return {}
3426

35-
def _req(self, url: str, **kwargs) -> requests.Response:
36-
"""requests response wrapper to ensure we honor waits"""
37-
headers = kwargs.get("headers", {})
38-
# ensure we get all headers configured correctly
39-
# but manually applied headers win the argument
40-
for k, v in default_headers.items():
41-
if k in headers.keys():
42-
continue
43-
headers[k] = v
44-
45-
response = session.get(
46-
url,
47-
allow_redirects=True,
48-
timeout=kwargs.get("timeout", 10),
49-
params=kwargs.get("params", {}),
50-
stream=kwargs.get("stream", False),
51-
headers=headers,
52-
)
53-
response.raise_for_status()
54-
time.sleep(self._wait_time)
55-
return response
56-
5727
def _minimal_clean_facility_name(self, name: str) -> str:
5828
"""Minimal cleaning that preserves important context like 'County Jail'"""
5929
cleaned = name

enrichers/openstreetmap.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from enrichers import Enrichment
2-
from utils import logger
2+
from utils import logger, req_get
33

44

55
class OpenStreetMap(Enrichment):
@@ -40,13 +40,13 @@ def search(self) -> dict:
4040
"dedupe": 1,
4141
},
4242
"street_address": {
43-
"q": f"{full_address}",
43+
"q": full_address,
4444
"format": "json",
4545
"limit": 5,
4646
"dedupe": 1,
4747
},
4848
"locality": {
49-
"q": f"{locality}",
49+
"q": locality,
5050
"format": "json",
5151
"limit": 5,
5252
"dedupe": 1,
@@ -56,7 +56,7 @@ def search(self) -> dict:
5656
logger.debug("Searching OSM for %s", params["q"])
5757
self.resp_info["search_query_steps"].append(params["q"]) # type: ignore [attr-defined]
5858
try:
59-
response = self._req(search_url, params=params, timeout=15)
59+
response = req_get(search_url, params=params, timeout=15)
6060
data.extend(response.json())
6161
except Exception as e:
6262
logger.debug(" OSM search error for '%s': %s", facility_name, e)
@@ -73,10 +73,8 @@ def search(self) -> dict:
7373
lon = first_result.get("lon", self.default_coords["longitude"])
7474
osm_type = first_result.get("osm_type", "")
7575
osm_id = first_result.get("osm_id", "")
76-
self.resp_info["details"]["latitude"] = lat # type: ignore [index]
77-
self.resp_info["details"]["longitude"] = lon # type: ignore [index]
7876
self.resp_info["title"] = first_result.get("display_name", "")
79-
self.resp_info["details"]["class"] = first_result.get("class", "") # type: ignore [index]
77+
self.resp_info["details"] = {"latitude": lat, "longitude": lon, "class": first_result.get("class", "")}
8078
if osm_type == "way":
8179
self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
8280
else:

enrichers/wikidata.py

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from enrichers import Enrichment
2-
from utils import logger
2+
from utils import logger, req_get
33

44

55
class Wikidata(Enrichment):
@@ -11,29 +11,32 @@ def search(self) -> dict:
1111
# Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.
1212
# Falls back to first result (usually truncated, eg. county)
1313
search_name_fallback = self._clean_facility_name(facility_name)
14+
self.resp_info["enrichment_type"] = "wikidata"
1415
logger.debug("Searching wikidata for %s and %s", facility_name, search_name_fallback)
1516
search_url = "https://www.wikidata.org/w/api.php"
1617
params = {
17-
"action": "wbsearchentities",
18-
"search": facility_name,
19-
"language": "en",
20-
"format": "json",
21-
"limit": 3,
18+
"facility_name": {
19+
"action": "wbsearchentities",
20+
"search": facility_name,
21+
"language": "en",
22+
"format": "json",
23+
"limit": 3,
24+
},
25+
"fallback": {
26+
"action": "wbsearchentities",
27+
"search": search_name_fallback,
28+
"language": "en",
29+
"format": "json",
30+
"limit": 3,
31+
},
2232
}
23-
self.resp_info["enrichment_type"] = "wikidata"
2433
data = {}
25-
try:
26-
response = self._req(search_url, params=params)
27-
data = response.json()
28-
except Exception as e:
29-
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
30-
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
31-
if not data.get("search"):
32-
params["search"] = search_name_fallback
33-
self.resp_info["search_query_steps"].append(search_name_fallback) # type: ignore [attr-defined]
34+
for search, params in params.items():
35+
self.resp_info["search_query_steps"].append(params["search"]) # type: ignore [attr-defined]
3436
try:
35-
response = self._req(search_url, params=params)
37+
response = req_get(search_url, params=params, wait_time=self._wait_time)
3638
data = response.json()
39+
break
3740
except Exception as e:
3841
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
3942
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
@@ -45,10 +48,11 @@ def search(self) -> dict:
4548
if any(term in description for term in match_terms):
4649
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
4750
self.resp_info["title"] = result.get("label", "")
48-
return self.resp_info
49-
# fallback to first result
50-
first = data["search"][0]
51-
logger.debug(" Closer matching failed, falling back to first result %s", first)
52-
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
53-
self.resp_info["title"] = result.get("label", "")
51+
break
52+
else:
53+
# fall back to first result
54+
first = data["search"][0]
55+
logger.debug(" Closer matching failed, falling back to first result %s", first)
56+
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{first['id']}"
57+
self.resp_info["title"] = first.get("label", "")
5458
return self.resp_info

enrichers/wikipedia.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from enrichers import Enrichment
22
from urllib.parse import quote
3-
from utils import logger
3+
from utils import logger, req_get
44

55

66
class Wikipedia(Enrichment):
@@ -32,15 +32,15 @@ def search(self) -> dict:
3232
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
3333
initial_response = False
3434
try:
35-
response = self._req(wiki_url)
35+
response = req_get(wiki_url, wait_time=self._wait_time)
3636
initial_response = True
3737
except Exception as e:
3838
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
3939
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
4040
wiki_url = f"{self.static_search}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"
4141
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
4242
try:
43-
response = self._req(wiki_url)
43+
response = req_get(wiki_url, wait_time=self._wait_time)
4444
initial_response = True
4545
except Exception as e:
4646
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
@@ -101,7 +101,7 @@ def search(self) -> dict:
101101
}
102102

103103
try:
104-
response = self._req(self.api_search, params=params)
104+
response = req_get(self.api_search, params=params, wait_time=self._wait_time)
105105
data = response.json()
106106
except Exception as e:
107107
logger.debug(" Wikipedia search for %s failed: %s", self.api_search, e)
@@ -161,7 +161,7 @@ def search(self) -> dict:
161161

162162
# Verify the page exists and isn't a redirect to something unrelated
163163
try:
164-
verify_response = self._req(final_url)
164+
verify_response = req_get(final_url, wait_time=self._wait_time)
165165
except Exception as e:
166166
logger.debug(" Wikipedia query for %s failed: %s", final_url, e)
167167
self.resp_info["search_query_steps"].append(final_url) # type: ignore [attr-defined]

file_utils.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@ def export_to_file(
1818
if not facilities_data or not facilities_data.get("facilities", []):
1919
logger.warning("No data to export!")
2020
return ""
21-
# make sure the folder we're dropping files into exists
22-
os.makedirs(output_folder, exist_ok=True)
23-
full_name = f"{output_folder}/{filename}.{file_type}"
21+
full_name = f"{output_folder}{os.sep}{filename}.{file_type}"
2422
if file_type in ["csv", "xlsx", "parquet"]:
2523
writer = convert_to_dataframe(facilities_data["facilities"])
2624
match file_type:

ice_scrapers/agencies.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,18 @@
1111
import time
1212
from utils import (
1313
logger,
14-
session,
14+
output_folder,
15+
req_get,
1516
)
1617
from .utils import download_file
1718

18-
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
1919
base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"
2020

2121

2222
def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
2323
"""Collect data on participating agencies"""
2424
start_time = time.time()
25-
resp = session.get(base_xlsx_url, timeout=120)
26-
resp.raise_for_status()
25+
resp = req_get(base_xlsx_url, timeout=120)
2726
soup = BeautifulSoup(resp.content, "html.parser")
2827
links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
2928
if not links:
@@ -45,7 +44,7 @@ def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dic
4544
"""
4645
# remove the date so we can easily overwrite the local (cached) file
4746
filename = date_re.sub("", link.split("/")[-1])
48-
path = f"{SCRIPT_DIR}{os.sep}{filename}"
47+
path = f"{output_folder}{os.sep}{filename}"
4948
if force_download or not os.path.exists(path):
5049
logger.info("Downloading agency info sheet from %s", link)
5150
download_file(link, path)

ice_scrapers/facilities_scraper.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,24 @@
1-
from bs4 import BeautifulSoup
21
import copy
32
import datetime
43
import re
5-
from schemas import facility_schema
64
import time
5+
6+
from bs4 import BeautifulSoup
7+
8+
from schemas import facility_schema
79
from utils import (
810
default_timestamp,
911
logger,
10-
session,
12+
req_get,
1113
timestamp_format,
1214
)
15+
1316
from .utils import (
1417
get_ice_scrape_pages,
1518
repair_locality,
19+
repair_name,
1620
repair_street,
1721
repair_zip,
18-
repair_name,
1922
special_facilities,
2023
update_facility,
2124
)
@@ -33,6 +36,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
3336
scraped_count = 0
3437
for page_num, url in enumerate(urls):
3538
logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
39+
facilities = []
3640
try:
3741
facilities = _scrape_page(url)
3842
except Exception as e:
@@ -43,19 +47,23 @@ def scrape_facilities(facilities_data: dict) -> dict:
4347
for facility in facilities:
4448
facility = special_facilities(facility)
4549
addr = facility["address"]
46-
street, cleaned = repair_street(addr["street"], addr["locality"])
50+
street, cleaned, other_st = repair_street(addr["street"], addr["locality"])
51+
addr["other_streets"].extend(other_st)
4752
if cleaned:
4853
addr["street"] = street
4954
facility["_repaired_record"] = True
50-
zcode, cleaned = repair_zip(addr["postal_code"], addr["locality"])
55+
zcode, cleaned, other_zip = repair_zip(addr["postal_code"], addr["locality"])
56+
addr["other_postal_codes"].extend(other_zip)
5157
if cleaned:
5258
addr["postal_code"] = zcode
5359
facility["_repaired_record"] = True
54-
locality, cleaned = repair_locality(addr["locality"], addr["administrative_area"])
60+
locality, cleaned, other_city = repair_locality(addr["locality"], addr["administrative_area"])
61+
addr["other_localities"].extend(other_city)
5562
if cleaned:
5663
addr["locality"] = locality
5764
facility["_repaired_record"] = True
58-
name, cleaned = repair_name(facility["name"], addr["locality"])
65+
name, cleaned, other_name = repair_name(facility["name"], addr["locality"])
66+
facility["other_names"].extend(other_name)
5967
if cleaned:
6068
facility["name"] = name
6169
facility["_repaired_record"] = True
@@ -95,8 +103,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
95103
return datetime.datetime.strptime(default_timestamp, timestamp_format)
96104
logger.debug(" Fetching: %s", url)
97105
try:
98-
response = session.get(url, timeout=30)
99-
response.raise_for_status()
106+
response = req_get(url, timeout=30, wait_time=0.1)
100107
except Exception as e:
101108
logger.error(" Error parsing %s: %s", url, e)
102109
return datetime.datetime.strptime(default_timestamp, timestamp_format)
@@ -118,8 +125,7 @@ def _scrape_page(page_url: str) -> list:
118125
"""Scrape a single page of facilities using BeautifulSoup"""
119126
logger.debug(" Fetching: %s", page_url)
120127
try:
121-
response = session.get(page_url, timeout=30)
122-
response.raise_for_status()
128+
response = req_get(page_url, timeout=30, wait_time=0.1)
123129
except Exception as e:
124130
logger.error(" Error parsing %s: %s", page_url, e)
125131
return []

ice_scrapers/field_offices.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import time
1515
from utils import (
1616
logger,
17-
session,
17+
req_get,
1818
)
1919
from .utils import get_ice_scrape_pages
2020

@@ -45,8 +45,7 @@ def _scrape_page(page_url: str) -> list[dict]:
4545
"""Scrape a single page of facilities using BeautifulSoup"""
4646
logger.debug(" Fetching: %s", page_url)
4747
try:
48-
response = session.get(page_url, timeout=30)
49-
response.raise_for_status()
48+
response = req_get(page_url, timeout=30)
5049
except Exception as e:
5150
logger.error(" Error parsing %s: %s", page_url, e)
5251
return []

0 commit comments

Comments
 (0)