1- from bs4 import BeautifulSoup
21import copy
32import datetime
43import re
5- from schemas import facility_schema
64import time
5+
6+ from bs4 import BeautifulSoup
7+
8+ from schemas import facility_schema
79from utils import (
810 default_timestamp ,
911 logger ,
10- session ,
12+ req_get ,
1113 timestamp_format ,
1214)
15+
1316from .utils import (
1417 get_ice_scrape_pages ,
1518 repair_locality ,
19+ repair_name ,
1620 repair_street ,
1721 repair_zip ,
18- repair_name ,
1922 special_facilities ,
2023 update_facility ,
2124)
@@ -33,6 +36,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
3336 scraped_count = 0
3437 for page_num , url in enumerate (urls ):
3538 logger .info ("Scraping page %s/%s..." , page_num + 1 , len (urls ))
39+ facilities = []
3640 try :
3741 facilities = _scrape_page (url )
3842 except Exception as e :
@@ -43,19 +47,23 @@ def scrape_facilities(facilities_data: dict) -> dict:
4347 for facility in facilities :
4448 facility = special_facilities (facility )
4549 addr = facility ["address" ]
46- street , cleaned = repair_street (addr ["street" ], addr ["locality" ])
50+ street , cleaned , other_st = repair_street (addr ["street" ], addr ["locality" ])
51+ addr ["other_streets" ].extend (other_st )
4752 if cleaned :
4853 addr ["street" ] = street
4954 facility ["_repaired_record" ] = True
50- zcode , cleaned = repair_zip (addr ["postal_code" ], addr ["locality" ])
55+ zcode , cleaned , other_zip = repair_zip (addr ["postal_code" ], addr ["locality" ])
56+ addr ["other_postal_codes" ].extend (other_zip )
5157 if cleaned :
5258 addr ["postal_code" ] = zcode
5359 facility ["_repaired_record" ] = True
54- locality , cleaned = repair_locality (addr ["locality" ], addr ["administrative_area" ])
60+ locality , cleaned , other_city = repair_locality (addr ["locality" ], addr ["administrative_area" ])
61+ addr ["other_localities" ].extend (other_city )
5562 if cleaned :
5663 addr ["locality" ] = locality
5764 facility ["_repaired_record" ] = True
58- name , cleaned = repair_name (facility ["name" ], addr ["locality" ])
65+ name , cleaned , other_name = repair_name (facility ["name" ], addr ["locality" ])
66+ facility ["other_names" ].extend (other_name )
5967 if cleaned :
6068 facility ["name" ] = name
6169 facility ["_repaired_record" ] = True
@@ -95,8 +103,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
95103 return datetime .datetime .strptime (default_timestamp , timestamp_format )
96104 logger .debug (" Fetching: %s" , url )
97105 try :
98- response = session .get (url , timeout = 30 )
99- response .raise_for_status ()
106+ response = req_get (url , timeout = 30 , wait_time = 0.1 )
100107 except Exception as e :
101108 logger .error (" Error parsing %s: %s" , url , e )
102109 return datetime .datetime .strptime (default_timestamp , timestamp_format )
@@ -118,8 +125,7 @@ def _scrape_page(page_url: str) -> list:
118125 """Scrape a single page of facilities using BeautifulSoup"""
119126 logger .debug (" Fetching: %s" , page_url )
120127 try :
121- response = session .get (page_url , timeout = 30 )
122- response .raise_for_status ()
128+ response = req_get (page_url , timeout = 30 , wait_time = 0.1 )
123129 except Exception as e :
124130 logger .error (" Error parsing %s: %s" , page_url , e )
125131 return []
0 commit comments