-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
105 lines (79 loc) · 2.74 KB
/
scraper.py
File metadata and controls
105 lines (79 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
"""
Property Scraper using Scrapling
Run this on your local machine to scrape Fort Worth Focused listings
Setup:
pip install scrapling
Usage:
python scraper.py <property_url>
Example:
python scraper.py "https://renn.fortworthfocused.com/listing-detail/1177727569/5521-Lubbock-Avenue-Fort-Worth-TX"
"""
import sys
import re
import json
from scrapling.fetchers import StealthyFetcher
# Plausible listing-price window; dollar amounts outside it (fees, taxes,
# stray numbers) are ignored when picking the price.
_MIN_PRICE = 50_000
_MAX_PRICE = 2_000_000

# Patterns are compiled once at import time and shared by every call.
_ADDRESS_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>', re.I)
# Either a comma-grouped amount ("$395,000") or a plain digit run ("$395000").
_PRICE_RE = re.compile(r'\$(\d{1,3}(?:,\d{3})+|\d+)')
# With re.I, "bed" already covers bed/beds/Bed/Beds/bedroom(s).
_BEDS_RE = re.compile(r'(\d+)\s*bed', re.I)
_BATHS_RE = re.compile(r'(\d+\.?\d*)\s*bath', re.I)
# Accepts "sqft", "sq ft", "sq.ft", "sq. ft.", "square feet", "square foot".
# The number must start with a digit so a lone "," is never captured.
_SQFT_RE = re.compile(
    r'(\d[\d,]*)\s*(?:sq\.?\s*ft|square\s*f(?:ee|oo)t)', re.I
)
_TYPE_RE = re.compile(r'Property Type[:\s]+([^\n<]+)', re.I)


def _first_group(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or ''."""
    match = pattern.search(text)
    return match.group(1).strip() if match else ''


def _parse_listing(text, url):
    """Extract the listing fields from already-fetched page text."""
    data = {
        'url': url,
        'address': _first_group(_ADDRESS_RE, text),
        'price': '',
        'beds': _first_group(_BEDS_RE, text),
        'baths': _first_group(_BATHS_RE, text),
        'sqft': _first_group(_SQFT_RE, text).replace(',', ''),
        'type': _first_group(_TYPE_RE, text),
    }

    # Convert each candidate once, keep those inside the plausible window,
    # and take the largest (listing price rather than fees).
    amounts = (int(raw.replace(',', '')) for raw in _PRICE_RE.findall(text))
    in_range = [amount for amount in amounts
                if _MIN_PRICE <= amount <= _MAX_PRICE]
    if in_range:
        data['price'] = max(in_range)

    return data


def extract_property(url: str, *, html: "str | None" = None) -> dict:
    """Fetch a listing page and extract property data from it.

    Args:
        url: Listing URL. It is fetched with ``StealthyFetcher`` unless
            *html* is supplied, and is always echoed back under ``'url'``.
        html: Optional already-downloaded page text. When given (even if
            empty), no network request is made and this text is parsed
            instead.

    Returns:
        A dict with the keys ``url``, ``address``, ``price``, ``beds``,
        ``baths``, ``sqft`` and ``type``. Fields that cannot be found are
        left as ``''``. ``price`` is an ``int`` when found; every other
        extracted field is a string (``sqft`` with commas removed).

    Raises:
        Whatever ``StealthyFetcher.fetch`` raises when the page cannot be
        retrieved; parsing itself does not raise on missing fields.
    """
    if html is None:
        print(f"Fetching: {url}")
        # Use StealthyFetcher with headless browser
        StealthyFetcher.adaptive = True
        page = StealthyFetcher.fetch(url, headless=True, network_idle=True)
        # NOTE(review): the address pattern looks for a literal <h1> tag, so
        # this needs raw markup - confirm that `.text` on the fetched page
        # returns HTML rather than only the element's text content.
        html = page.text

    return _parse_listing(html, url)
def main():
    """Command-line entry point.

    Expects the listing URL as the first command-line argument. Without
    one, the module docstring is printed and the process exits with
    status 1. Otherwise the listing is extracted and printed twice: as a
    short report of the non-empty fields, then as indented JSON. Any
    exception raised while extracting or printing is reported on stderr
    and also ends the process with status 1.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print(__doc__)
        sys.exit(1)

    target = cli_args[0]
    try:
        data = extract_property(target)

        divider = "=" * 50
        print("\n" + divider)
        print("EXTRACTED PROPERTY DATA")
        print(divider)

        # Only fields that were actually found appear in the report.
        for key in data:
            value = data[key]
            if not value:
                continue
            print(f" {key}: {value}")

        # Output as JSON for easy parsing
        print("\n--- JSON ---")
        print(json.dumps(data, indent=2))
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()