-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
81 lines (68 loc) · 2.41 KB
/
scraper.py
File metadata and controls
81 lines (68 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""Scrapes Austin-area software engineering jobs via JobSpy.
Simplified from github.com/neveon/job-alerter for MVP:
- Single broad JobSpy query instead of per-company (latency-sensitive in cron).
- No SQLite persistence (fresh scrape each day).
- No rule-based filtering (the matcher LLM does fit judgment).
"""
from __future__ import annotations
import logging
from typing import Any
from jobspy import scrape_jobs as _jobspy_scrape
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Location handed to JobSpy when the caller of scrape_austin_jobs gives none.
DEFAULT_LOCATION = "Austin, TX"
# Search terms used when scrape_austin_jobs receives no (or an empty) list;
# one JobSpy scrape is issued per term.
DEFAULT_SEARCH_TERMS = ["software engineer", "backend engineer"]
def _row_to_dict(row: dict[str, Any]) -> dict[str, Any] | None:
title = (row.get("title") or "").strip()
company = (row.get("company") or "").strip()
url = (row.get("job_url") or "").strip()
if not (title and company and url):
return None
location = (row.get("location") or "").strip()
description = (row.get("description") or "").strip()
return {
"title": title,
"company": company,
"location": location,
"url": url,
"description": description,
}
def scrape_austin_jobs(
    *,
    search_terms: list[str] | None = None,
    location: str = DEFAULT_LOCATION,
    hours_old: int = 24,
    radius_miles: int = 50,
    results_per_term: int = 40,
    max_total: int = 50,
) -> list[dict[str, Any]]:
    """Scrape Indeed once per search term and return de-duplicated jobs.

    A failure while scraping one term is logged and skipped so the remaining
    terms still run.

    Args:
        search_terms: Terms to query, one JobSpy call each. ``None`` or an
            empty list falls back to ``DEFAULT_SEARCH_TERMS``.
        location: Forwarded to JobSpy as ``location``.
        hours_old: Forwarded to JobSpy as ``hours_old``.
        radius_miles: Forwarded to JobSpy as ``distance``.
        results_per_term: Forwarded to JobSpy as ``results_wanted``.
        max_total: Upper bound on the number of jobs returned. A value of
            zero or less returns an empty list without scraping.

    Returns:
        At most ``max_total`` job dicts as produced by ``_row_to_dict``,
        unique by ``url``, in the order they were first encountered.
    """
    # The cap below is only checked after an append, so without this guard a
    # non-positive cap would still hit the network and return one job.
    if max_total <= 0:
        return []

    terms = search_terms or DEFAULT_SEARCH_TERMS
    seen_urls: set[str] = set()
    jobs: list[dict[str, Any]] = []

    for term in terms:
        try:
            df = _jobspy_scrape(
                site_name="indeed",
                search_term=term,
                location=location,
                results_wanted=results_per_term,
                hours_old=hours_old,
                distance=radius_miles,
                country_indeed="USA",
                verbose=False,
            )
        except Exception as exc:
            # Deliberately best-effort: one failing term must not abort the
            # remaining terms.
            logger.warning("jobspy failed for term %r: %s", term, exc)
            continue

        if df is None or df.empty:
            continue

        # Replace missing cells with "" so the row normalizer sees strings.
        df = df.fillna("")
        for _, row in df.iterrows():
            job = _row_to_dict(row.to_dict())
            # Skip rows missing required fields and URLs already collected
            # (the same posting can match several search terms).
            if not job or job["url"] in seen_urls:
                continue
            seen_urls.add(job["url"])
            jobs.append(job)
            if len(jobs) >= max_total:
                return jobs

    return jobs