From e3488f31176336814a8c141c4d09cb36f35eef12 Mon Sep 17 00:00:00 2001 From: Enxhi Tabaku Date: Thu, 19 Mar 2026 01:57:17 +0100 Subject: [PATCH] fix(glassdoor): fix CSRF token fetch URL and non-fatal error handling Two bugs prevented Glassdoor from returning any results: 1. _get_csrf_token() was fetching /Job/computer-science-jobs.htm which now returns 404 after Glassdoor's Next.js migration. Changed to fetch the homepage (/) which reliably returns the token. 2. _fetch_jobs_page() treated any "errors" key in the GraphQL response as fatal, dropping all job results. Glassdoor commonly returns non-critical 503s on peripheral fields (e.g. jobsPageSeoData) while the actual jobListings data is intact. Now only errors on the jobListings path itself are treated as fatal. Verified: 30 jobs returned for Spain/engineer with both fixes applied. --- jobspy/glassdoor/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/jobspy/glassdoor/__init__.py b/jobspy/glassdoor/__init__.py index 8de7915f..cd00abc5 100644 --- a/jobspy/glassdoor/__init__.py +++ b/jobspy/glassdoor/__init__.py @@ -121,7 +121,16 @@ def _fetch_jobs_page( raise GlassdoorException(exc_msg) res_json = response.json()[0] if "errors" in res_json: - raise ValueError("Error encountered in API response") + # Only treat errors on the jobListings field as fatal. + # Glassdoor commonly returns non-critical 503s on peripheral + # fields (e.g. jobsPageSeoData) while the job data is intact. 
+ job_errors = [ + e for e in res_json["errors"] + if "jobListings" in str(e.get("path", [])) + and "jobsPageSeoData" not in str(e.get("path", [])) + ] + if job_errors: + raise ValueError(f"Error encountered in jobListings API response: {job_errors}") except ( requests.exceptions.ReadTimeout, GlassdoorException, @@ -151,9 +160,10 @@ def _fetch_jobs_page( def _get_csrf_token(self): """ - Fetches csrf token needed for API by visiting a generic page + Fetches csrf token needed for API by visiting the homepage. + Previously used /Job/computer-science-jobs.htm which now returns 404. """ - res = self.session.get(f"{self.base_url}/Job/computer-science-jobs.htm") + res = self.session.get(f"{self.base_url}/") pattern = r'"token":\s*"([^"]+)"' matches = re.findall(pattern, res.text) token = None