From f2e1651f75609ea0ce0669e4bb28476b4ab4dbd4 Mon Sep 17 00:00:00 2001 From: Kajetan Mieloch Date: Sun, 10 May 2026 12:02:11 +0200 Subject: [PATCH 1/2] fix: make pracuj.pl scraper cross-platform (Windows/macOS/Linux) Adds pracuj-pl-scraping/main.py and requirements.txt as new files; the changes below are relative to the earlier copy of the script, which is not part of this diff. - Remove hardcoded Windows-only AdBlock path (was crashing on macOS/Linux) - Replace deprecated selenium API with Service class (selenium 4.x+) - Add anti-bot-detection options (disable AutomationControlled, CDP webdriver flag) - Remove pinned old package versions incompatible with Python 3.13 Co-Authored-By: Claude Sonnet 4.6 (1M context) --- pracuj-pl-scraping/main.py | 120 ++++++++++++++++++++++++++++ pracuj-pl-scraping/requirements.txt | 4 + 2 files changed, 124 insertions(+) create mode 100644 pracuj-pl-scraping/main.py create mode 100644 pracuj-pl-scraping/requirements.txt diff --git a/pracuj-pl-scraping/main.py b/pracuj-pl-scraping/main.py new file mode 100644 index 0000000..65ee812 --- /dev/null +++ b/pracuj-pl-scraping/main.py @@ -0,0 +1,120 @@ +from selenium import webdriver +from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +import pandas as pd +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.base import MIMEBase +from email import encoders +from datetime import datetime + +now = datetime.now() +currentDate = now.strftime("%d/%m/%Y") + +# Please modify these variables +searchKeyword = "?" +searchLocation = "?" +senderAddress = "?" +senderKey = "?" +receiverAddress = "?" 
+ +# Set up Chrome (cross-platform: Windows / macOS / Linux) +chromeOptions = Options() +chromeOptions.add_experimental_option("detach", True) +chromeOptions.add_experimental_option("excludeSwitches", ["enable-automation"]) +chromeOptions.add_experimental_option("useAutomationExtension", False) +chromeOptions.add_argument("--disable-blink-features=AutomationControlled") +chromeOptions.add_argument("--start-maximized") + +service = Service(ChromeDriverManager().install()) +driver = webdriver.Chrome(service=service, options=chromeOptions) + +# Hide the webdriver flag to reduce bot detection +driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" +}) + +# Go to site and show results according to given keyword and location +driver.get("https://pracuj.pl") +driver.switch_to.window(driver.window_handles[0]) +WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Akceptuj wszystkie')]"))).click() +advancedSearch = driver.find_element(By.XPATH, '/html/body/main/div[1]/div/div/div[3]/form/div[2]/button') + +try: + advancedSearch.click() +except Exception: + advancedSearch = driver.find_element(By.XPATH, '/html/body/main/div[1]/div/div/div[3]/form/div[2]/button') + advancedSearch.click() + +try: + WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Rozumiem')]"))).click() +except Exception: + print("Can't find the button that contains 'Rozumiem'") + +jobSearchKeyword = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[1]/div/div[1]/div[1]/div/input[1]') +jobSearchKeyword.send_keys(searchKeyword) +jobSearchPreferredLocation = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[1]/div/div[2]/div[1]/div/input[1]') +jobSearchPreferredLocation.send_keys(searchLocation) +WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, 
'/html/body/div[1]/div[2]/div[4]/form/div[1]/div/div[4]/div/button'))).click() +WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Pozostałe')]"))).click() +WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[3]/div[1]/div/ul/li[6]/div/div[2]/div[2]/fieldset[1]/ul/li[1]/label'))).click() +WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Pokaż oferty')]"))).click() + +jobOffersList = [] + +# Collect each job offer from every available page of search result +while True: + jobOffers = driver.find_elements(By.CLASS_NAME, "offer__info") + + for offer in jobOffers: + jobLink = offer.find_element(By.CLASS_NAME, "offer-details__title-link") + jobLinkHref = jobLink.get_attribute("href") + jobTitle = jobLink.text + jobCompanyName = offer.find_element(By.CLASS_NAME, "offer-company__name").text + jobOfferListElement = { + 'link': jobLinkHref, + 'job title': jobTitle, + 'company name': jobCompanyName + } + jobOffersList.append(jobOfferListElement) + + # Last page detection + try: + nextPageButton = driver.find_element(By.CSS_SELECTOR, "li[class='pagination_element pagination_element--next']") + nextPageButton.click() + except Exception: + print("End page of search result") + break + +# Create table from existing data and make export to Excel +df = pd.DataFrame(jobOffersList) +df.to_excel('jobOffers.xlsx', index=False) + +# Build email message +message = MIMEMultipart() +message['From'] = senderAddress +message['To'] = receiverAddress +message['Subject'] = searchKeyword + " pracuj.pl - " + searchLocation + " - najnowsze oferty pracy! 
" + currentDate + +# Attach jobOffers.xlsx +jobOffersExcelFile = MIMEBase('application', "octet-stream") +jobOffersExcelFile.set_payload(open("jobOffers.xlsx", "rb").read()) +encoders.encode_base64(jobOffersExcelFile) +jobOffersExcelFile.add_header('Content-Disposition', 'attachment; filename="jobOffers.xlsx"') +message.attach(jobOffersExcelFile) + +# Make secure connection with smtp server through TLS and send a complete message +session = smtplib.SMTP('smtp.gmail.com', 587) +session.starttls() +session.login(senderAddress, senderKey) +messageAsText = message.as_string() +session.sendmail(senderAddress, receiverAddress, messageAsText) +session.quit() +print('Mail sent to ' + receiverAddress) + +# Close the webdriver +driver.quit() diff --git a/pracuj-pl-scraping/requirements.txt b/pracuj-pl-scraping/requirements.txt new file mode 100644 index 0000000..6fbc647 --- /dev/null +++ b/pracuj-pl-scraping/requirements.txt @@ -0,0 +1,4 @@ +openpyxl +pandas +selenium +webdriver-manager From 22eb2e99521f39c7120cf67fee08f2018eb66203 Mon Sep 17 00:00:00 2001 From: Kajetan Mieloch Date: Sun, 10 May 2026 12:22:21 +0200 Subject: [PATCH 2/2] fix: update selectors and pagination for current pracuj.pl layout - Replace all XPath selectors with data-test attributes (stable, not layout-dependent) - Switch from form interaction to direct URL search (/praca/{keyword};kw/{location};wp?rd=N) - Fix pagination: scroll to bottom + JS click to handle sticky footer - Add fallback link selector (link-offer-title vs link-offer for sponsored posts) - Collect salary and region fields (new fields available in current layout) - Add daysBack config variable (1=24h, 3=3 days, 30=month) - Add test_run.py for validating scraper without sending email Tested: 134 offers collected across 3 pages (Python/Warszawa/30 days) Co-Authored-By: Claude Sonnet 4.6 (1M context) --- pracuj-pl-scraping/main.py | 138 ++++++++++++++++++--------------- pracuj-pl-scraping/test_run.py | 112 ++++++++++++++++++++++++++ 2 files changed, 187 
insertions(+), 63 deletions(-) create mode 100644 pracuj-pl-scraping/test_run.py diff --git a/pracuj-pl-scraping/main.py b/pracuj-pl-scraping/main.py index 65ee812..5e04e95 100644 --- a/pracuj-pl-scraping/main.py +++ b/pracuj-pl-scraping/main.py @@ -5,12 +5,14 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC +from urllib.parse import quote import pandas as pd import smtplib from email.mime.multipart import MIMEMultipart from email.mime.base import MIMEBase from email import encoders from datetime import datetime +import time now = datetime.now() currentDate = now.strftime("%d/%m/%Y") @@ -22,6 +24,9 @@ senderKey = "?" receiverAddress = "?" +# How many days back to search (1 = last 24h, 3 = last 3 days, 30 = last month) +daysBack = 1 + # Set up Chrome (cross-platform: Windows / macOS / Linux) chromeOptions = Options() chromeOptions.add_experimental_option("detach", True) @@ -32,89 +37,96 @@ service = Service(ChromeDriverManager().install()) driver = webdriver.Chrome(service=service, options=chromeOptions) - -# Hide the webdriver flag to reduce bot detection driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" }) -# Go to site and show results according to given keyword and location -driver.get("https://pracuj.pl") -driver.switch_to.window(driver.window_handles[0]) -WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Akceptuj wszystkie')]"))).click() -advancedSearch = driver.find_element(By.XPATH, '/html/body/main/div[1]/div/div/div[3]/form/div[2]/button') - -try: - advancedSearch.click() -except Exception: - advancedSearch = driver.find_element(By.XPATH, '/html/body/main/div[1]/div/div/div[3]/form/div[2]/button') - advancedSearch.click() +# Navigate directly to search results +kwEncoded = quote(searchKeyword) 
+locEncoded = quote(searchLocation) +startUrl = f"https://www.pracuj.pl/praca/{kwEncoded};kw/{locEncoded};wp?rd={daysBack}" +driver.get(startUrl) +time.sleep(3) +# Accept cookies if present try: - WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Rozumiem')]"))).click() + WebDriverWait(driver, 10).until( + EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Akceptuj wszystkie')]")) + ).click() + time.sleep(2) except Exception: - print("Can't find the button that contains 'Rozumiem'") - -jobSearchKeyword = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[1]/div/div[1]/div[1]/div/input[1]') -jobSearchKeyword.send_keys(searchKeyword) -jobSearchPreferredLocation = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[1]/div/div[2]/div[1]/div/input[1]') -jobSearchPreferredLocation.send_keys(searchLocation) -WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[1]/div/div[4]/div/button'))).click() -WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Pozostałe')]"))).click() -WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[2]/div[4]/form/div[3]/div[1]/div/ul/li[6]/div/div[2]/div[2]/fieldset[1]/ul/li[1]/label'))).click() -WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Pokaż oferty')]"))).click() + pass jobOffersList = [] -# Collect each job offer from every available page of search result +# Collect offers from every page while True: - jobOffers = driver.find_elements(By.CLASS_NAME, "offer__info") - - for offer in jobOffers: - jobLink = offer.find_element(By.CLASS_NAME, "offer-details__title-link") - jobLinkHref = jobLink.get_attribute("href") - jobTitle = jobLink.text - jobCompanyName = offer.find_element(By.CLASS_NAME, "offer-company__name").text - jobOfferListElement = { - 'link': jobLinkHref, 
- 'job title': jobTitle, - 'company name': jobCompanyName - } - jobOffersList.append(jobOfferListElement) - - # Last page detection + WebDriverWait(driver, 15).until( + EC.presence_of_element_located((By.CSS_SELECTOR, "[data-test='default-offer']")) + ) + + offers = driver.find_elements(By.CSS_SELECTOR, "[data-test='default-offer']") + for offer in offers: + try: + try: + linkEl = offer.find_element(By.CSS_SELECTOR, "a[data-test='link-offer-title']") + except Exception: + linkEl = offer.find_element(By.CSS_SELECTOR, "a[data-test='link-offer']") + title = offer.find_element(By.CSS_SELECTOR, "[data-test='offer-title']").text.strip() + company = offer.find_element(By.CSS_SELECTOR, "[data-test='text-company-name']").text.strip() + link = linkEl.get_attribute("href") + + try: + region = offer.find_element(By.CSS_SELECTOR, "[data-test='text-region']").text.strip() + except Exception: + region = "" + + try: + salary = offer.find_element(By.CSS_SELECTOR, "[data-test='offer-salary']").text.strip() + except Exception: + salary = "" + + jobOffersList.append({ + "job title": title, + "company name": company, + "region": region, + "salary": salary, + "link": link, + }) + except Exception: + continue + + # Go to next page or stop try: - nextPageButton = driver.find_element(By.CSS_SELECTOR, "li[class='pagination_element pagination_element--next']") - nextPageButton.click() + driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + time.sleep(1) + nextBtn = driver.find_element(By.CSS_SELECTOR, "[data-test='bottom-pagination-button-next']") + driver.execute_script("arguments[0].click();", nextBtn) + time.sleep(2) except Exception: - print("End page of search result") break -# Create table from existing data and make export to Excel +# Export to Excel df = pd.DataFrame(jobOffersList) -df.to_excel('jobOffers.xlsx', index=False) +df.to_excel("jobOffers.xlsx", index=False) -# Build email message +# Build and send email message = MIMEMultipart() -message['From'] = 
senderAddress -message['To'] = receiverAddress -message['Subject'] = searchKeyword + " pracuj.pl - " + searchLocation + " - najnowsze oferty pracy! " + currentDate - -# Attach jobOffers.xlsx -jobOffersExcelFile = MIMEBase('application', "octet-stream") -jobOffersExcelFile.set_payload(open("jobOffers.xlsx", "rb").read()) -encoders.encode_base64(jobOffersExcelFile) -jobOffersExcelFile.add_header('Content-Disposition', 'attachment; filename="jobOffers.xlsx"') -message.attach(jobOffersExcelFile) - -# Make secure connection with smtp server through TLS and send a complete message -session = smtplib.SMTP('smtp.gmail.com', 587) +message["From"] = senderAddress +message["To"] = receiverAddress +message["Subject"] = f"{searchKeyword} pracuj.pl - {searchLocation} - najnowsze oferty pracy! {currentDate}" + +attachment = MIMEBase("application", "octet-stream") +attachment.set_payload(open("jobOffers.xlsx", "rb").read()) +encoders.encode_base64(attachment) +attachment.add_header("Content-Disposition", 'attachment; filename="jobOffers.xlsx"') +message.attach(attachment) + +session = smtplib.SMTP("smtp.gmail.com", 587) session.starttls() session.login(senderAddress, senderKey) -messageAsText = message.as_string() -session.sendmail(senderAddress, receiverAddress, messageAsText) +session.sendmail(senderAddress, receiverAddress, message.as_string()) session.quit() -print('Mail sent to ' + receiverAddress) +print(f"Mail sent to {receiverAddress}") -# Close the webdriver driver.quit() diff --git a/pracuj-pl-scraping/test_run.py b/pracuj-pl-scraping/test_run.py new file mode 100644 index 0000000..51e4437 --- /dev/null +++ b/pracuj-pl-scraping/test_run.py @@ -0,0 +1,112 @@ +from selenium import webdriver +from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.common.by import By +from 
selenium.webdriver.support import expected_conditions as EC +from urllib.parse import quote +import pandas as pd +import time + +searchKeyword = "Python" +searchLocation = "Warszawa" +daysBack = 30 + +chromeOptions = Options() +chromeOptions.add_experimental_option("excludeSwitches", ["enable-automation"]) +chromeOptions.add_experimental_option("useAutomationExtension", False) +chromeOptions.add_argument("--disable-blink-features=AutomationControlled") +chromeOptions.add_argument("--start-maximized") + +service = Service(ChromeDriverManager().install()) +driver = webdriver.Chrome(service=service, options=chromeOptions) +driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" +}) + +kwEncoded = quote(searchKeyword) +locEncoded = quote(searchLocation) +startUrl = f"https://www.pracuj.pl/praca/{kwEncoded};kw/{locEncoded};wp?rd={daysBack}" +print(f"URL: {startUrl}") +driver.get(startUrl) +time.sleep(3) + +try: + WebDriverWait(driver, 10).until( + EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Akceptuj wszystkie')]")) + ).click() + time.sleep(2) + print("Cookies zaakceptowane.") +except Exception: + print("Brak cookies popup.") + +jobOffersList = [] +page = 1 + +while True: + print(f"\n--- Strona {page} ---") + try: + WebDriverWait(driver, 15).until( + EC.presence_of_element_located((By.CSS_SELECTOR, "[data-test='default-offer']")) + ) + except Exception as e: + print(f"Brak ofert na stronie: {e}") + break + + offers = driver.find_elements(By.CSS_SELECTOR, "[data-test='default-offer']") + print(f"Ofert na stronie: {len(offers)}") + + for offer in offers: + try: + try: + linkEl = offer.find_element(By.CSS_SELECTOR, "a[data-test='link-offer-title']") + except Exception: + linkEl = offer.find_element(By.CSS_SELECTOR, "a[data-test='link-offer']") + title = offer.find_element(By.CSS_SELECTOR, "[data-test='offer-title']").text.strip() + company = 
offer.find_element(By.CSS_SELECTOR, "[data-test='text-company-name']").text.strip() + link = linkEl.get_attribute("href") + try: + region = offer.find_element(By.CSS_SELECTOR, "[data-test='text-region']").text.strip() + except Exception: + region = "" + try: + salary = offer.find_element(By.CSS_SELECTOR, "[data-test='offer-salary']").text.strip() + except Exception: + salary = "" + jobOffersList.append({ + "job title": title, + "company name": company, + "region": region, + "salary": salary, + "link": link, + }) + except Exception as e: + print(f" Błąd przy parsowaniu: {e}") + continue + + try: + driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + time.sleep(1) + nextBtn = driver.find_element(By.CSS_SELECTOR, "[data-test='bottom-pagination-button-next']") + driver.execute_script("arguments[0].click();", nextBtn) + page += 1 + time.sleep(3) + if page > 3: + print("Zatrzymano na stronie 3 (test).") + break + except Exception: + print("Ostatnia strona.") + break + +print(f"\n=== WYNIKI ===") +print(f"Zebrano ofert łącznie: {len(jobOffersList)}") + +if jobOffersList: + df = pd.DataFrame(jobOffersList) + df.to_excel("jobOffers.xlsx", index=False) + print("Zapisano do jobOffers.xlsx") + print(df[["job title", "company name", "salary"]].head(10).to_string()) + +driver.quit() +print("\nTest zakończony.")