From 0c85c115556890be65086d9ce8cecabcafe02038 Mon Sep 17 00:00:00 2001 From: Hasnain190 Date: Wed, 29 Oct 2025 21:20:55 +0500 Subject: [PATCH 1/2] refactor: update browser path, modified: added conditional logic for continuous check to match all the searches, modified requirements.txt to install latest packages for python 3.13 --- .gitignore | 7 +--- main.py | 15 +++++--- requirements.txt | 19 +++++----- result.csv | 96 +++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 116 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index eac93d3e..f514b74c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,2 @@ -__pycache__/ -*.pyc -result.csv -output.csv -playwright/.cache/ \ No newline at end of file +# Created by venv; see https://docs.python.org/3/library/venv.html +* diff --git a/main.py b/main.py index e463d4dd..0e310dcc 100644 --- a/main.py +++ b/main.py @@ -113,13 +113,13 @@ def scrape_places(search_for: str, total: int) -> List[Place]: places: List[Place] = [] with sync_playwright() as p: if platform.system() == "Windows": - browser_path = r"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" + browser_path = r"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe" browser = p.chromium.launch(executable_path=browser_path, headless=False) else: browser = p.chromium.launch(headless=False) page = browser.new_page() try: - page.goto("https://www.google.com/maps/@32.9817464,70.1930781,3.67z?", timeout=60000) + page.goto("https://www.google.com/maps/@33.6207562,72.7564306,10z?", timeout=60000) page.wait_for_timeout(1000) page.locator('//input[@id="searchboxinput"]').fill(search_for) page.keyboard.press("Enter") @@ -131,6 +131,11 @@ def scrape_places(search_for: str, total: int) -> List[Place]: page.wait_for_selector('//a[contains(@href, "https://www.google.com/maps/place")]') found = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count() logging.info(f"Currently Found: {found}") + if found < total: + time.sleep(2) + page.mouse.wheel(0, 10000) + found = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count() + if found >= total: break if found == previously_counted: @@ -140,7 +145,7 @@ def scrape_places(search_for: str, total: int) -> List[Place]: listings = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').all()[:total] listings = [listing.locator("xpath=..") for listing in listings] logging.info(f"Total Found: {len(listings)}") - for idx, listing in enumerate(listings): + for i, listing in enumerate(listings): try: listing.click() page.wait_for_selector('//div[@class="TIHn2 "]//h1[@class="DUwDvf lfPIob"]', timeout=10000) @@ -149,9 +154,9 @@ def scrape_places(search_for: str, total: int) -> List[Place]: if place.name: places.append(place) else: - logging.warning(f"No name found for listing {idx+1}, skipping.") + logging.warning(f"No name found for listing {i+1}, skipping.") except Exception as e: - logging.warning(f"Failed to extract listing {idx+1}: {e}") + logging.warning(f"Failed to extract listing {i+1}: {e}") finally: browser.close() return places diff --git a/requirements.txt b/requirements.txt index 9dd0bed6..e831e4f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,10 @@ -et-xmlfile==1.1.0 -greenlet==3.0.3 -numpy==1.26.4 -openpyxl==3.1.2 -pandas==2.2.2 -playwright==1.44.0 -pyee==11.1.0 +greenlet==3.2.4 +numpy==2.3.4 +pandas==2.3.3 +playwright==1.55.0 +pyee==13.0.0 python-dateutil==2.9.0.post0 -pytz==2024.1 -six==1.16.0 -typing_extensions==4.12.0 +pytz==2025.2 +six==1.17.0 +typing_extensions==4.15.0 +tzdata==2025.2 diff --git a/result.csv b/result.csv index 82d5976e..9898c360 100644 --- a/result.csv +++ b/result.csv @@ -1 +1,95 @@ -name,address,website,phone_number,reviews_count,reviews_average,place_type,opens_at \ No newline at end of file +name,address,website,phone_number,reviews_count,reviews_average,store_shopping,in_store_pickup,store_delivery,place_type,opens_at +Waseem Paint Services,"House # G-13، Street #6-Left Lehtrar Road، P/O Tarlai Ghauri Garden, Islamabad, 44000, Pakistan",,+92 343 4883331,23.0,5.0,No,No,No,Painter, +Hassan ali paint services Private Limited,"Street 38, F-8/1 F 8/1 F-8, Islamabad, Pakistan",hassanalipaintserviceexpertsislamabad.com,+92 314 5700786,19.0,4.9,No,No,No,Painter, +paint pandas,"Gate # 1, 146/5 Street 14, opp. DHA phase 2, Fatima Villas, Islamabad, 43000, Pakistan",paintpandas.com,+92 309 7759479,12.0,4.9,No,No,No,Painter, +Fine Artist (Fauzia H),"House #106 Street 39, G-8/2 G 8/2 G-8, Islamabad, 44000, Pakistan",fhartstore.wordpress.com,+92 336 5515529,86.0,5.0,No,No,No,Artist, +"Unique paint service1, wall paint selling paint wood Polish Deco Professional painters& wall PVC Pinnels install","shop new, Saafi traders, Nazmodeen rode Khokhar Chowk, G11, F-11, Islamabad, 44100, Pakistan",facebook.com,+92 317 5800321,23.0,4.8,No,No,No,Painter, +Professional painter and artist Islamabad,"E, Jamia Darul huda, 13 Golra Rd, E-12, Islamabad, Pakistan",,+92 333 1000173,44.0,5.0,No,No,No,Art studio, +Habib paint services and wood polish,,,+92 300 5578217,57.0,4.9,No,No,No,Painter, +Azan Paints Services,"Subhan real estate, Street 9, jan colony scheme 3, Rawalpindi, 54000, Pakistan",paint.lissoftware.com,+92 311 1158155,122.0,5.0,No,No,No,Painting, +Candid International Construction Pvt. Ltd(Paint Contractors Painters),"Beside Imtiaz Mega Free Parking, Babu Sher Plaza First Floor Office Number 01, 92331, Pakistan",,+92 331 5892878,12.0,4.8,No,No,No,Contractor, Opens 9AM Thu +Waheed Paint Services - Rawalpindi & Islamabad,,,+92 312 5672967,16.0,5.0,No,No,No,Painter, +Painters,"City Plaza, Tipu Sultan Rd, I-8 Markaz, I-8, I 8 Markaz I-8, Islamabad, 44000, Pakistan",,+92 321 5268774,1.0,5.0,No,No,No,, +The Handyman Pvt. Limited,"Floor, 4-A، Utility Plaza، Office 1, Street 2, F-8 Markaz Islamabad, 44000, Pakistan",thehandyman.com.pk,+92 800 426 39,102.0,4.0,No,No,No,Handyman/Handywoman/Handyperson, Opens 9AM Thu +Fahad Builder & Painter Services,"Cantt Street 9, Ch. Jan Colony Chaklala Cantt., Rawalpindi, 46000, Pakistan",,+92 316 2058432,66.0,4.7,No,No,No,Painter, Opens 9AM Thu +Islamabad Rawalpindi Painter,"Town, Street number 68, 1 Phase, Islamabad, 44000, Pakistan",,+92 303 6959143,7.0,4.1,No,No,No,Painter, Opens 7AM Thu +Subhan Paint and Polish Services,,,+92 370 5790565,24.0,5.0,No,No,No,Painter, +Ms building painters,"Hub commercial, Phase 7 bahria town phase 8, Islamabad, 46620, Pakistan",,+92 334 1222313,7.0,5.0,No,No,No,Painter, +AA RockWall,"Ghouri, Sharifabad Rd, Town, Islamabad, 44000, Pakistan",sites.google.com,+92 336 5269023,17.0,5.0,No,No,No,Painter, Opens 9AM Thu +City Rockwall & Material,"MXGM+7P9, Street 4, F-11/1 F 11/1 F-11, Islamabad, Pakistan",,+92 313 5384881,3.0,4.7,No,No,No,Painter, Opens 8AM Thu +Brighto Paints Shop-PAK Traders (Authorised Dealer),"Civic Centre, Block A Jinnah Garden, Islamabad, 46000, Pakistan",facebook.com,+92 333 8711836,54.0,4.9,Yes,No,Yes,Paint store, Opens 8:30AM Thu +Building maintenance islamabad,"plot # 212 east service road industrial area, I-10/3 I 10/3 Islamabad, 44000, Pakistan",facebook.com,+92 331 9944677,80.0,5.0,No,No,No,Building restoration service, Opens 9AM Thu +Gallery 6 Islamabad,"Second Floor، Al Abraaj Centre، Super Market، behind Shaheen Chemist, F-6 Markaz F 6 Markaz F-6, Islamabad, 44000, Pakistan",gallery6islamabad.com,+92 332 2101206,113.0,4.5,No,No,No,Art gallery, Opens 11AM Thu +Painter. Polish design,"G-11/1 G-12, Islamabad, 43701, Pakistan",,+92 307 8861399,3.0,5.0,No,No,No,Painter, +Fancy Paint Polish Deco,"Malik market, Shah Allah Ditta Rd, F-12, Islamabad, 44000, Pakistan",paintpolishdeco.com,+92 301 4717784,20.0,5.0,No,No,No,Home improvement store, Closes 11PM +A-one Paint Services,"Civic Center, Phase 1 Jinnah Garden, Islamabad, 44000, Pakistan",paintshop.pk,+92 333 8711836,7.0,5.0,No,No,No,Painter, Opens 8AM Thu +Shezi Paint Services in Rawalpindi and Islamabad,,,+92 305 9158284,6.0,4.8,No,No,No,Painter, +Asia Paint contractor,"Madina, Pindoriyan road barma town, town, Islamabad, 45550, Pakistan",,+92 300 8527744,2.0,5.0,No,No,No,Painter, Opens 9AM Thu +Zarqoon Painter Services,"J274+MC6, Road, Allahabad, Rawalpindi, 46000, Pakistan",,+92 310 1809132,9.0,4.6,No,No,No,Painter, +Islamabad All Home maintenance work and repair available,"St 25, G-11/1 G 11/2 G-11, Islamabad, 46000, Pakistan",wa.me,+92 331 9934309,85.0,4.8,No,No,No,Construction company,Open 24 hours +Islamabad Color Centre,"Shop 3&4, Islamabad Color Centre, Millat plaza, Markaz, next to HBL bank, F-10 Markaz F 10/4 F-10, Islamabad, Pakistan",,+92 337 0400007,17.0,4.8,Yes,Yes,Yes,Paint store, Opens 9AM Thu +Paint work service,"H4HW+GJ6, St 11, Pakistan Town Phase 1 Pakistan Town, Islamabad, Pakistan",,+92 335 8537481,,,No,No,No,Painter, +Islamabad work service,"MX5C+3G9 Nazmodeen rode, khokar Chowk f11, G-12, Islamabad, 44100, Pakistan",facebook.com,+92 325 5007866,2.0,5.0,No,No,No,Painter, +Nothing,"near Islamabad Model School for Boys, opposite the solution squad, Koral, Islamabad, 47331, Pakistan",,+92 342 5033991,1.0,5.0,No,No,No,Painter, Opens 9AM Thu +Smart House Painter,,,+92 309 0557357,8.0,5.0,No,No,No,Painter, Opens 8AM Thu +Excellent Modern Interior & Exterior paint work services,,,+92 310 1585332,7.0,5.0,No,No,No,Painter, +Jotun Paints Islamabad (STRUCTURE),"I & T Center, Shop 5, Block 35 Kibriya Plaza, G-9/4 G 9/4 G-9, Islamabad, 44000, Pakistan",facebook.com,+92 51 8311138,30.0,3.9,No,Yes,Yes,Paint store, Opens 9AM Thu +Paint for house,"St. 10, Farash Town, Islamabad, 45600, Pakistan",,+92 313 0734946,,,No,No,No,Painter, 10PM +"Weldpac - Home Maintenance (Carpenter, Electrician, Plumber, Painter, Mason, Welder)","H.No E, 158, New Parian Satellite Town, Rawalpindi, 46000, Pakistan",weldpac.com,+92 305 8717079,76.0,4.6,No,No,No,Handyman/Handywoman/Handyperson, Opens 8AM Thu +"Ideal Furniture Shop, Polishing At Your Doorstep Rwp Isb.","J27W+97G, Babu Lal Hussain Rd, Dhoke Ratta, Rawalpindi, 46000, Pakistan",,+92 342 5376887,20.0,5.0,No,No,No,Painter, +Membrane Sheets Hub,"Main Lehterar Road, St#8, Khanna Pul, near Rawal Hospital, Islamabad, 45551, Pakistan",thenextrenovation.com,+92 322 8005624,12.0,5.0,No,No,No,Construction company, +Tameer Enterprises waterproofing in Islamabad & building maintenance,"office No 10, 2nd floor aaley plaza fazal-e-haq road, 44000, Pakistan",tameerenterprises.com,+92 300 5000228,41.0,4.7,No,No,No,Waterproofing service, +Tabbassum Art – sketch Calligraphy & Painting,"Sector O-9 Block B Pwd, Islamabad, 45750, Pakistan",,+92 370 4110315,114.0,4.9,No,No,No,Art studio, Closes 11:30PM +Painters Shop,"Chaudhry Plaza, Sumbal Rd, F-10 Markaz, F-10, F 10/3 F-10, Islamabad, 44000, Pakistan",,+92 300 5071214,2.0,5.0,No,No,No,Building materials store, +Fresh Coat Paint and Painters Pakistan,"RCCI۔ Rawat, Rawat Industrial Estate, Islamabad, Pakistan",freshcoatpainters.pk,+92 333 6166972,,,No,No,No,Paint manufacturer, Opens 9AM Thu +Usman Engineering & Co,"Block No 8 Abbasi Market, Shop 2, 44220, Pakistan",,+92 333 5372856,98.0,4.9,No,No,No,Handyman/Handywoman/Handyperson, +Shamsher Paint & Polish Works,,,+92 336 7305098,1.0,5.0,No,No,No,Painter, +Shamsher Paint & Polish Works,,,+92 336 7305098,1.0,5.0,No,No,No,Painter, +Rizwan Paint Services,,,+92 347 5092426,3.0,4.7,No,No,No,Painter, Opens 7AM Thu +Waqar Brothers Paint & Hardware Store,"Jinnah Ave, Mohran Jejan, Islamabad, 44000, Pakistan",waqarbrothers.pk,+92 335 5254446,93.0,4.7,Yes,Yes,Yes,Paint store, Opens 10AM Thu +Rafay paint services,"Block-A Street 2, near chaklala scheme-3, Dhok Chaudhrian, Rawalpindi, 46000, Pakistan",,+92 345 9734176,1.0,5.0,No,No,No,Painter, +Islamabad plumber and electricians,"G-12, Islamabad, 46000, Pakistan",maintainfix.com,+92 301 5374005,381.0,4.8,No,No,No,Plumber, +"Nayakam - Find Handyman, Electrician, Plumber & Carpenter","Office #109, First Floor, Grand Xcito, Block D Markaz Gulberg Residencia, Islamabad, 44000, Pakistan",nayakam.com,+92 333 5410090,38.0,4.5,No,No,No,Handyman/Handywoman/Handyperson, +Mehran Experts,"Office 04, Street 01 St 1, G-12, Islamabad, 44000, Pakistan",mehranexperts.com,+92 300 0888121,209.0,4.9,No,No,No,Construction company, 10PM +Muhammad Hashim paint service,"M2QM+P36, G-9 Markaz G 9 Markaz G-9, Islamabad, Pakistan",facebook.com,+92 347 9995551,,,No,No,No,Painter, +Painters,"H2MJ+9FR, Abid Majeed Rd, Mughalabad, Tench Bhata, Tench Bhata, Rawalpindi, 46000, Pakistan",,+92 300 9125567,2.0,5.0,No,No,No,Painter, +Paint And finishing,"office 227, Pakistan town phase 01 , street 16, Phase 1, Pakistan town, Islamabad, 44000, Pakistan",,+92 302 7141931,3.0,5.0,No,No,No,Painter, +Sahi Karao,"H565+J5C Ideal Homes The name of Trust Shahnoor Arcade, Street 4, Soan Gardens Block F Islamabad, 45720, Pakistan",sahikarao.com,+92 300 1000009,11.0,5.0,No,No,Yes,Home improvement store, Opens 10AM Thu +R&W maintenance and all paints work services,,,+92 331 5770761,2.0,5.0,No,No,No,Painter, +Salamat Denter & Spray Painter,"J389+QC2, Zafar ul Haq Rd, Taili Mohalla Committee Mohalla Waris Khan, Rawalpindi, 46000, Pakistan",,+92 300 5302162,9.0,4.2,No,No,No,Painter, Opens 9AM Thu +Paint & polish service,"Aslam market, Shah Allah, road, F-12, Islamabad, 09876, Pakistan",,+92 310 8101343,,,No,No,No,Painter, +Umar Qadeer Spray Painter,"JXCH+9Q4, Main Peshawar Rd, Naseerabad Rawalpindi, 46000, Pakistan",,+92 301 5464862,1.0,5.0,No,No,No,Painter, +Moon rook wall & epoxy paint,,,+92 300 9715983,,,No,No,No,Painter, +Moon rook wall & epoxy paint,,,+92 300 9715983,,,No,No,No,Painter, +ApnaPainter,"Bostan Khan Rd, Chaklala 3 Dhok Chaudhrian, Rawalpindi, 46000, Pakistan",,+92 311 5357055,1.0,5.0,No,No,No,Painting, Opens 9AM Thu +Wajid Building construction & Maintenance Services,"Jammu & Kashmir Housing Society G 15/3 G-15, Islamabad, Pakistan",,+92 313 1818145,18.0,5.0,No,No,No,Construction company, Closes 12AM +R..R auto painting workshop,"J264+F3P, Allahabad, Rawalpindi, 46000, Pakistan",,+92 343 5065009,,,No,No,No,Painter, +Hussain traders G-11/1 Branch 01,"Service Rd W, opp. Street 22, G-11/1 G 11/1 G-11, Islamabad, Pakistan",,+92 321 5253076,26.0,4.1,No,Yes,Yes,Paint store, Opens 9AM Thu +Rehmani Engineers,"Main street, 4th Ave, Muslim Colony Nurpur Shahan, Islamabad, 44000, Pakistan",rehmaniengineers.com,+92 333 5132852,80.0,4.9,No,No,No,Construction company, +Awanzada Builders,"7th Avenue, Justice Abdul Rasheed Rd, F-6/1 Islamabad, Pakistan",,+92 333 5289574,,,No,No,No,Painter, +Buildream Construction,,,+92 333 1913536,7.0,5.0,No,No,No,Construction company, 10PM +Hammad Painter's Group,"PRJM+454, Sir Syed road, near mora choke, Wah Cantt, 47010, Pakistan",,+92 300 5883834,9.0,5.0,No,No,No,Painter, +Painter. Polish design,,,+92 307 8861399,,,No,No,No,Painter, +wood renovation,"Shop #1 #Plot #126.I&tcenter, Islamabad, 44002, Pakistan",woodreno.com,+92 334 9000098,,,No,No,No,Interior designer, +Islamabad Art Gallery & Framing,"Super market, F-6 Markaz F 6 Markaz F-6, Islamabad, 44000, Pakistan",,+92 51 2723191,6.0,5.0,No,No,No,Art gallery, +aqleem colour painters,"sarbri darbar wali Gali Line, Chakri road, Street No 12, Khayaban e Shifa Shadman Town, Rawalpindi, Pakistan",,+92 312 8707669,,,No,No,No,Painter, +HussainBrothersinterior pvc & Construction & Home finishing,"Service Rd, near chodary masjid, Sadiqabad Ali Abad, Rawalpindi, 46000, Pakistan",youtube.com,+92 315 5086172,59.0,4.7,No,No,No,Interior designer, 10PM +"Wall Decor Islamabad | Homes, Office, stores, Hotels, Gym, 3DWallpaper Shop","J4JC+6H Wallpaper shop, Khanna Islamabad, 45551, Pakistan",website3.me,+92 340 6661766,70.0,4.9,Yes,Yes,Yes,Wallpaper store, 10PM +پاکستان ہزارہ ہوٹل,"M2MX+Q56 Zilla tax, H 8/2 H-8, Islamabad, Pakistan",,,,,No,No,No,Painter, +Aamir painter,"J4M3+M2C, Shakrial Professors Colony Shakrial, Rawalpindi, 46000, Pakistan",,,,,No,No,No,Home builder, +Stone Cover,"Ayubia rock wall graffiato, main High Ct Rd, Phase 2 Gulrez Housing Scheme, Rawalpindi, 44000, Pakistan",stonecover.net,+92 300 4000653,2.0,5.0,No,No,No,Painter, Opens 9AM Thu +Street Painting and sketch,"P3HF+5MX, F-6 Markaz F 6 Markaz F-6, Islamabad, Pakistan",,,,,No,No,No,Painting, Opens 12PM Thu +Aouf Traders,"Opposite kurram palaza Near Total Pump Fateh Jang, Road, Tarnol, Islamabad, 45211, Pakistan",,+92 335 5509977,14.0,5.0,No,No,Yes,Paint store, Opens 8AM Thu +Studio Asrar Farooqi,"office# 101, 1st floor, FECHS E 11/2 E-11, Islamabad, Pakistan",,+92 300 5306545,1.0,5.0,No,No,No,Art studio, +Minhal Traders-construction - renovation service rawalpindi islamabad,"Dhowk Awan, Islamabad, Pakistan",,+92 301 5727775,11.0,4.8,No,No,No,Contractor, Opens 9AM Thu +"ROYAL CORPORATION PAINTS, HARDWARE, SANITARY AND ELECTRIC STORE IN BAHRIA TOWN","Plaza No. 15, Street No.24, Rafi, Rafi Shopping Centre, Commercial, Phase 8 Rafi Block Bahria Town, Islamabad, Pakistan",royalcorpration.pk,+92 324 5224800,13.0,5.0,Yes,No,Yes,Paint store, Opens 8AM Thu +JAVED PICTURE FRAMING,"Khayaban-e-Suhrwardy Rd, G-6/1 G 6/1 G-6, Islamabad, 44000, Pakistan",instagram.com,+92 333 5234758,68.0,4.8,Yes,Yes,Yes,Picture frame shop, Opens 10AM Thu +AURORA PAKISTAN (Pvt.) Ltd,"Sunrise Arcade plaza DHA 2, Gate 2 GT Rd, near DHA-II, 44000, Pakistan",aurorapakistan.com,+92 300 5008786,165.0,5.0,No,No,No,Construction company, Opens 10AM Thu +Islamabad Art Gallery,"P23C+XGF, F-9, Islamabad, Pakistan",,+92 315 6993443,77.0,4.2,No,No,No,Art gallery, Opens 9AM Thu +Rajput Rock Wall & Paint & Polish,"Chaklala Scheme 3 Bangash Street Near Car Chowk, 44000, Pakistan",,+92 306 7867867,1.0,2.0,No,No,No,Painter, +Sadeeq decoration sticker,"J2JG+736 Raja market carriage factory, I J P Rd، Dhoke Hassu, Rawalpindi, 46000, Pakistan",,+92 333 5166434,1.0,5.0,No,No,No,Painter, +Nomi denting painting,"PRH5+VW9, Taxila, Pakistan",,,1.0,1.0,No,No,No,Painter, +Building painter wah cantt,"New city, Wah Cantt, 47010, Pakistan",,+92 317 5409060,8.0,4.5,No,No,No,Painter, +Sadaqat arts painter,"HJ6R+X48, Masjid Al-Murtaza Rd, Fateh Jang, Pakistan",,+92 301 5325633,3.0,5.0,No,No,No,Painter, Opens 8AM Thu +Faisaletown phase I,"JV58+XV2, Block A Faisal Town, Islamabad, Pakistan",,+92 336 5531959,,,No,No,No,Painter, +Shah Buildings Paintr,"J3HC+WGJ, Service Rd N, Asghar Mall Scheme, Rawalpindi, Pakistan",,+92 305 6599881,1.0,5.0,No,No,No,Painter, From f875c0630e09f1d75b32c8ab8c39e06ffe0b7b59 Mon Sep 17 00:00:00 2001 From: Arman Date: Wed, 17 Jun 2026 18:12:20 +0330 Subject: [PATCH 2/2] Make scraper resilient to Google DOM changes + consent handling, geo data, headless flag --- README.md | 18 +- main.py | 544 ++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 419 insertions(+), 143 deletions(-) diff --git a/README.md b/README.md index e1f618ad..bac96688 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,16 @@ To do a custom web scraping project you can find me on Upwork ## Table of Contents -- [Prerequisites](#prerequisites) -- [Key Features](#key-features) -- [Installation](#installation) -- [Usage](#usage) -- [Example](#example) -- [Notes](#notes) -- [Video Example](#video-example) +- [Google-Maps-Scrapper](#google-maps-scrapper) + - [Table of Contents](#table-of-contents) + - [Prerequisites](#prerequisites) + - [Key Features](#key-features) + - [Installation](#installation) + - [Usage](#usage) + - [Example](#example) + - [Notes](#notes) + - [Video Example](#video-example) + - [License](#license) ## Prerequisites - Python 3.8 or 3.9 (Python 3.10+ may not be compatible with some dependencies) @@ -57,6 +60,7 @@ Run the script with your desired search term and number of results: ```bash python main.py -s "Turkish Restaurants in Toronto Canada" -t 20 +python main.py -s "Vitamin in Tehran Iran" -t 20 ``` - `-s` or `--search`: Search query for Google Maps (default: "turkish stores in toronto Canada") diff --git a/main.py b/main.py index 0e310dcc..ae3cb0aa 100644 --- a/main.py +++ b/main.py @@ -1,13 +1,14 @@ import logging +import re from typing import List, Optional -from playwright.sync_api import sync_playwright, Page -from dataclasses import dataclass, asdict +from playwright.sync_api import sync_playwright, Page, TimeoutError as PlaywrightTimeoutError +from dataclasses import dataclass, asdict, field import pandas as pd import argparse -import platform import time import os + @dataclass class Place: name: str = "" @@ -22,6 +23,11 @@ class Place: place_type: str = "" opens_at: str = "" introduction: str = "" + plus_code: str = "" + latitude: Optional[float] = None + longitude: Optional[float] = None + google_maps_url: str = "" + def setup_logging(): logging.basicConfig( @@ -29,165 +35,431 @@ def setup_logging(): format='%(asctime)s - %(levelname)s - %(message)s', ) -def extract_text(page: Page, xpath: str) -> str: - try: - if page.locator(xpath).count() > 0: - return page.locator(xpath).inner_text() - except Exception as e: - logging.warning(f"Failed to extract text for xpath {xpath}: {e}") + +def safe_text(page: Page, selectors: List[str]) -> str: + """Try a list of CSS/XPath selectors and return the first non-empty inner_text.""" + for sel in selectors: + try: + loc = page.locator(sel).first + if loc.count() > 0: + txt = loc.inner_text(timeout=2000).strip() + if txt: + return txt + except Exception: + continue + return "" + + +def safe_attr(page: Page, selectors: List[str], attr: str) -> str: + for sel in selectors: + try: + loc = page.locator(sel).first + if loc.count() > 0: + val = loc.get_attribute(attr, timeout=2000) + if val: + return val.strip() + except Exception: + continue return "" -def extract_place(page: Page) -> Place: - # XPaths - name_xpath = '//div[@class="TIHn2 "]//h1[@class="DUwDvf lfPIob"]' - address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]' - website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]' - phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]' - reviews_count_xpath = '//div[@class="TIHn2 "]//div[@class="fontBodyMedium dmRWX"]//div//span//span//span[@aria-label]' - reviews_average_xpath = '//div[@class="TIHn2 "]//div[@class="fontBodyMedium dmRWX"]//div//span[@aria-hidden]' - info1 = '//div[@class="LTs0Rc"][1]' - info2 = '//div[@class="LTs0Rc"][2]' - info3 = '//div[@class="LTs0Rc"][3]' - opens_at_xpath = '//button[contains(@data-item-id, "oh")]//div[contains(@class, "fontBodyMedium")]' - opens_at_xpath2 = '//div[@class="MkV9"]//span[@class="ZDu9vd"]//span[2]' - place_type_xpath = '//div[@class="LBgpqf"]//button[@class="DkEaL "]' - intro_xpath = '//div[@class="WeS02d fontBodyMedium"]//div[@class="PYvSYb "]' +def dismiss_consent(page: Page): + """Try to dismiss Google's consent/cookie banner if present, then wait for maps to be ready.""" + if "consent." not in page.url and "consent?" not in page.url: + # Quick check for inline banner; if none, return fast + try: + page.wait_for_selector( + 'button:has-text("Accept all"), button:has-text("Reject all"), ' + 'button:has-text("I agree"), form[action*="consent"]', + timeout=2500, + ) + except PlaywrightTimeoutError: + return + + consent_selectors = [ + 'button:has-text("Accept all")', + 'button:has-text("I agree")', + 'button:has-text("Reject all")', + 'form[action*="consent"] button[type="submit"]', + 'button[aria-label*="Accept"]', + 'button[aria-label*="Agree"]', + ] + for sel in consent_selectors: + try: + btn = page.locator(sel).first + if btn.count() > 0 and btn.is_visible(): + try: + with page.expect_navigation(timeout=8000, wait_until="domcontentloaded"): + btn.click(timeout=3000) + except PlaywrightTimeoutError: + pass + except Exception: + pass + page.wait_for_timeout(800) + logging.info("Dismissed consent dialog") + break + except Exception: + continue + + # If we ended up on consent.google.com still, force back to maps. + if "consent." in page.url: + try: + page.goto("https://www.google.com/maps?hl=en", timeout=60000) + page.wait_for_load_state("domcontentloaded", timeout=30000) + except Exception: + pass + + +def parse_coords_from_url(url: str) -> (Optional[float], Optional[float]): + """Extract lat/lng from a Google Maps place URL (e.g. !3d35.7..!4d51.4..).""" + if not url: + return None, None + m = re.search(r'!3d(-?\d+\.\d+)!4d(-?\d+\.\d+)', url) + if m: + try: + return float(m.group(1)), float(m.group(2)) + except Exception: + pass + m = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', url) + if m: + try: + return float(m.group(1)), float(m.group(2)) + except Exception: + pass + return None, None + + +def extract_place(page: Page) -> Place: place = Place() - place.name = extract_text(page, name_xpath) - place.address = extract_text(page, address_xpath) - place.website = extract_text(page, website_xpath) - place.phone_number = extract_text(page, phone_number_xpath) - place.place_type = extract_text(page, place_type_xpath) - place.introduction = extract_text(page, intro_xpath) or "None Found" - - # Reviews Count - reviews_count_raw = extract_text(page, reviews_count_xpath) - if reviews_count_raw: + + # Name — try h1 in the main panel. + place.name = safe_text(page, [ + 'h1.DUwDvf', + '//h1[contains(@class, "DUwDvf")]', + '//div[contains(@role,"main")]//h1', + ]) + + # Address + place.address = safe_text(page, [ + 'button[data-item-id="address"] >> div.fontBodyMedium', + '//button[@data-item-id="address"]//div[contains(@class,"fontBodyMedium")]', + '//button[@aria-label[starts-with(., "Address:")]]', + ]) + # Strip leading icon text if any + if place.address.lower().startswith("address:"): + place.address = place.address.split(":", 1)[1].strip() + + # Website + place.website = safe_text(page, [ + 'a[data-item-id="authority"] >> div.fontBodyMedium', + '//a[@data-item-id="authority"]//div[contains(@class,"fontBodyMedium")]', + '//a[contains(@aria-label,"Website")]//div[contains(@class,"fontBodyMedium")]', + ]) + + # Phone + place.phone_number = safe_text(page, [ + 'button[data-item-id^="phone:tel:"] >> div.fontBodyMedium', + '//button[starts-with(@data-item-id,"phone:tel:")]//div[contains(@class,"fontBodyMedium")]', + '//button[contains(@aria-label,"Phone")]//div[contains(@class,"fontBodyMedium")]', + ]) + if not place.phone_number: + # fall back to data-item-id value itself + attr = safe_attr(page, ['button[data-item-id^="phone:tel:"]'], 'data-item-id') + if attr and attr.startswith("phone:tel:"): + place.phone_number = attr.replace("phone:tel:", "").strip() + + # Plus code + place.plus_code = safe_text(page, [ + 'button[data-item-id="oloc"] >> div.fontBodyMedium', + '//button[@data-item-id="oloc"]//div[contains(@class,"fontBodyMedium")]', + ]) + + # Place type / category + place.place_type = safe_text(page, [ + 'button.DkEaL', + '//button[contains(@class,"DkEaL")]', + '//div[contains(@class,"LBgpqf")]//button', + ]) + + # Introduction / description + place.introduction = safe_text(page, [ + 'div.PYvSYb', + '//div[contains(@class,"PYvSYb")]', + '//div[contains(@class,"WeS02d")]//div[contains(@class,"PYvSYb")]', + ]) or "None Found" + + # Reviews — average and count are commonly displayed near the title. + avg_raw = safe_text(page, [ + 'div.F7nice >> span[aria-hidden="true"]', + '//div[contains(@class,"F7nice")]//span[@aria-hidden="true"]', + '//div[contains(@class,"fontDisplayLarge")]', + ]) + if avg_raw: try: - temp = reviews_count_raw.replace('\xa0', '').replace('(','').replace(')','').replace(',','') - place.reviews_count = int(temp) - except Exception as e: - logging.warning(f"Failed to parse reviews count: {e}") - # Reviews Average - reviews_avg_raw = extract_text(page, reviews_average_xpath) - if reviews_avg_raw: + place.reviews_average = float(avg_raw.replace(',', '.').strip()) + except Exception: + pass + + count_raw = safe_text(page, [ + 'div.F7nice >> span[aria-label*="review" i]', + '//div[contains(@class,"F7nice")]//span[contains(@aria-label,"review")]', + '//button[contains(@aria-label,"review") and contains(@class,"HHrUdb")]', + '//span[contains(@aria-label,"review") and contains(text(),"(")]', + ]) + if not count_raw: + # Fallback to aria-label of the parent + count_raw = safe_attr(page, [ + 'div.F7nice >> span[aria-label*="review" i]', + '//div[contains(@class,"F7nice")]//span[contains(@aria-label,"review")]', + ], 'aria-label') + if count_raw: + digits = re.sub(r'[^\d]', '', count_raw) + if digits: + try: + place.reviews_count = int(digits) + except Exception: + pass + + # Store options (shopping / pickup / delivery) — sweep visible info chips + try: + chips = page.locator('//div[contains(@class,"LTs0Rc")]').all() + for chip in chips: + try: + text = chip.inner_text(timeout=500).lower() + except Exception: + continue + if 'shop' in text: + place.store_shopping = "Yes" + if 'pickup' in text: + place.in_store_pickup = "Yes" + if 'delivery' in text: + place.store_delivery = "Yes" + except Exception: + pass + + # Opens at / hours + opens_raw = safe_text(page, [ + 'button[data-item-id="oh"] >> div.fontBodyMedium', + '//button[@data-item-id="oh"]//div[contains(@class,"fontBodyMedium")]', + '//div[contains(@class,"MkV9")]//span[contains(@class,"ZDu9vd")]', + ]) + if opens_raw: + opens = opens_raw.split('⋅') + place.opens_at = (opens[1] if len(opens) > 1 else opens_raw).replace(' ', '').strip() + + # URL & coords + url = page.url + place.google_maps_url = url + place.latitude, place.longitude = parse_coords_from_url(url) + + return place + + +def scroll_results(page: Page, total: int) -> int: + """Scroll the results feed until at least `total` listings are loaded or end is reached.""" + feed_selectors = [ + 'div[role="feed"]', + '//div[@role="feed"]', + ] + feed = None + for sel in feed_selectors: try: - temp = reviews_avg_raw.replace(' ','').replace(',','.') - place.reviews_average = float(temp) - except Exception as e: - logging.warning(f"Failed to parse reviews average: {e}") - # Store Info - for idx, info_xpath in enumerate([info1, info2, info3]): - info_raw = extract_text(page, info_xpath) - if info_raw: - temp = info_raw.split('·') - if len(temp) > 1: - check = temp[1].replace("\n", "").lower() - if 'shop' in check: - place.store_shopping = "Yes" - if 'pickup' in check: - place.in_store_pickup = "Yes" - if 'delivery' in check: - place.store_delivery = "Yes" - # Opens At - opens_at_raw = extract_text(page, opens_at_xpath) - if opens_at_raw: - opens = opens_at_raw.split('⋅') - if len(opens) > 1: - place.opens_at = opens[1].replace("\u202f","") + loc = page.locator(sel).first + if loc.count() > 0: + feed = loc + break + except Exception: + continue + + previously_counted = 0 + stale_rounds = 0 + while True: + if feed: + try: + feed.evaluate('(el) => el.scrollBy(0, 2000)') + except Exception: + page.mouse.wheel(0, 4000) else: - place.opens_at = opens_at_raw.replace("\u202f","") - else: - opens_at2_raw = extract_text(page, opens_at_xpath2) - if opens_at2_raw: - opens = opens_at2_raw.split('⋅') - if len(opens) > 1: - place.opens_at = opens[1].replace("\u202f","") - else: - place.opens_at = opens_at2_raw.replace("\u202f","") - return place + page.mouse.wheel(0, 4000) -def scrape_places(search_for: str, total: int) -> List[Place]: + page.wait_for_timeout(1200) + + found = page.locator('//a[contains(@href,"/maps/place/")]').count() + logging.info(f"Currently found: {found}") + + if found >= total: + return found + + # Detect end-of-list marker + end_markers = page.locator('//span[contains(text(),"You\'ve reached the end of the list")] | //p[contains(@class,"fontBodyMedium") and contains(.,"end of the list")]').count() + if end_markers > 0: + logging.info("Reached end of list (marker detected)") + return found + + if found == previously_counted: + stale_rounds += 1 + if stale_rounds >= 4: + logging.info("No new results after several scrolls; stopping.") + return found + else: + stale_rounds = 0 + previously_counted = found + + +def scrape_places(search_for: str, total: int, headless: bool = False) -> List[Place]: setup_logging() places: List[Place] = [] + seen_urls = set() + with sync_playwright() as p: - if platform.system() == "Windows": - browser_path = r"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe" - browser = p.chromium.launch(executable_path=browser_path, headless=False) - else: - browser = p.chromium.launch(headless=False) - page = browser.new_page() + browser = p.chromium.launch(headless=headless) + # Pre-set the CONSENT cookie to skip the GDPR splash entirely. + context = browser.new_context( + locale="en-US", + user_agent=( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ), + viewport={"width": 1366, "height": 900}, + ) try: - page.goto("https://www.google.com/maps/@33.6207562,72.7564306,10z?", timeout=60000) - page.wait_for_timeout(1000) - page.locator('//input[@id="searchboxinput"]').fill(search_for) - page.keyboard.press("Enter") - page.wait_for_selector('//a[contains(@href, "https://www.google.com/maps/place")]') - page.hover('//a[contains(@href, "https://www.google.com/maps/place")]') - previously_counted = 0 - while True: - page.mouse.wheel(0, 10000) - page.wait_for_selector('//a[contains(@href, "https://www.google.com/maps/place")]') - found = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count() - logging.info(f"Currently Found: {found}") - if found < total: - time.sleep(2) - page.mouse.wheel(0, 10000) - found = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count() - - if found >= total: - break - if found == previously_counted: - logging.info("Arrived at all available") + context.add_cookies([ + {"name": "CONSENT", "value": "YES+cb", "domain": ".google.com", "path": "/"}, + {"name": "SOCS", "value": "CAESHAgBEhJnd3NfMjAyNDA0MDktMF9SQzIaAmVuIAEaBgiAlPmpBg", + "domain": ".google.com", "path": "/"}, + ]) + except Exception as e: + logging.warning(f"Failed to add consent cookie: {e}") + + page = context.new_page() + try: + # Go straight to the search URL so we never touch the search input. + from urllib.parse import quote + search_url = f"https://www.google.com/maps/search/{quote(search_for)}/?hl=en" + page.goto(search_url, timeout=120000) + try: + page.wait_for_load_state("domcontentloaded", timeout=30000) + except PlaywrightTimeoutError: + pass + + # If consent still gets in the way, handle it. + if "consent." in page.url: + dismiss_consent(page) + page.goto(search_url, timeout=120000) + try: + page.wait_for_load_state("domcontentloaded", timeout=30000) + except PlaywrightTimeoutError: + pass + + page.wait_for_timeout(1200) + + # Wait for either a results feed OR a single-place panel. + try: + page.wait_for_selector( + '//a[contains(@href,"/maps/place/")] | //h1[contains(@class,"DUwDvf")]', + timeout=45000, + ) + except PlaywrightTimeoutError: + logging.error("No results returned for the search query.") + return places + + # Single-place direct hit: extract once and return. + if ( + page.locator('//a[contains(@href,"/maps/place/")]').count() == 0 + and page.locator('//h1[contains(@class,"DUwDvf")]').count() > 0 + ): + logging.info("Single result detected; extracting directly.") + place = extract_place(page) + if place.name: + places.append(place) + return places + + scroll_results(page, total) + + # Collect anchors → deduplicated, capped at total. + anchors = page.locator('//a[contains(@href,"/maps/place/")]').all() + unique_hrefs = [] + for a in anchors: + try: + href = a.get_attribute("href") + except Exception: + continue + if not href or href in seen_urls: + continue + seen_urls.add(href) + unique_hrefs.append(href) + if len(unique_hrefs) >= total: break - previously_counted = found - listings = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').all()[:total] - listings = [listing.locator("xpath=..") for listing in listings] - logging.info(f"Total Found: {len(listings)}") - for i, listing in enumerate(listings): + + logging.info(f"Processing {len(unique_hrefs)} unique listings (requested {total})") + + for i, href in enumerate(unique_hrefs, start=1): + logging.info(f"[{i}/{len(unique_hrefs)}] opening listing") + anchor = page.locator(f'//a[@href="{href}"]').first + try: + anchor.scroll_into_view_if_needed(timeout=5000) + anchor.click(timeout=10000) + except Exception as e: + logging.warning(f"Click failed, navigating directly: {e}") + try: + page.goto(href, timeout=30000) + except Exception as ee: + logging.warning(f"Navigate failed too: {ee}") + continue + + try: + page.wait_for_selector( + '//h1[contains(@class,"DUwDvf")]', + timeout=15000, + ) + except PlaywrightTimeoutError: + logging.warning("Detail panel didn't load in time; skipping.") + continue + + page.wait_for_timeout(1200) try: - listing.click() - page.wait_for_selector('//div[@class="TIHn2 "]//h1[@class="DUwDvf lfPIob"]', timeout=10000) - time.sleep(1.5) # Give time for details to load place = extract_place(page) - if place.name: - places.append(place) - else: - logging.warning(f"No name found for listing {i+1}, skipping.") except Exception as e: - logging.warning(f"Failed to extract listing {i+1}: {e}") + logging.warning(f"Extraction error: {e}") + continue + + if place.name: + places.append(place) + else: + logging.warning("No name extracted; skipping listing.") finally: + context.close() browser.close() return places + def save_places_to_csv(places: List[Place], output_path: str = "result.csv", append: bool = False): - df = pd.DataFrame([asdict(place) for place in places]) - if not df.empty: - for column in df.columns: - if df[column].nunique() == 1: - df.drop(column, axis=1, inplace=True) - file_exists = os.path.isfile(output_path) - mode = "a" if append else "w" - header = not (append and file_exists) - df.to_csv(output_path, index=False, mode=mode, header=header) - logging.info(f"Saved {len(df)} places to {output_path} (append={append})") - else: - logging.warning("No data to save. DataFrame is empty.") + if not places: + logging.warning("No data to save.") + return + df = pd.DataFrame([asdict(p) for p in places]) + # Drop exact duplicate rows but KEEP columns even if all values are identical. + df.drop_duplicates(inplace=True) + file_exists = os.path.isfile(output_path) + mode = "a" if append else "w" + header = not (append and file_exists) + df.to_csv(output_path, index=False, mode=mode, header=header, encoding="utf-8-sig") + logging.info(f"Saved {len(df)} places to {output_path} (append={append})") + def main(): - parser = argparse.ArgumentParser() - parser.add_argument("-s", "--search", type=str, help="Search query for Google Maps") - parser.add_argument("-t", "--total", type=int, help="Total number of results to scrape") + parser = argparse.ArgumentParser(description="Google Maps business scraper") + parser.add_argument("-s", "--search", type=str, required=True, help="Search query for Google Maps") + parser.add_argument("-t", "--total", type=int, default=20, help="Total number of results to scrape") parser.add_argument("-o", "--output", type=str, default="result.csv", help="Output CSV file path") parser.add_argument("--append", action="store_true", help="Append results to the output file instead of overwriting") + parser.add_argument("--headless", action="store_true", help="Run browser in headless mode") args = parser.parse_args() - search_for = args.search or "turkish stores in toronto Canada" - total = args.total or 1 - output_path = args.output - append = args.append - places = scrape_places(search_for, total) - save_places_to_csv(places, output_path, append=append) + + places = scrape_places(args.search, args.total, headless=args.headless) + save_places_to_csv(places, args.output, append=args.append) + if __name__ == "__main__": main()