Skip to content

Commit f358a95

Browse files
Initial Commit
Added Complete files of GMapsScraper.
1 parent 209014d commit f358a95

14 files changed

Lines changed: 2048 additions & 194 deletions

README.md

Lines changed: 194 additions & 194 deletions
Large diffs are not rendered by default.
11.8 MB
Binary file not shown.

commandline.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
python maps.py -q ./queries.txt -w 2 -l -1 -u "Not Available" -bw 15 -se contacts -se about -o ./CSV_FILES
3.16 MB
Binary file not shown.

maps.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from webdriver_manager.chrome import ChromeDriverManager
2+
from utils.threading_controller import FastSearchAlgo
3+
from argparse import ArgumentParser
4+
from os.path import isfile
5+
import sys
6+
7+
8+
class GMapsScraper:
    """Command-line front end for the Google Maps scraper.

    Typical use is ``arg_parser()`` followed by ``scrape_maps_data()``:
    the first stores the parsed options on the instance, the second
    validates them and hands the work to ``FastSearchAlgo``.
    """

    def __init__(self):
        # Parsed argparse namespace; populated by arg_parser().
        self._args = None

    def arg_parser(self, argv=None):
        """Parse command-line options and store them on ``self._args``.

        Args:
            argv: Optional list of argument strings. When ``None`` (the
                default) argparse reads ``sys.argv[1:]``, exactly as before.
        """
        parser = ArgumentParser(description='Command Line Google Map Scraper by Abdul Moez')

        # Input options
        parser.add_argument('-q', '--query-file', help='Path to query file (default: ./queries.txt)', type=str,
                            default="./queries.txt")
        parser.add_argument('-w', '--threads', help='Number of threads to use (default: 1)', type=int, default=1)
        parser.add_argument('-l', '--limit', help='Number of results to scrape (-1 for all results, default: -1)',
                            type=int, default=-1)
        parser.add_argument('-u', '--unavailable-text',
                            help='Replacement text for unavailable information (default: "Not Available")', type=str,
                            default="Not Available")
        parser.add_argument('-bw', '--browser-wait', help='Browser waiting time in seconds (default: 15)', type=int,
                            default=15)
        parser.add_argument('-se', '--suggested-ext',
                            help='Suggested URL extensions to try (can be specified multiple times)', action='append',
                            default=[])
        # NOTE: the stored value is the *headless* flag, not a "windowed" flag:
        # it defaults to True and passing -wb flips it to False. It is forwarded
        # as ``headless=`` in scrape_maps_data().
        parser.add_argument('-wb', '--windowed-browser', help='Disable headless mode', action='store_false',
                            default=True)
        parser.add_argument('-v', '--verbose', help='Enable verbose mode', action='store_true')
        parser.add_argument('-o', '--output-folder', help='Output folder to store CSV details (default: ./CSV_FILES)',
                            type=str, default='./CSV_FILES')
        parser.add_argument('-d', '--driver-path',
                            help='Path to Chrome driver (if not provided, it will be downloaded)', type=str,
                            default='')

        # Custom commands for additional help
        parser.add_argument('--help-query-file', action='store_true', help='Get help for specifying the query file')
        parser.add_argument('--help-limit', action='store_true', help='Get help for specifying the result limit')
        parser.add_argument('--help-driver-path', action='store_true', help='Get help for specifying the driver path')

        self._args = parser.parse_args(argv)

    @staticmethod
    def print_query_file_help():
        """Print the query-file format help and exit with status 0."""
        print("The query file should contain a list of search queries, each query on a separate line.")
        print("For example:")
        print("Pizza restaurants")
        print("Coffee shops")
        print("...")
        sys.exit(0)

    @staticmethod
    def print_limit_help():
        """Print the result-limit help and exit with status 0."""
        print("Use this option to specify the maximum number of results to scrape.")
        print("Use '-1' to scrape all results.")
        sys.exit(0)

    @staticmethod
    def print_driver_path_help():
        """Print the driver-path help and exit with status 0."""
        print("If you have a specific Chrome driver path, you can provide it using this option.")
        print("If not provided, the script will attempt to download the driver automatically.")
        print("You can download a compatible driver from https://chromedriver.chromium.org/downloads.")
        sys.exit(0)

    def check_args(self):
        """Exit with status 1 if the configured query file does not exist."""
        q = self._args.query_file
        if not isfile(q):
            print(f"[-] File not found at path: {q}")
            sys.exit(1)

    def _handle_help_flags(self):
        """Answer any ``--help-*`` request; each printer exits the process."""
        if self._args.help_query_file:
            self.print_query_file_help()

        if self._args.help_limit:
            self.print_limit_help()

        if self._args.help_driver_path:
            self.print_driver_path_help()

    def scrape_maps_data(self):
        """Validate the parsed options and run the scrape.

        Help requests are answered first, then the query file is checked,
        the Chrome driver is resolved (downloaded when no path was given)
        and the queries are handed to ``FastSearchAlgo``.

        Raises:
            SystemExit: after printing extended help (status 0), when the
                query file is missing (status 1), or when the driver
                cannot be downloaded (status 1).
        """
        if self._args is None:
            # Allow calling this method directly without a prior arg_parser().
            self.arg_parser()

        # Help must be served before validation: asking for help should not
        # fail just because the query file does not exist yet.
        self._handle_help_flags()
        self.check_args()

        queries_list = FastSearchAlgo.load_query_file(file_name=self._args.query_file)
        # Never start more workers than there are queries.
        threads_limit = min(self._args.threads, len(queries_list))
        # -1 on the command line means "no limit", passed on as None.
        limit_results = None if self._args.limit == -1 else self._args.limit

        driver_path = self._args.driver_path
        if not driver_path:
            try:
                driver_path = ChromeDriverManager().install()
            except ValueError:
                print("[-] Not able to download the driver which is capable with your browser.")
                print("[INFO] Head to this site (https://chromedriver.chromium.org/downloads)"
                      " and find your version driver and pass it with argument -d.")
                # sys.exit (not the site-provided exit()) with a failure status.
                sys.exit(1)

        algo_obj = FastSearchAlgo(
            unavailable_text=self._args.unavailable_text,
            headless=self._args.windowed_browser,
            wait_time=self._args.browser_wait,
            suggested_ext=self._args.suggested_ext,
            output_path=self._args.output_folder,
            workers=threads_limit,
            result_range=limit_results,
            verbose=self._args.verbose,
            driver_path=driver_path
        )

        algo_obj.fast_search_algorithm(queries_list)
112+
113+
114+
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the scrape.
    scraper = GMapsScraper()
    scraper.arg_parser()
    scraper.scrape_maps_data()

queries.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
best coffee shops for working in berlin de
2+
best restaurants in berlin

requirements.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
async-generator==1.10
2+
attrs==22.2.0
3+
beautifulsoup4==4.12.0
4+
certifi==2022.12.7
5+
cffi==1.15.1
6+
charset-normalizer==3.1.0
7+
colorama==0.4.6
8+
exceptiongroup==1.1.1
9+
h11==0.14.0
10+
idna==3.4
11+
lxml==4.9.2
12+
numpy==1.24.2
13+
outcome==1.2.0
14+
packaging==23.0
15+
pandas==1.5.3
16+
psutil==5.9.4
17+
pycparser==2.21
18+
PySocks==1.7.1
19+
python-dateutil==2.8.2
20+
python-dotenv==1.0.0
21+
pytz==2023.3
22+
requests==2.28.2
23+
selenium==4.8.3
24+
selenium-stealth==1.0.6
25+
six==1.16.0
26+
sniffio==1.3.0
27+
sortedcontainers==2.4.0
28+
soupsieve==2.4
29+
tqdm==4.65.0
30+
trio==0.22.0
31+
trio-websocket==0.10.2
32+
urllib3==1.26.15
33+
webdriver-manager==3.8.5
34+
wsproto==1.2.0

utils/dict_cleaner_and_writer.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from collections import OrderedDict
2+
3+
4+
class DictCleaner:
    """Normalize a list of record dicts so every record carries the same keys.

    Keys missing from a record are filled with a placeholder text, and the
    keys of every output record appear in the order they were first seen
    in the input.
    """

    def __init__(self, unavailable_data: str = "Not Available"):
        # Placeholder written for keys that a record does not contain.
        self._unavailable_data = unavailable_data

    @staticmethod
    def _unique_repeating_sets(output_data_dict_list: list[dict]) -> tuple[set, set]:
        """Collect the keys used across all records.

        Returns:
            A ``(unique_keys, repeating_keys)`` tuple: every key seen in any
            record, and the subset of keys present in more than one record.
        """
        # Single pass: count in how many records each key occurs.
        occurrences: dict = {}
        for data_dict in output_data_dict_list:
            for key in data_dict:
                occurrences[key] = occurrences.get(key, 0) + 1

        unique_keys = set(occurrences)
        repeating_keys = {key for key, count in occurrences.items() if count > 1}
        return unique_keys, repeating_keys

    def _dict_cleaner(self, output_data_dict_list: list[dict], unique_keys: set, repeating_keys: set) -> list[dict]:
        """Build one output dict per record, each holding every key in ``unique_keys``.

        Missing keys get the placeholder text; keys listed in
        ``repeating_keys`` get their value rendered as ``"<key>_<value>"``.
        """
        # Iterating the set directly would give an arbitrary key order that
        # can change between interpreter runs; use first-seen order instead.
        ordered_keys = list(dict.fromkeys(
            key
            for data_dict in output_data_dict_list
            for key in data_dict
            if key in unique_keys
        ))
        # Keep any requested key that no record contains (filled with the
        # placeholder below), after the keys that do occur.
        already_ordered = set(ordered_keys)
        ordered_keys.extend(key for key in unique_keys if key not in already_ordered)

        final_data = []
        for data_dict in output_data_dict_list:
            cleaned = {}
            for key in ordered_keys:
                if key not in data_dict:
                    cleaned[key] = self._unavailable_data
                elif key in repeating_keys:
                    # NOTE(review): values of keys shared by several records are
                    # prefixed with the key name; kept as in the original --
                    # confirm this is the intended output format.
                    cleaned[key] = f"{key}_{data_dict[key]}"
                else:
                    cleaned[key] = data_dict[key]
            final_data.append(cleaned)
        return final_data

    def start_cleaning_dict_data(self, dict_list: list[dict]) -> list[dict]:
        """Return cleaned copies of ``dict_list`` that all share the same keys."""
        unique_keys, repeating_keys = self._unique_repeating_sets(dict_list)
        cleaned_data_list = self._dict_cleaner(dict_list, unique_keys, repeating_keys)
        return cleaned_data_list
40+

0 commit comments

Comments
 (0)