Anonym0usWork1221
diff --git a/README.md
Lines changed: 194 additions & 194 deletions b/README.md
Lines changed: 194 additions & 194 deletions
diff --git a/chrome_driver_backup/chromedriver.exe
11.8 MB b/chrome_driver_backup/chromedriver.exe
11.8 MB
diff --git a/commandline.txt
Lines changed: 1 addition & 0 deletions b/commandline.txt
Lines changed: 1 addition & 0 deletions
diff --git a/extensions/finger_print_defender.crx
3.16 MB b/extensions/finger_print_defender.crx
3.16 MB
diff --git a/maps.py
Lines changed: 117 additions & 0 deletions b/maps.py
Lines changed: 117 additions & 0 deletions
diff --git a/queries.txt
Lines changed: 2 additions & 0 deletions b/queries.txt
Lines changed: 2 additions & 0 deletions
diff --git a/requirements.txt
Lines changed: 34 additions & 0 deletions b/requirements.txt
Lines changed: 34 additions & 0 deletions
diff --git a/utils/dict_cleaner_and_writer.py
Lines changed: 40 additions & 0 deletions b/utils/dict_cleaner_and_writer.py
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1 @@
+python maps.py -q ./queries.txt -w 2 -l -1 -u "Not Available" -bw 15 -se contacts -se about -o ./CSV_FILES
@@ -0,0 +1,117 @@
+from webdriver_manager.chrome import ChromeDriverManager
+from utils.threading_controller import FastSearchAlgo
+from argparse import ArgumentParser
+from os.path import isfile
+import sys
+
+
+class GMapsScraper:
+    """Command-line front end that parses options and runs ``FastSearchAlgo``.
+
+    Call ``arg_parser()`` first to populate the options, then
+    ``scrape_maps_data()`` to act on them.
+    """
+
+    def __init__(self):
+        # argparse namespace; stays None until arg_parser() has run.
+        self._args = None
+
+    def arg_parser(self):
+        """Define the command-line interface and parse the process arguments.
+
+        The parsed namespace is stored on ``self._args``; nothing is returned.
+        """
+        parser = ArgumentParser(description='Command Line Google Map Scraper by Abdul Moez')
+
+        # Input options
+        parser.add_argument('-q', '--query-file', help='Path to query file (default: ./queries.txt)', type=str,
+                            default="./queries.txt")
+        parser.add_argument('-w', '--threads', help='Number of threads to use (default: 1)', type=int, default=1)
+        parser.add_argument('-l', '--limit', help='Number of results to scrape (-1 for all results, default: -1)',
+                            type=int, default=-1)
+        parser.add_argument('-u', '--unavailable-text',
+                            help='Replacement text for unavailable information (default: "Not Available")', type=str,
+                            default="Not Available")
+        parser.add_argument('-bw', '--browser-wait', help='Browser waiting time in seconds (default: 15)', type=int,
+                            default=15)
+        parser.add_argument('-se', '--suggested-ext',
+                            help='Suggested URL extensions to try (can be specified multiple times)', action='append',
+                            default=[])
+        # NOTE: store_false means that passing -wb sets ``windowed_browser`` to
+        # False. The value is forwarded as the ``headless`` argument in
+        # scrape_maps_data(), so the default (True) keeps the browser headless.
+        parser.add_argument('-wb', '--windowed-browser', help='Disable headless mode', action='store_false',
+                            default=True)
+        parser.add_argument('-v', '--verbose', help='Enable verbose mode', action='store_true')
+        parser.add_argument('-o', '--output-folder', help='Output folder to store CSV details (default: ./CSV_FILES)',
+                            type=str, default='./CSV_FILES')
+        parser.add_argument('-d', '--driver-path',
+                            help='Path to Chrome driver (if not provided, it will be downloaded)', type=str,
+                            default='')
+
+        # Custom commands for additional help
+        parser.add_argument('--help-query-file', action='store_true', help='Get help for specifying the query file')
+        parser.add_argument('--help-limit', action='store_true', help='Get help for specifying the result limit')
+        parser.add_argument('--help-driver-path', action='store_true', help='Get help for specifying the driver path')
+
+        self._args = parser.parse_args()
+
+    @staticmethod
+    def print_query_file_help():
+        """Print the query-file format description and exit with status 0."""
+        print("The query file should contain a list of search queries, each query on a separate line.")
+        print("For example:")
+        print("Pizza restaurants")
+        print("Coffee shops")
+        print("...")
+        sys.exit(0)
+
+    @staticmethod
+    def print_limit_help():
+        """Print the explanation of the result limit and exit with status 0."""
+        print("Use this option to specify the maximum number of results to scrape.")
+        print("Use '-1' to scrape all results.")
+        sys.exit(0)
+
+    @staticmethod
+    def print_driver_path_help():
+        """Print the explanation of the driver-path option and exit with status 0."""
+        print("If you have a specific Chrome driver path, you can provide it using this option.")
+        print("If not provided, the script will attempt to download the driver automatically.")
+        print("You can download a compatible driver from https://chromedriver.chromium.org/downloads.")
+        sys.exit(0)
+
+    def check_args(self):
+        """Exit with status 1 when the configured query file does not exist."""
+        q = self._args.query_file
+        if not isfile(q):
+            print(f"[-] File not found at path: {q}")
+            sys.exit(1)
+
+    def scrape_maps_data(self):
+        """Run the scraper with the options collected by ``arg_parser()``.
+
+        The ``--help-*`` flags are served first; each prints its text and exits
+        with status 0. Otherwise the query file is validated, a Chrome driver
+        path is resolved (downloaded when ``-d`` was not given) and the loaded
+        queries are handed to ``FastSearchAlgo.fast_search_algorithm``.
+
+        Exits with status 1 when the query file is missing or when the driver
+        download raises ``ValueError``.
+        """
+        # Help requests must work even when no query file exists, so they are
+        # handled before the query-file check.
+        if self._args.help_query_file:
+            self.print_query_file_help()
+
+        if self._args.help_limit:
+            self.print_limit_help()
+
+        if self._args.help_driver_path:
+            self.print_driver_path_help()
+
+        self.check_args()
+
+        queries_list = FastSearchAlgo.load_query_file(file_name=self._args.query_file)
+        # Never use more workers than queries, and never fewer than one: an
+        # empty query file or a non-positive -w would otherwise produce a
+        # worker count <= 0.
+        threads_limit = max(1, min(self._args.threads, len(queries_list)))
+        # -1 on the command line means "no limit", passed on as None.
+        limit_results = None if self._args.limit == -1 else self._args.limit
+
+        driver_path = self._args.driver_path
+        if not driver_path:
+            try:
+                driver_path = ChromeDriverManager().install()
+            except ValueError:
+                print("[-] Not able to download the driver which is capable with your browser.")
+                print("[INFO] Head to this site (https://chromedriver.chromium.org/downloads)"
+                      " and find your version driver and pass it with argument -d.")
+                # sys.exit (not the interactive-only ``exit`` helper) with a
+                # non-zero status so callers can detect the failure.
+                sys.exit(1)
+
+        algo_obj = FastSearchAlgo(
+            unavailable_text=self._args.unavailable_text,
+            headless=self._args.windowed_browser,
+            wait_time=self._args.browser_wait,
+            suggested_ext=self._args.suggested_ext,
+            output_path=self._args.output_folder,
+            workers=threads_limit,
+            result_range=limit_results,
+            verbose=self._args.verbose,
+            driver_path=driver_path
+        )
+
+        algo_obj.fast_search_algorithm(queries_list)
+
+
+if __name__ == '__main__':
+    # Script entry point: parse the command line, then run the scraper.
+    scraper = GMapsScraper()
+    scraper.arg_parser()
+    scraper.scrape_maps_data()
@@ -0,0 +1,2 @@
+best coffee shops for working in berlin de
+best restaurants in berlin
@@ -0,0 +1,34 @@
+async-generator==1.10
+attrs==22.2.0
+beautifulsoup4==4.12.0
+certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==3.1.0
+colorama==0.4.6
+exceptiongroup==1.1.1
+h11==0.14.0
+idna==3.4
+lxml==4.9.2
+numpy==1.24.2
+outcome==1.2.0
+packaging==23.0
+pandas==1.5.3
+psutil==5.9.4
+pycparser==2.21
+PySocks==1.7.1
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+pytz==2023.3
+requests==2.28.2
+selenium==4.8.3
+selenium-stealth==1.0.6
+six==1.16.0
+sniffio==1.3.0
+sortedcontainers==2.4.0
+soupsieve==2.4
+tqdm==4.65.0
+trio==0.22.0
+trio-websocket==0.10.2
+urllib3==1.26.15
+webdriver-manager==3.8.5
+wsproto==1.2.0
@@ -0,0 +1,40 @@
+from collections import OrderedDict
+
+
+class DictCleaner:
+    """Normalize a list of dicts so that every dict carries the same keys.
+
+    Keys missing from a dict are filled with the configured placeholder text.
+    """
+
+    def __init__(self, unavailable_data: str = "Not Available"):
+        # Placeholder stored for keys a given dict does not have.
+        self._unavailable_data = unavailable_data
+
+    @staticmethod
+    def _unique_repeating_sets(output_data_dict_list: list[dict]) -> tuple[set, set]:
+        """Collect all keys and the keys that occur in more than one dict.
+
+        Args:
+            output_data_dict_list: The dicts to inspect.
+
+        Returns:
+            ``(unique_keys, repeating_keys)``: every key seen in any dict, and
+            the subset of keys found in at least two entries of the list.
+        """
+        unique_keys = set()
+        repeating_keys = set()
+
+        # Single pass: a key is unique within one dict, so meeting it again
+        # means it belongs to a second entry of the list.
+        for data_dict in output_data_dict_list:
+            for key in data_dict:
+                if key in unique_keys:
+                    repeating_keys.add(key)
+                else:
+                    unique_keys.add(key)
+
+        return unique_keys, repeating_keys
+
+    def _dict_cleaner(self, output_data_dict_list: list[dict], unique_keys: set, repeating_keys: set) -> list[dict]:
+        """Build one normalized dict per input dict.
+
+        For every key in ``unique_keys``:
+
+        * missing from the input dict -> the placeholder text;
+        * present and listed in ``repeating_keys`` -> ``"<key>_<value>"``;
+        * otherwise -> the original value.
+
+        NOTE(review): prefixing the values of repeated keys with the key name
+        is kept exactly as found; confirm this is the intended output format.
+
+        Returns:
+            A new list of plain dicts; the input dicts are not modified.
+        """
+        final_data = []
+        for data_dict in output_data_dict_list:
+            cleaned = {}
+            for key in unique_keys:
+                if key not in data_dict:
+                    cleaned[key] = self._unavailable_data
+                elif key in repeating_keys:
+                    cleaned[key] = f"{key}_{data_dict[key]}"
+                else:
+                    cleaned[key] = data_dict[key]
+            final_data.append(cleaned)
+        return final_data
+
+    def start_cleaning_dict_data(self, dict_list: list[dict]) -> list[dict]:
+        """Return ``dict_list`` normalized to a common key set.
+
+        Combines ``_unique_repeating_sets`` and ``_dict_cleaner``; an empty
+        input list yields an empty list.
+        """
+        unique_keys, repeating_keys = self._unique_repeating_sets(dict_list)
+        cleaned_data_list = self._dict_cleaner(dict_list, unique_keys, repeating_keys)
+        return cleaned_data_list
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+python maps.py -q ./queries.txt -w 2 -l -1 -u "Not Available" -bw 15 -se contacts -se about -o ./CSV_FILES`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+best coffee shops for working in berlin de`
	`2`	`+best restaurants in berlin`