# Source: emailscrape/main.py at master · CodeJawn/emailscrape · GitHub

import scrapy
from scrapy.crawler import CrawlerProcess
from email_scraper import scrape_emails
from scrapy import signals
import csv
import os
from settings import *

# Startup banner: emitted whenever this module is loaded, before any crawling.
print("Starting...")
class EmailSpider(scrapy.Spider):
    """Crawl one site and record the e-mail addresses found on its pages.

    The crawl starts at ``START_URL`` and is limited to ``DOMAIN`` through
    ``allowed_domains``. Every address found is appended to
    ``export/emails.csv`` as a ``(page URL, e-mail)`` row.
    """

    name = "email_search"
    allowed_domains = [DOMAIN]
    start_urls = [START_URL]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and make sure the output directory exists."""
        super().__init__(*args, **kwargs)

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs('export', exist_ok=True)

    def parse(self, response):
        """Extract e-mails from an HTML response and follow its links.

        Non-HTML responses are ignored. Addresses returned by
        ``scrape_emails`` are appended to ``export/emails.csv`` and every
        http(s) link on the page is scheduled for crawling with this same
        callback.

        Yields:
            Requests for the links found on the page.
        """
        # Header values are bytes; decode leniently and compare
        # case-insensitively so an unusual header cannot abort the callback.
        content_type = (
            response.headers.get("Content-Type", b"")
            .decode("utf-8", errors="replace")
            .lower()
        )
        if "text/html" not in content_type:
            return

        emails = scrape_emails(response.text)

        # If emails found, append them to the CSV. The explicit encoding keeps
        # non-ASCII URLs/addresses from failing on platforms whose default
        # encoding is not UTF-8.
        if emails:
            with open(
                "export/emails.csv", "a", newline="", encoding="utf-8"
            ) as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows([response.url, email] for email in emails)

        # Extract and follow links within the page.
        for link in response.css("a::attr(href)").extract():
            # Resolve against the page URL first so relative links such as
            # "contact.html" or "../about" are followed as well.
            try:
                target = response.urljoin(link.strip())
            except ValueError:
                # Malformed href; nothing sensible to request.
                continue

            # Only web pages can be fetched: this drops mailto:, tel:,
            # javascript: and similar schemes.
            if not target.lower().startswith(("http://", "https://")):
                continue

            # Off-site targets are expected to be discarded by Scrapy's
            # offsite filtering, which is driven by allowed_domains.
            yield response.follow(target, self.parse)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook ``spider_closed`` to the crawler signal."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Signal handler: report that the crawl has ended."""
        print("Crawling finished!")
if __name__ == "__main__":
    # Gather the crawler configuration from the values star-imported from
    # the settings module, then run the spider until the crawl completes.
    crawl_settings = dict(
        LOG_LEVEL=LOG_LEVEL,
        AUTOTHROTTLE_ENABLED=AUTOTHROTTLE_ENABLED,
        AUTOTHROTTLE_START_DELAY=AUTOTHROTTLE_START_DELAY,
        AUTOTHROTTLE_MAX_DELAY=AUTOTHROTTLE_MAX_DELAY,
        AUTOTHROTTLE_TARGET_CONCURRENCY=AUTOTHROTTLE_TARGET_CONCURRENCY,
        DOMAIN=DOMAIN,
        START_URL=START_URL,
    )
    runner = CrawlerProcess(crawl_settings)
    runner.crawl(EmailSpider)
    # start() blocks until the crawl has finished.
    runner.start()