-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
70 lines (57 loc) · 2.29 KB
/
Copy pathmain.py
File metadata and controls
70 lines (57 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import csv
import os
from urllib.parse import urlparse

import scrapy
from email_scraper import scrape_emails
from scrapy import signals
from scrapy.crawler import CrawlerProcess

from settings import *
# Startup banner; runs at module level, i.e. whenever this file is executed or imported.
print("Starting...")
class EmailSpider(scrapy.Spider):
    """Crawl pages starting at ``START_URL`` and record e-mail addresses.

    Each HTML response is passed to ``scrape_emails``; every address found is
    appended to ``export/emails.csv`` as a ``(page URL, email)`` row. Links on
    the page whose host contains ``DOMAIN`` are then queued for crawling.
    """

    name = "email_search"
    allowed_domains = [DOMAIN]
    start_urls = [START_URL]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and make sure the ``export`` directory exists."""
        super().__init__(*args, **kwargs)
        # exist_ok=True removes the race between a separate existence check
        # and the directory creation.
        os.makedirs("export", exist_ok=True)

    def parse(self, response):
        """Extract e-mails from an HTML response and follow same-site links.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            Requests (built with ``response.follow``) for every HTTP(S) link
            on the page whose host contains ``DOMAIN``.
        """
        # Only HTML pages are scanned. The header is decoded leniently and
        # lower-cased so odd bytes or casing ("Text/HTML") do not break the check.
        raw_type = response.headers.get("Content-Type", b"") or b""
        content_type = raw_type.decode("utf-8", errors="replace").lower()
        if "text/html" not in content_type:
            return

        emails = scrape_emails(response.text)
        if emails:
            # Explicit encoding keeps the output independent of the platform
            # default; append mode accumulates rows across responses.
            with open(
                "export/emails.csv", "a", newline="", encoding="utf-8"
            ) as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows([response.url, email] for email in emails)

        site = DOMAIN.lower()
        for link in response.css("a::attr(href)").extract():
            # Resolve against the page URL first so relative links such as
            # "page.html" or "../x" are treated like "/x" instead of dropped.
            try:
                absolute = response.urljoin(link.strip())
                target = urlparse(absolute)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); nothing to follow.
                continue
            # Skips mailto:, javascript:, tel: and any other non-web scheme.
            if target.scheme not in ("http", "https"):
                continue
            # Match DOMAIN against the host only, not the path or query string.
            if site in target.netloc.lower():
                yield response.follow(absolute, callback=self.parse)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and connect ``spider_closed`` to its signal."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Signal handler: announce on stdout that the crawl has ended."""
        print("Crawling finished!")
if __name__ == "__main__":
    # Script entry point: collect the crawler settings, then run EmailSpider.
    # The upper-case names are not defined in this file; presumably they are
    # supplied by the star import from the settings module.
    process_settings = dict(
        LOG_LEVEL=LOG_LEVEL,
        AUTOTHROTTLE_ENABLED=AUTOTHROTTLE_ENABLED,
        AUTOTHROTTLE_START_DELAY=AUTOTHROTTLE_START_DELAY,
        AUTOTHROTTLE_MAX_DELAY=AUTOTHROTTLE_MAX_DELAY,
        AUTOTHROTTLE_TARGET_CONCURRENCY=AUTOTHROTTLE_TARGET_CONCURRENCY,
        DOMAIN=DOMAIN,
        START_URL=START_URL,
    )

    crawler_process = CrawlerProcess(process_settings)
    crawler_process.crawl(EmailSpider)
    crawler_process.start()