-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfindtext.py
More file actions
91 lines (76 loc) · 3.22 KB
/
Copy pathfindtext.py
File metadata and controls
91 lines (76 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import signals
import os
from bs4 import BeautifulSoup
import fitz # PyMuPDF for PDF
# List of text strings to search for
search_texts = ["for888565.com", "googie-anaiytics.com"]
# Log file to store URLs with matches
log_file = "export/matches.log"
print("Starting...")
class EmailSpider(scrapy.Spider):
name = "text_search"
allowed_domains = ["astm.org"]
start_urls = ["https://www.astm.org/"]
def __init__(self, *args, **kwargs):
super(EmailSpider, self).__init__(*args, **kwargs)
# Ensure the 'export' directory exists
if not os.path.exists('export'):
os.makedirs('export')
def parse(self, response):
# Check for 'text/html' in the 'Content-Type' header to ensure it's an HTML response
content_type = response.headers.get("Content-Type", b"").decode("utf-8")
if "text/html" in content_type:
page_content = response.text
soup = BeautifulSoup(page_content, 'html.parser')
text_found = False
# Search for the specified texts in the page content
for text in search_texts:
if text in page_content:
with open(log_file, 'a') as log:
log.write(f"Found '{text}' on {response.url}\n")
text_found = True
# Print the URL of each page crawled
print(f"Crawled: {response.url}")
# Extract and follow links within the page
for link in response.css("a::attr(href)").extract():
# Filter out mailto links
if "mailto:" in link:
continue
if link.startswith("/") or "astm.org" in link:
yield response.follow(link, self.parse)
elif "application/pdf" in content_type:
self.parse_pdf(response)
def parse_pdf(self, response):
try:
doc = fitz.open(stream=response.body, filetype='pdf')
for page_num in range(len(doc)):
page = doc.load_page(page_num)
text = page.get_text()
for search_text in search_texts:
if search_text in text:
with open(log_file, 'a') as log:
log.write(f"Found '{search_text}' in PDF on {response.url}\n")
print(f"Found '{search_text}' in PDF on {response.url}")
break
except Exception as e:
print(f"Failed to parse PDF {response.url}: {e}")
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(EmailSpider, cls).from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_closed(self, spider):
print("Crawling finished!")
if __name__ == "__main__":
process_settings = {
'LOG_LEVEL': 'INFO',
'AUTOTHROTTLE_ENABLED': True,
'AUTOTHROTTLE_START_DELAY': 2,
'AUTOTHROTTLE_MAX_DELAY': 5,
'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,
}
process = CrawlerProcess(process_settings)
process.crawl(EmailSpider)
process.start()