|
1 | 1 | import json |
| 2 | +import socket |
2 | 3 |
|
3 | 4 | import requests |
| 5 | +import ipaddress |
| 6 | +from urllib.parse import urlparse |
4 | 7 | from bs4 import BeautifulSoup |
5 | 8 | from markdownify import markdownify |
6 | 9 |
|
|
24 | 27 | } |
25 | 28 |
|
26 | 29 |
|
| 30 | + |
# Networks that must never be fetched on behalf of a caller (SSRF guard).
# Membership tests are IP-version strict, so IPv4 and IPv6 equivalents are
# both listed explicitly.
BLOCKED_RANGES = [
    # IPv4
    ipaddress.ip_network("0.0.0.0/8"),  # "this network"; 0.0.0.0 often reaches localhost
    ipaddress.ip_network("127.0.0.0/8"),  # loopback
    ipaddress.ip_network("10.0.0.0/8"),  # RFC 1918 private
    ipaddress.ip_network("172.16.0.0/12"),  # RFC 1918 private
    ipaddress.ip_network("192.168.0.0/16"),  # RFC 1918 private
    ipaddress.ip_network("169.254.0.0/16"),  # link-local (cloud metadata endpoints)
    ipaddress.ip_network("100.64.0.0/10"),  # carrier-grade NAT shared space
    # IPv6
    ipaddress.ip_network("::/128"),  # unspecified
    ipaddress.ip_network("::1/128"),  # loopback
    ipaddress.ip_network("fc00::/7"),  # unique local addresses
    ipaddress.ip_network("fe80::/10"),  # link-local
    # IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would otherwise slip past the
    # IPv4 entries above; blocked wholesale because such literals are not
    # expected in legitimate URLs.
    ipaddress.ip_network("::ffff:0:0/96"),
]
| 39 | + |
def _is_ip_blocked(ip_str: str) -> bool:
    """Return True if *ip_str* is an address that must not be fetched.

    An address is blocked when it falls inside any network listed in
    ``BLOCKED_RANGES`` or when the ``ipaddress`` module classifies it as
    private, loopback, link-local, multicast, reserved or unspecified.

    Args:
        ip_str: Textual IPv4 or IPv6 address.

    Returns:
        True when the address is blocked, False otherwise.

    Raises:
        ValueError: If *ip_str* is not a valid IP address literal. Callers
            rely on this to tell IP literals apart from hostnames.
    """
    ip = ipaddress.ip_address(ip_str)

    # An IPv4-mapped IPv6 address (::ffff:a.b.c.d) reaches the embedded IPv4
    # host, but network membership is version-strict, so judge the embedded
    # address instead. IPv4Address has no such attribute, hence getattr.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped

    if any(ip in net for net in BLOCKED_RANGES):
        return True

    # Defence in depth: catch internal ranges missing from the explicit list.
    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_reserved
        or ip.is_unspecified
    )
| 43 | + |
| 44 | + |
def _hostname_resolves_to_blocked_ip(hostname: str) -> bool:
    """Return True if *hostname* resolves to any blocked address.

    Every address returned by ``socket.getaddrinfo`` is checked; a single
    blocked address is enough to reject the hostname. Any failure to resolve
    or to interpret the result is treated as blocked (fail closed).

    Args:
        hostname: Host component of a URL (not an IP literal).

    Returns:
        True when the hostname is blocked or cannot be verified, False when
        every resolved address is allowed.
    """
    try:
        resolved = socket.getaddrinfo(hostname, None)
        # info[4] is the sockaddr tuple; its first element is the address.
        return any(_is_ip_blocked(info[4][0]) for info in resolved)
    except (OSError, ValueError):
        # OSError covers socket.gaierror (resolution failure). ValueError
        # covers UnicodeError raised for hostnames that cannot be
        # IDNA-encoded (e.g. an over-long label) and addresses that
        # ipaddress cannot parse. Unverifiable means unsafe.
        return True
| 51 | + |
| 52 | + |
def is_safe_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL that points at an allowed host.

    The URL is rejected when it is malformed, uses a scheme other than
    ``http``/``https``, has no hostname, or its host is (or resolves to) a
    blocked address.

    NOTE(review): the hostname is resolved here and presumably resolved again
    by the HTTP client when the request is made, so a DNS answer that changes
    between the two lookups is not covered by this check.

    Args:
        url: The URL to validate.

    Returns:
        True when the URL may be fetched, False otherwise.
    """
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError:
        # urlparse raises on malformed input such as an unclosed IPv6 bracket.
        return False

    if parsed.scheme not in ("http", "https"):
        return False
    if not hostname:
        # e.g. "http:///path": there is nothing to validate, so refuse.
        return False

    try:
        # IP literal: check it directly.
        return not _is_ip_blocked(hostname)
    except ValueError:
        # Not an IP literal: check what the name resolves to.
        return not _hostname_resolves_to_blocked_ip(hostname)
| 61 | + |
def fetchHTML(url):
    """Fetch *url* and return the response body as text.

    The URL, and every redirect target reached along the way, is validated
    with ``is_safe_url`` before it is requested. Redirects are followed
    manually because letting ``requests`` follow them automatically would
    allow an allowed URL to redirect to a blocked (internal) address without
    being checked.

    Args:
        url: The http(s) URL to fetch.

    Returns:
        The decoded body of the final (non-redirect) response.

    Raises:
        ValueError: If the URL or any redirect target is blocked.
        requests.exceptions.TooManyRedirects: If the redirect chain is too long.
        requests.exceptions.RequestException: On network errors or timeout.
    """
    # Local import keeps this block self-contained; only used for redirects.
    from urllib.parse import urljoin

    max_redirects = 30  # same limit requests applies by default

    for _ in range(max_redirects + 1):
        if not is_safe_url(url):
            raise ValueError(f"Blocked URL: {url}")
        r = requests.get(url, timeout=10, allow_redirects=False)
        if not r.is_redirect:
            return r.text
        # Location may be relative; resolve it against the current URL and
        # loop so the target is validated before being requested.
        next_url = urljoin(url, r.headers["location"])
        r.close()
        url = next_url

    raise requests.exceptions.TooManyRedirects(
        f"Exceeded {max_redirects} redirects"
    )
30 | 67 |
|
31 | | - |
32 | 68 | def stripUselessTags(html): |
33 | 69 | soup = BeautifulSoup(html, "html.parser") |
34 | 70 | toRemove = ["script", "head", "header", "footer", "meta", "link"] |
|
0 commit comments