Skip to content

Commit efd7e93

Browse files
committed
Initial release
1 parent afe0c04 commit efd7e93

File tree

1 file changed

+333
-0
lines changed

1 file changed

+333
-0
lines changed

AFHscraper.py

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import re
4+
import os
5+
import time
6+
from pathlib import Path
7+
from urllib.parse import quote_plus
8+
9+
class AFHScraper:
    """Scrape androidfilehost.com: search for files and download them via
    the site's mirror API.

    Attributes:
        base_url: Root URL of androidfilehost.com.
        download_dir: Path where downloads are stored (created if missing).
        mirror_preference: Preferred mirror location, "USA" or "Germany".
        session: requests.Session carrying a browser-like User-Agent.
    """

    def __init__(self, download_dir="afh_archive", mirror_preference="USA"):
        self.base_url = "https://androidfilehost.com"
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(exist_ok=True)
        self.mirror_preference = mirror_preference  # "USA" or "Germany"
        self.session = requests.Session()
        # A browser-like User-Agent avoids trivial bot blocking.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    @staticmethod
    def _attr_text(node):
        """Return the first line of a metadata node's text, or 'N/A' if the
        node is missing."""
        return node.text.split('\n')[0].strip() if node else 'N/A'

    def search_files(self, search_term, page=1, sort_by='date'):
        """Search for files by name with pagination and sorting.

        Args:
            search_term: Search query.
            page: Page number (1-based).
            sort_by: 'date' for newest (site default), 'downloads' for most
                popular.

        Returns:
            A list of dicts with keys 'fid', 'filename', 'downloads',
            'size', 'upload_date' and 'url'; an empty list on any error.
        """
        encoded_term = quote_plus(search_term)
        search_url = f"{self.base_url}/?w=search&s={encoded_term}&type=files"

        # 'date' is the site default, so only 'downloads' needs a parameter.
        if sort_by == 'downloads':
            search_url += "&sort_by=downloads&sort_dir=DESC"

        if page > 1:
            search_url += f"&page={page}"

        print(f"Searching page {page}: {search_url}")

        try:
            response = self.session.get(search_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            files = []

            # Each search result is rendered as a list-group item.
            for item in soup.select('li.list-group-item.file'):
                file_link = item.select_one('div.file-name h3 a')
                if not file_link:
                    continue

                filename = file_link.text.strip()
                href = file_link.get('href', '')

                # Extract FID from href (format: /?fid=12495398787939834307)
                fid_match = re.search(r'fid=(\d+)', href)
                if not fid_match:
                    continue
                fid = fid_match.group(1)

                # File metadata columns: downloads, size, upload date.
                downloads = item.select_one('div.file-attr:nth-of-type(1) span.file-attr-value')
                size = item.select_one('div.file-attr:nth-of-type(2) span.file-attr-value')
                upload_date = item.select_one('div.file-attr:nth-of-type(3) span.file-attr-value')

                files.append({
                    'fid': fid,
                    'filename': filename,
                    'downloads': self._attr_text(downloads),
                    'size': self._attr_text(size),
                    'upload_date': self._attr_text(upload_date),
                    'url': f"{self.base_url}/?fid={fid}"
                })

            return files

        except Exception as e:
            print(f"Error searching: {e}")
            return []

    def get_download_mirrors(self, fid):
        """Fetch the download mirrors for a file via the site's OTF API.

        Args:
            fid: AndroidFileHost file id (string of digits).

        Returns:
            A list of dicts with keys 'url', 'location' ("USA", "Germany"
            or "Unknown"), 'is_primary' and 'name'; empty list on error.
        """
        mirrors_api = f"{self.base_url}/libs/otf/mirrors.otf.php"

        try:
            post_data = {
                'submit': 'submit',
                'action': 'getdownloadmirrors',
                'fid': fid
            }

            response = self.session.post(mirrors_api, data=post_data, timeout=30)
            response.raise_for_status()

            data = response.json()

            # The API reports status/code as strings, not ints.
            if data.get('STATUS') != '1' or data.get('CODE') != '200':
                print(f"API returned error: {data.get('MESSAGE')}")
                return []

            mirrors_data = data.get('MIRRORS', [])
            if not mirrors_data:
                print("No mirrors in API response")
                return []

            mirrors = []
            for mirror in mirrors_data:
                url = mirror.get('url')
                name = mirror.get('name', '')

                if not url:
                    continue

                location = "Unknown"
                if 'Virginia' in name or 'USA' in name:
                    location = "USA"
                elif 'Germany' in name:
                    location = "Germany"

                # Higher weight = primary mirror.  Tolerate a missing or
                # non-numeric weight instead of aborting the whole list.
                try:
                    weight = int(mirror.get('weight', 0))
                except (TypeError, ValueError):
                    weight = 0

                mirrors.append({
                    'url': url,
                    'location': location,
                    'is_primary': weight >= 100000,
                    'name': name
                })

            return mirrors

        except Exception as e:
            print(f"Error getting mirrors: {e}")
            return []

    def select_mirror(self, mirrors):
        """Choose a mirror URL from a mirror list.

        Preference order: preferred location, then the primary mirror,
        then the first available.  Returns None when *mirrors* is empty.
        """
        if not mirrors:
            return None

        # Try to find preferred location first.
        preferred = [m for m in mirrors if self.mirror_preference in m['location']]
        if preferred:
            return preferred[0]['url']

        # Fall back to primary.
        primary = [m for m in mirrors if m['is_primary']]
        if primary:
            return primary[0]['url']

        # Fall back to first available.
        return mirrors[0]['url']

    def download_file(self, fid, filename):
        """Download one file to self.download_dir with a progress display.

        Skips the download when the target already exists; deletes a
        partially written file on failure so a retry starts clean.

        Returns:
            True on success (or when the file already exists), else False.
        """
        output_path = self.download_dir / filename

        # Check if already downloaded.
        if output_path.exists():
            print(f"Already exists: {filename}")
            return True

        print(f"Getting mirrors for: {filename}")
        mirrors = self.get_download_mirrors(fid)

        if not mirrors:
            print(f"No mirrors found for: {filename}")
            return False

        download_url = self.select_mirror(mirrors)
        if not download_url:
            print(f"Could not select mirror for: {filename}")
            return False

        print(f"Downloading from: {download_url}")

        try:
            response = self.session.get(download_url, stream=True, timeout=120)
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0

            with open(output_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        # Only show percentage when the server sent a length.
                        if total_size > 0:
                            progress = (downloaded / total_size) * 100
                            print(f"\rProgress: {progress:.1f}%", end='', flush=True)

            print(f"\nDownloaded: {filename}")
            return True

        except Exception as e:
            print(f"\nError downloading: {e}")
            # Remove the partial download so a retry starts clean.
            if output_path.exists():
                output_path.unlink()
            return False

    def batch_download(self, search_terms, num_pages=1, sort_by='date', max_files=None, delay=2):
        """Search and download files for multiple terms with pagination.

        Args:
            search_terms: List of search queries.
            num_pages: Number of pages to scrape per search term.
            sort_by: 'date' for newest, 'downloads' for most popular.
            max_files: Maximum files to download per search term
                (None = unlimited).
            delay: Seconds to wait between downloads (politeness throttle).

        Returns:
            A list of per-file result dicts with keys 'search_term',
            'page', 'filename', 'fid', 'size' and 'success'.
        """
        all_results = []

        for term in search_terms:
            print(f"\n{'='*70}")
            print(f"Searching for: {term}")
            print(f"{'='*70}")

            files_downloaded = 0

            for page in range(1, num_pages + 1):
                if max_files and files_downloaded >= max_files:
                    break

                files = self.search_files(term, page=page, sort_by=sort_by)

                if not files:
                    print(f"No files found on page {page}")
                    break

                print(f"Found {len(files)} files on page {page}\n")

                for file_info in files:
                    if max_files and files_downloaded >= max_files:
                        print(f"Reached maximum of {max_files} files for this search term")
                        break

                    print(f"File: {file_info['filename']}")
                    print(f" Size: {file_info['size']} | Downloads: {file_info['downloads']} | Date: {file_info['upload_date']}")
                    print(f" FID: {file_info['fid']}")

                    success = self.download_file(file_info['fid'], file_info['filename'])

                    all_results.append({
                        'search_term': term,
                        'page': page,
                        'filename': file_info['filename'],
                        'fid': file_info['fid'],
                        'size': file_info['size'],
                        'success': success
                    })

                    if success:
                        files_downloaded += 1

                    print()
                    time.sleep(delay)

        return all_results
def main():
    """Interactive entry point.

    Prompts for search terms, sort order, per-term file count and mirror
    preference, runs the scraper and prints a success summary.
    """
    print("AndroidFileHost Scraper by fl0w")
    print("github.com/codefl0w")
    print("https://xdaforums.com/m/fl0w.12361087/")
    print("="*70)

    # Get search terms
    print("\nEnter search terms (comma-separated):")
    print("Example: lineage, twrp, magisk")
    search_input = input("Search terms: ").strip()
    search_terms = [term.strip() for term in search_input.split(',') if term.strip()]

    if not search_terms:
        print("No search terms provided. Exiting.")
        return

    # Get user preferences
    print("\nHow should files be sorted?")
    print("1. Newest first")
    print("2. Most popular (by downloads)")
    sort_choice = input("Enter choice (1 or 2): ").strip()
    sort_by = 'downloads' if sort_choice == '2' else 'date'

    print("\nHow many files should be downloaded per search term?")
    # Re-prompt instead of crashing on non-numeric or non-positive input.
    while True:
        try:
            max_files = int(input("Enter number: ").strip())
        except ValueError:
            print("Please enter a valid number.")
            continue
        if max_files > 0:
            break
        print("Please enter a positive number.")

    # Calculate pages needed (the site lists 15 files per page).
    num_pages = (max_files + 14) // 15

    print("\nWhich mirror should be used primarily?")
    print("1. USA")
    print("2. Germany")
    mirror_choice = input("Enter choice (1 or 2): ").strip()
    mirror_pref = "Germany" if mirror_choice == '2' else "USA"

    # Initialize scraper; download next to this script rather than into
    # the default "afh_archive" subdirectory.
    scraper = AFHScraper(
        download_dir=os.path.dirname(os.path.abspath(__file__)),
        mirror_preference=mirror_pref
    )

    print(f"\n{'='*70}")
    print(f"Download directory: {scraper.download_dir.absolute()}")
    print(f"Mirror preference: {scraper.mirror_preference}")
    print(f"Sort by: {'Most popular' if sort_by == 'downloads' else 'Newest'}")
    print(f"Max files per search: {max_files}")
    print(f"Search terms: {', '.join(search_terms)}")
    print("="*70)

    results = scraper.batch_download(search_terms, num_pages=num_pages, sort_by=sort_by,
                                     max_files=max_files, delay=3)

    # Summary
    successful = sum(1 for r in results if r['success'])
    print(f"\n{'='*70}")
    print(f"Summary: {successful}/{len(results)} files downloaded successfully")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)