import requests
from bs4 import BeautifulSoup
import re
import os
import time
from pathlib import Path
from urllib.parse import quote_plus


class AFHScraper:
    def __init__(self, download_dir="afh_archive", mirror_preference="USA"):
        self.base_url = "https://androidfilehost.com"
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(exist_ok=True)
        self.mirror_preference = mirror_preference  # "USA" or "Germany"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def search_files(self, search_term, page=1, sort_by='date'):
        """Search for files by name with pagination and sorting.

        Args:
            search_term: Search query
            page: Page number (1-based)
            sort_by: 'date' for newest, 'downloads' for most popular
        """
        encoded_term = quote_plus(search_term)
        search_url = f"{self.base_url}/?w=search&s={encoded_term}&type=files"

        # Add sorting
        if sort_by == 'downloads':
            search_url += "&sort_by=downloads&sort_dir=DESC"

        # Add pagination
        if page > 1:
            search_url += f"&page={page}"

        print(f"Searching page {page}: {search_url}")

        try:
            response = self.session.get(search_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            files = []

            # Find all file list items
            file_items = soup.select('li.list-group-item.file')

            for item in file_items:
                # Extract file name and FID
                file_link = item.select_one('div.file-name h3 a')
                if not file_link:
                    continue

                filename = file_link.text.strip()
                href = file_link.get('href', '')

                # Extract FID from href (format: /?fid=12495398787939834307)
                fid_match = re.search(r'fid=(\d+)', href)
                if not fid_match:
                    continue

                fid = fid_match.group(1)

                # Extract file metadata
                downloads = item.select_one('div.file-attr:nth-of-type(1) span.file-attr-value')
                size = item.select_one('div.file-attr:nth-of-type(2) span.file-attr-value')
                upload_date = item.select_one('div.file-attr:nth-of-type(3) span.file-attr-value')

                files.append({
                    'fid': fid,
                    'filename': filename,
                    'downloads': downloads.text.split('\n')[0].strip() if downloads else 'N/A',
                    'size': size.text.split('\n')[0].strip() if size else 'N/A',
                    'upload_date': upload_date.text.split('\n')[0].strip() if upload_date else 'N/A',
                    'url': f"{self.base_url}/?fid={fid}"
                })

            return files

        except Exception as e:
            print(f"Error searching: {e}")
            return []

    def get_download_mirrors(self, fid):
        """Get download mirrors for a file via the mirrors API."""
        mirrors_api = f"{self.base_url}/libs/otf/mirrors.otf.php"

        try:
            # POST to the mirrors API
            post_data = {
                'submit': 'submit',
                'action': 'getdownloadmirrors',
                'fid': fid
            }

            response = self.session.post(mirrors_api, data=post_data, timeout=30)
            response.raise_for_status()

            data = response.json()

            if data.get('STATUS') != '1' or data.get('CODE') != '200':
                print(f"API returned error: {data.get('MESSAGE')}")
                return []

            mirrors_data = data.get('MIRRORS', [])
            if not mirrors_data:
                print("No mirrors in API response")
                return []

            # Parse mirrors from JSON
            mirrors = []
            for mirror in mirrors_data:
                url = mirror.get('url')
                name = mirror.get('name', '')

                if not url:
                    continue

                location = "Unknown"
                if 'Virginia' in name or 'USA' in name:
                    location = "USA"
                elif 'Germany' in name:
                    location = "Germany"

                # Higher weight = primary mirror
                weight = int(mirror.get('weight', 0))
                is_primary = weight >= 100000

                mirrors.append({
                    'url': url,
                    'location': location,
                    'is_primary': is_primary,
                    'name': name
                })

            return mirrors

        except Exception as e:
            print(f"Error getting mirrors: {e}")
            return []

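    # Expected shape of the mirrors API response, inferred from the parsing
    # above (not an official spec; field names are exactly as consumed here):
    #
    #     {
    #         "STATUS": "1",
    #         "CODE": "200",
    #         "MIRRORS": [
    #             {"url": "https://...", "name": "... Virginia ...", "weight": "100000"},
    #             ...
    #         ]
    #     }
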
    def select_mirror(self, mirrors):
        """Pick a mirror URL: preferred location first, then the primary
        mirror, then the first mirror available."""
        if not mirrors:
            return None

        # Try to find the preferred location
        preferred = [m for m in mirrors if self.mirror_preference in m['location']]
        if preferred:
            return preferred[0]['url']

        # Fall back to the primary mirror
        primary = [m for m in mirrors if m['is_primary']]
        if primary:
            return primary[0]['url']

        # Fall back to the first available mirror
        return mirrors[0]['url']

    def download_file(self, fid, filename):
        """Download a file by FID into the download directory."""
        # Strip path separators so a scraped filename can't escape the directory
        safe_name = filename.replace('/', '_').replace('\\', '_')
        output_path = self.download_dir / safe_name

        # Check if already downloaded
        if output_path.exists():
            print(f"Already exists: {filename}")
            return True

        print(f"Getting mirrors for: {filename}")
        mirrors = self.get_download_mirrors(fid)

        if not mirrors:
            print(f"No mirrors found for: {filename}")
            return False

        download_url = self.select_mirror(mirrors)
        if not download_url:
            print(f"Could not select mirror for: {filename}")
            return False

        print(f"Downloading from: {download_url}")

        try:
            response = self.session.get(download_url, stream=True, timeout=120)
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0

            with open(output_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            progress = (downloaded / total_size) * 100
                            print(f"\rProgress: {progress:.1f}%", end='', flush=True)

            print(f"\nDownloaded: {filename}")
            return True

        except Exception as e:
            print(f"\nError downloading: {e}")
            # Remove the partial file so a retry starts clean
            if output_path.exists():
                output_path.unlink()
            return False

    def batch_download(self, search_terms, num_pages=1, sort_by='date', max_files=None, delay=2):
        """Search and download files for multiple terms with pagination.

        Args:
            search_terms: List of search queries
            num_pages: Number of pages to scrape per search term
            sort_by: 'date' for newest, 'downloads' for most popular
            max_files: Maximum files to download per search term (None = unlimited)
            delay: Seconds to wait between downloads
        """
        all_results = []

        for term in search_terms:
            print(f"\n{'='*70}")
            print(f"Searching for: {term}")
            print(f"{'='*70}")

            files_downloaded = 0

            for page in range(1, num_pages + 1):
                if max_files and files_downloaded >= max_files:
                    break

                files = self.search_files(term, page=page, sort_by=sort_by)

                if not files:
                    print(f"No files found on page {page}")
                    break

                print(f"Found {len(files)} files on page {page}\n")

                for file_info in files:
                    if max_files and files_downloaded >= max_files:
                        print(f"Reached maximum of {max_files} files for this search term")
                        break

                    print(f"File: {file_info['filename']}")
                    print(f"  Size: {file_info['size']} | Downloads: {file_info['downloads']} | Date: {file_info['upload_date']}")
                    print(f"  FID: {file_info['fid']}")

                    success = self.download_file(file_info['fid'], file_info['filename'])

                    all_results.append({
                        'search_term': term,
                        'page': page,
                        'filename': file_info['filename'],
                        'fid': file_info['fid'],
                        'size': file_info['size'],
                        'success': success
                    })

                    if success:
                        files_downloaded += 1

                    print()
                    time.sleep(delay)

        return all_results


def main():
    print("AndroidFileHost Scraper by fl0w")
    print("github.com/codefl0w")
    print("https://xdaforums.com/m/fl0w.12361087/")
    print("="*70)

    # Get search terms
    print("\nEnter search terms (comma-separated):")
    print("Example: lineage, twrp, magisk")
    search_input = input("Search terms: ").strip()
    search_terms = [term.strip() for term in search_input.split(',') if term.strip()]

    if not search_terms:
        print("No search terms provided. Exiting.")
        return

    # Get user preferences
    print("\nHow should files be sorted?")
    print("1. Newest first")
    print("2. Most popular (by downloads)")
    sort_choice = input("Enter choice (1 or 2): ").strip()
    sort_by = 'downloads' if sort_choice == '2' else 'date'

    print("\nHow many files should be downloaded per search term?")
    try:
        max_files = int(input("Enter number: ").strip())
    except ValueError:
        print("Invalid number. Exiting.")
        return

    # Calculate pages needed (15 files per page)
    num_pages = (max_files + 14) // 15
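    # Ceiling division: e.g. max_files=20 -> (20 + 14) // 15 = 2 pages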

    print("\nWhich mirror should be used primarily?")
    print("1. USA")
    print("2. Germany")
    mirror_choice = input("Enter choice (1 or 2): ").strip()
    mirror_pref = "Germany" if mirror_choice == '2' else "USA"

    # Initialize scraper, downloading into the script's own directory
    scraper = AFHScraper(
        download_dir=os.path.dirname(os.path.abspath(__file__)),
        mirror_preference=mirror_pref
    )

    print(f"\n{'='*70}")
    print(f"Download directory: {scraper.download_dir.absolute()}")
    print(f"Mirror preference: {scraper.mirror_preference}")
    print(f"Sort by: {'Most popular' if sort_by == 'downloads' else 'Newest'}")
    print(f"Max files per search: {max_files}")
    print(f"Search terms: {', '.join(search_terms)}")
    print("="*70)

    results = scraper.batch_download(search_terms, num_pages=num_pages, sort_by=sort_by,
                                     max_files=max_files, delay=3)

    # Summary
    successful = sum(1 for r in results if r['success'])
    print(f"\n{'='*70}")
    print(f"Summary: {successful}/{len(results)} files downloaded successfully")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()
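
# Non-interactive usage sketch (illustrative values, not part of the original
# script), bypassing the input() prompts in main():
#
#     scraper = AFHScraper(download_dir="afh_archive", mirror_preference="USA")
#     scraper.batch_download(["twrp"], num_pages=1, sort_by="date", max_files=5)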