|
1 | 1 | #!/usr/bin/env python3 |
2 | 2 | import os |
3 | | -import sys |
4 | 3 | import json |
5 | 4 | import time |
6 | 5 | import requests |
| 6 | +import base64 |
7 | 7 | from datetime import datetime, timezone |
8 | 8 |
|
# --- CONFIGURATION ---
# GitHub token used for every API request; required (main() refuses to run
# when it is unset).
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
# Optional: URL of your Worker endpoint that receives the results via POST.
API_ENDPOINT = os.environ.get('WORKER_API_URL')
# Output file name; a relative path, so it is resolved against the current
# working directory at write time.
RESULTS_FILE = "daily_workers.json"

# Headers for GitHub API
# NOTE: built once at import time, so Authorization reads "Bearer None" when
# GITHUB_TOKEN is unset; callers must check GITHUB_TOKEN before sending.
HEADERS = {
    'Authorization': f'Bearer {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github+json',
    # Pin the REST API version so response shapes do not shift when GitHub
    # changes its default version.
    'X-GitHub-Api-Version': '2022-11-28',
    'User-Agent': 'Worker-Discovery-Bot/1.0',
}
21 | 20 |
|
# --- TAGGING LOGIC ---
# We look for these keywords in package.json to auto-label the architecture.
# Layout: category -> {dependency-name fragment -> human-readable label}.
# analyze_stack() tests each fragment as a SUBSTRING of every dependency name,
# so short fragments match broadly (e.g. "react" also matches "lucide-react").
TECH_SIGNATURES = {
    "framework": {
        "hono": "Hono",
        "astro": "Astro",
        "remix": "Remix",
        "next": "Next.js",
        "itty-router": "Itty Router",
        "fastapi": "FastAPI (Python)",
    },
    "database": {
        "drizzle-orm": "Drizzle ORM",
        "prisma": "Prisma",
        "kysely": "Kysely",
        "@supabase/supabase-js": "Supabase",
        "mongoose": "MongoDB",
    },
    "infrastructure": {
        "@cloudflare/ai": "Workers AI",
        "@cloudflare/vectorize": "Vectorize",
        "@cloudflare/kv-asset-handler": "KV Assets",
        "toucan-js": "Toucan Telemetry",
        "zod": "Zod Validation",
    },
    "frontend": {
        "clsx": "Tailwind/Shadcn Utils",
        "lucide-react": "Lucide Icons",
        "radix-ui": "Radix UI",
        "react": "React",
        "vue": "Vue",
    },
}
24 | 54 |
|
def get_file_content(repo_full_name, path):
    """Fetch a file from a GitHub repository and return it as text.

    Args:
        repo_full_name: Repository identifier as found in the GitHub API's
            ``full_name`` field (interpolated verbatim into the URL).
        path: Path of the file inside the repository.

    Returns:
        The file content decoded as UTF-8 (a leading BOM is stripped), or
        ``None`` when the request fails, the status is not 200, or the
        response cannot be decoded. This function never raises for those
        cases: lookups are best-effort.
    """
    # The contents endpoint lives under /repos/<full_name>/contents/<path>;
    # without the "/repos/" segment every lookup misses.
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
        if res.status_code != 200:
            return None
        # "utf-8-sig" drops a BOM that would otherwise break json.loads later.
        return base64.b64decode(res.json()['content']).decode('utf-8-sig')
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: network/timeout. ValueError: invalid JSON, bad
        # base64, or undecodable bytes. KeyError/TypeError: the payload is
        # not a single-file object (e.g. a directory listing).
        return None
37 | 66 |
|
def analyze_stack(repo_name, package_json=None, signatures=None):
    """Detect a repository's tech stack from its package.json.

    Args:
        repo_name: Repository identifier passed to ``get_file_content`` when
            ``package_json`` is not supplied.
        package_json: Optional package.json text that was already fetched.
            When ``None`` (the default) the file is fetched from GitHub.
        signatures: Optional ``{category: {fragment: label}}`` table.
            Defaults to the module-level ``TECH_SIGNATURES``.

    Returns:
        A list of unique labels in signature-table order (stable across
        runs). ``["Unknown/Non-JS"]`` when no package.json text is
        available, ``["Parse Error"]`` when the text is not a JSON object,
        and ``[]`` when nothing matches.
    """
    if package_json is None:
        package_json = get_file_content(repo_name, "package.json")
    if not package_json:
        return ["Unknown/Non-JS"]
    if signatures is None:
        signatures = TECH_SIGNATURES

    try:
        data = json.loads(package_json)
    except ValueError:
        return ["Parse Error"]
    if not isinstance(data, dict):
        return ["Parse Error"]

    # Combine deps and devDeps. A section that is missing or not an object
    # (e.g. "dependencies": null) is skipped instead of failing the analysis.
    dep_names = set()
    for section in ('dependencies', 'devDependencies'):
        deps = data.get(section)
        if isinstance(deps, dict):
            dep_names.update(deps)

    tags = []
    for labels_by_fragment in signatures.values():
        for fragment, label in labels_by_fragment.items():
            # Substring match: "radix-ui" must hit "@radix-ui/react-dialog".
            if label not in tags and any(fragment in dep for dep in dep_names):
                tags.append(label)
    return tags
57 | 88 |
|
def search_broad_workers():
    """Search GitHub code for Worker config files and describe each repo.

    Runs one code-search request, de-duplicates hits by repository, skips
    repositories flagged as forks, and tags each remaining repository via
    ``analyze_stack``.

    Returns:
        A list of dicts with the keys ``name``, ``url``, ``description``,
        ``stars`` (always -1; not fetched), ``detected_stack``,
        ``config_file`` and ``discovered_at`` (UTC ISO-8601 timestamp).
        An empty list when the search request fails for any reason.
    """
    # We search for 'compatibility_date', expected in wrangler.toml/jsonc.
    # NOTE(review): the regex `path:` qualifier and the in-query
    # `sort:indexed` look like web code-search syntax; confirm the REST
    # search endpoint accepts them.
    query = "compatibility_date path:/(wrangler\\.jsonc|wrangler\\.toml)/ sort:indexed"
    url = "https://api.github.com/search/code"
    # Passing the query through `params` lets requests URL-encode the
    # spaces, pipes and backslashes it contains.
    params = {'q': query, 'per_page': 30}

    print(f"🌊 Casting wide net with query: {query}")
    try:
        res = requests.get(url, headers=HEADERS, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []

    if res.status_code != 200:
        print(f"❌ Error: {res.status_code} {res.text}")
        return []

    try:
        items = res.json().get('items', [])
    except (ValueError, AttributeError) as e:
        # Body was not JSON, or not a JSON object.
        print(f"❌ Error: {e}")
        return []

    results = []
    seen_repos = set()

    print(f"🔍 Analyzing {len(items)} raw hits...")

    for item in items:
        repo = item['repository']
        name = repo['full_name']

        if name in seen_repos or repo.get('fork'):
            continue
        seen_repos.add(name)

        print(f" 👉 Inspecting: {name}...")

        # 1. Enrich with Tech Stack
        stack_tags = analyze_stack(name)

        # 2. Build Payload
        results.append({
            "name": name,
            "url": repo['html_url'],
            # `or` rather than a .get default: the key can be present with
            # a null value, which a default would not replace.
            "description": repo.get('description') or 'No description',
            "stars": -1,  # Requires separate API call if needed, skipping for speed
            "detected_stack": stack_tags,
            "config_file": item['name'],
            "discovered_at": datetime.now(timezone.utc).isoformat(),
        })
        time.sleep(1)  # Respect rate limits

    return results
94 | 135 |
|
def main():
    """Run the discovery pipeline: search, save to disk, optionally POST.

    Returns:
        Process exit status: 1 when GITHUB_TOKEN is not set (nothing is
        searched or written), otherwise 0. A failed POST to API_ENDPOINT is
        reported on stdout but does not change the status, since the
        results are already saved.
    """
    if not GITHUB_TOKEN:
        print("⚠️ Error: GITHUB_TOKEN not set.")
        return 1

    # 1. Run Search
    new_discoveries = search_broad_workers()

    # 2. Save to Disk
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(new_discoveries, f, indent=2)
    print(f"\n✅ Saved {len(new_discoveries)} repos to {RESULTS_FILE}")

    # 3. Post to your Worker API (Optional)
    if API_ENDPOINT and new_discoveries:
        print(f"📤 Posting to {API_ENDPOINT}...")
        try:
            # A timeout keeps an unresponsive endpoint from hanging the run.
            res = requests.post(
                API_ENDPOINT,
                json={"payload": new_discoveries},
                timeout=30,
            )
            print(f" Status: {res.status_code}")
        except requests.RequestException as e:
            print(f" Failed to post: {e}")

    return 0


if __name__ == "__main__":
    # Propagate main()'s status so schedulers/CI can detect a missing token.
    raise SystemExit(main())
0 commit comments