|
1 | | -#!/usr/bin/env python3 |
2 | | -import os |
3 | | -import json |
4 | | -import time |
5 | | -import requests |
6 | | -import base64 |
7 | | -from datetime import datetime, timezone |
8 | | - |
# --- CONFIGURATION ---
# Both environment-backed settings are read once, at import time.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')  # Required: main() refuses to run without it
API_ENDPOINT = os.environ.get('WORKER_API_URL')  # Optional: results are POSTed here when set
RESULTS_FILE = "daily_workers.json"  # Local file the discoveries are written to

# Headers sent with every GitHub API request
HEADERS = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json",
    "User-Agent": "Worker-Discovery-Bot/1.0",
}
20 | | - |
# --- TAGGING LOGIC ---
# Maps a category to {package-name keyword: human-readable label}.
# The keywords are looked up in a repo's package.json dependencies to
# auto-label its architecture.
TECH_SIGNATURES = {
    # Web / routing frameworks
    "framework": {
        "hono": "Hono",
        "astro": "Astro",
        "remix": "Remix",
        "next": "Next.js",
        "itty-router": "Itty Router",
        "fastapi": "FastAPI (Python)",
    },
    # ORMs and database clients
    "database": {
        "drizzle-orm": "Drizzle ORM",
        "prisma": "Prisma",
        "kysely": "Kysely",
        "@supabase/supabase-js": "Supabase",
        "mongoose": "MongoDB",
    },
    # Cloudflare bindings and supporting libraries
    "infrastructure": {
        "@cloudflare/ai": "Workers AI",
        "@cloudflare/vectorize": "Vectorize",
        "@cloudflare/kv-asset-handler": "KV Assets",
        "toucan-js": "Toucan Telemetry",
        "zod": "Zod Validation",
    },
    # UI libraries
    "frontend": {
        "clsx": "Tailwind/Shadcn Utils",
        "lucide-react": "Lucide Icons",
        "radix-ui": "Radix UI",
        "react": "React",
        "vue": "Vue",
    },
}
54 | | - |
def get_file_content(repo_full_name, path):
    """Fetch a file from a GitHub repository and return it as text.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.
        path: Path of the file inside the repository.

    Returns:
        The file's content decoded as UTF-8, or ``None`` when the request
        fails, the response status is not 200, or the payload cannot be
        decoded (best-effort lookup: callers treat ``None`` as "no file").
    """
    # The contents endpoint lives under /repos/<owner>/<name>/contents/<path>.
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
        if res.status_code != 200:
            return None
        # The API returns the file body base64-encoded in the 'content' field.
        return base64.b64decode(res.json()['content']).decode('utf-8')
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # ValueError covers invalid JSON, invalid base64 and invalid UTF-8;
        # KeyError/TypeError cover payloads without a usable 'content' field.
        return None
66 | | - |
def analyze_stack(repo_name):
    """Detect a repository's tech stack from its package.json.

    Args:
        repo_name: Repository identifier passed through to
            ``get_file_content``.

    Returns:
        A list of unique labels from ``TECH_SIGNATURES``, in signature-table
        order, for every signature keyword found in a dependency name.
        Returns ``["Unknown/Non-JS"]`` when no package.json could be fetched
        and ``["Parse Error"]`` when it is not a JSON object with
        well-formed dependency sections.
    """
    package_json = get_file_content(repo_name, "package.json")
    if not package_json:
        return ["Unknown/Non-JS"]

    try:
        data = json.loads(package_json)
    except ValueError:
        return ["Parse Error"]
    if not isinstance(data, dict):
        return ["Parse Error"]

    # Combine dependency names from deps and devDeps; a missing or null
    # section counts as empty.
    dep_names = set()
    for section in ('dependencies', 'devDependencies'):
        deps = data.get(section) or {}
        if not isinstance(deps, dict):
            return ["Parse Error"]
        dep_names.update(deps)

    tags = []
    for signatures in TECH_SIGNATURES.values():
        for pkg, label in signatures.items():
            # Substring match, so scoped packages such as
            # "@radix-ui/react-dialog" still hit the "radix-ui" keyword.
            if any(pkg in dep for dep in dep_names):
                tags.append(label)

    # De-duplicate while keeping a deterministic order.
    return list(dict.fromkeys(tags))
| 1 | +import urllib.parse |
88 | 2 |
|
def search_broad_workers():
    """Search GitHub code for Worker config files and describe each repo.

    Queries the code-search endpoint for ``wrangler.toml`` / ``wrangler.jsonc``
    files mentioning ``compatibility_date``, skips forks and repositories
    already seen, and enriches every remaining repository with the tags
    returned by ``analyze_stack``.

    Returns:
        A list of dicts with the keys ``name``, ``url``, ``description``,
        ``detected_stack``, ``config_file`` and ``discovered_at``. The list
        is empty when the search request fails or returns a non-200 status.
    """
    # NOTE(review): the query uses regex `path:/.../` syntax; confirm that the
    # REST code-search endpoint accepts it before relying on the results.
    raw_query = 'compatibility_date path:/(wrangler\\.jsonc|wrangler\\.toml)/'

    # sort/order/per_page are separate parameters, not part of the query
    # itself. Passing them via `params` lets requests add the '?' and
    # percent-encode the spaces, slashes and regex characters.
    params = {
        "q": raw_query,
        "sort": "indexed",
        "order": "desc",
        "per_page": 30,
    }

    print(f"🌊 Casting wide net with encoded query: {raw_query}")

    try:
        res = requests.get(
            "https://api.github.com/search/code",
            headers=HEADERS,
            params=params,
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"❌ Fatal Request Error: {e}")
        return []

    if res.status_code != 200:
        print(f"❌ Error: {res.status_code} - {res.text}")
        return []

    try:
        items = res.json().get('items', [])
    except (ValueError, AttributeError) as e:
        # Body was not JSON, or not a JSON object.
        print(f"❌ Fatal Request Error: {e}")
        return []

    results = []
    seen_repos = set()

    print(f"🔍 Analyzing {len(items)} raw hits...")

    for item in items:
        repo = item.get('repository') or {}
        name = repo.get('full_name')

        # Skip malformed hits, duplicates and forks to keep the list high quality
        if not name or name in seen_repos or repo.get('fork'):
            continue
        seen_repos.add(name)

        print(f" 👉 Inspecting: {name}...")

        stack_tags = analyze_stack(name)

        results.append({
            "name": name,
            "url": repo.get('html_url'),
            # The description key can be present with a null value, so a
            # plain .get() default would not apply.
            "description": repo.get('description') or 'No description',
            "detected_stack": stack_tags,
            "config_file": item.get('name'),
            "discovered_at": datetime.now(timezone.utc).isoformat(),
        })
        time.sleep(1)  # Sleep to avoid GitHub secondary rate limits

    return results


def main():
    """Run the discovery, save the results, and optionally post them.

    Does nothing but print an error when ``GITHUB_TOKEN`` is unset.
    Otherwise writes the discoveries to ``RESULTS_FILE`` as JSON and, when
    ``API_ENDPOINT`` is set and there is at least one discovery, POSTs them
    as ``{"payload": [...]}``. A failed POST is reported, not raised.
    """
    if not GITHUB_TOKEN:
        print("⚠️ Error: GITHUB_TOKEN not set.")
        return

    # 1. Run Search
    new_discoveries = search_broad_workers()

    # 2. Save to Disk
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(new_discoveries, f, indent=2)
    print(f"\n✅ Saved {len(new_discoveries)} repos to {RESULTS_FILE}")

    # 3. Post to your Worker API (Optional)
    if API_ENDPOINT and new_discoveries:
        print(f"📤 Posting to {API_ENDPOINT}...")
        try:
            # The timeout keeps an unresponsive endpoint from hanging the run.
            res = requests.post(
                API_ENDPOINT,
                json={"payload": new_discoveries},
                timeout=30,
            )
            print(f" Status: {res.status_code}")
        except requests.RequestException as e:
            print(f" Failed to post: {e}")


if __name__ == "__main__":
    main()
0 commit comments