Skip to content

Commit 9c98e3f

Browse files
authored
Enhance search_repos.py with new features
Refactor GitHub repo search script to improve functionality and add tagging logic.
1 parent 9c31190 commit 9c98e3f

1 file changed

Lines changed: 132 additions & 80 deletions

File tree

Lines changed: 132 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,159 @@
11
#!/usr/bin/env python3
22
import os
3-
import sys
43
import json
54
import time
65
import requests
6+
import base64
77
from datetime import datetime, timezone
88

99
# --- CONFIGURATION ---
10-
TOKEN = os.environ.get('GITHUB_TOKEN')
11-
if not TOKEN:
12-
print("Error: GITHUB_TOKEN environment variable not set")
13-
sys.exit(1)
10+
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') # Required
11+
API_ENDPOINT = os.environ.get('WORKER_API_URL') # Optional: Your Worker URL
12+
RESULTS_FILE = "daily_workers.json"
1413

14+
# Headers for GitHub API
1515
HEADERS = {
16+
'Authorization': f'Bearer {GITHUB_TOKEN}',
1617
'Accept': 'application/vnd.github+json',
17-
'Authorization': f'Bearer {TOKEN}',
18-
'X-GitHub-Api-Version': '2022-11-28',
19-
'User-Agent': 'Cloudflare-Worker-Hunter'
18+
'User-Agent': 'Worker-Discovery-Bot/1.0'
2019
}
2120

22-
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
23-
OUTPUT_FILE = os.path.join(SCRIPT_DIR, 'results.json')
21+
# --- TAGGING LOGIC ---
22+
# We look for these keywords in package.json to auto-label the architecture
23+
TECH_SIGNATURES = {
24+
"framework": {
25+
"hono": "Hono",
26+
"astro": "Astro",
27+
"remix": "Remix",
28+
"next": "Next.js",
29+
"itty-router": "Itty Router",
30+
"fastapi": "FastAPI (Python)"
31+
},
32+
"database": {
33+
"drizzle-orm": "Drizzle ORM",
34+
"prisma": "Prisma",
35+
"kysely": "Kysely",
36+
"@supabase/supabase-js": "Supabase",
37+
"mongoose": "MongoDB"
38+
},
39+
"infrastructure": {
40+
"@cloudflare/ai": "Workers AI",
41+
"@cloudflare/vectorize": "Vectorize",
42+
"@cloudflare/kv-asset-handler": "KV Assets",
43+
"toucan-js": "Toucan Telemetry",
44+
"zod": "Zod Validation"
45+
},
46+
"frontend": {
47+
"clsx": "Tailwind/Shadcn Utils",
48+
"lucide-react": "Lucide Icons",
49+
"radix-ui": "Radix UI",
50+
"react": "React",
51+
"vue": "Vue"
52+
}
53+
}
2454

25-
# --- TARGET QUERIES ---
26-
# We use multiple specific queries to bypass GitHub's search limits and find "the good stuff"
27-
QUERIES = [
28-
# 1. Astro + Cloudflare + Shadcn (The modern stack)
29-
'"@astrojs/cloudflare" "tailwind-merge" path:package.json',
30-
# 2. Next.js on Pages + Shadcn
31-
'"@cloudflare/next-on-pages" "lucide-react" path:package.json',
32-
# 3. Newest Worker standard (jsonc) + AI
33-
'"@cloudflare/ai" path:wrangler.jsonc',
34-
# 4. Pure Workers + Drizzle (signals high quality)
35-
'"drizzle-orm" "wrangler" path:package.json'
36-
]
55+
def get_file_content(repo_full_name, path):
    """Fetch a file from a GitHub repository and return it as UTF-8 text.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.
        path: Path of the file inside the repository, e.g. ``package.json``.

    Returns:
        The decoded file content, or ``None`` when the request fails, the
        response status is not 200, or the payload cannot be decoded.
    """
    # The contents endpoint lives under /repos/{owner}/{repo}/contents/{path};
    # without the "/repos/" segment the host and repo name run together.
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
        if res.status_code != 200:
            return None
        # The JSON payload carries the file body base64-encoded in "content".
        return base64.b64decode(res.json()['content']).decode('utf-8')
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: network failures, non-JSON bodies, a missing
        # "content" field (e.g. a directory listing) and undecodable bytes all
        # mean "no usable file". Unlike a bare ``except``, this does not
        # swallow KeyboardInterrupt or SystemExit.
        return None
3766

38-
def handle_rate_limit(response):
    """Sleep when a response signals rate limiting and report whether it did.

    Args:
        response: Object exposing ``status_code`` and a ``headers`` mapping.

    Returns:
        ``True`` after sleeping for a 403 or 429 response (the caller is
        expected to retry), ``False`` for any other status.
    """
    if response.status_code not in (403, 429):
        return False

    try:
        # Retry-After may be absent, an HTTP-date, or otherwise non-numeric;
        # clamp negatives because time.sleep() rejects them.
        retry_after = max(0, int(response.headers.get('Retry-After', 60)))
    except (TypeError, ValueError):
        retry_after = 60

    print(f"⚠️ Rate limit. Sleeping {retry_after}s...")
    time.sleep(retry_after)
    return True
67+
def analyze_stack(repo_name):
    """Detect a repository's tech stack from its package.json dependencies.

    Args:
        repo_name: Repository identifier in ``owner/name`` form.

    Returns:
        A sorted list of unique labels taken from ``TECH_SIGNATURES``.
        ``["Unknown/Non-JS"]`` when package.json could not be fetched or is
        empty, and ``["Parse Error"]`` when its content is not a usable JSON
        object.
    """
    package_json = get_file_content(repo_name, "package.json")
    if not package_json:
        return ["Unknown/Non-JS"]

    try:
        data = json.loads(package_json)
        # Combine deps and devDeps into one name -> version mapping.
        all_deps = {**data.get('dependencies', {}), **data.get('devDependencies', {})}
    except (ValueError, AttributeError, TypeError):
        # ValueError: invalid JSON; AttributeError: top level is not an
        # object; TypeError: a dependencies field is not a mapping.
        return ["Parse Error"]

    tags = set()
    for signatures in TECH_SIGNATURES.values():
        for pkg, label in signatures.items():
            # NOTE(review): this is a substring match, so "react" also hits
            # "lucide-react" and "next" hits "next-themes" — confirm whether
            # the looseness is intended before tightening it.
            if any(pkg in dep for dep in all_deps):
                tags.add(label)

    # Sorted so the output is stable between runs (set order is not).
    return sorted(tags)
5788

58-
def get_repo_meta(full_name):
59-
url = f'https://api.github.com/repos/{full_name}'
89+
def search_broad_workers():
    """Search GitHub code for wrangler config files and tag each repository.

    Runs one code search, skips forks and repositories already seen in the
    result page, enriches each remaining repository with ``analyze_stack``
    and returns the collected entries.

    Returns:
        A list of dicts with the keys ``name``, ``url``, ``description``,
        ``stars``, ``detected_stack``, ``config_file`` and ``discovered_at``.
        An empty list when the request fails or returns a non-200 status.
    """
    # The query targets 'compatibility_date' inside wrangler.toml/jsonc files.
    query = "compatibility_date path:/(wrangler\\.jsonc|wrangler\\.toml)/ sort:indexed"
    # Let requests build and encode the query string instead of gluing the
    # raw query onto the host name.
    url = "https://api.github.com/search/code"
    params = {'q': query, 'per_page': 30}

    print(f"🌊 Casting wide net with query: {query}")
    try:
        res = requests.get(url, headers=HEADERS, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"❌ Error: {exc}")
        return []

    if res.status_code != 200:
        print(f"❌ Error: {res.status_code} {res.text}")
        return []

    items = res.json().get('items', [])
    results = []
    seen_repos = set()

    print(f"🔍 Analyzing {len(items)} raw hits...")

    for item in items:
        repo = item['repository']
        name = repo['full_name']

        # One entry per repository; forks are treated as noise.
        if name in seen_repos or repo.get('fork'):
            continue
        seen_repos.add(name)

        print(f" 👉 Inspecting: {name}...")

        # 1. Enrich with Tech Stack
        stack_tags = analyze_stack(name)

        # 2. Build Payload
        results.append({
            "name": name,
            "url": repo['html_url'],
            # `or` also covers a description that is present but null, which
            # a .get() default would not replace.
            "description": repo.get('description') or 'No description',
            "stars": -1,  # Requires separate API call if needed, skipping for speed
            "detected_stack": stack_tags,
            "config_file": item['name'],
            "discovered_at": datetime.now(timezone.utc).isoformat()
        })
        time.sleep(1)  # Respect rate limits

    return results
94135

95-
output = {
96-
'generated_at': datetime.now(timezone.utc).isoformat(),
97-
'count': len(final_list),
98-
'repos': final_list
99-
}
136+
def main():
    """Run the discovery search, save the results and optionally post them.

    Prints an error and returns early when ``GITHUB_TOKEN`` is unset.
    Otherwise writes the search results to ``RESULTS_FILE`` as JSON and, when
    ``API_ENDPOINT`` is set and there is at least one result, posts them
    there as ``{"payload": [...]}``. A failed post is reported, not raised.
    """
    if not GITHUB_TOKEN:
        print("⚠️ Error: GITHUB_TOKEN not set.")
        return

    # 1. Run Search
    new_discoveries = search_broad_workers()

    # 2. Save to Disk
    # Explicit encoding keeps the output independent of the platform default.
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(new_discoveries, f, indent=2)
    print(f"\n✅ Saved {len(new_discoveries)} repos to {RESULTS_FILE}")

    # 3. Post to your Worker API (Optional)
    if API_ENDPOINT and new_discoveries:
        print(f"📤 Posting to {API_ENDPOINT}...")
        try:
            # Without a timeout an unresponsive endpoint would hang the run.
            res = requests.post(
                API_ENDPOINT,
                json={"payload": new_discoveries},
                timeout=30,
            )
            print(f" Status: {res.status_code}")
        except Exception as e:
            print(f" Failed to post: {e}")
105157

106-
if __name__ == '__main__':
158+
if __name__ == "__main__":
107159
main()

0 commit comments

Comments
 (0)