Skip to content

Commit db0bb4b

Browse files
authored
Refactor search_broad_workers for better query handling
Refactor search_broad_workers function to improve query handling and error management. Added URL encoding for search queries and enhanced duplicate handling.
1 parent 9c98e3f commit db0bb4b

1 file changed

Lines changed: 53 additions & 154 deletions

File tree

Lines changed: 53 additions & 154 deletions
Original file line number | Diff line number | Diff line change
@@ -1,159 +1,58 @@
1-
#!/usr/bin/env python3
2-
import os
3-
import json
4-
import time
5-
import requests
6-
import base64
7-
from datetime import datetime, timezone
8-
9-
# --- CONFIGURATION ---
# GitHub API token, read from the environment. Required: main() refuses to
# run when it is missing.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
# Optional: URL of your Worker API that receives the discoveries via POST.
API_ENDPOINT = os.environ.get('WORKER_API_URL')
# File the discoveries are written to as JSON.
RESULTS_FILE = "daily_workers.json"

# Headers sent with every GitHub API request.
# NOTE: when GITHUB_TOKEN is unset the Authorization value is the literal
# string "Bearer None"; callers must check GITHUB_TOKEN before using HEADERS.
HEADERS = {
    'Authorization': f'Bearer {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github+json',
    'User-Agent': 'Worker-Discovery-Bot/1.0',
}
20-
21-
# --- TAGGING LOGIC ---
# We look for these keywords in package.json to auto-label the architecture.
# Layout: {category: {dependency-name fragment: human-readable label}}.
TECH_SIGNATURES = {
    "framework": {
        "hono": "Hono",
        "astro": "Astro",
        "remix": "Remix",
        "next": "Next.js",
        "itty-router": "Itty Router",
        "fastapi": "FastAPI (Python)",
    },
    "database": {
        "drizzle-orm": "Drizzle ORM",
        "prisma": "Prisma",
        "kysely": "Kysely",
        "@supabase/supabase-js": "Supabase",
        "mongoose": "MongoDB",
    },
    "infrastructure": {
        "@cloudflare/ai": "Workers AI",
        "@cloudflare/vectorize": "Vectorize",
        "@cloudflare/kv-asset-handler": "KV Assets",
        "toucan-js": "Toucan Telemetry",
        "zod": "Zod Validation",
    },
    "frontend": {
        "clsx": "Tailwind/Shadcn Utils",
        "lucide-react": "Lucide Icons",
        "radix-ui": "Radix UI",
        "react": "React",
        "vue": "Vue",
    },
}
54-
55-
def get_file_content(repo_full_name, path):
    """Fetch a file from a GitHub repository and return its decoded text.

    Args:
        repo_full_name: Repository in "owner/repo" form.
        path: Path of the file inside the repository.

    Returns:
        The file content decoded as UTF-8, or None when the request fails,
        the response status is not 200, or the payload cannot be decoded.
    """
    # The contents endpoint lives under /repos/{owner}/{repo}/contents/{path};
    # without the "/repos/" segment the host and repo name run together.
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
        if res.status_code != 200:
            return None
        # The JSON body carries the file as base64 text under 'content'.
        return base64.b64decode(res.json()['content']).decode('utf-8')
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: network errors, non-JSON bodies, a missing or
        # non-string 'content' field (e.g. a directory listing), and bad
        # base64/UTF-8 all count as "no content".
        return None
66-
67-
def analyze_stack(repo_name):
    """Detect a repository's tech stack from its package.json.

    Args:
        repo_name: Repository in "owner/repo" form.

    Returns:
        A sorted list of unique labels from TECH_SIGNATURES whose key occurs
        in any dependency name. Returns ["Unknown/Non-JS"] when package.json
        cannot be fetched; the list contains "Parse Error" when the file is
        not a JSON object of the expected shape.
    """
    package_json = get_file_content(repo_name, "package.json")
    if not package_json:
        return ["Unknown/Non-JS"]

    tags = set()
    try:
        data = json.loads(package_json)
        # Combine deps and devDeps; `or {}` tolerates an explicit null section.
        all_deps = {
            **(data.get('dependencies') or {}),
            **(data.get('devDependencies') or {}),
        }

        for signatures in TECH_SIGNATURES.values():
            for pkg, label in signatures.items():
                # Substring match on purpose: "radix-ui" must hit scoped
                # names such as "@radix-ui/...". The flip side is that
                # "react" also matches "lucide-react".
                if any(pkg in dep_name for dep_name in all_deps):
                    tags.add(label)
    except (ValueError, AttributeError, TypeError):
        # Invalid JSON, a non-object top level, or non-mapping dep sections.
        tags.add("Parse Error")

    # Sorted so the output is deterministic from run to run.
    return sorted(tags)
1+
import urllib.parse
882

893
def search_broad_workers():
    """Search GitHub for Worker config files and describe each repository.

    Queries the code-search endpoint for wrangler config files mentioning
    'compatibility_date', newest-indexed first, then tags every distinct,
    non-fork repository with its detected tech stack.

    Returns:
        A list of dicts with keys "name", "url", "description",
        "detected_stack", "config_file" and "discovered_at". The list is
        empty when the request fails or returns a non-200 status.
    """
    # 1. Define the raw query.
    # NOTE(review): confirm that the REST code-search endpoint accepts the
    # regex `path:/.../` qualifier used here.
    raw_query = 'compatibility_date path:/(wrangler\\.jsonc|wrangler\\.toml)/'

    # 2. Percent-encode the whole query, including slashes and regex chars.
    encoded_query = urllib.parse.quote(raw_query, safe='')

    # 3. Construct the URL - THE '?' IS CRITICAL.
    # Note: sort and order are parameters, not part of the query string itself.
    url = (
        "https://api.github.com/search/code"
        f"?q={encoded_query}&sort=indexed&order=desc&per_page=30"
    )

    print(f"🌊 Casting wide net with encoded query: {raw_query}")

    try:
        res = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"❌ Fatal Request Error: {e}")
        return []

    if res.status_code != 200:
        print(f"❌ Error: {res.status_code} - {res.text}")
        return []

    try:
        items = res.json().get('items', [])
    except (ValueError, AttributeError) as e:
        # Body was not JSON, or not a JSON object.
        print(f"❌ Fatal Request Error: {e}")
        return []

    results = []
    seen_repos = set()

    print(f"🔍 Analyzing {len(items)} raw hits...")

    for item in items:
        try:
            repo = item['repository']
            name = repo['full_name']
            html_url = repo['html_url']
            config_file = item['name']
        except (KeyError, TypeError):
            # Skip a malformed hit instead of discarding everything
            # collected so far.
            continue

        # Skip duplicates and forks to keep the list high quality.
        if name in seen_repos or repo.get('fork'):
            continue
        seen_repos.add(name)

        print(f" 👉 Inspecting: {name}...")

        stack_tags = analyze_stack(name)

        results.append({
            "name": name,
            "url": html_url,
            # `or` also covers an explicit null description, which the
            # .get() default alone would pass through as None.
            "description": repo.get('description') or 'No description',
            "detected_stack": stack_tags,
            "config_file": config_file,
            "discovered_at": datetime.now(timezone.utc).isoformat(),
        })
        time.sleep(1)  # Sleep to avoid GitHub secondary rate limits

    return results


def main():
    """Run the search, save the results to disk and optionally POST them."""
    if not GITHUB_TOKEN:
        print("⚠️ Error: GITHUB_TOKEN not set.")
        return

    # 1. Run Search
    new_discoveries = search_broad_workers()

    # 2. Save to Disk
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(new_discoveries, f, indent=2)
    print(f"\n✅ Saved {len(new_discoveries)} repos to {RESULTS_FILE}")

    # 3. Post to your Worker API (Optional)
    if API_ENDPOINT and new_discoveries:
        print(f"📤 Posting to {API_ENDPOINT}...")
        try:
            # Timeout so an unresponsive endpoint cannot hang the run.
            res = requests.post(
                API_ENDPOINT, json={"payload": new_discoveries}, timeout=30
            )
            print(f" Status: {res.status_code}")
        except Exception as e:
            # Posting is best effort; the results are already on disk.
            print(f" Failed to post: {e}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)