Skip to content

Commit 5695bdb

Browse files
authored
Refactor search_repos.py for improved functionality
1 parent b2cebd8 commit 5695bdb

1 file changed

Lines changed: 97 additions & 14 deletions

File tree

Lines changed: 97 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,43 +1,106 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import json
4+
import time
5+
import requests
6+
import base64
17
import urllib.parse
8+
from datetime import datetime, timezone
9+
10+
# --- CONFIGURATION ---
# GitHub API token read from the environment; None when the variable is unset.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
# Your Worker Endpoint for daily sync and deduplication
WORKER_API_URL = os.getenv('WORKER_API_URL')

# Output path for the GitHub Actions artifact: written under GITHUB_WORKSPACE
# when that variable is set, otherwise under the current working directory.
ROOT_DIR = os.getenv('GITHUB_WORKSPACE', os.getcwd())
OUTPUT_FILE = os.path.join(ROOT_DIR, 'results.json')

# Headers sent with every GitHub API request made by this script.
HEADERS = {
    'Authorization': 'Bearer {}'.format(GITHUB_TOKEN),
    'Accept': 'application/vnd.github+json',
    'User-Agent': 'Cloudflare-Worker-Discovery-Bot/1.0',
}

# Tags to search for in package.json: category -> {package key: display label}.
TECH_SIGNATURES = {
    "framework": {
        "hono": "Hono",
        "astro": "Astro",
        "remix": "Remix",
        "next": "Next.js",
    },
    "database": {
        "drizzle-orm": "Drizzle",
        "prisma": "Prisma",
        "pg": "Postgres",
    },
    "ai": {
        "@cloudflare/ai": "Workers AI",
        "vectorize": "Vectorize",
        "openai": "OpenAI",
    },
    "ui": {
        "shadcn": "Shadcn",
        "tailwind": "Tailwind",
        "radix-ui": "Radix",
    },
}
33+
34+
def analyze_stack(repo_name):
    """Return technology labels detected in a repository's package.json.

    Fetches ``package.json`` from the repository root through the GitHub
    contents API, merges its ``dependencies`` and ``devDependencies``, and
    collects the label of every ``TECH_SIGNATURES`` entry whose package key
    is a substring of at least one dependency name.

    Args:
        repo_name: Repository in ``owner/name`` form (the search result's
            ``full_name``).

    Returns:
        A list of unique labels in ``TECH_SIGNATURES`` order, or
        ``["Standard Worker"]`` when nothing matched or the manifest could
        not be fetched or parsed. Failures are reported on stdout and never
        raised, so one bad repository cannot abort the whole scan.
    """
    # The contents endpoint lives under /repos/{owner}/{repo}/contents/{path};
    # without the "/repos/" segment the request never reaches the API.
    url = f"https://api.github.com/repos/{repo_name}/contents/package.json"
    tags = []
    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
        if res.status_code == 200:
            # The API returns the file body base64-encoded in "content".
            content = base64.b64decode(res.json()['content']).decode('utf-8')
            data = json.loads(content)
            # `"dependencies": null` is valid JSON; treat it like a missing key.
            all_deps = {
                **(data.get('dependencies') or {}),
                **(data.get('devDependencies') or {}),
            }
            for sigs in TECH_SIGNATURES.values():
                for pkg, label in sigs.items():
                    # Substring match so scoped/suffixed packages count too
                    # (e.g. "tailwind" matches "tailwindcss").
                    if any(pkg in dep for dep in all_deps):
                        tags.append(label)
    except (requests.RequestException, ValueError, KeyError,
            TypeError, AttributeError) as e:
        # Best-effort enrichment: network errors, malformed JSON/base64, a
        # missing "content" field or a non-object manifest all fall back to
        # the default tag instead of crashing the run.
        print(f"   ⚠️ Could not analyze {repo_name}: {e}")
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(tags)) if tags else ["Standard Worker"]
51+
52+
def get_already_registered_repos():
    """Return the set of repo names that were already registered.

    FUTURE DEDUPLICATION: the lookup against the Worker API is not enabled
    yet (it is kept below as commented-out code), so this currently always
    returns an empty set.
    """
    # Planned lookup, disabled for now:
    # if WORKER_API_URL:
    #     try:
    #         print("Checking Worker API for previously registered repos...")
    #         res = requests.get(f"{WORKER_API_URL}/list-repos", timeout=10)
    #         if res.status_code == 200:
    #             return set(res.json().get('repo_names', []))
    #     except Exception as e:
    #         print(f"⚠️ Could not fetch existing list: {e}")
    known_repos = set()
    return known_repos
266

367
def search_broad_workers():
468
"""Searches for ANY valid worker config file using correct URL formatting."""
569

6-
# 1. Define the raw query
70+
# 1. Define and Encode Query
771
raw_query = 'compatibility_date path:/(wrangler\\.jsonc|wrangler\\.toml)/'
8-
9-
# 2. Encode the query (handles spaces, slashes, and regex chars correctly)
1072
encoded_query = urllib.parse.quote(raw_query)
1173

12-
# 3. Construct the URL - THE '?' IS CRITICAL
13-
# Note: sort and order are parameters, not part of the query string itself
14-
url = f"https://api.github.com{encoded_query}&sort=indexed&order=desc&per_page=30"
74+
# 2. Correct URL Construction
75+
url = f"https://api.github.com{encoded_query}&sort=indexed&order=desc&per_page=100"
1576

16-
print(f"🌊 Casting wide net with encoded query: {raw_query}")
77+
print(f"🌊 Casting wide net: {raw_query}")
78+
79+
# Get previously found repos for deduplication
80+
already_found = get_already_registered_repos()
1781

1882
try:
1983
res = requests.get(url, headers=HEADERS, timeout=30)
20-
2184
if res.status_code != 200:
2285
print(f"❌ Error: {res.status_code} - {res.text}")
2386
return []
2487

2588
items = res.json().get('items', [])
2689
results = []
27-
seen_repos = set()
90+
seen_now = set()
2891

2992
print(f"🔍 Analyzing {len(items)} raw hits...")
3093

3194
for item in items:
3295
repo = item['repository']
3396
name = repo['full_name']
3497

35-
# Skip duplicates and forks to keep the list high quality
36-
if name in seen_repos or repo.get('fork'):
98+
# Skip if: duplicate in this run, already in your DB, or is a fork
99+
if name in seen_now or name in already_found or repo.get('fork'):
37100
continue
38-
seen_repos.add(name)
39101

40-
print(f" 👉 Inspecting: {name}...")
102+
seen_now.add(name)
103+
print(f" 👉 New Find: {name}")
41104

42105
stack_tags = analyze_stack(name)
43106

@@ -49,10 +112,30 @@ def search_broad_workers():
49112
"config_file": item['name'],
50113
"discovered_at": datetime.now(timezone.utc).isoformat()
51114
})
52-
time.sleep(1) # Sleep to avoid GitHub secondary rate limits
115+
time.sleep(1) # Safety delay for secondary rate limits
53116

54117
return results
55118

56119
except Exception as e:
57120
print(f"❌ Fatal Request Error: {e}")
58121
return []
122+
123+
def main():
    """Run the discovery search and save the results to OUTPUT_FILE as JSON.

    When GITHUB_TOKEN is not set, prints an error and returns without
    searching or writing anything.
    """
    if GITHUB_TOKEN:
        found = search_broad_workers()

        # Save to artifact file
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as artifact:
            json.dump(found, artifact, indent=2)

        print(f"\n✅ Found {len(found)} new repos. Saved to {OUTPUT_FILE}")

        # OPTIONAL: Post to your Worker API immediately
        # if WORKER_API_URL and discoveries:
        #     requests.post(f"{WORKER_API_URL}/ingest", json={"repos": discoveries})
    else:
        print("❌ Error: GITHUB_TOKEN not set")
139+
140+
if __name__ == '__main__':
141+
main()

0 commit comments

Comments
 (0)