1+ #!/usr/bin/env python3
2+ import os
3+ import json
4+ import time
5+ import requests
6+ import base64
17import urllib .parse
8+ from datetime import datetime , timezone
9+
# --- CONFIGURATION ---
# GitHub API token, read from the environment (None when unset).
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
# Your Worker endpoint for daily sync and deduplication (None when unset).
WORKER_API_URL = os.environ.get('WORKER_API_URL')

# Output path for the GitHub Actions artifact.
# Prefer the Actions workspace (repo root) so the workflow YAML can find the
# file; fall back to the current working directory outside of Actions.
ROOT_DIR = os.environ.get('GITHUB_WORKSPACE', os.getcwd())
OUTPUT_FILE = os.path.join(ROOT_DIR, 'results.json')

# Headers sent with every GitHub API request.
# The bearer value must not carry trailing whitespace after the token.
HEADERS = {
    'Authorization': f'Bearer {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github+json',
    'User-Agent': 'Cloudflare-Worker-Discovery-Bot/1.0',
}

# Tags to search for in package.json.
# Maps category -> {dependency-name fragment: human-readable label}.
TECH_SIGNATURES = {
    "framework": {"hono": "Hono", "astro": "Astro", "remix": "Remix", "next": "Next.js"},
    "database": {"drizzle-orm": "Drizzle", "prisma": "Prisma", "pg": "Postgres"},
    "ai": {"@cloudflare/ai": "Workers AI", "vectorize": "Vectorize", "openai": "OpenAI"},
    "ui": {"shadcn": "Shadcn", "tailwind": "Tailwind", "radix-ui": "Radix"},
}
33+
def analyze_stack(repo_name):
    """Return technology labels inferred from a repository's package.json.

    Fetches ``package.json`` from the repository root through the GitHub
    contents API, merges its ``dependencies`` and ``devDependencies``, and
    collects the label of every ``TECH_SIGNATURES`` entry whose key occurs
    as a substring of at least one dependency name.

    The lookup is best-effort: network errors, a missing manifest, or a
    malformed payload are reported on stdout and yield the fallback label
    instead of raising.

    Args:
        repo_name: Repository identifier in ``owner/name`` form.

    Returns:
        A list of unique labels in signature-table order, or
        ``["Standard Worker"]`` when nothing matched or the manifest could
        not be read.
    """
    fallback = ["Standard Worker"]
    # The contents endpoint lives under /repos/{owner}/{repo}/contents/{path}.
    url = f"https://api.github.com/repos/{repo_name}/contents/package.json"

    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
        if res.status_code != 200:
            # No package.json (or no access): nothing to analyze.
            return fallback
        # The API returns the file body base64-encoded in the 'content' field.
        content = base64.b64decode(res.json()['content']).decode('utf-8')
        data = json.loads(content)
        # `or {}` guards against manifests that set a section to null.
        all_deps = {
            **(data.get('dependencies') or {}),
            **(data.get('devDependencies') or {}),
        }
    except (requests.RequestException, KeyError, TypeError,
            AttributeError, ValueError) as e:
        # ValueError also covers base64, UTF-8 and JSON decoding failures.
        print(f"   ⚠️ Could not analyze {repo_name}: {e}")
        return fallback

    tags = []
    for sigs in TECH_SIGNATURES.values():
        for pkg, label in sigs.items():
            # Substring match so e.g. "tailwind" also catches "tailwindcss".
            if any(pkg in dep for dep in all_deps):
                tags.append(label)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(tags)) or fallback
51+
def get_already_registered_repos():
    """Return the names of repositories already recorded by earlier runs.

    Deduplication against the Worker API is not enabled yet, so the result
    is currently always a new, empty set. The disabled sketch below shows
    the intended lookup against the Worker's list endpoint.
    """
    # FUTURE DEDUPLICATION (disabled):
    # if WORKER_API_URL:
    #     try:
    #         print("Checking Worker API for previously registered repos...")
    #         res = requests.get(f"{WORKER_API_URL}/list-repos", timeout=10)
    #         if res.status_code == 200:
    #             return set(res.json().get('repo_names', []))
    #     except Exception as e:
    #         print(f"⚠️ Could not fetch existing list: {e}")
    known_repos = set()
    return known_repos
266
367def search_broad_workers ():
468 """Searches for ANY valid worker config file using correct URL formatting."""
569
6- # 1. Define the raw query
70+ # 1. Define and Encode Query
771 raw_query = 'compatibility_date path:/(wrangler\\ .jsonc|wrangler\\ .toml)/'
8-
9- # 2. Encode the query (handles spaces, slashes, and regex chars correctly)
1072 encoded_query = urllib .parse .quote (raw_query )
1173
12- # 3. Construct the URL - THE '?' IS CRITICAL
13- # Note: sort and order are parameters, not part of the query string itself
14- url = f"https://api.github.com{ encoded_query } &sort=indexed&order=desc&per_page=30"
74+ # 2. Correct URL Construction
75+ url = f"https://api.github.com{ encoded_query } &sort=indexed&order=desc&per_page=100"
1576
16- print (f"🌊 Casting wide net with encoded query: { raw_query } " )
77+ print (f"🌊 Casting wide net: { raw_query } " )
78+
79+ # Get previously found repos for deduplication
80+ already_found = get_already_registered_repos ()
1781
1882 try :
1983 res = requests .get (url , headers = HEADERS , timeout = 30 )
20-
2184 if res .status_code != 200 :
2285 print (f"❌ Error: { res .status_code } - { res .text } " )
2386 return []
2487
2588 items = res .json ().get ('items' , [])
2689 results = []
27- seen_repos = set ()
90+ seen_now = set ()
2891
2992 print (f"🔍 Analyzing { len (items )} raw hits..." )
3093
3194 for item in items :
3295 repo = item ['repository' ]
3396 name = repo ['full_name' ]
3497
35- # Skip duplicates and forks to keep the list high quality
36- if name in seen_repos or repo .get ('fork' ):
98+ # Skip if: duplicate in this run, already in your DB, or is a fork
99+ if name in seen_now or name in already_found or repo .get ('fork' ):
37100 continue
38- seen_repos .add (name )
39101
40- print (f" 👉 Inspecting: { name } ..." )
102+ seen_now .add (name )
103+ print (f" 👉 New Find: { name } " )
41104
42105 stack_tags = analyze_stack (name )
43106
@@ -49,10 +112,30 @@ def search_broad_workers():
49112 "config_file" : item ['name' ],
50113 "discovered_at" : datetime .now (timezone .utc ).isoformat ()
51114 })
52- time .sleep (1 ) # Sleep to avoid GitHub secondary rate limits
115+ time .sleep (1 ) # Safety delay for secondary rate limits
53116
54117 return results
55118
56119 except Exception as e :
57120 print (f"❌ Fatal Request Error: { e } " )
58121 return []
122+
def main():
    """Run the discovery search and write the results to ``OUTPUT_FILE``.

    The list returned by ``search_broad_workers()`` is serialized as
    indented JSON, and a one-line summary is printed afterwards.

    Raises:
        SystemExit: With status 1 when ``GITHUB_TOKEN`` is not set, so a
            misconfigured CI run fails visibly instead of reporting
            success without producing the artifact.
    """
    if not GITHUB_TOKEN:
        print("❌ Error: GITHUB_TOKEN not set")
        raise SystemExit(1)

    discoveries = search_broad_workers()

    # Save to artifact file
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(discoveries, f, indent=2)

    print(f"\n✅ Found {len(discoveries)} new repos. Saved to {OUTPUT_FILE}")

    # OPTIONAL: Post to your Worker API immediately
    # if WORKER_API_URL and discoveries:
    #     requests.post(f"{WORKER_API_URL}/ingest", json={"repos": discoveries})
139+
# Run the discovery only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments