Enhance search_repos.py with new features

jmbish04 · web-flow · commit 9c98e3f702c2 · 2026-02-13T14:15:51.000-08:00
Refactor GitHub repo search script to improve functionality and add tagging logic.
diff --git a/scripts/github/workflows/search_repos.py b/scripts/github/workflows/search_repos.py
@@ -1,107 +1,159 @@
 #!/usr/bin/env python3
 import os
-import sys
 import json
 import time
 import requests
+import base64
 from datetime import datetime, timezone
 
 # --- CONFIGURATION ---
-TOKEN = os.environ.get('GITHUB_TOKEN')
-if not TOKEN:
-    print("Error: GITHUB_TOKEN environment variable not set")
-    sys.exit(1)
+GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') # Required: main() prints an error and returns when this is unset
+API_ENDPOINT = os.environ.get('WORKER_API_URL') # Optional: Your Worker URL; main() POSTs the results here when set
+RESULTS_FILE = "daily_workers.json" # Relative path, so main() writes it into the current working directory
 
+# Headers for GitHub API requests (built at import time, so an unset token yields "Bearer None")
 HEADERS = {
+    'Authorization': f'Bearer {GITHUB_TOKEN}',
     'Accept': 'application/vnd.github+json',
-    'Authorization': f'Bearer {TOKEN}',
-    'X-GitHub-Api-Version': '2022-11-28',
-    'User-Agent': 'Cloudflare-Worker-Hunter'
+    'User-Agent': 'Worker-Discovery-Bot/1.0'
 }
 
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-OUTPUT_FILE = os.path.join(SCRIPT_DIR, 'results.json')
+# --- TAGGING LOGIC ---
+# Maps a package-name fragment to the human-readable tag reported for it.
+# analyze_stack() tests each key as a SUBSTRING of every dependency and
+# devDependency name in package.json, so "astro" also matches
+# "@astrojs/cloudflare" and "react" also matches "lucide-react".
+# The top-level category names only group the entries; they are not emitted.
+TECH_SIGNATURES = {
+    "framework": {
+        "hono": "Hono",
+        "astro": "Astro",
+        "remix": "Remix",
+        "next": "Next.js",
+        "itty-router": "Itty Router",
+        "fastapi": "FastAPI (Python)"  # NOTE(review): a Python package; presumably never listed in package.json - confirm
+    },
+    "database": {
+        "drizzle-orm": "Drizzle ORM",
+        "prisma": "Prisma",
+        "kysely": "Kysely",
+        "@supabase/supabase-js": "Supabase",
+        "mongoose": "MongoDB"
+    },
+    "infrastructure": {
+        "@cloudflare/ai": "Workers AI",
+        "@cloudflare/vectorize": "Vectorize",
+        "@cloudflare/kv-asset-handler": "KV Assets",
+        "toucan-js": "Toucan Telemetry",
+        "zod": "Zod Validation"
+    },
+    "frontend": {
+        "clsx": "Tailwind/Shadcn Utils",
+        "lucide-react": "Lucide Icons",
+        "radix-ui": "Radix UI",
+        "react": "React",
+        "vue": "Vue"
+    }
+}
 
-# --- TARGET QUERIES ---
-# We use multiple specific queries to bypass GitHub's search limits and find "the good stuff"
-QUERIES = [
-    # 1. Astro + Cloudflare + Shadcn (The modern stack)
-    '"@astrojs/cloudflare" "tailwind-merge" path:package.json',
-    # 2. Next.js on Pages + Shadcn
-    '"@cloudflare/next-on-pages" "lucide-react" path:package.json',
-    # 3. Newest Worker standard (jsonc) + AI
-    '"@cloudflare/ai" path:wrangler.jsonc',
-    # 4. Pure Workers + Drizzle (signals high quality)
-    '"drizzle-orm" "wrangler" path:package.json'
-]
+def get_file_content(repo_full_name, path):
+    """Fetch a file from a GitHub repository and return its text.
+
+    Uses the REST "contents" endpoint, which returns the file body
+    base64-encoded inside a JSON document.
+
+    Args:
+        repo_full_name: Repository in "owner/name" form.
+        path: File path relative to the repository root.
+
+    Returns:
+        The file decoded as UTF-8, or None when the request fails, the
+        response is not HTTP 200, or the payload cannot be decoded.
+    """
+    # The contents endpoint lives under /repos/{owner}/{repo}/contents/{path};
+    # without the "/repos/" segment the repo name is glued onto the host name.
+    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
+    try:
+        res = requests.get(url, headers=HEADERS, timeout=5)
+        if res.status_code != 200:
+            return None
+        return base64.b64decode(res.json()['content']).decode('utf-8')
+    except (requests.RequestException, ValueError, KeyError, TypeError):
+        # Best effort: network errors, invalid JSON, a missing "content"
+        # key, a non-dict payload or undecodable bytes all mean "no usable
+        # file". KeyboardInterrupt and SystemExit still propagate.
+        return None
 
-def handle_rate_limit(response):
-    if response.status_code == 403:
-        retry_after = int(response.headers.get('Retry-After', 60))
-        print(f"⚠️ Rate limit. Sleeping {retry_after}s...")
-        time.sleep(retry_after)
-        return True
-    return False
+def analyze_stack(repo_name):
+    """Detect a repository's tech stack from its package.json.
+
+    Every key in TECH_SIGNATURES is tested as a substring of each name
+    listed under "dependencies" and "devDependencies".
+
+    Args:
+        repo_name: Repository in "owner/name" form.
+
+    Returns:
+        A sorted list of unique tag labels. ["Unknown/Non-JS"] when
+        package.json cannot be fetched or is empty; the list contains
+        "Parse Error" when the file is not a usable JSON object.
+    """
+    tags = set()
+    package_json = get_file_content(repo_name, "package.json")
+
+    if not package_json:
+        return ["Unknown/Non-JS"]
 
-def search_github(query):
-    url = 'https://api.github.com/search/code'
-    params = {'q': query, 'per_page': 50}
     try:
-        res = requests.get(url, headers=HEADERS, params=params, timeout=30)
-        if handle_rate_limit(res): return search_github(query)
-        res.raise_for_status()
-        return res.json().get('items', [])
-    except Exception as e:
-        print(f"❌ Search Error: {e}")
-        return []
+        data = json.loads(package_json)
+        # Combine deps and devDeps; `or {}` tolerates an explicit null value.
+        all_deps = {
+            **(data.get('dependencies') or {}),
+            **(data.get('devDependencies') or {}),
+        }
+
+        # Category names only group the signatures, so iterate the values.
+        for signatures in TECH_SIGNATURES.values():
+            for pkg, label in signatures.items():
+                if any(pkg in dep for dep in all_deps):
+                    tags.add(label)
+    except (ValueError, TypeError, AttributeError):
+        # Invalid JSON (ValueError), a non-object document (AttributeError)
+        # or dependency sections that are not mappings (TypeError).
+        tags.add("Parse Error")
+
+    # Sorted so repeated runs produce a stable tag order.
+    return sorted(tags)
 
-def get_repo_meta(full_name):
-    url = f'https://api.github.com/repos/{full_name}'
+def search_broad_workers():
+    """Search GitHub code for Worker config files and tag each repo found.
+
+    Queries the code-search endpoint for wrangler.toml / wrangler.jsonc
+    files mentioning `compatibility_date`, skips forks and repositories
+    already seen in this run, and enriches every remaining repository
+    with the stack tags from analyze_stack().
+
+    Returns:
+        A list of dicts with the keys name, url, description, stars,
+        detected_stack, config_file and discovered_at. Empty when the
+        search response is not HTTP 200.
+
+    Raises:
+        requests.RequestException: If the search request itself fails
+            (connection error or timeout).
+    """
+    # We search for 'compatibility_date' which is mandatory in valid wrangler.toml/jsonc
+    # NOTE(review): confirm the REST code-search endpoint honours the regex
+    # `path:` and `sort:` qualifiers; the query text is passed through as-is.
+    query = "compatibility_date path:/(wrangler\\.jsonc|wrangler\\.toml)/ sort:indexed"
+    url = "https://api.github.com/search/code"
+
+    print(f"🌊 Casting wide net with query: {query}")
-    res = requests.get(url, headers=HEADERS)
-    return res.json() if res.status_code == 200 else None
-
-def main():
-    print("🚀 Starting Edge Tech Discovery...\n")
-    unique_repos = {}
+    # `params` lets requests URL-encode the query; the timeout keeps a
+    # stalled connection from hanging the whole run.
+    res = requests.get(
+        url,
+        headers=HEADERS,
+        params={'q': query, 'per_page': 30},
+        timeout=30,
+    )
+
+    if res.status_code != 200:
+        print(f"❌ Error: {res.status_code} {res.text}")
+        return []
+
+    items = res.json().get('items', [])
+    results = []
+    seen_repos = set()
 
-    for q in QUERIES:
-        print(f"🔍 Searching: {q}")
-        items = search_github(q)
-        for item in items:
-            name = item['repository']['full_name']
-            if name not in unique_repos:
-                unique_repos[name] = item['repository']
-        time.sleep(2) # Prevent secondary rate limits
+    print(f"🔍 Analyzing {len(items)} raw hits...")
 
-    print(f"\n✨ Found {len(unique_repos)} candidates. Refining metadata...")
-    
-    final_list = []
-    for i, (name, base_data) in enumerate(unique_repos.items(), 1):
-        meta = get_repo_meta(name)
-        if meta and not meta.get('fork'): # Filter out noise/forks
-            final_list.append({
-                'name': name,
-                'url': meta['html_url'],
-                'stars': meta['stargazers_count'],
-                'desc': meta['description'],
-                'updated': meta['pushed_at']
-            })
-            print(f" ✅ [{i}] Collected: {name}")
-        time.sleep(0.2)
+    for item in items:
+        repo = item['repository']
+        name = repo['full_name']
+
+        # One entry per repository, and forks are treated as noise.
+        if name in seen_repos or repo.get('fork'):
+            continue
+        seen_repos.add(name)
+
+        print(f"   👉 Inspecting: {name}...")
+
+        # 1. Enrich with Tech Stack
+        stack_tags = analyze_stack(name)
+
+        # 2. Build Payload
+        entry = {
+            "name": name,
+            "url": repo['html_url'],
+            # `or` also covers a description that is present but null,
+            # which a .get() default would let through as None.
+            "description": repo.get('description') or 'No description',
+            "stars": -1, # Requires separate API call if needed, skipping for speed
+            "detected_stack": stack_tags,
+            "config_file": item['name'],
+            "discovered_at": datetime.now(timezone.utc).isoformat()
+        }
+        results.append(entry)
+        time.sleep(1) # Respect rate limits
 
-    # Sort by most recently active
-    final_list.sort(key=lambda x: x['updated'], reverse=True)
+    return results
 
-    output = {
-        'generated_at': datetime.now(timezone.utc).isoformat(),
-        'count': len(final_list),
-        'repos': final_list
-    }
+def main():
+    """Run the discovery search, save the results and optionally post them.
+
+    Prints an error and returns without doing anything when GITHUB_TOKEN
+    is unset. Otherwise writes the discovered repositories to RESULTS_FILE
+    as JSON and, when API_ENDPOINT is set and at least one repository was
+    found, POSTs them as {"payload": [...]}. A failed POST is reported on
+    stdout and does not raise.
+    """
+    if not GITHUB_TOKEN:
+        print("⚠️  Error: GITHUB_TOKEN not set.")
+        return
 
-    with open(OUTPUT_FILE, 'w') as f:
-        json.dump(output, f, indent=2)
+    # 1. Run Search
+    new_discoveries = search_broad_workers()
     
-    print(f"\n🎉 Done! Saved {len(final_list)} repos to {OUTPUT_FILE}")
+    # 2. Save to Disk
+    with open(RESULTS_FILE, 'w') as f:
+        json.dump(new_discoveries, f, indent=2)
+    print(f"\n✅ Saved {len(new_discoveries)} repos to {RESULTS_FILE}")
+
+    # 3. Post to your Worker API (Optional)
+    if API_ENDPOINT and new_discoveries:
+        print(f"📤 Posting to {API_ENDPOINT}...")
+        try:
+            # The timeout keeps an unresponsive endpoint from hanging the
+            # run; a timeout surfaces through the except branch below.
+            res = requests.post(API_ENDPOINT, json={"payload": new_discoveries}, timeout=30)
+            print(f"   Status: {res.status_code}")
+        except Exception as e:
+            print(f"   Failed to post: {e}")
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()