core-github-api/.github/workflows/repo-sync.yml at abca85fbabd93f2298a62fc28f4c2e8dd72223e2 · jmbish04/core-github-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
name: Sync Repos to D1 Cache
on:
  schedule:
    # GitHub evaluates cron schedules in UTC, so this fires daily at 02:00 UTC.
    - cron: "0 2 * * *"
  workflow_dispatch: # Manual trigger

# Serialize runs so a manual dispatch that overlaps the nightly schedule does
# not analyze and upsert the same batch of repositories twice.
concurrency:
  group: repo-sync
  cancel-in-progress: false

jobs:
  sync-to-d1:
    name: Analyze & Upsert Repos
    runs-on: ubuntu-latest
    # The default job timeout is 360 minutes; cap it so a stalled network call
    # in the sync script cannot hold a runner for hours.
    timeout-minutes: 30
    permissions:
      contents: read

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install Dependencies
        run: pip install openai pydantic requests pygithub

      - name: Run Sync Agent
        env:
          # GitHub API auth (the workflow's automatic token)
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

          # Cloudflare AI Gateway (AI Agent)
          CF_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          CF_GATEWAY_ID: ${{ secrets.CLOUDFLARE_GATEWAY_ID }}
          CF_API_TOKEN: ${{ secrets.CLOUDFLARE_AI_GATEWAY_TOKEN }}

          # Worker API used by the script to list and upsert D1 records
          WORKER_API_URL: ${{ secrets.WORKER_API_URL_BASE }}
          WORKER_API_KEY: ${{ secrets.WORKER_API_KEY }}

          # NOTE(review): workflow_dispatch declares no inputs and the script
          # below never reads QUERY, so this is always empty. Confirm whether
          # it can be removed or whether a "query" input should be declared.
          QUERY: ${{ github.event.inputs.query }}

          # Target User
          TARGET_USER: ${{ github.repository_owner }}

        run: |
          # The heredoc delimiter is quoted so the shell writes the Python
          # source verbatim instead of expanding $, backticks or backslashes.
          cat <<'EOF' > sync_repos.py
          import os
          import json
          import asyncio
          import requests
          from typing import List, Set
          from pydantic import BaseModel, Field
          from openai import AsyncOpenAI
          from github import Github

          # --- CONFIGURATION ---
          def _require_env(name: str) -> str:
              """Return the stripped value of a required environment variable.

              GitHub Actions passes an unset secret as an empty string, so a plain
              os.environ[name] lookup would succeed with "" and the problem would
              only surface later as a confusing request error. This aborts the
              script immediately with an error annotation instead.
              """
              value = os.environ.get(name, "").strip()
              if not value:
                  print(f"::error::Required environment variable {name} is missing or empty")
                  raise SystemExit(1)
              return value

          # Base URL of the Worker API. A trailing slash is dropped so that URLs
          # built as f"{API_BASE}/api/..." never contain a double slash.
          API_BASE = _require_env("WORKER_API_URL").rstrip("/")
          WORKER_KEY = _require_env("WORKER_API_KEY")
          TARGET_USER = _require_env("TARGET_USER")

          # Cloudflare AI Gateway
          _cf_account_id = _require_env("CF_ACCOUNT_ID")
          _cf_gateway_id = _require_env("CF_GATEWAY_ID")
          client = AsyncOpenAI(
              api_key=_require_env("CF_API_TOKEN"),
              base_url=f"https://gateway.ai.cloudflare.com/v1/{_cf_account_id}/{_cf_gateway_id}/compat"
          )
          MODEL_ID = "@cf/openai/gpt-oss-120b"

          # --- DATA MODELS ---
          class RepoAnalysis(BaseModel):
              # Structured result that analyze_repo validates the model's JSON
              # reply against; both fields are required.

              # Short technical description of the repository.
              summary: str = Field(
                  ...,
                  description="A concise, technical summary of what the repo does (max 2 sentences).",
              )

              # Technology tags for the repository.
              tags: List[str] = Field(
                  ...,
                  description="List of tech stack tags e.g. ['cloudflare-workers', 'astro', 'python', 'd1', 'shadcn']",
              )

          # --- STEPS ---

          def get_existing_repos() -> Set[str]:
              """Step 1: Ask D1 what we already have to avoid AI costs/dupes.

              Calls GET {API_BASE}/api/repos/list with the Worker API key and
              returns the repository keys it reports.

              Returns:
                  The set of string entries from the response. The check is
                  best-effort: on a non-200 status, an unexpected payload, or any
                  exception, a warning annotation is printed and an empty set is
                  returned, so the caller treats every repository as new.
              """
              try:
                  url = f"{API_BASE}/api/repos/list"
                  print(f"Checking existing D1 records: {url}")
                  headers = {"X-API-Key": WORKER_KEY}
                  # Bounded wait: without a timeout a stalled connection would
                  # block the whole job indefinitely.
                  resp = requests.get(url, headers=headers, timeout=30)
                  if resp.status_code != 200:
                      print(f"::warning::Failed to fetch existing list: {resp.status_code}")
                      return set()

                  # Expecting ["jmbish04/repo-a", "jmbish04/repo-b"]
                  data = resp.json()
                  if not isinstance(data, list):
                      # set() over a JSON object would silently yield its keys,
                      # which would defeat deduplication without any signal.
                      print(f"::warning::Unexpected repo list payload: expected a JSON array, got {type(data).__name__}")
                      return set()

                  known = {item for item in data if isinstance(item, str)}
                  skipped = sum(1 for item in data if not isinstance(item, str))
                  if skipped:
                      print(f"::warning::Ignored {skipped} non-string entries in repo list payload")
                  print(f"Found {len(known)} existing repos in D1.")
                  return known
              except Exception as e:
                  print(f"::warning::D1 check failed: {e}")
                  return set()

          async def analyze_repo(repo) -> dict | None:
              """Step 2: AI Agent analyzes code to generate metadata.

              Builds a prompt from the repository's README (first 6000 characters)
              and its top-level file names, asks the model for a JSON reply, and
              validates that reply against RepoAnalysis.

              Args:
                  repo: Repository object exposing get_readme(), get_contents(),
                      and the metadata attributes read below.

              Returns:
                  A dict ready to send to the upsert endpoint, or None when the
                  model call or the validation of its reply fails.
              """
              print(f"Analyzing {repo.full_name}...")

              # Gather Context (Readme + File List)
              # Catch Exception rather than using a bare except so that
              # KeyboardInterrupt and SystemExit are not swallowed.
              try:
                  # errors="replace" keeps a README containing invalid UTF-8 bytes
                  # usable instead of treating it as missing.
                  readme = repo.get_readme().decoded_content.decode("utf-8", errors="replace")[:6000]
              except Exception:
                  readme = "No README."

              # Get file structure to detect frameworks (e.g. verify if it's Next.js vs Astro)
              try:
                  contents = repo.get_contents("")
                  file_str = ", ".join(c.name for c in contents)
              except Exception:
                  file_str = "Unknown"

              # Hand the model the actual schema; otherwise it has to guess the key
              # names and the reply fails validation whenever it guesses differently.
              schema = json.dumps(RepoAnalysis.model_json_schema())

              prompt = f"""
              Analyze this GitHub repository.
              Repo: {repo.full_name}
              Files: {file_str}
              Readme Snippet:
              {readme}

              Task:
              1. Summarize the project.
              2. Extract technology tags (e.g. 'cloudflare', 'hono', 'drizzle', 'react', 'python').
              3. Return only a JSON object with the keys "summary" and "tags" that validates against this JSON Schema:
              {schema}
              """

              try:
                  response = await client.chat.completions.create(
                      model=MODEL_ID,
                      messages=[{"role": "user", "content": prompt}],
                      response_format={"type": "json_object"},
                      max_tokens=500
                  )
                  analysis = RepoAnalysis.model_validate_json(response.choices[0].message.content)

                  return {
                      "owner": repo.owner.login,
                      "name": repo.name,
                      "full_name": repo.full_name,
                      "description": repo.description or "",
                      "url": repo.html_url,
                      "language": repo.language,
                      "stars": repo.stargazers_count,
                      "ai_summary": analysis.summary,
                      "tags": analysis.tags,
                      "last_updated": repo.updated_at.isoformat()
                  }
              except Exception as e:
                  print(f"AI Analysis failed for {repo.name}: {e}")
                  return None

          async def main():
              """Run one sync pass.

              Fetches the keys already stored in D1, walks the target user's
              repositories from most to least recently updated, selects up to five
              that are not stored yet, analyzes each one, and posts the result to
              the Worker's upsert endpoint.

              Raises:
                  SystemExit: with status 1 after all repositories were attempted,
                      if at least one upsert failed, so the workflow run is marked
                      failed instead of finishing green with only annotations.
              """
              # 1. Get D1 Cache
              existing_keys = get_existing_repos()

              # 2. Get GitHub Repos
              g = Github(os.environ["GITHUB_TOKEN"])
              user = g.get_user(TARGET_USER)

              repos_to_process = []
              print(f"Fetching repos for {TARGET_USER}...")

              for repo in user.get_repos(sort="updated", direction="desc"):
                  # Skip if private (optional, remove check if you want private synced)
                  # if repo.private: continue

                  # THE DEDUPLICATION LOGIC
                  key = f"{repo.owner.login}/{repo.name}"
                  if key in existing_keys:
                      print(f"Skipping {key} (Already in D1)")
                      continue

                  repos_to_process.append(repo)
                  if len(repos_to_process) >= 5: # Batch limit to save run time/tokens per run
                      break

              print(f"Processing {len(repos_to_process)} new/stale repositories...")

              # 3. Analyze & Upsert
              failed = []
              for repo in repos_to_process:
                  data = await analyze_repo(repo)
                  if not data:
                      # analyze_repo already logged why the analysis failed.
                      continue

                  # POST to Worker
                  print(f"Upserting {data['full_name']} to D1...")
                  try:
                      r = requests.post(
                          f"{API_BASE}/api/repos/upsert",
                          json=data,
                          headers={"X-API-Key": WORKER_KEY},
                          # Bounded wait so a stalled Worker cannot hang the job.
                          timeout=30
                      )
                      r.raise_for_status()
                      print("Success.")
                  except Exception as e:
                      # Keep going so one bad upload does not block the rest.
                      print(f"::error::Failed to upload {data['name']}: {e}")
                      failed.append(data["full_name"])

              if failed:
                  print(f"::error::{len(failed)} upsert(s) failed: {', '.join(failed)}")
                  raise SystemExit(1)

          # Entry point: run the async sync pass when this file is executed as a script.
          if __name__ == "__main__":
              asyncio.run(main())
          EOF

          # Execute the script that the heredoc above just wrote to sync_repos.py.
          python sync_repos.py