|
| 1 | +import https from 'https' |
| 2 | +import { Readable } from 'stream' |
| 3 | + |
| 4 | +import { getServiceLogger } from '@crowd/logging' |
| 5 | + |
| 6 | +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' |
| 7 | + |
| 8 | +const log = getServiceLogger() |
| 9 | + |
// Slug of the discussion category whose posts are scanned for repository links.
const CATEGORY_SLUG = 'project-onboardings'
// GitHub GraphQL v4 endpoint.
const GITHUB_GRAPHQL_URL = 'https://api.github.com/graphql'
// First path segments on github.com that are site features rather than account names.
// A link such as https://github.com/sponsors/foo has the owner/repo shape but is not a
// repository, so these "owners" are skipped when extracting repo URLs.
const GITHUB_NON_REPO_OWNERS = new Set([
  'user-attachments',
  'orgs',
  'apps',
  'marketplace',
  'about',
  'advisories',
  'collections',
  'enterprise',
  'enterprises',
  'features',
  'login',
  'settings',
  'sponsors',
  'topics',
  'users',
])
// Repository that hosts the discussions being scanned.
const OWNER = 'linuxfoundation'
const REPO = 'insights'
| 15 | + |
/** A single entry of the `errors` array in a GraphQL response body. */
interface GraphQLErrorEntry {
  message: string
}

/**
 * Shape of a decoded GraphQL response body.
 * Both fields are optional: a response may carry a payload, errors, or neither.
 */
interface GraphQLResponse<T> {
  /** Query payload, typed by the caller. */
  data?: T
  /** Errors reported by the server, if any. */
  errors?: GraphQLErrorEntry[]
}
| 20 | + |
/** The fields requested for each discussion returned by the discussions query. */
interface DiscussionNode {
  /** Markdown body of the discussion; this is the text scanned for repository links. */
  body: string
  /** Discussion number as returned by the API. */
  number: number
  /** Requested by the query; not read by the code shown in this file. */
  closed: boolean
}
| 26 | + |
/** Pagination state returned alongside one page of discussions. */
interface DiscussionsPageInfo {
  hasNextPage: boolean
  /** Cursor to pass as `after` for the following page; may be null. */
  endCursor: string | null
}

/** One page of the discussions connection: its nodes plus pagination state. */
interface DiscussionsPage {
  pageInfo: DiscussionsPageInfo
  nodes: DiscussionNode[]
}
| 31 | + |
/** The `repository` object of the discussions query result. */
interface DiscussionsRepository {
  discussions: DiscussionsPage
}

/** Top-level `data` payload of the discussions query. */
interface DiscussionsData {
  repository: DiscussionsRepository
}
| 37 | + |
// Socket idle timeout for a single GraphQL call, so a stalled connection cannot hang forever.
const GRAPHQL_REQUEST_TIMEOUT_MS = 30000

/**
 * Sends one GraphQL query to GitHub and resolves with the `data` payload.
 *
 * The token is the first non-empty entry of the comma-separated
 * CROWD_GITHUB_PERSONAL_ACCESS_TOKENS environment variable.
 *
 * @param query GraphQL document to execute.
 * @param variables Values for the variables declared in `query`.
 * @returns The `data` member of the response, cast (not validated) to `T`.
 * @throws Error when the environment variable is unset or holds no usable token,
 *   when the request fails or times out, when the HTTP status is not 2xx, when the
 *   body is not valid JSON, when the response lists GraphQL errors, or when `data`
 *   is missing.
 */
async function graphqlRequest<T>(query: string, variables: Record<string, unknown>): Promise<T> {
  const raw = process.env.CROWD_GITHUB_PERSONAL_ACCESS_TOKENS
  if (!raw) {
    throw new Error('CROWD_GITHUB_PERSONAL_ACCESS_TOKENS environment variable is not set')
  }

  // Skip blank entries (e.g. a leading comma) instead of sending an empty bearer token.
  const token = raw
    .split(',')
    .map((entry) => entry.trim())
    .find((entry) => entry.length > 0)
  if (!token) {
    throw new Error('CROWD_GITHUB_PERSONAL_ACCESS_TOKENS does not contain a usable token')
  }

  const body = JSON.stringify({ query, variables })

  return new Promise((resolve, reject) => {
    const url = new URL(GITHUB_GRAPHQL_URL)
    const req = https.request(
      {
        hostname: url.hostname,
        path: url.pathname,
        method: 'POST',
        timeout: GRAPHQL_REQUEST_TIMEOUT_MS,
        headers: {
          'Content-Type': 'application/json',
          // Byte length, not string length: the body may contain multi-byte characters.
          'Content-Length': Buffer.byteLength(body),
          Authorization: `Bearer ${token}`,
          'User-Agent': 'crowd-dev-discovery-worker',
        },
      },
      (res) => {
        const chunks: Uint8Array[] = []
        res.on('data', (chunk: Uint8Array) => chunks.push(chunk))
        res.on('end', () => {
          const text = Buffer.concat(chunks).toString('utf8')

          // Auth failures, rate limiting and gateway errors arrive as non-2xx responses
          // whose body is not a GraphQL envelope; report the status instead of a
          // misleading "empty data" or parse error.
          const status = res.statusCode ?? 0
          if (status < 200 || status >= 300) {
            reject(
              new Error(`GitHub GraphQL request failed with HTTP ${status}: ${text.slice(0, 500)}`),
            )
            return
          }

          let response: GraphQLResponse<T>
          try {
            response = JSON.parse(text) as GraphQLResponse<T>
          } catch (err) {
            reject(new Error(`Failed to parse GitHub GraphQL response: ${err}`))
            return
          }

          if (response.errors?.length) {
            reject(
              new Error(
                `GitHub GraphQL errors: ${response.errors.map((e) => e.message).join(', ')}`,
              ),
            )
            return
          }
          if (!response.data) {
            reject(new Error('GitHub GraphQL returned empty data'))
            return
          }
          resolve(response.data)
        })
        res.on('error', reject)
      },
    )

    // 'timeout' only signals inactivity; the request must be torn down explicitly.
    // Destroying with an error routes it through the 'error' handler below.
    req.on('timeout', () => {
      req.destroy(
        new Error(`GitHub GraphQL request timed out after ${GRAPHQL_REQUEST_TIMEOUT_MS}ms`),
      )
    })
    req.on('error', reject)
    req.write(body)
    req.end()
  })
}
| 95 | + |
/**
 * Extracts github.com/{owner}/{repo} URLs from markdown text, normalised to the repo root.
 *
 * Deeper links (issues, blobs, ...) collapse to their repository. Owner and repo are
 * lower-cased, trailing punctuation and a trailing `.git` are removed, and the result is
 * always `https://github.com/{owner}/{repo}`. Links whose first path segment is listed
 * in GITHUB_NON_REPO_OWNERS are skipped.
 *
 * @param text Markdown (or any text) to scan.
 * @returns Unique normalised repository URLs in order of first appearance.
 */
function extractRepoUrls(text: string): string[] {
  const urls = new Set<string>()
  // Optional "www." so https://www.github.com/... links are recognised as well.
  const regex = /https?:\/\/(?:www\.)?github\.com\/([a-zA-Z0-9_.-]+)\/([a-zA-Z0-9_.-]+)/gi
  let match: RegExpExecArray | null
  while ((match = regex.exec(text)) !== null) {
    const owner = match[1].toLowerCase()
    // Lower-case first so the ".git" suffix is removed regardless of its casing.
    const repo = match[2]
      .toLowerCase()
      .replace(/[.,;:!?]+$/, '') // sentence punctuation glued to the link, e.g. "…/repo."
      .replace(/\.git$/, '')
    // `repo` can become empty after stripping (e.g. "owner/.git"); skip those.
    if (owner && repo && !GITHUB_NON_REPO_OWNERS.has(owner)) {
      urls.add(`https://github.com/${owner}/${repo}`)
    }
  }
  return Array.from(urls)
}
| 113 | + |
/**
 * Looks up the node ID of the discussion category whose slug equals CATEGORY_SLUG
 * in the OWNER/REPO repository.
 *
 * @returns The category's GraphQL node ID.
 * @throws Error listing the available categories when no slug matches; errors from
 *   the underlying GraphQL request propagate unchanged.
 */
async function getDiscussionCategoryId(): Promise<string> {
  interface CategoryNode {
    id: string
    name: string
    slug: string
  }

  interface CategoriesData {
    repository: {
      discussionCategories: {
        nodes: CategoryNode[]
      }
    }
  }

  const query = `
    query {
      repository(owner: "${OWNER}", name: "${REPO}") {
        discussionCategories(first: 25) {
          nodes {
            id
            name
            slug
          }
        }
      }
    }
  `

  const { repository } = await graphqlRequest<CategoriesData>(query, {})
  const available = repository.discussionCategories.nodes

  for (const candidate of available) {
    if (candidate.slug === CATEGORY_SLUG) {
      return candidate.id
    }
  }

  // No match: include what the repository does offer to make misconfiguration obvious.
  const listing = available.map((c) => `${c.name} (${c.slug})`).join(', ')
  throw new Error(
    `Discussion category "${CATEGORY_SLUG}" not found in ${OWNER}/${REPO}. Available: ${listing}`,
  )
}
| 150 | + |
/**
 * Fetches one page (up to 100 entries) of discussions in the given category of
 * the OWNER/REPO repository.
 *
 * @param categoryId GraphQL node ID of the discussion category.
 * @param cursor Cursor after which to continue, or null for the first page.
 * @returns The page's discussion nodes together with its pagination info.
 */
async function fetchDiscussionsPage(
  categoryId: string,
  cursor: string | null,
): Promise<DiscussionsPage> {
  const variables = { categoryId, cursor }

  const query = `
    query GetDiscussions($categoryId: ID!, $cursor: String) {
      repository(owner: "${OWNER}", name: "${REPO}") {
        discussions(first: 100, categoryId: $categoryId, after: $cursor) {
          pageInfo {
            hasNextPage
            endCursor
          }
          nodes {
            number
            body
            closed
          }
        }
      }
    }
  `

  const { repository } = await graphqlRequest<DiscussionsData>(query, variables)
  return repository.discussions
}
| 176 | + |
/**
 * Walks every page of the configured discussion category and collects the
 * repository URLs found in the discussion bodies.
 *
 * @returns Unique repository URLs in order of first appearance.
 * @throws Error when the API reports a further page without a cursor that advances
 *   (otherwise the same page would be requested forever); errors from the category
 *   lookup and page fetches propagate unchanged.
 */
async function fetchAllDiscussionRepoUrls(): Promise<string[]> {
  const categoryId = await getDiscussionCategoryId()
  log.info({ categoryId, owner: OWNER, repo: REPO }, 'Insights Discussions: category ID resolved.')

  const allUrls = new Set<string>()
  let cursor: string | null = null
  let hasNextPage = true
  let pageCount = 0

  while (hasNextPage) {
    pageCount++
    const page = await fetchDiscussionsPage(categoryId, cursor)

    for (const discussion of page.nodes) {
      for (const url of extractRepoUrls(discussion.body)) {
        allUrls.add(url)
      }
    }

    const nextCursor = page.pageInfo.endCursor
    hasNextPage = page.pageInfo.hasNextPage

    log.info(
      {
        pageCount,
        discussionsInPage: page.nodes.length,
        totalUniqueUrls: allUrls.size,
        hasNextPage,
      },
      'Insights Discussions: page processed.',
    )

    // A "next page" without a new cursor would make the next request identical to
    // this one; fail loudly rather than loop forever.
    if (hasNextPage && (nextCursor === null || nextCursor === cursor)) {
      throw new Error(
        `Insights Discussions: pagination cursor did not advance after page ${pageCount}`,
      )
    }

    cursor = nextCursor
  }

  return Array.from(allUrls)
}
| 212 | + |
/**
 * Discovery source that treats the repository links posted in the configured
 * GitHub Discussions category as candidate repositories.
 */
export class InsightsDiscussionsSource implements IDiscoverySource {
  public readonly name = 'insights-discussions'
  public readonly format = 'json' as const

  /**
   * Describes the single dataset this source offers: the category as of today.
   * Both `id` and `date` are the current UTC date (YYYY-MM-DD).
   */
  async listAvailableDatasets(): Promise<IDatasetDescriptor[]> {
    const today = new Date().toISOString().slice(0, 10)
    return [
      {
        id: today,
        date: today,
        url: `https://github.com/${OWNER}/${REPO}/discussions/categories/${CATEGORY_SLUG}`,
      },
    ]
  }

  /**
   * Fetches all discussions and returns an object-mode stream with one
   * `{ repoUrl }` row per unique repository URL.
   *
   * @param dataset Descriptor of the requested dataset; only its `id` is used, for logging.
   */
  async fetchDatasetStream(dataset: IDatasetDescriptor): Promise<Readable> {
    log.info({ datasetId: dataset.id }, 'Insights Discussions: fetching discussion repo URLs.')

    const repoUrls = await fetchAllDiscussionRepoUrls()

    log.info(
      { datasetId: dataset.id, count: repoUrls.length },
      'Insights Discussions: unique repo URLs extracted.',
    )

    return Readable.from(
      repoUrls.map((url) => ({ repoUrl: url })),
      { objectMode: true },
    )
  }

  /**
   * Converts one streamed row into a discovery row.
   *
   * @param rawRow Row as emitted by `fetchDatasetStream`.
   * @returns A row whose `projectSlug` is the URL path without surrounding slashes and
   *   whose `repoName` is the last path segment, or null when `repoUrl` is missing,
   *   not a string, not a parseable URL, or has an empty path.
   */
  parseRow(rawRow: Record<string, unknown>): IDiscoverySourceRow | null {
    // Narrow at runtime instead of asserting: the row is untyped input, and a non-string
    // value that happens to stringify to a URL must not leak into `repoUrl`.
    const repoUrl = rawRow['repoUrl']
    if (typeof repoUrl !== 'string' || !repoUrl) return null

    let projectSlug = ''
    let repoName = ''
    try {
      const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '')
      projectSlug = urlPath
      repoName = urlPath.split('/').pop() || ''
    } catch {
      // Not a valid absolute URL.
      return null
    }

    if (!projectSlug || !repoName) return null

    return {
      projectSlug,
      repoName,
      repoUrl,
      action: 'evaluate',
    }
  }
}
0 commit comments