Skip to content

Commit d037565

Browse files
authored
Merge branch 'main' into improve/CM-1137
2 parents 88d4d35 + 19cae0e commit d037565

11 files changed

Lines changed: 350 additions & 479 deletions

File tree

services/apps/automatic_projects_discovery_worker/src/activities/activities.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ export async function processDataset(
9090
projectSlug: parsed.projectSlug,
9191
repoName: parsed.repoName,
9292
repoUrl: parsed.repoUrl,
93-
ossfCriticalityScore: parsed.ossfCriticalityScore,
93+
source: sourceName,
94+
action: parsed.action ?? 'auto',
9495
lfCriticalityScore: parsed.lfCriticalityScore,
9596
})
9697

services/apps/automatic_projects_discovery_worker/src/main.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { Options, ServiceWorker } from '@crowd/archetype-worker'
44
import { scheduleProjectsDiscovery } from './schedules/scheduleProjectsDiscovery'
55

66
const config: Config = {
7-
envvars: [],
7+
envvars: ['CROWD_GITHUB_PERSONAL_ACCESS_TOKENS'],
88
producer: {
99
enabled: false,
1010
},
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
import https from 'https'
2+
import { Readable } from 'stream'
3+
4+
import { getServiceLogger } from '@crowd/logging'
5+
6+
import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types'
7+
8+
const log = getServiceLogger()
9+
10+
/** Slug of the discussion category whose discussions are scanned for repository links. */
const CATEGORY_SLUG = 'project-onboardings'

/** Endpoint used for every GitHub GraphQL request issued by this source. */
const GITHUB_GRAPHQL_URL = 'https://api.github.com/graphql'

/**
 * First path segments on github.com that are site features rather than user/organisation
 * names. A link such as `https://github.com/orgs/foo` or `https://github.com/topics/foo`
 * matches the `{owner}/{repo}` shape but does not point to a repository, so URLs whose owner
 * segment is listed here are ignored. Entries must be lower-case.
 */
const GITHUB_NON_REPO_OWNERS = new Set([
  'user-attachments',
  'orgs',
  'apps',
  'marketplace',
  'sponsors',
  'topics',
  'collections',
  'features',
  'settings',
  'users',
  'enterprise',
  'organizations',
  'trending',
  'login',
  'search',
])

/** Owner of the repository whose discussions are read. */
const OWNER = 'linuxfoundation'

/** Name of the repository whose discussions are read. */
const REPO = 'insights'
15+
16+
/** Envelope of a GraphQL response: a `data` payload and/or a list of errors. */
interface GraphQLResponse<T> {
  data?: T
  errors?: { message: string }[]
}

/** The discussion fields requested by the discussions query. */
interface DiscussionNode {
  number: number
  body: string
  closed: boolean
}

/** Cursor information used to request the following page of discussions. */
interface DiscussionsPageInfo {
  hasNextPage: boolean
  endCursor: string | null
}

/** One page of discussions together with its pagination state. */
interface DiscussionsPage {
  pageInfo: DiscussionsPageInfo
  nodes: DiscussionNode[]
}

/** Shape of the `data` payload of the discussions query. */
interface DiscussionsData {
  repository: {
    discussions: DiscussionsPage
  }
}
37+
38+
/**
 * Sends a GraphQL query to the GitHub API and resolves with the `data` payload.
 *
 * The bearer token is the first non-blank entry of the comma-separated
 * `CROWD_GITHUB_PERSONAL_ACCESS_TOKENS` environment variable.
 *
 * @param query - GraphQL document to execute.
 * @param variables - Values for the variables declared by `query`.
 * @returns The `data` member of the GraphQL response.
 * @throws Error when the environment variable is missing or holds no usable token, when the
 *   request times out or fails at the network level, when GitHub answers with a non-2xx
 *   status, when the body is not valid JSON, when the response carries GraphQL errors, or
 *   when it carries no data.
 */
async function graphqlRequest<T>(query: string, variables: Record<string, unknown>): Promise<T> {
  const raw = process.env.CROWD_GITHUB_PERSONAL_ACCESS_TOKENS
  if (!raw) {
    throw new Error('CROWD_GITHUB_PERSONAL_ACCESS_TOKENS environment variable is not set')
  }

  // Skip blank entries (e.g. a leading comma) instead of sending an empty bearer token.
  const token = raw
    .split(',')
    .map((entry) => entry.trim())
    .find((entry) => entry.length > 0)
  if (!token) {
    throw new Error('CROWD_GITHUB_PERSONAL_ACCESS_TOKENS does not contain a usable token')
  }

  // Without a timeout a stalled connection would keep the returned promise pending forever.
  const timeoutMs = 30000
  const body = JSON.stringify({ query, variables })
  const url = new URL(GITHUB_GRAPHQL_URL)

  return new Promise<T>((resolve, reject) => {
    const req = https.request(
      {
        hostname: url.hostname,
        path: url.pathname,
        method: 'POST',
        timeout: timeoutMs,
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body),
          Authorization: `Bearer ${token}`,
          'User-Agent': 'crowd-dev-discovery-worker',
        },
      },
      (res) => {
        const chunks: Uint8Array[] = []
        res.on('data', (chunk: Uint8Array) => chunks.push(chunk))
        res.on('end', () => {
          const text = Buffer.concat(chunks).toString('utf8')

          // Auth and rate-limit failures arrive as non-2xx responses without a `data` member;
          // report the status instead of the misleading "empty data" error below.
          const status = res.statusCode ?? 0
          if (status < 200 || status >= 300) {
            reject(
              new Error(`GitHub GraphQL request failed with HTTP ${status}: ${text.slice(0, 500)}`),
            )
            return
          }

          let response: GraphQLResponse<T>
          try {
            response = JSON.parse(text) as GraphQLResponse<T>
          } catch (err) {
            reject(new Error(`Failed to parse GitHub GraphQL response: ${err}`))
            return
          }

          if (response.errors?.length) {
            reject(
              new Error(
                `GitHub GraphQL errors: ${response.errors.map((e) => e.message).join(', ')}`,
              ),
            )
            return
          }
          if (!response.data) {
            reject(new Error('GitHub GraphQL returned empty data'))
            return
          }
          resolve(response.data)
        })
        res.on('error', reject)
      },
    )

    // The 'timeout' event only signals socket inactivity; the request has to be destroyed
    // explicitly, which then surfaces through the 'error' handler.
    req.on('timeout', () => {
      req.destroy(new Error(`GitHub GraphQL request timed out after ${timeoutMs}ms`))
    })
    req.on('error', reject)
    req.write(body)
    req.end()
  })
}
95+
96+
/**
 * Extracts github.com/{owner}/{repo} URLs from markdown text, normalised to the repo root.
 *
 * Normalisation lower-cases owner and repo, drops anything after the repo segment, removes
 * trailing sentence punctuation and a trailing `.git`, and always emits the
 * `https://github.com/...` form (links written with `http://` or `www.` are folded into it).
 * URLs whose owner segment is listed in `GITHUB_NON_REPO_OWNERS` are skipped.
 *
 * @param text - Markdown text to scan.
 * @returns The distinct normalised repository URLs, in order of first appearance.
 */
function extractRepoUrls(text: string): string[] {
  const urls = new Set<string>()
  const regex = /https?:\/\/(?:www\.)?github\.com\/([a-z0-9_.-]+)\/([a-z0-9_.-]+)/gi

  for (const match of text.matchAll(regex)) {
    const owner = match[1].toLowerCase()
    // Lower-case first so that an upper-case ".GIT" suffix is stripped as well.
    const repo = match[2]
      .toLowerCase()
      .replace(/[.,;:!?]+$/, '')
      .replace(/\.git$/, '')

    if (owner && repo && !GITHUB_NON_REPO_OWNERS.has(owner)) {
      urls.add(`https://github.com/${owner}/${repo}`)
    }
  }

  return Array.from(urls)
}
113+
114+
/**
 * Resolves the GraphQL node ID of the discussion category identified by `CATEGORY_SLUG` in
 * the `OWNER`/`REPO` repository. Only the first 25 categories of the repository are inspected.
 *
 * @returns The ID of the matching discussion category.
 * @throws Error when the repository is not returned by the API, when no category has the
 *   expected slug, or when the underlying GraphQL request fails.
 */
async function getDiscussionCategoryId(): Promise<string> {
  interface CategoriesData {
    repository: {
      discussionCategories: {
        nodes: Array<{ id: string; name: string; slug: string }>
      }
    } | null
  }

  // Owner and name travel as GraphQL variables rather than being spliced into the document.
  const query = `
    query GetDiscussionCategories($owner: String!, $name: String!) {
      repository(owner: $owner, name: $name) {
        discussionCategories(first: 25) {
          nodes {
            id
            name
            slug
          }
        }
      }
    }
  `

  const data = await graphqlRequest<CategoriesData>(query, { owner: OWNER, name: REPO })
  if (!data.repository) {
    throw new Error(`Repository ${OWNER}/${REPO} was not returned by the GitHub GraphQL API`)
  }

  const categories = data.repository.discussionCategories.nodes
  const category = categories.find((c) => c.slug === CATEGORY_SLUG)

  if (!category) {
    throw new Error(
      `Discussion category "${CATEGORY_SLUG}" not found in ${OWNER}/${REPO}. ` +
        `Available: ${categories.map((c) => `${c.name} (${c.slug})`).join(', ')}`,
    )
  }

  return category.id
}
150+
151+
/**
 * Fetches one page (up to 100 entries) of discussions belonging to the given category of the
 * `OWNER`/`REPO` repository.
 *
 * @param categoryId - GraphQL node ID of the discussion category to list.
 * @param cursor - `endCursor` of the previous page, or `null` to start from the first page.
 * @returns The discussions of the page together with its pagination info.
 * @throws Error when the repository is not returned by the API or the GraphQL request fails.
 */
async function fetchDiscussionsPage(
  categoryId: string,
  cursor: string | null,
): Promise<DiscussionsPage> {
  // Owner and name travel as GraphQL variables rather than being spliced into the document.
  const query = `
    query GetDiscussions($owner: String!, $name: String!, $categoryId: ID!, $cursor: String) {
      repository(owner: $owner, name: $name) {
        discussions(first: 100, categoryId: $categoryId, after: $cursor) {
          pageInfo {
            hasNextPage
            endCursor
          }
          nodes {
            number
            body
            closed
          }
        }
      }
    }
  `

  const data = await graphqlRequest<DiscussionsData>(query, {
    owner: OWNER,
    name: REPO,
    categoryId,
    cursor,
  })
  if (!data.repository) {
    throw new Error(`Repository ${OWNER}/${REPO} was not returned by the GitHub GraphQL API`)
  }

  return data.repository.discussions
}
176+
177+
/**
 * Walks every page of discussions in the configured category and collects the distinct
 * repository URLs found in the discussion bodies.
 *
 * @returns The distinct normalised repository URLs across all pages.
 * @throws Error when the category cannot be resolved, when a page request fails, or when the
 *   API announces a further page without advancing the cursor.
 */
async function fetchAllDiscussionRepoUrls(): Promise<string[]> {
  const categoryId = await getDiscussionCategoryId()
  log.info({ categoryId, owner: OWNER, repo: REPO }, 'Insights Discussions: category ID resolved.')

  const allUrls = new Set<string>()
  let cursor: string | null = null
  let hasNextPage = true
  let pageCount = 0

  while (hasNextPage) {
    pageCount++
    const page = await fetchDiscussionsPage(categoryId, cursor)

    for (const discussion of page.nodes) {
      for (const url of extractRepoUrls(discussion.body)) {
        allUrls.add(url)
      }
    }

    hasNextPage = page.pageInfo.hasNextPage
    const nextCursor = page.pageInfo.endCursor

    log.info(
      {
        pageCount,
        discussionsInPage: page.nodes.length,
        totalUniqueUrls: allUrls.size,
        hasNextPage,
      },
      'Insights Discussions: page processed.',
    )

    // Requesting the same cursor again would return the same page forever; fail loudly
    // instead of looping or silently returning a partial result.
    if (hasNextPage && (nextCursor === null || nextCursor === cursor)) {
      throw new Error(
        `Insights Discussions: pagination did not advance after page ${pageCount} ` +
          `(endCursor: ${nextCursor})`,
      )
    }

    cursor = nextCursor
  }

  return Array.from(allUrls)
}
212+
213+
/**
 * Discovery source that yields the GitHub repositories linked from the discussions of the
 * configured category in the `OWNER`/`REPO` repository.
 */
export class InsightsDiscussionsSource implements IDiscoverySource {
  public readonly name = 'insights-discussions'
  public readonly format = 'json' as const

  /**
   * Describes the single dataset this source offers: the current state of the discussion
   * category, identified by today's date (UTC, `YYYY-MM-DD`).
   */
  async listAvailableDatasets(): Promise<IDatasetDescriptor[]> {
    const today = new Date().toISOString().slice(0, 10)
    return [
      {
        id: today,
        date: today,
        url: `https://github.com/${OWNER}/${REPO}/discussions/categories/${CATEGORY_SLUG}`,
      },
    ]
  }

  /**
   * Collects the repository URLs from all discussions and exposes them as an object-mode
   * stream of `{ repoUrl }` records.
   *
   * @param dataset - Descriptor of the dataset being fetched; only its `id` is used, for logging.
   * @returns A readable stream emitting one `{ repoUrl }` object per distinct repository.
   */
  async fetchDatasetStream(dataset: IDatasetDescriptor): Promise<Readable> {
    log.info({ datasetId: dataset.id }, 'Insights Discussions: fetching discussion repo URLs.')

    const repoUrls = await fetchAllDiscussionRepoUrls()

    log.info(
      { datasetId: dataset.id, count: repoUrls.length },
      'Insights Discussions: unique repo URLs extracted.',
    )

    return Readable.from(
      repoUrls.map((url) => ({ repoUrl: url })),
      { objectMode: true },
    )
  }

  /**
   * Converts one `{ repoUrl }` record into a discovery row.
   *
   * @param rawRow - Record emitted by the dataset stream.
   * @returns A row whose `projectSlug` is `{owner}/{repo}` and whose `repoName` is `{repo}`,
   *   or `null` when `repoUrl` is missing, not a string, not a valid URL, or has no
   *   owner and repo path segments.
   */
  parseRow(rawRow: Record<string, unknown>): IDiscoverySourceRow | null {
    const repoUrl = rawRow['repoUrl']
    // Validate at runtime instead of asserting the type: the record is untyped.
    if (typeof repoUrl !== 'string' || !repoUrl) return null

    let segments: string[]
    try {
      segments = new URL(repoUrl).pathname.split('/').filter((segment) => segment.length > 0)
    } catch {
      return null
    }

    // A repository URL needs both an owner and a repo segment; any deeper segments
    // (e.g. /tree/main) are not part of the repository identity.
    if (segments.length < 2) return null
    const [owner, repoName] = segments

    return {
      projectSlug: `${owner}/${repoName}`,
      repoName,
      repoUrl,
      action: 'evaluate',
    }
  }
}

services/apps/automatic_projects_discovery_worker/src/sources/registry.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1+
import { InsightsDiscussionsSource } from './insights-discussions/source'
12
import { LfCriticalityScoreSource } from './lf-criticality-score/source'
23
import { IDiscoverySource } from './types'
34

4-
const sources: IDiscoverySource[] = [new LfCriticalityScoreSource()]
5+
const sources: IDiscoverySource[] = [
6+
new LfCriticalityScoreSource(),
7+
new InsightsDiscussionsSource(),
8+
]
59

610
export function getSource(name: string): IDiscoverySource {
711
const source = sources.find((s) => s.name === name)

services/apps/automatic_projects_discovery_worker/src/sources/types.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import { Readable } from 'stream'
22

3+
import { ProjectCatalogAction } from '@crowd/data-access-layer/src/project-catalog/types'
4+
35
export interface IDatasetDescriptor {
46
id: string
57
date: string
@@ -22,6 +24,6 @@ export interface IDiscoverySourceRow {
2224
projectSlug: string
2325
repoName: string
2426
repoUrl: string
25-
ossfCriticalityScore?: number
27+
action?: ProjectCatalogAction
2628
lfCriticalityScore?: number
2729
}

0 commit comments

Comments
 (0)