Skip to content

Commit d8a70d4

Browse files
committed
Merge branch 'main' into feat/single-lookup
2 parents 97ee038 + 9918bfb commit d8a70d4

19 files changed

Lines changed: 245 additions & 77 deletions

File tree

backend/src/api/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import * as http from 'http'
77
import os from 'os'
88
import { QueryTypes } from 'sequelize'
99

10+
import { BadRequestError } from '@crowd/common'
1011
import { getDbConnection } from '@crowd/data-access-layer/src/database'
1112
import { getServiceLogger } from '@crowd/logging'
1213
import { getOpensearchClient } from '@crowd/opensearch'
@@ -149,7 +150,7 @@ setImmediate(async () => {
149150

150151
app.use((err: any, req: any, res: any, next: any) => {
151152
if (err.type === 'entity.parse.failed') {
152-
res.status(400).json({ error: { code: 'BAD_REQUEST', message: 'Invalid JSON body' } })
153+
next(new BadRequestError('Invalid JSON body'))
153154
return
154155
}
155156
next(err)

backend/src/api/public/v1/dev-stats/getAffiliationByHandle.ts renamed to backend/src/api/public/v1/affiliations/getAffiliationByHandle.ts

File renamed without changes.

backend/src/api/public/v1/dev-stats/getAffiliations.ts renamed to backend/src/api/public/v1/affiliations/getAffiliations.ts

File renamed without changes.

backend/src/api/public/v1/dev-stats/index.ts renamed to backend/src/api/public/v1/affiliations/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { getAffiliations } from './getAffiliations'
1010

1111
const rateLimiter = createRateLimiter({ max: 60, windowMs: 60 * 1000 })
1212

13-
export function devStatsRouter(): Router {
13+
export function memberOrganizationAffiliationsRouter(): Router {
1414
const router = Router()
1515

1616
router.use(rateLimiter)

backend/src/api/public/v1/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import { AUTH0_CONFIG } from '../../../conf'
66
import { oauth2Middleware } from '../middlewares/oauth2Middleware'
77
import { staticApiKeyMiddleware } from '../middlewares/staticApiKeyMiddleware'
88

9-
import { devStatsRouter } from './dev-stats'
9+
import { memberOrganizationAffiliationsRouter } from './affiliations'
1010
import { membersRouter } from './members'
1111
import { organizationsRouter } from './organizations'
1212

@@ -15,7 +15,7 @@ export function v1Router(): Router {
1515

1616
router.use('/members', oauth2Middleware(AUTH0_CONFIG), membersRouter())
1717
router.use('/organizations', oauth2Middleware(AUTH0_CONFIG), organizationsRouter())
18-
router.use('/member-organization-affiliations', staticApiKeyMiddleware(), devStatsRouter())
18+
router.use('/affiliations', staticApiKeyMiddleware(), memberOrganizationAffiliationsRouter())
1919

2020
router.use(() => {
2121
throw new NotFoundError()

services/apps/git_integration/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"aioboto3>=15.1.0",
3636
"slugify>=0.0.1",
3737
"orjson>=3.11.3",
38+
"pyyaml>=6.0",
3839
]
3940

4041

services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@
3737
from crowdgit.models.service_execution import ServiceExecution
3838
from crowdgit.services.base.base_service import BaseService
3939
from crowdgit.services.maintainer.bedrock import invoke_bedrock
40-
from crowdgit.services.utils import run_shell_command
40+
from crowdgit.services.maintainer.section_extractor import SectionExtractor
41+
from crowdgit.services.utils import run_shell_command, safe_decode
4142
from crowdgit.settings import MAINTAINER_RETRY_INTERVAL_DAYS, MAINTAINER_UPDATE_INTERVAL_HOURS
4243

4344

@@ -93,6 +94,7 @@ class MaintainerService(BaseService):
9394
"code_owners",
9495
"emeritus",
9596
"workgroup",
97+
"readme",
9698
}
9799

98100
VALID_EXTENSIONS = {
@@ -132,6 +134,17 @@ class MaintainerService(BaseService):
132134
STEM_MATCH_SCORE = 50
133135
PARTIAL_STEM_SCORE = 25
134136

137+
# Files in KNOWN_PATHS that still need section filtering (contain non-governance content)
138+
SECTION_FILTERED_PATHS = {"readme.md", "governance.md"}
139+
SCORING_KEYWORDS_SET = frozenset(SCORING_KEYWORDS)
140+
141+
_section_extractor = SectionExtractor()
142+
143+
@staticmethod
async def _read_text_file(file_path: str) -> str:
    """
    Read a file as raw bytes and return its text via ``safe_decode``.

    The file is opened in binary mode so decoding is delegated entirely to
    ``safe_decode`` instead of assuming a fixed encoding at open time.
    """
    async with aiofiles.open(file_path, "rb") as handle:
        raw_bytes = await handle.read()
    return safe_decode(raw_bytes)
147+
135148
def make_role(self, title: str):
136149
title = title.lower()
137150
title = (
@@ -360,8 +373,10 @@ async def process_chunk(chunk_index: int, chunk: str):
360373
self.get_extraction_prompt(maintainer_filename, content),
361374
pydantic_model=MaintainerInfo,
362375
)
363-
self.logger.info("Maintainers file content analyzed by AI")
364-
self.logger.info(f"Maintainers response: {maintainer_info}")
376+
info_count = len(maintainer_info.output.info) if maintainer_info.output.info else 0
377+
self.logger.info(
378+
f"Maintainers file content analyzed by AI (found={info_count}, cost={maintainer_info.cost:.4f})"
379+
)
365380
if maintainer_info.output.info is not None:
366381
return AggregatedMaintainerInfo(
367382
output=AggregatedMaintainerInfoItems(info=maintainer_info.output.info),
@@ -373,7 +388,7 @@ async def process_chunk(chunk_index: int, chunk: str):
373388
)
374389
else:
375390
self.logger.error(
376-
f"Expected a list of maintainer info or an error message, got: {str(maintainer_info)}"
391+
f"Expected a list of maintainer info or an error message, got error={maintainer_info.output.error}"
377392
)
378393
raise MaintanerAnalysisError(
379394
error_message="Unexpected response from AI for Maintainers analysis",
@@ -544,8 +559,7 @@ async def find_candidate_files(
544559
for candidate_path in all_paths:
545560
file_path = os.path.join(repo_path, candidate_path)
546561
try:
547-
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
548-
content = await f.read()
562+
content = await self._read_text_file(file_path)
549563
except Exception as e:
550564
self.logger.warning(f"Failed to read candidate {candidate_path}: {repr(e)}")
551565
continue
@@ -586,6 +600,16 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai
586600
f"Skipping README file '{filename}': no governance keyword found in content"
587601
)
588602
raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND)
603+
604+
fname = os.path.basename(filename).lower()
605+
if fname not in self.KNOWN_PATHS or fname in self.SECTION_FILTERED_PATHS:
606+
extracted = self._section_extractor.extract(fname, content, self.SCORING_KEYWORDS_SET)
607+
if extracted:
608+
self.logger.info(f"Using extracted sections for '{filename}'")
609+
content = extracted
610+
else:
611+
self.logger.debug(f"No sections extracted for '{filename}', using full content")
612+
589613
result = await self.analyze_file_content(filename, content)
590614

591615
if not result.output.info:
@@ -618,9 +642,7 @@ async def try_saved_maintainer_file(
618642
f"Saved maintainer file exists, reading content: '{saved_maintainer_file}'"
619643
)
620644
try:
621-
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
622-
content = await f.read()
623-
645+
content = await self._read_text_file(file_path)
624646
result = await self.analyze_and_build_result(saved_maintainer_file, content)
625647
cost += result.total_cost
626648
return result, cost
@@ -664,12 +686,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
664686
root_candidates, subdir_candidates = await self.find_candidate_files(repo_path)
665687
all_candidates = root_candidates + subdir_candidates
666688
candidate_files = [(path, score) for path, _, score in all_candidates][:100]
667-
self.logger.debug(
668-
f"Detection step 2: {len(root_candidates)} root candidate(s), "
669-
f"{len(subdir_candidates)} subdir candidate(s); "
670-
f"root={[p for p, _, _ in root_candidates]}, "
671-
f"subdir_top={[p for p, _, _ in subdir_candidates[:3]]}"
672-
)
673689

674690
# Step 3: Try root-level files first (in score order), then top subdirectory file
675691
failed_candidates: set[str] = set()
@@ -757,7 +773,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
757773
f"Passing {len(ai_input_files)} files to AI for maintainer file detection "
758774
f"(total repo files: {len(file_names)})"
759775
)
760-
self.logger.debug(f"AI input files: {[f for f, _ in ai_input_files]}")
761776
ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files)
762777
ai_suggested_file = ai_file_name
763778
total_cost += ai_cost
@@ -771,8 +786,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
771786
)
772787
else:
773788
try:
774-
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
775-
content = await f.read()
789+
content = await self._read_text_file(file_path)
776790
result = await self.analyze_and_build_result(ai_file_name, content)
777791
total_cost += result.total_cost
778792
return _attach_metadata(result)
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import os
2+
import re
3+
4+
import tomllib
5+
import yaml
6+
7+
8+
class SectionExtractor:
9+
"""
10+
Extracts relevant sections from file content based on file format and governance keywords.
11+
Returns None when no relevant sections are found or format is unsupported (caller falls back to full content).
12+
"""
13+
14+
def extract(self, filename: str, content: str, keywords: set[str]) -> str | None:
15+
"""
16+
Returns extracted relevant section text, or None if no sections found.
17+
filename should be the basename (lowercased).
18+
"""
19+
ext = os.path.splitext(filename)[1]
20+
if ext in (".md", ".markdown"):
21+
return self._extract_markdown_sections(content, keywords)
22+
elif ext in (".yaml", ".yml"):
23+
return self._extract_yaml_sections(content, keywords)
24+
elif ext == ".toml":
25+
return self._extract_toml_sections(content, keywords)
26+
return None
27+
28+
def _extract_markdown_sections(self, content: str, keywords: set[str]) -> str | None:
29+
"""
30+
Splits content on `#`-style heading lines only (# / ## / ###...).
31+
Includes a section if its heading text contains any keyword.
32+
Returns joined matching sections, or None if none match.
33+
"""
34+
heading_pattern = re.compile(r"^#{1,6}\s+", re.MULTILINE)
35+
# Split into (heading_line, body) pairs; first element may be pre-heading content
36+
parts = heading_pattern.split(content)
37+
headings = heading_pattern.findall(content)
38+
39+
# parts[0] is text before the first heading (skip it)
40+
# parts[1..] correspond to headings[0..]
41+
matching_sections = []
42+
for i, heading_marker in enumerate(headings):
43+
block = parts[i + 1] # block starts right after the heading marker
44+
# The first line of block is the heading text
45+
first_newline = block.find("\n")
46+
heading_text = block[:first_newline].strip() if first_newline != -1 else block.strip()
47+
if any(kw in heading_text.lower() for kw in keywords):
48+
matching_sections.append(f"{heading_marker}{block}")
49+
50+
return "".join(matching_sections) if matching_sections else None
51+
52+
def _extract_yaml_sections(self, content: str, keywords: set[str]) -> str | None:
53+
"""
54+
Parses YAML and returns top-level keys whose name contains any keyword, serialized back to YAML.
55+
Returns None if no keys match or parsing fails.
56+
"""
57+
try:
58+
data = yaml.safe_load(content)
59+
except yaml.YAMLError:
60+
return None
61+
62+
if not isinstance(data, dict):
63+
return None
64+
65+
matching = {k: v for k, v in data.items() if any(kw in str(k).lower() for kw in keywords)}
66+
if not matching:
67+
return None
68+
69+
return yaml.dump(matching, default_flow_style=False, allow_unicode=True)
70+
71+
def _extract_toml_sections(self, content: str, keywords: set[str]) -> str | None:
72+
"""
73+
Parses TOML and returns top-level keys whose name contains any keyword,
74+
serialized as Python repr key=value lines (not valid TOML syntax).
75+
Returns None if no keys match or parsing fails.
76+
"""
77+
try:
78+
data = tomllib.loads(content)
79+
except tomllib.TOMLDecodeError:
80+
return None
81+
82+
matching = {k: v for k, v in data.items() if any(kw in k.lower() for kw in keywords)}
83+
if not matching:
84+
return None
85+
86+
# Serialize matching keys back as simple TOML representation
87+
lines = []
88+
for k, v in matching.items():
89+
lines.append(f"{k} = {repr(v)}")
90+
return "\n".join(lines)

services/apps/git_integration/src/crowdgit/services/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from crowdgit.logger import logger
1414

1515

16-
def _safe_decode(data: bytes) -> str:
16+
def safe_decode(data: bytes) -> str:
1717
"""
1818
Safely decode bytes to string, handling various encodings that might be present in git output.
1919
@@ -229,7 +229,7 @@ async def run_shell_command(
229229
async def _run_with_stderr_logging() -> bytes:
230230
async def _stream() -> None:
231231
async for raw_line in process.stderr:
232-
line = _safe_decode(raw_line).rstrip()
232+
line = safe_decode(raw_line).rstrip()
233233
if line:
234234
stderr_logger.log(stderr_log_level, line)
235235
stderr_lines.append(line)
@@ -240,7 +240,7 @@ async def _stream() -> None:
240240

241241
coro = _run_with_stderr_logging()
242242
stdout = await (asyncio.wait_for(coro, timeout=timeout) if timeout else coro)
243-
stdout_text = _safe_decode(stdout).strip() if stdout else ""
243+
stdout_text = safe_decode(stdout).strip() if stdout else ""
244244
stderr_text = "\n".join(stderr_lines)
245245
else:
246246
# Wait for completion with optional timeout
@@ -252,8 +252,8 @@ async def _stream() -> None:
252252
stdout, stderr = await process.communicate(input=stdin_input)
253253

254254
# Handle potentially non-UTF-8 encoded output from git commands
255-
stdout_text = _safe_decode(stdout).strip() if stdout else ""
256-
stderr_text = _safe_decode(stderr).strip() if stderr else ""
255+
stdout_text = safe_decode(stdout).strip() if stdout else ""
256+
stderr_text = safe_decode(stderr).strip() if stderr else ""
257257

258258
# Check return code
259259
if process.returncode == 0:

services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/db.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ func (db *InsightsDB) saveVulnerabilities(ctx context.Context, repoURL string, v
114114
scan_id = EXCLUDED.scan_id,
115115
status = EXCLUDED.status,
116116
fixed_version = EXCLUDED.fixed_version,
117-
severity = EXCLUDED.severity,
118-
cvss_score = EXCLUDED.cvss_score,
117+
severity = CASE WHEN EXCLUDED.severity = 'UNKNOWN' THEN vulnerabilities.severity ELSE EXCLUDED.severity END,
118+
cvss_score = COALESCE(EXCLUDED.cvss_score, vulnerabilities.cvss_score),
119119
summary = EXCLUDED.summary,
120120
details = EXCLUDED.details,
121121
cve_ids = EXCLUDED.cve_ids,

0 commit comments

Comments
 (0)