linuxfoundation
diff --git a/‎backend/.env.dist.composed‎
Lines changed: 1 addition & 0 deletions b/‎backend/.env.dist.composed‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backend/.env.dist.local‎
Lines changed: 8 additions & 0 deletions b/‎backend/.env.dist.local‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎scripts/services/docker/Dockerfile.git_integration‎
Lines changed: 22 additions & 1 deletion b/‎scripts/services/docker/Dockerfile.git_integration‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎services/apps/git_integration/src/crowdgit/enums.py‎
Lines changed: 1 addition & 0 deletions b/‎services/apps/git_integration/src/crowdgit/enums.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎services/apps/git_integration/src/crowdgit/errors.py‎
Lines changed: 1 addition & 0 deletions b/‎services/apps/git_integration/src/crowdgit/errors.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎services/apps/git_integration/src/crowdgit/server.py‎
Lines changed: 3 additions & 0 deletions b/‎services/apps/git_integration/src/crowdgit/server.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎services/apps/git_integration/src/crowdgit/services/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎services/apps/git_integration/src/crowdgit/services/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎services/apps/git_integration/src/crowdgit/services/utils.py‎
Lines changed: 37 additions & 11 deletions b/‎services/apps/git_integration/src/crowdgit/services/utils.py‎
Lines changed: 37 additions & 11 deletions
diff --git a/‎services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/.gitignore‎
Lines changed: 22 additions & 0 deletions b/‎services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/.gitignore‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/README.md‎
Lines changed: 109 additions & 0 deletions b/‎services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/README.md‎
Lines changed: 109 additions & 0 deletions
@@ -11,6 +11,7 @@ CROWD_S3_HOST="s3"
 # Db settings
 CROWD_DB_READ_HOST="db"
 CROWD_DB_WRITE_HOST="db"
+INSIGHTS_DB_WRITE_HOST="db"
 
 # Product DB settings
 PRODUCT_DB_HOST=product
 
@@ -38,6 +38,14 @@ CROWD_DB_USERNAME=postgres
 CROWD_DB_PASSWORD=example
 CROWD_DB_DATABASE=crowd-web
 
+INSIGHTS_DB_WRITE_HOST=localhost
+INSIGHTS_DB_USERNAME=postgres
+INSIGHTS_DB_PASSWORD=example
+INSIGHTS_DB_DATABASE=insights
+INSIGHTS_DB_PORT=5432
+INSIGHTS_DB_POOL_MAX=10
+INSIGHTS_DB_SSLMODE=disable
+
 # Product DB settings
 PRODUCT_DB_HOST=localhost
 PRODUCT_DB_PORT=5433
 
@@ -22,6 +22,23 @@ COPY ./services/apps/git_integration/src/crowdgit/services/software_value/ ./
 # Build the binary
 RUN CGO_ENABLED=0 GOOS=linux go build -ldflags "-w -s" -o software-value ./
 
+# Go builder stage 2: build the vulnerability-scanner binary
+FROM golang:1.25-alpine AS go-vuln-builder
+
+WORKDIR /go/src/vulnerability-scanner
+
+# Copy module files first for dependency caching
+COPY ./services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/go.mod ./
+COPY ./services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/go.sum ./
+
+# Copy source code
+COPY ./services/apps/git_integration/src/crowdgit/services/vulnerability_scanner/ ./
+
+# Download dependencies and build, using cache mounts to avoid re-downloading on every build
+RUN --mount=type=cache,target=/go/pkg/mod \
+    --mount=type=cache,target=/root/.cache/go-build \
+    go mod download && CGO_ENABLED=0 GOOS=linux go build -ldflags "-w -s" -o vulnerability-scanner ./
+
 # Builder stage: install build dependencies, uv, and dependencies
 FROM base AS builder
 
@@ -84,13 +101,17 @@ COPY --from=builder /usr/crowd/app /usr/crowd/app
 COPY --from=go-builder /go/src/software-value/software-value /usr/local/bin/software-value
 COPY --from=go-builder /go/bin/scc /usr/local/bin/scc
 
+# Copy vulnerability-scanner binary from go-vuln-builder stage
+COPY --from=go-vuln-builder /go/src/vulnerability-scanner/vulnerability-scanner /usr/local/bin/vulnerability-scanner
+
 # Add virtual environment bin to PATH
 ENV PATH="/usr/crowd/app/.venv/bin:$PATH"
 
 # Make runner script and binaries executable
 RUN chmod +x ./src/runner.sh \
     && chmod +x /usr/local/bin/software-value \
-    && chmod +x /usr/local/bin/scc
+    && chmod +x /usr/local/bin/scc \
+    && chmod +x /usr/local/bin/vulnerability-scanner
 
 EXPOSE 8085
 
 
@@ -70,3 +70,4 @@ class OperationType(str, Enum):
     COMMIT = "Commit"
     MAINTAINER = "Maintainer"
     SOFTWARE_VALUE = "SoftwareValue"
+    VULNERABILITY_SCAN = "VulnerabilityScanner"
@@ -56,6 +56,7 @@ class PermissionError(CrowdGitError):
 class CommandExecutionError(CrowdGitError):
     error_message: str = "Command execution failed"
     error_code: ErrorCode = ErrorCode.SHELL_COMMAND_FAILED
+    returncode: int | None = None
 
 
 @dataclass
 
@@ -11,6 +11,7 @@
     MaintainerService,
     QueueService,
     SoftwareValueService,
+    VulnerabilityScannerService,
 )
 from crowdgit.settings import WORKER_SHUTDOWN_TIMEOUT_SEC
 from crowdgit.worker.repository_worker import RepositoryWorker
@@ -25,13 +26,15 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     queue_service = QueueService()
     commit_service = CommitService(queue_service=queue_service)
     software_value_service = SoftwareValueService()
+    vulnerability_scanner_service = VulnerabilityScannerService()
     maintainer_service = MaintainerService()
 
     worker_task = None
     worker = RepositoryWorker(
         clone_service=clone_service,
         commit_service=commit_service,
         software_value_service=software_value_service,
+        vulnerability_scanner_service=vulnerability_scanner_service,
         maintainer_service=maintainer_service,
         queue_service=queue_service,
     )
 
@@ -4,12 +4,16 @@
 from crowdgit.services.maintainer.maintainer_service import MaintainerService
 from crowdgit.services.queue.queue_service import QueueService
 from crowdgit.services.software_value.software_value_service import SoftwareValueService
+from crowdgit.services.vulnerability_scanner.vulnerability_scanner_service import (
+    VulnerabilityScannerService,
+)
 
 __all__ = [
     "BaseService",
     "CloneService",
     "CommitService",
     "SoftwareValueService",
+    "VulnerabilityScannerService",
     "MaintainerService",
     "QueueService",
 ]
@@ -1,4 +1,5 @@
 import asyncio
+import logging
 import re
 from urllib.parse import urlparse
 
@@ -167,6 +168,7 @@ async def run_shell_command(
     cwd: str = None,
     timeout: float | None = None,
     input_text: str | bytes | None = None,
+    stderr_logger: logging.Logger | None = None,
 ) -> str:
     """
     Run shell command asynchronously and return output on success, raise exception on failure.
@@ -176,6 +178,7 @@ async def run_shell_command(
         cwd: Working directory
         timeout: Command timeout in seconds
         input_text: Text (str) or bytes to send to stdin (will automatically append newline if not present)
+        stderr_logger: If provided, a logger whose .info() method is called with each stderr line in real-time
 
     Returns:
         str: Command stdout output
@@ -219,17 +222,37 @@ async def run_shell_command(
                     input_text += "\n"
                 stdin_input = input_text.encode("utf-8")
 
-        # Wait for completion with optional timeout
-        if timeout:
-            stdout, stderr = await asyncio.wait_for(
-                process.communicate(input=stdin_input), timeout=timeout
-            )
+        if stderr_logger:
+            stderr_lines: list[str] = []
+
+            async def _run_with_stderr_logging() -> bytes:
+                async def _stream() -> None:
+                    async for raw_line in process.stderr:
+                        line = _safe_decode(raw_line).rstrip()
+                        if line:
+                            stderr_logger.info(line)
+                            stderr_lines.append(line)
+
+                stdout, _ = await asyncio.gather(process.stdout.read(), _stream())
+                await process.wait()
+                return stdout
+
+            coro = _run_with_stderr_logging()
+            stdout = await (asyncio.wait_for(coro, timeout=timeout) if timeout else coro)
+            stdout_text = _safe_decode(stdout).strip() if stdout else ""
+            stderr_text = "\n".join(stderr_lines)
         else:
-            stdout, stderr = await process.communicate(input=stdin_input)
+            # Wait for completion with optional timeout
+            if timeout:
+                stdout, stderr = await asyncio.wait_for(
+                    process.communicate(input=stdin_input), timeout=timeout
+                )
+            else:
+                stdout, stderr = await process.communicate(input=stdin_input)
 
-        # Handle potentially non-UTF-8 encoded output from git commands
-        stdout_text = _safe_decode(stdout).strip() if stdout else ""
-        stderr_text = _safe_decode(stderr).strip() if stderr else ""
+            # Handle potentially non-UTF-8 encoded output from git commands
+            stdout_text = _safe_decode(stdout).strip() if stdout else ""
+            stderr_text = _safe_decode(stderr).strip() if stderr else ""
 
         # Check return code
         if process.returncode == 0:
@@ -248,8 +271,11 @@ async def run_shell_command(
             logger.error(f"Permission error: {stderr_text}")
             raise PermissionError(f"Permission denied while running: {command_str}")
         else:
-            logger.error(f"Command error: {stderr_text}")
-            raise CommandExecutionError(f"Command failed: {command_str} - {stderr_text}")
+            logger.error(f"Command failed (exit {process.returncode}): {stderr_text}")
+            raise CommandExecutionError(
+                f"Command failed (exit {process.returncode}): {command_str} - {stderr_text}",
+                returncode=process.returncode,
+            )
 
     except asyncio.TimeoutError:
         logger.error(f"Command timed out after {timeout}s: {command_str}")
 
@@ -0,0 +1,22 @@
+# Binaries
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+vulnerability-scanner
+
+# Test binary
+*.test
+
+# Coverage
+*.out
+
+# Go workspace
+go.work
+
+# Go dependencies (use vendor/ instead)
+vendor/
+
+# Environment
+.env
@@ -0,0 +1,109 @@
+# Vulnerability Scanner
+
+Scans a cloned git repository for known vulnerabilities using the [Google OSV Scanner SDK](https://pkg.go.dev/github.com/google/osv-scanner/v2/pkg/osvscanner) and persists the results to the insights database.
+
+## How it fits into the pipeline
+
+The repository worker processes repos through a fixed sequence of services per clone batch:
+
+```
+Clone → [first batch only] → SoftwareValue → VulnerabilityScanner → Maintainer → Commits → ...
+```
+
+It runs on the first batch only, when the full working directory is available. The Python worker calls the scanner via subprocess, passing the local repo path and the canonical repo URL. The Go binary does all the scanning and writes results directly to the database, then exits — the Python side only tracks whether it succeeded or failed.
+
+## Architecture: why a Go binary wrapped in Python
+
+The rest of the git integration is Python, but the OSV Scanner is a Go library with no Python bindings. Rather than shelling out to the `osv-scanner` CLI (which is fragile and adds an extra process layer), we embed the scanner as a Go SDK dependency and call it programmatically. This gives us full control over scan parameters, result access, and error handling.
+
+The binary follows the same **subprocess + JSON stdout** pattern used by the `software-value` service. The Python wrapper calls it, reads the JSON response from stdout, and treats it as a black box. The binary exits with code 0 under normal operation — errors are communicated through the JSON payload. The one exception is scan timeout: if the scan exceeds the time limit, the binary calls `os.Exit(124)` without emitting JSON. The Python wrapper handles this by catching exit code 124 and retrying with `--no-transitive` to skip transitive dependency resolution (see OOM handling section below).
+
+## Key design decisions
+
+### OSV Scanner uses sentinel errors for expected outcomes
+
+OSV Scanner returns sentinel errors for two non-failure cases:
+
+- `ErrVulnerabilitiesFound` — the scanner found vulnerabilities. Like `grep` returning exit code 1 on a match, this is an expected outcome. We treat it as success and surface the results normally.
+- `ErrNoPackagesFound` — the repo contains no scannable package manifests (e.g. a pure C or shell project). This is not a failure — the scanner ran correctly and determined there is nothing to check. The scan record gets status `no_packages_found` and the Python side does not treat it as an error.
+
+Any other non-nil error from the scanner is a real failure.
+
+### Vulnerability identity: (repo_url, vulnerability_id, package_name, source_path)
+
+Each vulnerability is uniquely identified by these four fields. The same CVE can appear in multiple packages and multiple lockfiles, so all four are needed to distinguish records. This is the unique key for upserts.
+
+### vulnerability_id and the ID columns
+
+Every vulnerability record has a `vulnerability_id` — the primary identifier OSV assigns to the advisory (e.g. `GHSA-xxxx-xxxx-xxxx`). GHSA is OSV's preferred canonical ID: even when a CVE exists, OSV typically uses the GHSA ID as the primary and lists the CVE as an alias. This is because GitHub's Advisory Database is OSV's main upstream source and GHSA IDs are stable and consistently present.
+
+Beyond the primary ID, each advisory carries a list of aliases (cross-references to the same vulnerability in other databases). We take both the primary ID and all aliases and classify them into three `TEXT[]` columns by prefix:
+
+- `cve_ids` — IDs starting with `CVE-`
+- `ghsa_ids` — IDs starting with `GHSA-` (usually includes the primary ID)
+- `other_ids` — everything else (e.g. `PYSEC-`, `GO-`, `RUSTSEC-`, `MAL-`)
+
+Each column holds an array, so multiple IDs of the same type are stored as proper discrete values.
+
+### Severity derived from CVSS score, not OSV severity strings
+
+OSV records include a `MaxSeverity` field as a CVSS numeric score. We map it to our own four-tier scale (CRITICAL / HIGH / MEDIUM / LOW) using standard CVSS thresholds rather than trusting the advisory's own severity label, which is inconsistently populated across ecosystems.
+
+### Status: OPEN vs FIX_AVAILABLE vs RESOLVED
+
+A vulnerability is `FIX_AVAILABLE` if the OSV record contains a fixed version in any of the affected ranges — meaning a patch exists but the repo is still on the vulnerable version. It's `OPEN` if no fix is known. `RESOLVED` is set automatically by the database logic (see below) for findings that were present in a previous scan but are no longer detected.
+
+The OSV scanner's flatten operation can occasionally produce multiple entries for the same (vulnerability_id, package_name, source_path) with different package versions — for example when a package appears more than once in the same lockfile. When that happens, we keep the smallest version, since that represents the worst-case exposure for that specific lockfile.
+
+### Database strategy: upsert + mark-resolved, not delete + insert
+
+On each scan, rather than deleting all previous findings and inserting fresh ones, we:
+
+1. Mark all currently active findings for the repo as `RESOLVED`.
+2. Upsert the current scan results — this re-activates any finding that is still present and inserts new ones. Findings that genuinely disappeared remain `RESOLVED` with a timestamp.
+
+This preserves history. You can tell when a vulnerability was first detected, when it was last seen, and when it went away — without needing a separate audit log. It also makes it cheap to compute `new_count` (rows that were truly inserted, not updated) and `resolved_count` (rows still carrying the newly-set `resolved_at` after the upsert pass).
+
+### Transitive dependency scanning
+
+By default the scanner resolves the full transitive dependency graph, not just direct dependencies declared in lockfiles. This catches vulnerabilities in indirect deps that the project never explicitly references.
+
+Transitive scanning is expensive for repos with large or deep package ecosystems — projects like ML frameworks with hundreds of transitive Python dependencies can exhaust the 3-minute scan timeout. The scanner handles this automatically:
+
+1. **First scan**: attempt with transitive=true. If it times out, retry immediately with transitive=false (direct deps only). The result of whichever mode succeeded is stored.
+2. **Subsequent scans**: reuse the same mode as the previous completed scan (`transitive_deps_scanned` stored per scan record). This avoids flip-flopping between modes and keeps results comparable across scans.
+
+
+### OOM handling and stale scan cleanup
+
+If the Go binary is killed by the OOM killer (SIGKILL, exit code -9), the scan record it created stays stuck as `running` since the process never got a chance to finalize it. The Python wrapper detects this:
+
+1. On **any** `CommandExecutionError`, it connects to the insights DB and marks all `running` scans for that repo as `failure` with the error message. This cleans up stale records regardless of the failure reason.
+2. On **OOM specifically** (returncode -9), it retries the scan with the `--no-transitive` flag, which forces the Go binary to skip transitive dependency resolution — the most memory-intensive part of scanning. This gives large repos a second chance to complete within memory limits.
+
+The `--no-transitive` flag is parsed by the Go binary and overrides the normal transitive mode selection logic.
+
+### Scan tracking
+
+Every invocation creates a row in `vulnerability_scans` before the scan starts (status: `running`) and updates it on completion with duration, counts, and any error. Terminal statuses are `success`, `no_packages_found`, and `failure`. This makes it possible to detect stalled or crashed scans and gives a simple history of scan health per repo.
+
+## Building
+
+The binary is built during Docker image construction (see `Dockerfile.git_integration`). To build locally:
+
+```bash
+cd services/apps/git_integration/src/crowdgit/services/vulnerability_scanner
+go build -o vulnerability-scanner .
+```
+
+
+Output is a JSON object on stdout:
+
+```json
+{ "status": "success", "error_code": null, "error_message": null }
+```
+
+On repos with no scannable package manifests:
+```json
+{ "status": "no_packages_found", "error_code": null, "error_message": null }
+```