Skip to content

Commit 49d11e3

Browse files
authored
feat: Re-implement importer in Go (#4808)
Big re-implementation of the importer (and importer-deleter) in Go:
- Implemented a Go gitter getter
- Moved distinct logic (git/GCS/REST) into relevant files
- Removed some outdated things (e.g. public_logs_bucket)
- I've removed the writing of ImportFindings, because I think the linter is currently also writing (and overwriting) these
- Parallelised processing of source repositories
- Also parallelised downloading records and publishing to Pub/Sub within a source repository via a shared worker pool
- Made interfaces so that the code is testable.
I'd appreciate people reviewing this to see how follow-able/self-documenting the code is, and pointing out where things may be unclear.
1 parent 714c537 commit 49d11e3

35 files changed

Lines changed: 3525 additions & 72 deletions

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ update-api-snapshots:
5555
cd gcp/api && UPDATE_SNAPS=true ./run_tests_e2e.sh $(HOME)/.config/gcloud/application_default_credentials.json
5656

5757
lint:
58-
GOTOOLCHAIN=go1.25.7 $(run-cmd) tools/lint_and_format.sh
58+
GOTOOLCHAIN=go1.26.0 $(run-cmd) tools/lint_and_format.sh
5959

6060
build-osv-protos:
6161
cd osv && $(run-cmd) python -m grpc_tools.protoc --python_out=. --mypy_out=. --proto_path=. --proto_path=osv-schema/proto vulnerability.proto importfinding.proto

deployment/build-and-stage.yaml

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ steps:
6262
args: ['push', '--all-tags', 'gcr.io/oss-vdb/worker-base']
6363
waitFor: ['build-worker-base', 'cloud-build-queue']
6464

65-
# Build/push core worker/importer/alias images.
65+
# Build/push core worker/recoverer images.
6666
- name: gcr.io/cloud-builders/docker
6767
args: ['build', '-t', 'gcr.io/oss-vdb/worker:latest', '-t', 'gcr.io/oss-vdb/worker:$COMMIT_SHA', '-f', 'gcp/workers/worker/Dockerfile', '.']
6868
id: 'build-worker'
@@ -71,15 +71,6 @@ steps:
7171
args: ['push', '--all-tags', 'gcr.io/oss-vdb/worker']
7272
waitFor: ['build-worker', 'cloud-build-queue']
7373

74-
- name: gcr.io/cloud-builders/docker
75-
args: ['build', '-t', 'gcr.io/oss-vdb/importer:latest', '-t', 'gcr.io/oss-vdb/importer:$COMMIT_SHA', '.']
76-
dir: 'gcp/workers/importer'
77-
id: 'build-importer'
78-
waitFor: ['build-worker']
79-
- name: gcr.io/cloud-builders/docker
80-
args: ['push', '--all-tags', 'gcr.io/oss-vdb/importer']
81-
waitFor: ['build-importer', 'cloud-build-queue']
82-
8374
- name: gcr.io/cloud-builders/docker
8475
args: ['build', '-t', 'gcr.io/oss-vdb/recoverer:latest', '-t', 'gcr.io/oss-vdb/recoverer:$COMMIT_SHA', '.']
8576
dir: 'gcp/workers/recoverer'
@@ -107,6 +98,20 @@ steps:
10798
waitFor: ['build-oss-fuzz-importer', 'cloud-build-queue']
10899

109100
# Build/push go images
101+
- name: 'gcr.io/cloud-builders/docker'
102+
entrypoint: 'bash'
103+
args: ['-c', 'docker pull gcr.io/oss-vdb/importer:latest || exit 0']
104+
id: 'pull-importer'
105+
waitFor: ['setup']
106+
- name: gcr.io/cloud-builders/docker
107+
args: ['build', '-t', 'gcr.io/oss-vdb/importer:latest', '-t', 'gcr.io/oss-vdb/importer:$COMMIT_SHA', '-f', 'cmd/importer/Dockerfile', '--cache-from', 'gcr.io/oss-vdb/importer:latest', '--pull', '.']
108+
dir: 'go'
109+
id: 'build-importer'
110+
waitFor: ['pull-importer']
111+
- name: gcr.io/cloud-builders/docker
112+
args: ['push', '--all-tags', 'gcr.io/oss-vdb/importer']
113+
waitFor: ['build-importer', 'cloud-build-queue']
114+
110115
- name: 'gcr.io/cloud-builders/docker'
111116
entrypoint: 'bash'
112117
args: ['-c', 'docker pull gcr.io/oss-vdb/exporter:latest || exit 0']

deployment/clouddeploy/gke-workers/base/importer.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ spec:
2525
env:
2626
- name: GITTER_HOST
2727
value: http://gitter-service:8888
28+
- name: IMPORT_TRACE_SAMPLE_RATE # for the overall importer trace
29+
value: "1.0"
30+
- name: TRACE_SAMPLE_RATE # for the individual vulnerability entries
31+
value: "0.05"
2832
securityContext:
2933
privileged: true
3034
resources:

deployment/clouddeploy/gke-workers/environments/oss-vdb-test/importer-deleter.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,4 @@ spec:
1717
image: importer
1818
args:
1919
- --delete
20-
- --delete_threshold_pct=2
21-
- --public_log_bucket=osv-test-public-import-logs
20+
- --delete-threshold-pct=2

deployment/clouddeploy/gke-workers/environments/oss-vdb-test/importer.yaml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@ spec:
1515
- name: OSV_VULNERABILITIES_BUCKET
1616
value: osv-test-vulnerabilities
1717
args:
18-
# TODO(michaelkedar): ssh secrets
19-
# TODO(michaelkedar): single source of truth w/ terraform config
20-
- "--public_log_bucket=osv-test-public-import-logs"
2118
# Note that with https://github.com/google/osv.dev/pull/2766
2219
# additional per-repository settings make this *really* take effect, see
2320
# https://github.com/google/osv.dev/pull/2837
24-
- "--strict_validation"
21+
- "--strict-validation"

deployment/clouddeploy/gke-workers/environments/oss-vdb/importer-deleter.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,4 @@ spec:
1717
image: importer
1818
args:
1919
- --delete
20-
- --delete_threshold_pct=2
21-
- --public_log_bucket=osv-public-import-logs
20+
- --delete-threshold-pct=2

deployment/clouddeploy/gke-workers/environments/oss-vdb/importer.yaml

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,3 @@ spec:
1414
value: oss-vdb
1515
- name: OSV_VULNERABILITIES_BUCKET
1616
value: osv-vulnerabilities
17-
args:
18-
- "--ssh_key_public=/secrets/ssh.pub"
19-
- "--ssh_key_private=/secrets/ssh"
20-
- "--public_log_bucket=osv-public-import-logs"
21-
volumeMounts:
22-
- mountPath: "/secrets"
23-
name: "secrets"
24-
volumes:
25-
- name: secrets
26-
secret:
27-
items:
28-
- key: ssh
29-
mode: 384
30-
path: ssh
31-
- key: ssh.pub
32-
path: ssh.pub
33-
secretName: secrets

go/cmd/exporter/exporter.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,19 +97,19 @@ func main() {
9797

9898
prevPrefix := ""
9999
MainLoop:
100-
for path, err := range vulnClient.Objects(ctx, gcsProtoPrefix) {
100+
for obj, err := range vulnClient.Objects(ctx, gcsProtoPrefix) {
101101
if err != nil {
102102
logger.FatalContext(ctx, "failed to list objects", slog.Any("err", err))
103103
}
104104
// Only log when we see a new ID prefix (i.e. roughly once per data source)
105-
prefix := filepath.Base(path)
105+
prefix := filepath.Base(obj.Name)
106106
prefix, _, _ = strings.Cut(prefix, "-")
107107
if prefix != prevPrefix {
108-
logger.InfoContext(ctx, "iterating vulnerabilities", slog.String("now_at", path))
108+
logger.InfoContext(ctx, "iterating vulnerabilities", slog.String("now_at", obj.Name))
109109
prevPrefix = prefix
110110
}
111111
select {
112-
case gcsPathToDownloaderCh <- path:
112+
case gcsPathToDownloaderCh <- obj.Name:
113113
case <-ctx.Done():
114114
break MainLoop
115115
}

go/cmd/generatesitemap/generatesitemap.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,19 +180,19 @@ func main() {
180180

181181
func listObjects(ctx context.Context, client clients.CloudStorage, outCh chan<- string) error {
182182
prevPrefix := ""
183-
for name, err := range client.Objects(ctx, gcsProtoPrefix) {
183+
for obj, err := range client.Objects(ctx, gcsProtoPrefix) {
184184
if err != nil {
185185
return err
186186
}
187187
// Only log when we see a new ID prefix (i.e. roughly once per data source)
188-
prefix := filepath.Base(name)
188+
prefix := filepath.Base(obj.Name)
189189
prefix, _, _ = strings.Cut(prefix, "-")
190190
if prefix != prevPrefix {
191-
logger.InfoContext(ctx, "iterating vulnerabilities", slog.String("now_at", name))
191+
logger.InfoContext(ctx, "iterating vulnerabilities", slog.String("now_at", obj.Name))
192192
prevPrefix = prefix
193193
}
194194
select {
195-
case outCh <- name:
195+
case outCh <- obj.Name:
196196
case <-ctx.Done():
197197
return ctx.Err()
198198
}

go/cmd/importer/Dockerfile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
FROM golang:1.26.0-alpine@sha256:d4c4845f5d60c6a974c6000ce58ae079328d03ab7f721a0734277e69905473e5 AS build
16+
17+
WORKDIR /src
18+
19+
COPY ./go.mod /src/go.mod
20+
COPY ./go.sum /src/go.sum
21+
RUN go mod download && go mod verify
22+
23+
24+
COPY ./ /src/
25+
RUN CGO_ENABLED=0 go build -o importer ./cmd/importer/
26+
27+
FROM alpine:3.23@sha256:25109184c71bdad752c8312a8623239686a9a2071e8825f20acb8f2198c3f659
28+
29+
# Install the full tar package instead of relying on the BusyBox version, which doesn't have --zstd support.
30+
RUN apk add --no-cache git zstd tar
31+
32+
COPY --from=build /src/importer /
33+
34+
ENTRYPOINT ["/importer"]

0 commit comments

Comments
 (0)