Skip to content

Commit b99fb49

Browse files
authored
refactor(go api): determine version, import findings, health checks (#5535)
Translate the final Python API endpoints to Go. - health checks, which are automatically reporting healthy when the server is running. Querying the Datastore/GCS for this seems unnecessary since I don't believe it maintains a persistent connection when not doing queries. - import findings reading from Datastore to give the import failures for a specific source. (I don't know if anyone actually uses this) - determineversion for querying indexer results. For transparency, I got Gemini to translate this from the Python implementation (with reference to the indexer) for me. I don't see any issues with it, and the few repos I've tested it with give identical results, but I don't really understand the indexer enough to get what's going on. I told it not to reuse the indexer code, since it would require moving it to the `go/` directory which is a bigger refactor than I want to do right now.
1 parent dd08243 commit b99fb49

13 files changed

Lines changed: 978 additions & 22 deletions

File tree

go/cmd/api-devserver/main.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,15 @@ func runBackend(ctx context.Context, port int) {
201201
BatchMaxElements: batchMaxElements,
202202
})
203203
relationsStore := db.NewRelationsStore(dbClient)
204+
importFindingsStore := db.NewImportFindingsStore(dbClient, nil, "", "")
205+
repoIndexStore := db.NewRepoIndexStore(dbClient)
204206
if err := api.RunServer(ctx, api.ServerOptions{
205-
Port: port,
206-
VerboseLogs: true,
207-
VulnStore: vulnStore,
208-
RelationsStore: relationsStore,
207+
Port: port,
208+
VerboseLogs: true,
209+
VulnStore: vulnStore,
210+
RelationsStore: relationsStore,
211+
ImportFindingsStore: importFindingsStore,
212+
RepoIndexStore: repoIndexStore,
209213
}); err != nil {
210214
logger.ErrorContext(ctx, "Go API server exited", "error", err)
211215
}

go/cmd/api/main.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ func run() error {
9191
BatchMaxElements: batchMaxElements,
9292
})
9393
relationsStore := db.NewRelationsStore(dbClient)
94-
94+
importFindingsStore := db.NewImportFindingsStore(dbClient, nil, "", "") // The API does not need to talk to GCS, so we can ignore those fields.
95+
repoIndexStore := db.NewRepoIndexStore(dbClient)
9596
verboseLogs := strings.EqualFold(os.Getenv("OSV_VERBOSE_LOGGING"), "true")
9697

9798
var recovererPublisher clients.Publisher
@@ -107,10 +108,12 @@ func run() error {
107108
}
108109

109110
return api.RunServer(ctx, api.ServerOptions{
110-
Port: *port,
111-
VerboseLogs: verboseLogs,
112-
VulnStore: vulnStore,
113-
RelationsStore: relationsStore,
114-
RecovererPublisher: recovererPublisher,
111+
Port: *port,
112+
VerboseLogs: verboseLogs,
113+
VulnStore: vulnStore,
114+
RelationsStore: relationsStore,
115+
ImportFindingsStore: importFindingsStore,
116+
RepoIndexStore: repoIndexStore,
117+
RecovererPublisher: recovererPublisher,
115118
})
116119
}
Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
// Copyright 2026 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package api
16+
17+
import (
18+
"bytes"
19+
"cmp"
20+
"context"
21+
22+
"crypto/md5" //nolint:gosec // indexer uses md5 to hash files
23+
"encoding/binary"
24+
"encoding/hex"
25+
"fmt"
26+
"log/slog"
27+
"math"
28+
"math/bits"
29+
"regexp"
30+
"slices"
31+
"strings"
32+
33+
"github.com/google/osv.dev/go/internal/models"
34+
"github.com/google/osv.dev/go/logger"
35+
"google.golang.org/grpc/codes"
36+
"google.golang.org/grpc/status"
37+
pb "osv.dev/bindings/go/api"
38+
)
39+
40+
// Tunables for DetermineVersion.
const (
	// bucketSize is the number of buckets file hashes are distributed into;
	// it also fixes the width (in bits) of the per-repo bucket bitmap.
	bucketSize = 512

	// minScoreCutoff is the lowest score a candidate may have and still be
	// included in the response.
	minScoreCutoff = 0.05

	// maxDetermineVerResultsToReturn caps the number of matches returned to
	// the caller.
	maxDetermineVerResultsToReturn = 10

	// maxCandidatesToFetch caps how many candidate RepoIndexes are fetched
	// and scored. The Python implementation used 10 here, which arbitrarily
	// discarded good matches when many repos tied (e.g. lots of repos with a
	// single matching bucket). 100 leaves room to resolve those ties while
	// keeping Datastore read costs low.
	maxCandidatesToFetch = 100

	// tagPrefix is the git ref prefix stripped from stored tags before they
	// are reported.
	tagPrefix = "refs/tags/"
)
51+
52+
// vendoredLibNames is the set of (lower-cased) directory names that mark a
// path as belonging to vendored third-party code.
var vendoredLibNames = map[string]struct{}{
	"3rdparty":    {},
	"dep":         {},
	"deps":        {},
	"thirdparty":  {},
	"third-party": {},
	"third_party": {},
	"libs":        {},
	"external":    {},
	"externals":   {},
	"vendor":      {},
	"vendored":    {},
}

// shouldSkipBucket reports whether path contains a "/"-separated component
// whose lower-cased form is listed in vendoredLibNames. An empty path is never
// skipped.
func shouldSkipBucket(path string) bool {
	if path == "" {
		return false
	}

	return slices.ContainsFunc(strings.Split(path, "/"), func(segment string) bool {
		_, vendored := vendoredLibNames[strings.ToLower(segment)]
		return vendored
	})
}
79+
80+
// processBuckets creates buckets in the same process as the indexer.
81+
func processBuckets(fileHashes []*pb.FileHash) ([]*models.RepoIndexBucket, error) {
82+
buckets := make([][][]byte, bucketSize)
83+
84+
for _, fh := range fileHashes {
85+
if len(fh.GetHash()) < 2 {
86+
continue
87+
}
88+
if shouldSkipBucket(fh.GetFilePath()) {
89+
continue
90+
}
91+
92+
idx := binary.BigEndian.Uint16(fh.GetHash()[0:2]) % bucketSize
93+
buckets[idx] = append(buckets[idx], fh.GetHash())
94+
}
95+
96+
results := make([]*models.RepoIndexBucket, bucketSize)
97+
for i := range bucketSize {
98+
bucket := buckets[i]
99+
// Sort hashes lexicographically to produce deterministic bucket hashes
100+
slices.SortFunc(bucket, bytes.Compare)
101+
102+
hasher := md5.New() //nolint:gosec
103+
for _, h := range bucket {
104+
_, err := hasher.Write(h)
105+
if err != nil {
106+
return nil, fmt.Errorf("failed to write hash to hasher: %w", err)
107+
}
108+
}
109+
110+
results[i] = &models.RepoIndexBucket{
111+
NodeHash: hasher.Sum(nil),
112+
FilesContained: len(bucket),
113+
}
114+
}
115+
116+
return results, nil
117+
}
118+
119+
func estimateDiff(numBucketChange int, fileCountDiff int) int {
120+
// Guard against potential out-of-bound values to prevent Log(<=0) or NaN
121+
if numBucketChange < 0 {
122+
numBucketChange = 0
123+
}
124+
if numBucketChange >= bucketSize {
125+
numBucketChange = bucketSize - 1
126+
}
127+
estimate := float64(bucketSize) * math.Log(float64(bucketSize+1)/float64(bucketSize-numBucketChange+1))
128+
129+
// Use RoundToEven to match Python's round() behavior for 0.5,
130+
// ensuring identical score calculations and preventing filtering discrepancies.
131+
return fileCountDiff + int(math.RoundToEven(math.Max(estimate-float64(fileCountDiff), 0)/2))
132+
}
133+
134+
// candidateRegex matches the pieces of a tag that are kept as version
// components: runs of digits, or one of the pre-release keywords
// (rc/alpha/beta/preview, case-insensitive) with optional trailing digits.
var candidateRegex = regexp.MustCompile(`(?i:\d+|rc\d*|alpha\d*|beta\d*|preview\d*)`)

// isWordChar matches a string that is exactly one ASCII letter.
var isWordChar = regexp.MustCompile(`(?i)^[a-z]$`)

// normalizeTag reduces a tag to its version components joined by "-".
//
// A leading "." is first prefixed with "0". Components are the matches of
// candidateRegex, except that a keyword match immediately preceded by a letter
// is discarded (so "rc" inside a longer word is not treated as a pre-release
// marker). If no component survives, the (possibly "0"-prefixed) input is
// returned unchanged.
//
// NOTE(review): when a keyword match is discarded, any digits it consumed are
// discarded with it (e.g. "src1.0" yields "0"); confirm this is the intended
// behaviour relative to the reference implementation.
func normalizeTag(v string) string {
	if strings.HasPrefix(v, ".") {
		v = "0" + v
	}

	var parts []string
	for _, span := range candidateRegex.FindAllStringIndex(v, -1) {
		from, to := span[0], span[1]
		token := v[from:to]

		lead := strings.ToLower(token[:1])
		isKeyword := lead >= "a" && lead <= "z"
		if isKeyword && from > 0 && isWordChar.MatchString(v[from-1:from]) {
			// Keyword is the tail of a longer word; not a version component.
			continue
		}

		parts = append(parts, token)
	}

	if len(parts) == 0 {
		return v
	}

	return strings.Join(parts, "-")
}
167+
168+
func (s *server) DetermineVersion(ctx context.Context, req *pb.DetermineVersionParameters) (*pb.VersionMatchList, error) {
169+
query := req.GetQuery()
170+
if query == nil {
171+
return &pb.VersionMatchList{}, nil
172+
}
173+
174+
logger.InfoContext(ctx, "DetermineVersion called", "hashes_count", len(query.GetFileHashes()))
175+
176+
// Filter and prepare file hashes
177+
var validHashes []*pb.FileHash
178+
for _, fh := range query.GetFileHashes() {
179+
if fh.Hash != nil && len(fh.GetHash()) <= 100 {
180+
validHashes = append(validHashes, fh)
181+
}
182+
}
183+
184+
buckets, err := processBuckets(validHashes)
185+
if err != nil {
186+
logger.ErrorContext(ctx, "failed to process buckets", slog.Any("error", err))
187+
return nil, status.Error(codes.Internal, "failed to process buckets")
188+
}
189+
190+
nodeHashes := make([][]byte, 0, len(buckets))
191+
nonEmtpyBucketIndices := make([]int, 0, len(buckets))
192+
var emptyBucketBitmap [bucketSize / 8]byte // 64 bytes for 512 bits
193+
194+
for i, b := range buckets {
195+
if b.FilesContained == 0 {
196+
continue
197+
}
198+
nodeHashes = append(nodeHashes, b.NodeHash)
199+
nonEmtpyBucketIndices = append(nonEmtpyBucketIndices, i)
200+
201+
// Set bit in emptyBucketBitmap (little-endian byte order bit allocation)
202+
emptyBucketBitmap[i/8] |= 1 << (i % 8)
203+
}
204+
205+
// Query Datastore via repository
206+
matchedBucketsByHash, err := s.repoIndexStore.QueryBuckets(ctx, nodeHashes)
207+
if err != nil {
208+
logger.ErrorContext(ctx, "Failed to query RepoIndexBuckets", "error", err)
209+
return nil, status.Error(codes.Internal, "failed to query repo index buckets")
210+
}
211+
212+
fileMatchCount := make(map[string]int)
213+
bucketMatchCount := make(map[string]int)
214+
numSkippedBuckets := 0
215+
skippedFiles := 0
216+
217+
// We need to keep track of which parent IDs we've seen
218+
parentIDsSet := make(map[string]struct{})
219+
220+
for _, idx := range nonEmtpyBucketIndices {
221+
b := buckets[idx]
222+
hexHash := hex.EncodeToString(b.NodeHash)
223+
matches := matchedBucketsByHash[hexHash]
224+
225+
if len(matches) == models.MaxMatchesToCare {
226+
numSkippedBuckets++
227+
skippedFiles += b.FilesContained
228+
229+
continue
230+
}
231+
232+
for _, match := range matches {
233+
if match.ParentID == "" {
234+
continue
235+
}
236+
parentIDsSet[match.ParentID] = struct{}{}
237+
fileMatchCount[match.ParentID] += match.FilesContained
238+
bucketMatchCount[match.ParentID]++
239+
}
240+
}
241+
242+
// Add skipped files back to the match count of all seen parent IDs
243+
for parentID := range parentIDsSet {
244+
fileMatchCount[parentID] += skippedFiles
245+
}
246+
247+
// Sort parent IDs by bucket match count descending, and limit to maxDetermineVerResultsToReturn
248+
parentIDs := make([]string, 0, len(parentIDsSet))
249+
for id := range parentIDsSet {
250+
parentIDs = append(parentIDs, id)
251+
}
252+
slices.SortFunc(parentIDs, func(a, b string) int {
253+
return -cmp.Compare(bucketMatchCount[a], bucketMatchCount[b])
254+
})
255+
256+
if len(parentIDs) > maxCandidatesToFetch {
257+
parentIDs = parentIDs[:maxCandidatesToFetch]
258+
}
259+
260+
repoIndexes, err := s.repoIndexStore.GetRepoIndexes(ctx, parentIDs)
261+
if err != nil {
262+
logger.ErrorContext(ctx, "Failed to get RepoIndexes", "error", err)
263+
return nil, status.Error(codes.Internal, "failed to get repo indexes")
264+
}
265+
266+
matches := make([]*pb.VersionMatch, 0, len(repoIndexes))
267+
queryFileCount := len(query.GetFileHashes())
268+
269+
// Inverted empty bucket bitmap of the query
270+
// (bitwise NOT on the query bitmap, meaning 1 represents empty in query)
271+
var invertedEmptyBucketBitmap [bucketSize / 8]byte
272+
for i := range emptyBucketBitmap {
273+
invertedEmptyBucketBitmap[i] = ^emptyBucketBitmap[i]
274+
}
275+
276+
for _, idx := range repoIndexes {
277+
if idx == nil || len(idx.EmptyBucketBitmap) < bucketSize/8 {
278+
continue
279+
}
280+
281+
// Calculate missed empty buckets
282+
// We are looking to find cases where the bitmap generated by the user query
283+
// gives a 0 (meaning empty in query, so 1 in invertedEmptyBucketBitmap),
284+
// but the bitmap of the repo is a 1 (meaning non-empty in repo).
285+
missedEmptyBuckets := 0
286+
for i := range bucketSize / 8 {
287+
// bitwise AND of inverted query bitmap and repo bitmap
288+
missed := invertedEmptyBucketBitmap[i] & idx.EmptyBucketBitmap[i]
289+
missedEmptyBuckets += bits.OnesCount8(missed)
290+
}
291+
292+
// Count empty buckets in user query
293+
emptyBucketCount := 0
294+
for i := range bucketSize / 8 {
295+
emptyBucketCount += bits.OnesCount8(invertedEmptyBucketBitmap[i])
296+
}
297+
298+
numBucketChange := bucketSize - bucketMatchCount[idx.ID] - emptyBucketCount + missedEmptyBuckets - numSkippedBuckets
299+
fileCountDiff := int(math.Abs(float64(idx.FileCount - queryFileCount)))
300+
301+
estimatedDiffFiles := estimateDiff(numBucketChange, fileCountDiff)
302+
maxFiles := int(math.Max(float64(idx.FileCount), float64(queryFileCount)))
303+
if maxFiles == 0 {
304+
continue
305+
}
306+
307+
score := float64(maxFiles-estimatedDiffFiles) / float64(maxFiles)
308+
if score < minScoreCutoff {
309+
continue
310+
}
311+
312+
version := normalizeTag(strings.TrimPrefix(idx.Tag, tagPrefix))
313+
version = strings.ReplaceAll(version, "-", ".")
314+
315+
if version == "" {
316+
continue
317+
}
318+
319+
matches = append(matches, &pb.VersionMatch{
320+
Score: score,
321+
MinimumFileMatches: int64(fileMatchCount[idx.ID]),
322+
EstimatedDiffFiles: int64(estimatedDiffFiles),
323+
RepoInfo: &pb.VersionRepositoryInformation{
324+
Type: pb.VersionRepositoryInformation_GIT,
325+
Address: idx.RepoAddr,
326+
Commit: hex.EncodeToString(idx.Commit),
327+
Tag: strings.TrimPrefix(idx.Tag, tagPrefix),
328+
Version: version,
329+
},
330+
})
331+
}
332+
333+
// Sort matches descending by score
334+
slices.SortFunc(matches, func(a, b *pb.VersionMatch) int {
335+
return -cmp.Compare(a.GetScore(), b.GetScore())
336+
})
337+
338+
// Limit results
339+
if len(matches) > maxDetermineVerResultsToReturn {
340+
matches = matches[:maxDetermineVerResultsToReturn]
341+
}
342+
343+
return &pb.VersionMatchList{Matches: matches}, nil
344+
}

0 commit comments

Comments
 (0)