Skip to content

Commit ad41edf

Browse files
feat: add force_update support for git checkouts (#4730)
This PR introduces a `force_update` mechanism for git repository checkouts. Specifically: 1. `osv/repos.py` functions `clone`, `clone_with_retries`, `_use_existing_checkout`, and `ensure_updated_checkout` now accept a `force_update` argument. 2. The `importer` worker now uses `force_update=True` when checking out source repositories. 3. The `gitter` service (Go) now parses `force-update=true` and bypasses its 1-hour cache when this flag is present. 4. Local `FETCH_CACHE` in `osv/repos.py` is also bypassed when `force_update=True`. --- *PR created automatically by Jules for task [6408382110995720439](https://jules.google.com/task/6408382110995720439) started by @another-rex* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 31e4d8d commit ad41edf

3 files changed

Lines changed: 46 additions & 16 deletions

File tree

gcp/workers/importer/importer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,8 @@ def checkout(self, source_repo):
297297
source_repo.repo_url,
298298
os.path.join(self._sources_dir, source_repo.name),
299299
git_callbacks=self._git_callbacks(source_repo),
300-
branch=source_repo.repo_branch)
300+
branch=source_repo.repo_branch,
301+
force_update=True)
301302

302303
def _vuln_ids_from_gcs_blob(self, client: storage.Client,
303304
source_repo: osv.SourceRepository,

go/cmd/gitter/gitter.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func isAuthError(err error) bool {
103103
(strings.Contains(strings.ToLower(errString), "repository") && strings.Contains(strings.ToLower(errString), "not found"))
104104
}
105105

106-
func fetchBlob(ctx context.Context, url string) ([]byte, error) {
106+
func fetchBlob(ctx context.Context, url string, forceUpdate bool) ([]byte, error) {
107107
repoDirName := getRepoDirName(url)
108108
repoPath := path.Join(gitStorePath, repoDirName)
109109
archivePath := repoPath + ".zst"
@@ -113,7 +113,7 @@ func fetchBlob(ctx context.Context, url string) ([]byte, error) {
113113
lastFetchMu.Unlock()
114114

115115
// Check if we need to fetch
116-
if !ok || time.Since(accessTime) > fetchTimeout {
116+
if forceUpdate || !ok || time.Since(accessTime) > fetchTimeout {
117117
logger.Info("Fetching git blob", slog.String("url", url), slog.Duration("sinceAccessTime", time.Since(accessTime)))
118118
if _, err := os.Stat(path.Join(repoPath, ".git")); os.IsNotExist(err) {
119119
// Clone
@@ -240,8 +240,9 @@ func gitHandler(w http.ResponseWriter, r *http.Request) {
240240
http.Error(w, "Missing url parameter", http.StatusBadRequest)
241241
return
242242
}
243+
forceUpdate := r.URL.Query().Get("force-update") == "true"
243244

244-
logger.Info("Received request", slog.String("url", url), slog.String("remoteAddr", r.RemoteAddr))
245+
logger.Info("Received request", slog.String("url", url), slog.Bool("forceUpdate", forceUpdate), slog.String("remoteAddr", r.RemoteAddr))
245246
// If request came from a local ip, don't do the check
246247
if !isLocalRequest(r) {
247248
// Check if url starts with protocols: http(s)://, git://, ssh://, (s)ftp://
@@ -251,9 +252,15 @@ func gitHandler(w http.ResponseWriter, r *http.Request) {
251252
}
252253
}
253254

255+
// Keep the key as the url regardless of forceUpdate.
256+
// Occasionally this could be problematic: if a non-forced request for the
257+
// same url is already in flight, no force update will happen.
258+
// That is highly unlikely in our use case, as importer only queries
259+
// the repo once, and always with force update.
260+
// This is a tradeoff for simplicity to avoid having to set up locks per repo.
254261
//nolint:contextcheck // I can't change singleflight's interface
255262
fileData, err, _ := g.Do(url, func() (any, error) {
256-
return fetchBlob(r.Context(), url)
263+
return fetchBlob(r.Context(), url, forceUpdate)
257264
})
258265

259266
if err != nil {

osv/repos.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -123,19 +123,28 @@ def open_repo(checkout_dir):
123123
return repo
124124

125125

126-
def clone(git_url, checkout_dir, git_callbacks=None, blobless=False):
126+
def clone(git_url,
127+
checkout_dir,
128+
git_callbacks=None,
129+
blobless=False,
130+
force_update=False):
127131
"""Perform a clone."""
128132
# Don't use Gitter for the oss-fuzz-vulns repo because it requires auth
129133
logging.info('Cloning %s to %s.', git_url, checkout_dir)
130134
if GITTER_HOST and git_url != 'ssh://github.com/google/oss-fuzz-vulns':
131135
try:
132136
os.makedirs(checkout_dir, exist_ok=True)
137+
params = {'url': _git_mirror(git_url)}
138+
if force_update:
139+
params['force-update'] = 'true'
140+
141+
# Long timeout duration (1hr) because it could be cloning a large repo
133142
resp = requests.get(
134143
f'{GITTER_HOST}/getgit',
135-
params={'url': _git_mirror(git_url)},
144+
params=params,
136145
stream=True,
137-
timeout=3600
138-
) # Long timeout duration (1hr) because it could be cloning a large repo
146+
timeout=3600,
147+
)
139148
if resp.status_code == 403:
140149
raise RepoInaccessibleError()
141150
if resp.status_code == 400:
@@ -189,13 +198,19 @@ def clone_with_retries(git_url,
189198
checkout_dir,
190199
git_callbacks=None,
191200
branch=None,
192-
blobless=False):
201+
blobless=False,
202+
force_update=False):
193203
"""Clone with retries."""
194204
logging.info('Cloning %s to %s', git_url, checkout_dir)
195205
os.makedirs(checkout_dir, exist_ok=True)
196206
for attempt in range(CLONE_TRIES):
197207
try:
198-
repo = clone(git_url, checkout_dir, git_callbacks, blobless=blobless)
208+
repo = clone(
209+
git_url,
210+
checkout_dir,
211+
git_callbacks,
212+
blobless=blobless,
213+
force_update=force_update)
199214
repo.cache = {}
200215
if branch:
201216
_checkout_branch(repo, branch)
@@ -216,7 +231,8 @@ def clone_with_retries(git_url,
216231
def _use_existing_checkout(git_url,
217232
checkout_dir,
218233
git_callbacks=None,
219-
branch=None):
234+
branch=None,
235+
force_update=False):
220236
"""Update and use existing checkout."""
221237
repo = open_repo(checkout_dir)
222238
repo.cache = {}
@@ -235,7 +251,7 @@ def _use_existing_checkout(git_url,
235251
raise NoBranchError('Branch "%s" not found in repo "%s"' %
236252
(branch, git_url)) from e
237253

238-
reset_repo(repo, git_callbacks)
254+
reset_repo(repo, git_callbacks, force=force_update)
239255
logging.info('Using existing checkout at %s', checkout_dir)
240256
return repo
241257

@@ -244,13 +260,18 @@ def ensure_updated_checkout(git_url,
244260
checkout_dir,
245261
git_callbacks=None,
246262
branch=None,
247-
blobless=False):
263+
blobless=False,
264+
force_update=False):
248265
"""Ensure updated checkout."""
249266
if os.path.exists(checkout_dir):
250267
# Already exists, reset and checkout latest revision.
251268
try:
252269
return _use_existing_checkout(
253-
git_url, checkout_dir, git_callbacks=git_callbacks, branch=branch)
270+
git_url,
271+
checkout_dir,
272+
git_callbacks=git_callbacks,
273+
branch=branch,
274+
force_update=force_update)
254275
except Exception as e:
255276
# Failed to re-use existing checkout. Delete it and start over.
256277
err_str = str(e)
@@ -265,7 +286,8 @@ def ensure_updated_checkout(git_url,
265286
checkout_dir,
266287
git_callbacks=git_callbacks,
267288
branch=branch,
268-
blobless=blobless)
289+
blobless=blobless,
290+
force_update=force_update)
269291
logging.info('Repo now at: %s', repo.head.peel().message)
270292
return repo
271293

0 commit comments

Comments
 (0)