diff --git a/gcp/workers/importer/importer.py b/gcp/workers/importer/importer.py index 55824172287..6385f193f09 100755 --- a/gcp/workers/importer/importer.py +++ b/gcp/workers/importer/importer.py @@ -297,7 +297,8 @@ def checkout(self, source_repo): source_repo.repo_url, os.path.join(self._sources_dir, source_repo.name), git_callbacks=self._git_callbacks(source_repo), - branch=source_repo.repo_branch) + branch=source_repo.repo_branch, + force_update=True) def _vuln_ids_from_gcs_blob(self, client: storage.Client, source_repo: osv.SourceRepository, diff --git a/go/cmd/gitter/gitter.go b/go/cmd/gitter/gitter.go index a0ba1121743..79eba120170 100644 --- a/go/cmd/gitter/gitter.go +++ b/go/cmd/gitter/gitter.go @@ -103,7 +103,7 @@ func isAuthError(err error) bool { (strings.Contains(strings.ToLower(errString), "repository") && strings.Contains(strings.ToLower(errString), "not found")) } -func fetchBlob(ctx context.Context, url string) ([]byte, error) { +func fetchBlob(ctx context.Context, url string, forceUpdate bool) ([]byte, error) { repoDirName := getRepoDirName(url) repoPath := path.Join(gitStorePath, repoDirName) archivePath := repoPath + ".zst" @@ -113,7 +113,7 @@ func fetchBlob(ctx context.Context, url string) ([]byte, error) { lastFetchMu.Unlock() // Check if we need to fetch - if !ok || time.Since(accessTime) > fetchTimeout { + if forceUpdate || !ok || time.Since(accessTime) > fetchTimeout { logger.Info("Fetching git blob", slog.String("url", url), slog.Duration("sinceAccessTime", time.Since(accessTime))) if _, err := os.Stat(path.Join(repoPath, ".git")); os.IsNotExist(err) { // Clone @@ -240,8 +240,9 @@ func gitHandler(w http.ResponseWriter, r *http.Request) { http.Error(w, "Missing url parameter", http.StatusBadRequest) return } + forceUpdate := r.URL.Query().Get("force-update") == "true" - logger.Info("Received request", slog.String("url", url), slog.String("remoteAddr", r.RemoteAddr)) + logger.Info("Received request", slog.String("url", url), slog.Bool("forceUpdate", forceUpdate), slog.String("remoteAddr", r.RemoteAddr)) // If request came from a local ip, don't do the check if !isLocalRequest(r) { // Check if url starts with protocols: http(s)://, git://, ssh://, (s)ftp:// @@ -251,9 +252,15 @@ func gitHandler(w http.ResponseWriter, r *http.Request) { } } + // Keep the key as the url regardless of forceUpdate. + // Occasionally this could be problematic if an existing unforce updated + // query is already inplace, no force update will happen. + // That is highly unlikely in our use case, as importer only queries + // the repo once, and always with force update. + // This is a tradeoff for simplicity to avoid having to setup locks per repo. //nolint:contextcheck // I can't change singleflight's interface fileData, err, _ := g.Do(url, func() (any, error) { - return fetchBlob(r.Context(), url) + return fetchBlob(r.Context(), url, forceUpdate) }) if err != nil { diff --git a/osv/repos.py b/osv/repos.py index 03a29e159c4..aa116d13bd4 100644 --- a/osv/repos.py +++ b/osv/repos.py @@ -123,19 +123,28 @@ def open_repo(checkout_dir): return repo -def clone(git_url, checkout_dir, git_callbacks=None, blobless=False): +def clone(git_url, + checkout_dir, + git_callbacks=None, + blobless=False, + force_update=False): """Perform a clone.""" # Don't user Gitter for oss-fuzz-vulns repo because it requires auth logging.info('Cloning %s to %s.', git_url, checkout_dir) if GITTER_HOST and git_url != 'ssh://github.com/google/oss-fuzz-vulns': try: os.makedirs(checkout_dir, exist_ok=True) + params = {'url': _git_mirror(git_url)} + if force_update: + params['force-update'] = 'true' + + # Long timeout duration (1hr) because it could be cloning a large repo resp = requests.get( f'{GITTER_HOST}/getgit', - params={'url': _git_mirror(git_url)}, + params=params, stream=True, - timeout=3600 - ) # Long timeout duration (1hr) because it could be cloning a large repo + timeout=3600, + ) if resp.status_code == 403: raise RepoInaccessibleError() if resp.status_code == 400: @@ -189,13 +198,19 @@ def clone_with_retries(git_url, checkout_dir, git_callbacks=None, branch=None, - blobless=False): + blobless=False, + force_update=False): """Clone with retries.""" logging.info('Cloning %s to %s', git_url, checkout_dir) os.makedirs(checkout_dir, exist_ok=True) for attempt in range(CLONE_TRIES): try: - repo = clone(git_url, checkout_dir, git_callbacks, blobless=blobless) + repo = clone( + git_url, + checkout_dir, + git_callbacks, + blobless=blobless, + force_update=force_update) repo.cache = {} if branch: _checkout_branch(repo, branch) @@ -216,7 +231,8 @@ def clone_with_retries(git_url, def _use_existing_checkout(git_url, checkout_dir, git_callbacks=None, - branch=None): + branch=None, + force_update=False): """Update and use existing checkout.""" repo = open_repo(checkout_dir) repo.cache = {} @@ -235,7 +251,7 @@ def _use_existing_checkout(git_url, raise NoBranchError('Branch "%s" not found in repo "%s"' % (branch, git_url)) from e - reset_repo(repo, git_callbacks) + reset_repo(repo, git_callbacks, force=force_update) logging.info('Using existing checkout at %s', checkout_dir) return repo @@ -244,13 +260,18 @@ def ensure_updated_checkout(git_url, checkout_dir, git_callbacks=None, branch=None, - blobless=False): + blobless=False, + force_update=False): """Ensure updated checkout.""" if os.path.exists(checkout_dir): # Already exists, reset and checkout latest revision. try: return _use_existing_checkout( - git_url, checkout_dir, git_callbacks=git_callbacks, branch=branch) + git_url, + checkout_dir, + git_callbacks=git_callbacks, + branch=branch, + force_update=force_update) except Exception as e: # Failed to re-use existing checkout. Delete it and start over. err_str = str(e) @@ -265,7 +286,8 @@ def ensure_updated_checkout(git_url, checkout_dir, git_callbacks=git_callbacks, branch=branch, - blobless=blobless) + blobless=blobless, + force_update=force_update) logging.info('Repo now at: %s', repo.head.peel().message) return repo