Skip to content

Commit d81ebd0

Browse files
committed
add support to not re-scan packages
Signed-off-by: Varsha U N <varshaun58@gmail.com>
1 parent 5632bf7 commit d81ebd0

2 files changed

Lines changed: 72 additions & 34 deletions

File tree

scanpipe/models.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4427,9 +4427,7 @@ class PackageArchive(UUIDPKModel):
44274427
)
44284428

44294429
class Meta:
4430-
indexes = [
4431-
models.Index(fields=["checksum_sha256"], name="checksum_idx"),
4432-
]
4430+
pass
44334431

44344432
def __str__(self):
44354433
return f"Archive {self.checksum_sha256[:8]} at {
@@ -4478,6 +4476,16 @@ class DownloadedPackage(UUIDPKModel):
44784476
on_delete=models.CASCADE,
44794477
help_text=_("The stored archive file associated with this package."),
44804478
)
4479+
scancode_version = models.CharField(
4480+
max_length=50,
4481+
blank=True,
4482+
help_text=_("ScanCode version used for scanning."),
4483+
)
4484+
pipeline_name = models.CharField(
4485+
max_length=100,
4486+
blank=True,
4487+
help_text=_("Pipeline used to process the package."),
4488+
)
44814489

44824490
class Meta:
44834491
indexes = [
@@ -4489,8 +4497,12 @@ class Meta:
44894497
condition=Q(url__gt=""),
44904498
name="%(app_label)s_%(class)s_unique_url_project",
44914499
),
4500+
models.UniqueConstraint(
4501+
fields=["project", "package_archive"],
4502+
name="%(app_label)s_%(class)s_unique_project_archive",
4503+
),
44924504
]
4493-
4505+
44944506
def __str__(self):
44954507
return f"{self.filename} for project {self.project.name}"
44964508

scanpipe/pipes/fetch.py

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939

4040
import git
4141
import requests
42+
import scanpipe
4243
from commoncode import command
4344
from commoncode.hash import multi_checksums
4445
from commoncode.text import python_safe_name
@@ -47,6 +48,7 @@
4748

4849
from scanpipe.models import DownloadedPackage
4950
from scanpipe.models import PackageArchive
51+
from io import BytesIO
5052

5153
logger = logging.getLogger("scanpipe.pipes")
5254

@@ -364,7 +366,7 @@ def fetch_git_repo(url, to=None):
364366
)
365367

366368

367-
def store_package_archive(project, url=None, file_path=None):
369+
def store_package_archive(project, url=None, file_path=None, pipeline_name=None):
368370
"""
369371
Store a package in PackageArchive and link it to DownloadedPackage.
370372
@@ -387,39 +389,55 @@ def store_package_archive(project, url=None, file_path=None):
387389
logger.info("Package storage disabled (ENABLE_PACKAGE_STORAGE=False)")
388390
return None
389391

390-
if not file_path:
391-
input_files = project.input_files.all()
392-
if not input_files:
393-
logger.info("No input files found for project")
394-
return None
395-
file_path = input_files[0].path
396-
logger.info(f"Using first input file: {file_path}")
397-
398-
file_path = str(file_path)
399-
logger.info(f"Processing file: {file_path}")
400-
401-
try:
402-
with open(file_path, "rb") as f:
403-
checksum = hashlib.sha256(f.read()).hexdigest()
404-
logger.info(f"Calculated SHA256: {checksum}")
405-
except FileNotFoundError as e:
406-
logger.error(f"File not found: {file_path}, error: {e}")
392+
if not file_path and not url:
393+
logger.error("Either file_path or url must be provided")
407394
return None
395+
396+
if url:
397+
existing = DownloadedPackage.objects.filter(project=project, url=url).first()
398+
if existing and not should_rescan(existing, pipeline_name):
399+
logger.info(f"Using existing package: {existing.package_archive.package_file.name}")
400+
return existing
401+
402+
if file_path:
403+
file_path = str(file_path)
404+
if not Path(file_path).exists():
405+
logger.error(f"File not found: {file_path}")
406+
return None
407+
with open(file_path, "rb") as f:
408+
content = f.read()
409+
filename = os.path.basename(file_path)
410+
else:
411+
try:
412+
response = requests.get(url, stream=True)
413+
response.raise_for_status()
414+
content = response.content
415+
filename = os.path.basename(url.split("?")[0])
416+
except requests.RequestException as e:
417+
logger.error(f"Failed to download {url}: {e}")
418+
return None
419+
420+
checksum = hashlib.sha256(content).hexdigest()
421+
logger.info(f"Calculated SHA256: {checksum}")
422+
423+
existing_archive = PackageArchive.objects.filter(checksum_sha256=checksum).first()
424+
if existing_archive:
425+
existing = DownloadedPackage.objects.filter(
426+
project=project, package_archive=existing_archive
427+
).first()
428+
if existing and not should_rescan(existing, pipeline_name):
429+
logger.info(f"Using existing package: {existing_archive.package_file.name}")
430+
return existing
408431

409432
try:
410-
archive, created = PackageArchive.objects.get_or_create(
433+
archive = PackageArchive(
411434
checksum_sha256=checksum,
412-
defaults={
413-
"storage_path": file_path,
414-
"package_file": File(
415-
open(file_path, "rb"), name=os.path.basename(file_path)
416-
),
417-
},
418-
)
419-
logger.info(
420-
f"PackageArchive {'created' if created else 'retrieved'}:"
421-
"{archive.checksum_sha256}"
435+
size=len(content),
422436
)
437+
with open(file_path, "rb") if file_path else BytesIO(content) as f:
438+
archive.package_file.save(filename, File(f), save=False)
439+
archive.save()
440+
logger.info(f"Created PackageArchive: {archive.checksum_sha256}")
423441
except Exception as e:
424442
logger.error(f"Error creating PackageArchive: {e}")
425443
return None
@@ -428,15 +446,23 @@ def store_package_archive(project, url=None, file_path=None):
428446
dp = DownloadedPackage.objects.create(
429447
project=project,
430448
url=url or "",
431-
filename=os.path.basename(file_path),
449+
filename=filename,
432450
package_archive=archive,
451+
scancode_version=scanpipe.__version__,
452+
pipeline_name=pipeline_name or "",
433453
)
434454
logger.info(f"DownloadedPackage created: {dp.url}, {dp.filename}")
435455
return dp
436456
except Exception as e:
437457
logger.error(f"Error creating DownloadedPackage: {e}")
438458
return None
439459

460+
def should_rescan(package, pipeline_name, current_version=None):
    """
    Return True when the stored ``package`` needs to be scanned again.

    A rescan is needed when either:

    - the version recorded on ``package.scancode_version`` differs from
      ``current_version``, or
    - a non-empty ``pipeline_name`` is provided and it differs from the
      ``package.pipeline_name`` recorded on the package.

    ``current_version`` defaults to ``scanpipe.__version__`` when not provided.
    An empty or ``None`` ``pipeline_name`` disables the pipeline comparison, so
    only the version check applies in that case.

    The result is always a strict ``bool``; the operands of the comparison are
    never returned as-is.
    """
    if current_version is None:
        current_version = scanpipe.__version__

    # A version mismatch alone is enough to invalidate the previous result.
    if package.scancode_version != current_version:
        return True

    # Only compare pipelines when the caller actually named one.
    return bool(pipeline_name) and package.pipeline_name != pipeline_name
440466

441467
SCHEME_TO_FETCHER_MAPPING = {
442468
"http": fetch_http,

0 commit comments

Comments
 (0)