3939
4040import git
4141import requests
42+ import scanpipe
4243from commoncode import command
4344from commoncode .hash import multi_checksums
4445from commoncode .text import python_safe_name
4748
4849from scanpipe .models import DownloadedPackage
4950from scanpipe .models import PackageArchive
51+ from io import BytesIO
5052
5153logger = logging .getLogger ("scanpipe.pipes" )
5254
@@ -364,7 +366,7 @@ def fetch_git_repo(url, to=None):
364366 )
365367
366368
367- def store_package_archive (project , url = None , file_path = None ):
369+ def store_package_archive (project , url = None , file_path = None , pipeline_name = None ):
368370 """
369371 Store a package in PackageArchive and link it to DownloadedPackage.
370372
@@ -387,39 +389,55 @@ def store_package_archive(project, url=None, file_path=None):
387389 logger .info ("Package storage disabled (ENABLE_PACKAGE_STORAGE=False)" )
388390 return None
389391
390- if not file_path :
391- input_files = project .input_files .all ()
392- if not input_files :
393- logger .info ("No input files found for project" )
394- return None
395- file_path = input_files [0 ].path
396- logger .info (f"Using first input file: { file_path } " )
397-
398- file_path = str (file_path )
399- logger .info (f"Processing file: { file_path } " )
400-
401- try :
402- with open (file_path , "rb" ) as f :
403- checksum = hashlib .sha256 (f .read ()).hexdigest ()
404- logger .info (f"Calculated SHA256: { checksum } " )
405- except FileNotFoundError as e :
406- logger .error (f"File not found: { file_path } , error: { e } " )
392+ if not file_path and not url :
393+ logger .error ("Either file_path or url must be provided" )
407394 return None
395+
396+ if url :
397+ existing = DownloadedPackage .objects .filter (project = project , url = url ).first ()
398+ if existing and not should_rescan (existing , pipeline_name ):
399+ logger .info (f"Using existing package: { existing .package_archive .package_file .name } " )
400+ return existing
401+
402+ if file_path :
403+ file_path = str (file_path )
404+ if not Path (file_path ).exists ():
405+ logger .error (f"File not found: { file_path } " )
406+ return None
407+ with open (file_path , "rb" ) as f :
408+ content = f .read ()
409+ filename = os .path .basename (file_path )
410+ else :
411+ try :
412+ response = requests .get (url , stream = True )
413+ response .raise_for_status ()
414+ content = response .content
415+ filename = os .path .basename (url .split ("?" )[0 ])
416+ except requests .RequestException as e :
417+ logger .error (f"Failed to download { url } : { e } " )
418+ return None
419+
420+ checksum = hashlib .sha256 (content ).hexdigest ()
421+ logger .info (f"Calculated SHA256: { checksum } " )
422+
423+ existing_archive = PackageArchive .objects .filter (checksum_sha256 = checksum ).first ()
424+ if existing_archive :
425+ existing = DownloadedPackage .objects .filter (
426+ project = project , package_archive = existing_archive
427+ ).first ()
428+ if existing and not should_rescan (existing , pipeline_name ):
429+ logger .info (f"Using existing package: { existing_archive .package_file .name } " )
430+ return existing
408431
409432 try :
410- archive , created = PackageArchive . objects . get_or_create (
433+ archive = PackageArchive (
411434 checksum_sha256 = checksum ,
412- defaults = {
413- "storage_path" : file_path ,
414- "package_file" : File (
415- open (file_path , "rb" ), name = os .path .basename (file_path )
416- ),
417- },
418- )
419- logger .info (
420- f"PackageArchive { 'created' if created else 'retrieved' } :"
421- "{archive.checksum_sha256}"
435+ size = len (content ),
422436 )
437+ with open (file_path , "rb" ) if file_path else BytesIO (content ) as f :
438+ archive .package_file .save (filename , File (f ), save = False )
439+ archive .save ()
440+ logger .info (f"Created PackageArchive: { archive .checksum_sha256 } " )
423441 except Exception as e :
424442 logger .error (f"Error creating PackageArchive: { e } " )
425443 return None
@@ -428,15 +446,23 @@ def store_package_archive(project, url=None, file_path=None):
428446 dp = DownloadedPackage .objects .create (
429447 project = project ,
430448 url = url or "" ,
431- filename = os . path . basename ( file_path ) ,
449+ filename = filename ,
432450 package_archive = archive ,
451+ scancode_version = scanpipe .__version__ ,
452+ pipeline_name = pipeline_name or "" ,
433453 )
434454 logger .info (f"DownloadedPackage created: { dp .url } , { dp .filename } " )
435455 return dp
436456 except Exception as e :
437457 logger .error (f"Error creating DownloadedPackage: { e } " )
438458 return None
439459
def should_rescan(package, pipeline_name):
    """
    Return True when the stored ``package`` must be processed again.

    A rescan is needed when either:

    - the ``scancode_version`` recorded on ``package`` differs from the
      running ``scanpipe.__version__``, or
    - a ``pipeline_name`` is provided and it differs from the
      ``pipeline_name`` recorded on ``package``.

    ``package`` is any object exposing ``scancode_version`` and
    ``pipeline_name`` attributes. A falsy ``pipeline_name`` (``None`` or an
    empty string) disables the pipeline comparison.

    Always return a real ``bool``, never the raw ``and``/``or`` operand.
    """
    if package.scancode_version != scanpipe.__version__:
        return True

    # Without a pipeline name there is nothing to compare against, so only the
    # version check above can trigger a rescan.
    return bool(pipeline_name) and package.pipeline_name != pipeline_name
440466
441467SCHEME_TO_FETCHER_MAPPING = {
442468 "http" : fetch_http ,
0 commit comments