@@ -92,10 +92,7 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
9292 return cls (type = AddressType .THREAD , value = (a .process .ppid , a .process .pid , a .tid ))
9393
9494 elif isinstance (a , capa .features .address .DynamicCallAddress ):
95- return cls (
96- type = AddressType .CALL ,
97- value = (a .thread .process .ppid , a .thread .process .pid , a .thread .tid , a .id ),
98- )
95+ return cls (type = AddressType .CALL , value = (a .thread .process .ppid , a .thread .process .pid , a .thread .tid , a .id ))
9996
10097 elif a == capa .features .address .NO_ADDRESS or isinstance (a , capa .features .address ._NoAddress ):
10198 return cls (type = AddressType .NO_ADDRESS , value = None )
@@ -346,9 +343,14 @@ class Freeze(BaseModel):
346343 model_config = ConfigDict (populate_by_name = True )
347344
348345
349- def dumps_static (extractor : StaticFeatureExtractor ) -> str :
346+ def dumps_static (extractor : StaticFeatureExtractor , reproducible : bool = False ) -> str :
350347 """
351348 serialize the given extractor to a string
349+
350+ When `reproducible` is true, the freeze's dynamic header metadata (e.g. the
351+ embedded capa version) is zeroed out so that output is identical across
352+ capa versions for a given extractor. This is used by the feature snapshot
353+ tests to keep fixtures stable across version bumps.
352354 """
353355 global_features : list [GlobalFeature ] = []
354356 for feature , _ in extractor .extract_global_features ():
@@ -357,6 +359,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
357359 feature = feature_from_capa (feature ),
358360 )
359361 )
362+ global_features .sort (key = lambda gf : gf .feature .model_dump_json ())
360363
361364 file_features : list [FileFeature ] = []
362365 for feature , address in extractor .extract_file_features ():
@@ -366,6 +369,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
366369 address = Address .from_capa (address ),
367370 )
368371 )
372+ file_features .sort (key = lambda ff : (ff .address , ff .feature .model_dump_json ()))
369373
370374 function_features : list [FunctionFeatures ] = []
371375 for f in extractor .get_functions ():
@@ -378,6 +382,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
378382 )
379383 for feature , addr in extractor .extract_function_features (f )
380384 ]
385+ ffeatures .sort (key = lambda ff : (ff .address , ff .feature .model_dump_json ()))
381386
382387 basic_blocks = []
383388 for bb in extractor .get_basic_blocks (f ):
@@ -390,6 +395,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
390395 )
391396 for feature , addr in extractor .extract_basic_block_features (f , bb )
392397 ]
398+ bbfeatures .sort (key = lambda bf : (bf .address , bf .feature .model_dump_json ()))
393399
394400 instructions = []
395401 for insn in extractor .get_instructions (f , bb ):
@@ -402,6 +408,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
402408 )
403409 for feature , addr in extractor .extract_insn_features (f , bb , insn )
404410 ]
411+ ifeatures .sort (key = lambda i : (i .address , i .feature .model_dump_json ()))
405412
406413 instructions .append (
407414 InstructionFeatures (
@@ -410,6 +417,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
410417 )
411418 )
412419
420+ instructions .sort (key = lambda i : i .address )
413421 basic_blocks .append (
414422 BasicBlockFeatures (
415423 address = bbaddr ,
@@ -418,6 +426,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
418426 )
419427 )
420428
429+ basic_blocks .sort (key = lambda bb : bb .address )
421430 function_features .append (
422431 FunctionFeatures (
423432 address = faddr ,
@@ -426,28 +435,33 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
426435 )
427436 )
428437
438+ function_features .sort (key = lambda ff : ff .address )
439+
429440 features = StaticFeatures (
430441 global_ = global_features , # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
431442 file = tuple (file_features ),
432443 functions = tuple (function_features ),
433444 )
434445
446+ extractor_version = "" if reproducible else capa .version .__version__
435447 freeze = Freeze (
436448 version = CURRENT_VERSION ,
437449 base_address = Address .from_capa (extractor .get_base_address ()), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
438450 sample_hashes = extractor .get_sample_hashes (),
439451 flavor = "static" ,
440- extractor = Extractor (name = extractor .__class__ .__name__ ),
452+ extractor = Extractor (name = extractor .__class__ .__name__ , version = extractor_version ),
441453 features = features ,
442454 )
443455 # type checkers are unable to recognise `base_address` as an argument due to alias
444456
445457 return freeze .model_dump_json ()
446458
447459
448- def dumps_dynamic (extractor : DynamicFeatureExtractor ) -> str :
460+ def dumps_dynamic (extractor : DynamicFeatureExtractor , reproducible : bool = False ) -> str :
449461 """
450462 serialize the given extractor to a string
463+
464+ See `dumps_static` for `reproducible`.
451465 """
452466 global_features : list [GlobalFeature ] = []
453467 for feature , _ in extractor .extract_global_features ():
@@ -456,6 +470,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
456470 feature = feature_from_capa (feature ),
457471 )
458472 )
473+ global_features .sort (key = lambda gf : gf .feature .model_dump_json ())
459474
460475 file_features : list [FileFeature ] = []
461476 for feature , address in extractor .extract_file_features ():
@@ -465,6 +480,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
465480 address = Address .from_capa (address ),
466481 )
467482 )
483+ file_features .sort (key = lambda ff : (ff .address , ff .feature .model_dump_json ()))
468484
469485 process_features : list [ProcessFeatures ] = []
470486 for p in extractor .get_processes ():
@@ -478,6 +494,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
478494 )
479495 for feature , addr in extractor .extract_process_features (p )
480496 ]
497+ pfeatures .sort (key = lambda pf : (pf .address , pf .feature .model_dump_json ()))
481498
482499 threads = []
483500 for t in extractor .get_threads (p ):
@@ -490,6 +507,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
490507 )
491508 for feature , addr in extractor .extract_thread_features (p , t )
492509 ]
510+ tfeatures .sort (key = lambda tf : (tf .address , tf .feature .model_dump_json ()))
493511
494512 calls = []
495513 for call in extractor .get_calls (p , t ):
@@ -503,6 +521,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
503521 )
504522 for feature , addr in extractor .extract_call_features (p , t , call )
505523 ]
524+ cfeatures .sort (key = lambda cf : (cf .address , cf .feature .model_dump_json ()))
506525
507526 calls .append (
508527 CallFeatures (
@@ -512,6 +531,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
512531 )
513532 )
514533
534+ calls .sort (key = lambda c : c .address )
515535 threads .append (
516536 ThreadFeatures (
517537 address = taddr ,
@@ -520,6 +540,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
520540 )
521541 )
522542
543+ threads .sort (key = lambda t : t .address )
523544 process_features .append (
524545 ProcessFeatures (
525546 address = paddr ,
@@ -529,6 +550,8 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
529550 )
530551 )
531552
553+ process_features .sort (key = lambda pf : pf .address )
554+
532555 features = DynamicFeatures (
533556 global_ = global_features , # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
534557 file = tuple (file_features ),
@@ -539,12 +562,13 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
539562 get_base_addr = getattr (extractor , "get_base_address" , None )
540563 base_addr = get_base_addr () if get_base_addr else capa .features .address .NO_ADDRESS
541564
565+ extractor_version = "" if reproducible else capa .version .__version__
542566 freeze = Freeze (
543567 version = CURRENT_VERSION ,
544568 base_address = Address .from_capa (base_addr ), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
545569 sample_hashes = extractor .get_sample_hashes (),
546570 flavor = "dynamic" ,
547- extractor = Extractor (name = extractor .__class__ .__name__ ),
571+ extractor = Extractor (name = extractor .__class__ .__name__ , version = extractor_version ),
548572 features = features ,
549573 )
550574 # type checkers are unable to recognise `base_address` as an argument due to alias
@@ -627,28 +651,28 @@ def loads_dynamic(s: str) -> DynamicFeatureExtractor:
# fixed 8-byte ASCII header that prefixes every serialized freeze buffer
MAGIC = b"capa0000"
628652
629653
def dumps(extractor: FeatureExtractor, reproducible: bool = False) -> str:
    """
    serialize the given extractor to a string.

    dispatches on the kind of extractor: static extractors are handled by
    `dumps_static` and dynamic extractors by `dumps_dynamic`. `reproducible`
    is forwarded unchanged to whichever serializer is selected.

    raises:
      ValueError: if the extractor is neither a static nor a dynamic extractor.
    """
    if isinstance(extractor, StaticFeatureExtractor):
        return dumps_static(extractor, reproducible=reproducible)

    if isinstance(extractor, DynamicFeatureExtractor):
        return dumps_dynamic(extractor, reproducible=reproducible)

    raise ValueError("Invalid feature extractor")
640664
641665
def dump(extractor: FeatureExtractor, reproducible: bool = False) -> bytes:
    """
    serialize the given extractor to a byte array.

    the result is the `MAGIC` header followed by the zlib-compressed,
    UTF-8 encoded string produced by `dumps`. `reproducible` is passed
    through to `dumps` as-is.
    """
    document = dumps(extractor, reproducible=reproducible)
    compressed = zlib.compress(document.encode("utf-8"))
    return MAGIC + compressed
645669
646670
def is_freeze(buf: bytes) -> bool:
    """return True if the buffer begins with the freeze `MAGIC` header."""
    header = buf[: len(MAGIC)]
    return header == MAGIC
649673
650674
651- def loads (s : str ):
675+ def loads (s : str ) -> FeatureExtractor :
652676 doc = json .loads (s )
653677
654678 if doc ["version" ] != CURRENT_VERSION :
@@ -662,7 +686,7 @@ def loads(s: str):
662686 raise ValueError (f"unsupported freeze format flavor: { doc ['flavor' ]} " )
663687
664688
665- def load (buf : bytes ):
689+ def load (buf : bytes ) -> FeatureExtractor :
666690 """deserialize a set of features (as a NullFeatureExtractor) from a byte array."""
667691 if not is_freeze (buf ):
668692 raise ValueError ("missing magic header" )
@@ -685,6 +709,11 @@ def main(argv=None):
685709 parser = argparse .ArgumentParser (description = "save capa features to a file" )
686710 capa .main .install_common_args (parser , {"input_file" , "format" , "backend" , "os" , "signatures" })
687711 parser .add_argument ("output" , type = str , help = "Path to output file" )
712+ parser .add_argument (
713+ "--reproducible" ,
714+ action = "store_true" ,
715+ help = "zero out dynamic header metadata (e.g. capa version) so output is stable across capa versions" ,
716+ )
688717 args = parser .parse_args (args = argv )
689718
690719 try :
@@ -696,11 +725,43 @@ def main(argv=None):
696725 except capa .main .ShouldExitError as e :
697726 return e .status_code
698727
699- Path (args .output ).write_bytes (dump (extractor ))
728+ output_path = Path (args .output )
729+ output_path .write_bytes (dump (extractor , reproducible = args .reproducible ))
730+
731+ # Log a manifest entry for the feature snapshot tests at INFO level. This
732+ # makes it easy to copy/paste into
733+ # `tests/fixtures/snapshots/features/manifest.json` when adding a new
734+ # fixture or refreshing an existing one.
735+ entry : dict [str , str ] = {
736+ "name" : output_path .stem ,
737+ "sample" : str (args .input_file ),
738+ "freeze" : output_path .name ,
739+ }
740+ if args .format and args .format != "auto" :
741+ entry ["format" ] = args .format
742+ if args .backend and args .backend != "auto" :
743+ entry ["backend" ] = args .backend
744+ if args .os and args .os != "auto" :
745+ entry ["os" ] = args .os
746+ commit = _git_head_commit ()
747+ if commit :
748+ entry ["generated_at_commit" ] = commit
749+ logger .info ("manifest entry: %s" , json .dumps (entry ))
700750
701751 return 0
702752
703753
754+ def _git_head_commit () -> str :
755+ """Return the HEAD commit, or empty string if this isn't a git checkout."""
756+ import subprocess
757+
758+ try :
759+ out = subprocess .check_output (["git" , "rev-parse" , "HEAD" ], stderr = subprocess .DEVNULL )
760+ except (subprocess .CalledProcessError , FileNotFoundError , OSError ):
761+ return ""
762+ return out .decode ("ascii" , errors = "replace" ).strip ()
763+
764+
704765if __name__ == "__main__" :
705766 import sys
706767
0 commit comments