@@ -92,10 +92,7 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
9292 return cls (type = AddressType .THREAD , value = (a .process .ppid , a .process .pid , a .tid ))
9393
9494 elif isinstance (a , capa .features .address .DynamicCallAddress ):
95- return cls (
96- type = AddressType .CALL ,
97- value = (a .thread .process .ppid , a .thread .process .pid , a .thread .tid , a .id ),
98- )
95+ return cls (type = AddressType .CALL , value = (a .thread .process .ppid , a .thread .process .pid , a .thread .tid , a .id ))
9996
10097 elif a == capa .features .address .NO_ADDRESS or isinstance (a , capa .features .address ._NoAddress ):
10198 return cls (type = AddressType .NO_ADDRESS , value = None )
@@ -144,17 +141,15 @@ def to_capa(self) -> capa.features.address.Address:
144141 assert isinstance (pid , int )
145142 assert isinstance (tid , int )
146143 return capa .features .address .ThreadAddress (
147- process = capa .features .address .ProcessAddress (ppid = ppid , pid = pid ),
148- tid = tid ,
144+ process = capa .features .address .ProcessAddress (ppid = ppid , pid = pid ), tid = tid
149145 )
150146
151147 elif self .type is AddressType .CALL :
152148 assert isinstance (self .value , tuple )
153149 ppid , pid , tid , id_ = self .value
154150 return capa .features .address .DynamicCallAddress (
155151 thread = capa .features .address .ThreadAddress (
156- process = capa .features .address .ProcessAddress (ppid = ppid , pid = pid ),
157- tid = tid ,
152+ process = capa .features .address .ProcessAddress (ppid = ppid , pid = pid ), tid = tid
158153 ),
159154 id = id_ ,
160155 )
@@ -180,6 +175,20 @@ def __lt__(self, other: "Address") -> bool:
180175 return self .value < other .value # type: ignore
181176
182177
def _addr_sort_key(a: Address) -> tuple:
    """
    build a plain, orderable sort key for an Address.

    the key has the shape `(a.type.value, parts)`, where `parts` is always a
    tuple so that keys sharing the same type compare element-wise:
      - `a.value is None`  -> `()`
      - `a.value` is an int -> `(a.value,)`
      - anything else       -> `tuple(a.value)`

    this key is used in place of `Address.__lt__` when sorting; the note that
    accompanied this helper says `__lt__` reports NO_ADDRESS < NO_ADDRESS as
    true, which violates the strict weak ordering that sorting requires.
    """
    raw = a.value

    parts: tuple
    if raw is None:
        # no payload to compare; the type alone identifies the address.
        parts = ()
    elif isinstance(raw, int):
        # wrap scalars so every key carries a tuple in the second slot.
        parts = (raw,)
    else:
        parts = tuple(raw)

    return (a.type.value, parts)
190+
191+
183192class GlobalFeature (HashableModel ):
184193 feature : Feature
185194
@@ -346,9 +355,14 @@ class Freeze(BaseModel):
346355 model_config = ConfigDict (populate_by_name = True )
347356
348357
349- def dumps_static (extractor : StaticFeatureExtractor ) -> str :
358+ def dumps_static (extractor : StaticFeatureExtractor , * , reproducible : bool = False ) -> str :
350359 """
351360 serialize the given extractor to a string
361+
362+ When `reproducible` is true, the freeze's dynamic header metadata (e.g. the
363+ embedded capa version) is zeroed out so that output is identical across
364+ capa versions for a given extractor. This is used by the feature snapshot
365+ tests to keep fixtures stable across version bumps.
352366 """
353367 global_features : list [GlobalFeature ] = []
354368 for feature , _ in extractor .extract_global_features ():
@@ -357,6 +371,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
357371 feature = feature_from_capa (feature ),
358372 )
359373 )
374+ global_features .sort (key = lambda gf : gf .feature .model_dump_json ())
360375
361376 file_features : list [FileFeature ] = []
362377 for feature , address in extractor .extract_file_features ():
@@ -366,6 +381,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
366381 address = Address .from_capa (address ),
367382 )
368383 )
384+ file_features .sort (key = lambda ff : (_addr_sort_key (ff .address ), ff .feature .model_dump_json ()))
369385
370386 function_features : list [FunctionFeatures ] = []
371387 for f in extractor .get_functions ():
@@ -378,6 +394,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
378394 )
379395 for feature , addr in extractor .extract_function_features (f )
380396 ]
397+ ffeatures .sort (key = lambda ff : (_addr_sort_key (ff .address ), ff .feature .model_dump_json ()))
381398
382399 basic_blocks = []
383400 for bb in extractor .get_basic_blocks (f ):
@@ -390,6 +407,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
390407 )
391408 for feature , addr in extractor .extract_basic_block_features (f , bb )
392409 ]
410+ bbfeatures .sort (key = lambda bf : (_addr_sort_key (bf .address ), bf .feature .model_dump_json ()))
393411
394412 instructions = []
395413 for insn in extractor .get_instructions (f , bb ):
@@ -402,6 +420,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
402420 )
403421 for feature , addr in extractor .extract_insn_features (f , bb , insn )
404422 ]
423+ ifeatures .sort (key = lambda i : (_addr_sort_key (i .address ), i .feature .model_dump_json ()))
405424
406425 instructions .append (
407426 InstructionFeatures (
@@ -410,6 +429,9 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
410429 )
411430 )
412431
432+ # sort by address so regeneration is obviously idempotent regardless of
433+ # any per-extractor iteration quirks.
434+ instructions .sort (key = lambda i : _addr_sort_key (i .address ))
413435 basic_blocks .append (
414436 BasicBlockFeatures (
415437 address = bbaddr ,
@@ -418,6 +440,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
418440 )
419441 )
420442
443+ basic_blocks .sort (key = lambda bb : _addr_sort_key (bb .address ))
421444 function_features .append (
422445 FunctionFeatures (
423446 address = faddr ,
@@ -426,28 +449,33 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
426449 )
427450 )
428451
452+ function_features .sort (key = lambda ff : _addr_sort_key (ff .address ))
453+
429454 features = StaticFeatures (
430455 global_ = global_features , # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
431456 file = tuple (file_features ),
432457 functions = tuple (function_features ),
433458 )
434459
460+ extractor_version = "" if reproducible else capa .version .__version__
435461 freeze = Freeze (
436462 version = CURRENT_VERSION ,
437463 base_address = Address .from_capa (extractor .get_base_address ()), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
438464 sample_hashes = extractor .get_sample_hashes (),
439465 flavor = "static" ,
440- extractor = Extractor (name = extractor .__class__ .__name__ ),
466+ extractor = Extractor (name = extractor .__class__ .__name__ , version = extractor_version ),
441467 features = features ,
442468 )
443469 # type checkers are unable to recognise `base_address` as an argument due to alias
444470
445471 return freeze .model_dump_json ()
446472
447473
448- def dumps_dynamic (extractor : DynamicFeatureExtractor ) -> str :
474+ def dumps_dynamic (extractor : DynamicFeatureExtractor , * , reproducible : bool = False ) -> str :
449475 """
450476 serialize the given extractor to a string
477+
478+ See `dumps_static` for `reproducible`.
451479 """
452480 global_features : list [GlobalFeature ] = []
453481 for feature , _ in extractor .extract_global_features ():
@@ -456,6 +484,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
456484 feature = feature_from_capa (feature ),
457485 )
458486 )
487+ global_features .sort (key = lambda gf : gf .feature .model_dump_json ())
459488
460489 file_features : list [FileFeature ] = []
461490 for feature , address in extractor .extract_file_features ():
@@ -465,6 +494,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
465494 address = Address .from_capa (address ),
466495 )
467496 )
497+ file_features .sort (key = lambda ff : (_addr_sort_key (ff .address ), ff .feature .model_dump_json ()))
468498
469499 process_features : list [ProcessFeatures ] = []
470500 for p in extractor .get_processes ():
@@ -478,6 +508,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
478508 )
479509 for feature , addr in extractor .extract_process_features (p )
480510 ]
511+ pfeatures .sort (key = lambda pf : (_addr_sort_key (pf .address ), pf .feature .model_dump_json ()))
481512
482513 threads = []
483514 for t in extractor .get_threads (p ):
@@ -490,6 +521,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
490521 )
491522 for feature , addr in extractor .extract_thread_features (p , t )
492523 ]
524+ tfeatures .sort (key = lambda tf : (_addr_sort_key (tf .address ), tf .feature .model_dump_json ()))
493525
494526 calls = []
495527 for call in extractor .get_calls (p , t ):
@@ -503,6 +535,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
503535 )
504536 for feature , addr in extractor .extract_call_features (p , t , call )
505537 ]
538+ cfeatures .sort (key = lambda cf : (_addr_sort_key (cf .address ), cf .feature .model_dump_json ()))
506539
507540 calls .append (
508541 CallFeatures (
@@ -512,6 +545,9 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
512545 )
513546 )
514547
548+ # sort by address so regeneration is obviously idempotent regardless of
549+ # any per-extractor iteration quirks.
550+ calls .sort (key = lambda c : _addr_sort_key (c .address ))
515551 threads .append (
516552 ThreadFeatures (
517553 address = taddr ,
@@ -520,6 +556,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
520556 )
521557 )
522558
559+ threads .sort (key = lambda t : _addr_sort_key (t .address ))
523560 process_features .append (
524561 ProcessFeatures (
525562 address = paddr ,
@@ -529,6 +566,8 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
529566 )
530567 )
531568
569+ process_features .sort (key = lambda pf : _addr_sort_key (pf .address ))
570+
532571 features = DynamicFeatures (
533572 global_ = global_features , # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
534573 file = tuple (file_features ),
@@ -539,12 +578,13 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
539578 get_base_addr = getattr (extractor , "get_base_address" , None )
540579 base_addr = get_base_addr () if get_base_addr else capa .features .address .NO_ADDRESS
541580
581+ extractor_version = "" if reproducible else capa .version .__version__
542582 freeze = Freeze (
543583 version = CURRENT_VERSION ,
544584 base_address = Address .from_capa (base_addr ), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
545585 sample_hashes = extractor .get_sample_hashes (),
546586 flavor = "dynamic" ,
547- extractor = Extractor (name = extractor .__class__ .__name__ ),
587+ extractor = Extractor (name = extractor .__class__ .__name__ , version = extractor_version ),
548588 features = features ,
549589 )
550590 # type checkers are unable to recognise `base_address` as an argument due to alias
@@ -627,21 +667,21 @@ def loads_dynamic(s: str) -> DynamicFeatureExtractor:
627667MAGIC = "capa0000" .encode ("ascii" )
628668
629669
def dumps(extractor: FeatureExtractor, *, reproducible: bool = False) -> str:
    """
    serialize the given extractor to a string.

    dispatches on the extractor flavor: static extractors are handled by
    `dumps_static` and dynamic extractors by `dumps_dynamic`.

    args:
      extractor: the feature extractor to serialize.
      reproducible: forwarded unchanged to the flavor-specific serializer.

    returns:
      the document produced by the selected serializer.

    raises:
      ValueError: if the extractor is neither a static nor a dynamic extractor.
    """
    if isinstance(extractor, StaticFeatureExtractor):
        return dumps_static(extractor, reproducible=reproducible)

    if isinstance(extractor, DynamicFeatureExtractor):
        return dumps_dynamic(extractor, reproducible=reproducible)

    raise ValueError("Invalid feature extractor")
640680
641681
def dump(extractor: FeatureExtractor, *, reproducible: bool = False) -> bytes:
    """
    serialize the given extractor to a byte array.

    the result is `MAGIC` followed by the zlib-compressed, UTF-8 encoded
    document returned by `dumps`.

    args:
      extractor: the feature extractor to serialize.
      reproducible: forwarded unchanged to `dumps`.
    """
    document = dumps(extractor, reproducible=reproducible)
    payload = zlib.compress(document.encode("utf-8"))
    return MAGIC + payload
645685
646686
647687def is_freeze (buf : bytes ) -> bool :
@@ -685,6 +725,11 @@ def main(argv=None):
685725 parser = argparse .ArgumentParser (description = "save capa features to a file" )
686726 capa .main .install_common_args (parser , {"input_file" , "format" , "backend" , "os" , "signatures" })
687727 parser .add_argument ("output" , type = str , help = "Path to output file" )
728+ parser .add_argument (
729+ "--reproducible" ,
730+ action = "store_true" ,
731+ help = "zero out dynamic header metadata (e.g. capa version) so output is stable across capa versions" ,
732+ )
688733 args = parser .parse_args (args = argv )
689734
690735 try :
@@ -696,11 +741,43 @@ def main(argv=None):
696741 except capa .main .ShouldExitError as e :
697742 return e .status_code
698743
699- Path (args .output ).write_bytes (dump (extractor ))
744+ output_path = Path (args .output )
745+ output_path .write_bytes (dump (extractor , reproducible = args .reproducible ))
746+
747+ # Log a manifest entry for the feature snapshot tests at INFO level. This
748+ # makes it easy to copy/paste into
749+ # `tests/fixtures/snapshots/features/manifest.json` when adding a new
750+ # fixture or refreshing an existing one.
751+ entry : dict [str , str ] = {
752+ "name" : output_path .stem ,
753+ "sample" : str (args .input_file ),
754+ "freeze" : output_path .name ,
755+ }
756+ if args .format and args .format != "auto" :
757+ entry ["format" ] = args .format
758+ if args .backend and args .backend != "auto" :
759+ entry ["backend" ] = args .backend
760+ if args .os and args .os != "auto" :
761+ entry ["os" ] = args .os
762+ commit = _git_head_commit ()
763+ if commit :
764+ entry ["generated_at_commit" ] = commit
765+ logger .info ("manifest entry: %s" , json .dumps (entry ))
700766
701767 return 0
702768
703769
def _git_head_commit() -> str:
    """
    return the output of `git rev-parse HEAD`, stripped of surrounding whitespace.

    no working directory is passed to git, so the command runs in the current
    process's working directory. an empty string is returned when git exits
    with a non-zero status or cannot be launched at all.
    """
    import subprocess

    command = ["git", "rev-parse", "HEAD"]
    try:
        raw = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        # OSError also covers FileNotFoundError (git executable not found).
        return ""
    else:
        return raw.decode("ascii", errors="replace").strip()
779+
780+
704781if __name__ == "__main__" :
705782 import sys
706783
0 commit comments