Skip to content

Commit 18fa543

Browse files
tests: add snapshot tests for feature extraction
Introduces data-driven snapshot tests that regenerate capa freeze files for a curated set of samples in the tests/data submodule and compare the bytes against committed fixtures under tests/fixtures/freezes/. Any change that perturbs feature extraction surfaces as a test failure with a feature-count delta and a truncated unified diff.
1 parent d9014d0 commit 18fa543

7 files changed

Lines changed: 308 additions & 20 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## master (unreleased)
44

55
### New Features
6+
- freeze: add `--reproducible` flag that zeros dynamic header metadata
67

78
### Breaking Changes
89

@@ -128,6 +129,7 @@
128129
- ci: use explicit and per job permissions @mike-hunhoff #3002
129130
- replace black/isort/flake8 with ruff @mike-hunhoff #2992
130131
- ci: update GitHub Actions to support Node.js 24 (deprecate Node.js 20) @mr-tz #2984
132+
- tests: add snapshot tests for feature extraction @williballenthin #3069
131133

132134
### Raw diffs
133135
- [capa v9.4.0...master](https://github.com/mandiant/capa/compare/v9.4.0...master)
@@ -272,7 +274,6 @@ Additionally a Binary Ninja bug has been fixed. Released binaries now include AR
272274
- nursery/get-dotnet-assembly-entry-point mehunhoff@google.com
273275

274276
### Bug Fixes
275-
276277
- binja: fix a crash during feature extraction when the MLIL is unavailable @xusheng6 #2714
277278

278279
### capa Explorer Web

capa/features/extractors/dotnetfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[tuple
9898
# namespaces may be empty, discard
9999
namespaces.discard("")
100100

101-
for namespace in namespaces:
101+
for namespace in sorted(namespaces):
102102
# namespace do not have an associated token, so we yield 0x0
103103
yield Namespace(namespace), NO_ADDRESS
104104

capa/features/freeze/__init__.py

Lines changed: 77 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,7 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
9292
return cls(type=AddressType.THREAD, value=(a.process.ppid, a.process.pid, a.tid))
9393

9494
elif isinstance(a, capa.features.address.DynamicCallAddress):
95-
return cls(
96-
type=AddressType.CALL,
97-
value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id),
98-
)
95+
return cls(type=AddressType.CALL, value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id))
9996

10097
elif a == capa.features.address.NO_ADDRESS or isinstance(a, capa.features.address._NoAddress):
10198
return cls(type=AddressType.NO_ADDRESS, value=None)
@@ -346,9 +343,14 @@ class Freeze(BaseModel):
346343
model_config = ConfigDict(populate_by_name=True)
347344

348345

349-
def dumps_static(extractor: StaticFeatureExtractor) -> str:
346+
def dumps_static(extractor: StaticFeatureExtractor, reproducible: bool = False) -> str:
350347
"""
351348
serialize the given extractor to a string
349+
350+
When `reproducible` is true, the freeze's dynamic header metadata (e.g. the
351+
embedded capa version) is zeroed out so that output is identical across
352+
capa versions for a given extractor. This is used by the feature snapshot
353+
tests to keep fixtures stable across version bumps.
352354
"""
353355
global_features: list[GlobalFeature] = []
354356
for feature, _ in extractor.extract_global_features():
@@ -357,6 +359,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
357359
feature=feature_from_capa(feature),
358360
)
359361
)
362+
global_features.sort(key=lambda gf: gf.feature.model_dump_json())
360363

361364
file_features: list[FileFeature] = []
362365
for feature, address in extractor.extract_file_features():
@@ -366,6 +369,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
366369
address=Address.from_capa(address),
367370
)
368371
)
372+
file_features.sort(key=lambda ff: (ff.address, ff.feature.model_dump_json()))
369373

370374
function_features: list[FunctionFeatures] = []
371375
for f in extractor.get_functions():
@@ -378,6 +382,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
378382
)
379383
for feature, addr in extractor.extract_function_features(f)
380384
]
385+
ffeatures.sort(key=lambda ff: (ff.address, ff.feature.model_dump_json()))
381386

382387
basic_blocks = []
383388
for bb in extractor.get_basic_blocks(f):
@@ -390,6 +395,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
390395
)
391396
for feature, addr in extractor.extract_basic_block_features(f, bb)
392397
]
398+
bbfeatures.sort(key=lambda bf: (bf.address, bf.feature.model_dump_json()))
393399

394400
instructions = []
395401
for insn in extractor.get_instructions(f, bb):
@@ -402,6 +408,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
402408
)
403409
for feature, addr in extractor.extract_insn_features(f, bb, insn)
404410
]
411+
ifeatures.sort(key=lambda i: (i.address, i.feature.model_dump_json()))
405412

406413
instructions.append(
407414
InstructionFeatures(
@@ -410,6 +417,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
410417
)
411418
)
412419

420+
instructions.sort(key=lambda i: i.address)
413421
basic_blocks.append(
414422
BasicBlockFeatures(
415423
address=bbaddr,
@@ -418,6 +426,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
418426
)
419427
)
420428

429+
basic_blocks.sort(key=lambda bb: bb.address)
421430
function_features.append(
422431
FunctionFeatures(
423432
address=faddr,
@@ -426,28 +435,33 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
426435
)
427436
)
428437

438+
function_features.sort(key=lambda ff: ff.address)
439+
429440
features = StaticFeatures(
430441
global_=global_features, # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
431442
file=tuple(file_features),
432443
functions=tuple(function_features),
433444
)
434445

446+
extractor_version = "" if reproducible else capa.version.__version__
435447
freeze = Freeze(
436448
version=CURRENT_VERSION,
437449
base_address=Address.from_capa(extractor.get_base_address()), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
438450
sample_hashes=extractor.get_sample_hashes(),
439451
flavor="static",
440-
extractor=Extractor(name=extractor.__class__.__name__),
452+
extractor=Extractor(name=extractor.__class__.__name__, version=extractor_version),
441453
features=features,
442454
)
443455
# type checkers are unable to recognise `base_address` as an argument due to alias
444456

445457
return freeze.model_dump_json()
446458

447459

448-
def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
460+
def dumps_dynamic(extractor: DynamicFeatureExtractor, reproducible: bool = False) -> str:
449461
"""
450462
serialize the given extractor to a string
463+
464+
See `dumps_static` for `reproducible`.
451465
"""
452466
global_features: list[GlobalFeature] = []
453467
for feature, _ in extractor.extract_global_features():
@@ -456,6 +470,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
456470
feature=feature_from_capa(feature),
457471
)
458472
)
473+
global_features.sort(key=lambda gf: gf.feature.model_dump_json())
459474

460475
file_features: list[FileFeature] = []
461476
for feature, address in extractor.extract_file_features():
@@ -465,6 +480,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
465480
address=Address.from_capa(address),
466481
)
467482
)
483+
file_features.sort(key=lambda ff: (ff.address, ff.feature.model_dump_json()))
468484

469485
process_features: list[ProcessFeatures] = []
470486
for p in extractor.get_processes():
@@ -478,6 +494,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
478494
)
479495
for feature, addr in extractor.extract_process_features(p)
480496
]
497+
pfeatures.sort(key=lambda pf: (pf.address, pf.feature.model_dump_json()))
481498

482499
threads = []
483500
for t in extractor.get_threads(p):
@@ -490,6 +507,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
490507
)
491508
for feature, addr in extractor.extract_thread_features(p, t)
492509
]
510+
tfeatures.sort(key=lambda tf: (tf.address, tf.feature.model_dump_json()))
493511

494512
calls = []
495513
for call in extractor.get_calls(p, t):
@@ -503,6 +521,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
503521
)
504522
for feature, addr in extractor.extract_call_features(p, t, call)
505523
]
524+
cfeatures.sort(key=lambda cf: (cf.address, cf.feature.model_dump_json()))
506525

507526
calls.append(
508527
CallFeatures(
@@ -512,6 +531,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
512531
)
513532
)
514533

534+
calls.sort(key=lambda c: c.address)
515535
threads.append(
516536
ThreadFeatures(
517537
address=taddr,
@@ -520,6 +540,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
520540
)
521541
)
522542

543+
threads.sort(key=lambda t: t.address)
523544
process_features.append(
524545
ProcessFeatures(
525546
address=paddr,
@@ -529,6 +550,8 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
529550
)
530551
)
531552

553+
process_features.sort(key=lambda pf: pf.address)
554+
532555
features = DynamicFeatures(
533556
global_=global_features, # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
534557
file=tuple(file_features),
@@ -539,12 +562,13 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
539562
get_base_addr = getattr(extractor, "get_base_address", None)
540563
base_addr = get_base_addr() if get_base_addr else capa.features.address.NO_ADDRESS
541564

565+
extractor_version = "" if reproducible else capa.version.__version__
542566
freeze = Freeze(
543567
version=CURRENT_VERSION,
544568
base_address=Address.from_capa(base_addr), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
545569
sample_hashes=extractor.get_sample_hashes(),
546570
flavor="dynamic",
547-
extractor=Extractor(name=extractor.__class__.__name__),
571+
extractor=Extractor(name=extractor.__class__.__name__, version=extractor_version),
548572
features=features,
549573
)
550574
# type checkers are unable to recognise `base_address` as an argument due to alias
@@ -627,28 +651,28 @@ def loads_dynamic(s: str) -> DynamicFeatureExtractor:
627651
MAGIC = "capa0000".encode("ascii")
628652

629653

630-
def dumps(extractor: FeatureExtractor) -> str:
654+
def dumps(extractor: FeatureExtractor, reproducible: bool = False) -> str:
631655
"""serialize the given extractor to a string."""
632656
if isinstance(extractor, StaticFeatureExtractor):
633-
doc = dumps_static(extractor)
657+
doc = dumps_static(extractor, reproducible=reproducible)
634658
elif isinstance(extractor, DynamicFeatureExtractor):
635-
doc = dumps_dynamic(extractor)
659+
doc = dumps_dynamic(extractor, reproducible=reproducible)
636660
else:
637661
raise ValueError("Invalid feature extractor")
638662

639663
return doc
640664

641665

642-
def dump(extractor: FeatureExtractor) -> bytes:
666+
def dump(extractor: FeatureExtractor, reproducible: bool = False) -> bytes:
643667
"""serialize the given extractor to a byte array."""
644-
return MAGIC + zlib.compress(dumps(extractor).encode("utf-8"))
668+
return MAGIC + zlib.compress(dumps(extractor, reproducible=reproducible).encode("utf-8"))
645669

646670

647671
def is_freeze(buf: bytes) -> bool:
    """Report whether `buf` begins with the freeze MAGIC header."""
    header = buf[: len(MAGIC)]
    return header == MAGIC
649673

650674

651-
def loads(s: str):
675+
def loads(s: str) -> FeatureExtractor:
652676
doc = json.loads(s)
653677

654678
if doc["version"] != CURRENT_VERSION:
@@ -662,7 +686,7 @@ def loads(s: str):
662686
raise ValueError(f"unsupported freeze format flavor: {doc['flavor']}")
663687

664688

665-
def load(buf: bytes):
689+
def load(buf: bytes) -> FeatureExtractor:
666690
"""deserialize a set of features (as a NullFeatureExtractor) from a byte array."""
667691
if not is_freeze(buf):
668692
raise ValueError("missing magic header")
@@ -685,6 +709,11 @@ def main(argv=None):
685709
parser = argparse.ArgumentParser(description="save capa features to a file")
686710
capa.main.install_common_args(parser, {"input_file", "format", "backend", "os", "signatures"})
687711
parser.add_argument("output", type=str, help="Path to output file")
712+
parser.add_argument(
713+
"--reproducible",
714+
action="store_true",
715+
help="zero out dynamic header metadata (e.g. capa version) so output is stable across capa versions",
716+
)
688717
args = parser.parse_args(args=argv)
689718

690719
try:
@@ -696,11 +725,43 @@ def main(argv=None):
696725
except capa.main.ShouldExitError as e:
697726
return e.status_code
698727

699-
Path(args.output).write_bytes(dump(extractor))
728+
output_path = Path(args.output)
729+
output_path.write_bytes(dump(extractor, reproducible=args.reproducible))
730+
731+
# Log a manifest entry for the feature snapshot tests at INFO level. This
732+
# makes it easy to copy/paste into
733+
# `tests/fixtures/snapshots/features/manifest.json` when adding a new
734+
# fixture or refreshing an existing one.
735+
entry: dict[str, str] = {
736+
"name": output_path.stem,
737+
"sample": str(args.input_file),
738+
"freeze": output_path.name,
739+
}
740+
if args.format and args.format != "auto":
741+
entry["format"] = args.format
742+
if args.backend and args.backend != "auto":
743+
entry["backend"] = args.backend
744+
if args.os and args.os != "auto":
745+
entry["os"] = args.os
746+
commit = _git_head_commit()
747+
if commit:
748+
entry["generated_at_commit"] = commit
749+
logger.info("manifest entry: %s", json.dumps(entry))
700750

701751
return 0
702752

703753

754+
def _git_head_commit() -> str:
    """Return the HEAD commit hash of the git checkout containing this module.

    git is invoked in this module's directory rather than in the process's
    working directory, so the result describes the source tree this code was
    loaded from even when the command is launched from inside some unrelated
    repository.

    Returns:
        the stripped output of `git rev-parse HEAD`, or an empty string when
        git cannot be executed or exits with a non-zero status (e.g. this
        module does not live inside a git checkout).
    """
    # imported locally, as before, so the module does not need subprocess at import time.
    import subprocess

    try:
        out = subprocess.check_output(
            ["git", "rev-parse", "HEAD"],
            cwd=Path(__file__).resolve().parent,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError already covers FileNotFoundError, i.e. git is not installed.
        return ""
    return out.decode("ascii", errors="replace").strip()
763+
764+
704765
if __name__ == "__main__":
705766
import sys
706767

capa/features/freeze/__main__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import sys
16+
17+
from capa.features.freeze import main
18+
19+
# Run the freeze command-line entry point (this file backs `python -m capa.features.freeze`)
# and use its return value as the process exit status.
sys.exit(main())

0 commit comments

Comments
 (0)