Skip to content

Commit df27110

Browse files
claude authored and williballenthin committed
tests: add snapshot tests for feature extraction
Introduces data-driven snapshot tests that regenerate capa freeze files for a curated set of samples in the tests/data submodule and compare the bytes against committed fixtures under tests/fixtures/freezes/. Any change that perturbs feature extraction surfaces as a test failure with a feature-count delta and a truncated unified diff.
1 parent 5a60f3a commit df27110

15 files changed

Lines changed: 533 additions & 22 deletions

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
## master (unreleased)
44

55
### New Features
6+
- ci: add support for arm64 binary releases
7+
- tests: add feature snapshot tests under `tests/fixtures/snapshots/features/`
8+
- freeze: add `--reproducible` flag that zeros dynamic header metadata
69

710
### Breaking Changes
811

@@ -113,7 +116,7 @@
113116
- tests: update binja version to 5.3 @mr-tz #3011
114117
- ci: use explicit and per job permissions @mike-hunhoff #3002
115118
- replace black/isort/flake8 with ruff @mike-hunhoff #2992
116-
119+
- tests: add snapshot tests for feature extraction @williballenthin #3054
117120
- ci: update GitHub Actions to support Node.js 24 (deprecate Node.js 20) @mr-tz #2984
118121

119122
### Raw diffs
@@ -258,6 +261,7 @@ Additionally a Binary Ninja bug has been fixed. Released binaries now include AR
258261
- nursery/get-dotnet-assembly-entry-point mehunhoff@google.com
259262

260263
### Bug Fixes
264+
- dotnetfile: sort namespace features so freeze output is deterministic across runs
261265

262266
- binja: fix a crash during feature extraction when the MLIL is unavailable @xusheng6 #2714
263267

capa/features/extractors/dotnetfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[tuple
9898
# namespaces may be empty, discard
9999
namespaces.discard("")
100100

101-
for namespace in namespaces:
101+
for namespace in sorted(namespaces):
102102
# namespace do not have an associated token, so we yield 0x0
103103
yield Namespace(namespace), NO_ADDRESS
104104

capa/features/freeze/__init__.py

Lines changed: 95 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,7 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
9292
return cls(type=AddressType.THREAD, value=(a.process.ppid, a.process.pid, a.tid))
9393

9494
elif isinstance(a, capa.features.address.DynamicCallAddress):
95-
return cls(
96-
type=AddressType.CALL,
97-
value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id),
98-
)
95+
return cls(type=AddressType.CALL, value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id))
9996

10097
elif a == capa.features.address.NO_ADDRESS or isinstance(a, capa.features.address._NoAddress):
10198
return cls(type=AddressType.NO_ADDRESS, value=None)
@@ -144,17 +141,15 @@ def to_capa(self) -> capa.features.address.Address:
144141
assert isinstance(pid, int)
145142
assert isinstance(tid, int)
146143
return capa.features.address.ThreadAddress(
147-
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid),
148-
tid=tid,
144+
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
149145
)
150146

151147
elif self.type is AddressType.CALL:
152148
assert isinstance(self.value, tuple)
153149
ppid, pid, tid, id_ = self.value
154150
return capa.features.address.DynamicCallAddress(
155151
thread=capa.features.address.ThreadAddress(
156-
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid),
157-
tid=tid,
152+
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
158153
),
159154
id=id_,
160155
)
@@ -180,6 +175,20 @@ def __lt__(self, other: "Address") -> bool:
180175
return self.value < other.value # type: ignore
181176

182177

178+
def _addr_sort_key(a: Address) -> tuple:
179+
"""
180+
Canonical, comparable sort key for an Address.
181+
182+
We don't rely on Address.__lt__ here because it returns True for
183+
NO_ADDRESS < NO_ADDRESS, which breaks strict weak ordering for sort.
184+
"""
185+
if a.value is None:
186+
return (a.type.value, ())
187+
if isinstance(a.value, int):
188+
return (a.type.value, (a.value,))
189+
return (a.type.value, tuple(a.value))
190+
191+
183192
class GlobalFeature(HashableModel):
184193
feature: Feature
185194

@@ -346,9 +355,14 @@ class Freeze(BaseModel):
346355
model_config = ConfigDict(populate_by_name=True)
347356

348357

349-
def dumps_static(extractor: StaticFeatureExtractor) -> str:
358+
def dumps_static(extractor: StaticFeatureExtractor, *, reproducible: bool = False) -> str:
350359
"""
351360
serialize the given extractor to a string
361+
362+
When `reproducible` is true, the freeze's dynamic header metadata (e.g. the
363+
embedded capa version) is zeroed out so that output is identical across
364+
capa versions for a given extractor. This is used by the feature snapshot
365+
tests to keep fixtures stable across version bumps.
352366
"""
353367
global_features: list[GlobalFeature] = []
354368
for feature, _ in extractor.extract_global_features():
@@ -357,6 +371,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
357371
feature=feature_from_capa(feature),
358372
)
359373
)
374+
global_features.sort(key=lambda gf: gf.feature.model_dump_json())
360375

361376
file_features: list[FileFeature] = []
362377
for feature, address in extractor.extract_file_features():
@@ -366,6 +381,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
366381
address=Address.from_capa(address),
367382
)
368383
)
384+
file_features.sort(key=lambda ff: (_addr_sort_key(ff.address), ff.feature.model_dump_json()))
369385

370386
function_features: list[FunctionFeatures] = []
371387
for f in extractor.get_functions():
@@ -378,6 +394,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
378394
)
379395
for feature, addr in extractor.extract_function_features(f)
380396
]
397+
ffeatures.sort(key=lambda ff: (_addr_sort_key(ff.address), ff.feature.model_dump_json()))
381398

382399
basic_blocks = []
383400
for bb in extractor.get_basic_blocks(f):
@@ -390,6 +407,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
390407
)
391408
for feature, addr in extractor.extract_basic_block_features(f, bb)
392409
]
410+
bbfeatures.sort(key=lambda bf: (_addr_sort_key(bf.address), bf.feature.model_dump_json()))
393411

394412
instructions = []
395413
for insn in extractor.get_instructions(f, bb):
@@ -402,6 +420,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
402420
)
403421
for feature, addr in extractor.extract_insn_features(f, bb, insn)
404422
]
423+
ifeatures.sort(key=lambda i: (_addr_sort_key(i.address), i.feature.model_dump_json()))
405424

406425
instructions.append(
407426
InstructionFeatures(
@@ -410,6 +429,9 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
410429
)
411430
)
412431

432+
# sort by address so regeneration is obviously idempotent regardless of
433+
# any per-extractor iteration quirks.
434+
instructions.sort(key=lambda i: _addr_sort_key(i.address))
413435
basic_blocks.append(
414436
BasicBlockFeatures(
415437
address=bbaddr,
@@ -418,6 +440,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
418440
)
419441
)
420442

443+
basic_blocks.sort(key=lambda bb: _addr_sort_key(bb.address))
421444
function_features.append(
422445
FunctionFeatures(
423446
address=faddr,
@@ -426,28 +449,33 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
426449
)
427450
)
428451

452+
function_features.sort(key=lambda ff: _addr_sort_key(ff.address))
453+
429454
features = StaticFeatures(
430455
global_=global_features, # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
431456
file=tuple(file_features),
432457
functions=tuple(function_features),
433458
)
434459

460+
extractor_version = "" if reproducible else capa.version.__version__
435461
freeze = Freeze(
436462
version=CURRENT_VERSION,
437463
base_address=Address.from_capa(extractor.get_base_address()), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
438464
sample_hashes=extractor.get_sample_hashes(),
439465
flavor="static",
440-
extractor=Extractor(name=extractor.__class__.__name__),
466+
extractor=Extractor(name=extractor.__class__.__name__, version=extractor_version),
441467
features=features,
442468
)
443469
# type checkers are unable to recognise `base_address` as an argument due to alias
444470

445471
return freeze.model_dump_json()
446472

447473

448-
def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
474+
def dumps_dynamic(extractor: DynamicFeatureExtractor, *, reproducible: bool = False) -> str:
449475
"""
450476
serialize the given extractor to a string
477+
478+
See `dumps_static` for `reproducible`.
451479
"""
452480
global_features: list[GlobalFeature] = []
453481
for feature, _ in extractor.extract_global_features():
@@ -456,6 +484,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
456484
feature=feature_from_capa(feature),
457485
)
458486
)
487+
global_features.sort(key=lambda gf: gf.feature.model_dump_json())
459488

460489
file_features: list[FileFeature] = []
461490
for feature, address in extractor.extract_file_features():
@@ -465,6 +494,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
465494
address=Address.from_capa(address),
466495
)
467496
)
497+
file_features.sort(key=lambda ff: (_addr_sort_key(ff.address), ff.feature.model_dump_json()))
468498

469499
process_features: list[ProcessFeatures] = []
470500
for p in extractor.get_processes():
@@ -478,6 +508,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
478508
)
479509
for feature, addr in extractor.extract_process_features(p)
480510
]
511+
pfeatures.sort(key=lambda pf: (_addr_sort_key(pf.address), pf.feature.model_dump_json()))
481512

482513
threads = []
483514
for t in extractor.get_threads(p):
@@ -490,6 +521,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
490521
)
491522
for feature, addr in extractor.extract_thread_features(p, t)
492523
]
524+
tfeatures.sort(key=lambda tf: (_addr_sort_key(tf.address), tf.feature.model_dump_json()))
493525

494526
calls = []
495527
for call in extractor.get_calls(p, t):
@@ -503,6 +535,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
503535
)
504536
for feature, addr in extractor.extract_call_features(p, t, call)
505537
]
538+
cfeatures.sort(key=lambda cf: (_addr_sort_key(cf.address), cf.feature.model_dump_json()))
506539

507540
calls.append(
508541
CallFeatures(
@@ -512,6 +545,9 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
512545
)
513546
)
514547

548+
# sort by address so regeneration is obviously idempotent regardless of
549+
# any per-extractor iteration quirks.
550+
calls.sort(key=lambda c: _addr_sort_key(c.address))
515551
threads.append(
516552
ThreadFeatures(
517553
address=taddr,
@@ -520,6 +556,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
520556
)
521557
)
522558

559+
threads.sort(key=lambda t: _addr_sort_key(t.address))
523560
process_features.append(
524561
ProcessFeatures(
525562
address=paddr,
@@ -529,6 +566,8 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
529566
)
530567
)
531568

569+
process_features.sort(key=lambda pf: _addr_sort_key(pf.address))
570+
532571
features = DynamicFeatures(
533572
global_=global_features, # type: ignore[call-arg] # pydantic alias "global" not recognized by type checkers
534573
file=tuple(file_features),
@@ -539,12 +578,13 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
539578
get_base_addr = getattr(extractor, "get_base_address", None)
540579
base_addr = get_base_addr() if get_base_addr else capa.features.address.NO_ADDRESS
541580

581+
extractor_version = "" if reproducible else capa.version.__version__
542582
freeze = Freeze(
543583
version=CURRENT_VERSION,
544584
base_address=Address.from_capa(base_addr), # type: ignore[call-arg] # pydantic alias "base address" not recognized by type checkers
545585
sample_hashes=extractor.get_sample_hashes(),
546586
flavor="dynamic",
547-
extractor=Extractor(name=extractor.__class__.__name__),
587+
extractor=Extractor(name=extractor.__class__.__name__, version=extractor_version),
548588
features=features,
549589
)
550590
# type checkers are unable to recognise `base_address` as an argument due to alias
@@ -627,21 +667,21 @@ def loads_dynamic(s: str) -> DynamicFeatureExtractor:
627667
MAGIC = "capa0000".encode("ascii")
628668

629669

630-
def dumps(extractor: FeatureExtractor) -> str:
670+
def dumps(extractor: FeatureExtractor, *, reproducible: bool = False) -> str:
631671
"""serialize the given extractor to a string."""
632672
if isinstance(extractor, StaticFeatureExtractor):
633-
doc = dumps_static(extractor)
673+
doc = dumps_static(extractor, reproducible=reproducible)
634674
elif isinstance(extractor, DynamicFeatureExtractor):
635-
doc = dumps_dynamic(extractor)
675+
doc = dumps_dynamic(extractor, reproducible=reproducible)
636676
else:
637677
raise ValueError("Invalid feature extractor")
638678

639679
return doc
640680

641681

642-
def dump(extractor: FeatureExtractor) -> bytes:
682+
def dump(extractor: FeatureExtractor, *, reproducible: bool = False) -> bytes:
643683
"""serialize the given extractor to a byte array."""
644-
return MAGIC + zlib.compress(dumps(extractor).encode("utf-8"))
684+
return MAGIC + zlib.compress(dumps(extractor, reproducible=reproducible).encode("utf-8"))
645685

646686

647687
def is_freeze(buf: bytes) -> bool:
@@ -685,6 +725,11 @@ def main(argv=None):
685725
parser = argparse.ArgumentParser(description="save capa features to a file")
686726
capa.main.install_common_args(parser, {"input_file", "format", "backend", "os", "signatures"})
687727
parser.add_argument("output", type=str, help="Path to output file")
728+
parser.add_argument(
729+
"--reproducible",
730+
action="store_true",
731+
help="zero out dynamic header metadata (e.g. capa version) so output is stable across capa versions",
732+
)
688733
args = parser.parse_args(args=argv)
689734

690735
try:
@@ -696,11 +741,43 @@ def main(argv=None):
696741
except capa.main.ShouldExitError as e:
697742
return e.status_code
698743

699-
Path(args.output).write_bytes(dump(extractor))
744+
output_path = Path(args.output)
745+
output_path.write_bytes(dump(extractor, reproducible=args.reproducible))
746+
747+
# Log a manifest entry for the feature snapshot tests at INFO level. This
748+
# makes it easy to copy/paste into
749+
# `tests/fixtures/snapshots/features/manifest.json` when adding a new
750+
# fixture or refreshing an existing one.
751+
entry: dict[str, str] = {
752+
"name": output_path.stem,
753+
"sample": str(args.input_file),
754+
"freeze": output_path.name,
755+
}
756+
if args.format and args.format != "auto":
757+
entry["format"] = args.format
758+
if args.backend and args.backend != "auto":
759+
entry["backend"] = args.backend
760+
if args.os and args.os != "auto":
761+
entry["os"] = args.os
762+
commit = _git_head_commit()
763+
if commit:
764+
entry["generated_at_commit"] = commit
765+
logger.info("manifest entry: %s", json.dumps(entry))
700766

701767
return 0
702768

703769

770+
def _git_head_commit() -> str:
771+
"""Return the HEAD commit, or empty string if this isn't a git checkout."""
772+
import subprocess
773+
774+
try:
775+
out = subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL)
776+
except (subprocess.CalledProcessError, FileNotFoundError, OSError):
777+
return ""
778+
return out.decode("ascii", errors="replace").strip()
779+
780+
704781
if __name__ == "__main__":
705782
import sys
706783

capa/features/freeze/__main__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import sys
16+
17+
from capa.features.freeze import main
18+
19+
sys.exit(main())

0 commit comments

Comments
 (0)