diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index a349b31fe..0964ae07c 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -37,6 +37,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Fix bug in Segment Injector: favor regions with data when injecting data ([#682](https://github.com/redballoonsecurity/ofrak/pull/682)) - Pass `usedforsecurity=False` to non-cryptographic `hashlib` calls to prevent failures when Python links against FIPS OpenSSL ([#744](https://github.com/redballoonsecurity/ofrak/pull/744)) - Fix `chdir` in UEFI components causing failing tests. ([#747](https://github.com/redballoonsecurity/ofrak/pull/747)) +- Fix binwalk and entropy file descriptor leak. ([#749](https://github.com/redballoonsecurity/ofrak/pull/749)) ## [3.3.0](https://github.com/redballoonsecurity/ofrak/compare/ofrak-v3.2.0...ofrak-v3.3.0) - 2025-10-03 diff --git a/ofrak_core/src/ofrak/core/binwalk.py b/ofrak_core/src/ofrak/core/binwalk.py index 24835149d..e0e215a96 100644 --- a/ofrak_core/src/ofrak/core/binwalk.py +++ b/ofrak_core/src/ofrak/core/binwalk.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import Dict -from ofrak.resource import ResourceFactory, Resource +from ofrak.resource import Resource from ofrak.model.resource_model import ResourceAttributes @@ -19,8 +19,6 @@ from ofrak.core.binary import GenericBinary from ofrak.model.component_model import ComponentExternalTool -from ofrak.service.data_service_i import DataServiceInterface -from ofrak.service.resource_service_i import ResourceServiceInterface class _BinwalkExternalTool(ComponentExternalTool): @@ -37,6 +35,8 @@ async def is_tool_installed(self) -> bool: BINWALK_TOOL = _BinwalkExternalTool() +pool = ProcessPoolExecutor(max_workers=1) + @dataclass(**ResourceAttributes.DATACLASS_PARAMS) class BinwalkAttributes(ResourceAttributes): @@ -60,15 +60,6 @@ class BinwalkAnalyzer(Analyzer[None, BinwalkAttributes]): outputs = (BinwalkAttributes,) external_dependencies = (BINWALK_TOOL,) - def __init__( - self, - resource_factory: ResourceFactory, - data_service: DataServiceInterface, - resource_service: ResourceServiceInterface, - ): - super().__init__(resource_factory, data_service, resource_service) - self.pool = ProcessPoolExecutor() - async def analyze(self, resource: Resource, config=None) -> BinwalkAttributes: if not BINWALK_INSTALLED: raise ComponentMissingDependencyError(self, BINWALK_TOOL) @@ -76,7 +67,7 @@ async def analyze(self, resource: Resource, config=None) -> BinwalkAttributes: # Should errors be handled the way they are in the `DataSummaryAnalyzer`? Likely to be # overkill here. offsets = await asyncio.get_running_loop().run_in_executor( - self.pool, _run_binwalk_on_file, temp_path + pool, _run_binwalk_on_file, temp_path ) return BinwalkAttributes(offsets) diff --git a/ofrak_core/src/ofrak/core/entropy/entropy.py b/ofrak_core/src/ofrak/core/entropy/entropy.py index e0d0941f4..3c21bfe41 100644 --- a/ofrak_core/src/ofrak/core/entropy/entropy.py +++ b/ofrak_core/src/ofrak/core/entropy/entropy.py @@ -23,6 +23,9 @@ from ofrak.core.entropy.entropy_py import entropy_py as entropy_func +pool = ProcessPoolExecutor(max_workers=1) + + @dataclass class DataSummary: """ @@ -62,7 +65,6 @@ def __init__( resource_service: ResourceServiceInterface, ): super().__init__(resource_factory, data_service, resource_service) - self.pool = ProcessPoolExecutor() self.max_analysis_retries = 10 self._cache: Dict[str, DataSummary] = {} @@ -78,6 +80,7 @@ async def get_data_summary(self, resource: Resource) -> DataSummary: return self._cache[cache_info.cache_key] async def _compute_data_summary(self, resource: Resource, depth=0) -> DataSummary: + global pool if depth > self.max_analysis_retries: raise RuntimeError( f"Analysis process killed more than {self.max_analysis_retries} times. Aborting." @@ -85,16 +88,16 @@ async def _compute_data_summary(self, resource: Resource, depth=0) -> DataSummar try: data = await resource.get_data() entropy = await asyncio.get_running_loop().run_in_executor( - self.pool, sample_entropy, data, resource.get_id() + pool, sample_entropy, data, resource.get_id() ) magnitude = await asyncio.get_running_loop().run_in_executor( - self.pool, sample_magnitude, data + pool, sample_magnitude, data ) data_summary = DataSummary(entropy, magnitude) return data_summary except BrokenProcessPool: # If the previous one was aborted, try again with a new pool - self.pool = ProcessPoolExecutor() + pool = ProcessPoolExecutor() return await self._compute_data_summary(resource, depth=depth + 1) diff --git a/ofrak_core/tests/components/test_binwalk_component.py b/ofrak_core/tests/components/test_binwalk_component.py index 4226bbb5e..f283e27bf 100644 --- a/ofrak_core/tests/components/test_binwalk_component.py +++ b/ofrak_core/tests/components/test_binwalk_component.py @@ -79,3 +79,29 @@ async def test_binwalk_component(ofrak_context, test_case): if test_case.number_of_results is not None: assert len(binwalk_offsets) == test_case.number_of_results assert test_case.subset_of_results.items() <= binwalk_offsets.items() + + +@pytest.mark.skipif( + not os.path.isdir(f"/proc/{os.getpid()}/fd"), + reason="Requires /proc//fd (Linux only)", +) +@pytest.mark.skipif_missing_deps([BinwalkAnalyzer]) +async def test_binwalk_does_not_leak_fds(ofrak_context): + """ + Regression test for the ProcessPoolExecutor FD leak in BinwalkAnalyzer. + """ + fd_dir = f"/proc/{os.getpid()}/fd" + asset_path = os.path.join(BINWALK_ASSETS_PATH, "dirtraversal.tar") + before = len(os.listdir(fd_dir)) + + iterations = 5 + for _ in range(iterations): + root_resource = await ofrak_context.create_root_resource_from_file(asset_path) + await root_resource.analyze(BinwalkAttributes) + after = len(os.listdir(fd_dir)) + + delta = after - before + assert delta < 10, ( + f"BinwalkAnalyzer leaked {delta} FDs across {iterations} iterations " + f"({before} -> {after})." + ) diff --git a/ofrak_core/tests/components/test_entropy_component.py b/ofrak_core/tests/components/test_entropy_component.py index af2b42988..b1768a6ea 100644 --- a/ofrak_core/tests/components/test_entropy_component.py +++ b/ofrak_core/tests/components/test_entropy_component.py @@ -82,3 +82,28 @@ def _almost_equal(bytes1: bytes, bytes2: bytes) -> bool: print(f"Inputs differ at byte {i} ({bytes1[i]} != {bytes2[i]})") return False return True + + +@pytest.mark.skipif( + not os.path.isdir(f"/proc/{os.getpid()}/fd"), + reason="Requires /proc//fd (Linux only)", +) +async def test_entropy_does_not_leak_fds(ofrak_context: OFRAKContext): + """ + Regression test for the ProcessPoolExecutor FD leak in DataSummaryAnalyzer. + """ + fd_dir = f"/proc/{os.getpid()}/fd" + asset_path = os.path.join(components.ASSETS_DIR, "hello.out") + before = len(os.listdir(fd_dir)) + + iterations = 5 + for _ in range(iterations): + root_resource = await ofrak_context.create_root_resource_from_file(asset_path) + await root_resource.run(DataSummaryAnalyzer) + after = len(os.listdir(fd_dir)) + + delta = after - before + assert delta < 10, ( + f"DataSummaryAnalyzer leaked {delta} FDs across {iterations} iterations " + f"({before} -> {after})." + )