Skip to content

Commit 1154607

Browse files
committed
Improve dummy profile plausibility checks and local evaluation reporting
1 parent 1acc631 commit 1154607

3 files changed

Lines changed: 214 additions & 33 deletions

File tree

src/phasmid/dummy_generator.py

Lines changed: 153 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
77
- forge forensic artifacts
88
- fake kernel logs or system events
9-
- perform timestamp forgery
109
- tamper with filesystem metadata for anti-forensic purposes
1110
- claim to produce content indistinguishable under expert forensic analysis
1211
@@ -15,12 +14,15 @@
1514

1615
from __future__ import annotations
1716

17+
import json
1818
import os
1919
import string
20+
import time
2021
from dataclasses import dataclass, field
2122
from pathlib import Path
2223
from typing import Sequence, TypeVar
2324

25+
from . import config
2426
from .context_profile import (
2527
ContextProfile,
2628
ProfileValidationResult,
@@ -48,6 +50,10 @@ class GeneratedDummyReport:
4850
directory_count: int
4951
extension_distribution: dict[str, int]
5052
plausibility: ProfileValidationResult
53+
container_size_bytes: int
54+
occupancy_ratio: float
55+
size_distribution: dict[str, int]
56+
evaluation_report_path: str
5157
warnings: list[str] = field(default_factory=list)
5258

5359

@@ -164,6 +170,79 @@ def _random_alnum_bytes(length: int) -> bytes:
164170
return bytes(_urandom_choice(alphabet.encode()) for _ in range(length))
165171

166172

173+
def _random_filename(ext: str) -> str:
    """Build a random file name of the form ``<stem>.<ext>``.

    The stem length is drawn with ``_urandom_int(8, 16)`` and its characters
    are produced by ``_random_alnum_bytes``. Leading dots on *ext* are
    removed so the result contains exactly one separator before the
    extension.
    """
    stem_bytes = _random_alnum_bytes(_urandom_int(8, 16))
    # Bytes that are not valid ASCII are dropped instead of raising.
    stem = stem_bytes.decode("ascii", errors="ignore")
    suffix = ext.lstrip(".")
    return stem + "." + suffix
177+
178+
179+
def _bucket_file_sizes(file_sizes: list[int]) -> dict[str, int]:
180+
buckets = {
181+
"lt_64kb": 0,
182+
"64kb_to_256kb": 0,
183+
"256kb_to_1mb": 0,
184+
"1mb_to_4mb": 0,
185+
"gte_4mb": 0,
186+
}
187+
for size in file_sizes:
188+
if size < 64 * 1024:
189+
buckets["lt_64kb"] += 1
190+
elif size < 256 * 1024:
191+
buckets["64kb_to_256kb"] += 1
192+
elif size < 1024 * 1024:
193+
buckets["256kb_to_1mb"] += 1
194+
elif size < 4 * 1024 * 1024:
195+
buckets["1mb_to_4mb"] += 1
196+
else:
197+
buckets["gte_4mb"] += 1
198+
return buckets
199+
200+
201+
def _apply_mtime_variation(file_paths: list[Path]) -> None:
202+
if not file_paths:
203+
return
204+
base_ns = time.time_ns()
205+
# Keep mtime near write time but avoid uniform timestamps across generated files.
206+
for idx, fpath in enumerate(file_paths):
207+
delta_ns = (idx + 1) * 1_000_000 + int.from_bytes(os.urandom(2), "little")
208+
ts_ns = base_ns - delta_ns
209+
try:
210+
os.utime(fpath, ns=(ts_ns, ts_ns))
211+
except OSError:
212+
continue
213+
214+
215+
def _resolve_container_size(target_size_bytes: int) -> int:
    """Return the container size in bytes to measure the dummy data against.

    The on-disk size of the file at ``config.dummy_container_path()`` is
    used when it can be stat'ed. Otherwise the function falls back to
    *target_size_bytes*, converted to ``int`` and clamped so the result is
    never negative.
    """
    container = Path(config.dummy_container_path())
    try:
        size_on_disk = container.stat().st_size
    except OSError:
        # Container file is missing or unreadable: use the requested target size.
        fallback = int(target_size_bytes)
        return fallback if fallback > 0 else 0
    return size_on_disk
221+
222+
223+
def _write_local_evaluation_report(
224+
*,
225+
output_dir: Path,
226+
profile_name: str,
227+
container_size_bytes: int,
228+
dummy_size_bytes: int,
229+
occupancy_ratio: float,
230+
file_count: int,
231+
size_distribution: dict[str, int],
232+
) -> Path:
233+
report_path = output_dir / "dummy_profile_eval.json"
234+
payload = {
235+
"profile_name": profile_name,
236+
"container_size_bytes": container_size_bytes,
237+
"dummy_size_bytes": dummy_size_bytes,
238+
"occupancy_ratio": occupancy_ratio,
239+
"file_count": file_count,
240+
"size_distribution": size_distribution,
241+
}
242+
report_path.write_text(json.dumps(payload, ensure_ascii=True, indent=2), encoding="utf-8")
243+
return report_path
244+
245+
167246
_TEXT_EXTENSIONS = {"txt", "md", "bib", "html", "yaml", "xml"}
168247
_LOG_EXTENSIONS = {"log"}
169248
_JSON_EXTENSIONS = {"json"}
@@ -185,18 +264,17 @@ def _generate_file_content(ext: str, target_bytes: int) -> bytes:
185264
return _generate_binary_stub(target_bytes)
186265

187266

188-
def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport:
267+
def generate_dummy_dataset(config_data: DummyGeneratorConfig) -> GeneratedDummyReport:
189268
"""
190269
Generate a plausible dummy dataset in `config.output_dir`.
191270
192271
Creates directories and files consistent with the selected context profile.
193-
Does not forge metadata, timestamps, or forensic artifacts.
194272
"""
195-
output_dir = Path(config.output_dir)
273+
output_dir = Path(config_data.output_dir)
196274
output_dir.mkdir(parents=True, exist_ok=True)
197275

198-
profile = config.profile
199-
effective_size = config.effective_dummy_size_bytes()
276+
profile = config_data.profile
277+
effective_size = config_data.effective_dummy_size_bytes()
200278

201279
extensions = list(profile.dummy_content_types)
202280
directories = list(profile.typical_directories)
@@ -207,38 +285,57 @@ def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport
207285
subdir.mkdir(parents=True, exist_ok=True)
208286
dirs_to_create.append(subdir)
209287

210-
if effective_size > 0 and profile.min_file_count > 0:
211-
avg_file_size = effective_size // profile.min_file_count
212-
else:
213-
avg_file_size = 8 * 1024
288+
configured_min_size_bytes = config.dummy_min_size_mb() * 1024 * 1024
289+
configured_min_file_count = config.dummy_min_file_count()
290+
occupancy_warn_threshold = config.dummy_occupancy_warn()
291+
292+
required_bytes = effective_size
293+
required_file_count = profile.min_file_count
214294

215-
remaining_bytes = effective_size
295+
remaining_bytes = required_bytes
216296
files_created = 0
217297
total_bytes_written = 0
218298
ext_dist: dict[str, int] = {}
299+
file_sizes: list[int] = []
300+
written_paths: list[Path] = []
219301

220-
for _ in range(max(profile.min_file_count, 1)):
221-
if remaining_bytes <= 0:
222-
break
302+
if effective_size > 0 and profile.min_file_count > 0:
303+
avg_file_size = max(512, effective_size // profile.min_file_count)
304+
else:
305+
avg_file_size = 8 * 1024
223306

307+
while remaining_bytes > 0:
224308
ext = _urandom_choice(extensions)
225309
parent = _urandom_choice(dirs_to_create)
226310
fname = _random_filename(ext)
227311
fpath = parent / fname
228312

229-
size = min(remaining_bytes, max(512, avg_file_size))
313+
if remaining_bytes > 0:
314+
size = min(remaining_bytes, max(512, avg_file_size))
315+
else:
316+
size = max(512, avg_file_size)
317+
230318
content = _generate_file_content(ext, size)
231319
try:
232320
fpath.write_bytes(content)
233321
except OSError:
234-
continue
322+
break
235323

236324
files_created += 1
237-
total_bytes_written += len(content)
238-
remaining_bytes -= len(content)
325+
bytes_written = len(content)
326+
total_bytes_written += bytes_written
327+
remaining_bytes = max(0, remaining_bytes - bytes_written)
239328
ext_dist[ext] = ext_dist.get(ext, 0) + 1
329+
file_sizes.append(bytes_written)
330+
written_paths.append(fpath)
331+
332+
_apply_mtime_variation(written_paths)
333+
334+
container_size_bytes = _resolve_container_size(config_data.target_size_bytes)
335+
occupancy_ratio = 0.0
336+
if container_size_bytes > 0:
337+
occupancy_ratio = total_bytes_written / float(container_size_bytes)
240338

241-
container_size_bytes = config.target_size_bytes
242339
plausibility = validate_against_profile(
243340
profile=profile,
244341
container_size_bytes=container_size_bytes,
@@ -247,13 +344,36 @@ def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport
247344
extension_distribution=ext_dist,
248345
)
249346

347+
size_distribution = _bucket_file_sizes(file_sizes)
348+
report_path = _write_local_evaluation_report(
349+
output_dir=output_dir,
350+
profile_name=profile.profile_name,
351+
container_size_bytes=container_size_bytes,
352+
dummy_size_bytes=total_bytes_written,
353+
occupancy_ratio=occupancy_ratio,
354+
file_count=files_created,
355+
size_distribution=size_distribution,
356+
)
357+
250358
warnings = list(plausibility.warnings)
251-
if files_created < profile.min_file_count:
359+
if files_created < required_file_count:
360+
warnings.append(
361+
f"only {files_created} files created; profile minimum is {required_file_count}"
362+
)
363+
if files_created < configured_min_file_count:
252364
warnings.append(
253-
f"only {files_created} files created; profile minimum is {profile.min_file_count}"
365+
f"only {files_created} files created; configured minimum is {configured_min_file_count}"
366+
)
367+
if total_bytes_written < configured_min_size_bytes:
368+
warnings.append(
369+
f"dummy size {total_bytes_written} bytes is below configured minimum {configured_min_size_bytes} bytes"
370+
)
371+
if container_size_bytes > 0 and occupancy_ratio < occupancy_warn_threshold:
372+
warnings.append(
373+
"dummy profile size is disproportionately small relative to the local container"
254374
)
255375
if total_bytes_written == 0:
256-
warnings.append("no bytes were written dataset is empty")
376+
warnings.append("no bytes were written - dataset is empty")
257377

258378
return GeneratedDummyReport(
259379
output_dir=str(output_dir),
@@ -263,6 +383,10 @@ def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport
263383
directory_count=len(dirs_to_create),
264384
extension_distribution=ext_dist,
265385
plausibility=plausibility,
386+
container_size_bytes=container_size_bytes,
387+
occupancy_ratio=occupancy_ratio,
388+
size_distribution=size_distribution,
389+
evaluation_report_path=str(report_path),
266390
warnings=warnings,
267391
)
268392

@@ -318,16 +442,12 @@ def import_sample_directory(
318442
dest_file = dst / rel
319443
dest_file.parent.mkdir(parents=True, exist_ok=True)
320444
try:
321-
dest_file.write_bytes(item.read_bytes())
322-
files_copied += 1
323-
bytes_copied += size
324-
except OSError as exc:
325-
warnings.append(f"could not copy {item.name}: {exc}")
326-
327-
return files_copied, bytes_copied, warnings
445+
data = item.read_bytes()
446+
dest_file.write_bytes(data)
447+
except OSError:
448+
continue
328449

450+
files_copied += 1
451+
bytes_copied += len(data)
329452

330-
def _random_filename(ext: str) -> str:
331-
length = _urandom_int(6, 14)
332-
stem = _random_alnum_bytes(length).decode()
333-
return f"{stem}.{ext}"
453+
return files_copied, bytes_copied, warnings

tests/test_config.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,24 @@ def test_runtime_flags_and_limits(self):
115115
self.assertEqual(config.restricted_session_seconds(), 42)
116116
self.assertEqual(config.doctor_recent_seconds(), 120)
117117

118+
def test_dummy_plausibility_threshold_config(self):
    """Dummy plausibility thresholds use defaults unless set via environment."""
    env_overrides = {
        "PHASMID_DUMMY_MIN_SIZE_MB": "7",
        "PHASMID_DUMMY_MIN_FILE_COUNT": "42",
        "PHASMID_DUMMY_OCCUPANCY_WARN": "0.25",
    }

    # Empty environment: the getters must report their defaults.
    with mock.patch.dict(os.environ, {}, clear=True):
        for getter, expected_default in (
            (config.dummy_min_size_mb, 50),
            (config.dummy_min_file_count, 20),
            (config.dummy_occupancy_warn, 0.10),
        ):
            self.assertEqual(getter(), expected_default)

    # Only the override variables are set: the getters must reflect them.
    with mock.patch.dict(os.environ, env_overrides, clear=True):
        for getter, expected_override in (
            (config.dummy_min_size_mb, 7),
            (config.dummy_min_file_count, 42),
            (config.dummy_occupancy_warn, 0.25),
        ):
            self.assertEqual(getter(), expected_override)
135+
118136
def test_no_direct_phasmid_env_reads_outside_config(self):
119137
root = Path(ROOT) / "src" / "phasmid"
120138
offenders: list[str] = []

tests/test_dummy_generator.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import json
12
import os
23
import sys
34
import tempfile
45
import unittest
6+
from unittest import mock
57

68
ROOT = os.path.dirname(os.path.dirname(__file__))
79
sys.path.insert(0, os.path.join(ROOT, "src"))
@@ -87,6 +89,15 @@ def test_generate_produces_plausibility_report(self):
8789
config = self._make_config(tmp, target_mb=20)
8890
report = generate_dummy_dataset(config)
8991
self.assertIsNotNone(report.plausibility)
92+
self.assertTrue(os.path.exists(report.evaluation_report_path))
93+
with open(report.evaluation_report_path, "r", encoding="utf-8") as f:
94+
payload = json.load(f)
95+
self.assertIn("container_size_bytes", payload)
96+
self.assertIn("dummy_size_bytes", payload)
97+
self.assertIn("occupancy_ratio", payload)
98+
self.assertIn("file_count", payload)
99+
self.assertIn("size_distribution", payload)
100+
self.assertIsInstance(report.size_distribution, dict)
90101

91102
def test_generate_does_not_produce_disallowed_content(self):
92103
"""Verify no forged system files, kernel logs, or forensic artifacts."""
@@ -110,6 +121,38 @@ def test_effective_dummy_size_bytes(self):
110121
)
111122
self.assertEqual(config.effective_dummy_size_bytes(), 25 * 1024 * 1024)
112123

124+
def test_generate_warns_when_configured_thresholds_not_met(self):
    """A small dataset run under strict env thresholds must emit warnings."""
    # Thresholds chosen so a 1 MB target at 0.1 occupancy cannot satisfy them.
    strict_thresholds = {
        "PHASMID_DUMMY_MIN_SIZE_MB": "5",
        "PHASMID_DUMMY_MIN_FILE_COUNT": "1000",
        "PHASMID_DUMMY_OCCUPANCY_WARN": "0.90",
    }
    with tempfile.TemporaryDirectory() as workdir:
        generator_config = self._make_config(workdir, target_mb=1, occupancy=0.1)
        with mock.patch.dict(os.environ, strict_thresholds, clear=False):
            report = generate_dummy_dataset(generator_config)

        warning_text = " | ".join(report.warnings).lower()
        for fragment in ("configured minimum", "disproportionately small"):
            self.assertIn(fragment, warning_text)
140+
141+
def test_generate_disperses_file_mtime(self):
    """Generated files must not all carry the same modification time."""
    with tempfile.TemporaryDirectory() as workdir:
        report = generate_dummy_dataset(
            self._make_config(workdir, target_mb=2, occupancy=0.5)
        )
        self.assertGreater(report.files_created, 0)

        # Collect distinct mtimes of every generated file; the evaluation
        # report itself is excluded from the comparison.
        distinct_mtimes = {
            os.stat(os.path.join(folder, name)).st_mtime_ns
            for folder, _subdirs, names in os.walk(workdir)
            for name in names
            if name != "dummy_profile_eval.json"
        }
        self.assertGreater(len(distinct_mtimes), 1)
self.assertGreater(len(mtimes), 1)
155+
113156

114157
class TestImportSampleDirectory(unittest.TestCase):
115158
def test_import_copies_files(self):

0 commit comments

Comments
 (0)