66
77 - forge forensic artifacts
88 - fake kernel logs or system events
9- - perform timestamp forgery
109 - tamper with filesystem metadata for anti-forensic purposes
1110 - claim to produce content indistinguishable under expert forensic analysis
1211
1514
1615from __future__ import annotations
1716
17+ import json
1818import os
1919import string
20+ import time
2021from dataclasses import dataclass , field
2122from pathlib import Path
2223from typing import Sequence , TypeVar
2324
25+ from . import config
2426from .context_profile import (
2527 ContextProfile ,
2628 ProfileValidationResult ,
@@ -48,6 +50,10 @@ class GeneratedDummyReport:
4850 directory_count : int
4951 extension_distribution : dict [str , int ]
5052 plausibility : ProfileValidationResult
53+ container_size_bytes : int
54+ occupancy_ratio : float
55+ size_distribution : dict [str , int ]
56+ evaluation_report_path : str
5157 warnings : list [str ] = field (default_factory = list )
5258
5359
@@ -164,6 +170,79 @@ def _random_alnum_bytes(length: int) -> bytes:
164170 return bytes (_urandom_choice (alphabet .encode ()) for _ in range (length ))
165171
166172
def _random_filename(ext: str) -> str:
    """Return a randomly named file name carrying the extension *ext*.

    The stem length is chosen by ``_urandom_int(8, 16)`` and the stem itself
    comes from ``_random_alnum_bytes``. Leading dots on *ext* are stripped, so
    ``"txt"`` and ``".txt"`` produce the same suffix.

    Args:
        ext: File extension, with or without leading dots.

    Returns:
        ``"<stem>.<ext>"`` with no embedded or trailing whitespace, or just
        ``"<stem>"`` when *ext* is empty or consists only of dots.
    """
    stem_len = _urandom_int(8, 16)
    # errors="ignore" drops any non-ASCII byte instead of raising.
    stem = _random_alnum_bytes(stem_len).decode("ascii", errors="ignore")
    suffix = ext.lstrip(".")
    if not suffix:
        # Avoid a dangling "." when there is no usable extension.
        return stem
    return f"{stem}.{suffix}"
177+
178+
179+ def _bucket_file_sizes (file_sizes : list [int ]) -> dict [str , int ]:
180+ buckets = {
181+ "lt_64kb" : 0 ,
182+ "64kb_to_256kb" : 0 ,
183+ "256kb_to_1mb" : 0 ,
184+ "1mb_to_4mb" : 0 ,
185+ "gte_4mb" : 0 ,
186+ }
187+ for size in file_sizes :
188+ if size < 64 * 1024 :
189+ buckets ["lt_64kb" ] += 1
190+ elif size < 256 * 1024 :
191+ buckets ["64kb_to_256kb" ] += 1
192+ elif size < 1024 * 1024 :
193+ buckets ["256kb_to_1mb" ] += 1
194+ elif size < 4 * 1024 * 1024 :
195+ buckets ["1mb_to_4mb" ] += 1
196+ else :
197+ buckets ["gte_4mb" ] += 1
198+ return buckets
199+
200+
201+ def _apply_mtime_variation (file_paths : list [Path ]) -> None :
202+ if not file_paths :
203+ return
204+ base_ns = time .time_ns ()
205+ # Keep mtime near write time but avoid uniform timestamps across generated files.
206+ for idx , fpath in enumerate (file_paths ):
207+ delta_ns = (idx + 1 ) * 1_000_000 + int .from_bytes (os .urandom (2 ), "little" )
208+ ts_ns = base_ns - delta_ns
209+ try :
210+ os .utime (fpath , ns = (ts_ns , ts_ns ))
211+ except OSError :
212+ continue
213+
214+
def _resolve_container_size(target_size_bytes: int) -> int:
    """Return the container size in bytes, preferring the size on disk.

    The path comes from ``config.dummy_container_path()``. If that path can
    be stat'ed, its ``st_size`` is returned; otherwise the function falls
    back to *target_size_bytes*, clamped so the result is never negative.

    Args:
        target_size_bytes: Fallback size used when the stat call fails.

    Returns:
        The size reported by ``stat``, or ``max(0, int(target_size_bytes))``
        when ``stat`` raises ``OSError``.
    """
    candidate = Path(config.dummy_container_path())
    try:
        on_disk = candidate.stat()
    except OSError:
        fallback = int(target_size_bytes)
        return fallback if fallback > 0 else 0
    return on_disk.st_size
221+
222+
223+ def _write_local_evaluation_report (
224+ * ,
225+ output_dir : Path ,
226+ profile_name : str ,
227+ container_size_bytes : int ,
228+ dummy_size_bytes : int ,
229+ occupancy_ratio : float ,
230+ file_count : int ,
231+ size_distribution : dict [str , int ],
232+ ) -> Path :
233+ report_path = output_dir / "dummy_profile_eval.json"
234+ payload = {
235+ "profile_name" : profile_name ,
236+ "container_size_bytes" : container_size_bytes ,
237+ "dummy_size_bytes" : dummy_size_bytes ,
238+ "occupancy_ratio" : occupancy_ratio ,
239+ "file_count" : file_count ,
240+ "size_distribution" : size_distribution ,
241+ }
242+ report_path .write_text (json .dumps (payload , ensure_ascii = True , indent = 2 ), encoding = "utf-8" )
243+ return report_path
244+
245+
167246_TEXT_EXTENSIONS = {"txt" , "md" , "bib" , "html" , "yaml" , "xml" }
168247_LOG_EXTENSIONS = {"log" }
169248_JSON_EXTENSIONS = {"json" }
@@ -185,18 +264,17 @@ def _generate_file_content(ext: str, target_bytes: int) -> bytes:
185264 return _generate_binary_stub (target_bytes )
186265
187266
188- def generate_dummy_dataset (config : DummyGeneratorConfig ) -> GeneratedDummyReport :
267+ def generate_dummy_dataset (config_data : DummyGeneratorConfig ) -> GeneratedDummyReport :
189268 """
190269 Generate a plausible dummy dataset in `config_data.output_dir`.
191270
192271 Creates directories and files consistent with the selected context profile.
193- Does not forge metadata, timestamps, or forensic artifacts.
194272 """
195- output_dir = Path (config .output_dir )
273+ output_dir = Path (config_data .output_dir )
196274 output_dir .mkdir (parents = True , exist_ok = True )
197275
198- profile = config .profile
199- effective_size = config .effective_dummy_size_bytes ()
276+ profile = config_data .profile
277+ effective_size = config_data .effective_dummy_size_bytes ()
200278
201279 extensions = list (profile .dummy_content_types )
202280 directories = list (profile .typical_directories )
@@ -207,38 +285,57 @@ def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport
207285 subdir .mkdir (parents = True , exist_ok = True )
208286 dirs_to_create .append (subdir )
209287
210- if effective_size > 0 and profile .min_file_count > 0 :
211- avg_file_size = effective_size // profile .min_file_count
212- else :
213- avg_file_size = 8 * 1024
288+ configured_min_size_bytes = config .dummy_min_size_mb () * 1024 * 1024
289+ configured_min_file_count = config .dummy_min_file_count ()
290+ occupancy_warn_threshold = config .dummy_occupancy_warn ()
291+
292+ required_bytes = effective_size
293+ required_file_count = profile .min_file_count
214294
215- remaining_bytes = effective_size
295+ remaining_bytes = required_bytes
216296 files_created = 0
217297 total_bytes_written = 0
218298 ext_dist : dict [str , int ] = {}
299+ file_sizes : list [int ] = []
300+ written_paths : list [Path ] = []
219301
220- for _ in range (max (profile .min_file_count , 1 )):
221- if remaining_bytes <= 0 :
222- break
302+ if effective_size > 0 and profile .min_file_count > 0 :
303+ avg_file_size = max (512 , effective_size // profile .min_file_count )
304+ else :
305+ avg_file_size = 8 * 1024
223306
307+ while remaining_bytes > 0 :
224308 ext = _urandom_choice (extensions )
225309 parent = _urandom_choice (dirs_to_create )
226310 fname = _random_filename (ext )
227311 fpath = parent / fname
228312
229- size = min (remaining_bytes , max (512 , avg_file_size ))
313+ if remaining_bytes > 0 :
314+ size = min (remaining_bytes , max (512 , avg_file_size ))
315+ else :
316+ size = max (512 , avg_file_size )
317+
230318 content = _generate_file_content (ext , size )
231319 try :
232320 fpath .write_bytes (content )
233321 except OSError :
234- continue
322+ break
235323
236324 files_created += 1
237- total_bytes_written += len (content )
238- remaining_bytes -= len (content )
325+ bytes_written = len (content )
326+ total_bytes_written += bytes_written
327+ remaining_bytes = max (0 , remaining_bytes - bytes_written )
239328 ext_dist [ext ] = ext_dist .get (ext , 0 ) + 1
329+ file_sizes .append (bytes_written )
330+ written_paths .append (fpath )
331+
332+ _apply_mtime_variation (written_paths )
333+
334+ container_size_bytes = _resolve_container_size (config_data .target_size_bytes )
335+ occupancy_ratio = 0.0
336+ if container_size_bytes > 0 :
337+ occupancy_ratio = total_bytes_written / float (container_size_bytes )
240338
241- container_size_bytes = config .target_size_bytes
242339 plausibility = validate_against_profile (
243340 profile = profile ,
244341 container_size_bytes = container_size_bytes ,
@@ -247,13 +344,36 @@ def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport
247344 extension_distribution = ext_dist ,
248345 )
249346
347+ size_distribution = _bucket_file_sizes (file_sizes )
348+ report_path = _write_local_evaluation_report (
349+ output_dir = output_dir ,
350+ profile_name = profile .profile_name ,
351+ container_size_bytes = container_size_bytes ,
352+ dummy_size_bytes = total_bytes_written ,
353+ occupancy_ratio = occupancy_ratio ,
354+ file_count = files_created ,
355+ size_distribution = size_distribution ,
356+ )
357+
250358 warnings = list (plausibility .warnings )
251- if files_created < profile .min_file_count :
359+ if files_created < required_file_count :
360+ warnings .append (
361+ f"only { files_created } files created; profile minimum is { required_file_count } "
362+ )
363+ if files_created < configured_min_file_count :
252364 warnings .append (
253- f"only { files_created } files created; profile minimum is { profile .min_file_count } "
365+ f"only { files_created } files created; configured minimum is { configured_min_file_count } "
366+ )
367+ if total_bytes_written < configured_min_size_bytes :
368+ warnings .append (
369+ f"dummy size { total_bytes_written } bytes is below configured minimum { configured_min_size_bytes } bytes"
370+ )
371+ if container_size_bytes > 0 and occupancy_ratio < occupancy_warn_threshold :
372+ warnings .append (
373+ "dummy profile size is disproportionately small relative to the local container"
254374 )
255375 if total_bytes_written == 0 :
256- warnings .append ("no bytes were written — dataset is empty" )
376+ warnings .append ("no bytes were written - dataset is empty" )
257377
258378 return GeneratedDummyReport (
259379 output_dir = str (output_dir ),
@@ -263,6 +383,10 @@ def generate_dummy_dataset(config: DummyGeneratorConfig) -> GeneratedDummyReport
263383 directory_count = len (dirs_to_create ),
264384 extension_distribution = ext_dist ,
265385 plausibility = plausibility ,
386+ container_size_bytes = container_size_bytes ,
387+ occupancy_ratio = occupancy_ratio ,
388+ size_distribution = size_distribution ,
389+ evaluation_report_path = str (report_path ),
266390 warnings = warnings ,
267391 )
268392
@@ -318,16 +442,12 @@ def import_sample_directory(
318442 dest_file = dst / rel
319443 dest_file .parent .mkdir (parents = True , exist_ok = True )
320444 try :
321- dest_file .write_bytes (item .read_bytes ())
322- files_copied += 1
323- bytes_copied += size
324- except OSError as exc :
325- warnings .append (f"could not copy { item .name } : { exc } " )
326-
327- return files_copied , bytes_copied , warnings
445+ data = item .read_bytes ()
446+ dest_file .write_bytes (data )
447+ except OSError :
448+ continue
328449
450+ files_copied += 1
451+ bytes_copied += len (data )
329452
330- def _random_filename (ext : str ) -> str :
331- length = _urandom_int (6 , 14 )
332- stem = _random_alnum_bytes (length ).decode ()
333- return f"{ stem } .{ ext } "
453+ return files_copied , bytes_copied , warnings
0 commit comments