@@ -33,7 +33,7 @@ mod gui;
3333
3434use std:: collections:: HashMap ;
3535use std:: io:: { Cursor , Read } ;
36- use std:: path:: PathBuf ;
36+ use std:: path:: { Path , PathBuf } ;
3737use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
3838use std:: sync:: mpsc:: { Receiver , SyncSender , sync_channel} ;
3939use std:: sync:: { Arc , Mutex } ;
@@ -66,6 +66,11 @@ struct RunArgs {
6666 /// the next job and upload of finished results always overlap the GPU run.
6767 #[ arg( long, default_value_t = 1 ) ]
6868 jobs : usize ,
69+ /// Maximum on-disk blob cache size, in GB. Once exceeded, the oldest cached
70+ /// blobs are evicted. Eviction is safe: a running job holds its blob in memory,
71+ /// so the only cost of dropping a file is a re-download on a later cache miss.
72+ #[ arg( long, default_value_t = 20 ) ]
73+ cache_max_gb : u64 ,
6974}
7075
7176// ------------------------------------------------------------- pullOne response
@@ -245,10 +250,53 @@ fn fetch_blob(args: &RunArgs, d: &DataRef) -> Result<Vec<u8>> {
245250 let bytes = std:: fs:: read ( & target) ?;
246251 if is_temp {
247252 let _ = std:: fs:: remove_file ( & target) ;
253+ } else {
254+ // We just added a finalized blob; keep the cache under its size cap.
255+ prune_cache ( & cache, args. cache_max_gb . saturating_mul ( 1 << 30 ) ) ;
248256 }
249257 Ok ( bytes)
250258}
251259
/// Keep the blob cache directory under `max_bytes` by evicting the oldest
/// finalized entries first (ordered by modification time).
///
/// Best-effort: if the directory cannot be listed nothing happens, and any file
/// whose metadata cannot be read or which cannot be removed is simply left in
/// place. Entries that are not regular files are ignored and do not count
/// toward the total.
///
/// Files whose name ends in `.part` or `.tmp` are rsurl's in-progress downloads
/// and are never counted or removed, so an active download is not disturbed.
/// The suffix test runs on a lossy conversion of the name, so an in-progress
/// file is still recognised when the rest of its name is not valid UTF-8.
/// Everything else is fair game, since `fetch_blob` reads each blob fully into
/// memory and no running job depends on its file surviving.
///
/// * `cache` - directory holding the cached blobs.
/// * `max_bytes` - size cap; eviction stops as soon as the running total of
///   finalized files is at or below it.
fn prune_cache(cache: &Path, max_bytes: u64) {
    let Ok(rd) = std::fs::read_dir(cache) else {
        return;
    };
    let mut entries: Vec<(std::time::SystemTime, u64, PathBuf)> = Vec::new();
    let mut total: u64 = 0;
    for e in rd.flatten() {
        // Lossy conversion only rewrites invalid byte sequences, so the ASCII
        // suffix survives and non-UTF-8 names no longer bypass this guard.
        let raw_name = e.file_name();
        let name = raw_name.to_string_lossy();
        if name.ends_with(".part") || name.ends_with(".tmp") {
            continue; // an in-progress download rsurl still owns
        }
        let Ok(meta) = e.metadata() else { continue };
        if !meta.is_file() {
            continue;
        }
        // A file without a readable mtime sorts as the oldest possible entry.
        let mtime = meta.modified().unwrap_or(std::time::UNIX_EPOCH);
        let len = meta.len();
        total = total.saturating_add(len);
        entries.push((mtime, len, e.path()));
    }
    if total <= max_bytes {
        return;
    }
    entries.sort_by_key(|(mtime, _, _)| *mtime); // oldest first
    for (_, len, path) in entries {
        if total <= max_bytes {
            break;
        }
        // Only count the bytes as reclaimed when the delete actually succeeded.
        if std::fs::remove_file(&path).is_ok() {
            total = total.saturating_sub(len);
            eprintln!("[decryptd] cache: evicted {} ({len} B)", path.display());
        }
    }
}
299+
252300/// Decode an RFC 2397 `data:` URI into its raw bytes. Handles the two payload
253301/// encodings: `;base64` (the platform's case — base64 over the gzip/xz blob) and
254302/// the default percent-encoding. The media type in the header is ignored; the
@@ -654,6 +702,7 @@ fn prefetch_loop(
654702/// GPU stage: the serialized step. One per `--jobs`; each takes a ready job, runs it,
655703/// and hands the result to the upload stage.
656704fn run_loop (
705+ args : Arc < RunArgs > ,
657706 ready : Arc < Mutex < Receiver < ReadyJob > > > ,
658707 inflight : InFlight ,
659708 done : SyncSender < FinishedJob > ,
@@ -677,6 +726,11 @@ fn run_loop(
677726 Err ( e) => {
678727 eprintln ! ( "[decryptd] run error: {e:#}" ) ;
679728 inflight. lock ( ) . unwrap ( ) . remove ( & frag_id) ;
729+ // Back off before taking the next fragment. Without this a
730+ // persistent GPU fault (OOM, driver wedged, no compatible cubin)
731+ // spins here, claiming + downloading fragments as fast as the
732+ // network allows and burning the work pool for nothing.
733+ thread:: sleep ( Duration :: from_secs ( args. idle_secs ) ) ;
680734 }
681735 }
682736 }
@@ -798,10 +852,15 @@ fn run_worker(args: RunArgs, status: Status) -> Result<()> {
798852 }
799853 let mut runners = Vec :: new ( ) ;
800854 for _ in 0 ..jobs {
801- let ( ready_rx, inflight, done_tx) = ( ready_rx. clone ( ) , inflight. clone ( ) , done_tx. clone ( ) ) ;
855+ let ( args, ready_rx, inflight, done_tx) = (
856+ args. clone ( ) ,
857+ ready_rx. clone ( ) ,
858+ inflight. clone ( ) ,
859+ done_tx. clone ( ) ,
860+ ) ;
802861 let status = status. clone ( ) ;
803862 runners. push ( thread:: spawn ( move || {
804- run_loop ( ready_rx, inflight, done_tx, status)
863+ run_loop ( args , ready_rx, inflight, done_tx, status)
805864 } ) ) ;
806865 }
807866 drop ( done_tx) ;
@@ -844,4 +903,37 @@ mod tests {
844903 decode_data_url ( "data:application/octet-stream;BASE64,aGVs\n bG8=" ) . expect ( "decode" ) ;
845904 assert_eq ! ( bytes, b"hello" ) ;
846905 }
906+
#[test]
fn prune_cache_evicts_down_to_cap_and_spares_in_progress() {
    // Scratch directory keyed on the process id; wiped first in case an
    // earlier run left it behind.
    let scratch = std::env::temp_dir().join(format!("decryptd-prune-{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&scratch);
    std::fs::create_dir_all(&scratch).unwrap();

    // 30 B of finalized blobs (3 x 10 B) next to one in-progress download that
    // eviction has to leave alone even though the directory is over the cap.
    let in_progress = scratch.join("pending-download.tmp.part");
    ["aaa", "bbb", "ccc"]
        .iter()
        .for_each(|blob| std::fs::write(scratch.join(blob), [0u8; 10]).unwrap());
    std::fs::write(&in_progress, [0u8; 10]).unwrap();

    // With a 15 B cap, two of the three finalized blobs have to go.
    prune_cache(&scratch, 15);

    let mut survivors = 0usize;
    for entry in std::fs::read_dir(&scratch).unwrap().flatten() {
        if let Ok(name) = entry.file_name().into_string() {
            if !name.ends_with(".part") {
                survivors += 1;
            }
        }
    }
    assert_eq!(survivors, 1, "should evict down to one finalized blob");
    assert!(
        in_progress.exists(),
        "in-progress .part must survive eviction"
    );

    let _ = std::fs::remove_dir_all(&scratch);
}
847939}
0 commit comments