11using System . Buffers ;
22using System . IO . Hashing ;
3+ using System . Text . Json ;
4+ using DeepPurge . Core . App ;
35using DeepPurge . Core . Diagnostics ;
46using DeepPurge . Core . Safety ;
57
@@ -22,17 +24,29 @@ public class DuplicateGroup
2224/// million files doesn't allocate a million 1 MB arrays. Matches the
2325/// algorithm used by Czkawka / dupeGuru / fdupes.
2426/// </summary>
public class HashCacheEntry
{
    // NOTE(review): the XML <summary> block that ends directly above this type
    // talks about the duplicate-finding algorithm, not about cache entries. It
    // appears to belong to DuplicateFinder and was separated from it when this
    // class was inserted in between - confirm and move it back.
    //
    // One record of the persisted hash cache. Size and LastWriteTicks are the
    // file metadata the hashes were stored against; the lookup helpers compare
    // them with the file on disk before trusting HeadHash / FullHash.

    /// <summary>File length in bytes recorded together with the hashes.</summary>
    public long Size { get; set; }

    /// <summary>Last-write time (UTC ticks) recorded together with the hashes.</summary>
    public long LastWriteTicks { get; set; }

    /// <summary>Hash produced by the head-of-file pass.</summary>
    public ulong HeadHash { get; set; }

    /// <summary>Hash of the whole file; only meaningful when <see cref="HasFullHash"/> is true.</summary>
    public ulong FullHash { get; set; }

    /// <summary>True once <see cref="FullHash"/> has been stored for this entry.</summary>
    public bool HasFullHash { get; set; }
}
35+
2536public class DuplicateFinder
2637{
2738 private const int FirstChunkBytes = 1 * 1024 * 1024 ;
2839 private const long MinFileBytes = 4 * 1024 ;
40+ private static readonly string CachePath = Path . Combine ( DataPaths . Root , "hash-cache.json" ) ;
41+ private Dictionary < string , HashCacheEntry > _cache = new ( StringComparer . OrdinalIgnoreCase ) ;
2942
3043 public async Task < List < DuplicateGroup > > FindAsync (
3144 IEnumerable < string > roots ,
3245 long minBytes = MinFileBytes ,
3346 IProgress < string > ? progress = null ,
3447 CancellationToken ct = default )
3548 {
49+ LoadCache ( ) ;
3650 var bySize = await Task . Run ( ( ) => GroupBySize ( roots , minBytes , progress , ct ) , ct ) ;
3751 progress ? . Report ( $ "Stage 1: { bySize . Count } size-collision groups") ;
3852
@@ -88,6 +102,7 @@ public async Task<List<DuplicateGroup>> FindAsync(
88102 }
89103 }
90104
105+ SaveCache ( ) ;
91106 return finalGroups
92107 . OrderByDescending ( g => g . WastedBytes )
93108 . ToList ( ) ;
@@ -194,8 +209,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
194209 }
195210 }
196211
197- private static async Task < ulong ? > HashHeadAsync ( string path , CancellationToken ct )
212+ private async Task < ulong ? > HashHeadAsync ( string path , CancellationToken ct )
198213 {
214+ if ( TryGetCachedHead ( path , out var cached ) ) return cached ;
199215 byte [ ] ? rented = null ;
200216 try
201217 {
@@ -213,7 +229,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
213229 }
214230 var hash = new XxHash3 ( ) ;
215231 hash . Append ( rented . AsSpan ( 0 , total ) ) ;
216- return hash . GetCurrentHashAsUInt64 ( ) ;
232+ var result = hash . GetCurrentHashAsUInt64 ( ) ;
233+ UpdateCache ( path , headHash : result ) ;
234+ return result ;
217235 }
218236 catch ( OperationCanceledException ) { throw ; }
219237 catch { return null ; }
@@ -223,8 +241,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
223241 }
224242 }
225243
226- private static async Task < ulong ? > HashFullAsync ( string path , CancellationToken ct )
244+ private async Task < ulong ? > HashFullAsync ( string path , CancellationToken ct )
227245 {
246+ if ( TryGetCachedFull ( path , out var cached ) ) return cached ;
228247 const int BufferBytes = 256 * 1024 ;
229248 byte [ ] ? rented = null ;
230249 try
@@ -238,7 +257,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
238257 int read ;
239258 while ( ( read = await fs . ReadAsync ( rented . AsMemory ( 0 , BufferBytes ) , ct ) ) > 0 )
240259 hash . Append ( rented . AsSpan ( 0 , read ) ) ;
241- return hash . GetCurrentHashAsUInt64 ( ) ;
260+ var result = hash . GetCurrentHashAsUInt64 ( ) ;
261+ UpdateCache ( path , fullHash : result ) ;
262+ return result ;
242263 }
243264 catch ( OperationCanceledException ) { throw ; }
244265 catch { return null ; }
@@ -247,4 +268,73 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
247268 if ( rented != null ) ArrayPool < byte > . Shared . Return ( rented ) ;
248269 }
249270 }
271+
/// <summary>
/// Looks up a previously stored head hash for <paramref name="path"/>.
/// The cached value is only used when the file's current length and
/// last-write time (UTC ticks) equal the values stored in the entry.
/// </summary>
/// <param name="path">File path used as the cache key.</param>
/// <param name="hash">Cached head hash on success; 0 otherwise.</param>
/// <returns>
/// True when a matching entry was found; false when there is no entry, the
/// metadata differs, or reading the file metadata throws.
/// </returns>
private bool TryGetCachedHead(string path, out ulong hash)
{
    hash = default;

    if (_cache.TryGetValue(path, out var cachedEntry))
    {
        try
        {
            var info = new FileInfo(path);
            if (info.Length == cachedEntry.Size
                && info.LastWriteTimeUtc.Ticks == cachedEntry.LastWriteTicks)
            {
                // NOTE(review): the entry has no "has head hash" flag, so this
                // relies on every stored entry actually carrying a head hash.
                hash = cachedEntry.HeadHash;
                return true;
            }
        }
        catch
        {
            // Metadata could not be read: treat as a cache miss.
        }
    }

    return false;
}
285+
/// <summary>
/// Looks up a previously stored full-file hash for <paramref name="path"/>.
/// Requires an entry with <c>HasFullHash</c> set whose stored size and
/// last-write ticks equal the file's current metadata.
/// </summary>
/// <param name="path">File path used as the cache key.</param>
/// <param name="hash">Cached full hash on success; 0 otherwise.</param>
/// <returns>
/// True when a usable full hash was found; false when there is no entry, no
/// full hash was stored, the metadata differs, or reading it throws.
/// </returns>
private bool TryGetCachedFull(string path, out ulong hash)
{
    hash = default;

    // No entry, or an entry that only ever received a head hash.
    if (!_cache.TryGetValue(path, out var cachedEntry) || !cachedEntry.HasFullHash)
    {
        return false;
    }

    try
    {
        var info = new FileInfo(path);
        if (info.Length == cachedEntry.Size
            && info.LastWriteTimeUtc.Ticks == cachedEntry.LastWriteTicks)
        {
            hash = cachedEntry.FullHash;
            return true;
        }

        return false;
    }
    catch
    {
        // Metadata could not be read: treat as a cache miss.
        return false;
    }
}
300+
/// <summary>
/// Records a freshly computed hash for <paramref name="path"/> against the
/// file's current length and last-write time.
/// </summary>
/// <remarks>
/// An existing entry is reused only when its stored size and last-write ticks
/// still equal the file's current metadata. Otherwise the entry is replaced,
/// so a hash computed for earlier content can never be paired with the new
/// metadata and later be served as valid. Because entries carry no
/// "has head hash" flag, an entry is never created from a full hash alone:
/// in that case any stale entry is dropped and nothing is stored.
/// Failures while reading file metadata are ignored (best-effort cache).
/// </remarks>
/// <param name="path">File path used as the cache key.</param>
/// <param name="headHash">Head hash to store, or null to leave it unchanged.</param>
/// <param name="fullHash">Full-file hash to store, or null to leave it unchanged.</param>
private void UpdateCache(string path, ulong? headHash = null, ulong? fullHash = null)
{
    try
    {
        var fi = new FileInfo(path);
        long size = fi.Length;
        long ticks = fi.LastWriteTimeUtc.Ticks;

        if (!_cache.TryGetValue(path, out var entry)
            || entry is null
            || entry.Size != size
            || entry.LastWriteTicks != ticks)
        {
            if (!headHash.HasValue)
            {
                // Only a full hash is available and there is no head hash that
                // is known to match the current file state. Storing the entry
                // would let the head lookup return a wrong value, so skip it.
                _cache.Remove(path);
                return;
            }

            // New file, or the file changed since the entry was written:
            // start from a clean entry so old hashes are discarded.
            entry = new HashCacheEntry { Size = size, LastWriteTicks = ticks };
            _cache[path] = entry;
        }

        if (headHash.HasValue)
        {
            entry.HeadHash = headHash.Value;
        }

        if (fullHash.HasValue)
        {
            entry.FullHash = fullHash.Value;
            entry.HasFullHash = true;
        }
    }
    catch
    {
        // Best effort: a failed metadata read only means this hash is not cached.
    }
}
318+
/// <summary>
/// Loads the persisted hash cache from <c>CachePath</c> into <c>_cache</c>.
/// </summary>
/// <remarks>
/// When the cache file does not exist the current in-memory cache is left
/// untouched. The deserialized entries are copied into a dictionary created
/// with <see cref="StringComparer.OrdinalIgnoreCase"/>, because the
/// dictionary produced by the serializer uses the default case-sensitive
/// comparer. Null entries are skipped. If the file cannot be read or parsed
/// the cache is reset to empty and a warning is logged.
/// </remarks>
private void LoadCache()
{
    try
    {
        if (!File.Exists(CachePath)) return;

        var json = File.ReadAllText(CachePath);
        var loaded = JsonSerializer.Deserialize<Dictionary<string, HashCacheEntry>>(json);

        var rebuilt = new Dictionary<string, HashCacheEntry>(StringComparer.OrdinalIgnoreCase);
        if (loaded != null)
        {
            foreach (var pair in loaded)
            {
                // Indexer assignment (not Add) so keys that differ only by
                // case collapse instead of throwing.
                if (pair.Value != null) rebuilt[pair.Key] = pair.Value;
            }
        }

        _cache = rebuilt;
    }
    catch (Exception ex)
    {
        // An unreadable cache only costs re-hashing; start empty.
        Log.Warn($"LoadCache: {ex.Message}");
        _cache = new(StringComparer.OrdinalIgnoreCase);
    }
}
330+
/// <summary>
/// Serializes <c>_cache</c> to JSON and stores it at <c>CachePath</c>.
/// </summary>
/// <remarks>
/// The JSON is written to a sibling temporary file first and then moved over
/// the real cache file, so an interrupted write cannot leave a truncated
/// cache behind. The target directory is created if it is missing. Any
/// failure is logged as a warning and otherwise ignored.
/// </remarks>
private void SaveCache()
{
    var tempPath = CachePath + ".tmp";
    try
    {
        var directory = Path.GetDirectoryName(CachePath);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }

        var json = JsonSerializer.Serialize(_cache);
        File.WriteAllText(tempPath, json);
        File.Move(tempPath, CachePath, overwrite: true);
    }
    catch (Exception ex)
    {
        Log.Warn($"SaveCache: {ex.Message}");

        // Do not leave a half-written temp file behind.
        try { File.Delete(tempPath); } catch { }
    }
}
250340}
0 commit comments