Skip to content

Commit 4edb02e

Browse files
committed
feat: persistent hash cache for duplicate finder — instant re-scans via (path,size,mtime) invalidation
1 parent 52da1f7 commit 4edb02e

1 file changed

Lines changed: 94 additions & 4 deletions

File tree

src/DeepPurge.Core/FileSystem/DuplicateFinder.cs

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
using System.Buffers;
22
using System.IO.Hashing;
3+
using System.Text.Json;
4+
using DeepPurge.Core.App;
35
using DeepPurge.Core.Diagnostics;
46
using DeepPurge.Core.Safety;
57

@@ -22,17 +24,29 @@ public class DuplicateGroup
2224
/// million files doesn't allocate a million 1 MB arrays. Matches the
2325
/// algorithm used by Czkawka / dupeGuru / fdupes.
2426
/// </summary>
27+
public class HashCacheEntry
{
    // One persisted record of the duplicate finder's hash cache. The stored
    // Size / LastWriteTicks pair is compared against the file on disk to decide
    // whether the stored hashes may be reused. Property names and order are part
    // of the on-disk JSON shape, so they are kept exactly as declared.

    /// <summary>File length in bytes at the time the entry was written.</summary>
    public long Size { get; set; }

    /// <summary>UTC last-write time of the file, in ticks, at the time the entry was written.</summary>
    public long LastWriteTicks { get; set; }

    /// <summary>Hash of the leading chunk of the file. There is no companion "has value" flag; it defaults to 0.</summary>
    public ulong HeadHash { get; set; }

    /// <summary>Hash of the whole file; only meaningful when <see cref="HasFullHash"/> is true.</summary>
    public ulong FullHash { get; set; }

    /// <summary>True once <see cref="FullHash"/> has been stored for this entry.</summary>
    public bool HasFullHash { get; set; }
}
35+
2536
public class DuplicateFinder
2637
{
2738
private const int FirstChunkBytes = 1 * 1024 * 1024;
2839
private const long MinFileBytes = 4 * 1024;
40+
private static readonly string CachePath = Path.Combine(DataPaths.Root, "hash-cache.json");
41+
private Dictionary<string, HashCacheEntry> _cache = new(StringComparer.OrdinalIgnoreCase);
2942

3043
public async Task<List<DuplicateGroup>> FindAsync(
3144
IEnumerable<string> roots,
3245
long minBytes = MinFileBytes,
3346
IProgress<string>? progress = null,
3447
CancellationToken ct = default)
3548
{
49+
LoadCache();
3650
var bySize = await Task.Run(() => GroupBySize(roots, minBytes, progress, ct), ct);
3751
progress?.Report($"Stage 1: {bySize.Count} size-collision groups");
3852

@@ -88,6 +102,7 @@ public async Task<List<DuplicateGroup>> FindAsync(
88102
}
89103
}
90104

105+
SaveCache();
91106
return finalGroups
92107
.OrderByDescending(g => g.WastedBytes)
93108
.ToList();
@@ -194,8 +209,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
194209
}
195210
}
196211

197-
private static async Task<ulong?> HashHeadAsync(string path, CancellationToken ct)
212+
private async Task<ulong?> HashHeadAsync(string path, CancellationToken ct)
198213
{
214+
if (TryGetCachedHead(path, out var cached)) return cached;
199215
byte[]? rented = null;
200216
try
201217
{
@@ -213,7 +229,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
213229
}
214230
var hash = new XxHash3();
215231
hash.Append(rented.AsSpan(0, total));
216-
return hash.GetCurrentHashAsUInt64();
232+
var result = hash.GetCurrentHashAsUInt64();
233+
UpdateCache(path, headHash: result);
234+
return result;
217235
}
218236
catch (OperationCanceledException) { throw; }
219237
catch { return null; }
@@ -223,8 +241,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
223241
}
224242
}
225243

226-
private static async Task<ulong?> HashFullAsync(string path, CancellationToken ct)
244+
private async Task<ulong?> HashFullAsync(string path, CancellationToken ct)
227245
{
246+
if (TryGetCachedFull(path, out var cached)) return cached;
228247
const int BufferBytes = 256 * 1024;
229248
byte[]? rented = null;
230249
try
@@ -238,7 +257,9 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
238257
int read;
239258
while ((read = await fs.ReadAsync(rented.AsMemory(0, BufferBytes), ct)) > 0)
240259
hash.Append(rented.AsSpan(0, read));
241-
return hash.GetCurrentHashAsUInt64();
260+
var result = hash.GetCurrentHashAsUInt64();
261+
UpdateCache(path, fullHash: result);
262+
return result;
242263
}
243264
catch (OperationCanceledException) { throw; }
244265
catch { return null; }
@@ -247,4 +268,73 @@ private static IEnumerable<string> SafeEnumerate(string root, CancellationToken
247268
if (rented != null) ArrayPool<byte>.Shared.Return(rented);
248269
}
249270
}
271+
272+
/// <summary>
/// Tries to reuse the cached head hash for <paramref name="path"/>.
/// </summary>
/// <param name="path">File whose cached head hash is wanted.</param>
/// <param name="hash">The cached head hash on success; 0 otherwise.</param>
/// <returns>
/// True when an entry exists for the path and the file's current length and
/// UTC last-write ticks equal the values stored in that entry; false when
/// there is no entry, the metadata differs, or the file cannot be inspected.
/// </returns>
private bool TryGetCachedHead(string path, out ulong hash)
{
    hash = 0;

    if (_cache.TryGetValue(path, out var cachedEntry))
    {
        try
        {
            var info = new FileInfo(path);
            if (info.Length == cachedEntry.Size
                && info.LastWriteTimeUtc.Ticks == cachedEntry.LastWriteTicks)
            {
                hash = cachedEntry.HeadHash;
                return true;
            }
        }
        catch
        {
            // Any failure while inspecting the file is treated as a cache miss.
        }
    }

    return false;
}
285+
286+
/// <summary>
/// Tries to reuse the cached full-file hash for <paramref name="path"/>.
/// </summary>
/// <param name="path">File whose cached full hash is wanted.</param>
/// <param name="hash">The cached full hash on success; 0 otherwise.</param>
/// <returns>
/// True when an entry with a stored full hash exists and the file's current
/// length and UTC last-write ticks equal the values stored in that entry;
/// false otherwise, including when the file cannot be inspected.
/// </returns>
private bool TryGetCachedFull(string path, out ulong hash)
{
    hash = 0;

    // No entry, or an entry that only ever received a head hash: nothing to reuse.
    if (!_cache.TryGetValue(path, out var cachedEntry) || !cachedEntry.HasFullHash)
    {
        return false;
    }

    try
    {
        var info = new FileInfo(path);
        if (info.Length == cachedEntry.Size
            && info.LastWriteTimeUtc.Ticks == cachedEntry.LastWriteTicks)
        {
            hash = cachedEntry.FullHash;
            return true;
        }

        return false;
    }
    catch
    {
        // Any failure while inspecting the file is treated as a cache miss.
        return false;
    }
}
300+
301+
/// <summary>
/// Records a freshly computed hash for <paramref name="path"/> together with
/// the file's current length and UTC last-write ticks.
/// </summary>
/// <param name="path">File the hash was computed for.</param>
/// <param name="headHash">Head hash to store, or null to leave it untouched.</param>
/// <param name="fullHash">Full hash to store, or null to leave it untouched.</param>
/// <remarks>
/// Best-effort: any failure (for example the file vanished) is swallowed and
/// the cache is simply not updated.
/// </remarks>
private void UpdateCache(string path, ulong? headHash = null, ulong? fullHash = null)
{
    try
    {
        var fi = new FileInfo(path);
        long size = fi.Length;
        long lastWriteTicks = fi.LastWriteTimeUtc.Ticks;

        // If the stored metadata no longer matches the file, every hash in the old
        // entry describes different content. Start from a fresh entry so a stale
        // FullHash (with HasFullHash still true) cannot be validated against the
        // refreshed Size/LastWriteTicks and returned by a later lookup.
        if (!_cache.TryGetValue(path, out var entry)
            || entry is null
            || entry.Size != size
            || entry.LastWriteTicks != lastWriteTicks)
        {
            entry = new HashCacheEntry { Size = size, LastWriteTicks = lastWriteTicks };
            _cache[path] = entry;
        }

        // NOTE(review): HashCacheEntry has no "has head hash" flag, so an entry
        // created by a full-hash-only update carries HeadHash == 0.
        if (headHash.HasValue) entry.HeadHash = headHash.Value;
        if (fullHash.HasValue)
        {
            entry.FullHash = fullHash.Value;
            entry.HasFullHash = true;
        }
    }
    catch
    {
        // Deliberately ignored: caching is an optimisation, not a requirement.
    }
}
318+
319+
/// <summary>
/// Loads the persisted hash cache from <c>CachePath</c> into <c>_cache</c>.
/// </summary>
/// <remarks>
/// When the cache file does not exist the in-memory cache is left as it is.
/// When the file cannot be read or parsed the in-memory cache is reset to empty.
/// </remarks>
private void LoadCache()
{
    try
    {
        if (!File.Exists(CachePath)) return;

        var json = File.ReadAllText(CachePath);
        var loaded = JsonSerializer.Deserialize<Dictionary<string, HashCacheEntry>>(json);

        // The deserializer builds the dictionary with the default (case-sensitive)
        // comparer. Copy into a case-insensitive dictionary so lookups behave the
        // same after a reload as they do on a freshly constructed cache.
        var rebuilt = new Dictionary<string, HashCacheEntry>(StringComparer.OrdinalIgnoreCase);
        if (loaded is not null)
        {
            foreach (var (key, value) in loaded)
            {
                // A JSON null value would become a null entry; drop it. The indexer
                // also folds keys that differ only by case, keeping the last one.
                if (value is not null) rebuilt[key] = value;
            }
        }

        _cache = rebuilt;
    }
    catch
    {
        // Unreadable or corrupt cache file: fall back to an empty cache.
        _cache = new(StringComparer.OrdinalIgnoreCase);
    }
}
330+
331+
/// <summary>
/// Serializes <c>_cache</c> to JSON and persists it at <c>CachePath</c>.
/// </summary>
/// <remarks>
/// The JSON is written to a sibling temporary file first and then moved over the
/// real cache file, so a failure part-way through writing does not leave a
/// truncated cache behind. Failures are logged as warnings and never thrown.
/// </remarks>
private void SaveCache()
{
    var tempPath = CachePath + ".tmp";
    try
    {
        // Make sure the target directory exists; CreateDirectory is a no-op when it does.
        var directory = Path.GetDirectoryName(CachePath);
        if (!string.IsNullOrEmpty(directory)) Directory.CreateDirectory(directory);

        var json = JsonSerializer.Serialize(_cache);
        File.WriteAllText(tempPath, json);
        File.Move(tempPath, CachePath, overwrite: true);
    }
    catch (Exception ex)
    {
        Log.Warn($"SaveCache: {ex.Message}");

        // Best-effort cleanup of a leftover temporary file.
        try { File.Delete(tempPath); } catch { }
    }
}
250340
}

0 commit comments

Comments
 (0)