|
28 | 28 | import lombok.*; |
29 | 29 | import lombok.extern.slf4j.Slf4j; |
30 | 30 | import org.apache.commons.codec.digest.DigestUtils; |
| 31 | +import org.apache.commons.io.FilenameUtils; |
31 | 32 | import org.apache.tika.Tika; |
32 | 33 | import org.apache.tika.mime.MediaType; |
33 | 34 | import org.apache.tika.mime.MediaTypeRegistry; |
| 35 | +import org.jetbrains.annotations.NotNull; |
34 | 36 |
|
35 | 37 | import java.io.ByteArrayInputStream; |
36 | 38 | import java.io.File; |
37 | 39 | import java.io.IOException; |
38 | 40 | import java.nio.charset.Charset; |
39 | 41 | import java.nio.file.Files; |
40 | 42 | import java.util.*; |
| 43 | +import java.util.concurrent.ConcurrentHashMap; |
| 44 | +import java.util.concurrent.atomic.AtomicLong; |
41 | 45 | import java.util.zip.CRC32C; |
42 | 46 | import java.util.zip.Checksum; |
43 | 47 |
|
@@ -68,6 +72,31 @@ public class Winnowing { |
68 | 72 | private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection |
69 | 73 | @Builder.Default |
70 | 74 | private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation |
| 75 | + @Builder.Default |
| 76 | + private Map<String, String> obfuscationMap = new ConcurrentHashMap<>(); |
| 77 | + @Builder.Default |
| 78 | + private static final AtomicLong idGenerator = new AtomicLong(0); //Incremental ids used for obfuscating path |
| 79 | + |
| 80 | + /** |
| 81 | + * Resolves the real file path for a given obfuscated path. |
| 82 | + * This method is thread-safe and can be called concurrently from multiple threads. |
| 83 | + * |
| 84 | + * @param obfuscatedPath the obfuscated path |
| 85 | + * @return the real file path corresponding to the provided obfuscated path, or null if no mapping exists |
| 86 | + */ |
| 87 | + public String deobfuscateFilePath(@NotNull String obfuscatedPath) { |
| 88 | + return obfuscationMap.get(obfuscatedPath); |
| 89 | + } |
| 90 | + |
| 91 | + |
| 92 | + /** |
| 93 | + * Retrieves the size of the obfuscation map. |
| 94 | + * |
| 95 | + * @return the number of entries in the obfuscation map |
| 96 | + */ |
| 97 | + public int getObfuscationMapSize() { |
| 98 | + return obfuscationMap.size(); |
| 99 | + } |
71 | 100 |
|
72 | 101 | /** |
73 | 102 | * Calculate the WFP (fingerprint) for the given file |
@@ -112,7 +141,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c |
112 | 141 | char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray(); |
113 | 142 | String fileMD5 = DigestUtils.md5Hex(contents); |
114 | 143 | StringBuilder wfpBuilder = new StringBuilder(); |
115 | | - // TODO add obfuscation of the filename here |
| 144 | + |
| 145 | + if (obfuscate) { |
| 146 | + filename = obfuscateFilePath(filename); |
| 147 | + } |
| 148 | + |
116 | 149 | wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename)); |
117 | 150 | if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) { |
118 | 151 | return wfpBuilder.toString(); |
@@ -180,6 +213,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c |
180 | 213 | return wfpBuilder.toString(); |
181 | 214 | } |
182 | 215 |
|
| 216 | + /** |
| 217 | + * Obfuscates the given file path by replacing it with a generated unique identifier while |
| 218 | + * retaining its original file extension. |
| 219 | + * This method is thread-safe and can be called concurrently from multiple threads. |
| 220 | + * |
| 221 | + * @param originalPath the original file path to be obfuscated; must not be null |
| 222 | + * @return the obfuscated file path with a unique identifier and the original file extension |
| 223 | + */ |
| 224 | + private String obfuscateFilePath(@NotNull String originalPath) { |
| 225 | + final String extension = extractExtension(originalPath); |
| 226 | + |
| 227 | + // Generate a unique identifier for the obfuscated file using a thread-safe approach |
| 228 | + final String obfuscatedPath = idGenerator.getAndIncrement() + extension; |
| 229 | + this.obfuscationMap.put(obfuscatedPath, originalPath); |
| 230 | + return obfuscatedPath; |
| 231 | + } |
| 232 | + |
| 233 | + /** |
| 234 | + * Extracts file extension from the given path, including the leading dot. |
| 235 | + * |
| 236 | + * @param path the file path or name (must not be null) |
| 237 | + * @return the file extension with leading dot (e.g., ".txt") or empty string if no extension |
| 238 | + */ |
| 239 | + private String extractExtension(@NotNull String path) { |
| 240 | + try { |
| 241 | + String extractedExtension = FilenameUtils.getExtension(path).trim(); |
| 242 | + return extractedExtension.isEmpty() ? "" : "." + extractedExtension; |
| 243 | + } catch (IllegalArgumentException e) { |
| 244 | + log.debug("Could not extract extension from filename '{}': {}", |
| 245 | + path, e.getMessage()); |
| 246 | + return ""; |
| 247 | + } |
| 248 | + } |
| 249 | + |
183 | 250 | /** |
184 | 251 | * Determine if a file/contents should be skipped for snippet generation or not |
185 | 252 | * @param filename filename for the contents (optional) |
|
0 commit comments