2828import lombok .*;
2929import lombok .extern .slf4j .Slf4j ;
3030import org .apache .commons .codec .digest .DigestUtils ;
31+ import org .apache .commons .io .FilenameUtils ;
3132import org .apache .tika .Tika ;
3233import org .apache .tika .mime .MediaType ;
3334import org .apache .tika .mime .MediaTypeRegistry ;
35+ import org .jetbrains .annotations .NotNull ;
3436
3537import java .io .ByteArrayInputStream ;
3638import java .io .File ;
3739import java .io .IOException ;
3840import java .nio .charset .Charset ;
3941import java .nio .file .Files ;
4042import java .util .*;
43+ import java .util .concurrent .ConcurrentHashMap ;
44+ import java .util .concurrent .atomic .AtomicLong ;
4145import java .util .zip .CRC32C ;
4246import java .util .zip .Checksum ;
4347
@@ -58,6 +62,14 @@ public class Winnowing {
5862 private static final Tika tika = new Tika ();
5963 private static final MediaTypeRegistry mediaTypeRegistry = MediaTypeRegistry .getDefaultRegistry ();
6064
65+ /**
66+ * Shared counter for generating unique IDs.
67+ * idGenerator is shared across all Winnowing instances,
68+ * ensuring sequential and unique ID generation for path obfuscation
69+ * regardless of how many instances of Winnowing are created.
70+ */
71+ private static final AtomicLong idGenerator = new AtomicLong (0 );
72+
6173 @ Builder .Default
6274 private Boolean skipSnippets = Boolean .FALSE ; // Skip snippet generations
6375 @ Builder .Default
@@ -68,6 +80,29 @@ public class Winnowing {
6880 private boolean hpsm = Boolean .FALSE ; // Enable High Precision Snippet Matching data collection
6981 @ Builder .Default
7082 private int snippetLimit = MAX_LONG_LINE_CHARS ; // Enable limiting of size of a single line of snippet generation
83+ @ Builder .Default
84+ private Map <String , String > obfuscationMap = new ConcurrentHashMap <>();
85+
86+ /**
87+ * Resolves the real file path for a given obfuscated path.
88+ * This method is thread-safe and can be called concurrently from multiple threads.
89+ *
90+ * @param obfuscatedPath the obfuscated path
91+ * @return the real file path corresponding to the provided obfuscated path, or null if no mapping exists
92+ */
93+ public String deobfuscateFilePath (@ NotNull String obfuscatedPath ) {
94+ return obfuscationMap .get (obfuscatedPath );
95+ }
96+
97+
98+ /**
99+ * Retrieves the size of the obfuscation map.
100+ *
101+ * @return the number of entries in the obfuscation map
102+ */
103+ public int getObfuscationMapSize () {
104+ return obfuscationMap .size ();
105+ }
71106
72107 /**
73108 * Calculate the WFP (fingerprint) for the given file
@@ -112,7 +147,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
112147 char [] fileContents = (new String (contents , Charset .defaultCharset ())).toCharArray ();
113148 String fileMD5 = DigestUtils .md5Hex (contents );
114149 StringBuilder wfpBuilder = new StringBuilder ();
115- // TODO add obfuscation of the filename here
150+
151+ if (obfuscate ) {
152+ filename = obfuscateFilePath (filename );
153+ }
154+
116155 wfpBuilder .append (String .format ("file=%s,%d,%s\n " , fileMD5 , contents .length , filename ));
117156 if (binFile || this .skipSnippets || this .skipSnippets (filename , fileContents )) {
118157 return wfpBuilder .toString ();
@@ -180,6 +219,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
180219 return wfpBuilder .toString ();
181220 }
182221
222+ /**
223+ * Obfuscates the given file path by replacing it with a generated unique identifier while
224+ * retaining its original file extension.
225+ * This method is thread-safe and can be called concurrently from multiple threads.
226+ *
227+ * @param originalPath the original file path to be obfuscated; must not be null
228+ * @return the obfuscated file path with a unique identifier and the original file extension
229+ */
230+ private String obfuscateFilePath (@ NotNull String originalPath ) {
231+ final String extension = extractExtension (originalPath );
232+
233+ // Generate a unique identifier for the obfuscated file using a thread-safe approach
234+ final String obfuscatedPath = idGenerator .getAndIncrement () + extension ;
235+ this .obfuscationMap .put (obfuscatedPath , originalPath );
236+ return obfuscatedPath ;
237+ }
238+
239+ /**
240+ * Extracts file extension from the given path, including the leading dot.
241+ *
242+ * @param path the file path or name (must not be null)
243+ * @return the file extension with leading dot (e.g., ".txt") or empty string if no extension
244+ */
245+ private String extractExtension (@ NotNull String path ) {
246+ try {
247+ String extractedExtension = FilenameUtils .getExtension (path ).trim ();
248+ return extractedExtension .isEmpty () ? "" : "." + extractedExtension ;
249+ } catch (IllegalArgumentException e ) {
250+ log .debug ("Could not extract extension from filename '{}': {}" ,
251+ path , e .getMessage ());
252+ return "" ;
253+ }
254+ }
255+
183256 /**
184257 * Determine if a file/contents should be skipped for snippet generation or not
185258 * @param filename filename for the contents (optional)
0 commit comments