Skip to content

Commit 1c44e80

Browse files
committed
feat(SP-2487): implement path obfuscation on Winnowing class
1 parent a686fda commit 1c44e80

3 files changed

Lines changed: 248 additions & 1 deletion

File tree

src/main/java/com/scanoss/Winnowing.java

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,20 @@
2828
import lombok.*;
2929
import lombok.extern.slf4j.Slf4j;
3030
import org.apache.commons.codec.digest.DigestUtils;
31+
import org.apache.commons.io.FilenameUtils;
3132
import org.apache.tika.Tika;
3233
import org.apache.tika.mime.MediaType;
3334
import org.apache.tika.mime.MediaTypeRegistry;
35+
import org.jetbrains.annotations.NotNull;
3436

3537
import java.io.ByteArrayInputStream;
3638
import java.io.File;
3739
import java.io.IOException;
3840
import java.nio.charset.Charset;
3941
import java.nio.file.Files;
4042
import java.util.*;
43+
import java.util.concurrent.ConcurrentHashMap;
44+
import java.util.concurrent.atomic.AtomicLong;
4145
import java.util.zip.CRC32C;
4246
import java.util.zip.Checksum;
4347

@@ -58,6 +62,14 @@ public class Winnowing {
5862
private static final Tika tika = new Tika();
5963
private static final MediaTypeRegistry mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry();
6064

65+
/**
66+
* Shared counter for generating unique IDs.
67+
* idGenerator is shared across all Winnowing instances,
68+
* ensuring sequential and unique ID generation for path obfuscation
69+
* regardless of how many instances of Winnowing are created.
70+
*/
71+
private static final AtomicLong idGenerator = new AtomicLong(0);
72+
6173
@Builder.Default
6274
private Boolean skipSnippets = Boolean.FALSE; // Skip snippet generations
6375
@Builder.Default
@@ -68,6 +80,29 @@ public class Winnowing {
6880
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
6981
@Builder.Default
7082
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
83+
@Builder.Default
84+
private Map<String, String> obfuscationMap = new ConcurrentHashMap<>();
85+
86+
/**
87+
* Resolves the real file path for a given obfuscated path.
88+
* This method is thread-safe and can be called concurrently from multiple threads.
89+
*
90+
* @param obfuscatedPath the obfuscated path
91+
* @return the real file path corresponding to the provided obfuscated path, or null if no mapping exists
92+
*/
93+
public String deobfuscateFilePath(@NotNull String obfuscatedPath) {
94+
return obfuscationMap.get(obfuscatedPath);
95+
}
96+
97+
98+
/**
99+
* Retrieves the size of the obfuscation map.
100+
*
101+
* @return the number of entries in the obfuscation map
102+
*/
103+
public int getObfuscationMapSize() {
104+
return obfuscationMap.size();
105+
}
71106

72107
/**
73108
* Calculate the WFP (fingerprint) for the given file
@@ -112,7 +147,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
112147
char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray();
113148
String fileMD5 = DigestUtils.md5Hex(contents);
114149
StringBuilder wfpBuilder = new StringBuilder();
115-
// TODO add obfuscation of the filename here
150+
151+
if (obfuscate) {
152+
filename = obfuscateFilePath(filename);
153+
}
154+
116155
wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
117156
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
118157
return wfpBuilder.toString();
@@ -180,6 +219,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
180219
return wfpBuilder.toString();
181220
}
182221

222+
/**
223+
* Obfuscates the given file path by replacing it with a generated unique identifier while
224+
* retaining its original file extension.
225+
* This method is thread-safe and can be called concurrently from multiple threads.
226+
*
227+
* @param originalPath the original file path to be obfuscated; must not be null
228+
* @return the obfuscated file path with a unique identifier and the original file extension
229+
*/
230+
private String obfuscateFilePath(@NotNull String originalPath) {
231+
final String extension = extractExtension(originalPath);
232+
233+
// Generate a unique identifier for the obfuscated file using a thread-safe approach
234+
final String obfuscatedPath = idGenerator.getAndIncrement() + extension;
235+
this.obfuscationMap.put(obfuscatedPath, originalPath);
236+
return obfuscatedPath;
237+
}
238+
239+
/**
240+
* Extracts file extension from the given path, including the leading dot.
241+
*
242+
* @param path the file path or name (must not be null)
243+
* @return the file extension with leading dot (e.g., ".txt") or empty string if no extension
244+
*/
245+
private String extractExtension(@NotNull String path) {
246+
try {
247+
String extractedExtension = FilenameUtils.getExtension(path).trim();
248+
return extractedExtension.isEmpty() ? "" : "." + extractedExtension;
249+
} catch (IllegalArgumentException e) {
250+
log.debug("Could not extract extension from filename '{}': {}",
251+
path, e.getMessage());
252+
return "";
253+
}
254+
}
255+
183256
/**
184257
* Determine if a file/contents should be skipped for snippet generation or not
185258
* @param filename filename for the contents (optional)

src/test/java/com/scanoss/TestWinnowing.java

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525

2626
import com.scanoss.exceptions.WinnowingException;
27+
import com.scanoss.utils.WinnowingUtils;
2728
import lombok.extern.slf4j.Slf4j;
2829
import org.junit.After;
2930
import org.junit.Before;
@@ -265,4 +266,74 @@ public void TestWinnowingFileFailures() {
265266

266267
log.info("Finished {} -->", methodName);
267268
}
269+
270+
@Test
271+
public void TestWinnowingObfuscationFileWithExtension() {
272+
String methodName = new Object() {
273+
}.getClass().getEnclosingMethod().getName();
274+
log.info("<-- Starting {}", methodName);
275+
276+
Winnowing winnowing = Winnowing.builder().obfuscate(true).build();
277+
278+
String fileWithExtension = "testing/data/test-file.txt";
279+
280+
String wfpWithExtension = winnowing.wfpForFile(fileWithExtension, fileWithExtension);
281+
assertNotNull("Expected a result from WFP with extension", wfpWithExtension);
282+
283+
String obfuscatedPathWithExtension = WinnowingUtils.extractFilePathFromWFPBlock(wfpWithExtension);
284+
assertNotNull("Should have found an obfuscated path in WFP with extension", obfuscatedPathWithExtension);
285+
286+
String originalPathWithExtension = winnowing.deobfuscateFilePath(obfuscatedPathWithExtension);
287+
assertNotNull("Should be able to retrieve original path with extension", originalPathWithExtension);
288+
assertEquals("Original path should match input file with extension", fileWithExtension, originalPathWithExtension);
289+
290+
log.info("Finished {} -->", methodName);
291+
}
292+
293+
@Test
294+
public void TestWinnowingObfuscationFileWithoutExtension() {
295+
String methodName = new Object() {
296+
}.getClass().getEnclosingMethod().getName();
297+
log.info("<-- Starting {}", methodName);
298+
299+
Winnowing winnowing = Winnowing.builder().obfuscate(true).build();
300+
301+
String fileWithoutExtension = "testing/data/nbproject";
302+
303+
String wfpWithoutExtension = winnowing.wfpForFile(fileWithoutExtension, fileWithoutExtension);
304+
305+
String obfuscatedPathWithoutExtension = WinnowingUtils.extractFilePathFromWFPBlock(wfpWithoutExtension);
306+
assertNotNull("Should have found an obfuscated path in WFP without extension", obfuscatedPathWithoutExtension);
307+
308+
String originalPathWithoutExtension = winnowing.deobfuscateFilePath(obfuscatedPathWithoutExtension);
309+
assertNotNull("Should be able to retrieve original path without extension", originalPathWithoutExtension);
310+
assertEquals("Original path should match input file without extension", fileWithoutExtension, originalPathWithoutExtension);
311+
312+
log.info("Finished {} -->", methodName);
313+
}
314+
315+
@Test
316+
public void TestDeobfuscateFilePathEmpty() {
317+
String methodName = new Object() {
318+
}.getClass().getEnclosingMethod().getName();
319+
log.info("<-- Starting {}", methodName);
320+
321+
Winnowing winnowing = Winnowing.builder().build();
322+
assertNull("Should return null when given an empty obfuscated path", winnowing.deobfuscateFilePath(""));
323+
324+
log.info("Finished {} -->", methodName);
325+
}
326+
327+
@Test
328+
public void TestDeobfuscateFilePathInvalid() {
329+
String methodName = new Object() {
330+
}.getClass().getEnclosingMethod().getName();
331+
log.info("<-- Starting {}", methodName);
332+
333+
Winnowing winnowing = Winnowing.builder().build();
334+
assertNull("Should return null for a non-existent obfuscated path", winnowing.deobfuscateFilePath("invalidPath"));
335+
336+
log.info("Finished {} -->", methodName);
337+
}
268338
}
339+
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// SPDX-License-Identifier: MIT
2+
/*
3+
* Copyright (c) 2025, SCANOSS
4+
*
5+
* Permission is hereby granted, free of charge, to any person obtaining a copy
6+
* of this software and associated documentation files (the "Software"), to deal
7+
* in the Software without restriction, including without limitation the rights
8+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
* copies of the Software, and to permit persons to whom the Software is
10+
* furnished to do so, subject to the following conditions:
11+
*
12+
* The above copyright notice and this permission notice shall be included in
13+
* all copies or substantial portions of the Software.
14+
*
15+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21+
* THE SOFTWARE.
22+
*/
23+
package com.scanoss;
24+
25+
import lombok.extern.slf4j.Slf4j;
26+
import org.junit.Test;
27+
28+
import java.util.*;
29+
import java.util.concurrent.*;
30+
31+
import static org.junit.Assert.*;
32+
/**
33+
* Tests to validate thread safety of the path obfuscation feature in the Winnowing class.
34+
*/
35+
@Slf4j
36+
public class WinnowingConcurrencyTest {
37+
38+
/**
39+
* Test that concurrent obfuscation of paths works correctly without data loss or corruption.
40+
* This simulates multiple threads processing different files simultaneously.
41+
*/
42+
@Test
43+
public void testConcurrentObfuscation() throws InterruptedException, ExecutionException {
44+
int fileCount = 500; // More files to increase collision chances
45+
int iterations = 3; // Run multiple iterations to increase stress
46+
47+
48+
for (int iter = 0; iter < iterations; iter++) {
49+
log.info("Starting high-collision test iteration {}", iter);
50+
51+
Winnowing winnowing = Winnowing.builder().obfuscate(true).build();
52+
53+
ExecutorService executor = Executors.newFixedThreadPool(fileCount);
54+
55+
// Create a list of paths to obfuscate
56+
List<String> paths = new ArrayList<>();
57+
for (int i = 0; i < fileCount; i++) {
58+
paths.add("/path/to/file" + i + ".java");
59+
}
60+
61+
List<Future<String>> futures = new ArrayList<>(fileCount);
62+
63+
// Use a CyclicBarrier to ensure all threads start exactly together
64+
// Also adds an action that runs when all threads reach the barrier
65+
CyclicBarrier barrier = new CyclicBarrier(fileCount, () -> {
66+
log.info("All threads released simultaneously!");
67+
});
68+
69+
for (String path : paths) {
70+
futures.add(executor.submit(() -> {
71+
try {
72+
byte[] contents = ("sample content for " + path).getBytes();
73+
74+
// Wait at barrier until all threads are ready
75+
barrier.await();
76+
77+
// Access the same Winnowing instance concurrently
78+
return winnowing.wfpForContents(path, false, contents);
79+
80+
} catch (InterruptedException | BrokenBarrierException e) {
81+
throw new RuntimeException(e);
82+
}
83+
}));
84+
}
85+
86+
List<String> wfps = new ArrayList<>(fileCount);
87+
88+
// Wait for all tasks to complete and collect results
89+
for (Future<String> future : futures) {
90+
wfps.add(future.get()); // Add timeout to detect deadlocks
91+
}
92+
93+
executor.shutdown();
94+
assertTrue("Executor did not terminate properly",
95+
executor.awaitTermination(5, TimeUnit.SECONDS));
96+
97+
// Verify results
98+
log.info("Processed {} paths with {} unique results",
99+
paths.size(), winnowing.getObfuscationMapSize());
100+
assertEquals(paths.size(), winnowing.getObfuscationMapSize());
101+
}
102+
}
103+
}

0 commit comments

Comments
 (0)