Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 76 additions & 1 deletion src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,20 @@
import lombok.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.jetbrains.annotations.NotNull;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.CRC32C;
import java.util.zip.Checksum;

Expand All @@ -58,6 +62,14 @@ public class Winnowing {
private static final Tika tika = new Tika();
private static final MediaTypeRegistry mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry();

/**
* Shared counter for generating unique IDs.
* idGenerator is shared across all Winnowing instances,
* ensuring sequential and unique ID generation for path obfuscation
* regardless of how many instances of Winnowing are created.
*/
private static final AtomicLong idGenerator = new AtomicLong(0);

@Builder.Default
private Boolean skipSnippets = Boolean.FALSE; // Skip snippet generations
@Builder.Default
Expand All @@ -68,6 +80,31 @@ public class Winnowing {
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
@Builder.Default
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
@Builder.Default
private Map<String, String> obfuscationMap = new ConcurrentHashMap<>(); // Maps each generated obfuscated path back to the original path it replaced

/**
* Resolves the real file path for a given obfuscated path.
* This method is thread-safe and can be called concurrently from multiple threads.
* If the provided obfuscated path is not found in the obfuscation map, it is returned unchanged.
Copy link
Copy Markdown
Contributor

@agustingroh agustingroh May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this comment right?
should be "If the original path is not found in the obfuscation map, the obfuscated path is returned." ?

*
* @param obfuscatedPath the obfuscated path
* @return the real file path corresponding to the provided obfuscated path, or the provided path unchanged if no mapping exists
*/
public String deobfuscateFilePath(@NotNull String obfuscatedPath) {
    // Look up the real path that was recorded when this obfuscated name was generated.
    final String mappedPath = this.obfuscationMap.get(obfuscatedPath);
    if (mappedPath == null) {
        // Nothing recorded for this key: hand the caller's path back untouched.
        return obfuscatedPath;
    }
    return mappedPath;
}


/**
* Retrieves the size of the obfuscation map.
*
* @return the number of entries in the obfuscation map
*/
public int getObfuscationMapSize() {
    // One entry is stored per obfuscated path that has been generated.
    final int entryCount = this.obfuscationMap.size();
    return entryCount;
}

/**
* Calculate the WFP (fingerprint) for the given file
Expand Down Expand Up @@ -112,7 +149,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray();
String fileMD5 = DigestUtils.md5Hex(contents);
StringBuilder wfpBuilder = new StringBuilder();
// TODO add obfuscation of the filename here

if (obfuscate) {
filename = obfuscateFilePath(filename);
}

wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
return wfpBuilder.toString();
Expand Down Expand Up @@ -180,6 +221,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
return wfpBuilder.toString();
}

/**
* Obfuscates the given file path by replacing it with a generated unique identifier while
* retaining its original file extension.
* This method is thread-safe and can be called concurrently from multiple threads.
*
* @param originalPath the original file path to be obfuscated; must not be null
* @return the obfuscated file path with a unique identifier and the original file extension
*/
private String obfuscateFilePath(@NotNull String originalPath) {
    // Keep the extension (if any) so the obfuscated name still reveals the file type.
    final String suffix = extractExtension(originalPath);

    // Atomically claim the next value of the shared counter as this file's identifier.
    final long uniqueId = idGenerator.getAndIncrement();
    final String generatedName = uniqueId + suffix;

    // Remember the reverse mapping so the generated name can be resolved later.
    this.obfuscationMap.put(generatedName, originalPath);
    return generatedName;
}

/**
* Extracts file extension from the given path, including the leading dot.
*
* @param path the file path or name (must not be null)
* @return the file extension with leading dot (e.g., ".txt") or empty string if no extension
*/
private String extractExtension(@NotNull String path) {
    final String bareExtension;
    try {
        // FilenameUtils returns the extension without the dot, or "" when there is none.
        bareExtension = FilenameUtils.getExtension(path).trim();
    } catch (IllegalArgumentException e) {
        // Treat a rejected path as "no extension" rather than failing the caller.
        log.debug("Could not extract extension from filename '{}': {}",
                path, e.getMessage());
        return "";
    }

    if (bareExtension.isEmpty()) {
        return "";
    }
    return "." + bareExtension;
}

/**
* Determine if a file/contents should be skipped for snippet generation or not
* @param filename filename for the contents (optional)
Expand Down
48 changes: 48 additions & 0 deletions src/main/java/com/scanoss/utils/WinnowingUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
*/
package com.scanoss.utils;

import org.jetbrains.annotations.NotNull;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* SCANOSS Winnowing Utils Class
* <p>
Expand All @@ -47,4 +54,45 @@ public static char normalize(char c) {
return 0;
}
}


/**
 * Matches one {@code file=<md5>,<size>,<path>} line of a WFP block and captures the path.
 * "file=" is followed by any characters up to a comma, any characters up to a second comma,
 * and then everything to the end of the line is captured as group 1.
 * Compiled once because {@link Pattern} instances are immutable and safe to share.
 */
private static final Pattern WFP_FILE_LINE_PATTERN =
        Pattern.compile("^file=[^,]+,[^,]+,(.+)$", Pattern.MULTILINE);

/**
 * Extracts the first/primary file path from a WFP block.
 * This is a convenience method for single-file scenarios.
 * <p>
 * The block is scanned in order and the path of the first "file=" line is returned, so the
 * result is deterministic even when the block contains several file entries.
 *
 * @param wfpBlock the WFP block containing file entries
 * @return the first extracted file path, or null if none found
 */
public static String extractFilePathFromWFPBlock(@NotNull String wfpBlock) {
    Matcher matcher = WFP_FILE_LINE_PATTERN.matcher(wfpBlock);
    // Read the first match directly instead of going through a HashSet, whose iteration
    // order is unspecified and therefore cannot identify the "first" path.
    return matcher.find() ? matcher.group(1) : null;
}


/**
 * Extract all file paths from a multi-file WFP block using regex.
 * A multi-file WFP block contains multiple entries each starting with "file=".
 *
 * @param wfpBlock the WFP block containing multiple file entries
 * @return a Set of extracted file paths, empty if none found
 */
public static Set<String> extractFilePathsFromWFPBlock(@NotNull String wfpBlock) {
    Set<String> paths = new HashSet<>();
    Matcher matcher = WFP_FILE_LINE_PATTERN.matcher(wfpBlock);

    // Find all matches and add the captured paths to the result set
    while (matcher.find()) {
        String path = matcher.group(1);
        if (path != null && !path.isEmpty()) {
            paths.add(path);
        }
    }

    return paths;
}
}
69 changes: 69 additions & 0 deletions src/test/java/com/scanoss/TestWinnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@


import com.scanoss.exceptions.WinnowingException;
import com.scanoss.utils.WinnowingUtils;
import lombok.extern.slf4j.Slf4j;
import org.junit.After;
import org.junit.Before;
Expand Down Expand Up @@ -265,4 +266,72 @@ public void TestWinnowingFileFailures() {

log.info("Finished {} -->", methodName);
}

/**
 * Fingerprints a file that has an extension with obfuscation enabled and checks that the
 * path written to the WFP is obfuscated, keeps the extension, and resolves back to the input.
 */
@Test
public void TestWinnowingObfuscationFileWithExtension() {
    String methodName = new Object() {
    }.getClass().getEnclosingMethod().getName();
    log.info("<-- Starting {}", methodName);

    Winnowing winnowing = Winnowing.builder().obfuscate(true).build();

    String fileWithExtension = "testing/data/test-file.txt";

    String wfpWithExtension = winnowing.wfpForFile(fileWithExtension, fileWithExtension);
    assertNotNull("Expected a result from WFP with extension", wfpWithExtension);

    String obfuscatedPathWithExtension = WinnowingUtils.extractFilePathFromWFPBlock(wfpWithExtension);
    assertNotNull("Should have found an obfuscated path in WFP with extension", obfuscatedPathWithExtension);

    // deobfuscateFilePath returns unknown paths unchanged, so the round-trip check below would
    // also pass if obfuscation silently did nothing. Prove the WFP really hides the input path.
    assertEquals("WFP should not contain the original path when obfuscation is enabled",
            false, fileWithExtension.equals(obfuscatedPathWithExtension));
    assertEquals("Obfuscated path should keep the original file extension",
            true, obfuscatedPathWithExtension.endsWith(".txt"));

    String originalPathWithExtension = winnowing.deobfuscateFilePath(obfuscatedPathWithExtension);
    assertEquals("Original path should match input file with extension", fileWithExtension, originalPathWithExtension);

    log.info("Finished {} -->", methodName);
}

/**
 * Fingerprints a file that has no extension with obfuscation enabled and checks that the
 * path written to the WFP is obfuscated, gains no extension, and resolves back to the input.
 */
@Test
public void TestWinnowingObfuscationFileWithoutExtension() {
    String methodName = new Object() {
    }.getClass().getEnclosingMethod().getName();
    log.info("<-- Starting {}", methodName);

    Winnowing winnowing = Winnowing.builder().obfuscate(true).build();

    String fileWithoutExtension = "testing/data/nbproject";

    String wfpWithoutExtension = winnowing.wfpForFile(fileWithoutExtension, fileWithoutExtension);
    assertNotNull("Expected a result from WFP without extension", wfpWithoutExtension);

    String obfuscatedPathWithoutExtension = WinnowingUtils.extractFilePathFromWFPBlock(wfpWithoutExtension);
    assertNotNull("Should have found an obfuscated path in WFP without extension", obfuscatedPathWithoutExtension);

    // deobfuscateFilePath returns unknown paths unchanged, so the round-trip check below would
    // also pass if obfuscation silently did nothing. Prove the WFP really hides the input path.
    assertEquals("WFP should not contain the original path when obfuscation is enabled",
            false, fileWithoutExtension.equals(obfuscatedPathWithoutExtension));
    assertEquals("Obfuscated path should not gain an extension the original did not have",
            -1, obfuscatedPathWithoutExtension.indexOf('.'));

    String originalPathWithoutExtension = winnowing.deobfuscateFilePath(obfuscatedPathWithoutExtension);
    assertEquals("Original path should match input file without extension", fileWithoutExtension, originalPathWithoutExtension);

    log.info("Finished {} -->", methodName);
}

/**
 * Checks that an empty path, which has no entry in the obfuscation map, is returned unchanged.
 */
@Test
public void TestDeobfuscateFilePathEmpty() {
    String methodName = new Object() {
    }.getClass().getEnclosingMethod().getName();
    log.info("<-- Starting {}", methodName);

    Winnowing winnowing = Winnowing.builder().build();
    // The expected value is the empty string itself, not null: unmapped paths come back as given.
    assertEquals("Should return the empty path unchanged when given an empty obfuscated path", "", winnowing.deobfuscateFilePath(""));

    log.info("Finished {} -->", methodName);
}

/**
 * Checks that a path with no entry in the obfuscation map is returned exactly as supplied.
 */
@Test
public void TestDeobfuscateFilePathInvalid() {
    final String currentTest = new Object() {
    }.getClass().getEnclosingMethod().getName();
    log.info("<-- Starting {}", currentTest);

    final String unknownPath = "invalidPath";
    final Winnowing plainWinnowing = Winnowing.builder().build();

    // Nothing was ever obfuscated on this instance, so the lookup must fall through.
    final String resolvedPath = plainWinnowing.deobfuscateFilePath(unknownPath);
    assertEquals("Should return same path if not exist on the map", unknownPath, resolvedPath);

    log.info("Finished {} -->", currentTest);
}
}

98 changes: 98 additions & 0 deletions src/test/java/com/scanoss/WinnowingConcurrencyTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) 2025, SCANOSS
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package com.scanoss;

import lombok.extern.slf4j.Slf4j;
import org.junit.Test;

import java.util.*;
import java.util.concurrent.*;

import static org.junit.Assert.*;
/**
 * Tests to validate thread safety of the path obfuscation feature in the Winnowing class.
 */
@Slf4j
public class WinnowingConcurrencyTest {

    /**
     * Test that concurrent obfuscation of paths works correctly without data loss or corruption.
     * This simulates multiple threads processing different files simultaneously against one
     * shared Winnowing instance, then checks that every path got its own map entry and that
     * each returned WFP resolves back to the path it was generated for.
     *
     * @throws InterruptedException if the test thread is interrupted while waiting for workers
     * @throws ExecutionException   if any worker task fails
     */
    @Test
    public void testConcurrentObfuscation() throws InterruptedException, ExecutionException {
        int fileCount = 500; // More files to increase collision chances
        int iterations = 3;  // Run multiple iterations to increase stress

        for (int iter = 0; iter < iterations; iter++) {
            log.info("Starting high-collision test iteration {}", iter);

            Winnowing winnowing = Winnowing.builder().obfuscate(true).build();

            // Create a list of paths to obfuscate
            List<String> paths = new ArrayList<>(fileCount);
            for (int i = 0; i < fileCount; i++) {
                paths.add("/path/to/file" + i + ".java");
            }

            // One thread per task: every task blocks on the barrier, so a smaller pool would deadlock.
            ExecutorService executor = Executors.newFixedThreadPool(fileCount);
            try {
                // Use a CyclicBarrier to ensure all threads start exactly together.
                CyclicBarrier barrier = new CyclicBarrier(fileCount, () -> log.info("All threads released simultaneously!"));

                List<Future<String>> futures = new ArrayList<>(fileCount);
                for (String path : paths) {
                    futures.add(executor.submit(() -> {
                        try {
                            byte[] contents = ("sample content for " + path)
                                    .getBytes(java.nio.charset.StandardCharsets.UTF_8);

                            // Wait at barrier until all threads are ready
                            barrier.await();

                            // Access the same Winnowing instance concurrently
                            return winnowing.wfpForContents(path, false, contents);
                        } catch (InterruptedException e) {
                            // Restore the flag so the pool thread still observes the interruption.
                            Thread.currentThread().interrupt();
                            throw new RuntimeException(e);
                        } catch (BrokenBarrierException e) {
                            throw new RuntimeException(e);
                        }
                    }));
                }

                // Wait for all tasks to complete and check each result maps back to its own input.
                for (int i = 0; i < fileCount; i++) {
                    String wfp = futures.get(i).get();
                    assertNotNull("Expected a WFP for " + paths.get(i), wfp);

                    // The first WFP line is "file=<md5>,<size>,<path>"; the path is the last field.
                    int endOfFirstLine = wfp.indexOf('\n');
                    String fileLine = endOfFirstLine >= 0 ? wfp.substring(0, endOfFirstLine) : wfp;
                    String obfuscatedPath = fileLine.substring(fileLine.lastIndexOf(',') + 1);
                    assertEquals("Obfuscated path should resolve back to the path it was generated for",
                            paths.get(i), winnowing.deobfuscateFilePath(obfuscatedPath));
                }

                executor.shutdown();
                assertTrue("Executor did not terminate properly",
                        executor.awaitTermination(5, TimeUnit.SECONDS));
            } finally {
                // No-op after a clean shutdown; otherwise frees the pool threads when a task
                // or an assertion above has failed.
                executor.shutdownNow();
            }

            // Verify results
            log.info("Processed {} paths with {} unique results",
                    paths.size(), winnowing.getObfuscationMapSize());
            assertEquals(paths.size(), winnowing.getObfuscationMapSize());
        }
    }
}
Loading