-
Notifications
You must be signed in to change notification settings - Fork 4
feat(SP-2487): Implement path obfuscation #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 3 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
b4423ae
chore(SP-2487): add utils `extractFilePathsFromWFPBlock()` and `extra…
isasmendiagus b8ec9e7
feat(SP-2487): implement path obfuscation on Winnowing class
isasmendiagus 3e77390
feat(SP-2487): implement path deobfuscation on Scanner class
isasmendiagus 9090987
feat(SP-2487): add obfuscate option to CLI
isasmendiagus 31f0855
chore(SP-2487): add `javadoc` and `lint` to Makefile
isasmendiagus File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,16 +28,20 @@ | |
| import lombok.*; | ||
| import lombok.extern.slf4j.Slf4j; | ||
| import org.apache.commons.codec.digest.DigestUtils; | ||
| import org.apache.commons.io.FilenameUtils; | ||
| import org.apache.tika.Tika; | ||
| import org.apache.tika.mime.MediaType; | ||
| import org.apache.tika.mime.MediaTypeRegistry; | ||
| import org.jetbrains.annotations.NotNull; | ||
|
|
||
| import java.io.ByteArrayInputStream; | ||
| import java.io.File; | ||
| import java.io.IOException; | ||
| import java.nio.charset.Charset; | ||
| import java.nio.file.Files; | ||
| import java.util.*; | ||
| import java.util.concurrent.ConcurrentHashMap; | ||
| import java.util.concurrent.atomic.AtomicLong; | ||
| import java.util.zip.CRC32C; | ||
| import java.util.zip.Checksum; | ||
|
|
||
|
|
@@ -58,6 +62,14 @@ public class Winnowing { | |
| private static final Tika tika = new Tika(); | ||
| private static final MediaTypeRegistry mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry(); | ||
|
|
||
| /** | ||
| * Shared counter for generating unique IDs. | ||
| * idGenerator is shared across all Winnowing instances, | ||
| * ensuring sequential and unique ID generation for path obfuscation | ||
| * regardless of how many instances of Winnowing are created. | ||
| */ | ||
| private static final AtomicLong idGenerator = new AtomicLong(0); | ||
|
|
||
| @Builder.Default | ||
| private Boolean skipSnippets = Boolean.FALSE; // Skip snippet generations | ||
| @Builder.Default | ||
|
|
@@ -68,6 +80,31 @@ public class Winnowing { | |
| private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection | ||
| @Builder.Default | ||
| private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation | ||
| @Builder.Default | ||
| private Map<String, String> obfuscationMap = new ConcurrentHashMap<>(); | ||
|
|
||
| /** | ||
| * Resolves the real file path for a given obfuscated path. | ||
| * This method is thread-safe and can be called concurrently from multiple threads. | ||
| * If the provided path is not found in the obfuscation map, the provided path is returned unchanged. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this comment right? |
||
| * | ||
| * @param obfuscatedPath the obfuscated path | ||
| * @return the real file path corresponding to the provided obfuscated path, or the provided path unchanged if no mapping exists | ||
| */ | ||
| public String deobfuscateFilePath(@NotNull String obfuscatedPath) { | ||
| String originalPath = obfuscationMap.get(obfuscatedPath); | ||
| return originalPath != null ? originalPath : obfuscatedPath; | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Retrieves the size of the obfuscation map. | ||
| * | ||
| * @return the number of entries in the obfuscation map | ||
| */ | ||
| public int getObfuscationMapSize() { | ||
| return obfuscationMap.size(); | ||
| } | ||
|
|
||
| /** | ||
| * Calculate the WFP (fingerprint) for the given file | ||
|
|
@@ -112,7 +149,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c | |
| char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray(); | ||
| String fileMD5 = DigestUtils.md5Hex(contents); | ||
| StringBuilder wfpBuilder = new StringBuilder(); | ||
| // TODO add obfuscation of the filename here | ||
|
|
||
| if (obfuscate) { | ||
| filename = obfuscateFilePath(filename); | ||
| } | ||
|
|
||
| wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename)); | ||
| if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) { | ||
| return wfpBuilder.toString(); | ||
|
|
@@ -180,6 +221,40 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c | |
| return wfpBuilder.toString(); | ||
| } | ||
|
|
||
| /** | ||
| * Obfuscates the given file path by replacing it with a generated unique identifier while | ||
| * retaining its original file extension. | ||
| * This method is thread-safe and can be called concurrently from multiple threads. | ||
| * | ||
| * @param originalPath the original file path to be obfuscated; must not be null | ||
| * @return the obfuscated file path with a unique identifier and the original file extension | ||
| */ | ||
| private String obfuscateFilePath(@NotNull String originalPath) { | ||
| final String extension = extractExtension(originalPath); | ||
|
|
||
| // Generate a unique identifier for the obfuscated file using a thread-safe approach | ||
| final String obfuscatedPath = idGenerator.getAndIncrement() + extension; | ||
| this.obfuscationMap.put(obfuscatedPath, originalPath); | ||
| return obfuscatedPath; | ||
| } | ||
|
|
||
| /** | ||
| * Extracts file extension from the given path, including the leading dot. | ||
| * | ||
| * @param path the file path or name (must not be null) | ||
| * @return the file extension with leading dot (e.g., ".txt") or empty string if no extension | ||
| */ | ||
| private String extractExtension(@NotNull String path) { | ||
| try { | ||
| String extractedExtension = FilenameUtils.getExtension(path).trim(); | ||
| return extractedExtension.isEmpty() ? "" : "." + extractedExtension; | ||
| } catch (IllegalArgumentException e) { | ||
| log.debug("Could not extract extension from filename '{}': {}", | ||
| path, e.getMessage()); | ||
| return ""; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Determine if a file/contents should be skipped for snippet generation or not | ||
| * @param filename filename for the contents (optional) | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we use a constant here?
Suggested:
private final static int ID
private final static int SCANNER_ID