Skip to content

Commit 17a5151

Browse files
committed
chore(SP-2487): add extractFilePathsFromWFPBlock()
1 parent 0ec8201 commit 17a5151

2 files changed

Lines changed: 122 additions & 0 deletions

File tree

src/main/java/com/scanoss/utils/WinnowingUtils.java

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,13 @@
2222
*/
2323
package com.scanoss.utils;
2424

25+
import org.jetbrains.annotations.NotNull;
26+
27+
import java.util.HashSet;
28+
import java.util.Set;
29+
import java.util.regex.Matcher;
30+
import java.util.regex.Pattern;
31+
2532
/**
2633
* SCANOSS Winnowing Utils Class
2734
* <p>
@@ -47,4 +54,31 @@ public static char normalize(char c) {
4754
return 0;
4855
}
4956
}
57+
58+
/**
59+
* Extract all file paths from a multi-file WFP block using regex.
60+
* A multi-file WFP block contains multiple entries each starting with "file=".
61+
*
62+
* @param wfpBlock the WFP block containing multiple file entries
63+
* @return a Set of extracted file paths, empty if none found
64+
*/
65+
public static Set<String> extractFilePathsFromWFPBlock(@NotNull String wfpBlock) {
66+
Set<String> paths = new HashSet<>();
67+
68+
// Pattern to match file=<md5>,<size>,<path> format and capture the path
69+
// This regex matches: "file=" followed by any characters until a comma,
70+
// then any characters until another comma, then captures everything after that comma until end of line
71+
Pattern pattern = Pattern.compile("^file=[^,]+,[^,]+,(.+)$", Pattern.MULTILINE);
72+
Matcher matcher = pattern.matcher(wfpBlock);
73+
74+
// Find all matches and add the captured paths to the result set
75+
while (matcher.find()) {
76+
String path = matcher.group(1);
77+
if (path != null && !path.isEmpty()) {
78+
paths.add(path);
79+
}
80+
}
81+
82+
return paths;
83+
}
5084
}
Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,88 @@
1+
2+
package com.scanoss.utils;
3+
4+
import org.junit.Test;
5+
6+
import java.util.Set;
7+
8+
import static org.junit.Assert.*;
9+
10+
public class WinnowingUtilsTest {
11+
12+
// Test file format: file=<md5>,<file_size>,<path>
13+
private static final String FILE1 = "file=90ebac4735d345fde0d05d939321d8fc,15878,/path/to/file1";
14+
private static final String FILE2 = "file=a7c31f87d23c42af732f57d39a9b05ac,24680,/path/to/file2";
15+
private static final String FILE3 = "file=e8585d8740d6664fda9e242a1d68b0f0,1815,/path/to/file3";
16+
private static final String FILE_SAME_PATH = "file=b1a89f4c5b0de974ad9846108c6d093a,9876,/path/to/file1";
17+
private static final String FILE_WITH_COMMA = "file=72a9e90d423b92dba36f78acc9bbecc7,12345,/path/with,comma";
18+
private static final String INVALID_FILE_NO_COMMAS = "file=invalid";
19+
private static final String INVALID_FILE_ONE_COMMA = "file=a7c31f87d23c42af732f57d39a9b05ac,24680";
20+
21+
// WFP hash entries
22+
private static final String WFP_ENTRY1 = "4=30777ca8,e9227657\n9=831bd2c5,701a2c74";
23+
private static final String WFP_ENTRY2 = "5=12345678,abcdefgh";
24+
25+
@Test
26+
public void testExtractFilePathsFromWFPBlock_SingleFile_ReturnsSinglePath() {
27+
String wfpBlock = FILE1 + "\n" + WFP_ENTRY1;
28+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
29+
assertEquals(1, result.size());
30+
assertTrue(result.contains("/path/to/file1"));
31+
}
32+
33+
@Test
34+
public void testExtractFilePathsFromWFPBlock_MultipleFiles_ReturnsAllPaths() {
35+
String wfpBlock = FILE1 + "\n" + WFP_ENTRY1 + "\n" + FILE2 + "\n" + WFP_ENTRY2 + "\n" + FILE3 + "\n";
36+
37+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
38+
assertEquals(3, result.size());
39+
assertTrue(result.contains("/path/to/file1"));
40+
assertTrue(result.contains("/path/to/file2"));
41+
assertTrue(result.contains("/path/to/file3"));
42+
}
43+
44+
@Test
45+
public void testExtractFilePathsFromWFPBlock_DuplicatePaths_ReturnsUniqueSet() {
46+
String wfpBlock = FILE1 + "\n" + WFP_ENTRY1 + "\n" + FILE_SAME_PATH + "\n" + WFP_ENTRY2 + "\n";
47+
48+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
49+
assertEquals(1, result.size());
50+
assertTrue(result.contains("/path/to/file1"));
51+
}
52+
53+
@Test
54+
public void testExtractFilePathsFromWFPBlock_EmptyString_ReturnsEmptySet() {
55+
String wfpBlock = "";
56+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
57+
assertTrue(result.isEmpty());
58+
}
59+
60+
@Test
61+
public void testExtractFilePathsFromWFPBlock_NoValidFileLines_ReturnsEmptySet() {
62+
String wfpBlock = "not_file=90ebac4735d345fde0d05d939321d8fc,15878,something\nanother=line\n";
63+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
64+
assertTrue(result.isEmpty());
65+
}
66+
67+
@Test
68+
public void testExtractFilePathsFromWFPBlock_WithPathsContainingCommas_ParsesCorrectly() {
69+
String wfpBlock = FILE_WITH_COMMA + "\n" + WFP_ENTRY1 + "\n" + FILE2 + "\n";
70+
71+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
72+
assertEquals(2, result.size());
73+
assertTrue(result.contains("/path/with,comma"));
74+
assertTrue(result.contains("/path/to/file2"));
75+
}
76+
77+
@Test
78+
public void testExtractFilePathsFromWFPBlock_ComplexCase_HandlesCorrectly() {
79+
String wfpBlock = "not_a_file=something\n" + FILE1 + "\n" + WFP_ENTRY1 + "\n" + INVALID_FILE_NO_COMMAS + "\n"
80+
+ FILE2 + "\n" + WFP_ENTRY2 + "\n" + "random line\n" + FILE3 + "\n";
81+
82+
Set<String> result = WinnowingUtils.extractFilePathsFromWFPBlock(wfpBlock);
83+
assertEquals(3, result.size());
84+
assertTrue(result.contains("/path/to/file1"));
85+
assertTrue(result.contains("/path/to/file2"));
86+
assertTrue(result.contains("/path/to/file3"));
87+
}
88+
}

0 commit comments

Comments
 (0)