Skip to content

Commit bc81bfb

Browse files
committed
add new method to extract sensitive information.
1 parent 44b4807 commit bc81bfb

6 files changed

Lines changed: 195 additions & 3 deletions

File tree

.idea/inspectionProfiles/Project_Default.xml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/runConfigurations/Run_Unt_Tests.xml renamed to .idea/runConfigurations/Run_All_Unit_Tests.xml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@
7474
<artifactId>java-jwt</artifactId>
7575
<version>4.5.0</version>
7676
</dependency>
77+
<dependency>
78+
<groupId>org.iban4j</groupId>
79+
<artifactId>iban4j</artifactId>
80+
<version>3.2.11-RELEASE</version>
81+
</dependency>
7782
<!-- TEST ONLY PURPOSE -->
7883
<dependency>
7984
<groupId>org.junit.jupiter</groupId>

src/main/java/eu/righettod/SecurityUtils.java

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.tika.metadata.Metadata;
3131
import org.apache.tika.mime.MediaType;
3232
import org.apache.tika.mime.MimeTypes;
33+
import org.iban4j.IbanUtil;
3334
import org.w3c.dom.Document;
3435
import org.xml.sax.EntityResolver;
3536
import org.xml.sax.InputSource;
@@ -63,10 +64,14 @@
6364
import java.security.MessageDigest;
6465
import java.security.SecureRandom;
6566
import java.time.Duration;
67+
import java.time.LocalDate;
68+
import java.time.YearMonth;
69+
import java.time.ZoneId;
6670
import java.util.*;
6771
import java.util.List;
6872
import java.util.concurrent.*;
6973
import java.util.concurrent.atomic.AtomicInteger;
74+
import java.util.regex.Matcher;
7075
import java.util.regex.Pattern;
7176
import java.util.zip.ZipEntry;
7277
import java.util.zip.ZipFile;
@@ -1422,4 +1427,91 @@ public static boolean isXSDSafe(String xsdFilePath) {
14221427
}
14231428
return isSafe;
14241429
}
1430+
1431+
1432+
/**
1433+
* Extract all sensitive information from a string provided.<br>
1434+
* This can be used to identify any sensitive information into a message expected to be written in a log and then replace every sensitive values by an obfuscated ones.<br>
1435+
* For the luxembourg national identification number, this method focus on detecting identifiers for a physical entity (people) and not a moral one (company).<br>
1436+
* I delegated the validation of the IBAN to a dedicated library to not "reinvent the wheel" and then introduce buggy validation myself.
1437+
*
1438+
* @param content String in which sensitive information must be searched.
1439+
* @return A map with the collection of identified sensitive information gathered by sensitive information type. If nothing is found then the map is empty. A type of sensitive information is only present if there is at least one item found. A set is used to not store duplicates occurrence of the same sensitive information.
1440+
* @throws Exception If any error occurs during the processing.
1441+
* @see "https://guichet.public.lu/en/citoyens/citoyennete/registre-national/identification/demande-numero-rnpp.html"
1442+
* @see "https://cnpd.public.lu/fr/decisions-avis/2009/identifiant-unique.html"
1443+
* @see "https://cnpd.public.lu/content/dam/cnpd/fr/decisions-avis/2009/identifiant-unique/48_2009.pdf"
1444+
* @see "https://en.wikipedia.org/wiki/International_Bank_Account_Number"
1445+
* @see "https://www.iban.com/structure"
1446+
* @see "https://github.com/arturmkrtchyan/iban4j"
1447+
* @see "https://cwe.mitre.org/data/definitions/532.html"
1448+
*/
1449+
public static Map<SensitiveInformationType, Set<String>> extractAllSensitiveInformation(String content) throws Exception {
1450+
Pattern nationalIdentifierRegex = Pattern.compile("([0-9]{13})");
1451+
Pattern ibanNonHumanFormattedRegex = Pattern.compile("([A-Z]{2}[0-9]{2}[A-Z0-9]{11,30})", Pattern.CASE_INSENSITIVE);
1452+
Pattern ibanHumanFormattedRegex = Pattern.compile("([A-Z]{2}[0-9]{2}(?:\\s[A-Z0-9]{4}){2,7}\\s[A-Z0-9]{1,4})", Pattern.CASE_INSENSITIVE);
1453+
Map<SensitiveInformationType, Set<String>> data = new HashMap<>();
1454+
data.put(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER, new HashSet<>());
1455+
data.put(SensitiveInformationType.IBAN, new HashSet<>());
1456+
1457+
if (content != null && !content.isBlank()) {
1458+
/* Step 1: Search for LU national identifier */
1459+
//A national identifier have the following structure: [BIRTHDATE_YEAR_YYYY][BIRTHDATE_MONTH_MM][BIRTHDATE_DAY_DD][FIVE_INTEGER]
1460+
//Define minimal and maximal birth year base on current year
1461+
//Assume people live less than 120 years
1462+
int maxBirthYear = LocalDate.now(ZoneId.of("Europe/Luxembourg")).getYear();
1463+
int minBirthYear = maxBirthYear - 120;
1464+
Matcher matcher = nationalIdentifierRegex.matcher(content);
1465+
String nationalIdentierFull;
1466+
int nationalIdentierYear, nationalIdentierMonth, nationalIdentierDay;
1467+
while (matcher.find()) {
1468+
nationalIdentierFull = matcher.group(1);
1469+
//Check that the string is a valid national identifier and if yes then add it
1470+
nationalIdentierYear = Integer.parseInt(nationalIdentierFull.substring(0, 4));
1471+
nationalIdentierMonth = Integer.parseInt(nationalIdentierFull.substring(4, 6));
1472+
nationalIdentierDay = Integer.parseInt(nationalIdentierFull.substring(6, 8));
1473+
if (nationalIdentierYear >= minBirthYear && nationalIdentierYear <= maxBirthYear) {
1474+
if (nationalIdentierMonth >= 1 && nationalIdentierMonth <= 12) {
1475+
if (YearMonth.of(nationalIdentierYear, nationalIdentierMonth).isValidDay(nationalIdentierDay)) {
1476+
data.get(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER).add(nationalIdentierFull);
1477+
}
1478+
}
1479+
}
1480+
}
1481+
1482+
/* Step 2a: Search for IBAN that are non human formatted */
1483+
matcher = ibanNonHumanFormattedRegex.matcher(content);
1484+
String iban, ibanUpperCased;
1485+
while (matcher.find()) {
1486+
iban = matcher.group(1);
1487+
ibanUpperCased = iban.toUpperCase(Locale.ROOT);
1488+
//Check that the string is a valid iban and if yes then add it
1489+
if (IbanUtil.isValid(ibanUpperCased)) {
1490+
data.get(SensitiveInformationType.IBAN).add(iban);
1491+
}
1492+
}
1493+
1494+
/* Step 2b: Search for IBAN that are human formatted */
1495+
matcher = ibanHumanFormattedRegex.matcher(content);
1496+
String ibanUpperCasedNoSpace;
1497+
while (matcher.find()) {
1498+
iban = matcher.group(1);
1499+
ibanUpperCasedNoSpace = iban.toUpperCase(Locale.ROOT).replace(" ", "");
1500+
//Check that the string is a valid iban and if yes then add it
1501+
if (IbanUtil.isValid(ibanUpperCasedNoSpace)) {
1502+
data.get(SensitiveInformationType.IBAN).add(iban);
1503+
}
1504+
}
1505+
}
1506+
1507+
//Cleanup if a set is empty
1508+
if (data.get(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER).isEmpty()) {
1509+
data.remove(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER);
1510+
}
1511+
if (data.get(SensitiveInformationType.IBAN).isEmpty()) {
1512+
data.remove(SensitiveInformationType.IBAN);
1513+
}
1514+
1515+
return data;
1516+
}
14251517
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package eu.righettod;
2+
3+
/**
 * Enumeration used by the method <code>SecurityUtils.extractAllSensitiveInformation()</code> to identify types of information found.
 */
public enum SensitiveInformationType {
    /**
     * National identifier used by government entities in Luxembourg to uniquely identify citizens.
     *
     * @see "https://guichet.public.lu/en/citoyens/citoyennete/registre-national/identification/demande-numero-rnpp.html"
     * @see "https://cnpd.public.lu/fr/decisions-avis/2009/identifiant-unique.html"
     */
    LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER,

    /**
     * International Bank Account Number.
     *
     * @see "https://en.wikipedia.org/wiki/International_Bank_Account_Number"
     */
    IBAN

}

src/test/java/eu/righettod/TestSecurityUtils.java

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -663,11 +663,83 @@ public void isXSDSafe() {
663663
String testFile = getTestFilePath(f);
664664
assertFalse(SecurityUtils.isXSDSafe(testFile), String.format(TEMPLATE_MESSAGE_FALSE_NEGATIVE_FOR_FILE, testFile));
665665
});
666-
List<String> safeFileList = Arrays.asList("test-xsd-no-external-schema.xsd");
666+
List<String> safeFileList = List.of("test-xsd-no-external-schema.xsd");
667667
safeFileList.forEach(f -> {
668668
String testFile = getTestFilePath(f);
669669
assertTrue(SecurityUtils.isXSDSafe(testFile), String.format(TEMPLATE_MESSAGE_FALSE_POSITIVE_FOR_FILE, testFile));
670670
});
671671
}
672+
673+
@Test
674+
public void extractAllSensitiveInformation() {
675+
/* Test extraction of a single type of sensitive information */
676+
//Case format is the following
677+
//[0]: The string containing one or several sensitive information
678+
//[1]: The collection of sensitive information separated by a ;
679+
//[2]: The type of sensitive information
680+
String luxNationalIdTypeName = SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER.name();
681+
String ibanTypeName = SensitiveInformationType.IBAN.name();
682+
final List<String[]> casesWithSensitiveData = new ArrayList<>();
683+
casesWithSensitiveData.add(new String[]{"I expected to log 1955010112345", "1955010112345", luxNationalIdTypeName});
684+
casesWithSensitiveData.add(new String[]{"I expected to log 1955010112345 from 1974052254321", "1955010112345;1974052254321", luxNationalIdTypeName});
685+
casesWithSensitiveData.add(new String[]{"I expected to\nlog 1955010112345\nfrom\t1974052254321", "1955010112345;1974052254321", luxNationalIdTypeName});
686+
casesWithSensitiveData.add(new String[]{"I expected to log BE71096123456769", "BE71096123456769", ibanTypeName});
687+
casesWithSensitiveData.add(new String[]{"I expected to log BE71096123456769 with EG800002000156789012345180002", "BE71096123456769;EG800002000156789012345180002", ibanTypeName});
688+
casesWithSensitiveData.add(new String[]{"I expected to\nlog BE71096123456769\nwith\tEG800002000156789012345180002", "BE71096123456769;EG800002000156789012345180002", ibanTypeName});
689+
casesWithSensitiveData.add(new String[]{"I expected to log DE89 3704 0044 0532 0130 00", "DE89 3704 0044 0532 0130 00", ibanTypeName});
690+
casesWithSensitiveData.add(new String[]{"I expected to log DE89 3704 0044 0532 0130 00 with FR14 2004 1010 0505 0001 3M02 606", "DE89 3704 0044 0532 0130 00;FR14 2004 1010 0505 0001 3M02 606", ibanTypeName});
691+
casesWithSensitiveData.add(new String[]{"I expected to\nlog DE89 3704 0044 0532 0130 00 with\tFR14 2004 1010 0505 0001 3M02 606", "DE89 3704 0044 0532 0130 00;FR14 2004 1010 0505 0001 3M02 606", ibanTypeName});
692+
casesWithSensitiveData.forEach(caseData -> {
693+
try {
694+
String content = caseData[0];
695+
List<String> expectedInfos = Arrays.stream(caseData[1].split(";")).toList();
696+
SensitiveInformationType expectedInfosType = SensitiveInformationType.valueOf(caseData[2]);
697+
Map<SensitiveInformationType, Set<String>> data = SecurityUtils.extractAllSensitiveInformation(content);
698+
assertEquals(1, data.size(), String.format("[%s] The number of type of identified information is incorrect!", caseData[2]));
699+
assertEquals(expectedInfos.size(), data.get(expectedInfosType).size(), String.format("[%s] The number of identified information is incorrect!", caseData[2]));
700+
assertTrue(expectedInfos.containsAll(data.get(expectedInfosType)), String.format("[%s] The identified information is incorrect!", caseData[2]));
701+
} catch (Exception e) {
702+
throw new RuntimeException(e);
703+
}
704+
});
705+
706+
/* Test extraction of all the types of sensitive information */
707+
SensitiveInformationType luxNationalIdType = SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER;
708+
SensitiveInformationType ibanType = SensitiveInformationType.IBAN;
709+
String content = "I expected\nto log 1955010112345 and 1974052254321\tfrom DE89 3704 0044 0532 0130 00\nwith BE71096123456769";
710+
Set<String> nationalIdentifierExpected = Set.of("1955010112345", "1974052254321");
711+
Set<String> ibanExpected = Set.of("DE89 3704 0044 0532 0130 00", "BE71096123456769");
712+
try {
713+
Map<SensitiveInformationType, Set<String>> data = SecurityUtils.extractAllSensitiveInformation(content);
714+
assertEquals(2, data.size(), "[COMBINED] The number of type of identified information is incorrect!");
715+
assertEquals(2, data.get(luxNationalIdType).size(), String.format("[COMBINED][%s] The number of identified information is incorrect!", luxNationalIdType));
716+
assertEquals(2, data.get(ibanType).size(), String.format("[COMBINED][%s] The number of identified information is incorrect!", ibanType));
717+
assertTrue(nationalIdentifierExpected.containsAll(data.get(luxNationalIdType)), String.format("[%s] The identified information is incorrect!", luxNationalIdType));
718+
assertTrue(ibanExpected.containsAll(data.get(ibanType)), String.format("[%s] The identified information is incorrect!", ibanType));
719+
} catch (Exception e) {
720+
fail(e);
721+
}
722+
723+
/* Test extraction of sensitive information from content without any sensitive information */
724+
//Case format is the the direct string content
725+
final List<String> casesWithoutSensitiveData = new ArrayList<>();
726+
casesWithoutSensitiveData.add("Hello World");
727+
casesWithoutSensitiveData.add("Hello World from 1111111111111");
728+
casesWithoutSensitiveData.add("Hello World from 3000010112345");
729+
casesWithoutSensitiveData.add("Hello World from 1800010112345");
730+
casesWithoutSensitiveData.add("Hello\nWorld from\t1980130112345");
731+
casesWithoutSensitiveData.add("Hello World from 1980023112345");
732+
casesWithoutSensitiveData.add("Hello World from DE89 3704 0044 0532 0130 AA");
733+
casesWithoutSensitiveData.add("Hello World from SV43ACAT000000000000001231XX");
734+
casesWithoutSensitiveData.forEach(caseData -> {
735+
try {
736+
Map<SensitiveInformationType, Set<String>> data = SecurityUtils.extractAllSensitiveInformation(caseData);
737+
assertTrue(data.isEmpty(), "The number of type of identified information is incorrect!");
738+
} catch (Exception e) {
739+
throw new RuntimeException(e);
740+
}
741+
});
742+
743+
}
672744
}
673745

0 commit comments

Comments
 (0)