|
30 | 30 | import org.apache.tika.metadata.Metadata; |
31 | 31 | import org.apache.tika.mime.MediaType; |
32 | 32 | import org.apache.tika.mime.MimeTypes; |
| 33 | +import org.iban4j.IbanUtil; |
33 | 34 | import org.w3c.dom.Document; |
34 | 35 | import org.xml.sax.EntityResolver; |
35 | 36 | import org.xml.sax.InputSource; |
|
63 | 64 | import java.security.MessageDigest; |
64 | 65 | import java.security.SecureRandom; |
65 | 66 | import java.time.Duration; |
| 67 | +import java.time.LocalDate; |
| 68 | +import java.time.YearMonth; |
| 69 | +import java.time.ZoneId; |
66 | 70 | import java.util.*; |
67 | 71 | import java.util.List; |
68 | 72 | import java.util.concurrent.*; |
69 | 73 | import java.util.concurrent.atomic.AtomicInteger; |
| 74 | +import java.util.regex.Matcher; |
70 | 75 | import java.util.regex.Pattern; |
71 | 76 | import java.util.zip.ZipEntry; |
72 | 77 | import java.util.zip.ZipFile; |
@@ -1422,4 +1427,91 @@ public static boolean isXSDSafe(String xsdFilePath) { |
1422 | 1427 | } |
1423 | 1428 | return isSafe; |
1424 | 1429 | } |
| 1430 | + |
| 1431 | + |
| 1432 | + /** |
| 1433 | + * Extract all sensitive information from a string provided.<br> |
| 1434 | + * This can be used to identify any sensitive information into a message expected to be written in a log and then replace every sensitive values by an obfuscated ones.<br> |
| 1435 | + * For the luxembourg national identification number, this method focus on detecting identifiers for a physical entity (people) and not a moral one (company).<br> |
| 1436 | + * I delegated the validation of the IBAN to a dedicated library to not "reinvent the wheel" and then introduce buggy validation myself. |
| 1437 | + * |
| 1438 | + * @param content String in which sensitive information must be searched. |
| 1439 | + * @return A map with the collection of identified sensitive information gathered by sensitive information type. If nothing is found then the map is empty. A type of sensitive information is only present if there is at least one item found. A set is used to not store duplicates occurrence of the same sensitive information. |
| 1440 | + * @throws Exception If any error occurs during the processing. |
| 1441 | + * @see "https://guichet.public.lu/en/citoyens/citoyennete/registre-national/identification/demande-numero-rnpp.html" |
| 1442 | + * @see "https://cnpd.public.lu/fr/decisions-avis/2009/identifiant-unique.html" |
| 1443 | + * @see "https://cnpd.public.lu/content/dam/cnpd/fr/decisions-avis/2009/identifiant-unique/48_2009.pdf" |
| 1444 | + * @see "https://en.wikipedia.org/wiki/International_Bank_Account_Number" |
| 1445 | + * @see "https://www.iban.com/structure" |
| 1446 | + * @see "https://github.com/arturmkrtchyan/iban4j" |
| 1447 | + * @see "https://cwe.mitre.org/data/definitions/532.html" |
| 1448 | + */ |
| 1449 | + public static Map<SensitiveInformationType, Set<String>> extractAllSensitiveInformation(String content) throws Exception { |
| 1450 | + Pattern nationalIdentifierRegex = Pattern.compile("([0-9]{13})"); |
| 1451 | + Pattern ibanNonHumanFormattedRegex = Pattern.compile("([A-Z]{2}[0-9]{2}[A-Z0-9]{11,30})", Pattern.CASE_INSENSITIVE); |
| 1452 | + Pattern ibanHumanFormattedRegex = Pattern.compile("([A-Z]{2}[0-9]{2}(?:\\s[A-Z0-9]{4}){2,7}\\s[A-Z0-9]{1,4})", Pattern.CASE_INSENSITIVE); |
| 1453 | + Map<SensitiveInformationType, Set<String>> data = new HashMap<>(); |
| 1454 | + data.put(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER, new HashSet<>()); |
| 1455 | + data.put(SensitiveInformationType.IBAN, new HashSet<>()); |
| 1456 | + |
| 1457 | + if (content != null && !content.isBlank()) { |
| 1458 | + /* Step 1: Search for LU national identifier */ |
| 1459 | + //A national identifier have the following structure: [BIRTHDATE_YEAR_YYYY][BIRTHDATE_MONTH_MM][BIRTHDATE_DAY_DD][FIVE_INTEGER] |
| 1460 | + //Define minimal and maximal birth year base on current year |
| 1461 | + //Assume people live less than 120 years |
| 1462 | + int maxBirthYear = LocalDate.now(ZoneId.of("Europe/Luxembourg")).getYear(); |
| 1463 | + int minBirthYear = maxBirthYear - 120; |
| 1464 | + Matcher matcher = nationalIdentifierRegex.matcher(content); |
| 1465 | + String nationalIdentierFull; |
| 1466 | + int nationalIdentierYear, nationalIdentierMonth, nationalIdentierDay; |
| 1467 | + while (matcher.find()) { |
| 1468 | + nationalIdentierFull = matcher.group(1); |
| 1469 | + //Check that the string is a valid national identifier and if yes then add it |
| 1470 | + nationalIdentierYear = Integer.parseInt(nationalIdentierFull.substring(0, 4)); |
| 1471 | + nationalIdentierMonth = Integer.parseInt(nationalIdentierFull.substring(4, 6)); |
| 1472 | + nationalIdentierDay = Integer.parseInt(nationalIdentierFull.substring(6, 8)); |
| 1473 | + if (nationalIdentierYear >= minBirthYear && nationalIdentierYear <= maxBirthYear) { |
| 1474 | + if (nationalIdentierMonth >= 1 && nationalIdentierMonth <= 12) { |
| 1475 | + if (YearMonth.of(nationalIdentierYear, nationalIdentierMonth).isValidDay(nationalIdentierDay)) { |
| 1476 | + data.get(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER).add(nationalIdentierFull); |
| 1477 | + } |
| 1478 | + } |
| 1479 | + } |
| 1480 | + } |
| 1481 | + |
| 1482 | + /* Step 2a: Search for IBAN that are non human formatted */ |
| 1483 | + matcher = ibanNonHumanFormattedRegex.matcher(content); |
| 1484 | + String iban, ibanUpperCased; |
| 1485 | + while (matcher.find()) { |
| 1486 | + iban = matcher.group(1); |
| 1487 | + ibanUpperCased = iban.toUpperCase(Locale.ROOT); |
| 1488 | + //Check that the string is a valid iban and if yes then add it |
| 1489 | + if (IbanUtil.isValid(ibanUpperCased)) { |
| 1490 | + data.get(SensitiveInformationType.IBAN).add(iban); |
| 1491 | + } |
| 1492 | + } |
| 1493 | + |
| 1494 | + /* Step 2b: Search for IBAN that are human formatted */ |
| 1495 | + matcher = ibanHumanFormattedRegex.matcher(content); |
| 1496 | + String ibanUpperCasedNoSpace; |
| 1497 | + while (matcher.find()) { |
| 1498 | + iban = matcher.group(1); |
| 1499 | + ibanUpperCasedNoSpace = iban.toUpperCase(Locale.ROOT).replace(" ", ""); |
| 1500 | + //Check that the string is a valid iban and if yes then add it |
| 1501 | + if (IbanUtil.isValid(ibanUpperCasedNoSpace)) { |
| 1502 | + data.get(SensitiveInformationType.IBAN).add(iban); |
| 1503 | + } |
| 1504 | + } |
| 1505 | + } |
| 1506 | + |
| 1507 | + //Cleanup if a set is empty |
| 1508 | + if (data.get(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER).isEmpty()) { |
| 1509 | + data.remove(SensitiveInformationType.LUXEMBOURG_NATIONAL_IDENTIFICATION_NUMBER); |
| 1510 | + } |
| 1511 | + if (data.get(SensitiveInformationType.IBAN).isEmpty()) { |
| 1512 | + data.remove(SensitiveInformationType.IBAN); |
| 1513 | + } |
| 1514 | + |
| 1515 | + return data; |
| 1516 | + } |
1425 | 1517 | } |
0 commit comments