Skip to content

Commit 56e5543

Browse files
committed
Revert "Add new rules for text sanitization (#510)"
This reverts commit 105bcea.
1 parent ece9c1f commit 56e5543

1 file changed

Lines changed: 14 additions & 58 deletions

File tree

  • java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/FilterConfig.java

Lines changed: 14 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -31,102 +31,58 @@ public class FilterConfig {
3131
private boolean filterTinyText = true;
3232
private boolean filterHiddenOCG = true;
3333
private boolean filterSensitiveData = false;
34-
private final List<SanitizationRule> filterRules = new ArrayList<>();
34+
private final List<SanitizationRule> filterRules;
3535

3636
/** Default rules */
37-
private static final List<SanitizationRule> DEFAULT_RULES = new ArrayList<>();
38-
static {
39-
DEFAULT_RULES.add(new SanitizationRule(
37+
private void initializeDefaultRules() {
38+
filterRules.add(new SanitizationRule(
4039
Pattern.compile("[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}"),
4140
"email@example.com"
4241
));
43-
DEFAULT_RULES.add(new SanitizationRule(
42+
filterRules.add(new SanitizationRule(
4443
Pattern.compile("[+]\\d+(?:-\\d+)+"),
4544
"+00-0000-0000"
4645
));
47-
DEFAULT_RULES.add(new SanitizationRule(
46+
filterRules.add(new SanitizationRule(
4847
Pattern.compile("[A-Z]{1,2}\\d{6,9}"),
4948
"AA0000000"
5049
));
51-
DEFAULT_RULES.add(new SanitizationRule(
50+
filterRules.add(new SanitizationRule(
5251
Pattern.compile("\\b\\d{4}-?\\d{4}-?\\d{4}-?\\d{4}\\b"),
5352
"0000-0000-0000-0000"
5453
));
55-
DEFAULT_RULES.add(new SanitizationRule(
54+
filterRules.add(new SanitizationRule(
5655
Pattern.compile("\\b\\d{10,18}\\b"),
5756
"0000000000000000"
5857
));
59-
DEFAULT_RULES.add(new SanitizationRule(
58+
filterRules.add(new SanitizationRule(
6059
Pattern.compile("\\b(?:\\d{1,3}\\.){3}\\d{1,3}\\b"),
6160
"0.0.0.0"
6261
));
63-
DEFAULT_RULES.add(new SanitizationRule(
62+
filterRules.add(new SanitizationRule(
6463
Pattern.compile("\\b([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}\\b"),
6564
"0.0.0.0::1"
6665
));
67-
DEFAULT_RULES.add(new SanitizationRule(
66+
filterRules.add(new SanitizationRule(
6867
Pattern.compile("\\b(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\\b"),
6968
"00:00:00:00:00:00"
7069
));
71-
DEFAULT_RULES.add(new SanitizationRule(
70+
filterRules.add(new SanitizationRule(
7271
Pattern.compile("\\b\\d{15}\\b"),
7372
"000000000000000"
7473
));
75-
DEFAULT_RULES.add(new SanitizationRule(
74+
filterRules.add(new SanitizationRule(
7675
Pattern.compile("https?://[A-Za-z0-9.-]+(:\\d+)?(/\\S*)?"),
7776
"https://example.com"
7877
));
79-
//TODO Confirm info about regex for Korean phone, card, resident numbers and etc.
80-
// Korean Resident Registration Number
81-
DEFAULT_RULES.add(new SanitizationRule(
82-
Pattern.compile("\\b\\d{6}-\\d{7}\\b"),
83-
"000000-0000000"
84-
));
85-
// Korean phone numbers
86-
DEFAULT_RULES.add(new SanitizationRule(
87-
Pattern.compile("\\b0\\d{1,2}-\\d{3,4}-\\d{4}\\b"),
88-
"010-0000-0000"
89-
));
90-
// Korean business registration number
91-
DEFAULT_RULES.add(new SanitizationRule(
92-
Pattern.compile("\\b\\d{3}-\\d{2}-\\d{5}\\b"),
93-
"000-00-00000"
94-
));
95-
// Korean bank account numbers
96-
// DEFAULT_RULES.add(new SanitizationRule(
97-
// Pattern.compile("\\b\\d{2,4}-\\d{2,3}-\\d{4,6}\\b"),
98-
// "000-000-000000"
99-
// ));
100-
//TODO Confirm info about regex for AWS (maybe create 2 separate rules for AKIA|ASIA)
101-
// AWS Access Key
102-
DEFAULT_RULES.add(new SanitizationRule(
103-
Pattern.compile("\\b(AKIA|ASIA)[0-9A-Z]{12,124}\\b"),
104-
"AKIA0000000000000000"
105-
));
106-
//TODO Confirm info about regex for GitHub (maybe create separate rules for ghp|ghu|gho|ghs|ghr)
107-
// GitHub Personal Access Token
108-
DEFAULT_RULES.add(new SanitizationRule(
109-
Pattern.compile("\\bgh[puors]_[A-Za-z0-9]{10,251}\\b"),
110-
"ghp_000000000000000000000000000000000000"
111-
));
112-
// GitHub Fine-grained Personal Access Token
113-
DEFAULT_RULES.add(new SanitizationRule(
114-
Pattern.compile("\\bgithub_pat_[A-Za-z0-9_]{10,243}\\b"),
115-
"github_pat_0000000000000000000000_00000000000000000000000000000000000000000000000000000000000"
116-
));
117-
// AWS Secret Key (Finds 40-character, base-64 strings that don't have any base 64 characters immediately before or after).
118-
// Has to be last rule
119-
DEFAULT_RULES.add(new SanitizationRule(
120-
Pattern.compile("(?<![A-Za-z0-9/+])[A-Za-z0-9/+]{40}(?![A-Za-z0-9/+])"),
121-
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
122-
));
12378
}
12479

12580
/**
12681
* Constructor initializing the configuration of filter.
12782
*/
12883
public FilterConfig() {
129-
filterRules.addAll(DEFAULT_RULES);
84+
this.filterRules = new ArrayList<>();
85+
initializeDefaultRules();
13086
}
13187

13288
/**

0 commit comments

Comments (0)