Skip to content

Commit 47e1a34

Browse files
committed
Revise the watermark removal algorithm to be both generalizable and highly specific
1 parent aa015ac commit 47e1a34

15 files changed

Lines changed: 2052 additions & 27 deletions

src/main/java/com/applitools/imagetester/ImageTester.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -705,11 +705,12 @@ private static Options getOptions() {
705705

706706
options.addOption(Option.builder("rwauto")
707707
.longOpt("removeWatermarkAuto")
708-
.desc("Auto-detect a vector watermark template by intersecting path " +
709-
"shapes across all PDFs in the input directory, then strip the " +
710-
"template from each. Requires at least 2 input PDFs. With -rwo, " +
711-
"writes cleaned PDFs to that directory and exits. Without -rwo, " +
712-
"cleans to a temp directory and uploads cleaned PDFs to Applitools.")
708+
.desc("Auto-detect a vector watermark shared across all PDFs in the input " +
709+
"directory by its fill color, then strip only paths in that color " +
710+
"from each PDF, leaving all other content intact. Requires at least " +
711+
"2 input PDFs from the same source. With -rwo, writes cleaned PDFs to " +
712+
"that directory and exits. Without -rwo, cleans to a temp directory " +
713+
"and uploads cleaned PDFs to Applitools.")
713714
.hasArg(false)
714715
.build());
715716

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package com.applitools.imagetester.lib;
2+
3+
import java.io.IOException;
4+
import java.io.OutputStream;
5+
import java.util.ArrayDeque;
6+
import java.util.ArrayList;
7+
import java.util.Arrays;
8+
import java.util.Deque;
9+
import java.util.HashSet;
10+
import java.util.List;
11+
import java.util.Set;
12+
13+
import org.apache.pdfbox.contentstream.operator.Operator;
14+
import org.apache.pdfbox.cos.COSName;
15+
import org.apache.pdfbox.cos.COSStream;
16+
import org.apache.pdfbox.pdfparser.PDFStreamParser;
17+
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
18+
import org.apache.pdfbox.pdmodel.PDDocument;
19+
import org.apache.pdfbox.pdmodel.PDPage;
20+
21+
/**
22+
* Removes filled vector paths drawn in a target non-stroking (fill) color,
23+
* leaving everything else — body text, images, strokes, clip paths, and fills
24+
* of any other color — untouched.
25+
*
26+
* Watermarks of the kind this targets are stamped as filled outlines in a
27+
* single muted color distinct from the document's real content, so keying
28+
* removal on fill color strips the watermark without touching shared branding.
29+
*/
30+
public final class ColorPathStripper {
31+
32+
private static final Set<String> PATH_CONSTRUCTION_OPS = new HashSet<>(Arrays.asList(
33+
"m", "l", "c", "v", "y", "h", "re", "W", "W*"));
34+
private static final Set<String> FILL_PAINT_OPS = new HashSet<>(Arrays.asList(
35+
"f", "F", "f*", "b", "b*", "B", "B*"));
36+
private static final Set<String> NON_FILL_PAINT_OPS = new HashSet<>(Arrays.asList(
37+
"S", "s", "n"));
38+
39+
private ColorPathStripper() {
40+
}
41+
42+
public static void removeFromAllPages(PDDocument doc, float[] targetRgb, float tolerance) throws IOException {
43+
if (targetRgb == null) return;
44+
for (int i = 0; i < doc.getNumberOfPages(); i++) {
45+
PDPage page = doc.getPage(i);
46+
PDFStreamParser parser = new PDFStreamParser(page);
47+
parser.parse();
48+
List<Object> cleaned = strip(parser.getTokens(), targetRgb, tolerance);
49+
50+
COSStream newStream = new COSStream();
51+
try (OutputStream out = newStream.createOutputStream()) {
52+
new ContentStreamWriter(out).writeTokens(cleaned);
53+
}
54+
page.getCOSObject().setItem(COSName.CONTENTS, newStream);
55+
}
56+
}
57+
58+
public static List<Object> strip(List<Object> tokens, float[] targetRgb, float tolerance) {
59+
List<Object> result = new ArrayList<>();
60+
List<Object> argBuffer = new ArrayList<>();
61+
List<Object> currentPath = new ArrayList<>();
62+
boolean inPath = false;
63+
64+
Deque<float[]> stateStack = new ArrayDeque<>();
65+
float[] fill = {0f, 0f, 0f};
66+
67+
for (Object t : tokens) {
68+
if (!(t instanceof Operator)) {
69+
argBuffer.add(t);
70+
continue;
71+
}
72+
String op = ((Operator) t).getName();
73+
74+
if (PATH_CONSTRUCTION_OPS.contains(op)) {
75+
currentPath.addAll(argBuffer);
76+
currentPath.add(t);
77+
argBuffer.clear();
78+
inPath = true;
79+
} else if (inPath && (FILL_PAINT_OPS.contains(op) || NON_FILL_PAINT_OPS.contains(op))) {
80+
currentPath.addAll(argBuffer);
81+
currentPath.add(t);
82+
argBuffer.clear();
83+
boolean drop = FILL_PAINT_OPS.contains(op) && colorMatches(fill, targetRgb, tolerance);
84+
if (!drop) result.addAll(currentPath);
85+
currentPath.clear();
86+
inPath = false;
87+
} else {
88+
if (inPath) {
89+
result.addAll(currentPath);
90+
currentPath.clear();
91+
inPath = false;
92+
}
93+
fill = applyColorState(op, argBuffer, fill, stateStack);
94+
result.addAll(argBuffer);
95+
result.add(t);
96+
argBuffer.clear();
97+
}
98+
}
99+
result.addAll(currentPath);
100+
result.addAll(argBuffer);
101+
return result;
102+
}
103+
104+
private static float[] applyColorState(String op, List<Object> args, float[] fill, Deque<float[]> stack) {
105+
switch (op) {
106+
case "q":
107+
stack.push(fill.clone());
108+
return fill;
109+
case "Q":
110+
return stack.isEmpty() ? fill : stack.pop();
111+
default:
112+
return DeviceColor.fromOperator(op, args, fill);
113+
}
114+
}
115+
116+
private static boolean colorMatches(float[] fill, float[] target, float tol) {
117+
if (target == null) return false;
118+
for (int i = 0; i < 3; i++) {
119+
if (Math.abs(fill[i] - target[i]) > tol) return false;
120+
}
121+
return true;
122+
}
123+
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package com.applitools.imagetester.lib;
2+
3+
import java.util.List;
4+
5+
import org.apache.pdfbox.cos.COSNumber;
6+
7+
/**
8+
* Interprets PDF non-stroking color operators (rg/g/k and sc/scn over device
9+
* color spaces) into normalized RGB. Pattern and named-colorspace operands that
10+
* can't be read as plain device components leave the color unchanged.
11+
*/
12+
final class DeviceColor {
13+
14+
private DeviceColor() {
15+
}
16+
17+
/** Returns the RGB a color operator sets, or {@code current} if the operator isn't a color op. */
18+
static float[] fromOperator(String op, List<Object> args, float[] current) {
19+
switch (op) {
20+
case "rg":
21+
return rgbOrCurrent(numbers(args, 3), current);
22+
case "g":
23+
float[] g = numbers(args, 1);
24+
return g == null ? current : new float[] {g[0], g[0], g[0]};
25+
case "k":
26+
return cmykToRgb(numbers(args, 4), current);
27+
case "sc":
28+
case "scn":
29+
return fromComponents(args, current);
30+
default:
31+
return current;
32+
}
33+
}
34+
35+
private static float[] fromComponents(List<Object> args, float[] current) {
36+
int n = 0;
37+
for (int i = args.size() - 1; i >= 0 && args.get(i) instanceof COSNumber; i--) n++;
38+
switch (n) {
39+
case 1:
40+
float v = ((COSNumber) args.get(args.size() - 1)).floatValue();
41+
return new float[] {v, v, v};
42+
case 3:
43+
return rgbOrCurrent(numbers(args, 3), current);
44+
case 4:
45+
return cmykToRgb(numbers(args, 4), current);
46+
default:
47+
return current;
48+
}
49+
}
50+
51+
private static float[] rgbOrCurrent(float[] vals, float[] current) {
52+
return vals == null ? current : vals;
53+
}
54+
55+
private static float[] cmykToRgb(float[] cmyk, float[] current) {
56+
if (cmyk == null) return current;
57+
return new float[] {
58+
(1f - cmyk[0]) * (1f - cmyk[3]),
59+
(1f - cmyk[1]) * (1f - cmyk[3]),
60+
(1f - cmyk[2]) * (1f - cmyk[3])
61+
};
62+
}
63+
64+
/** Returns the last {@code count} numeric operands, or null if too few are present. */
65+
private static float[] numbers(List<Object> args, int count) {
66+
if (args.size() < count) return null;
67+
float[] out = new float[count];
68+
int start = args.size() - count;
69+
for (int i = 0; i < count; i++) {
70+
Object o = args.get(start + i);
71+
if (!(o instanceof COSNumber)) return null;
72+
out[i] = ((COSNumber) o).floatValue();
73+
}
74+
return out;
75+
}
76+
}

src/main/java/com/applitools/imagetester/lib/PdfVectorWatermarkAutoMode.java

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import java.util.List;
1010
import java.util.Locale;
1111
import java.util.Map;
12-
import java.util.Set;
1312

1413
import org.apache.pdfbox.pdmodel.PDDocument;
1514
import org.apache.pdfbox.pdmodel.PDPage;
@@ -28,6 +27,9 @@
2827
*/
2928
public final class PdfVectorWatermarkAutoMode {
3029

30+
/** Per-channel RGB tolerance when matching a path's fill color to the detected watermark color. */
31+
private static final float WATERMARK_COLOR_TOLERANCE = 0.05f;
32+
3133
static final String SINGLE_PDF_NOTICE =
3234
"\nHeads up: you're testing one PDF on its own." +
3335
"\n" +
@@ -80,22 +82,24 @@ public static int run(File inputRoot, File outDir, String optionalTextHint, Logg
8082
List<File> pdfs = entry.getValue();
8183
String groupLabel = groupLabel(inputRoot, groupDir);
8284

83-
Set<String> pathFingerprint;
84-
Set<String> varyingOpSeqs;
85-
Set<String> varyingForms;
85+
float[] watermarkColor;
8686
try {
87-
pathFingerprint = PathFingerprinter.intersection(pdfs);
88-
varyingOpSeqs = OpSequenceVarianceFinder.findVarying(pdfs);
89-
varyingForms = VaryingFormFinder.findVarying(pdfs);
87+
watermarkColor = WatermarkColorDetector.detect(pdfs);
9088
} catch (IOException e) {
9189
logger.printMessage(String.format(
92-
"Failed to compute watermark fingerprint for %s: %s",
93-
groupLabel, e.getMessage()));
90+
"Failed to detect watermark for %s: %s", groupLabel, e.getMessage()));
9491
continue;
9592
}
96-
logger.printMessage(String.format(
97-
"[%s] Watermark fingerprint: %d shared shape(s), %d varying op-seq(s), %d varying form(s) across %d PDF(s)",
98-
groupLabel, pathFingerprint.size(), varyingOpSeqs.size(), varyingForms.size(), pdfs.size()));
93+
if (watermarkColor == null) {
94+
logger.printMessage(String.format(
95+
"[%s] No vector watermark detected across %d PDF(s); copying unchanged",
96+
groupLabel, pdfs.size()));
97+
} else {
98+
logger.printMessage(String.format(
99+
"[%s] Watermark color rgb(%d, %d, %d) detected across %d PDF(s)",
100+
groupLabel, to255(watermarkColor[0]), to255(watermarkColor[1]),
101+
to255(watermarkColor[2]), pdfs.size()));
102+
}
99103

100104
for (File pdf : pdfs) {
101105
File output = resolveOutput(inputRoot, outDir, pdf);
@@ -104,7 +108,7 @@ public static int run(File inputRoot, File outDir, String optionalTextHint, Logg
104108
continue;
105109
}
106110
try {
107-
cleanOnePdf(pdf, pathFingerprint, varyingOpSeqs, varyingForms, optionalTextHint, output);
111+
cleanOnePdf(pdf, watermarkColor, optionalTextHint, output);
108112
processed++;
109113
} catch (IOException e) {
110114
logger.printMessage("Failed to clean " + pdf.getAbsolutePath() + ": " + e.getMessage());
@@ -116,8 +120,7 @@ public static int run(File inputRoot, File outDir, String optionalTextHint, Logg
116120
return 0;
117121
}
118122

119-
private static void cleanOnePdf(File input, Set<String> pathFingerprint, Set<String> varyingOpSeqs,
120-
Set<String> varyingForms, String optionalTextHint, File output)
123+
private static void cleanOnePdf(File input, float[] watermarkColor, String optionalTextHint, File output)
121124
throws IOException {
122125
boolean hasTextHint = optionalTextHint != null && !optionalTextHint.trim().isEmpty();
123126
try (PDDocument source = PDDocument.load(input);
@@ -129,12 +132,15 @@ private static void cleanOnePdf(File input, Set<String> pathFingerprint, Set<Str
129132
}
130133
cleaned.addPage(page);
131134
}
132-
VectorWatermarkRemover.removeFromAllPages(cleaned, pathFingerprint, varyingOpSeqs);
133-
FormXObjectStripper.emptyForms(cleaned, varyingForms);
135+
ColorPathStripper.removeFromAllPages(cleaned, watermarkColor, WATERMARK_COLOR_TOLERANCE);
134136
cleaned.save(output);
135137
}
136138
}
137139

140+
private static int to255(float component) {
141+
return Math.round(component * 255f);
142+
}
143+
138144
private static Map<File, List<File>> groupPdfsByDirectory(File root) {
139145
Map<File, List<File>> result = new LinkedHashMap<>();
140146
if (root.isFile()) {

0 commit comments

Comments
 (0)