Skip to content

Commit 450b855

Browse files
authored
Fix encrypted PDF string decryption when ciphertext starts with BOM bytes (#1306)
* Add BOM to UTF-16 encoded byte array: refactor UTF-16 encoding to include the BOM in the byte array.
* Enhance StringToken to store raw byte data: added a rawBytes field to preserve the original byte data from the PDF file.
* Add original raw bytes to StringToken constructor.
* Simplify originalRawBytes assignment in StringTokenizer: simplified the assignment of originalRawBytes by removing cloning.
* Add test for decrypting BOM-prefixed ciphertext: added a test to verify decryption of strings with a BOM in encrypted PDFs.
* Fix keyword assertion in EncryptedDocumentTests: update the keyword assertion for the encrypted PDF test.
* Added test file for bom-encrypted.
1 parent 5a851a1 commit 450b855

4 files changed

Lines changed: 54 additions & 6 deletions

File tree

src/UglyToad.PdfPig.Tests/Integration/EncryptedDocumentTests.cs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,24 @@ public void CanReadDocumentWithNoKeyLengthAndRevision4()
7272
}
7373
}
7474

75+
[Fact]
public void CanDecryptStringWhenCiphertextStartsWithBom()
{
    // Regression test: the encrypted Keywords string in this PDF begins with FF FE, which looks
    // like a UTF-16 LE BOM. Before the fix, StringTokenizer detected the BOM on the encrypted
    // bytes and stripped 2 bytes, so the subsequent RC4 decryption produced garbage.
    const string expectedKeywords = "sample keywords for testing encrypted PDF string decryption";
    var path = IntegrationHelpers.GetSpecificTestDocumentPath("test-bom-encrypted.pdf");

    using (var pdf = PdfDocument.Open(path))
    {
        Assert.True(pdf.IsEncrypted);

        var actualKeywords = pdf.Information.Keywords;

        Assert.NotNull(actualKeywords);
        Assert.Contains(expectedKeywords, actualKeywords);
    }
}
92+
7593
private static string GetPath() => IntegrationHelpers.GetSpecificTestDocumentPath(FileName);
7694
}
77-
}
95+
}
Binary file not shown.

src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
namespace UglyToad.PdfPig.Tokenization
1+
namespace UglyToad.PdfPig.Tokenization
22
{
33
using System.Text;
44
using Core;
@@ -154,11 +154,13 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
154154

155155
StringToken.Encoding encodedWith;
156156
string tokenStr;
157+
byte[] originalRawBytes = null;
157158
if (builder.Length >= 2)
158159
{
159160
if (builder[0] == 0xFE && builder[1] == 0xFF)
160161
{
161162
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
163+
originalRawBytes = rawBytes;
162164

163165
tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);
164166

@@ -167,6 +169,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
167169
else if (builder[0] == 0xFF && builder[1] == 0xFE)
168170
{
169171
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
172+
originalRawBytes = rawBytes;
170173

171174
tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);
172175

@@ -218,7 +221,9 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
218221

219222
builder.Clear();
220223

221-
token = new StringToken(tokenStr, encodedWith);
224+
token = originalRawBytes != null
225+
? new StringToken(tokenStr, encodedWith, originalRawBytes)
226+
: new StringToken(tokenStr, encodedWith);
222227

223228
return true;
224229
}
@@ -316,4 +321,4 @@ private static int CheckForEndOfString(int numberOfBrackets, IInputBytes bytes)
316321
return braces;
317322
}
318323
}
319-
}
324+
}

src/UglyToad.PdfPig.Tokens/StringToken.cs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ namespace UglyToad.PdfPig.Tokens
88
/// </summary>
99
public class StringToken : IDataToken<string>
1010
{
11+
private readonly byte[] rawBytes;
12+
1113
/// <summary>
1214
/// The string in the token.
1315
/// </summary>
@@ -30,11 +32,29 @@ public StringToken(string data, Encoding encodedWith = Encoding.Iso88591)
3032
EncodedWith = encodedWith;
3133
}
3234

35+
/// <summary>
/// Create a new <see cref="StringToken"/> with preserved raw bytes.
/// </summary>
/// <param name="data">The string data for the token to contain.</param>
/// <param name="encodedWith">The encoding used to generate the <see cref="Data"/>.</param>
/// <param name="rawBytes">
/// The original raw bytes from the PDF file. When non-null, <see cref="GetBytes"/> returns these
/// bytes instead of re-encoding <see cref="Data"/>. May be <see langword="null"/>, in which case
/// the token behaves as if it had been created without raw bytes.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
public StringToken(string data, Encoding encodedWith, byte[] rawBytes)
{
    Data = data ?? throw new ArgumentNullException(nameof(data));
    EncodedWith = encodedWith;

    // Take a private copy so that later changes the caller makes to its own array
    // cannot alter the bytes held by this token.
    this.rawBytes = (byte[])rawBytes?.Clone();
}
47+
3348
/// <summary>
3449
/// Convert the <see langword="string"/> in <see cref="Data"/> back to bytes.
3550
/// </summary>
3651
public byte[] GetBytes()
3752
{
53+
if (rawBytes != null)
54+
{
55+
return rawBytes;
56+
}
57+
3858
switch (EncodedWith)
3959
{
4060
case Encoding.Utf16BE:
@@ -51,7 +71,12 @@ public byte[] GetBytes()
5171
}
5272
case Encoding.Utf16:
5373
{
54-
return System.Text.Encoding.Unicode.GetBytes(Data);
74+
var data = System.Text.Encoding.Unicode.GetBytes(Data);
75+
var result = new byte[data.Length + 2];
76+
result[0] = 0xFF;
77+
result[1] = 0xFE;
78+
Array.Copy(data, 0, result, 2, data.Length);
79+
return result;
5580
}
5681
case Encoding.PdfDocEncoding:
5782
return PdfDocEncoding.StringToBytes(Data);
@@ -105,4 +130,4 @@ public enum Encoding : byte
105130
PdfDocEncoding = 3,
106131
}
107132
}
108-
}
133+
}

0 commit comments

Comments
 (0)