Fix encrypted PDF string decryption when ciphertext starts with BOM bytes (#1306)

VarunSaiTeja · web-flow · commit 450b85538b77 · 2026-05-25T14:51:01.000+01:00
* Add BOM to UTF-16 encoded byte array

Change StringToken.GetBytes() so UTF-16 (little-endian) strings are returned with the FF FE BOM prepended to the byte array.

* Enhance StringToken to store raw byte data

Added rawBytes field to preserve original byte data from the PDF file.

* Add original raw bytes to StringToken constructor

* Simplify originalRawBytes assignment in StringTokenizer

Simplified assignment of originalRawBytes by removing cloning.

* Add test for decrypting BOM-prefixed ciphertext

Added a test to verify decryption of strings in encrypted PDFs whose ciphertext happens to begin with BOM bytes.

* Fix keyword assertion in EncryptedDocumentTests

Update keyword assertion for encrypted PDF test.

* Add test file test-bom-encrypted.pdf for the BOM-prefixed ciphertext test
diff --git a/src/UglyToad.PdfPig.Tests/Integration/EncryptedDocumentTests.cs b/src/UglyToad.PdfPig.Tests/Integration/EncryptedDocumentTests.cs
@@ -72,6 +72,24 @@ public void CanReadDocumentWithNoKeyLengthAndRevision4()
             }
         }
 
+        [Fact]
+        public void CanDecryptStringWhenCiphertextStartsWithBom()
+        {
+            // The Keywords string in this PDF has ciphertext bytes starting with FF FE (UTF-16 LE BOM).
+            // Without the fix, StringTokenizer detects the BOM on encrypted bytes, strips 2 bytes,
+            // and the subsequent RC4 decryption produces garbage.
+            using (var document = PdfDocument.Open(
+                IntegrationHelpers.GetSpecificTestDocumentPath("test-bom-encrypted.pdf")))
+            {
+                Assert.True(document.IsEncrypted);
+         
+                var keywords = document.Information.Keywords;
+         
+                Assert.NotNull(keywords);
+                Assert.Contains("sample keywords for testing encrypted PDF string decryption", keywords);
+            }
+        }
+
         private static string GetPath() => IntegrationHelpers.GetSpecificTestDocumentPath(FileName);
     }
-}
+}
diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/test-bom-encrypted.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/test-bom-encrypted.pdf
diff --git a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs
@@ -1,4 +1,4 @@
-﻿namespace UglyToad.PdfPig.Tokenization
+namespace UglyToad.PdfPig.Tokenization
 {
     using System.Text;
     using Core;
@@ -154,11 +154,13 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
 
             StringToken.Encoding encodedWith;
             string tokenStr;
+            byte[] originalRawBytes = null;
             if (builder.Length >= 2)
             {
                 if (builder[0] == 0xFE && builder[1] == 0xFF)
                 {
                     var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
+                    originalRawBytes = rawBytes;
 
                     tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);
 
@@ -167,6 +169,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
                 else if (builder[0] == 0xFF && builder[1] == 0xFE)
                 {
                     var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
+                    originalRawBytes = rawBytes;
 
                     tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);
 
@@ -218,7 +221,9 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
 
             builder.Clear();
 
-            token = new StringToken(tokenStr, encodedWith);
+            token = originalRawBytes != null
+                ? new StringToken(tokenStr, encodedWith, originalRawBytes)
+                : new StringToken(tokenStr, encodedWith);
 
             return true;
         }
@@ -316,4 +321,4 @@ private static int CheckForEndOfString(int numberOfBrackets, IInputBytes bytes)
             return braces;
         }
     }
-}
+}
diff --git a/src/UglyToad.PdfPig.Tokens/StringToken.cs b/src/UglyToad.PdfPig.Tokens/StringToken.cs
@@ -8,6 +8,8 @@ namespace UglyToad.PdfPig.Tokens
     /// </summary>
     public class StringToken : IDataToken<string>
     {
+        private readonly byte[] rawBytes;
+
         /// <summary>
         /// The string in the token.
         /// </summary>
@@ -30,11 +32,29 @@ public StringToken(string data, Encoding encodedWith = Encoding.Iso88591)
             EncodedWith = encodedWith;
         }
 
+        /// <summary>
+        /// Create a new <see cref="StringToken"/> with preserved raw bytes.
+        /// </summary>
+        /// <param name="data">The string data for the token to contain.</param>
+        /// <param name="encodedWith">The encoding used to generate the <see cref="Data"/>.</param>
+        /// <param name="rawBytes">The original raw bytes from the PDF file.</param>
+        public StringToken(string data, Encoding encodedWith, byte[] rawBytes)
+        {
+            Data = data ?? throw new ArgumentNullException(nameof(data));
+            EncodedWith = encodedWith;
+            this.rawBytes = rawBytes;
+        }
+
         /// <summary>
         /// Convert the <see langword="string"/> in <see cref="Data"/> back to bytes.
         /// </summary>
         public byte[] GetBytes()
         {
+            if (rawBytes != null)
+            {
+                return rawBytes;
+            }
+
             switch (EncodedWith)
             {
                 case Encoding.Utf16BE:
@@ -51,7 +71,12 @@ public byte[] GetBytes()
                 }
                 case Encoding.Utf16:
                 {
-                    return System.Text.Encoding.Unicode.GetBytes(Data);
+                    var data = System.Text.Encoding.Unicode.GetBytes(Data);
+                    var result = new byte[data.Length + 2];
+                    result[0] = 0xFF;
+                    result[1] = 0xFE;
+                    Array.Copy(data, 0, result, 2, data.Length);
+                    return result;
                 }
                 case Encoding.PdfDocEncoding:
                     return PdfDocEncoding.StringToBytes(Data);
@@ -105,4 +130,4 @@ public enum Encoding : byte
             PdfDocEncoding = 3,
         }
     }
-}
+}

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-namespace UglyToad.PdfPig.Tokenization`
	`1`	`+namespace UglyToad.PdfPig.Tokenization`
`2`	`2`	`{`
`3`	`3`	`using System.Text;`
`4`	`4`	`using Core;`
`@@ -154,11 +154,13 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok`
`154`	`154`
`155`	`155`	`StringToken.Encoding encodedWith;`
`156`	`156`	`string tokenStr;`
	`157`	`+ byte[] originalRawBytes = null;`
`157`	`158`	`if (builder.Length >= 2)`
`158`	`159`	`{`
`159`	`160`	`if (builder[0] == 0xFE && builder[1] == 0xFF)`
`160`	`161`	`{`
`161`	`162`	`var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());`
	`163`	`+ originalRawBytes = rawBytes;`
`162`	`164`
`163`	`165`	`tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);`
`164`	`166`
`@@ -167,6 +169,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok`
`167`	`169`	`else if (builder[0] == 0xFF && builder[1] == 0xFE)`
`168`	`170`	`{`
`169`	`171`	`var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());`
	`172`	`+ originalRawBytes = rawBytes;`
`170`	`173`
`171`	`174`	`tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);`
`172`	`175`
`@@ -218,7 +221,9 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok`
`218`	`221`
`219`	`222`	`builder.Clear();`
`220`	`223`
`221`		`- token = new StringToken(tokenStr, encodedWith);`
	`224`	`+ token = originalRawBytes != null`
	`225`	`+ ? new StringToken(tokenStr, encodedWith, originalRawBytes)`
	`226`	`+ : new StringToken(tokenStr, encodedWith);`
`222`	`227`
`223`	`228`	`return true;`
`224`	`229`	`}`
`@@ -316,4 +321,4 @@ private static int CheckForEndOfString(int numberOfBrackets, IInputBytes bytes)`
`316`	`321`	`return braces;`
`317`	`322`	`}`
`318`	`323`	`}`
`319`		`-}`
	`324`	`+}`