Skip to content

Commit 77b7248

Browse files
committed
Added lazy image byte loading and EagerlyLoadImageBytes option
1 parent e5c04fe commit 77b7248

7 files changed

Lines changed: 160 additions & 109 deletions

File tree

src/UglyToad.PdfPig.Tests/TestPdfImage.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ public class TestPdfImage : IPdfImage
3838

3939
public Memory<byte> DecodedBytes { get; set; }
4040

41+
public bool HasLoadedBytes => !RawMemory.IsEmpty;
42+
4143
public IPdfImage? MaskImage { get; }
4244

4345
public bool TryGetBytesAsMemory(out Memory<byte> bytes)

src/UglyToad.PdfPig/Content/IPdfImage.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,23 @@ public interface IPdfImage: IBoundingBox
3838

3939
/// <summary>
4040
/// The encoded memory of the image with all filters still applied.
41+
/// Accessing this property may trigger loading of the image bytes from the PDF stream.
4142
/// </summary>
4243
Memory<byte> RawMemory { get; }
4344

4445
/// <summary>
4546
/// The encoded memory span of the image with all filters still applied.
47+
/// Accessing this property may trigger loading of the image bytes from the PDF stream.
4648
/// </summary>
4749
Span<byte> RawBytes { get; }
4850

51+
/// <summary>
52+
/// Whether the image byte data is available. Returns <see langword="false"/> when
53+
/// <see cref="ParsingOptions.EagerlyLoadImageBytes"/> is <see langword="false"/>.
54+
/// Image metadata (dimensions, color space, bounding box) is available regardless.
55+
/// </summary>
56+
bool HasLoadedBytes { get; }
57+
4958
/// <summary>
5059
/// The color rendering intent to be used when rendering the image.
5160
/// </summary>

src/UglyToad.PdfPig/Content/InlineImage.cs

Lines changed: 47 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,28 @@
22
{
33
using System;
44
using System.Collections.Generic;
5-
using System.Diagnostics.CodeAnalysis;
5+
using System.Diagnostics.CodeAnalysis;
66
using Core;
77
using Filters;
88
using Graphics.Colors;
99
using Graphics.Core;
1010
using Tokens;
11-
using Images.Png;
12-
11+
using Images.Png;
12+
1313
/// <inheritdoc />
1414
/// <summary>
1515
/// A small image that is completely defined directly inline within a <see cref="T:UglyToad.PdfPig.Content.Page" />'s content stream.
1616
/// </summary>
1717
public class InlineImage : IPdfImage
1818
{
1919
private readonly Lazy<Memory<byte>>? memoryFactory;
20+
private readonly Memory<byte> rawMemory;
2021

2122
/// <inheritdoc />
22-
public PdfRectangle BoundingBox { get; }
23+
public PdfRectangle BoundingBox { get; }
2324

24-
/// <inheritdoc />
25-
[Obsolete("Use BoundingBox instead.")]
25+
/// <inheritdoc />
26+
[Obsolete("Use BoundingBox instead.")]
2627
public PdfRectangle Bounds => BoundingBox;
2728

2829
/// <inheritdoc />
@@ -53,33 +54,36 @@ public class InlineImage : IPdfImage
5354
public bool Interpolate { get; }
5455

5556
/// <inheritdoc />
56-
public Memory<byte> RawMemory { get; }
57-
57+
public Memory<byte> RawMemory => rawMemory;
58+
5859
/// <inheritdoc />
5960
public Span<byte> RawBytes => RawMemory.Span;
6061

6162
/// <inheritdoc />
62-
public ColorSpaceDetails ColorSpaceDetails { get; }
63-
64-
/// <inheritdoc />
63+
public bool HasLoadedBytes => !rawMemory.IsEmpty;
64+
65+
/// <inheritdoc />
66+
public ColorSpaceDetails ColorSpaceDetails { get; }
67+
68+
/// <inheritdoc />
6569
public IPdfImage? MaskImage { get; }
6670

6771
/// <summary>
6872
/// Create a new <see cref="InlineImage"/>.
6973
/// </summary>
70-
internal InlineImage(PdfRectangle bounds,
71-
int widthInSamples,
72-
int heightInSamples,
73-
int bitsPerComponent,
74+
internal InlineImage(PdfRectangle bounds,
75+
int widthInSamples,
76+
int heightInSamples,
77+
int bitsPerComponent,
7478
bool isImageMask,
7579
RenderingIntent renderingIntent,
7680
bool interpolate,
7781
IReadOnlyList<double> decode,
78-
Memory<byte> rawMemory,
82+
Memory<byte> rawMemory,
7983
ILookupFilterProvider filterProvider,
80-
IReadOnlyList<NameToken> filterNames,
84+
IReadOnlyList<NameToken> filterNames,
8185
DictionaryToken streamDictionary,
82-
ColorSpaceDetails colorSpaceDetails,
86+
ColorSpaceDetails colorSpaceDetails,
8387
IPdfImage? softMaskImage)
8488
{
8589
IsInlineImage = true;
@@ -92,33 +96,36 @@ internal InlineImage(PdfRectangle bounds,
9296
RenderingIntent = renderingIntent;
9397
Interpolate = interpolate;
9498
ImageDictionary = streamDictionary;
95-
RawMemory = rawMemory;
96-
ColorSpaceDetails = colorSpaceDetails;
97-
98-
var filters = filterProvider.GetNamedFilters(filterNames);
99-
100-
var supportsFilters = true;
101-
foreach (var filter in filters)
99+
this.rawMemory = rawMemory;
100+
ColorSpaceDetails = colorSpaceDetails;
101+
102+
if (!rawMemory.IsEmpty)
102103
{
103-
if (!filter.IsSupported)
104+
var filters = filterProvider.GetNamedFilters(filterNames);
105+
106+
var supportsFilters = true;
107+
foreach (var filter in filters)
104108
{
105-
supportsFilters = false;
106-
break;
109+
if (!filter.IsSupported)
110+
{
111+
supportsFilters = false;
112+
break;
113+
}
107114
}
108-
}
109115

110-
memoryFactory = supportsFilters ? new Lazy<Memory<byte>>(() =>
111-
{
112-
var b = RawMemory;
113-
for (var i = 0; i < filters.Count; i++)
116+
memoryFactory = supportsFilters ? new Lazy<Memory<byte>>(() =>
114117
{
115-
var filter = filters[i];
116-
b = filter.Decode(b, streamDictionary, filterProvider, i);
117-
}
118+
var b = RawMemory;
119+
for (var i = 0; i < filters.Count; i++)
120+
{
121+
var filter = filters[i];
122+
b = filter.Decode(b, streamDictionary, filterProvider, i);
123+
}
124+
125+
return b;
126+
}) : null;
127+
}
118128

119-
return b;
120-
}) : null;
121-
122129
MaskImage = softMaskImage;
123130
}
124131

@@ -143,6 +150,6 @@ public bool TryGetBytesAsMemory(out Memory<byte> bytes)
143150
public override string ToString()
144151
{
145152
return $"Inline Image (w {BoundingBox.Width}, h {BoundingBox.Height})";
146-
}
153+
}
147154
}
148155
}

src/UglyToad.PdfPig/Graphics/BaseStreamProcessor.cs

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ public virtual void PushState()
215215
/// <inheritdoc/>
216216
public void ShowText(IInputBytes bytes)
217217
{
218-
TextSequence++;
218+
TextSequence++;
219219

220220
var currentState = GetCurrentState();
221221

@@ -435,8 +435,16 @@ public virtual void ApplyXObject(NameToken xObjectName)
435435
}
436436
else if (subType.Equals(NameToken.Image))
437437
{
438+
var imageStream = xObjectStream;
439+
if (!ParsingOptions.EagerlyLoadImageBytes)
440+
{
441+
// Replace the stream data with empty bytes so we don't hold image data in memory.
442+
// The dictionary is kept for metadata (width, height, etc).
443+
imageStream = new StreamToken(xObjectStream.StreamDictionary, Memory<byte>.Empty);
444+
}
445+
438446
var contentRecord = new XObjectContentRecord(XObjectType.Image,
439-
xObjectStream,
447+
imageStream,
440448
matrix,
441449
state.RenderingIntent,
442450
state.ColorSpaceContext?.CurrentStrokingColorSpace ?? DeviceRgbColorSpaceDetails.Instance);
@@ -565,7 +573,7 @@ protected virtual void ProcessFormXObject(StreamToken formStream, NameToken xObj
565573
.ToArray());
566574
}
567575

568-
// 2. Update current transformation matrix.
576+
// 2. Update current transformation matrix.
569577
ModifyCurrentTransformationMatrix(formMatrix);
570578

571579
var contentStream = formStream.Decode(FilterProvider, PdfScanner);
@@ -587,7 +595,7 @@ protected virtual void ProcessFormXObject(StreamToken formStream, NameToken xObj
587595
if (hasCircularReference)
588596
{
589597
if (ParsingOptions.UseLenientParsing)
590-
{
598+
{
591599
// TODO - We might be removing too much, good for the moment. See Issues1250() for examples
592600
operations = operations.Where(o => o is not InvokeNamedXObject xo || xo.Name != xObjectName)
593601
.ToArray();
@@ -635,21 +643,21 @@ protected virtual bool HasFormXObjectCircularReference(StreamToken formStream,
635643
if (!TryGetXObjectToken(formStream, xObjectName, PdfScanner, out var t1))
636644
{
637645
return false;
638-
}
639-
646+
}
647+
640648
if (!ResourceStore.TryGetXObject(xObjectName, out var resourceStream))
641649
{
642650
return false;
643-
}
644-
651+
}
652+
645653
if (!TryGetXObjectToken(resourceStream, xObjectName, PdfScanner, out var t2))
646654
{
647655
return false;
648-
}
649-
650-
if (t1 is null || t2 is null)
651-
{
652-
return false;
656+
}
657+
658+
if (t1 is null || t2 is null)
659+
{
660+
return false;
653661
}
654662

655663
return t1.Equals(t2);
@@ -660,8 +668,8 @@ static bool TryGetXObjectToken(StreamToken streamToken, NameToken xObjectName, I
660668
if (!streamToken.StreamDictionary.TryGet<DictionaryToken>(NameToken.Resources, scanner, out var formResources))
661669
{
662670
return false;
663-
}
664-
671+
}
672+
665673
if (!formResources.TryGet<DictionaryToken>(NameToken.Xobject, out var xObjectBase) || !xObjectBase.TryGet(xObjectName, out token))
666674
{
667675
return false;
@@ -887,7 +895,7 @@ public virtual void EndInlineImage(Memory<byte> bytes)
887895
return;
888896
}
889897

890-
InlineImageBuilder.Bytes = bytes;
898+
InlineImageBuilder.Bytes = ParsingOptions.EagerlyLoadImageBytes ? bytes : Memory<byte>.Empty;
891899

892900
var image = InlineImageBuilder.CreateInlineImage(CurrentTransformationMatrix,
893901
FilterProvider,
@@ -1004,7 +1012,7 @@ public virtual void SetWordSpacing(double spacing)
10041012
/// <inheritdoc/>
10051013
public virtual void ModifyCurrentTransformationMatrix(TransformationMatrix value)
10061014
{
1007-
var state = GetCurrentState();
1015+
var state = GetCurrentState();
10081016
state.CurrentTransformationMatrix = value.Multiply(state.CurrentTransformationMatrix);
10091017
}
10101018

src/UglyToad.PdfPig/ParsingOptions.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,15 @@ public sealed class ParsingOptions
6363
/// Filter provider to use while parsing the document. The <see cref="DefaultFilterProvider"/> will be used if set to <c>null</c>.
6464
/// </summary>
6565
public IFilterProvider? FilterProvider { get; set; } = null;
66+
67+
/// <summary>
68+
/// Whether to load image byte data when enumerating images on a page.
69+
/// When <see langword="true"/> (the default), image bytes are available via <see cref="Content.IPdfImage.RawMemory"/>
70+
/// and <see cref="Content.IPdfImage.TryGetBytesAsMemory(out System.Memory{byte})"/>.
71+
/// When <see langword="false"/>, image metadata (width, height, bounding box, color space, etc.)
72+
/// is still available but image byte data is not retained, reducing memory usage
73+
/// for documents with large or numerous images.
74+
/// </summary>
75+
public bool EagerlyLoadImageBytes { get; set; } = true;
6676
}
6777
}

0 commit comments

Comments
 (0)