Skip to content
BobLd edited this page Apr 21, 2025 · 5 revisions

Letters / Words / Text Block Bounding Boxes Differ by Operating System

This is almost certainly due to fonts being available on one OS but not on another.

On Linux, installing additional fonts should help:

sudo apt install ttf*

See issue

Some images cannot be extracted

PdfPig does not support all image filters out of the box. Filters requiring an external implementation are: DCT, JPX and JBIG2. You can either implement your own, or use the following NuGet packages:

  • PdfPig.Filters.Dct.JpegLibrary
  • PdfPig.Filters.Jbig2.PdfboxJbig2
  • PdfPig.Filters.Jpx.OpenJpegDotNet

Once the NuGet packages are added, use the following:

// Create your filter provider
/// <summary>
/// Filter provider that registers the default filters together with the
/// externally implemented JBIG2, JPX and DCT decoders, keyed by filter name.
/// </summary>
public sealed class MyFilterProvider : BaseFilterProvider
{
    /// <summary>
    /// The single instance of this provider.
    /// </summary>
    public static readonly IFilterProvider Instance = new MyFilterProvider();

    /// <summary>
    /// Private so that <see cref="Instance"/> is the only way to obtain the provider.
    /// </summary>
    private MyFilterProvider() : base(GetDictionary())
    {
    }

    /// <summary>
    /// Builds the filter-name to filter lookup handed to the base provider.
    /// </summary>
    /// <returns>
    /// A dictionary mapping each filter name (and its abbreviation, where one
    /// is registered) to the filter instance that handles it.
    /// </returns>
    private static Dictionary<string, IFilter> GetDictionary()
    {
        // New filters, supplied by the external NuGet packages.
        // The DCT entry uses the external decoder, so no default
        // DctDecodeFilter is declared below (a second `dct` local
        // would not compile).
        var jbig2 = new PdfboxJbig2DecodeFilter();
        var jpx = new OpenJpegJpxDecodeFilter();
        var dct = new JpegLibraryDctDecodeFilter();

        // Default filters
        var ascii85 = new Ascii85Filter();
        var asciiHex = new AsciiHexDecodeFilter();
        var ccitt = new CcittFaxDecodeFilter();
        var flate = new FlateFilter();
        var runLength = new RunLengthFilter();
        var lzw = new LzwFilter();

        // Full names and abbreviations share the same filter instance.
        return new Dictionary<string, IFilter>
        {
            { NameToken.Ascii85Decode.Data, ascii85 },
            { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
            { NameToken.AsciiHexDecode.Data, asciiHex },
            { NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex },
            { NameToken.CcittfaxDecode.Data, ccitt },
            { NameToken.CcittfaxDecodeAbbreviation.Data, ccitt },
            { NameToken.DctDecode.Data, dct },
            { NameToken.DctDecodeAbbreviation.Data, dct },
            { NameToken.FlateDecode.Data, flate },
            { NameToken.FlateDecodeAbbreviation.Data, flate },
            { NameToken.Jbig2Decode.Data, jbig2 },
            { NameToken.JpxDecode.Data, jpx },
            { NameToken.RunLengthDecode.Data, runLength },
            { NameToken.RunLengthDecodeAbbreviation.Data, runLength },
            { NameToken.LzwDecode.Data, lzw },
            { NameToken.LzwDecodeAbbreviation.Data, lzw }
        };
    }
}

// Parsing options that plug the custom filter provider into PdfPig.
var parsingOption = new ParsingOptions()
{
	UseLenientParsing = true, // Optional
	SkipMissingFonts = true, // Optional
	FilterProvider = MyFilterProvider.Instance
};

// Open the document and write every image that can be converted to PNG to disk.
using (var doc = PdfDocument.Open("my_document.pdf", parsingOption))
{
	int i = 0;
	foreach (var page in doc.GetPages())
	{
		foreach (var pdfImage in page.GetImages())
		{
			// Ask PdfPig for the PNG bytes; skip images it cannot convert.
			if (!pdfImage.TryGetPng(out var bytes))
			{
				continue;
			}

			// Process your images, e.g.:
			File.WriteAllBytes($"image_{i++}.png", bytes);
		}
	}
}
Clone this wiki locally