Skip to content

Commit 97e831c

Browse files
committed
handle narrow whitespaces in default text extractor #319
When a gap is small but much larger than all previous gaps at this font size (and still larger than some minimum threshold), break the word at this gap boundary.
1 parent 264cf7b commit 97e831c

File tree

1 file changed

+37
-2
lines changed

1 file changed

+37
-2
lines changed

src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs

Lines changed: 37 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@ public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
2121

2222
var lettersSoFar = new List<Letter>(10);
2323

24+
var gapCountsSoFarByFontSize = new Dictionary<double, Dictionary<double, int>>();
25+
2426
var y = default(double?);
2527
var lastX = default(double?);
2628
var lastLetter = default(Letter);
@@ -68,15 +70,48 @@ public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
6870
continue;
6971
}
7072

73+
var letterHeight = Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height);
74+
7175
var gap = letter.Location.X - (lastLetter.Location.X + lastLetter.Width);
7276
var nextToLeft = letter.Location.X < lastX.Value - 1;
73-
var nextBigSpace = gap > Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height) * 0.39;
77+
var nextBigSpace = gap > letterHeight * 0.39;
7478
var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
7579
var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1;
7680
var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1;
7781
var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation;
7882

79-
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers)
83+
var suspectGap = false;
84+
85+
if (!nextFontSizeDiffers && letter.FontSize > 0 && gap >= 0)
86+
{
87+
var fontSize = Math.Round(letter.FontSize);
88+
if (!gapCountsSoFarByFontSize.TryGetValue(fontSize, out var gapCounts))
89+
{
90+
gapCounts = new Dictionary<double, int>();
91+
gapCountsSoFarByFontSize[fontSize] = gapCounts;
92+
}
93+
94+
var gapRounded = Math.Round(gap, 2);
95+
if (!gapCounts.ContainsKey(gapRounded))
96+
{
97+
gapCounts[gapRounded] = 0;
98+
}
99+
100+
gapCounts[gapRounded]++;
101+
102+
// More than one type of gap.
103+
if (gapCounts.Count > 1 && gap > letterHeight * 0.16)
104+
{
105+
var mostCommonGap = gapCounts.OrderByDescending(x => x.Value).First();
106+
107+
if (gap > (mostCommonGap.Key * 5) && mostCommonGap.Value > 1)
108+
{
109+
suspectGap = true;
110+
}
111+
}
112+
}
113+
114+
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers || suspectGap)
80115
{
81116
if (lettersSoFar.Count > 0)
82117
{

0 commit comments

Comments
 (0)