Skip to content

Commit 6ee7c09

Browse files
authored
merge pull request #93 from BobLd/master
improving clustering algorithm
2 parents d37149a + b69c004 commit 6ee7c09

File tree

4 files changed

+140
-175
lines changed

4 files changed

+140
-175
lines changed

src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs

Lines changed: 99 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
1212
internal class ClusteringAlgorithms
1313
{
1414
/// <summary>
15-
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
16-
/// https://en.wikipedia.org/wiki/Transitive_closure
15+
/// Algorithm to group elements using nearest neighbours.
1716
/// </summary>
1817
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
1918
/// <param name="elements">List of elements to group.</param>
@@ -23,7 +22,7 @@ internal class ClusteringAlgorithms
2322
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
2423
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
2524
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
26-
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> elements,
25+
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(List<T> elements,
2726
Func<PdfPoint, PdfPoint, double> distMeasure,
2827
Func<T, T, double> maxDistanceFunction,
2928
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
@@ -41,7 +40,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
4140
* that if indexes[i] = j then indexes[j] != i.
4241
*
4342
* 2. Group indexes
44-
* Group indexes if share neighbours in common - Transitive closure
43+
* Group indexes if they share neighbours in common - Depth-first search
4544
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
4645
* (i,j,k) will form a group and (m,n) will form another group.
4746
*************************************************************************************/
@@ -56,12 +55,15 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
5655

5756
if (filterPivot(pivot))
5857
{
59-
int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
60-
var paired = elements[index];
58+
int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
6159

62-
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
60+
if (index != -1)
6361
{
64-
indexes[e] = index;
62+
var paired = elements[index];
63+
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
64+
{
65+
indexes[e] = index;
66+
}
6567
}
6668
}
6769
});
@@ -73,8 +75,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
7375
}
7476

7577
/// <summary>
76-
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
77-
/// https://en.wikipedia.org/wiki/Transitive_closure
78+
/// Algorithm to group elements using nearest neighbours.
7879
/// </summary>
7980
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
8081
/// <param name="elements">Array of elements to group.</param>
@@ -84,7 +85,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
8485
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
8586
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
8687
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
87-
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
88+
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
8889
Func<PdfPoint, PdfPoint, double> distMeasure,
8990
Func<T, T, double> maxDistanceFunction,
9091
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
@@ -102,7 +103,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
102103
* that if indexes[i] = j then indexes[j] != i.
103104
*
104105
* 2. Group indexes
105-
* Group indexes if share neighbours in common - Transitive closure
106+
* Group indexes if they share neighbours in common - Depth-first search
106107
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
107108
* (i,j,k) will form a group and (m,n) will form another group.
108109
*************************************************************************************/
@@ -117,12 +118,15 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
117118

118119
if (filterPivot(pivot))
119120
{
120-
int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
121-
var paired = elements[index];
121+
int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
122122

123-
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
123+
if (index != -1)
124124
{
125-
indexes[e] = index;
125+
var paired = elements[index];
126+
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
127+
{
128+
indexes[e] = index;
129+
}
126130
}
127131
}
128132
});
@@ -134,8 +138,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
134138
}
135139

136140
/// <summary>
137-
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
138-
/// https://en.wikipedia.org/wiki/Transitive_closure
141+
/// Algorithm to group elements using nearest neighbours.
139142
/// </summary>
140143
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
141144
/// <param name="elements">Array of elements to group.</param>
@@ -145,7 +148,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
145148
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
146149
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
147150
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
148-
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
151+
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
149152
Func<PdfLine, PdfLine, double> distMeasure,
150153
Func<T, T, double> maxDistanceFunction,
151154
Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
@@ -163,7 +166,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
163166
* that if indexes[i] = j then indexes[j] != i.
164167
*
165168
* 2. Group indexes
166-
* Group indexes if share neighbours in common - Transitive closure
169+
* Group indexes if they share neighbours in common - Depth-first search
167170
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
168171
* (i,j,k) will form a group and (m,n) will form another group.
169172
*************************************************************************************/
@@ -178,12 +181,15 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
178181

179182
if (filterPivot(pivot))
180183
{
181-
int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);
182-
var paired = elements[index];
184+
int index = pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out double dist);
183185

184-
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
186+
if (index != -1)
185187
{
186-
indexes[e] = index;
188+
var paired = elements[index];
189+
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
190+
{
191+
indexes[e] = index;
192+
}
187193
}
188194
}
189195
});
@@ -195,104 +201,98 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
195201
}
196202

197203
/// <summary>
198-
/// Group elements via transitive closure. Each element has only one connected neighbour.
199-
/// https://en.wikipedia.org/wiki/Transitive_closure
204+
/// Group elements using Depth-first search.
205+
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
200206
/// </summary>
201-
/// <param name="indexes">Array of paired elements index.</param>
202-
/// <returns></returns>
203-
private static List<HashSet<int>> GroupIndexes(int[] indexes)
207+
/// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.</param>
208+
/// <returns>A List of HashSets containing the grouped indexes.</returns>
209+
internal static List<HashSet<int>> GroupIndexes(int[] edges)
204210
{
205-
int[][] adjacency = new int[indexes.Length][];
206-
for (int i = 0; i < indexes.Length; i++)
211+
int[][] adjacency = new int[edges.Length][];
212+
for (int i = 0; i < edges.Length; i++)
207213
{
208214
HashSet<int> matches = new HashSet<int>();
209-
for (int j = 0; j < indexes.Length; ++j)
215+
if (edges[i] != -1) matches.Add(edges[i]);
216+
for (int j = 0; j < edges.Length; j++)
210217
{
211-
if (indexes[j] == i) matches.Add(j);
218+
if (edges[j] == i) matches.Add(j);
212219
}
213220
adjacency[i] = matches.ToArray();
214221
}
215222

216223
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
217-
bool[] isDone = new bool[indexes.Length];
224+
bool[] isDone = new bool[edges.Length];
218225

219-
for (int p = 0; p < indexes.Length; p++)
226+
for (int p = 0; p < edges.Length; p++)
220227
{
221228
if (isDone[p]) continue;
229+
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
230+
}
231+
return groupedIndexes;
232+
}
222233

223-
LinkedList<int[]> L = new LinkedList<int[]>();
224-
HashSet<int> grouped = new HashSet<int>();
225-
L.AddLast(new[] { p, indexes[p] });
226-
227-
while (L.Any())
234+
/// <summary>
235+
/// Group elements using Depth-first search.
236+
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
237+
/// </summary>
238+
/// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...</param>
239+
/// <returns>A List of HashSets containing the grouped indexes.</returns>
240+
internal static List<HashSet<int>> GroupIndexes(int[][] edges)
241+
{
242+
int[][] adjacency = new int[edges.Length][];
243+
for (int i = 0; i < edges.Length; i++)
244+
{
245+
HashSet<int> matches = new HashSet<int>();
246+
for (int j = 0; j < edges[i].Length; j++)
228247
{
229-
var current = L.First.Value;
230-
L.RemoveFirst();
231-
var current0 = current[0];
232-
var current1 = current[1];
248+
if (edges[i][j] != -1) matches.Add(edges[i][j]);
249+
}
233250

234-
if (current0 != -1 && !isDone[current0])
251+
for (int j = 0; j < edges.Length; j++)
252+
{
253+
for (int k = 0; k < edges[j].Length; k++)
235254
{
236-
var adjs = adjacency[current0];
237-
foreach (var k in adjs)
238-
{
239-
if (isDone[k]) continue;
240-
L.AddLast(new[] { k, current0 });
241-
}
242-
243-
int current0P = indexes[current0];
244-
if (current0P != -1)
245-
{
246-
var adjsP = adjacency[current0P];
247-
foreach (var k in adjsP)
248-
{
249-
if (isDone[k]) continue;
250-
L.AddLast(new[] { k, current0P });
251-
isDone[k] = true;
252-
grouped.Add(k);
253-
}
254-
}
255-
else
256-
{
257-
L.AddLast(new[] { current0, current0P });
258-
isDone[current0] = true;
259-
grouped.Add(current0);
260-
}
255+
if (edges[j][k] == i) matches.Add(j);
261256
}
257+
}
258+
adjacency[i] = matches.ToArray();
259+
}
262260

263-
if (current1 != -1 && !isDone[current1])
264-
{
265-
var adjs = adjacency[current1];
266-
foreach (var k in adjs)
267-
{
268-
if (isDone[k]) continue;
269-
L.AddLast(new[] { k, current1 });
270-
}
261+
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
262+
bool[] isDone = new bool[edges.Length];
271263

272-
int current1P = indexes[current1];
273-
if (current1P != -1)
274-
{
275-
var adjsP = adjacency[current1P];
276-
foreach (var k in adjsP)
277-
{
278-
if (isDone[k]) continue;
279-
L.AddLast(new[] { k, current1P });
280-
isDone[k] = true;
281-
grouped.Add(k);
282-
}
283-
}
284-
else
285-
{
286-
L.AddLast(new[] { current1, current1P });
287-
isDone[current1] = true;
288-
grouped.Add(current1);
289-
}
264+
for (int p = 0; p < edges.Length; p++)
265+
{
266+
if (isDone[p]) continue;
267+
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
268+
}
269+
return groupedIndexes;
270+
}
271+
272+
/// <summary>
273+
/// Depth-first search
274+
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
275+
/// </summary>
276+
private static HashSet<int> DfsIterative(int c, int[][] adj, ref bool[] isDone)
277+
{
278+
HashSet<int> group = new HashSet<int>();
279+
Stack<int> S = new Stack<int>();
280+
S.Push(c);
281+
282+
while (S.Any())
283+
{
284+
var v = S.Pop();
285+
if (!isDone[v])
286+
{
287+
group.Add(v);
288+
isDone[v] = true;
289+
foreach (var w in adj[v])
290+
{
291+
S.Push(w);
290292
}
291293
}
292-
groupedIndexes.Add(grouped);
293294
}
294-
295-
return groupedIndexes;
295+
return group;
296296
}
297297
}
298298
}

0 commit comments

Comments
 (0)