@@ -12,8 +12,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
12
12
internal class ClusteringAlgorithms
13
13
{
14
14
/// <summary>
15
- /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
16
- /// https://en.wikipedia.org/wiki/Transitive_closure
15
+ /// Algorithm to group elements using nearest neighbours.
17
16
/// </summary>
18
17
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
19
18
/// <param name="elements">List of elements to group.</param>
@@ -23,7 +22,7 @@ internal class ClusteringAlgorithms
23
22
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
24
23
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
25
24
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
26
- internal static IEnumerable < HashSet < int > > SimpleTransitiveClosure < T > ( List < T > elements ,
25
+ internal static IEnumerable < HashSet < int > > ClusterNearestNeighbours < T > ( List < T > elements ,
27
26
Func < PdfPoint , PdfPoint , double > distMeasure ,
28
27
Func < T , T , double > maxDistanceFunction ,
29
28
Func < T , PdfPoint > pivotPoint , Func < T , PdfPoint > candidatesPoint ,
@@ -41,7 +40,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
41
40
* that if indexes[i] = j then indexes[j] != i.
42
41
*
43
42
* 2. Group indexes
44
- * Group indexes if share neighbours in common - Transitive closure
43
+ * Group indexes that share neighbours in common - Depth-first search
45
44
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
46
45
* (i,j,k) will form a group and (m,n) will form another group.
47
46
*************************************************************************************/
@@ -56,12 +55,15 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
56
55
57
56
if ( filterPivot ( pivot ) )
58
57
{
59
- int index = pivotPoint ( pivot ) . FindIndexNearest ( candidatesPoints , distMeasure , out double dist ) ;
60
- var paired = elements [ index ] ;
58
+ int index = pivot . FindIndexNearest ( elements , candidatesPoint , pivotPoint , distMeasure , out double dist ) ;
61
59
62
- if ( filterFinal ( pivot , paired ) && dist < maxDistanceFunction ( pivot , paired ) )
60
+ if ( index != - 1 )
63
61
{
64
- indexes [ e ] = index ;
62
+ var paired = elements [ index ] ;
63
+ if ( filterFinal ( pivot , paired ) && dist < maxDistanceFunction ( pivot , paired ) )
64
+ {
65
+ indexes [ e ] = index ;
66
+ }
65
67
}
66
68
}
67
69
} ) ;
@@ -73,8 +75,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
73
75
}
74
76
75
77
/// <summary>
76
- /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
77
- /// https://en.wikipedia.org/wiki/Transitive_closure
78
+ /// Algorithm to group elements using nearest neighbours.
78
79
/// </summary>
79
80
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
80
81
/// <param name="elements">Array of elements to group.</param>
@@ -84,7 +85,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> ele
84
85
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
85
86
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
86
87
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
87
- internal static IEnumerable < HashSet < int > > SimpleTransitiveClosure < T > ( T [ ] elements ,
88
+ internal static IEnumerable < HashSet < int > > ClusterNearestNeighbours < T > ( T [ ] elements ,
88
89
Func < PdfPoint , PdfPoint , double > distMeasure ,
89
90
Func < T , T , double > maxDistanceFunction ,
90
91
Func < T , PdfPoint > pivotPoint , Func < T , PdfPoint > candidatesPoint ,
@@ -102,7 +103,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
102
103
* that if indexes[i] = j then indexes[j] != i.
103
104
*
104
105
* 2. Group indexes
105
- * Group indexes if share neighbours in common - Transitive closure
106
+ * Group indexes that share neighbours in common - Depth-first search
106
107
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
107
108
* (i,j,k) will form a group and (m,n) will form another group.
108
109
*************************************************************************************/
@@ -117,12 +118,15 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
117
118
118
119
if ( filterPivot ( pivot ) )
119
120
{
120
- int index = pivotPoint ( pivot ) . FindIndexNearest ( candidatesPoints , distMeasure , out double dist ) ;
121
- var paired = elements [ index ] ;
121
+ int index = pivot . FindIndexNearest ( elements , candidatesPoint , pivotPoint , distMeasure , out double dist ) ;
122
122
123
- if ( filterFinal ( pivot , paired ) && dist < maxDistanceFunction ( pivot , paired ) )
123
+ if ( index != - 1 )
124
124
{
125
- indexes [ e ] = index ;
125
+ var paired = elements [ index ] ;
126
+ if ( filterFinal ( pivot , paired ) && dist < maxDistanceFunction ( pivot , paired ) )
127
+ {
128
+ indexes [ e ] = index ;
129
+ }
126
130
}
127
131
}
128
132
} ) ;
@@ -134,8 +138,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
134
138
}
135
139
136
140
/// <summary>
137
- /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
138
- /// https://en.wikipedia.org/wiki/Transitive_closure
141
+ /// Algorithm to group elements using nearest neighbours.
139
142
/// </summary>
140
143
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
141
144
/// <param name="elements">Array of elements to group.</param>
@@ -145,7 +148,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
145
148
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
146
149
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
147
150
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
148
- internal static IEnumerable < HashSet < int > > SimpleTransitiveClosure < T > ( T [ ] elements ,
151
+ internal static IEnumerable < HashSet < int > > ClusterNearestNeighbours < T > ( T [ ] elements ,
149
152
Func < PdfLine , PdfLine , double > distMeasure ,
150
153
Func < T , T , double > maxDistanceFunction ,
151
154
Func < T , PdfLine > pivotLine , Func < T , PdfLine > candidatesLine ,
@@ -163,7 +166,7 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
163
166
* that if indexes[i] = j then indexes[j] != i.
164
167
*
165
168
* 2. Group indexes
166
- * Group indexes if share neighbours in common - Transitive closure
169
+ * Group indexes that share neighbours in common - Depth-first search
167
170
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
168
171
* (i,j,k) will form a group and (m,n) will form another group.
169
172
*************************************************************************************/
@@ -178,12 +181,15 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
178
181
179
182
if ( filterPivot ( pivot ) )
180
183
{
181
- int index = pivotLine ( pivot ) . FindIndexNearest ( candidatesLines , distMeasure , out double dist ) ;
182
- var paired = elements [ index ] ;
184
+ int index = pivot . FindIndexNearest ( elements , candidatesLine , pivotLine , distMeasure , out double dist ) ;
183
185
184
- if ( filterFinal ( pivot , paired ) && dist < maxDistanceFunction ( pivot , paired ) )
186
+ if ( index != - 1 )
185
187
{
186
- indexes [ e ] = index ;
188
+ var paired = elements [ index ] ;
189
+ if ( filterFinal ( pivot , paired ) && dist < maxDistanceFunction ( pivot , paired ) )
190
+ {
191
+ indexes [ e ] = index ;
192
+ }
187
193
}
188
194
}
189
195
} ) ;
@@ -195,104 +201,98 @@ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] element
195
201
}
196
202
197
203
/// <summary>
198
- /// Group elements via transitive closure. Each element has only one connected neighbour .
199
- /// https://en.wikipedia.org/wiki/Transitive_closure
204
+ /// Group elements using depth-first search.
205
+ /// <para> https://en.wikipedia.org/wiki/Depth-first_search</para>
200
206
/// </summary>
201
- /// <param name="indexes">Array of paired elements index .</param>
202
- /// <returns></returns>
203
- private static List < HashSet < int > > GroupIndexes ( int [ ] indexes )
207
+ /// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.</param>
208
+ /// <returns>A List of HashSets containing the grouped indexes.</returns>
209
+ internal static List < HashSet < int > > GroupIndexes ( int [ ] edges )
204
210
{
205
- int [ ] [ ] adjacency = new int [ indexes . Length ] [ ] ;
206
- for ( int i = 0 ; i < indexes . Length ; i ++ )
211
+ int [ ] [ ] adjacency = new int [ edges . Length ] [ ] ;
212
+ for ( int i = 0 ; i < edges . Length ; i ++ )
207
213
{
208
214
HashSet < int > matches = new HashSet < int > ( ) ;
209
- for ( int j = 0 ; j < indexes . Length ; ++ j )
215
+ if ( edges [ i ] != - 1 ) matches . Add ( edges [ i ] ) ;
216
+ for ( int j = 0 ; j < edges . Length ; j ++ )
210
217
{
211
- if ( indexes [ j ] == i ) matches . Add ( j ) ;
218
+ if ( edges [ j ] == i ) matches . Add ( j ) ;
212
219
}
213
220
adjacency [ i ] = matches . ToArray ( ) ;
214
221
}
215
222
216
223
List < HashSet < int > > groupedIndexes = new List < HashSet < int > > ( ) ;
217
- bool [ ] isDone = new bool [ indexes . Length ] ;
224
+ bool [ ] isDone = new bool [ edges . Length ] ;
218
225
219
- for ( int p = 0 ; p < indexes . Length ; p ++ )
226
+ for ( int p = 0 ; p < edges . Length ; p ++ )
220
227
{
221
228
if ( isDone [ p ] ) continue ;
229
+ groupedIndexes . Add ( DfsIterative ( p , adjacency , ref isDone ) ) ;
230
+ }
231
+ return groupedIndexes ;
232
+ }
222
233
223
- LinkedList < int [ ] > L = new LinkedList < int [ ] > ( ) ;
224
- HashSet < int > grouped = new HashSet < int > ( ) ;
225
- L . AddLast ( new [ ] { p , indexes [ p ] } ) ;
226
-
227
- while ( L . Any ( ) )
234
+ /// <summary>
235
+ /// Group elements using Depth-first search.
236
+ /// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
237
+ /// </summary>
238
+ /// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...</param>
239
+ /// <returns>A List of HashSets containing the grouped indexes.</returns>
240
+ internal static List < HashSet < int > > GroupIndexes ( int [ ] [ ] edges )
241
+ {
242
+ int [ ] [ ] adjacency = new int [ edges . Length ] [ ] ;
243
+ for ( int i = 0 ; i < edges . Length ; i ++ )
244
+ {
245
+ HashSet < int > matches = new HashSet < int > ( ) ;
246
+ for ( int j = 0 ; j < edges [ i ] . Length ; j ++ )
228
247
{
229
- var current = L . First . Value ;
230
- L . RemoveFirst ( ) ;
231
- var current0 = current [ 0 ] ;
232
- var current1 = current [ 1 ] ;
248
+ if ( edges [ i ] [ j ] != - 1 ) matches . Add ( edges [ i ] [ j ] ) ;
249
+ }
233
250
234
- if ( current0 != - 1 && ! isDone [ current0 ] )
251
+ for ( int j = 0 ; j < edges . Length ; j ++ )
252
+ {
253
+ for ( int k = 0 ; k < edges [ j ] . Length ; k ++ )
235
254
{
236
- var adjs = adjacency [ current0 ] ;
237
- foreach ( var k in adjs )
238
- {
239
- if ( isDone [ k ] ) continue ;
240
- L . AddLast ( new [ ] { k , current0 } ) ;
241
- }
242
-
243
- int current0P = indexes [ current0 ] ;
244
- if ( current0P != - 1 )
245
- {
246
- var adjsP = adjacency [ current0P ] ;
247
- foreach ( var k in adjsP )
248
- {
249
- if ( isDone [ k ] ) continue ;
250
- L . AddLast ( new [ ] { k , current0P } ) ;
251
- isDone [ k ] = true ;
252
- grouped . Add ( k ) ;
253
- }
254
- }
255
- else
256
- {
257
- L . AddLast ( new [ ] { current0 , current0P } ) ;
258
- isDone [ current0 ] = true ;
259
- grouped . Add ( current0 ) ;
260
- }
255
+ if ( edges [ j ] [ k ] == i ) matches . Add ( j ) ;
261
256
}
257
+ }
258
+ adjacency [ i ] = matches . ToArray ( ) ;
259
+ }
262
260
263
- if ( current1 != - 1 && ! isDone [ current1 ] )
264
- {
265
- var adjs = adjacency [ current1 ] ;
266
- foreach ( var k in adjs )
267
- {
268
- if ( isDone [ k ] ) continue ;
269
- L . AddLast ( new [ ] { k , current1 } ) ;
270
- }
261
+ List < HashSet < int > > groupedIndexes = new List < HashSet < int > > ( ) ;
262
+ bool [ ] isDone = new bool [ edges . Length ] ;
271
263
272
- int current1P = indexes [ current1 ] ;
273
- if ( current1P != - 1 )
274
- {
275
- var adjsP = adjacency [ current1P ] ;
276
- foreach ( var k in adjsP )
277
- {
278
- if ( isDone [ k ] ) continue ;
279
- L . AddLast ( new [ ] { k , current1P } ) ;
280
- isDone [ k ] = true ;
281
- grouped . Add ( k ) ;
282
- }
283
- }
284
- else
285
- {
286
- L . AddLast ( new [ ] { current1 , current1P } ) ;
287
- isDone [ current1 ] = true ;
288
- grouped . Add ( current1 ) ;
289
- }
264
+ for ( int p = 0 ; p < edges . Length ; p ++ )
265
+ {
266
+ if ( isDone [ p ] ) continue ;
267
+ groupedIndexes . Add ( DfsIterative ( p , adjacency , ref isDone ) ) ;
268
+ }
269
+ return groupedIndexes ;
270
+ }
271
+
272
+ /// <summary>
273
+ /// Depth-first search
274
+ /// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
275
+ /// </summary>
276
+ private static HashSet < int > DfsIterative ( int c , int [ ] [ ] adj , ref bool [ ] isDone )
277
+ {
278
+ HashSet < int > group = new HashSet < int > ( ) ;
279
+ Stack < int > S = new Stack < int > ( ) ;
280
+ S . Push ( c ) ;
281
+
282
+ while ( S . Any ( ) )
283
+ {
284
+ var v = S . Pop ( ) ;
285
+ if ( ! isDone [ v ] )
286
+ {
287
+ group . Add ( v ) ;
288
+ isDone [ v ] = true ;
289
+ foreach ( var w in adj [ v ] )
290
+ {
291
+ S . Push ( w ) ;
290
292
}
291
293
}
292
- groupedIndexes . Add ( grouped ) ;
293
294
}
294
-
295
- return groupedIndexes ;
295
+ return group ;
296
296
}
297
297
}
298
298
}
0 commit comments