-
Notifications
You must be signed in to change notification settings - Fork 256
/
Copy pathDecorationTextBlockClassifier.cs
311 lines (276 loc) · 18.2 KB
/
DecorationTextBlockClassifier.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
using Content;
using Geometry;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using Util;
/// <summary>
/// Algorithm that retrieve blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
/// </summary>
public static class DecorationTextBlockClassifier
{
private static readonly Regex NumbersPattern = new Regex(@"(\d+)|(\b([MDCLXVI]+)\b)", RegexOptions.IgnoreCase);
private const string replacementChar = "@";
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
/// <param name="wordExtractor"></param>
/// <param name="pageSegmenter"></param>
/// <param name="similarityThreshold">Minimum similarity score to decide wether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
return Get(pages, wordExtractor, pageSegmenter, Distances.MinimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism);
}
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
/// <param name="wordExtractor"></param>
/// <param name="pageSegmenter"></param>
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide wether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func<string, string, double> minimumEditDistanceNormalised,
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
if (pages.Count < 2)
{
throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
}
ConcurrentDictionary<int, IReadOnlyList<TextBlock>> pagesBlocks = new ConcurrentDictionary<int, IReadOnlyList<TextBlock>>();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Parallel.For(0, pages.Count, parallelOptions, p =>
{
var words = pages[p].GetWords(wordExtractor);
var blocks = pageSegmenter.GetBlocks(words);
if (!pagesBlocks.TryAdd(p, blocks))
{
throw new ArgumentException("Cannot add element with index " + p + " in ConcurrentDictionary.");
}
});
return Get(pagesBlocks.OrderBy(x => x.Key).Select(x => x.Value).ToList(),
minimumEditDistanceNormalised,
similarityThreshold,
n,
maxDegreeOfParallelism);
}
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every pages in the document. All of them are needed for the algorithm to work.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide wether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
return Get(pagesTextBlocks, Distances.MinimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism);
}
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every pages in the document. All of them are needed for the algorithm to work.</param>
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide wether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
Func<string, string, double> minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
if (pagesTextBlocks.Count < 2)
{
throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pagesTextBlocks));
}
ConcurrentDictionary<int, List<TextBlock>> pageDecorations = new ConcurrentDictionary<int, List<TextBlock>>();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Parallel.For(0, pagesTextBlocks.Count, parallelOptions, p =>
{
if (!pageDecorations.TryAdd(p, new List<TextBlock>()))
{
throw new ArgumentException("Cannot add element with index " + p + " in ConcurrentDictionary.");
}
int pMinus1 = GetPreviousPageNumber(p, pagesTextBlocks.Count);
int pPlus1 = GetNextPageNumber(p, pagesTextBlocks.Count);
var previousPage = pagesTextBlocks[pMinus1];
var currentPage = pagesTextBlocks[p];
var nextPage = pagesTextBlocks[pPlus1];
int nCurrent = Math.Min(n, currentPage.Count);
// First, for each page, we sort all blocks on the page in four different orders:
// - from top to bottom (based on the minimum y coordinate),
// - from bottom to top (maximum y coordinate),
// - from left to right (minimum x coordinate),
// - from right to left (maximumx coordinate).
// From top to bottom (based on the minimum y coordinate)
previousPage = previousPage.OrderByDescending(b => b.BoundingBox.Bottom).ThenBy(b => b.BoundingBox.Left).ToList();
currentPage = currentPage.OrderByDescending(b => b.BoundingBox.Bottom).ThenBy(b => b.BoundingBox.Left).ToList();
nextPage = nextPage.OrderByDescending(b => b.BoundingBox.Bottom).ThenBy(b => b.BoundingBox.Left).ToList();
for (int i = 0; i < nCurrent; i++)
{
var current = currentPage[i];
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
}
}
// From bottom to top (maximum y coordinate)
previousPage = previousPage.OrderBy(b => b.BoundingBox.Top).ThenBy(b => b.BoundingBox.Left).ToList();
currentPage = currentPage.OrderBy(b => b.BoundingBox.Top).ThenBy(b => b.BoundingBox.Left).ToList();
nextPage = nextPage.OrderBy(b => b.BoundingBox.Top).ThenBy(b => b.BoundingBox.Left).ToList();
for (int i = 0; i < nCurrent; i++)
{
var current = currentPage[i];
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
}
}
// From left to right (minimum x coordinate)
previousPage = previousPage.OrderBy(b => b.BoundingBox.Left).ThenBy(b => b.BoundingBox.Top).ToList();
currentPage = currentPage.OrderBy(b => b.BoundingBox.Left).ThenBy(b => b.BoundingBox.Top).ToList();
nextPage = nextPage.OrderBy(b => b.BoundingBox.Left).ThenBy(b => b.BoundingBox.Top).ToList();
for (int i = 0; i < nCurrent; i++)
{
var current = currentPage[i];
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
}
}
// From right to left (maximumx coordinate)
previousPage = previousPage.OrderByDescending(b => b.BoundingBox.Right).ThenBy(b => b.BoundingBox.Top).ToList();
currentPage = currentPage.OrderByDescending(b => b.BoundingBox.Right).ThenBy(b => b.BoundingBox.Top).ToList();
nextPage = nextPage.OrderByDescending(b => b.BoundingBox.Right).ThenBy(b => b.BoundingBox.Top).ToList();
for (int i = 0; i < nCurrent; i++)
{
var current = currentPage[i];
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
}
}
});
return pageDecorations.OrderBy(x => x.Key).Select(x => x.Value).ToList();
}
/// <summary>
/// [The content similarity] is calculated from the normalized edit
/// distance between the two content strings, where digits are replaced with “@” chars.
/// A content similarity of 1 is reached when both strings are exactly equal.
/// </summary>
private static double ContentSimilarity(TextBlock b1, TextBlock b2, Func<string, string, double> minimumEditDistanceNormalised)
{
double similarity = 1.0 - minimumEditDistanceNormalised(
NumbersPattern.Replace(b1.Text, replacementChar),
NumbersPattern.Replace(b2.Text, replacementChar));
return similarity;
}
/// <summary>
/// The geometric similarity is the area of the intersection between the two boundingbox rectangles divided by the larger of the two boundingboxes.
/// </summary>
private static double GeomSimilarity(TextBlock b1, TextBlock b2)
{
double similarity = 0;
var intersect = b1.BoundingBox.Intersect(b2.BoundingBox);
if (intersect.HasValue)
{
similarity = intersect.Value.Area / Math.Max(b1.BoundingBox.Area, b2.BoundingBox.Area);
}
return similarity;
}
/// <summary>
/// This similarity score is a value in the range [0,1] and given
/// by the product between the content and the geometric similarity.
/// </summary>
private static double Similarity(TextBlock b1, TextBlock b2, Func<string, string, double> minimumEditDistanceNormalised)
{
return ContentSimilarity(b1, b2, minimumEditDistanceNormalised) * GeomSimilarity(b1, b2);
}
private static double ScoreI(TextBlock current, TextBlock previous, TextBlock next, Func<string, string, double> minimumEditDistanceNormalised)
{
return 0.5 * (Similarity(current, next, minimumEditDistanceNormalised) + Similarity(current, previous, minimumEditDistanceNormalised));
}
private static double Score(TextBlock current, IReadOnlyList<TextBlock> previous, IReadOnlyList<TextBlock> next,
Func<string, string, double> minimumEditDistanceNormalised, double threshold, int n)
{
n = Math.Min(n, Math.Min(previous.Count, next.Count));
double score = 0;
for (int i = 0; i < n; i++)
{
var s = ScoreI(current, previous[i], next[i], minimumEditDistanceNormalised);
if (s > score) score = s;
if (score >= threshold) return score;
}
return score;
}
/// <summary>
/// If the document has more than three pages, we compare blocks on the next or previous page with an even or odd number,
/// depending on whether the current page number is even or odd, to account for cases with a two-sided layout.
/// </summary>
/// <param name="currentPage">Current page number.</param>
/// <param name="pagesCount">Total number of pages in the document.</param>
private static int GetPreviousPageNumber(int currentPage, int pagesCount)
{
int pMinus1 = currentPage - 1 >= 0 ? currentPage - 1 : pagesCount - 1;
if (pagesCount > 3)
{
pMinus1 = pMinus1 - 1 >= 0 ? pMinus1 - 1 : pagesCount - 1;
}
return pMinus1;
}
/// <summary>
/// If the document has more than three pages, we compare blocks on the next or previous page with an even or odd number,
/// depending on whether the current page number is even or odd, to account for cases with a two-sided layout.
/// </summary>
/// <param name="currentPage">Current page number.</param>
/// <param name="pagesCount">Total number of pages in the document.</param>
private static int GetNextPageNumber(int currentPage, int pagesCount)
{
int pPlus1 = currentPage + 1 < pagesCount ? currentPage + 1 : 0;
if (pagesCount > 3)
{
pPlus1 = pPlus1 + 1 < pagesCount ? pPlus1 + 1 : 0;
}
return pPlus1;
}
}
}