Skip to content

Commit 186a730

Browse files
committed
Bump Release 1.2.0!
Merge branch 'develop' into master
2 parents bfd2514 + d0afbd6 commit 186a730

File tree

17 files changed

+197
-43
lines changed

17 files changed

+197
-43
lines changed

clients/python-client/parsr_client/parsr_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,8 @@ def send_document(
147147
'server_response']
148148
print('>> Job done!')
149149
return {
150-
'file': file,
151-
'config': config,
150+
'file': file_path,
151+
'config': config_path,
152152
'status_code': r.status_code,
153153
'server_response': r.text}
154154

demo/doc-versioning/doc_versioning/gui/document.html

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,16 @@
2121

2222
<!-- differ
2323
-- --------------------------------------------------- -->
24-
24+
25+
<!-- clickjacking protection-->
26+
<style> html {display : none; } </style>
27+
<script>
28+
if ( self === top ) {
29+
document.documentElement.style.display = 'block';
30+
} else {
31+
top.location = self.location;
32+
}
33+
</script>
2534
</head>
2635
<body>
2736
<div style="text-align: center; margin-top: 50px; margin-left: 50px">

demo/doc-versioning/doc_versioning/gui/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
<link rel="stylesheet" href="{{ url_for('static', filename='css/skeleton.css') }}">
2121
</head>
2222
<body>
23-
<a href="http://par.sr" target="_blank" style="outline:none;border:none;" onclick="openLink(event)">
23+
<a href="http://par.sr" target="_blank" rel="noopener noreferrer" style="outline:none;border:none;" onclick="openLink(event)">
2424
<img src="{{ url_for('static', filename='images/logo.png') }}" width="15%" alt="Parsr Powered" border="0" />
2525
</a>
2626
<div style="text-align: center; margin-top: 50px;">

demo/echo-module-py/echo-module.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@
2121

2222
class PostHandler(BaseHTTPRequestHandler):
2323
def do_POST(self):
24+
2425
content_length = int(self.headers['Content-Length'])
2526
post_data = self.rfile.read(content_length)
2627
json_data = json.loads(post_data)
2728

2829
new_json_data = process_data(json_data)
2930

3031
self.send_response(200)
32+
self.send_header("Content-type", "application/json")
3133
self.end_headers()
3234
self.wfile.write(json.dumps(new_json_data).encode('utf8'))
3335

server/src/output/json/JsonExporter.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,16 +243,18 @@ export class JsonExporter extends Exporter {
243243
.map(elem => this.elementToJsonElement(elem));
244244
}
245245

246-
if (element instanceof Word) {
246+
if (element instanceof Word || element instanceof Character) {
247247
if (typeof element.font !== 'undefined') {
248248
const allFonts = Array.from(this.fontCatalog.keys());
249249
const wordFont = allFonts.filter(font => font.isEqual(element.font));
250250
if (wordFont.length === 0) {
251251
this.currentFontId++;
252252
this.fontCatalog.set(element.font, this.currentFontId);
253253
jsonElement.font = this.currentFontId;
254+
jsonElement.fontSize = element.font.size;
254255
} else {
255256
jsonElement.font = this.fontCatalog.get(wordFont[0]);
257+
jsonElement.fontSize = element.font.size;
256258
}
257259
}
258260
} else if (element instanceof Heading) {

server/src/output/json/README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
4848
"metadata": [],
4949
"box": { "l": 72, "t": 74.13, "w": 8.34, "h": 15 },
5050
"content": "2",
51-
"font": 1
51+
"font": 1,
52+
"fontSize": 15
5253
},
5354
{
5455
"id": 57,
@@ -57,7 +58,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
5758
"metadata": [],
5859
"box": { "l": 93.6, "t": 74.13, "w": 83.34, "h": 15 },
5960
"content": "Quantifying",
60-
"font": 1
61+
"font": 1,
62+
"fontSize": 15
6163
},
6264
{
6365
"id": 58,
@@ -66,7 +68,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
6668
"metadata": [],
6769
"box": { "l": 181.1, "t": 74.13, "w": 84.99, "h": 15 },
6870
"content": "Fuel-Saving",
69-
"font": 1
71+
"font": 1,
72+
"fontSize": 15
7073
},
7174
{
7275
"id": 59,
@@ -75,7 +78,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
7578
"metadata": [],
7679
"box": { "l": 270.25, "t": 74.13, "w": 98.27, "h": 15 },
7780
"content": "Opportunities",
78-
"font": 1
81+
"font": 1,
82+
"fontSize": 15
7983
},
8084
]
8185
},

server/src/processing/ImageDetectionModule/ImageDetectionModule.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import { ImageExtractor, PdfJsImageExtractor, PdfminerImageExtractor } from './e
2828

2929
interface Options {
3030
ocrImages?: boolean;
31+
wordsImagesSource?: boolean;
3132
}
3233

3334
type DocumentImages = {
@@ -123,6 +124,9 @@ export class ImageDetectionModule extends Module<Options> {
123124
if (document && document.pages.length > 0) {
124125
const pageIndex = imagesToScan[index].pageNumber - 1;
125126
const resizedWords = this.scaleWordsToFitImageBox(document, imagesToScan[index].image);
127+
if (this.options.wordsImagesSource === true) {
128+
this.setWordPropertieSrcImage(resizedWords, document.inputFile);
129+
}
126130
this.removeImage(doc, imagesToScan[index]);
127131
doc.pages[pageIndex].elements = doc.pages[pageIndex].elements.concat(resizedWords);
128132
doc.pages[pageIndex].pageRotation = document.pages[pageIndex].pageRotation;
@@ -145,6 +149,12 @@ export class ImageDetectionModule extends Module<Options> {
145149
document.pages[imageDetected.pageNumber - 1].elements = noImageElements;
146150
}
147151

152+
private setWordPropertieSrcImage(imgWords: Word[], srcFile: string) {
153+
imgWords.forEach(word => {
154+
word.properties.srcImage = srcFile;
155+
});
156+
}
157+
148158
private scaleWordsToFitImageBox(document: Document, image: Image): Word[] {
149159
const pageBox = document.pages[0].box;
150160
const imageBox = image.box;

server/src/processing/ImageDetectionModule/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ MuPDF: `mutool extract` is used to extract all image files from a PDF *when pdfm
1515
## Parameters
1616

1717
`ocrImages`: Extracts data from detected images using the selected OCR. When `true`, all detected images will be replaced with the data extracted by OCR.
18+
`wordsImagesSource`: When `true`, every word extracted by OCR will have its `srcImage` property filled with the source of the original image.
1819

1920
## How it works
2021

server/src/processing/ImageDetectionModule/defaultConfig.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
"ocrImages": {
66
"value": false,
77
"range": [true, false]
8+
},
9+
"wordsImagesSource": {
10+
"value": true,
11+
"range": [true, false]
812
}
913
}
1014
}

server/src/processing/ImageDetectionModule/extractors/PdfminerImageExtractor.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,14 @@ export class PdfminerImageExtractor extends ImageExtractor {
4646
if (dumpPdfData != null) {
4747
const assets: string[] = readdirSync(doc.assetsFolder);
4848
const pageIds = DumpPdf.extractPageNodeIds(dumpPdfData);
49+
const srcFile: string = doc.assetsFolder;
4950
doc.pages.forEach((page, index) => {
5051
const images = page.getElementsOfType(Image, true);
5152
images.forEach(img => (img.enabled = true));
5253
if (images.length > 0) {
5354
this.linkImages(images, pageIds[index], dumpPdfData, index);
5455
this.linkXObjectWithExtensions(images, assets, index, pageIds[index]);
56+
this.linkSrcImages(images, assets, srcFile);
5557
}
5658
});
5759

@@ -109,6 +111,19 @@ export class PdfminerImageExtractor extends ImageExtractor {
109111
});
110112
}
111113

114+
private linkSrcImages(images: Image[], assets: string[], srcFile: string) {
115+
images
116+
.filter(img => !!img.xObjId)
117+
.forEach(img => {
118+
const asset = assets.filter(filename => {
119+
return filename.startsWith('img-' + img.xObjId.padStart(4, '0'));
120+
});
121+
if (asset.length === 1) {
122+
img.src = srcFile + '/' + asset[0];
123+
}
124+
});
125+
}
126+
112127
private totalDocumentImages(doc: Document): number {
113128
return doc.getElementsOfType(Image, true).length;
114129
}

server/src/processing/LinkDetectionModule/LinkDetectionModule.ts

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ export class LinkDetectionModule extends Module {
3434
public static moduleName = 'link-detection';
3535

3636
public async main(doc: Document): Promise<Document> {
37-
3837
let mdLinks: DumpPdfLinksResponse[] = [];
39-
const fileType: { ext: string; mime: string } = doc.inputFile && filetype(fs.readFileSync(doc.inputFile));
38+
const fileType: { ext: string; mime: string } =
39+
doc.inputFile && filetype(fs.readFileSync(doc.inputFile));
4040
if (!fileType || fileType === null || (fileType && fileType.ext !== 'pdf')) {
4141
logger.warn(
4242
`Warning: Input file ${doc.inputFile} is not a PDF (${utils.prettifyObject(fileType)}); \
@@ -48,11 +48,14 @@ export class LinkDetectionModule extends Module {
4848

4949
const count = mdLinks.reduce((acc, l) => acc + l.links.length, 0);
5050
logger.info('Found ' + count + ' links in document metadata.');
51+
let splittedLinkPart: Word = null;
5152

5253
doc.pages.forEach((page: Page) => {
5354
const pageLinks = mdLinks.find(l => l.pageNumber + 1 === page.pageNumber);
54-
page.getElementsOfType<Word>(Word, true).forEach(word => {
55+
for (let i = 0; i < page.getElementsOfType<Word>(Word, true).length; i++) {
5556
// for a given word, check if the word matches any not used link position.
57+
let word = page.getElementsOfType<Word>(Word, true)[i];
58+
let nextWord = page.getElementsOfType<Word>(Word, true)[i + 1];
5659
(pageLinks || { links: [] }).links.forEach(pageLink => {
5760
const linkBB = new BoundingBox(
5861
pageLink.box.left,
@@ -66,30 +69,45 @@ export class LinkDetectionModule extends Module {
6669
}
6770
});
6871

72+
// Set the targetURL property if it match the link or mail pattern.
73+
// For the link it will first match the beginning of link and then the full link pattern
74+
// to be able to rebuild a link which is separated on two lines.
6975
if (!word.properties.targetURL) {
70-
this.matchTextualLinks(word);
76+
const linkRegexp = /\b((http|https):\/\/?|(www))[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/?))/;
77+
const fullLinkRegexp = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})/;
78+
const mailRegexp = /^(("[\w-\s]+")|([\w-]+(?:\.[\w-]+)*)|("[\w-\s]+")([\w-]+(?:\.[\w-]+)*))(@((?:[\w-]+\.)*\w[\w-]{0,66})\.([a-z]{2,6}(?:\.[a-z]{2})?)$)|(@\[?((25[0-5]\.|2[0-4][0-9]\.|1[0-9]{2}\.|[0-9]{1,2}\.))((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\.){2}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\]?$)/;
79+
if (splittedLinkPart) {
80+
word.properties.targetURL = splittedLinkPart
81+
.toString()
82+
.concat(word.toString())
83+
.match(linkRegexp)[0];
84+
splittedLinkPart = null;
85+
word.properties.splittedLink = true;
86+
} else if (word.toString().match(linkRegexp)) {
87+
if (!word.toString().match(fullLinkRegexp)) {
88+
word.properties.targetURL = word
89+
.toString()
90+
.concat(nextWord.toString())
91+
.match(linkRegexp)[0];
92+
splittedLinkPart = word;
93+
word.properties.splittedLink = true;
94+
} else {
95+
word.properties.targetURL = word.toString().match(linkRegexp)[0];
96+
}
97+
} else if (word.toString().match(mailRegexp)) {
98+
word.properties.targetURL = `mailto:${word.toString().match(mailRegexp)[0]}`;
99+
}
71100
}
72-
});
101+
}
73102
});
74103
return doc;
75104
}
76105

77-
private matchTextualLinks(word: Word) {
78-
const linkRegexp = /\b((http|https):\/\/?|(www))[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/?))/;
79-
// tslint:disable-next-line:max-line-length
80-
const mailRegexp = /^(("[\w-\s]+")|([\w-]+(?:\.[\w-]+)*)|("[\w-\s]+")([\w-]+(?:\.[\w-]+)*))(@((?:[\w-]+\.)*\w[\w-]{0,66})\.([a-z]{2,6}(?:\.[a-z]{2})?)$)|(@\[?((25[0-5]\.|2[0-4][0-9]\.|1[0-9]{2}\.|[0-9]{1,2}\.))((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\.){2}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\]?$)/;
81-
if (word.toString().match(linkRegexp)) {
82-
word.properties.targetURL = word.toString().match(linkRegexp)[0];
83-
} else if (word.toString().match(mailRegexp)) {
84-
word.properties.targetURL = `mailto:${word.toString().match(mailRegexp)[0]}`;
85-
}
86-
}
87-
88106
/*
89107
runs the 'dumppdf.py' script and returns a JSON with all the metadata found in the file
90108
*/
91109
private getFileMetadata(pdfFilePath: string): Promise<DumpPdfLinksResponse[]> {
92-
return new Promise((resolve) => {
110+
return new Promise(resolve => {
93111
CommandExecuter.dumpPdf(pdfFilePath)
94112
.then(utils.sanitizeXML)
95113
.then(extractLinks)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Words to Line New Module
2+
3+
## Purpose
4+
5+
Create lines from a bunch of words inside pages.
6+
7+
## What it does
8+
9+
It creates new line elements that contain arrays of word elements.
10+
11+
## Dependencies
12+
13+
No dependencies.
14+
15+
## How it works
16+
17+
It takes every word of the page one by one and groups the words into lines according to their vertical alignment. Finally, each line can be split further when the space between two words is big enough compared to the average space and the common space between words.
18+
19+
## Accuracy
20+
21+
Almost perfect.
22+
23+
## Limitations
24+
25+
If it does not work in a specific case, you can tune it by increasing or decreasing the `modifyAvgWordsSpace` and `modifyCommonWordsSpace` options, which adjust the average and common space detected between words.

server/src/processing/WordsToLineNewModule/WordsToLineNew.ts

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,27 @@ import logger from '../../utils/Logger';
2626
import * as utils from '../../utils';
2727
import { Module } from '../Module';
2828
import { ListDetectionModule } from '../ListDetectionModule/ListDetectionModule';
29+
import * as defaultConfig from './defaultConfig.json';
30+
31+
interface Options {
32+
modifyAvgWordsSpace?: number;
33+
modifyCommonWordsSpace?: number;
34+
}
35+
36+
const defaultOptions = (defaultConfig as any) as Options;
2937

3038
/**
3139
* Stability: Stable
3240
* Merge text block that are side by side to make lines.
3341
*/
34-
export class WordsToLineNewModule extends Module {
42+
export class WordsToLineNewModule extends Module <Options> {
3543
public static moduleName = 'words-to-line-new';
3644
private wordsCounter = 0;
3745

46+
constructor(options?: Options) {
47+
super(options, defaultOptions);
48+
}
49+
3850
public main(doc: Document): Document {
3951
doc.pages = doc.pages.map(page => {
4052
if (page.getElementsOfType<Line>(Line).length > 0) {
@@ -156,16 +168,16 @@ export class WordsToLineNewModule extends Module {
156168
if (index === 0) return 0;
157169
const prevWordEnd = words[index - 1].left + words[index - 1].width;
158170
const distance = word.left - prevWordEnd;
159-
if (distance < word.height * 0.2) {
171+
if (distance + this.options.modifyAvgWordsSpace < word.height * 0.2) {
160172
// If two words are too near then it will reduce avg space a lot
161173
// making each word to be a line
162174
return this.commonWordsSpace(words, index);
163175
}
164-
return distance;
176+
return distance + this.options.modifyAvgWordsSpace;
165177
})
166178
.reduce((a, b) => a + b, 0) /
167179
(words.length - 1);
168-
return Math.round(space * 2.5);
180+
return Math.round(space * 2.5) + this.options.modifyAvgWordsSpace;
169181
}
170182

171183
private commonWordsSpace(words: Word[], excludeIndex: number): number {
@@ -175,11 +187,11 @@ export class WordsToLineNewModule extends Module {
175187
if (index === 0 || index === excludeIndex) return 0;
176188
const prevWordEnd = words[index - 1].left + words[index - 1].width;
177189
const distance = word.left - prevWordEnd;
178-
return distance;
190+
return distance + this.options.modifyCommonWordsSpace;
179191
})
180192
.reduce((a, b) => a + b, 0) /
181193
(words.length - 2);
182-
return Math.round(space * 1.2);
194+
return Math.round(space * 1.2) + this.options.modifyCommonWordsSpace;
183195
}
184196

185197
private inSameLine(lineWords: Word[], lastWord: Word, word: Word, avgSpace: number): boolean {
Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
11
{
2-
"name": "words-to-line",
3-
"description": "Create lines from a bunch of words, according to the reading order."
2+
"name": "words-to-line-new",
3+
"description": "Create lines from a bunch of words.",
4+
"specs": {
5+
"modifyAvgWordsSpace": {
6+
"value": 0,
7+
"range": { "min": -100, "max": 100}
8+
},
9+
"modifyCommonWordsSpace": {
10+
"value": 0,
11+
"range": { "min": -100, "max": 100}
12+
}
13+
}
414
}

0 commit comments

Comments
 (0)