Bump Release 1.2.0!

dafelix42 · dafelix42 · commit 186a7300d775 · 2020-10-02T17:00:44.000+02:00
Merge branch 'develop' into master
diff --git a/clients/python-client/parsr_client/parsr_client.py b/clients/python-client/parsr_client/parsr_client.py
@@ -147,8 +147,8 @@ def send_document(
                         'server_response']
                     print('>> Job done!')
             return {
-                'file': file,
-                'config': config,
+                'file': file_path,
+                'config': config_path,
                 'status_code': r.status_code,
                 'server_response': r.text}
 
diff --git a/demo/doc-versioning/doc_versioning/gui/document.html b/demo/doc-versioning/doc_versioning/gui/document.html
@@ -21,7 +21,16 @@
 
         <!-- differ
         -- --------------------------------------------------- -->
-
+        
+        <!-- clickjacking protection-->
+        <style> html {display : none; } </style>
+        <script>
+            if ( self === top ) {
+                document.documentElement.style.display = 'block';
+            } else {
+                top.location = self.location;
+            }
+        </script>
     </head>
     <body>
         <div style="text-align: center; margin-top: 50px; margin-left: 50px">
diff --git a/demo/doc-versioning/doc_versioning/gui/index.html b/demo/doc-versioning/doc_versioning/gui/index.html
@@ -20,7 +20,7 @@
         <link rel="stylesheet" href="{{ url_for('static', filename='css/skeleton.css') }}">
     </head>
     <body>
-        <a href="http://par.sr" target="_blank" style="outline:none;border:none;" onclick="openLink(event)">
+        <a href="http://par.sr" target="_blank" rel="noopener noreferrer" style="outline:none;border:none;" onclick="openLink(event)">
             <img src="{{ url_for('static', filename='images/logo.png') }}" width="15%" alt="Parsr Powered" border="0" />
         </a>
         <div style="text-align: center; margin-top: 50px;">
diff --git a/demo/echo-module-py/echo-module.py b/demo/echo-module-py/echo-module.py
@@ -21,13 +21,15 @@
 
 class PostHandler(BaseHTTPRequestHandler):
 	def do_POST(self):
+
 		content_length = int(self.headers['Content-Length'])
 		post_data = self.rfile.read(content_length)
 		json_data = json.loads(post_data)
 
 		new_json_data = process_data(json_data)
 
 		self.send_response(200)
+		self.send_header("Content-type", "application/json")
 		self.end_headers()
 		self.wfile.write(json.dumps(new_json_data).encode('utf8'))
 
diff --git a/server/src/output/json/JsonExporter.ts b/server/src/output/json/JsonExporter.ts
@@ -243,16 +243,18 @@ export class JsonExporter extends Exporter {
           .map(elem => this.elementToJsonElement(elem));
       }
 
-      if (element instanceof Word) {
+      if (element instanceof Word || element instanceof Character) {
         if (typeof element.font !== 'undefined') {
           const allFonts = Array.from(this.fontCatalog.keys());
           const wordFont = allFonts.filter(font => font.isEqual(element.font));
           if (wordFont.length === 0) {
             this.currentFontId++;
             this.fontCatalog.set(element.font, this.currentFontId);
             jsonElement.font = this.currentFontId;
+            jsonElement.fontSize = element.font.size;
           } else {
             jsonElement.font = this.fontCatalog.get(wordFont[0]);
+            jsonElement.fontSize = element.font.size;
           }
         }
       } else if (element instanceof Heading) {
diff --git a/server/src/output/json/README.md b/server/src/output/json/README.md
@@ -48,7 +48,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
                   "metadata": [],
                   "box": { "l": 72, "t": 74.13, "w": 8.34, "h": 15 },
                   "content": "2",
-                  "font": 1
+                  "font": 1,
+                  "fontSize": 15
                 },
                 {
                   "id": 57,
@@ -57,7 +58,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
                   "metadata": [],
                   "box": { "l": 93.6, "t": 74.13, "w": 83.34, "h": 15 },
                   "content": "Quantifying",
-                  "font": 1
+                  "font": 1,
+                  "fontSize": 15
                 },
                 {
                   "id": 58,
@@ -66,7 +68,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
                   "metadata": [],
                   "box": { "l": 181.1, "t": 74.13, "w": 84.99, "h": 15 },
                   "content": "Fuel-Saving",
-                  "font": 1
+                  "font": 1,
+                  "fontSize": 15
                 },
                 {
                   "id": 59,
@@ -75,7 +78,8 @@ Example from [foo.pdf](../../../../samples/foo.pdf) sample file:
                   "metadata": [],
                   "box": { "l": 270.25, "t": 74.13, "w": 98.27, "h": 15 },
                   "content": "Opportunities",
-                  "font": 1
+                  "font": 1,
+                  "fontSize": 15
                 }, 
               ]
             },
diff --git a/server/src/processing/ImageDetectionModule/ImageDetectionModule.ts b/server/src/processing/ImageDetectionModule/ImageDetectionModule.ts
@@ -28,6 +28,7 @@ import { ImageExtractor, PdfJsImageExtractor, PdfminerImageExtractor } from './e
 
 interface Options {
   ocrImages?: boolean;
+  wordsImagesSource?: boolean;
 }
 
 type DocumentImages = {
@@ -123,6 +124,9 @@ export class ImageDetectionModule extends Module<Options> {
           if (document && document.pages.length > 0) {
             const pageIndex = imagesToScan[index].pageNumber - 1;
             const resizedWords = this.scaleWordsToFitImageBox(document, imagesToScan[index].image);
+            if (this.options.wordsImagesSource === true) {
+              this.setWordPropertieSrcImage(resizedWords, document.inputFile);
+            }
             this.removeImage(doc, imagesToScan[index]);
             doc.pages[pageIndex].elements = doc.pages[pageIndex].elements.concat(resizedWords);
             doc.pages[pageIndex].pageRotation = document.pages[pageIndex].pageRotation;
@@ -145,6 +149,12 @@ export class ImageDetectionModule extends Module<Options> {
     document.pages[imageDetected.pageNumber - 1].elements = noImageElements;
   }
 
+  private setWordPropertieSrcImage(imgWords: Word[], srcFile: string) {
+    imgWords.forEach(word => {
+      word.properties.srcImage = srcFile;
+    });
+  }
+
   private scaleWordsToFitImageBox(document: Document, image: Image): Word[] {
     const pageBox = document.pages[0].box;
     const imageBox = image.box;
diff --git a/server/src/processing/ImageDetectionModule/README.md b/server/src/processing/ImageDetectionModule/README.md
@@ -15,6 +15,7 @@ MuPDF: `mutool extract` is used to extract all image files from a PDF *when pdfm
 ## Parameters
 
 `ocrImages`: Allows to extract data from detected images using selected OCR. When `true`, all detected images will be replaced with data extracted by OCR.
+`wordsImagesSource`: When `true`, every word extracted by OCR will have its `srcImage` property set to the source file of the original image.
 
 ## How it works
 
diff --git a/server/src/processing/ImageDetectionModule/defaultConfig.json b/server/src/processing/ImageDetectionModule/defaultConfig.json
@@ -5,6 +5,10 @@
     "ocrImages": {
       "value": false,
       "range": [true, false]
+    },
+    "wordsImagesSource": {
+      "value": true,
+      "range": [true, false]
     }
   }
 }
diff --git a/server/src/processing/ImageDetectionModule/extractors/PdfminerImageExtractor.ts b/server/src/processing/ImageDetectionModule/extractors/PdfminerImageExtractor.ts
@@ -46,12 +46,14 @@ export class PdfminerImageExtractor extends ImageExtractor {
     if (dumpPdfData != null) {
       const assets: string[] = readdirSync(doc.assetsFolder);
       const pageIds = DumpPdf.extractPageNodeIds(dumpPdfData);
+      const srcFile: string = doc.assetsFolder;
       doc.pages.forEach((page, index) => {
         const images = page.getElementsOfType(Image, true);
         images.forEach(img => (img.enabled = true));
         if (images.length > 0) {
           this.linkImages(images, pageIds[index], dumpPdfData, index);
           this.linkXObjectWithExtensions(images, assets, index, pageIds[index]);
+          this.linkSrcImages(images, assets, srcFile);
         }
       });
 
@@ -109,6 +111,19 @@ export class PdfminerImageExtractor extends ImageExtractor {
       });
   }
 
+  private linkSrcImages(images: Image[], assets: string[], srcFile: string) {
+    images
+      .filter(img => !!img.xObjId)
+      .forEach(img => {
+        const asset = assets.filter(filename => {
+          return filename.startsWith('img-' + img.xObjId.padStart(4, '0'));
+        });
+        if (asset.length === 1) {
+          img.src = srcFile + '/' + asset[0];
+        }
+      });
+  }
+
   private totalDocumentImages(doc: Document): number {
     return doc.getElementsOfType(Image, true).length;
   }
diff --git a/server/src/processing/LinkDetectionModule/LinkDetectionModule.ts b/server/src/processing/LinkDetectionModule/LinkDetectionModule.ts
@@ -34,9 +34,9 @@ export class LinkDetectionModule extends Module {
   public static moduleName = 'link-detection';
 
   public async main(doc: Document): Promise<Document> {
-
     let mdLinks: DumpPdfLinksResponse[] = [];
-    const fileType: { ext: string; mime: string } = doc.inputFile && filetype(fs.readFileSync(doc.inputFile));
+    const fileType: { ext: string; mime: string } =
+      doc.inputFile && filetype(fs.readFileSync(doc.inputFile));
     if (!fileType || fileType === null || (fileType && fileType.ext !== 'pdf')) {
       logger.warn(
         `Warning: Input file ${doc.inputFile} is not a PDF (${utils.prettifyObject(fileType)}); \
@@ -48,11 +48,14 @@ export class LinkDetectionModule extends Module {
 
     const count = mdLinks.reduce((acc, l) => acc + l.links.length, 0);
     logger.info('Found ' + count + ' links in document metadata.');
+    let splittedLinkPart: Word = null;
 
     doc.pages.forEach((page: Page) => {
       const pageLinks = mdLinks.find(l => l.pageNumber + 1 === page.pageNumber);
-      page.getElementsOfType<Word>(Word, true).forEach(word => {
+      for (let i = 0; i < page.getElementsOfType<Word>(Word, true).length; i++) {
         // for a given word, check if the word matches any not used link position.
+        let word = page.getElementsOfType<Word>(Word, true)[i];
+        let nextWord = page.getElementsOfType<Word>(Word, true)[i + 1];
         (pageLinks || { links: [] }).links.forEach(pageLink => {
           const linkBB = new BoundingBox(
             pageLink.box.left,
@@ -66,30 +69,45 @@ export class LinkDetectionModule extends Module {
           }
         });
 
+        // Set the targetURL property if it match the link or mail pattern.
+        // For the link it will first match the beginning of link and then the full link pattern
+        // to be able to rebuild a link which is separated on two lines.
         if (!word.properties.targetURL) {
-          this.matchTextualLinks(word);
+          const linkRegexp = /\b((http|https):\/\/?|(www))[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/?))/;
+          const fullLinkRegexp = /(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})/;
+          const mailRegexp = /^(("[\w-\s]+")|([\w-]+(?:\.[\w-]+)*)|("[\w-\s]+")([\w-]+(?:\.[\w-]+)*))(@((?:[\w-]+\.)*\w[\w-]{0,66})\.([a-z]{2,6}(?:\.[a-z]{2})?)$)|(@\[?((25[0-5]\.|2[0-4][0-9]\.|1[0-9]{2}\.|[0-9]{1,2}\.))((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\.){2}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\]?$)/;
+          if (splittedLinkPart) {
+            word.properties.targetURL = splittedLinkPart
+                .toString()
+                .concat(word.toString())
+                .match(linkRegexp)[0];
+            splittedLinkPart = null;
+            word.properties.splittedLink = true;
+          } else if (word.toString().match(linkRegexp)) {
+            if (!word.toString().match(fullLinkRegexp)) {
+              word.properties.targetURL = word
+                .toString()
+                .concat(nextWord.toString())
+                .match(linkRegexp)[0];
+              splittedLinkPart = word;
+              word.properties.splittedLink = true;
+            } else {
+              word.properties.targetURL = word.toString().match(linkRegexp)[0];
+            }
+          } else if (word.toString().match(mailRegexp)) {
+            word.properties.targetURL = `mailto:${word.toString().match(mailRegexp)[0]}`;
+          }
         }
-      });
+      }
     });
     return doc;
   }
 
-  private matchTextualLinks(word: Word) {
-    const linkRegexp = /\b((http|https):\/\/?|(www))[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/?))/;
-    // tslint:disable-next-line:max-line-length
-    const mailRegexp = /^(("[\w-\s]+")|([\w-]+(?:\.[\w-]+)*)|("[\w-\s]+")([\w-]+(?:\.[\w-]+)*))(@((?:[\w-]+\.)*\w[\w-]{0,66})\.([a-z]{2,6}(?:\.[a-z]{2})?)$)|(@\[?((25[0-5]\.|2[0-4][0-9]\.|1[0-9]{2}\.|[0-9]{1,2}\.))((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\.){2}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\]?$)/;
-    if (word.toString().match(linkRegexp)) {
-      word.properties.targetURL = word.toString().match(linkRegexp)[0];
-    } else if (word.toString().match(mailRegexp)) {
-      word.properties.targetURL = `mailto:${word.toString().match(mailRegexp)[0]}`;
-    }
-  }
-
   /*
     runs the 'dumppdf.py' script and returns a JSON with all the metadata found in the file
   */
   private getFileMetadata(pdfFilePath: string): Promise<DumpPdfLinksResponse[]> {
-    return new Promise((resolve) => {
+    return new Promise(resolve => {
       CommandExecuter.dumpPdf(pdfFilePath)
         .then(utils.sanitizeXML)
         .then(extractLinks)
diff --git a/server/src/processing/WordsToLineNewModule/README.md b/server/src/processing/WordsToLineNewModule/README.md
@@ -0,0 +1,25 @@
+# Words to Line New Module
+
+## Purpose
+
+Create lines from a bunch of words inside pages.
+
+## What it does
+
+It creates new line elements that contain arrays of word elements.
+
+## Dependencies
+
+No dependencies.
+
+## How it works
+
+It takes every word of the page one by one and splits them into lines depending on vertical alignment; finally, each line can be split further when the space between two words is big enough according to the average space and common space between words.
+
+## Accuracy
+
+Almost perfect.
+
+## Limitations
+
+If it does not work for a specific case, you can tune it by increasing or decreasing the `modifyAvgWordsSpace` and `modifyCommonWordsSpace` values, which adjust the average and common space detected between words.
diff --git a/server/src/processing/WordsToLineNewModule/WordsToLineNew.ts b/server/src/processing/WordsToLineNewModule/WordsToLineNew.ts
@@ -26,15 +26,27 @@ import logger from '../../utils/Logger';
 import * as utils from '../../utils';
 import { Module } from '../Module';
 import { ListDetectionModule } from '../ListDetectionModule/ListDetectionModule';
+import * as defaultConfig from './defaultConfig.json';
+
+interface Options {
+  modifyAvgWordsSpace?: number;
+  modifyCommonWordsSpace?: number;
+}
+
+const defaultOptions = (defaultConfig as any) as Options;
 
 /**
  * Stability: Stable
  * Merge text block that are side by side to make lines.
  */
-export class WordsToLineNewModule extends Module {
+export class WordsToLineNewModule extends Module <Options> {
   public static moduleName = 'words-to-line-new';
   private wordsCounter = 0;
 
+  constructor(options?: Options) {
+    super(options, defaultOptions);
+  }
+
   public main(doc: Document): Document {
     doc.pages = doc.pages.map(page => {
       if (page.getElementsOfType<Line>(Line).length > 0) {
@@ -156,16 +168,16 @@ export class WordsToLineNewModule extends Module {
           if (index === 0) return 0;
           const prevWordEnd = words[index - 1].left + words[index - 1].width;
           const distance = word.left - prevWordEnd;
-          if (distance < word.height * 0.2) {
+          if (distance + this.options.modifyAvgWordsSpace < word.height * 0.2) {
             // If two words are too near then it will reduce avg space a lot
             // making each word to be a line
             return this.commonWordsSpace(words, index);
           }
-          return distance;
+          return distance + this.options.modifyAvgWordsSpace;
         })
         .reduce((a, b) => a + b, 0) /
       (words.length - 1);
-    return Math.round(space * 2.5);
+    return Math.round(space * 2.5) + this.options.modifyAvgWordsSpace;
   }
 
   private commonWordsSpace(words: Word[], excludeIndex: number): number {
@@ -175,11 +187,11 @@ export class WordsToLineNewModule extends Module {
           if (index === 0 || index === excludeIndex) return 0;
           const prevWordEnd = words[index - 1].left + words[index - 1].width;
           const distance = word.left - prevWordEnd;
-          return distance;
+          return distance + this.options.modifyCommonWordsSpace;
         })
         .reduce((a, b) => a + b, 0) /
       (words.length - 2);
-    return Math.round(space * 1.2);
+    return Math.round(space * 1.2) + this.options.modifyCommonWordsSpace;
   }
 
   private inSameLine(lineWords: Word[], lastWord: Word, word: Word, avgSpace: number): boolean {
diff --git a/server/src/processing/WordsToLineNewModule/defaultConfig.json b/server/src/processing/WordsToLineNewModule/defaultConfig.json
@@ -1,4 +1,14 @@
 {
-  "name": "words-to-line",
-  "description": "Create lines from a bunch of words, according to the reading order."
+  "name": "words-to-line-new",
+  "description": "Create lines from a bunch of words.",
+  "specs": {
+		"modifyAvgWordsSpace": {
+			"value": 0,
+			"range": { "min": -100, "max": 100}
+		},
+		"modifyCommonWordsSpace": {
+			"value": 0,
+			"range": { "min": -100, "max": 100}
+    }
+	}
 }
diff --git a/server/src/types/DocumentRepresentation/JsonExport.ts b/server/src/types/DocumentRepresentation/JsonExport.ts
diff --git a/server/src/types/DocumentRepresentation/Paragraph.ts b/server/src/types/DocumentRepresentation/Paragraph.ts
diff --git a/server/src/types/Metadata/Properties.ts b/server/src/types/Metadata/Properties.ts

Original file line number	Diff line number	Diff line change
`@@ -243,16 +243,18 @@ export class JsonExporter extends Exporter {`
`243`	`243`	`.map(elem => this.elementToJsonElement(elem));`
`244`	`244`	`}`
`245`	`245`
`246`		`- if (element instanceof Word) {`
	`246`	`+ if (element instanceof Word \|\| element instanceof Character) {`
`247`	`247`	`if (typeof element.font !== 'undefined') {`
`248`	`248`	`const allFonts = Array.from(this.fontCatalog.keys());`
`249`	`249`	`const wordFont = allFonts.filter(font => font.isEqual(element.font));`
`250`	`250`	`if (wordFont.length === 0) {`
`251`	`251`	`this.currentFontId++;`
`252`	`252`	`this.fontCatalog.set(element.font, this.currentFontId);`
`253`	`253`	`jsonElement.font = this.currentFontId;`
	`254`	`+ jsonElement.fontSize = element.font.size;`
`254`	`255`	`} else {`
`255`	`256`	`jsonElement.font = this.fontCatalog.get(wordFont[0]);`
	`257`	`+ jsonElement.fontSize = element.font.size;`
`256`	`258`	`}`
`257`	`259`	`}`
`258`	`260`	`} else if (element instanceof Heading) {`
Original file line number	Diff line number	Diff line change
`@@ -5,6 +5,10 @@`
`5`	`5`	`"ocrImages": {`
`6`	`6`	`"value": false,`
`7`	`7`	`"range": [true, false]`
	`8`	`+ },`
	`9`	`+ "wordsImagesSource": {`
	`10`	`+ "value": true,`
	`11`	`+ "range": [true, false]`
`8`	`12`	`}`
`9`	`13`	`}`
`10`	`14`	`}`