
Commit 01575c9

Parse titles correctly from PDFs when importing (#622)
We used to try to extract the title of any and every document as if it were HTML. However, we import lots of things that aren't HTML (e.g. PDF, CSV, XLS), and trying to parse those as HTML can be slow, pointless, or memory-intensive (the importer recently crashed by running out of memory trying to parse a PDF this way). This change takes more care to only extract titles from HTML and from PDFs, and adds proper PDF parsing so we can actually extract PDF titles instead of failing and wasting huge amounts of memory every time.
1 parent: 534e114
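
For context, the idea behind the change is that the importer picks a title extractor based on the content's media type instead of force-parsing everything as HTML. A minimal sketch of what that dispatch might look like, assuming a hypothetical detect_title helper and media-type strings; only extract_title and extract_pdf_title come from web_monitoring/utils.py in this commit:

from web_monitoring.utils import extract_title, extract_pdf_title

def detect_title(media_type, content_bytes):
    # Illustrative helper, not part of this commit: choose an extractor
    # by media type; skip formats with no cheap, reliable title.
    if media_type in ('text/html', 'application/xhtml+xml'):
        return extract_title(content_bytes)
    elif media_type == 'application/pdf':
        return extract_pdf_title(content_bytes)
    else:
        # e.g. CSV, XLS: don't try to parse these as HTML at all.
        return None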

2 files changed: 8 additions, 0 deletions
requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ git+https://github.com/anastasia/htmldiffer@develop
 git+https://github.com/danielballan/htmltreediff@customize
 html5-parser ~=0.4.9 --no-binary lxml
 lxml ~=4.5.2
+PyPDF2 ~=1.26.0
 sentry-sdk ~=0.16.3
 requests ~=2.24.0
 toolz ~=0.10.0

web_monitoring/utils.py

Lines changed: 7 additions & 0 deletions

@@ -4,6 +4,7 @@
 import logging
 import lxml.html
 import os
+from PyPDF2 import PdfFileReader
 import queue
 import re
 import requests
@@ -35,6 +36,12 @@ def extract_title(content_bytes, encoding='utf-8'):
     return WHITESPACE_PATTERN.sub(' ', title.text.strip())
 
 
+def extract_pdf_title(content_bytes):
+    pdf = PdfFileReader(io.BytesIO(content_bytes))
+    info = pdf.getDocumentInfo()
+    return info.title
+
+
 def hash_content(content_bytes):
     "Create a version_hash for the content of a snapshot."
     return hashlib.sha256(content_bytes).hexdigest()
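
A quick usage sketch for the new helper, assuming a hypothetical sample file name; with PyPDF2 ~=1.26, getDocumentInfo().title returns the /Title entry from the document's info dictionary, or None when that entry is missing:

from web_monitoring.utils import extract_pdf_title

with open('example.pdf', 'rb') as f:  # hypothetical sample file
    title = extract_pdf_title(f.read())

# title is the PDF's /Title metadata string, or None if the PDF
# carries no title in its info dictionary.
print(title)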
