diff --git a/extract_text_mac.sh b/extract_text_mac.sh
new file mode 100644
index 0000000..29cbd85
--- /dev/null
+++ b/extract_text_mac.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+# Extracts plain text content from every PDF matched by "$1"*.pdf.
+#
+# Usage: extract_text_mac.sh BPATH OPATH TPATH [OCR_LANG]
+#
+# For each PDF, first attempts to extract embedded text with pdftotext. If that
+# yields fewer than MIN_WORDS words, converts the PDF to TIFF with ImageMagick
+# and attempts to perform OCR with Tesseract.
+
+BPATH=$1            # Path to directory containing PDFs.
+OPATH=$2            # Path to output directory.
+TPATH=$3            # Path to directory for intermediate TIFF files.
+# OCR language; see man tesseract > LANGUAGES. Defaults to eng. Not stored in
+# LANG: that is the locale variable, and child processes would inherit it.
+OCR_LANG=${4:-eng}
+MIN_WORDS=5         # Number of words required to accept pdftotext result.
+
+# Paths are built by plain concatenation below, so make sure that directory
+# arguments end in a slash.
+if [[ -d "$BPATH" && "$BPATH" != */ ]]; then
+  BPATH="$BPATH/"
+fi
+if [[ -n "$OPATH" && "$OPATH" != */ ]]; then
+  OPATH="$OPATH/"
+fi
+if [[ -n "$TPATH" && "$TPATH" != */ ]]; then
+  TPATH="$TPATH/"
+fi
+
+# If the output or TIFF directory does not exist, attempt to create it.
+if [[ -n "$OPATH" && ! -d "$OPATH" ]]; then
+  mkdir -p "$OPATH"
+fi
+if [[ -n "$TPATH" && ! -d "$TPATH" ]]; then
+  mkdir -p "$TPATH"
+fi
+
+for FILEPATH in "$BPATH"*.pdf; do
+  # An unmatched glob is left as the literal pattern; skip it rather than
+  # process a file that does not exist.
+  [[ -e "$FILEPATH" ]] || continue
+
+  NAME=$(basename "$FILEPATH")
+  # Path to text file to be created. E.g. ./myfile.pdf.txt
+  OUTFILE="$OPATH$NAME.txt"
+  # Path to the intermediate TIFF used for OCR.
+  TIFFFILE="$TPATH$NAME.tiff"
+  # The text file will be created regardless of whether text is successfully
+  # extracted.
+  touch "$OUTFILE"
+
+  # First attempt to use pdftotext to extract embedded text.
+  printf 'Attempting pdftotext extraction...'
+  pdftotext "$FILEPATH" "$OUTFILE"
+  # Arithmetic expansion strips the padding that BSD wc puts before the count.
+  FILESIZE=$(( $(wc -w < "$OUTFILE") ))
+  echo "extracted $FILESIZE words."
+
+  # If that fails, try Tesseract.
+  if (( FILESIZE < MIN_WORDS )); then
+    printf 'Attempting OCR extraction...'
+    # Use imagemagick to convert the PDF to a high-res multi-page TIFF.
+    convert -density 300 "$FILEPATH" -depth 8 -strip -background white \
+      -alpha off "$TIFFFILE"
+    # Then use Tesseract to perform OCR on the tiff. Tesseract appends ".txt"
+    # to its output base, so pass OUTFILE without that suffix; the language
+    # must be given with -l.
+    tesseract "$TIFFFILE" "${OUTFILE%.txt}" -l "$OCR_LANG"
+    # We don't need the intermediate TIFF file, so discard it.
+    rm -f -- "$TIFFFILE"
+    FILESIZE=$(( $(wc -w < "$OUTFILE") ))
+    echo "extracted $FILESIZE words."
+  fi
+done
diff --git a/shell_ocr_mac.py b/shell_ocr_mac.py
new file mode 100644
index 0000000..55a11dd
--- /dev/null
+++ b/shell_ocr_mac.py
@@ -0,0 +1,12 @@
+import subprocess
+import shlex
+
+# shlex.split turns the command string below into the argument list that
+# subprocess.call is given.
+#
+# subprocess.call(['./extract_text.sh'])
+# Run the line above instead if no parameters are required.
+#
+# NOTE(review): this invokes ./extract_text.sh, while the script added alongside
+# this file is extract_text_mac.sh -- confirm which one is intended.
+subprocess.call(shlex.split('./extract_text.sh ./pdf/ ./txt/ ./tiff/'))