Skip to content

Commit 5331d9e

Browse files
committed
normalize via Unicode NFC
1 parent 8cf48dd commit 5331d9e

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
all: all-tess all-hunspell
66

77
install:
8-
sudo apt-get install sqlite3 wget
8+
sudo apt-get install sqlite3 wget icu-devtools
99
sudo add-apt-repository -u -y ppa:alex-p/tesseract-ocr
1010
sudo apt-get update
1111
sudo apt-get install tesseract-ocr

sql2wordlist.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
#!/bin/sh
2-
sqlite3 -list -readonly -noheader ${1:-lexdb.sqlite} <<EOF | grep -v -e '^[[:punct:]]' -e '^[[:digit:][:punct:]]*$' > ${2:-lexdb_${3:-100}.words}
2+
sqlite3 -list -readonly -noheader ${1:-lexdb.sqlite} <<EOF | uconv -f utf-8 -t utf-8 -x "::nfc;" | grep -v -e '^[[:punct:]]' -e '^[[:digit:][:punct:]]*$' > ${2:-lexdb_${3:-100}.words}
33
select trim(u) from csv where f > ${3:-100} and p != "\$(" and p != "\$," and p != "\$." and p != "FM.xy" and p != "CARD" and p != "XY";
44
EOF

0 commit comments

Comments
 (0)