diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index 20d0ba479..85854fc5d 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -98,42 +98,102 @@ of vocabs. * - punctuation - 32 - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ + * - albanian + - 104 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿çëÇË + * - afrikaans + - 114 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿èëïîôûêÈËÏÎÔÛÊ + * - azerbaijani + - 111 + - 0123456789abcdefghijklmnopqrstuvxyzABCDEFGHIJKLMNOPQRSTUVXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿çəğöşüÇƏĞÖŞÜ₼ + * - basque + - 104 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ñçÑÇ + * - bosanski + - 102 + - 0123456789abcdefghijklmnoprstuvzABCDEFGHIJKLMNOPRSTUVZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿čćđšžČĆĐŠŽ + * - catalan + - 120 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àèéíïòóúüçÀÈÉÍÏÒÓÚÜÇ + * - croatian + - 110 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ČčĆćĐ𩹮ž * - czech - 130 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ * - danish - 106 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°£€¥¢฿æøåÆØÅ + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿æøåÆØÅ * - dutch - 114 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ * - english - 100 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ + * - estonian + - 112 + - 
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿šžõäöüŠŽÕÄÖÜ + * - esperanto + - 105 + - 0123456789abcdefghijklmnoprstuvzABCDEFGHIJKLMNOPRSTUVZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ĉĝĥĵŝŭĈĜĤĴŜŬ₷ + * - french + - 126 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ + * - legacy_french + - 123 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ * - finnish - 104 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöÄÖ + * - frisian + - 107 + - 0123456789abcdefghijklmnoprstuvwyzABCDEFGHIJKLMNOPRSTUVWYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿âêôûúÂÊÔÛÚƒ + * - galician + - 98 + - 0123456789abcdefghilmnopqrstuvxyzABCDEFGHILMNOPQRSTUVXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ñÑçÇ * - german - 108 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ - * - croatian + * - hausa + - 101 + - 0123456789abcdefghijklmnorstuwyzABCDEFGHIJKLMNORSTUWYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ɓɗƙƴƁƊƘƳ₦ + * - hungarian + - 114 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóöúüÁÉÍÓÖÚÜ + * - icelandic + - 114 + - 0123456789abdefghijklmnoprstuvxyzABDEFGHIJKLMNOPRSTUVXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ðáéíóúýþæöÐÁÉÍÓÚÝÞÆÖ + * - indonesian + - 100 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ + * - irish - 110 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ČčĆćĐ𩹮ž - * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - legacy_french - - 123 - - 
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - * - hebrew - - 235 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיךכלםמןנסעףפץצקרשתְֱֲֳִֵֶַָׇֹֺֻֽ־ֿ׀ׁׂ׃ׅׄ׆׳״֑֖֛֢֣֤֥֦֧֪֚֭֮֒֓֔֕֗֘֙֜֝֞֟֠֡֨֩֫֬֯ׯװױײיִﬞײַﬠﬡﬢﬣﬤﬥﬦﬧﬨ﬩שׁשׂשּׁשּׂאַאָאּבּגּדּהּוּזּטּיּךּכּלּמּנּסּףּפּצּקּרּשּתּוֹבֿכֿפֿﭏ₪ + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúÁÉÍÓÚ * - italian - 120 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ - * - latin + * - latvian + - 116 + - 0123456789abcdefghijklmnoprstuvyzABCDEFGHIJKLMNOPRSTUVYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ + * - lithuanian + - 112 + - 0123456789abcdefghijklmnoprstuvyzABCDEFGHIJKLMNOPRSTUVYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ąčęėįšųūžĄČĘĖĮŠŲŪŽ + * - luxembourgish + - 110 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüéëÄÖÜÉË + * - malagasy - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ + - 0123456789abdefghijklmnoprstvyzABDEFGHIJKLMNOPRSTVYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ôñÔÑ + * - malay + - 100 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ + * - maltese + - 104 + - 0123456789abdefghijklmnopqrstuvwxzABDEFGHIJKLMNOPQRSTUVWXZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ċġħżĊĠĦŻ + * - montenegrin + - 103 + - 0123456789abcdefghijklmnoprstuvzABCDEFGHIJKLMNOPRSTUVZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿čćšžźČĆŠŚŽŹ * - norwegian - 106 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿æøåÆØÅ @@ -141,17 +201,53 @@ of vocabs. 
- 118 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ąćęłńóśźżĄĆĘŁŃÓŚŹŻ * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ + - 128 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ + * - romanian + - 110 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ăâîșțĂÂÎȘȚ + * - scottish_gaelic + - 94 + - 0123456789abcdefghilmnoprstuABCDEFGHILMNOPRSTU!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àèìòùÀÈÌÒÙ + * - serbian_latin + - 110 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿čćđžšČĆĐŽŠ + * - slovak + - 134 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ôäčďľňšťžáéíĺóŕúýÔÄČĎĽŇŠŤŽÁÉÍĹÓŔÚÝ + * - slovene + - 102 + - 0123456789abcdefghijklmnoprstuvzABCDEFGHIJKLMNOPRSTUVZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿čćđšžČĆĐŠŽ + * - somali + - 94 + - 0123456789abcdefghijklmnoqrstuwxyABCDEFGHIJKLMNOQRSTUWXY!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ * - spanish - 116 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ + * - swahili + - 96 + - 0123456789abcdefghijklmnoprstuvwyzABCDEFGHIJKLMNOPRSTUVWYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ * - swedish - 106 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿åäöÅÄÖ + * - tagalog + - 95 + - 0123456789abdefghijklmnoprstuvyzABDEFGHIJKLMNOPRSTUVYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ñÑ₱ + * - turkish + - 112 + - 0123456789abcdefghijklmnoprstuvyzABCDEFGHIJKLMNOPRSTUVYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿çğıöşüâîûÇĞİÖŞÜÂÎÛ + * - uzbek_latin + - 110 + - 
0123456789abcdefghijklmnopqrstuvxyzABCDEFGHIJKLMNOPQRSTUVXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿çğɉñöşÇĞɈÑÖŞ * - vietnamese - 234 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ + * - welsh + - 102 + - 0123456789abcdefghijlmnoprstuwyABCDEFGHIJLMNOPRSTUWY!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿âêîôŵŷÂÊÎÔŴŶ + * - zulu + - 100 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ * - ancient_greek - 48 - αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index 7b48b5d21..332722e17 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -3,6 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. +import re import string __all__ = ["VOCABS"] @@ -50,49 +51,107 @@ VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"] VOCABS["english"] = VOCABS["latin"] + "°" + VOCABS["currency"] +VOCABS["albanian"] = VOCABS["english"] + "çëÇË" + +VOCABS["afrikaans"] = VOCABS["english"] + "èëïîôûêÈËÏÎÔÛÊ" + +VOCABS["azerbaijani"] = re.sub(r"[Ww]", "", VOCABS["english"]) + "çəğöşüÇƏĞÖŞÜ" + "₼" + +VOCABS["basque"] = VOCABS["english"] + "ñçÑÇ" + +VOCABS["bosanski"] = re.sub(r"[QqWwXxYy]", "", VOCABS["english"]) + "čćđšžČĆĐŠŽ" + +VOCABS["catalan"] = VOCABS["english"] + "àèéíïòóúüçÀÈÉÍÏÒÓÚÜÇ" + +VOCABS["croatian"] = VOCABS["english"] + "ČčĆćĐ𩹮ž" + VOCABS["czech"] = VOCABS["english"] + "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ" VOCABS["danish"] = VOCABS["english"] + "æøåÆØÅ" VOCABS["dutch"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" +VOCABS["estonian"] = VOCABS["english"] + "šžõäöüŠŽÕÄÖÜ" + +VOCABS["esperanto"] = re.sub(r"[QqWwXxYy]", "", VOCABS["english"]) + "ĉĝĥĵŝŭĈĜĤĴŜŬ" + "₷" + VOCABS["french"] = 
VOCABS["english"] + "àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ" VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ" + VOCABS["currency"] VOCABS["finnish"] = VOCABS["english"] + "äöÄÖ" +VOCABS["frisian"] = re.sub(r"[QqXx]", "", VOCABS["english"]) + "âêôûúÂÊÔÛÚ" + "ƒ" + +VOCABS["galician"] = re.sub(r"[JjKkWw]", "", VOCABS["english"]) + "ñÑçÇ" + VOCABS["german"] = VOCABS["english"] + "äöüßÄÖÜẞ" -VOCABS["croatian"] = VOCABS["english"] + "ČčĆćĐ𩹮ž" +VOCABS["hausa"] = re.sub(r"[PpQqVvXx]", "", VOCABS["english"]) + "ɓɗƙƴƁƊƘƳ" + "₦" -VOCABS["hebrew"] = ( - VOCABS["english"] - + VOCABS["hebrew_letters"] - + VOCABS["hebrew_vowels"] - + VOCABS["hebrew_punctuation"] - + VOCABS["hebrew_cantillations"] - + VOCABS["hebrew_specials"] - + "₪" -) +VOCABS["hungarian"] = VOCABS["english"] + "áéíóöúüÁÉÍÓÖÚÜ" + +VOCABS["icelandic"] = re.sub(r"[CcQqWw]", "", VOCABS["english"]) + "ðáéíóúýþæöÐÁÉÍÓÚÝÞÆÖ" + +VOCABS["indonesian"] = VOCABS["english"] + +VOCABS["irish"] = VOCABS["english"] + "áéíóúÁÉÍÓÚ" VOCABS["italian"] = VOCABS["english"] + "àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ" +VOCABS["latvian"] = re.sub(r"[QqWwXx]", "", VOCABS["english"]) + "āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ" + +VOCABS["lithuanian"] = re.sub(r"[QqWwXx]", "", VOCABS["english"]) + "ąčęėįšųūžĄČĘĖĮŠŲŪŽ" + +VOCABS["luxembourgish"] = VOCABS["english"] + "äöüéëÄÖÜÉË" + +VOCABS["malagasy"] = re.sub(r"[CcQqUuWwXx]", "", VOCABS["english"]) + "ôñÔÑ" + +VOCABS["malay"] = VOCABS["english"] + +VOCABS["maltese"] = re.sub(r"[CcYy]", "", VOCABS["english"]) + "ċġħżĊĠĦŻ" + +VOCABS["montenegrin"] = re.sub(r"[QqWwXxYy]", "", VOCABS["english"]) + "čćšžźČĆŠŚŽŹ" + VOCABS["norwegian"] = VOCABS["english"] + "æøåÆØÅ" VOCABS["polish"] = VOCABS["english"] + "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ" VOCABS["portuguese"] = VOCABS["english"] + "áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ" +VOCABS["romanian"] = VOCABS["english"] + "ăâîșțĂÂÎȘȚ" + +VOCABS["scottish_gaelic"] = re.sub(r"[JjKkQqVvWwXxYyZz]", "", VOCABS["english"]) + "àèìòùÀÈÌÒÙ" + +VOCABS["serbian_latin"] = VOCABS["english"] + "čćđžšČĆĐŽŠ" + 
+VOCABS["slovak"] = VOCABS["english"] + "ôäčďľňšťžáéíĺóŕúýÔÄČĎĽŇŠŤŽÁÉÍĹÓŔÚÝ" + +VOCABS["slovene"] = re.sub(r"[QqWwXxYy]", "", VOCABS["english"]) + "čćđšžČĆĐŠŽ" + +VOCABS["somali"] = re.sub(r"[PpVvZz]", "", VOCABS["english"]) + VOCABS["spanish"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" + "¡¿" +VOCABS["swahili"] = re.sub(r"[QqXx]", "", VOCABS["english"]) + VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ" +VOCABS["tagalog"] = re.sub(r"[CcQqWwXx]", "", VOCABS["english"]) + "ñÑ" + "₱" + +VOCABS["turkish"] = re.sub(r"[QqWwXx]", "", VOCABS["english"]) + "çğıöşüâîûÇĞİÖŞÜÂÎÛ" + +VOCABS["uzbek_latin"] = re.sub(r"[Ww]", "", VOCABS["english"]) + "çğɉñöşÇĞɈÑÖŞ" + VOCABS["vietnamese"] = ( VOCABS["english"] + "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵ" + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ" ) +VOCABS["welsh"] = re.sub(r"[KkQqVvXxZz]", "", VOCABS["english"]) + "âêîôŵŷÂÊÎÔŴŶ" + +VOCABS["zulu"] = VOCABS["english"] + # Non-latin alphabets. # Arabic VOCABS["arabic"] = ( @@ -117,6 +176,17 @@ + VOCABS["punctuation"] ) +# Hebrew +VOCABS["hebrew"] = ( + VOCABS["english"] + + VOCABS["hebrew_letters"] + + VOCABS["hebrew_vowels"] + + VOCABS["hebrew_punctuation"] + + VOCABS["hebrew_cantillations"] + + VOCABS["hebrew_specials"] + + "₪" +) + # Hindi VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]