From 61963fd1dfb0a21f50c9cc55eb1e9add2eb8118d Mon Sep 17 00:00:00 2001
From: computermacgyver
Date: Thu, 3 Oct 2024 00:48:59 +0100
Subject: [PATCH 1/2] no newlines in text for FastText

---
Review notes (this section is ignored by `git am`):
- re.sub() takes `count` as its 4th positional argument, so passing
  re.MULTILINE there limited the substitution to the first 8 matches and
  left any later newline characters in the text. The flag is removed
  rather than moved to `flags=`: MULTILINE only changes how ^ and $ match,
  and the pattern uses neither.
- NOTE(review): line breaks and indentation in this series were rebuilt
  from a whitespace-collapsed copy. Confirm the context lines against the
  tree before applying.

 app/main/lib/langid.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/app/main/lib/langid.py b/app/main/lib/langid.py
index 4a060582..b125f042 100644
--- a/app/main/lib/langid.py
+++ b/app/main/lib/langid.py
@@ -1,6 +1,7 @@
 # 3rd party langid providers
 from flask import current_app as app
 import json
+import re

 from google.cloud import translate_v2 as translate
 # import requests # Used for MicrosoftLangidProvider
@@ -87,7 +88,7 @@ class FastTextLangidProvider:
   fasttext_model = fasttext.load_model("extra/fasttext_language_id/lid.176.ftz")
   @staticmethod
   def langid(text):
-    prediction = list(FastTextLangidProvider.fasttext_model.predict(text))
+    prediction = list(FastTextLangidProvider.fasttext_model.predict(re.sub(r"[\n\r]", " ", text)))
     # prediction is a list of tuples, e.g., [('__label__en',), array([0.22517213])]

     language = prediction[0][0].split("__")[-1]

From ba0ef070473907d376c5b58c648049395b799b4a Mon Sep 17 00:00:00 2001
From: Devin Gaffney
Date: Wed, 2 Oct 2024 17:00:45 -0700
Subject: [PATCH 2/2] quick test with newlines

---
 app/test/test_langid.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app/test/test_langid.py b/app/test/test_langid.py
index 21664515..395f2031 100644
--- a/app/test/test_langid.py
+++ b/app/test/test_langid.py
@@ -14,6 +14,7 @@ class TestLangidBlueprint(BaseTestCase):

     TESTS = [
         { 'fasttext': 'hi', 'cld3': 'hi', 'microsoft': 'hi', 'google': 'hi', 'text': 'नमस्ते मेरा नाम करीम है' },
+        { 'fasttext': 'hi', 'cld3': 'hi', 'microsoft': 'hi', 'google': 'hi', 'text': 'नमस्ते मेरा नाम\n\n करीम है' },
         { 'fasttext': None, 'cld3': 'hi-Latn', 'microsoft': 'en', 'google': ['hi', 'hi-Latn'], 'text': 'namaste mera naam Karim hai' },
         { 'fasttext': 'mr', 'cld3': 'mr', 'microsoft': 'hi', 'google': 'mr', 'text': 'हॅलो माझे नाव करीम आहे' },
         { 'fasttext': 'bn', 'cld3': 'bn', 'microsoft': 'bn', 'google': 'bn', 'text': 'হ্যালো আমার নাম কারিম' },