Skip to content

Commit 4811ac7

Browse files
committed
Added automatic insertion of a period at the end of an input sentence in the translator
1 parent 78e753b commit 4811ac7

File tree

2 files changed

+45
-7
lines changed

2 files changed

+45
-7
lines changed

app/src/main/java/nie/translator/rtranslator/voice_translation/neural_networks/translation/Translator.java

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import java.util.ArrayDeque;
4242
import java.util.ArrayList;
4343
import java.util.HashMap;
44+
import java.util.Locale;
4445
import java.util.Map;
4546
import java.util.Optional;
4647

@@ -480,10 +481,11 @@ private void performTextTranslation(final String textToTranslate, final CustomLo
480481
//tokenization
481482
long time = System.currentTimeMillis();
482483
TokenizerResult input = null;
484+
String correctedSubText = correctText(textSplit.get(i), inputLanguage.getLocale());
483485
if (mode == MADLAD_CACHE) {
484-
input = tokenizer.tokenize(inputLanguage.getCode(), outputLanguage.getCode(), textSplit.get(i));
486+
input = tokenizer.tokenize(inputLanguage.getCode(), outputLanguage.getCode(), correctedSubText);
485487
} else { //if mode == NLLB_CACHE
486-
input = tokenizer.tokenize(getNllbLanguageCode(inputLanguage.getCode()), getNllbLanguageCode(outputLanguage.getCode()), textSplit.get(i));
488+
input = tokenizer.tokenize(getNllbLanguageCode(inputLanguage.getCode()), getNllbLanguageCode(outputLanguage.getCode()), correctedSubText);
487489
}
488490
android.util.Log.i("performance", "Tokenization done in: " + (System.currentTimeMillis() - time) + "ms");
489491
//encoder execution
@@ -508,10 +510,11 @@ private void performTextTranslation(final String textToTranslate, final CustomLo
508510
executeCacheDecoderGreedy(input, encoderResult, completeOutput, outputLanguage, new TranslateListener() {
509511
@Override
510512
public void onTranslatedText(String text, long resultID, boolean isFinal, CustomLocale languageOfText) {
513+
//we return the partial results
511514
String outputText;
512515
if(joinedStringOutput[0].equals("")){
513516
outputText = joinedStringOutput[0] + text;
514-
}else {
517+
} else {
515518
outputText = joinedStringOutput[0] + " " + text;
516519
}
517520
if (saveResults) {
@@ -527,6 +530,7 @@ public void onTranslatedText(String text, long resultID, boolean isFinal, Custom
527530

528531
@Override
529532
public void onFailure(int[] reasons, long value) {
533+
//we do not return the partial results and notify an error
530534
if (responseListener != null) {
531535
mainHandler.post(() -> responseListener.onFailure(reasons, value));
532536
} else {
@@ -1114,6 +1118,41 @@ public long getCurrentResultID(){
11141118
return currentResultID;
11151119
}
11161120

1121+
private String correctText(String text, Locale locale){
1122+
String correctedText = text;
1123+
String language = locale.getLanguage();
1124+
//we add an eventual period if missing (or in general a terminator symbol)
1125+
if(!language.equals("th")) {
1126+
correctedText = correctedText.trim(); //we remove eventual white space from both ends of the text
1127+
if(correctedText.length() >= 2) {
1128+
if (!Character.isLetterOrDigit(correctedText.charAt(correctedText.length() - 1))) {
1129+
return correctedText;
1130+
}
1131+
return correctedText + getSentenceTerminator(locale);
1132+
}
1133+
}
1134+
return text;
1135+
}
1136+
1137+
private static String getSentenceTerminator(Locale locale) {
1138+
// Assuming most languages use a period (.)
1139+
// Add custom cases for specific languages as needed
1140+
String language = locale.getLanguage();
1141+
switch (language) {
1142+
case "zh": // Chinese
1143+
case "ja": // Japanese
1144+
case "ko": // Korean
1145+
return "。"; // Ideographic full stop
1146+
case "hi": // Hindi
1147+
return "।";
1148+
case "my": // Burmese
1149+
return "။"; // Burmese full stop
1150+
// Add other cases as needed for more languages
1151+
default:
1152+
return ".";
1153+
}
1154+
}
1155+
11171156

11181157
private void initializeNllbLanguagesCodes(Context context){
11191158
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();

app/src/main/java/nie/translator/rtranslator/voice_translation/neural_networks/voice/Recognizer.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -561,11 +561,10 @@ private String correctText(String text){
561561
String regex = "<\\|[^>]*\\|> "; //with this regex we remove all substrings of the form "<|something|> "
562562
correctedText = correctedText.replaceAll(regex, "");
563563

564+
//we remove eventual white space from both ends of the text
565+
correctedText = correctedText.trim();
566+
564567
if(correctedText.length() >= 2) {
565-
//if the text start with a white space we remove it
566-
if (text.charAt(0) == ' ') {
567-
correctedText = text.substring(1);
568-
}
569568
//if the correctedText start with a lower case letter we make it upper case
570569
char firstChar = correctedText.charAt(0);
571570
if (Character.isLowerCase(firstChar)) {

0 commit comments

Comments
 (0)