Skip to content

Add pythainlp.translate.word_translate #1102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,4 @@ transformers==4.51.3
ufal.chu-liu-edmonds==1.0.3
wtpsplit==1.3.0
wunsen==0.0.3
word2word==1.0.0
3 changes: 3 additions & 0 deletions docs/api/translate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Modules

The `Translate` class is the central component of the module, offering a unified interface for various translation tasks. It acts as a coordinator, directing translation requests to specific language pairs and models.

.. autofunction:: word_translate

.. autofunction:: pythainlp.translate.en_th.download_model_all
:noindex:

Expand Down
4 changes: 2 additions & 2 deletions pythainlp/translate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
Language translation.
"""

__all__ = ["Translate", "ThZhTranslator", "ZhThTranslator"]
__all__ = ["Translate", "ThZhTranslator", "ZhThTranslator", "word_translate"]

from pythainlp.translate.core import Translate
from pythainlp.translate.core import Translate, word_translate
from pythainlp.translate.zh_th import (
ThZhTranslator,
ZhThTranslator,
Expand Down
44 changes: 43 additions & 1 deletion pythainlp/translate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from typing import List, Union


class Translate:
"""
Expand Down Expand Up @@ -83,7 +85,7 @@ def load_model(self):
else:
raise ValueError("Not support language!")

def translate(self, text) -> str:
def translate(self, text: str) -> str:
"""
Translate text

Expand All @@ -94,3 +96,43 @@ def translate(self, text) -> str:
if self.engine == "small100":
return self.model.translate(text, tgt_lang=self.target_lang)
return self.model.translate(text)


def word_translate(
    word: str,
    src: str,
    target: str,
    engine: str = "word2word",
) -> Union[List[str], None]:
    """
    Translate a single word from a source language to a target language.

    :param str word: word to translate
    :param str src: source language code
    :param str target: target language code
    :param str engine: word translation engine
        (the default and only supported engine is word2word)
    :return: list of candidate translations, or None
        (whatever the selected engine returns)
    :rtype: Union[List[str], None]
    :raises NotImplementedError: if ``engine`` is not a supported engine

    :Example:

    Translate word from Thai to English::

        from pythainlp.translate import word_translate
        print(word_translate("แมว","th","en"))
        # output: ['cat', 'cats', 'kitty', 'kitten', 'Cat']

    Translate word from English to Thai::

        from pythainlp.translate import word_translate
        print(word_translate("cat","en","th"))
        # output: ['แมว', 'แมวป่า', 'ข่วน', 'เลี้ยง', 'อาหาร']

    """
    # Reject unknown engines first so the happy path below stays flat.
    if engine != "word2word":
        raise NotImplementedError(
            f"pythainlp.translate.word_translate doesn't support {engine}."
        )

    # Imported lazily: the backend module imports the word2word package at
    # module level, so it is only loaded when this engine is actually used.
    from .word2word_translate import translate

    return translate(word=word, src=src, target=target)
2 changes: 1 addition & 1 deletion pythainlp/translate/th_fr.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

- Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr
"""
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


class ThFrTranslator:
Expand All @@ -36,6 +35,7 @@ def __init__(
use_gpu: bool = False,
pretrained: str = "Helsinki-NLP/opus-mt-th-fr",
) -> None:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained)
self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained)
if use_gpu:
Expand Down
87 changes: 87 additions & 0 deletions pythainlp/translate/word2word_translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from typing import List, Union
from word2word import Word2word

# Language codes accepted for both ``src`` and ``target`` by ``translate``.
# Kept alphabetically sorted so additions and removals are easy to review.
# NOTE(review): presumably mirrors the language list published by the
# word2word project -- confirm against upstream when upgrading.
support_list = {
    "af",
    "ar",
    "bg",
    "bn",
    "bs",
    "ca",
    "cs",
    "da",
    "de",
    "el",
    "en",
    "eo",
    "es",
    "et",
    "eu",
    "fa",
    "fi",
    "fr",
    "gl",
    "he",
    "hi",
    "hr",
    "hu",
    "id",
    "is",
    "it",
    "ja",
    "ka",
    "kk",
    "ko",
    "lt",
    "lv",
    "mk",
    "ml",
    "ms",
    "nl",
    "no",
    "pl",
    "pt",
    "pt_br",
    "ro",
    "ru",
    "si",
    "sk",
    "sl",
    "sq",
    "sr",
    "sv",
    "ta",
    "te",
    "th",
    "tl",
    "tr",
    "uk",
    "ur",
    "vi",
    "ze_en",
    "ze_zh",
    "zh_cn",
    "zh_tw",
}


def translate(word: str, src: str, target: str) -> Union[List[str], None]:
    """
    Translate a single word using a word2word bilingual lexicon.

    :param str word: word to translate
    :param str src: source language code
    :param str target: target language code
    :return: list of candidate translations; ``[word]`` when ``src`` and
        ``target`` are the same language; None when the lexicon lookup
        raises KeyError for ``word``
    :rtype: Union[List[str], None]
    :raises NotImplementedError: if ``src`` or ``target`` is not listed
        in ``support_list``
    """
    if src not in support_list or target not in support_list:
        raise NotImplementedError(
            f"word2word doesn't support {src}-{target}."
        )
    if src == target:
        # Same language on both sides: nothing to look up.
        return [word]

    # Errors raised while constructing the engine are deliberately left
    # to propagate; only the lookup itself is guarded below.
    _engine = Word2word(src, target)
    try:
        return _engine(word)
    except KeyError:
        # NOTE(review): word2word is understood to raise KeyError for a
        # query word missing from the lexicon -- map that to the None this
        # function documents instead of leaking the exception to callers.
        return None
2 changes: 1 addition & 1 deletion pythainlp/translate/zh_th.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
- GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth
- Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822
"""
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


class ThZhTranslator:
Expand All @@ -30,6 +29,7 @@ def __init__(
use_gpu: bool = False,
pretrained: str = "Lalita/marianmt-th-zh_cn",
) -> None:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained)
self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained)
if use_gpu:
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"sentencepiece>=0.1.91",
"torch>=1.0.0",
"transformers>=4.6.0",
"word2word>=1.0.0"
],
"transformers_ud": [
"transformers>=4.22.1",
Expand Down Expand Up @@ -145,6 +146,7 @@
"ufal.chu-liu-edmonds>=1.0.2",
"wtpsplit>=1.0.1",
"wunsen>=0.0.3",
"word2word>=1.0.0",
],
}

Expand Down
22 changes: 21 additions & 1 deletion tests/extra/testx_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import unittest

from pythainlp.translate import Translate
from pythainlp.translate import Translate, word_translate
from pythainlp.translate.en_th import (
EnThTranslator,
ThEnTranslator,
Expand Down Expand Up @@ -73,3 +73,23 @@ def test_translate(self):
# )
with self.assertRaises(ValueError):
self.th_cat_translator = Translate('th', 'cat', engine="fkfj")

def test_word_translate(self):
# Exercises word_translate with the default engine, with the explicitly
# named "word2word" engine, and on its two NotImplementedError paths.
# NOTE(review): the word_translate docstring shows a non-empty list for
# ("cat", "en", "th"), while the assertions below expect None -- confirm
# which of the two is the intended contract.
self.assertIsNone(word_translate("cat", src="en", target="th"))
# A Thai word is looked up here with src="en".
self.assertIsNone(word_translate("แมว", src="en", target="th"))
self.assertIsNone(
word_translate("cat", src="en", target="th", engine="word2word")
)
self.assertIsNone(
word_translate("แมว", src="en", target="th", engine="word2word")
)
# Identical source and target languages return the word unchanged.
self.assertEqual(
word_translate("แมว", src="th", target="th", engine="word2word"),
["แมว"]
)

# An unknown engine name is rejected.
with self.assertRaises(NotImplementedError):
word_translate("cat", src="en", target="th", engine="cat")

# A target language code outside the supported list is rejected.
with self.assertRaises(NotImplementedError):
word_translate("แมว", src="th", target="xxx", engine="word2word")
Loading