Skip to content

Commit aad134d

Browse files
authored
Merge pull request #1102 from PyThaiNLP/add-word2word
Add pythainlp.translate.word_translate
2 parents cf7e780 + 23d49ad commit aad134d

File tree

9 files changed

+161
-6
lines changed

9 files changed

+161
-6
lines changed

docker_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,4 @@ transformers==4.51.3
3939
ufal.chu-liu-edmonds==1.0.3
4040
wtpsplit==1.3.0
4141
wunsen==0.0.3
42+
word2word==1.0.0

docs/api/translate.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ Modules
1212

1313
The `Translate` class is the central component of the module, offering a unified interface for various translation tasks. It acts as a coordinator, directing translation requests to specific language pairs and models.
1414

15+
.. autofunction:: word_translate
17+
1518
.. autofunction:: pythainlp.translate.en_th.download_model_all
1619
:noindex:
1720

pythainlp/translate/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
Language translation.
77
"""
88

9-
__all__ = ["Translate", "ThZhTranslator", "ZhThTranslator"]
9+
__all__ = ["Translate", "ThZhTranslator", "ZhThTranslator", "word_translate"]
1010

11-
from pythainlp.translate.core import Translate
11+
from pythainlp.translate.core import Translate, word_translate
1212
from pythainlp.translate.zh_th import (
1313
ThZhTranslator,
1414
ZhThTranslator,

pythainlp/translate/core.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
33
# SPDX-FileType: SOURCE
44
# SPDX-License-Identifier: Apache-2.0
5+
from typing import List, Union
6+
57

68
class Translate:
79
"""
@@ -83,7 +85,7 @@ def load_model(self):
8385
else:
8486
raise ValueError("Not support language!")
8587

86-
def translate(self, text) -> str:
88+
def translate(self, text: str) -> str:
8789
"""
8890
Translate text
8991
@@ -94,3 +96,43 @@ def translate(self, text) -> str:
9496
if self.engine == "small100":
9597
return self.model.translate(text, tgt_lang=self.target_lang)
9698
return self.model.translate(text)
99+
100+
101+
def word_translate(
102+
word: str,
103+
src: str,
104+
target: str,
105+
engine: str="word2word"
106+
) -> Union[List[str], None]:
107+
"""
108+
Translate word from source language to target language.
109+
110+
:param str word: text
111+
:param str src: src language
112+
:param str target: target language
113+
:param str engine: Word translate engine (the default engine is word2word)
114+
:return: return list word translate or None
115+
:rtype: Union[List[str], None]
116+
117+
:Example:
118+
119+
Translate word from Thai to English::
120+
121+
from pythainlp.translate import word_translate
122+
print(word_translate("แมว","th","en"))
123+
# output: ['cat', 'cats', 'kitty', 'kitten', 'Cat']
124+
125+
Translate word from English to Thai::
126+
127+
from pythainlp.translate import word_translate
128+
print(word_translate("cat","en","th"))
129+
# output: ['แมว', 'แมวป่า', 'ข่วน', 'เลี้ยง', 'อาหาร']
130+
131+
"""
132+
if engine=="word2word":
133+
from .word2word_translate import translate
134+
return translate(word=word, src=src, target=target)
135+
else:
136+
raise NotImplementedError(
137+
f"pythainlp.translate.word_translate isn't support {engine}."
138+
)

pythainlp/translate/th_fr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
1414
- Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr
1515
"""
16-
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
1716

1817

1918
class ThFrTranslator:
@@ -36,6 +35,7 @@ def __init__(
3635
use_gpu: bool = False,
3736
pretrained: str = "Helsinki-NLP/opus-mt-th-fr",
3837
) -> None:
38+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3939
self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained)
4040
self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained)
4141
if use_gpu:
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
from typing import List, Union
6+
from word2word import Word2word
7+
8+
# Language codes accepted as ``src`` / ``target`` by ``translate`` below.
# Written as a set literal (no intermediate list) and kept in alphabetical
# order so additions are easy to review; membership tests are O(1).
support_list = {
    "af", "ar", "bg", "bn", "bs", "ca", "cs", "da", "de", "el",
    "en", "eo", "es", "et", "eu", "fa", "fi", "fr", "gl", "he",
    "hi", "hr", "hu", "id", "is", "it", "ja", "ka", "kk", "ko",
    "lt", "lv", "mk", "ml", "ms", "nl", "no", "pl", "pt", "pt_br",
    "ro", "ru", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te",
    "th", "tl", "tr", "uk", "ur", "vi", "ze_en", "ze_zh", "zh_cn", "zh_tw",
}
68+
69+
70+
def translate(word: str, src: str, target: str) -> Union[List[str], None]:
    """
    Translate a single word using a word2word bilingual lexicon.

    :param str word: word to translate
    :param str src: source language code; must be in ``support_list``
    :param str target: target language code; must be in ``support_list``
    :return: list of candidate translations, ``[word]`` when ``src`` and
        ``target`` are the same language, or None when the lexicon lookup
        raises ``KeyError``
    :rtype: Union[List[str], None]
    :raises NotImplementedError: if ``src`` or ``target`` is not in
        ``support_list``
    """
    if src not in support_list or target not in support_list:
        raise NotImplementedError(
            f"word2word doesn't support {src}-{target}."
        )
    if src == target:
        # Same language on both sides: nothing to look up.
        return [word]

    _engine = Word2word(src, target)
    try:
        return _engine(word)
    except KeyError:
        # NOTE(review): word2word is understood to raise KeyError when the
        # query word is not in the lexicon -- confirm against the library.
        # Map it to the documented "or None" result instead of leaking it.
        return None

pythainlp/translate/zh_th.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
- GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth
1111
- Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822
1212
"""
13-
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
1413

1514

1615
class ThZhTranslator:
@@ -30,6 +29,7 @@ def __init__(
3029
use_gpu: bool = False,
3130
pretrained: str = "Lalita/marianmt-th-zh_cn",
3231
) -> None:
32+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3333
self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained)
3434
self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained)
3535
if use_gpu:

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
"sentencepiece>=0.1.91",
8686
"torch>=1.0.0",
8787
"transformers>=4.6.0",
88+
"word2word>=1.0.0"
8889
],
8990
"transformers_ud": [
9091
"transformers>=4.22.1",
@@ -145,6 +146,7 @@
145146
"ufal.chu-liu-edmonds>=1.0.2",
146147
"wtpsplit>=1.0.1",
147148
"wunsen>=0.0.3",
149+
"word2word>=1.0.0",
148150
],
149151
}
150152

tests/extra/testx_translate.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import unittest
77

8-
from pythainlp.translate import Translate
8+
from pythainlp.translate import Translate, word_translate
99
from pythainlp.translate.en_th import (
1010
EnThTranslator,
1111
ThEnTranslator,
@@ -73,3 +73,23 @@ def test_translate(self):
7373
# )
7474
with self.assertRaises(ValueError):
7575
self.th_cat_translator = Translate('th', 'cat', engine="fkfj")
76+
77+
def test_word_translate(self):
    # An English word looked up in the en->th lexicon is documented to
    # yield a list of candidates, so the result must not be None.
    self.assertIsNotNone(word_translate("cat", src="en", target="th"))
    self.assertIsNotNone(
        word_translate("cat", src="en", target="th", engine="word2word")
    )

    # A Thai word queried as if it were English is not expected to be
    # found in the en->th lexicon.
    self.assertIsNone(word_translate("แมว", src="en", target="th"))
    self.assertIsNone(
        word_translate("แมว", src="en", target="th", engine="word2word")
    )

    # Same source and target language returns the word unchanged.
    self.assertEqual(
        word_translate("แมว", src="th", target="th", engine="word2word"),
        ["แมว"]
    )

    # Unknown engine name.
    with self.assertRaises(NotImplementedError):
        word_translate("cat", src="en", target="th", engine="cat")

    # Unsupported target language code.
    with self.assertRaises(NotImplementedError):
        word_translate("แมว", src="th", target="xxx", engine="word2word")

0 commit comments

Comments
 (0)