From 4915a7798442cc37030b80b4f6c47dcd63666f6a Mon Sep 17 00:00:00 2001
From: Wannaphong
Date: Sun, 11 Aug 2024 20:26:01 +0700
Subject: [PATCH 1/2] Add thaig2p_v2

---
 docs/api/transliterate.rst            |  3 ++-
 pythainlp/transliterate/core.py       |  4 ++++
 pythainlp/transliterate/thaig2p_v2.py | 33 +++++++++++++++++++++++++++
 tests/test_transliterate.py           |  2 ++
 4 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 pythainlp/transliterate/thaig2p_v2.py

diff --git a/docs/api/transliterate.rst b/docs/api/transliterate.rst
index 8f832fbad..8b71bff1b 100644
--- a/docs/api/transliterate.rst
+++ b/docs/api/transliterate.rst
@@ -55,7 +55,8 @@ This section includes multiple transliteration engines designed to suit various
 
 - **icu**: Utilizes the ICU transliteration system for phonetic conversion.
 - **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text.
-- **thaig2p**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
+- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
+- **thaig2p_v2**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0
 - **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration.
 - **iso_11940**: Focuses on the ISO 11940 transliteration standard.
 
diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py
index b1d3d502e..a70193936 100644
--- a/pythainlp/transliterate/core.py
+++ b/pythainlp/transliterate/core.py
@@ -109,6 +109,8 @@ def transliterate(
             `TLTK `_.,
         * *iso_11940* - Thai text into Latin characters with ISO 11940.
         * *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)
+        * *thaig2p_v2* - Thai Grapheme-to-Phoneme,
+            output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0
 
     :Example:
     ::
@@ -159,6 +161,8 @@ def transliterate(
         from pythainlp.transliterate.tltk import tltk_ipa as transliterate
     elif engine == "iso_11940":
         from pythainlp.transliterate.iso_11940 import transliterate
+    elif engine == "thaig2p_v2":
+        from pythainlp.transliterate.thaig2p_v2 import transliterate
     else:  # use default engine: "thaig2p"
         from pythainlp.transliterate.thaig2p import transliterate
 
diff --git a/pythainlp/transliterate/thaig2p_v2.py b/pythainlp/transliterate/thaig2p_v2.py
new file mode 100644
index 000000000..1bb5d4e2b
--- /dev/null
+++ b/pythainlp/transliterate/thaig2p_v2.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai Grapheme-to-Phoneme (Thai G2P)
+
+huggingface: https://huggingface.co/pythainlp/thaig2p-v2.0
+"""
+
+# Use a pipeline as a high-level helper
+from transformers import pipeline
+
+
+class ThaiG2P:
+    """
+    Latin transliteration of Thai words, using International Phonetic Alphabet
+    """
+
+    def __init__(self, device: str="cpu"):
+        self.pipe = pipeline("text2text-generation", model="pythainlp/thaig2p-v2.0", device=device)
+
+    def g2p(self, text: str) -> str:
+        return self.pipe(text)[0]["generated_text"]
+
+
+_THAI_G2P = None
+
+
+def transliterate(text: str, device="cpu") -> str:
+    global _THAI_G2P
+    if _THAI_G2P == None:
+        _THAI_G2P = ThaiG2P(device=device)
+    return _THAI_G2P.g2p(text)
diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py
index 130d18668..478716503 100644
--- a/tests/test_transliterate.py
+++ b/tests/test_transliterate.py
@@ -216,6 +216,8 @@ def test_transliterate(self):
         self.assertEqual(transliterate("คน", engine="ipa"), "kʰon")
         self.assertIsNotNone(transliterate("คน", engine="thaig2p"))
         self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
+        self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2"))
+        self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2"))
         self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
         self.assertIsNotNone(transliterate("แมว", engine="tltk_g2p"))
         self.assertIsNotNone(transliterate("คน", engine="tltk_ipa"))

From 6aaac9c70cada65272fc8b0a545b95cd1e707dac Mon Sep 17 00:00:00 2001
From: Wannaphong
Date: Tue, 13 Aug 2024 22:31:28 +0700
Subject: [PATCH 2/2] Fixed pep8

---
 pythainlp/transliterate/thaig2p_v2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/transliterate/thaig2p_v2.py b/pythainlp/transliterate/thaig2p_v2.py
index 1bb5d4e2b..fe8e25dcd 100644
--- a/pythainlp/transliterate/thaig2p_v2.py
+++ b/pythainlp/transliterate/thaig2p_v2.py
@@ -16,7 +16,7 @@ class ThaiG2P:
     Latin transliteration of Thai words, using International Phonetic Alphabet
     """
 
-    def __init__(self, device: str="cpu"):
+    def __init__(self, device: str = "cpu"):
         self.pipe = pipeline("text2text-generation", model="pythainlp/thaig2p-v2.0", device=device)
 
     def g2p(self, text: str) -> str:
@@ -28,6 +28,6 @@ def g2p(self, text: str) -> str:
 
 
 def transliterate(text: str, device="cpu") -> str:
     global _THAI_G2P
-    if _THAI_G2P == None:
+    if _THAI_G2P is None:
         _THAI_G2P = ThaiG2P(device=device)
     return _THAI_G2P.g2p(text)