diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index d4264296be..0d9a8faa6f 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -6,6 +6,8 @@ d=nodejs-addon-examples echo "dir: $d" cd $d +echo "----------streaming asr----------" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 @@ -31,6 +33,8 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 node ./test_asr_streaming_paraformer.js rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en +echo "----------non-streaming asr----------" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 @@ -58,3 +62,35 @@ rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 node ./test_asr_non_streaming_paraformer.js rm -rf sherpa-onnx-paraformer-zh-2023-03-28 + +echo "----------tts----------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2 +tar xvf vits-piper-en_GB-cori-medium.tar.bz2 +rm vits-piper-en_GB-cori-medium.tar.bz2 + +node ./test_tts_non_streaming_vits_piper_en.js +rm -rf vits-piper-en_GB-cori-medium + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2 +tar xvf vits-coqui-de-css10.tar.bz2 +rm vits-coqui-de-css10.tar.bz2 + +node ./test_tts_non_streaming_vits_coqui_de.js +rm -rf vits-coqui-de-css10 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 +tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 +rm sherpa-onnx-vits-zh-ll.tar.bz2 + +node ./test_tts_non_streaming_vits_zh_ll.js +rm -rf sherpa-onnx-vits-zh-ll + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 +tar xvf vits-icefall-zh-aishell3.tar.bz2 +rm vits-icefall-zh-aishell3.tar.bz2 + +node ./test_tts_non_streaming_vits_zh_aishell3.js +rm -rf vits-icefall-zh-aishell3 + +ls -lh diff --git a/.github/workflows/npm-addon-linux-aarch64.yaml b/.github/workflows/npm-addon-linux-aarch64.yaml index bbeb266e3c..c846009426 100644 --- a/.github/workflows/npm-addon-linux-aarch64.yaml +++ b/.github/workflows/npm-addon-linux-aarch64.yaml @@ -94,7 +94,7 @@ jobs: -DSHERPA_ONNX_ENABLE_BINARY=OFF \ .. - make -j + make -j2 make install cd .. 
diff --git a/.gitignore b/.gitignore index 89b784338e..282da268fa 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ sherpa-onnx-ced-* node_modules package-lock.json sherpa-onnx-nemo-* +sherpa-onnx-vits-* diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index 9cc352e762..faf3bfdbdd 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -143,3 +143,43 @@ node ./test_asr_non_streaming_paraformer.js npm install naudiodon2 node ./test_vad_asr_non_streaming_paraformer_microphone.js ``` + +## Text-to-speech with piper VITS models (TTS) + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2 +tar xvf vits-piper-en_GB-cori-medium.tar.bz2 +rm vits-piper-en_GB-cori-medium.tar.bz2 + +node ./test_tts_non_streaming_vits_piper_en.js +``` + +## Text-to-speech with coqui-ai/TTS VITS models (TTS) + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2 +tar xvf vits-coqui-de-css10.tar.bz2 +rm vits-coqui-de-css10.tar.bz2 + +node ./test_tts_non_streaming_vits_coqui_de.js +``` + +## Text-to-speech with vits Chinese models (1/2) + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 +tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 +rm sherpa-onnx-vits-zh-ll.tar.bz2 + +node ./test_tts_non_streaming_vits_zh_ll.js +``` + +## Text-to-speech with vits Chinese models (2/2) + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 +tar xvf vits-icefall-zh-aishell3.tar.bz2 +rm vits-icefall-zh-aishell3.tar.bz2 + +node ./test_tts_non_streaming_vits_zh_aishell3.js +``` diff --git a/nodejs-addon-examples/test_tts_non_streaming_vits_coqui_de.js b/nodejs-addon-examples/test_tts_non_streaming_vits_coqui_de.js new file mode 100644 index 0000000000..6b3e56737c --- /dev/null +++ b/nodejs-addon-examples/test_tts_non_streaming_vits_coqui_de.js @@ -0,0 +1,43 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); +const performance = require('perf_hooks').performance; + +// please download model files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +function createOfflineTts() { + const config = { + model: { + vits: { + model: './vits-coqui-de-css10/model.onnx', + tokens: './vits-coqui-de-css10/tokens.txt', + }, + debug: true, + numThreads: 1, + provider: 'cpu', + }, + maxNumSentences: 1, + }; + return new sherpa_onnx.OfflineTts(config); +} + +const tts = createOfflineTts(); + +const text = 'Alles hat ein Ende, nur die Wurst hat zwei.'
+ +let start = performance.now(); +const audio = tts.generate({text: text, sid: 0, speed: 1.0}); +let stop = performance.now(); +const elapsed_seconds = (stop - start) / 1000; +const duration = audio.samples.length / audio.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + +const filename = 'test-coqui-de.wav'; +sherpa_onnx.writeWave( + filename, {samples: audio.samples, sampleRate: audio.sampleRate}); + +console.log(`Saved to ${filename}`); diff --git a/nodejs-addon-examples/test_tts_non_streaming_vits_piper_en.js b/nodejs-addon-examples/test_tts_non_streaming_vits_piper_en.js new file mode 100644 index 0000000000..3de28d34a6 --- /dev/null +++ b/nodejs-addon-examples/test_tts_non_streaming_vits_piper_en.js @@ -0,0 +1,46 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); +const performance = require('perf_hooks').performance; + +// please download model files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +function createOfflineTts() { + const config = { + model: { + vits: { + model: './vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx', + tokens: './vits-piper-en_GB-cori-medium/tokens.txt', + dataDir: './vits-piper-en_GB-cori-medium/espeak-ng-data', + }, + debug: true, + numThreads: 1, + provider: 'cpu', + }, + maxNumSentences: 1, + }; + return new sherpa_onnx.OfflineTts(config); +} + +const tts = createOfflineTts(); + +const text = + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+ + +let start = performance.now(); +const audio = tts.generate({text: text, sid: 0, speed: 1.0}); +let stop = performance.now(); +const elapsed_seconds = (stop - start) / 1000; +const duration = audio.samples.length / audio.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + +const filename = 'test-piper-en.wav'; +sherpa_onnx.writeWave( + filename, {samples: audio.samples, sampleRate: audio.sampleRate}); + +console.log(`Saved to ${filename}`); diff --git a/nodejs-addon-examples/test_tts_non_streaming_vits_zh_aishell3.js b/nodejs-addon-examples/test_tts_non_streaming_vits_zh_aishell3.js new file mode 100644 index 0000000000..09f63711ea --- /dev/null +++ b/nodejs-addon-examples/test_tts_non_streaming_vits_zh_aishell3.js @@ -0,0 +1,48 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); +const performance = require('perf_hooks').performance; + +// please download model files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +function createOfflineTts() { + const config = { + model: { + vits: { + model: './vits-icefall-zh-aishell3/model.onnx', + tokens: './vits-icefall-zh-aishell3/tokens.txt', + lexicon: './vits-icefall-zh-aishell3/lexicon.txt', + }, + debug: true, + numThreads: 1, + provider: 'cpu', + }, + maxNumSentences: 1, + ruleFsts: + './vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/number.fst,./vits-icefall-zh-aishell3/new_heteronym.fst', + ruleFars: './vits-icefall-zh-aishell3/rule.far', + }; + return new sherpa_onnx.OfflineTts(config); +} + +const tts = createOfflineTts(); + +const text = + '他在长沙出生,长白山长大,去过长江,现在他是一个银行的行长,主管行政工作。有困难,请拨110,或者13020240513。今天是2024年5月13号, 他上个月的工资是12345块钱。' + +let start = performance.now(); +const audio = tts.generate({text: text, sid: 88, speed: 1.0}); +let stop = performance.now(); +const elapsed_seconds = (stop - start) / 1000; +const duration = audio.samples.length / audio.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + +const filename = 'test-zh-aishell3.wav'; +sherpa_onnx.writeWave( + filename, {samples: audio.samples, sampleRate: audio.sampleRate}); + +console.log(`Saved to ${filename}`); diff --git a/nodejs-addon-examples/test_tts_non_streaming_vits_zh_ll.js b/nodejs-addon-examples/test_tts_non_streaming_vits_zh_ll.js new file mode 100644 index 0000000000..978ba465ec --- /dev/null +++ b/nodejs-addon-examples/test_tts_non_streaming_vits_zh_ll.js @@ -0,0 +1,48 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); +const performance = require('perf_hooks').performance; + +// please download model files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +function createOfflineTts() { + const config = { + model: { + vits: { + model: './sherpa-onnx-vits-zh-ll/model.onnx', + tokens: './sherpa-onnx-vits-zh-ll/tokens.txt', + lexicon: './sherpa-onnx-vits-zh-ll/lexicon.txt', + dictDir: './sherpa-onnx-vits-zh-ll/dict', + }, + debug: true, + numThreads: 1, + provider: 'cpu', + }, +
maxNumSentences: 1, + ruleFsts: + './sherpa-onnx-vits-zh-ll/date.fst,./sherpa-onnx-vits-zh-ll/phone.fst,./sherpa-onnx-vits-zh-ll/number.fst', + }; + return new sherpa_onnx.OfflineTts(config); +} + +const tts = createOfflineTts(); + +const text = + '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月13号,拨打110或者18920240513。123456块钱。' + +let start = performance.now(); +const audio = tts.generate({text: text, sid: 2, speed: 1.0}); +let stop = performance.now(); +const elapsed_seconds = (stop - start) / 1000; +const duration = audio.samples.length / audio.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + +const filename = 'test-zh-ll.wav'; +sherpa_onnx.writeWave( + filename, {samples: audio.samples, sampleRate: audio.sampleRate}); + +console.log(`Saved to ${filename}`); diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js index 4567994c45..34a4b2e788 100644 --- a/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js +++ b/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js @@ -99,7 +99,7 @@ ai.on('data', data => { .split(' ')[0]}.wav`; sherpa_onnx.writeWave( filename, - {samples: segment.samples, sampleRate: vad.config.sampleRate}) + {samples: segment.samples, sampleRate: vad.config.sampleRate}); index += 1; } diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js index 7e133b4fd9..6abb096101 100644 --- a/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js +++ b/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js @@ -97,7 +97,7 @@ ai.on('data', data => { .split(' ')[0]}.wav`; sherpa_onnx.writeWave( filename, - {samples: segment.samples, sampleRate: vad.config.sampleRate}) + {samples: segment.samples, sampleRate: vad.config.sampleRate}); index += 1; } diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js index c554e1e900..5bd7a91682 100644 --- a/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js +++ b/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js @@ -102,7 +102,7 @@ ai.on('data', data => { .split(' ')[0]}.wav`; sherpa_onnx.writeWave( filename, - {samples: segment.samples, sampleRate: vad.config.sampleRate}) + {samples: segment.samples, sampleRate: vad.config.sampleRate}); index += 1; } diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js index 0261c5cbd2..06c87e56c3 100644 --- a/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js +++ b/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js @@ -98,7 +98,7 @@ ai.on('data', data => { .split(' ')[0]}.wav`; sherpa_onnx.writeWave( filename, - {samples: segment.samples, sampleRate: vad.config.sampleRate}) + {samples: segment.samples, sampleRate: vad.config.sampleRate}); index += 1; } diff --git a/nodejs-addon-examples/test_vad_microphone.js
b/nodejs-addon-examples/test_vad_microphone.js index 63cffa6e3b..196ec3cdf1 100644 --- a/nodejs-addon-examples/test_vad_microphone.js +++ b/nodejs-addon-examples/test_vad_microphone.js @@ -71,7 +71,7 @@ ai.on('data', data => { .split(' ')[0]}.wav`; sherpa_onnx.writeWave( filename, - {samples: segment.samples, sampleRate: vad.config.sampleRate}) + {samples: segment.samples, sampleRate: vad.config.sampleRate}); const duration = segment.samples.length / vad.config.sampleRate; console.log(`${index} End of speech. Duration: ${duration} seconds`); console.log(`Saved to ${filename}`); diff --git a/scripts/node-addon-api/CMakeLists.txt b/scripts/node-addon-api/CMakeLists.txt index 3668f9558d..de86cf63cd 100644 --- a/scripts/node-addon-api/CMakeLists.txt +++ b/scripts/node-addon-api/CMakeLists.txt @@ -19,6 +19,7 @@ include_directories(${CMAKE_JS_INC}) set(srcs src/non-streaming-asr.cc + src/non-streaming-tts.cc src/sherpa-onnx-node-addon-api.cc src/streaming-asr.cc src/vad.cc diff --git a/scripts/node-addon-api/lib/addon.js b/scripts/node-addon-api/lib/addon.js index 4ef18f2bc8..c55e3e251b 100644 --- a/scripts/node-addon-api/lib/addon.js +++ b/scripts/node-addon-api/lib/addon.js @@ -25,8 +25,8 @@ for (const p of possible_paths) { } if (!found) { - let msg = - `Could not find sherpa-onnx. Tried\n\n ${possible_paths.join('\n ')}\n` + let msg = `Could not find sherpa-onnx-node. Tried\n\n ${ + possible_paths.join('\n ')}\n` if (os.platform() == 'darwin' && process.env.DYLD_LIBRARY_PATH && !process.env.DYLD_LIBRARY_PATH.includes( `node_modules/sherpa-onnx-${platform_arch}`)) { diff --git a/scripts/node-addon-api/lib/non-streaming-tts.js b/scripts/node-addon-api/lib/non-streaming-tts.js new file mode 100644 index 0000000000..168dc5bf83 --- /dev/null +++ b/scripts/node-addon-api/lib/non-streaming-tts.js @@ -0,0 +1,25 @@ +const addon = require('./addon.js'); + +class OfflineTts { + constructor(config) { + this.handle = addon.createOfflineTts(config); + this.config = config; + + this.numSpeakers = addon.getOfflineTtsNumSpeakers(this.handle); + this.sampleRate = addon.getOfflineTtsSampleRate(this.handle); + } + + /* + input obj: {text: "xxxx", sid: 0, speed: 1.0} + where text is a string, sid is an int32, speed is a float + + return an object {samples: Float32Array, sampleRate: <number>} + */ + generate(obj) { + return addon.offlineTtsGenerate(this.handle, obj); + } +} + +module.exports = { + OfflineTts, +} diff --git a/scripts/node-addon-api/lib/sherpa-onnx.js b/scripts/node-addon-api/lib/sherpa-onnx.js index a8840a2980..fa7d3e8b34 100644 --- a/scripts/node-addon-api/lib/sherpa-onnx.js +++ b/scripts/node-addon-api/lib/sherpa-onnx.js @@ -1,11 +1,13 @@ const addon = require('./addon.js') const streaming_asr = require('./streaming-asr.js'); const non_streaming_asr = require('./non-streaming-asr.js'); +const non_streaming_tts = require('./non-streaming-tts.js'); const vad = require('./vad.js'); module.exports = { OnlineRecognizer: streaming_asr.OnlineRecognizer, OfflineRecognizer: non_streaming_asr.OfflineRecognizer, + OfflineTts: non_streaming_tts.OfflineTts, readWave: addon.readWave, writeWave: addon.writeWave, Display: streaming_asr.Display, diff --git a/scripts/node-addon-api/src/non-streaming-tts.cc b/scripts/node-addon-api/src/non-streaming-tts.cc new file mode 100644 index 0000000000..24dc409650 --- /dev/null +++ b/scripts/node-addon-api/src/non-streaming-tts.cc @@ -0,0 +1,388 @@ +// scripts/node-addon-api/src/non-streaming-tts.cc +// +// Copyright (c) 2024 Xiaomi Corporation + +#include <sstream> + +#include "napi.h" //
NOLINT +#include "sherpa-onnx/c-api/c-api.h" + +static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig( + Napi::Object obj) { + SherpaOnnxOfflineTtsVitsModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("vits") || !obj.Get("vits").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("vits").As<Napi::Object>(); + + if (o.Has("model") && o.Get("model").IsString()) { + Napi::String model = o.Get("model").As<Napi::String>(); + std::string s = model.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.model = p; + } + + if (o.Has("lexicon") && o.Get("lexicon").IsString()) { + Napi::String lexicon = o.Get("lexicon").As<Napi::String>(); + std::string s = lexicon.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.lexicon = p; + } + + if (o.Has("tokens") && o.Get("tokens").IsString()) { + Napi::String tokens = o.Get("tokens").As<Napi::String>(); + std::string s = tokens.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.tokens = p; + } + + if (o.Has("dataDir") && o.Get("dataDir").IsString()) { + Napi::String data_dir = o.Get("dataDir").As<Napi::String>(); + std::string s = data_dir.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.data_dir = p; + } + + if (o.Has("noiseScale") && o.Get("noiseScale").IsNumber()) { + c.noise_scale = o.Get("noiseScale").As<Napi::Number>().FloatValue(); + } + + if (o.Has("noiseScaleW") && o.Get("noiseScaleW").IsNumber()) { + c.noise_scale_w = o.Get("noiseScaleW").As<Napi::Number>().FloatValue(); + } + + if (o.Has("lengthScale") && o.Get("lengthScale").IsNumber()) { + c.length_scale = o.Get("lengthScale").As<Napi::Number>().FloatValue(); + } + + if (o.Has("dictDir") && o.Get("dictDir").IsString()) { + Napi::String dict_dir = o.Get("dictDir").As<Napi::String>(); + std::string s = dict_dir.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.dict_dir = p; + } + + return c; +} + +static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( + Napi::Object obj) { + SherpaOnnxOfflineTtsModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("model") || !obj.Get("model").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("model").As<Napi::Object>(); + + c.vits = GetOfflineTtsVitsModelConfig(o); + + if (o.Has("numThreads") && o.Get("numThreads").IsNumber()) { + c.num_threads = o.Get("numThreads").As<Napi::Number>().Int32Value(); + } + + if (o.Has("debug") && + (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) { + if (o.Get("debug").IsBoolean()) { + c.debug = o.Get("debug").As<Napi::Boolean>().Value(); + } else { + c.debug = o.Get("debug").As<Napi::Number>().Int32Value(); + } + } + + if (o.Has("provider") && o.Get("provider").IsString()) { + Napi::String provider = o.Get("provider").As<Napi::String>(); + std::string s = provider.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.provider = p; + } + + return c; +} + +static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument.
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsObject()) { + Napi::TypeError::New(env, "Expect an object as the argument") + .ThrowAsJavaScriptException(); + + return {}; + } + + Napi::Object o = info[0].As(); + + SherpaOnnxOfflineTtsConfig c; + memset(&c, 0, sizeof(c)); + + c.model = GetOfflineTtsModelConfig(o); + + if (o.Has("ruleFsts") && o.Get("ruleFsts").IsString()) { + Napi::String rule_fsts = o.Get("ruleFsts").As(); + std::string s = rule_fsts.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.rule_fsts = p; + } + + if (o.Has("maxNumSentences") && o.Get("maxNumSentences").IsNumber()) { + c.max_num_sentences = + o.Get("maxNumSentences").As().Int32Value(); + } + + if (o.Has("ruleFars") && o.Get("ruleFars").IsString()) { + Napi::String rule_fars = o.Get("ruleFars").As(); + std::string s = rule_fars.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.rule_fars = p; + } + + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c); + + if (c.model.vits.model) { + delete[] c.model.vits.model; + } + + if (c.model.vits.lexicon) { + delete[] c.model.vits.lexicon; + } + + if (c.model.vits.tokens) { + delete[] c.model.vits.tokens; + } + + if (c.model.vits.data_dir) { + delete[] c.model.vits.data_dir; + } + + if (c.model.vits.dict_dir) { + delete[] c.model.vits.dict_dir; + } + + if (c.model.provider) { + delete[] c.model.provider; + } + + if (c.rule_fsts) { + delete[] c.rule_fsts; + } + + if (c.rule_fars) { + delete[] c.rule_fars; + } + + if (!tts) { + Napi::TypeError::New(env, "Please check your config!") + .ThrowAsJavaScriptException(); + + return {}; + } + + return Napi::External::New( + env, tts, [](Napi::Env env, SherpaOnnxOfflineTts *tts) { + SherpaOnnxDestroyOfflineTts(tts); + }); +} + +static Napi::Number OfflineTtsSampleRateWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, "Argument 0 should be an offline tts pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + SherpaOnnxOfflineTts *tts = + info[0].As>().Data(); + + int32_t sample_rate = SherpaOnnxOfflineTtsSampleRate(tts); + + return Napi::Number::New(env, sample_rate); +} + +static Napi::Number OfflineTtsNumSpeakersWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, "Argument 0 should be an offline tts pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + SherpaOnnxOfflineTts *tts = + info[0].As>().Data(); + + int32_t num_speakers = SherpaOnnxOfflineTtsNumSpeakers(tts); + + return Napi::Number::New(env, num_speakers); +} + +static Napi::Object OfflineTtsGenerateWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 2) { + std::ostringstream os; + os << "Expect only 1 argument. 
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, "Argument 0 should be an offline tts pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + SherpaOnnxOfflineTts *tts = + info[0].As>().Data(); + + if (!info[1].IsObject()) { + Napi::TypeError::New(env, "Argument 1 should be an object") + .ThrowAsJavaScriptException(); + + return {}; + } + + Napi::Object obj = info[1].As(); + + if (!obj.Has("text")) { + Napi::TypeError::New(env, "The argument object should have a field text") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!obj.Get("text").IsString()) { + Napi::TypeError::New(env, "The object['text'] should be a string") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!obj.Has("sid")) { + Napi::TypeError::New(env, "The argument object should have a field sid") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!obj.Get("sid").IsNumber()) { + Napi::TypeError::New(env, "The object['sid'] should be a number") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!obj.Has("speed")) { + Napi::TypeError::New(env, "The argument object should have a field speed") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!obj.Get("speed").IsNumber()) { + Napi::TypeError::New(env, "The object['speed'] should be a number") + .ThrowAsJavaScriptException(); + + return {}; + } + + Napi::String _text = obj.Get("text").As(); + std::string text = _text.Utf8Value(); + int32_t sid = obj.Get("sid").As().Int32Value(); + float speed = obj.Get("speed").As().FloatValue(); + + const SherpaOnnxGeneratedAudio *audio = + SherpaOnnxOfflineTtsGenerate(tts, text.c_str(), sid, speed); + + Napi::ArrayBuffer arrayBuffer = Napi::ArrayBuffer::New( + env, const_cast(audio->samples), sizeof(float) * audio->n, + [](Napi::Env /*env*/, void * /*data*/, + const SherpaOnnxGeneratedAudio *hint) { + SherpaOnnxDestroyOfflineTtsGeneratedAudio(hint); + }, + audio); + Napi::Float32Array float32Array = + Napi::Float32Array::New(env, audio->n, arrayBuffer, 0); + + Napi::Object ans = Napi::Object::New(env); + ans.Set(Napi::String::New(env, "samples"), float32Array); + ans.Set(Napi::String::New(env, "sampleRate"), audio->sample_rate); + return ans; +} + +void InitNonStreamingTts(Napi::Env env, Napi::Object exports) { + exports.Set(Napi::String::New(env, "createOfflineTts"), + Napi::Function::New(env, CreateOfflineTtsWrapper)); + + exports.Set(Napi::String::New(env, "getOfflineTtsSampleRate"), + Napi::Function::New(env, OfflineTtsSampleRateWrapper)); + + exports.Set(Napi::String::New(env, "getOfflineTtsNumSpeakers"), + Napi::Function::New(env, OfflineTtsNumSpeakersWrapper)); + + exports.Set(Napi::String::New(env, "offlineTtsGenerate"), + Napi::Function::New(env, OfflineTtsGenerateWrapper)); +} diff --git a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc index 5e0211dd28..90ad4d999c 100644 --- a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc +++ b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc @@ -7,6 +7,8 @@ void InitStreamingAsr(Napi::Env env, Napi::Object exports); void InitNonStreamingAsr(Napi::Env env, Napi::Object exports); +void InitNonStreamingTts(Napi::Env env, Napi::Object exports); + void InitVad(Napi::Env env, Napi::Object exports); void InitWaveReader(Napi::Env env, Napi::Object exports); @@ -16,6 +18,7 @@ void InitWaveWriter(Napi::Env env, Napi::Object exports); 
Napi::Object Init(Napi::Env env, Napi::Object exports) { InitStreamingAsr(env, exports); InitNonStreamingAsr(env, exports); + InitNonStreamingTts(env, exports); InitVad(env, exports); InitWaveReader(env, exports); InitWaveWriter(env, exports); diff --git a/scripts/node-addon-api/src/streaming-asr.cc b/scripts/node-addon-api/src/streaming-asr.cc index fb17e62e3b..7412427cc4 100644 --- a/scripts/node-addon-api/src/streaming-asr.cc +++ b/scripts/node-addon-api/src/streaming-asr.cc @@ -605,7 +605,7 @@ static void InputFinishedWrapper(const Napi::CallbackInfo &info) { if (info.Length() != 1) { std::ostringstream os; - os << "Expect only 1 arguments. Given: " << info.Length(); + os << "Expect only 1 argument. Given: " << info.Length(); Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 3109890edb..e5ecfb7ba0 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -823,7 +823,7 @@ SHERPA_ONNX_API int32_t SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts); // Generate audio from the given text and speaker id (sid). -// The user has to use DestroyOfflineTtsGeneratedAudio() to free the +// The user has to use SherpaOnnxDestroyOfflineTtsGeneratedAudio() to free the // returned pointer to avoid memory leak. SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
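Note (not part of the patch): the sketch below is a minimal end-to-end recap of how the new OfflineTts API added above is used from Node.js, assembled from the example scripts and lib/non-streaming-tts.js. The model paths and output filename are placeholders; substitute any VITS model downloaded from the tts-models release page.

```js
// Minimal usage sketch for the OfflineTts class introduced in this patch.
// The paths below are placeholders, not a real model shipped with the repo.
const sherpa_onnx = require('sherpa-onnx-node');

const tts = new sherpa_onnx.OfflineTts({
  model: {
    vits: {
      model: './some-vits-model/model.onnx',   // placeholder path
      tokens: './some-vits-model/tokens.txt',  // placeholder path
    },
    numThreads: 1,
    debug: true,
    provider: 'cpu',
  },
  maxNumSentences: 1,  // read as "maxNumSentences" by non-streaming-tts.cc
});

// generate() takes {text, sid, speed} and returns
// {samples: Float32Array, sampleRate: number}.
const audio = tts.generate({text: 'Hello world', sid: 0, speed: 1.0});

// writeWave() is exported by the addon and saves the float samples to disk.
sherpa_onnx.writeWave(
    'output.wav', {samples: audio.samples, sampleRate: audio.sampleRate});
```

The Float32Array returned by generate() is backed by the buffer created in OfflineTtsGenerateWrapper, whose finalizer calls SherpaOnnxDestroyOfflineTtsGeneratedAudio, so the JavaScript caller does not free anything explicitly; this is the ownership rule documented in c-api.h above, handled inside the addon.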