Skip to content

Support scaling the duration of a pause in TTS. #1820

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ int32_t main() {
keywords_spotter_config.keywords_buf = keywords_buf;
keywords_spotter_config.keywords_buf_size = keywords_buf_size;

SherpaOnnxKeywordSpotter *keywords_spotter =
const SherpaOnnxKeywordSpotter *keywords_spotter =
SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config);

free((void *)tokens_buf);
Expand All @@ -130,7 +130,7 @@ int32_t main() {
return -1;
}

SherpaOnnxOnlineStream *stream =
const SherpaOnnxOnlineStream *stream =
SherpaOnnxCreateKeywordStream(keywords_spotter);

const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);
Expand Down
3 changes: 3 additions & 0 deletions flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct {
external int maxNumSenetences;

external Pointer<Utf8> ruleFars;

@Float()
external double silenceScale;
}

final class SherpaOnnxGeneratedAudio extends Struct {
Expand Down
5 changes: 4 additions & 1 deletion flutter/sherpa_onnx/lib/src/tts.dart
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,19 @@ class OfflineTtsConfig {
this.ruleFsts = '',
this.maxNumSenetences = 1,
this.ruleFars = '',
this.silenceScale = 0.2,
});

@override
String toString() {
return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)';
return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars, silenceScale: $silenceScale)';
}

final OfflineTtsModelConfig model;
final String ruleFsts;
final int maxNumSenetences;
final String ruleFars;
final double silenceScale;
}

class GeneratedAudio {
Expand Down Expand Up @@ -180,6 +182,7 @@ class OfflineTts {
c.ref.ruleFsts = config.ruleFsts.toNativeUtf8();
c.ref.maxNumSenetences = config.maxNumSenetences;
c.ref.ruleFars = config.ruleFars.toNativeUtf8();
c.ref.silenceScale = config.silenceScale;

final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences);
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);
SHERPA_ONNX_ASSIGN_ATTR_STR(silence_scale, silenceScale);

#if __OHOS__
std::unique_ptr<NativeResourceManager,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ export class OfflineTtsConfig {
public ruleFsts: string = '';
public ruleFars: string = '';
public maxNumSentences: number = 1;
public silenceScale: number = 0.2;
}

export class TtsOutput {
Expand Down Expand Up @@ -98,4 +99,4 @@ export class OfflineTts {
/**
 * Asynchronous generation entry point.
 *
 * Forwards this instance's native `handle` together with `input` to
 * `offlineTtsGenerateAsync` and returns the promise it produces.
 *
 * @param input - Request object passed through unchanged.
 * @returns The promise returned by `offlineTtsGenerateAsync`.
 */
generateAsync(input: TtsInput): Promise<TtsOutput> {
return offlineTtsGenerateAsync(this.handle, input);
}
}
}
6 changes: 4 additions & 2 deletions scripts/dotnet/OfflineTtsConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public OfflineTtsConfig()
RuleFsts = "";
MaxNumSentences = 1;
RuleFars = "";
SilenceScale = 0.2F;
}
public OfflineTtsModelConfig Model;

Expand All @@ -23,6 +24,7 @@ public OfflineTtsConfig()

[MarshalAs(UnmanagedType.LPStr)]
public string RuleFars;
}

}
public float SilenceScale;
}
}
2 changes: 2 additions & 0 deletions scripts/go/sherpa_onnx.go
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,7 @@ type OfflineTtsConfig struct {
RuleFsts string
RuleFars string
MaxNumSentences int
SilenceScale float32
}

type GeneratedAudio struct {
Expand Down Expand Up @@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
defer C.free(unsafe.Pointer(c.rule_fars))

c.max_num_sentences = C.int(config.MaxNumSentences)
c.silence_scale = C.float(config.SilenceScale)

// vits
c.model.vits.model = C.CString(config.Model.Vits.Model)
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
tts_config.silence_scale = SHERPA_ONNX_OR(config->silence_scale, 0.2);

if (tts_config.model.debug) {
#if __OHOS__
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
const char *rule_fsts;
int32_t max_num_sentences;
const char *rule_fars;
float silence_scale;
} SherpaOnnxOfflineTtsConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/c-api/cxx-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {

c.rule_fsts = config.rule_fsts.c_str();
c.max_num_sentences = config.max_num_sentences;
c.silence_scale = config.silence_scale;
c.rule_fars = config.rule_fars.c_str();

auto p = SherpaOnnxCreateOfflineTts(&c);
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/c-api/cxx-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ struct OfflineTtsConfig {
std::string rule_fsts;
std::string rule_fars;
int32_t max_num_sentences = 1;
float silence_scale = 0.2;
};

struct GeneratedAudio {
Expand Down
6 changes: 6 additions & 0 deletions sherpa-onnx/csrc/offline-tts-kokoro-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total);

float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}

return ans;
}

Expand Down
6 changes: 6 additions & 0 deletions sherpa-onnx/csrc/offline-tts-matcha-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total);

float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}

return ans;
}

Expand Down
6 changes: 6 additions & 0 deletions sherpa-onnx/csrc/offline-tts-vits-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total);

float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}

return ans;
}

Expand Down
79 changes: 78 additions & 1 deletion sherpa-onnx/csrc/offline-tts.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "sherpa-onnx/csrc/offline-tts.h"

#include <cmath>
#include <string>
#include <utility>

Expand All @@ -23,6 +24,72 @@

namespace sherpa_onnx {

// A run of near-silent samples, expressed as a half-open index range
// [start, end) into the sample buffer being analysed.
struct SilenceInterval {
// Index of the first sample that belongs to the run.
int32_t start;
// Index one past the last sample that belongs to the run.
int32_t end;
};

// Returns a copy of this audio in which the duration of every pause is
// multiplied by `scale`.
//
// A pause is a run of consecutive samples whose absolute value is at most
// 0.01 and that spans at least 0.6 seconds. Shorter quiet runs are left
// untouched.
//
//  - scale == 1: the audio is returned unchanged.
//  - scale < 1:  only the leading part of each pause is kept.
//  - scale > 1:  each pause is kept in full and then extended with
//                zero-valued samples.
//  - scale <= 0 or NaN: treated as 0, i.e. pauses are removed entirely.
GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
  if (scale == 1) {
    return *this;
  }

  // Guard against negative or NaN values so that the number of samples kept
  // for a pause can never be negative (which would yield an invalid range).
  if (!(scale > 0)) {
    scale = 0;
  }

  // Samples with |x| <= kSilenceAmplitude are considered silent.
  constexpr double kSilenceAmplitude = 0.01;

  // A quiet run must last at least 0.6 second to be treated as a pause.
  const int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);

  const int32_t num_samples = static_cast<int32_t>(samples.size());

  // Pass 1: locate all pauses.
  std::vector<SilenceInterval> intervals;
  int32_t run_start = -1;  // start of the current quiet run, -1 if none
  for (int32_t i = 0; i != num_samples; ++i) {
    if (std::fabs(samples[i]) <= kSilenceAmplitude) {
      if (run_start == -1) {
        run_start = i;
      }
      continue;
    }

    // samples[i] is the first non-silent sample after a quiet run (if any).
    if (run_start != -1 && i - run_start >= threshold) {
      intervals.push_back({run_start, i});
    }
    run_start = -1;
  }

  // A quiet run that reaches the end of the audio; use the same length
  // criterion as for runs in the middle of the audio.
  if (run_start != -1 && num_samples - run_start >= threshold) {
    intervals.push_back({run_start, num_samples});
  }

  if (intervals.empty()) {
    return *this;
  }

  // Pass 2: rebuild the audio, copying non-pause regions verbatim and
  // emitting a rescaled version of every pause.
  GeneratedAudio ans;
  ans.sample_rate = sample_rate;
  ans.samples.reserve(samples.size());

  int32_t copied = 0;  // everything in [0, copied) has been handled
  for (const auto &interval : intervals) {
    ans.samples.insert(ans.samples.end(), samples.begin() + copied,
                       samples.begin() + interval.start);
    copied = interval.end;

    const int32_t len = interval.end - interval.start;
    const int32_t target = static_cast<int32_t>(len * scale);

    // Never read beyond the pause itself: when the pause has to grow, the
    // extra samples are synthesized instead of being taken from whatever
    // follows the pause (which may be speech or the end of the buffer).
    const int32_t kept = target < len ? target : len;
    ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
                       samples.begin() + interval.start + kept);

    if (target > len) {
      ans.samples.insert(ans.samples.end(), static_cast<size_t>(target - len),
                         0.0f);
    }
  }

  if (copied < num_samples) {
    ans.samples.insert(ans.samples.end(), samples.begin() + copied,
                       samples.end());
  }

  return ans;
}

void OfflineTtsConfig::Register(ParseOptions *po) {
model.Register(po);

Expand All @@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
"Maximum number of sentences that we process at a time. "
"This is to avoid OOM for very long input text. "
"If you set it to -1, then we process all sentences in a single batch.");

po->Register("tts-silence-scale", &silence_scale,
"Duration of the pause is scaled by this number. So a smaller "
"value leads to a shorter pause.");
}

bool OfflineTtsConfig::Validate() const {
Expand All @@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
}
}

if (silence_scale < 0.001) {
SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
return false;
}

return model.Validate();
}

Expand All @@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
os << "model=" << model.ToString() << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\", ";
os << "max_num_sentences=" << max_num_sentences << ")";
os << "max_num_sentences=" << max_num_sentences << ", ";
os << "silence_scale=" << silence_scale << ")";

return os.str();
}
Expand Down
15 changes: 13 additions & 2 deletions sherpa-onnx/csrc/offline-tts.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,20 @@ struct OfflineTtsConfig {
// If you set it to -1, then we process all sentences in a single batch.
int32_t max_num_sentences = 1;

// Scale factor applied to the duration of each silence interval, i.e., a run
// of audio samples with values close to 0:
//
//   new_duration = old_duration * silence_scale
float silence_scale = 0.2;

OfflineTtsConfig() = default;
OfflineTtsConfig(const OfflineTtsModelConfig &model,
const std::string &rule_fsts, const std::string &rule_fars,
int32_t max_num_sentences)
int32_t max_num_sentences, float silence_scale)
: model(model),
rule_fsts(rule_fsts),
rule_fars(rule_fars),
max_num_sentences(max_num_sentences) {}
max_num_sentences(max_num_sentences),
silence_scale(silence_scale) {}

void Register(ParseOptions *po);
bool Validate() const;
Expand All @@ -50,6 +56,11 @@ struct OfflineTtsConfig {
// Audio samples together with the rate at which they were produced.
struct GeneratedAudio {
// The audio samples.
std::vector<float> samples;
// Number of samples per second.
int32_t sample_rate;

// Silence means pause here.
// If scale > 1, then it increases the duration of a pause
// If scale < 1, then it reduces the duration of a pause
//
// Returns the result as a new GeneratedAudio; this object is not modified.
GeneratedAudio ScaleSilence(float scale) const;
};

class OfflineTtsImpl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ public class OfflineTtsConfig {
private final String ruleFsts;
private final String ruleFars;
private final int maxNumSentences;
private final float silenceScale;

/**
 * Creates a configuration by copying every setting held by the builder.
 * Private, so instances are obtained through {@link Builder#build()}.
 *
 * @param builder source of the model, rule FST/FAR, sentence-count and
 *                silence-scale settings
 */
private OfflineTtsConfig(Builder builder) {
this.model = builder.model;
this.ruleFsts = builder.ruleFsts;
this.ruleFars = builder.ruleFars;
this.maxNumSentences = builder.maxNumSentences;
this.silenceScale = builder.silenceScale;
}

public static Builder builder() {
Expand All @@ -35,11 +37,16 @@ public int getMaxNumSentences() {
return maxNumSentences;
}

/**
 * Returns the silence scale stored in this configuration.
 *
 * @return the value supplied via the builder
 */
public float getSilenceScale() {
return silenceScale;
}

public static class Builder {
private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build();
private String ruleFsts = "";
private String ruleFars = "";
private int maxNumSentences = 1;
private float silenceScale = 0.2f;

public OfflineTtsConfig build() {
return new OfflineTtsConfig(this);
Expand All @@ -64,5 +71,10 @@ public Builder setMaxNumSentences(int maxNumSentences) {
this.maxNumSentences = maxNumSentences;
return this;
}

/**
 * Sets the silence scale to be used by the configuration being built.
 *
 * @param silenceScale the new silence scale
 * @return this builder, to allow call chaining
 */
public Builder setSilenceScale(float silenceScale) {
this.silenceScale = silenceScale;
return this;
}
}
}
3 changes: 3 additions & 0 deletions sherpa-onnx/jni/offline-tts.cc
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
fid = env->GetFieldID(cls, "maxNumSentences", "I");
ans.max_num_sentences = env->GetIntField(config, fid);

fid = env->GetFieldID(cls, "silenceScale", "F");
ans.silence_scale = env->GetFloatField(config, fid);

return ans;
}

Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/kotlin-api/Tts.kt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ data class OfflineTtsConfig(
var ruleFsts: String = "",
var ruleFars: String = "",
var maxNumSentences: Int = 1,
var silenceScale: Float = 0.2f,
)

class GeneratedAudio(
Expand Down
Loading
Loading