Skip to content

Add Pascal API for Dolphin CTC models #2096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,11 @@ jobs:
cd ./pascal-api-examples

pushd non-streaming-asr

./run-dolphin-ctc.sh
rm -rf sherpa-onnx-*
echo "---"

./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
Expand Down Expand Up @@ -253,7 +258,13 @@ jobs:

cd ./pascal-api-examples


pushd vad-with-non-streaming-asr

time ./run-vad-with-dolphin-ctc.sh
rm -rf sherpa-onnx-*
echo "---"

time ./run-vad-with-moonshine.sh
rm -rf sherpa-onnx-*
echo "---"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ This repository supports running the following functions **locally**

on the following platforms and operating systems:

- x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64)
- x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64), **RK NPU**
- Linux, macOS, Windows, openKylin
- Android, WearOS
- iOS
Expand Down
1 change: 1 addition & 0 deletions pascal-api-examples/non-streaming-asr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ APIs with non-streaming models for speech recognition.

|File|Description|
|----|-----------|
|[run-dolphin-ctc.sh](./run-dolphin-ctc.sh)|Use a non-streaming [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model for speech recognition|
|[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition|
|[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition|
|[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers|
Expand Down
76 changes: 76 additions & 0 deletions pascal-api-examples/non-streaming-asr/dolphin_ctc.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{ Copyright (c) 2025 Xiaomi Corporation }

{
  Demonstrates speech recognition on a single wave file with a
  non-streaming Dolphin CTC model.

  The model files can be downloaded from
  https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program dolphin_ctc;

{$mode objfpc}

uses
  sherpa_onnx,
  DateUtils,
  SysUtils;

var
  WaveData: TSherpaOnnxWave;
  WavePath: AnsiString;

  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
  Asr: TSherpaOnnxOfflineRecognizer;
  AsrStream: TSherpaOnnxOfflineStream;
  DecodeResult: TSherpaOnnxOfflineRecognizerResult;

  TimeBegin: TDateTime;
  TimeEnd: TDateTime;

  ElapsedSeconds: Single;
  AudioSeconds: Single;
  Rtf: Single;
begin
  Initialize(RecognizerConfig);

  { Point the offline recognizer at the downloaded Dolphin CTC model. }
  RecognizerConfig.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Debug := False;

  WavePath := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';

  WaveData := SherpaOnnxReadWave(WavePath);

  Asr := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
  AsrStream := Asr.CreateStream();
  TimeBegin := Now;

  { Feed the whole file at once; Dolphin CTC is a non-streaming model. }
  AsrStream.AcceptWaveform(WaveData.Samples, WaveData.SampleRate);
  Asr.Decode(AsrStream);

  DecodeResult := Asr.GetResult(AsrStream);

  TimeEnd := Now;

  { Real-time factor = decoding time divided by audio duration. }
  ElapsedSeconds := MilliSecondsBetween(TimeEnd, TimeBegin) / 1000;
  AudioSeconds := Length(WaveData.Samples) / WaveData.SampleRate;
  Rtf := ElapsedSeconds / AudioSeconds;

  WriteLn(DecodeResult.ToString);
  WriteLn(Format('NumThreads %d', [RecognizerConfig.ModelConfig.NumThreads]));
  WriteLn(Format('Elapsed %.3f s', [ElapsedSeconds]));
  WriteLn(Format('Wave duration %.3f s', [AudioSeconds]));
  WriteLn(Format('RTF = %.3f/%.3f = %.3f', [ElapsedSeconds, AudioSeconds, Rtf]));

  {
    Releasing these objects is optional for a short-lived script like this
    one, but is required in a larger application to avoid memory leaks.
  }
  FreeAndNil(AsrStream);
  FreeAndNil(Asr);
end.
42 changes: 42 additions & 0 deletions pascal-api-examples/non-streaming-asr/run-dolphin-ctc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Builds sherpa-onnx (if needed), downloads the Dolphin CTC model, compiles
# the Pascal example, and runs it.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quote expansions so paths containing spaces do not break the script.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR"/../.. && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Build the shared C API library once; skip if any platform's artifact exists.
if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  ls -lh lib
  popd
fi

# Download and unpack the Dolphin CTC model if it is not already present.
if [[ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
fi

# Compile the example against the pascal-api unit and the installed library.
fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./dolphin_ctc.pas

# Make the shared library discoverable on both Linux and macOS.
export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH"

./dolphin_ctc
7 changes: 5 additions & 2 deletions pascal-api-examples/vad-with-non-streaming-asr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ with non-streaming speech recognition models.

|File|Description|
|---------|------------|
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
|[run-vad-with-dolphin-ctc.sh](./run-vad-with-dolphin-ctc.sh)|It shows how to use the VAD + [Dolphin](https://github.com/DataoceanAI/Dolphin) for speech recognition.|
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + [Whisper](https://github.com/openai/whisper) for speech recognition.|
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) for speech recognition.|
|[run-vad-with-moonshine.sh](./run-vad-with-moonshine.sh)|It shows how to use the VAD + [Moonshine](https://github.com/usefulsensors/moonshine) for speech recognition.|


Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Builds sherpa-onnx (if needed), downloads the silero VAD model, a test
# wave, and the Dolphin CTC model, then compiles and runs the Pascal example.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quote expansions so paths containing spaces do not break the script.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR"/../.. && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Build the shared C API library once; skip if any platform's artifact exists.
if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# Voice-activity-detection model.
if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

# Test audio.
if [[ ! -f ./lei-jun-test.wav ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

# Download and unpack the Dolphin CTC model if it is not already present.
if [[ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
fi

# Compile the example against the pascal-api unit and the installed library.
fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./vad_with_dolphin.pas

# Make the shared library discoverable on both Linux and macOS.
export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH"

./vad_with_dolphin
135 changes: 135 additions & 0 deletions pascal-api-examples/vad-with-non-streaming-asr/vad_with_dolphin.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
{ Copyright (c) 2025 Xiaomi Corporation }

{
  This file shows how to use a non-streaming Dolphin model
  with silero VAD to decode files.

  You can download the model files from
  https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program vad_with_dolphin;

{$mode objfpc}

uses
  sherpa_onnx,
  SysUtils;

{ Creates a silero VAD configured for 16 kHz audio with a 512-sample window. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
var
  Config: TSherpaOnnxVadModelConfig;

  SampleRate: Integer;
  WindowSize: Integer;
begin
  Initialize(Config);

  SampleRate := 16000; {Please don't change it unless you know the details}
  WindowSize := 512; {Please don't change it unless you know the details}

  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.5;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads := 1;
  Config.Debug := True;
  Config.Provider := 'cpu';
  Config.SampleRate := SampleRate;

  { The second argument is the buffer size in seconds. }
  Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
end;

{ Creates an offline recognizer backed by the Dolphin CTC model. }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  Config: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(Config);

  Config.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';
  Config.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  Result := TSherpaOnnxOfflineRecognizer.Create(Config);
end;

{
  Decodes one VAD speech segment and prints its time range and text.
  Extracted so the in-loop and post-flush paths share one implementation.
}
procedure DecodeSegment(Recognizer: TSherpaOnnxOfflineRecognizer;
  const SpeechSegment: TSherpaOnnxSpeechSegment; SampleRate: Integer);
var
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  Stream := Recognizer.CreateStream();

  Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
  Recognizer.Decode(Stream);
  RecognitionResult := Recognizer.GetResult(Stream);

  { Convert sample offsets to seconds for display. }
  Start := SpeechSegment.Start / SampleRate;
  Duration := Length(SpeechSegment.Samples) / SampleRate;
  WriteLn(Format('%.3f -- %.3f %s',
    [Start, Start + Duration, RecognitionResult.Text]));

  FreeAndNil(Stream);
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;
begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> Vad.Config.SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [Vad.Config.SampleRate, Wave.SampleRate]));

    Exit;
  end;

  { Feed the wave to the VAD window by window; decode each detected segment. }
  WindowSize := Vad.Config.SileroVad.WindowSize;
  Offset := 0;
  while Offset + WindowSize <= Length(Wave.Samples) do
  begin
    Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
    Offset += WindowSize;

    while not Vad.IsEmpty do
    begin
      SpeechSegment := Vad.Front();
      Vad.Pop();
      DecodeSegment(Recognizer, SpeechSegment, Wave.SampleRate);
    end;
  end;

  { Flush so any trailing speech shorter than a full window is emitted too. }
  Vad.Flush;

  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();
    DecodeSegment(Recognizer, SpeechSegment, Wave.SampleRate);
  end;

  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program vad_with_whisper;
program vad_with_sense_voice;

{$mode objfpc}

Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1969,7 +1969,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
return p->impl->GetOutputSamplingRate();
}

void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
// Clears the resampler's internal state by delegating to the wrapped
// implementation's Reset(); `p` must be non-null.
void SherpaOnnxLinearResamplerReset(const SherpaOnnxLinearResampler *p) {
  p->impl->Reset();
}

Expand Down
Loading
Loading