Skip to content

Commit fcb85f9

Browse files
authored
Merge pull request #37 from mutablelogic/main
update from main
2 parents 0fb4c49 + 7aef1cf commit fcb85f9

File tree

14 files changed

+258
-37
lines changed

14 files changed

+258
-37
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ ifeq ($(GGML_CUDA),1)
2222
endif
2323

2424
# Targets
25-
all: build server cli
25+
all: server cli
2626

2727
# Generate the pkg-config files
2828
generate: mkdir go-tidy

cmd/server/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ func main() {
5656

5757
// Create a whisper service
5858
log.Println("Storing models at", dir)
59-
whisper, err := whisper.New(dir)
59+
whisper, err := whisper.New(dir, whisper.OptMaxConcurrent(1))
6060
if err != nil {
6161
log.Println(err)
6262
os.Exit(-2)

doc/notes.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
if (!whisper_is_multilingual(ctx)) {
2+
if (params.language != "en" || params.translate) {
3+
params.language = "en";
4+
params.translate = false;
5+
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
6+
}
7+
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ require (
88
github.com/djthorpe/go-tablewriter v0.0.8
99
github.com/go-audio/wav v1.1.0
1010
github.com/mutablelogic/go-client v1.0.9
11+
github.com/mutablelogic/go-media v1.6.8
1112
github.com/mutablelogic/go-server v1.4.13
1213
github.com/stretchr/testify v1.9.0
1314
)
@@ -19,6 +20,7 @@ require (
1920
github.com/mattn/go-runewidth v0.0.16 // indirect
2021
github.com/pmezard/go-difflib v1.0.0 // indirect
2122
github.com/rivo/uniseg v0.4.7 // indirect
23+
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
2224
golang.org/x/sys v0.22.0 // indirect
2325
golang.org/x/term v0.22.0 // indirect
2426
gopkg.in/yaml.v3 v3.0.1 // indirect

go.sum

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
88
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
99
github.com/djthorpe/go-errors v1.0.3 h1:GZeMPkC1mx2vteXLI/gvxZS0Ee9zxzwD1mcYyKU5jD0=
1010
github.com/djthorpe/go-errors v1.0.3/go.mod h1:HtfrZnMd6HsX75Mtbv9Qcnn0BqOrrFArvCaj3RMnZhY=
11-
github.com/djthorpe/go-tablewriter v0.0.7 h1:jnNsJDjjLLCt0OAqB5DzGZN7V3beT1IpNMQ8GcOwZDU=
12-
github.com/djthorpe/go-tablewriter v0.0.7/go.mod h1:NVBvytpL+6fHfCKn0+3lSi15/G3A1HWf2cLNeHg6YBg=
1311
github.com/djthorpe/go-tablewriter v0.0.8 h1:uRhB9XVgK1n9tvVS7KMyxhxxGGtDvqC80toDTpW4DB4=
1412
github.com/djthorpe/go-tablewriter v0.0.8/go.mod h1:NVBvytpL+6fHfCKn0+3lSi15/G3A1HWf2cLNeHg6YBg=
1513
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@@ -18,18 +16,18 @@ github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
1816
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
1917
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
2018
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
19+
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
20+
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
2121
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
2222
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
23-
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
24-
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
23+
github.com/llgcode/draw2d v0.0.0-20240627062922-0ed1ff131195 h1:Vdz2cBh5Fw2MYHWi3ED2PraDQaWEUhNCr1XFHrP4N5A=
24+
github.com/llgcode/draw2d v0.0.0-20240627062922-0ed1ff131195/go.mod h1:1Vk0LDW6jG5cGc2D9RQUxHaE0vYhTvIwSo9mOL6K4/U=
2525
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
2626
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
27-
github.com/mutablelogic/go-client v1.0.8 h1:A3QtP0wdf+W3dE5k7dobwGYqqn4ZpIqRFu+h9vPoy7Y=
28-
github.com/mutablelogic/go-client v1.0.8/go.mod h1:aP9ecBd4R/acJEJSyp81U3mey9W3AHQV/G1XzfcrLx0=
2927
github.com/mutablelogic/go-client v1.0.9 h1:Eh4sjQOFDldP/L3IizqkcOD3WigZR+u1VaHTUM4ujYw=
3028
github.com/mutablelogic/go-client v1.0.9/go.mod h1:VLyB8j8IBJSK/FXvvqhmq93PRWDKkyLu8R7V2Vudb6A=
31-
github.com/mutablelogic/go-server v1.4.11 h1:feI9IyuK6pv7Gi7fbExfU51uDRMfQo0U9wo0vWN2wf8=
32-
github.com/mutablelogic/go-server v1.4.11/go.mod h1:9nenPAohKu8bFoRgwHJh+3s8h0kLFjUAb8KZvT1TQNU=
29+
github.com/mutablelogic/go-media v1.6.8 h1:3v4povSQlOnvg9mHx6Bp9NVdCCjrNdDCjMHBGFHnVE8=
30+
github.com/mutablelogic/go-media v1.6.8/go.mod h1:HulNT0yyH63a3FRlbuzNDakhOypYrmtFVkHEXZjDgAY=
3331
github.com/mutablelogic/go-server v1.4.13 h1:k5LJJ/pCvyiw34UX341vRhliBOS6i7V65U/UICcOJOA=
3432
github.com/mutablelogic/go-server v1.4.13/go.mod h1:9nenPAohKu8bFoRgwHJh+3s8h0kLFjUAb8KZvT1TQNU=
3533
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -39,14 +37,12 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
3937
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
4038
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
4139
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
42-
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
43-
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
44-
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
45-
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
40+
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
41+
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
42+
golang.org/x/image v0.18.0 h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ=
43+
golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
4644
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
4745
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
48-
golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
49-
golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
5046
golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk=
5147
golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4=
5248
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=

pkg/whisper/api/transcribe.go

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
// Packages
1010

11+
"github.com/go-audio/wav"
1112
"github.com/mutablelogic/go-server/pkg/httprequest"
1213
"github.com/mutablelogic/go-server/pkg/httpresponse"
1314
"github.com/mutablelogic/go-whisper/pkg/whisper"
@@ -67,28 +68,32 @@ func TranscribeFile(ctx context.Context, service *whisper.Whisper, w http.Respon
6768
defer f.Close()
6869

6970
// Read samples
70-
//buf, err := wav.NewDecoder(f).FullPCMBuffer()
71-
//if err != nil {
72-
// httpresponse.Error(w, http.StatusInternalServerError, err.Error())
73-
// return
74-
//}
71+
buf, err := wav.NewDecoder(f).FullPCMBuffer()
72+
if err != nil {
73+
httpresponse.Error(w, http.StatusInternalServerError, err.Error())
74+
return
75+
}
7576

7677
// Get context for the model, perform transcription
7778
var result *whisper.Transcription
78-
if err := service.WithModel(model, func(ctx *task.Context) error {
79+
if err := service.WithModel(model, func(task *task.Context) error {
80+
// Check model
81+
if translate && !task.CanTranslate() {
82+
return ErrBadParameter.With("model is not multilingual, cannot translate")
83+
}
84+
7985
// Set parameters for transcription & translation, default to english
80-
ctx.SetTranslate(translate)
86+
task.SetTranslate(translate)
8187
if req.Language != nil {
82-
if err := ctx.SetLanguage(*req.Language); err != nil {
88+
if err := task.SetLanguage(*req.Language); err != nil {
8389
return err
8490
}
8591
} else if translate {
86-
if err := ctx.SetLanguage("en"); err != nil {
92+
if err := task.SetLanguage("en"); err != nil {
8793
return err
8894
}
8995
}
90-
91-
// Set prompt and temperature
96+
// TODO Set prompt and temperature
9297
/*
9398
if req.Prompt != nil {
9499
ctx.SetPrompt(*req.Prompt)
@@ -98,8 +103,7 @@ func TranscribeFile(ctx context.Context, service *whisper.Whisper, w http.Respon
98103
}
99104
*/
100105
// Perform the transcription, return any errors
101-
//result, err = service.Transcribe(ctx, buf.AsFloat32Buffer().Data)
102-
return ErrNotImplemented
106+
return task.Transcribe(ctx, buf.AsFloat32Buffer().Data)
103107
}); err != nil {
104108
httpresponse.Error(w, http.StatusBadRequest, err.Error())
105109
return

pkg/whisper/segmenter/segmenter.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
package segmenter
2+
3+
import (
4+
"context"
5+
"errors"
6+
"io"
7+
"time"
8+
9+
// Packages
10+
media "github.com/mutablelogic/go-media"
11+
ffmpeg "github.com/mutablelogic/go-media/pkg/ffmpeg"
12+
13+
// Namespace imports
14+
. "github.com/djthorpe/go-errors"
15+
)
16+
17+
type Segmenter struct {
18+
ts time.Duration
19+
sample_rate int
20+
n int
21+
buf []float32
22+
reader *ffmpeg.Reader
23+
}
24+
25+
// SegmentFunc is a callback function which is called when a segment is ready
26+
// to be processed. The first argument is the timestamp of the segment.
27+
type SegmentFunc func(time.Duration, []float32)
28+
29+
//////////////////////////////////////////////////////////////////////////////
30+
// LIFECYCLE
31+
32+
// Create a new segmenter for "NumSamples" with a reader r
33+
// If NumSamples is zero then no segmenting is performed
34+
func NewSegmenter(r io.Reader, dur time.Duration, sample_rate int) (*Segmenter, error) {
35+
segmenter := new(Segmenter)
36+
37+
// Check arguments
38+
if dur < 0 || sample_rate <= 0 {
39+
return nil, ErrBadParameter.With("invalid duration or sample rate arguments")
40+
} else {
41+
segmenter.sample_rate = sample_rate
42+
}
43+
44+
// Sample buffer is duration * sample rate
45+
if dur > 0 {
46+
segmenter.n = int(dur.Seconds()) * sample_rate
47+
segmenter.buf = make([]float32, 0, int(dur.Seconds())*sample_rate)
48+
}
49+
50+
// Open the file
51+
media, err := ffmpeg.NewReader(r)
52+
if err != nil {
53+
return nil, err
54+
} else {
55+
segmenter.reader = media
56+
}
57+
58+
return segmenter, nil
59+
}
60+
61+
// Close the segmenter
62+
func (s *Segmenter) Close() error {
63+
var result error
64+
65+
if s.reader != nil {
66+
result = errors.Join(result, s.reader.Close())
67+
}
68+
s.reader = nil
69+
s.buf = nil
70+
71+
// Return any errors
72+
return result
73+
}
74+
75+
//////////////////////////////////////////////////////////////////////////////
76+
// PUBLIC METHODS
77+
78+
// TODO: segments are output through a callback, with the samples and a timestamp
79+
// TODO: we could do some basic silence and voice detection to segment to ensure
80+
// we don't overtax the CPU/GPU with silence and non-speech
81+
func (s *Segmenter) Decode(ctx context.Context, fn SegmentFunc) error {
82+
// Check input parameters
83+
if fn == nil {
84+
return ErrBadParameter.With("SegmentFunc is nil")
85+
}
86+
87+
// Map function chooses the best audio stream
88+
mapFunc := func(stream int, params *ffmpeg.Par) (*ffmpeg.Par, error) {
89+
if stream == s.reader.BestStream(media.AUDIO) {
90+
return ffmpeg.NewAudioPar("flt", "mono", s.sample_rate)
91+
}
92+
// Ignore no-audio streams
93+
return nil, nil
94+
}
95+
96+
// Decode samples and segment
97+
if err := s.reader.Decode(ctx, mapFunc, func(stream int, frame *ffmpeg.Frame) error {
98+
// We get null frames sometimes, ignore them
99+
if frame == nil {
100+
return nil
101+
}
102+
103+
// Append float32 samples from plane 0 to buffer
104+
s.buf = append(s.buf, frame.Float32(0)...)
105+
106+
// n != 0 and len(buf) >= n we have a segment to process
107+
if s.n != 0 && len(s.buf) >= s.n {
108+
fn(s.ts, s.buf)
109+
// Clear the buffer
110+
s.buf = s.buf[:0]
111+
// Increment the timestamp
112+
s.ts += time.Duration(float64(s.n)/float64(s.sample_rate)) * time.Second
113+
}
114+
115+
// Continue processing
116+
return nil
117+
}); err != nil {
118+
return err
119+
}
120+
121+
// Output any remaining samples
122+
if len(s.buf) > 0 {
123+
fn(s.ts, s.buf)
124+
}
125+
126+
// Return success
127+
return nil
128+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package segmenter_test
2+
3+
import (
4+
"context"
5+
"os"
6+
"testing"
7+
"time"
8+
9+
// Packages
10+
segmenter "github.com/mutablelogic/go-whisper/pkg/whisper/segmenter"
11+
assert "github.com/stretchr/testify/assert"
12+
)
13+
14+
const SAMPLE = "../../../samples/OlivierL.wav"
15+
16+
func Test_segmenter_001(t *testing.T) {
17+
assert := assert.New(t)
18+
19+
f, err := os.Open(SAMPLE)
20+
if !assert.NoError(err) {
21+
t.SkipNow()
22+
}
23+
segmenter, err := segmenter.NewSegmenter(f, time.Second, 16000)
24+
if !assert.NoError(err) {
25+
t.SkipNow()
26+
}
27+
defer segmenter.Close()
28+
29+
assert.NoError(segmenter.Decode(context.Background(), func(ts time.Duration, buf []float32) {
30+
t.Log(ts, len(buf))
31+
}))
32+
}
33+
34+
func Test_segmenter_002(t *testing.T) {
35+
assert := assert.New(t)
36+
37+
f, err := os.Open(SAMPLE)
38+
if !assert.NoError(err) {
39+
t.SkipNow()
40+
}
41+
42+
// No segmentation, just output the audio
43+
segmenter, err := segmenter.NewSegmenter(f, 0, 16000)
44+
if !assert.NoError(err) {
45+
t.SkipNow()
46+
}
47+
defer segmenter.Close()
48+
49+
assert.NoError(segmenter.Decode(context.Background(), func(ts time.Duration, buf []float32) {
50+
t.Log(ts, len(buf))
51+
}))
52+
}

0 commit comments

Comments
 (0)