Skip to content

Commit 0e0660f

Browse files
committed
Added a segmenter
1 parent b0f620c commit 0e0660f

File tree

10 files changed

+145
-27
lines changed

10 files changed

+145
-27
lines changed

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ require (
88
github.com/djthorpe/go-tablewriter v0.0.8
99
github.com/go-audio/wav v1.1.0
1010
github.com/mutablelogic/go-client v1.0.9
11+
github.com/mutablelogic/go-media v1.6.7
1112
github.com/mutablelogic/go-server v1.4.13
1213
github.com/stretchr/testify v1.9.0
1314
)
@@ -19,6 +20,7 @@ require (
1920
github.com/mattn/go-runewidth v0.0.16 // indirect
2021
github.com/pmezard/go-difflib v1.0.0 // indirect
2122
github.com/rivo/uniseg v0.4.7 // indirect
23+
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
2224
golang.org/x/sys v0.22.0 // indirect
2325
golang.org/x/term v0.22.0 // indirect
2426
gopkg.in/yaml.v3 v3.0.1 // indirect

go.sum

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
88
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
99
github.com/djthorpe/go-errors v1.0.3 h1:GZeMPkC1mx2vteXLI/gvxZS0Ee9zxzwD1mcYyKU5jD0=
1010
github.com/djthorpe/go-errors v1.0.3/go.mod h1:HtfrZnMd6HsX75Mtbv9Qcnn0BqOrrFArvCaj3RMnZhY=
11-
github.com/djthorpe/go-tablewriter v0.0.7 h1:jnNsJDjjLLCt0OAqB5DzGZN7V3beT1IpNMQ8GcOwZDU=
12-
github.com/djthorpe/go-tablewriter v0.0.7/go.mod h1:NVBvytpL+6fHfCKn0+3lSi15/G3A1HWf2cLNeHg6YBg=
1311
github.com/djthorpe/go-tablewriter v0.0.8 h1:uRhB9XVgK1n9tvVS7KMyxhxxGGtDvqC80toDTpW4DB4=
1412
github.com/djthorpe/go-tablewriter v0.0.8/go.mod h1:NVBvytpL+6fHfCKn0+3lSi15/G3A1HWf2cLNeHg6YBg=
1513
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@@ -18,18 +16,18 @@ github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
1816
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
1917
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
2018
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
19+
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
20+
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
2121
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
2222
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
23-
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
24-
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
23+
github.com/llgcode/draw2d v0.0.0-20240627062922-0ed1ff131195 h1:Vdz2cBh5Fw2MYHWi3ED2PraDQaWEUhNCr1XFHrP4N5A=
24+
github.com/llgcode/draw2d v0.0.0-20240627062922-0ed1ff131195/go.mod h1:1Vk0LDW6jG5cGc2D9RQUxHaE0vYhTvIwSo9mOL6K4/U=
2525
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
2626
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
27-
github.com/mutablelogic/go-client v1.0.8 h1:A3QtP0wdf+W3dE5k7dobwGYqqn4ZpIqRFu+h9vPoy7Y=
28-
github.com/mutablelogic/go-client v1.0.8/go.mod h1:aP9ecBd4R/acJEJSyp81U3mey9W3AHQV/G1XzfcrLx0=
2927
github.com/mutablelogic/go-client v1.0.9 h1:Eh4sjQOFDldP/L3IizqkcOD3WigZR+u1VaHTUM4ujYw=
3028
github.com/mutablelogic/go-client v1.0.9/go.mod h1:VLyB8j8IBJSK/FXvvqhmq93PRWDKkyLu8R7V2Vudb6A=
31-
github.com/mutablelogic/go-server v1.4.11 h1:feI9IyuK6pv7Gi7fbExfU51uDRMfQo0U9wo0vWN2wf8=
32-
github.com/mutablelogic/go-server v1.4.11/go.mod h1:9nenPAohKu8bFoRgwHJh+3s8h0kLFjUAb8KZvT1TQNU=
29+
github.com/mutablelogic/go-media v1.6.7 h1:0hCr89EVJg7xw8ChABb7Cscr0UiZ1+Tl9xDXong0lu0=
30+
github.com/mutablelogic/go-media v1.6.7/go.mod h1:vWKq6QKqUQ+sAwfbU/DgakJGIk2Uq7ozH0qSxhysCkM=
3331
github.com/mutablelogic/go-server v1.4.13 h1:k5LJJ/pCvyiw34UX341vRhliBOS6i7V65U/UICcOJOA=
3432
github.com/mutablelogic/go-server v1.4.13/go.mod h1:9nenPAohKu8bFoRgwHJh+3s8h0kLFjUAb8KZvT1TQNU=
3533
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -41,12 +39,10 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT
4139
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
4240
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
4341
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
44-
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
45-
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
42+
golang.org/x/image v0.18.0 h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ=
43+
golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
4644
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
4745
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
48-
golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
49-
golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
5046
golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk=
5147
golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4=
5248
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=

pkg/whisper/api/transcribe.go

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
// Packages
1010

11+
"github.com/go-audio/wav"
1112
"github.com/mutablelogic/go-server/pkg/httprequest"
1213
"github.com/mutablelogic/go-server/pkg/httpresponse"
1314
"github.com/mutablelogic/go-whisper/pkg/whisper"
@@ -67,28 +68,32 @@ func TranscribeFile(ctx context.Context, service *whisper.Whisper, w http.Respon
6768
defer f.Close()
6869

6970
// Read samples
70-
//buf, err := wav.NewDecoder(f).FullPCMBuffer()
71-
//if err != nil {
72-
// httpresponse.Error(w, http.StatusInternalServerError, err.Error())
73-
// return
74-
//}
71+
buf, err := wav.NewDecoder(f).FullPCMBuffer()
72+
if err != nil {
73+
httpresponse.Error(w, http.StatusInternalServerError, err.Error())
74+
return
75+
}
7576

7677
// Get context for the model, perform transcription
7778
var result *whisper.Transcription
78-
if err := service.WithModel(model, func(ctx *task.Context) error {
79+
if err := service.WithModel(model, func(task *task.Context) error {
80+
// Check model
81+
if translate && !task.CanTranslate() {
82+
return ErrBadParameter.With("model is not multilingual, cannot translate")
83+
}
84+
7985
// Set parameters for transcription & translation, default to english
80-
ctx.SetTranslate(translate)
86+
task.SetTranslate(translate)
8187
if req.Language != nil {
82-
if err := ctx.SetLanguage(*req.Language); err != nil {
88+
if err := task.SetLanguage(*req.Language); err != nil {
8389
return err
8490
}
8591
} else if translate {
86-
if err := ctx.SetLanguage("en"); err != nil {
92+
if err := task.SetLanguage("en"); err != nil {
8793
return err
8894
}
8995
}
90-
91-
// Set prompt and temperature
96+
// TODO: Set prompt and temperature
9297
/*
9398
if req.Prompt != nil {
9499
ctx.SetPrompt(*req.Prompt)
@@ -98,8 +103,7 @@ func TranscribeFile(ctx context.Context, service *whisper.Whisper, w http.Respon
98103
}
99104
*/
100105
// Perform the transcription, return any errors
101-
//result, err = service.Transcribe(ctx, buf.AsFloat32Buffer().Data)
102-
return ErrNotImplemented
106+
return task.Transcribe(ctx, buf.AsFloat32Buffer().Data)
103107
}); err != nil {
104108
httpresponse.Error(w, http.StatusBadRequest, err.Error())
105109
return

pkg/whisper/segmenter/segmenter.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package segmenter
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"io"
8+
9+
// Packages
10+
media "github.com/mutablelogic/go-media"
11+
ffmpeg "github.com/mutablelogic/go-media/pkg/ffmpeg"
12+
)
13+
14+
type Segmenter struct {
15+
reader *ffmpeg.Reader
16+
}
17+
18+
//////////////////////////////////////////////////////////////////////////////
19+
// LIFECYCLE
20+
21+
// Create a new segmenter which reads media from the reader r
22+
// NOTE: "NumSamples" is not yet a parameter, so no segmenting is performed
23+
func NewSegmenter(r io.Reader) (*Segmenter, error) {
24+
segmenter := new(Segmenter)
25+
26+
// Open the file
27+
media, err := ffmpeg.NewReader(r)
28+
if err != nil {
29+
return nil, err
30+
} else {
31+
segmenter.reader = media
32+
}
33+
34+
return segmenter, nil
35+
}
36+
37+
// Close the segmenter
38+
func (s *Segmenter) Close() error {
39+
var result error
40+
41+
if s.reader != nil {
42+
result = errors.Join(result, s.reader.Close())
43+
}
44+
s.reader = nil
45+
46+
// Return any errors
47+
return result
48+
}
49+
50+
//////////////////////////////////////////////////////////////////////////////
51+
// PUBLIC METHODS
52+
53+
func (s *Segmenter) Decode(ctx context.Context) error {
54+
mapFunc := func(stream int, params *ffmpeg.Par) (*ffmpeg.Par, error) {
55+
if stream == s.reader.BestStream(media.AUDIO) {
56+
return ffmpeg.NewAudioPar("flt", "mono", 16000)
57+
}
58+
// Ignore non-audio streams
59+
return nil, nil
60+
}
61+
return s.reader.Decode(ctx, mapFunc, func(stream int, frame *ffmpeg.Frame) error {
62+
// TODO: Append float32 samples to buffer (currently the frame is only printed)
63+
fmt.Println("TODO: Implement Decode", frame)
64+
return nil
65+
})
66+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package segmenter_test
2+
3+
import (
4+
"context"
5+
"os"
6+
"testing"
7+
8+
// Packages
9+
segmenter "github.com/mutablelogic/go-whisper/pkg/whisper/segmenter"
10+
assert "github.com/stretchr/testify/assert"
11+
)
12+
13+
const SAMPLE_EN = "../../../samples/jfk.wav"
14+
const SAMPLE_FR = "../../../samples/OlivierL.wav"
15+
const SAMPLE_DE = "../../../samples/ge-podcast.wav"
16+
17+
func Test_segmenter_001(t *testing.T) {
18+
assert := assert.New(t)
19+
20+
f, err := os.Open(SAMPLE_EN)
21+
if !assert.NoError(err) {
22+
t.SkipNow()
23+
}
24+
segmenter, err := segmenter.NewSegmenter(f)
25+
if !assert.NoError(err) {
26+
t.SkipNow()
27+
}
28+
defer segmenter.Close()
29+
30+
assert.NoError(segmenter.Decode(context.Background()))
31+
}

pkg/whisper/task/context.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ func (task *Context) CopyParams() {
132132
task.params.SetLanguage("auto")
133133
}
134134

135+
// Model is multilingual and can translate
136+
func (task *Context) CanTranslate() bool {
137+
return whisper.Whisper_is_multilingual(task.whisper)
138+
}
139+
135140
// Transcribe samples. The samples should be 16KHz float32 samples in
136141
// a single channel.
137142
// TODO: We need a low-latency streaming version of this function.

pkg/whisper/whisper_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
"testing"
88

99
// Packages
10-
"github.com/go-audio/wav"
10+
wav "github.com/go-audio/wav"
1111
whisper "github.com/mutablelogic/go-whisper/pkg/whisper"
1212
task "github.com/mutablelogic/go-whisper/pkg/whisper/task"
1313
assert "github.com/stretchr/testify/assert"

sys/whisper/generate.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ package whisper
77
#cgo pkg-config: libwhisper
88
#cgo darwin pkg-config: libwhisper-darwin
99
#cgo linux pkg-config: libwhisper-linux
10-
#cgo arm64 pkg-config: cuda-12.2 cublas-12.2 cudart-12.2
1110
*/
1211
import "C"
1312

sys/whisper/generate_cuda.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
//go:build cuda
2+
package whisper
3+
4+
///////////////////////////////////////////////////////////////////////////////
5+
// CGO
6+
7+
/*
8+
#cgo arm64 pkg-config: cuda-12.2 cublas-12.2 cudart-12.2
9+
*/
10+
import "C"

sys/whisper/whisper.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ func Whisper_lang_str_full(id int) string {
6868
return C.GoString(C.whisper_lang_str_full(C.int(id)))
6969
}
7070

71+
// Return true if the model is multilingual
72+
func Whisper_is_multilingual(ctx *Context) bool {
73+
return C.whisper_is_multilingual((*C.struct_whisper_context)(ctx)) != 0
74+
}
75+
7176
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
7277
// Not thread safe for same context
7378
// Uses the specified decoding strategy to obtain the text.

0 commit comments

Comments
 (0)