Skip to content

Commit a4e1f09

Browse files
committed
Merge branch 'v1' of github.com:mutablelogic/go-whisper into stream
2 parents b83b27c + 9a7843c commit a4e1f09

File tree

18 files changed

+283
-60
lines changed

18 files changed

+283
-60
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ docker: docker-dep submodule
5656
--build-arg OS=${OS} \
5757
--build-arg SOURCE=${BUILD_MODULE} \
5858
--build-arg VERSION=${VERSION} \
59-
-f etc/Dockerfile.${ARCH} .
59+
-f etc/Dockerfile.${OS}-${ARCH} .
6060

6161
# Test whisper bindings
6262
test: generate libwhisper libggml

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ curl -F model=ggml-medium-q5_0 -F file=@samples/jfk.wav localhost:8080/v1/audio/
6262
To translate a media file into a different language, you can use the following command:
6363

6464
```bash
65-
curl -F model=ggml-medium-q5_0 -F file=@samples/ge-podcast.wav -F language=en localhost:8080/v1/audio/translations\?stream=true
65+
curl -F model=ggml-medium-q5_0 -F file=@samples/de-podcast.wav -F language=en localhost:8080/v1/audio/translations\?stream=true
6666
```
6767

6868
There's more information on the API [here](doc/API.md).

cmd/server/flags.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ func NewFlags(name string, args []string) (*Flags, error) {
2020
FlagSet: flag.NewFlagSet(name, flag.ContinueOnError),
2121
}
2222
flags.endpoint = flags.String("endpoint", "/v1", "HTTP endpoint")
23-
flags.listen = flags.String("listen", ":8080", "HTTP Listen address")
23+
flags.listen = flags.String("listen", "127.0.0.1:8080", "HTTP Listen address")
2424
flags.dir = flags.String("dir", "${WHISPER_DATA}", "Model data directory")
25-
flags.debug = flags.Bool("debug", false, "Display debug information")
25+
flags.debug = flags.Bool("debug", false, "Output additional debug information")
2626

2727
// Parse flags and return any error
2828
return flags, flags.Parse(args)

cmd/server/main.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
httpserver "github.com/mutablelogic/go-server/pkg/httpserver"
1616
whisper "github.com/mutablelogic/go-whisper"
1717
api "github.com/mutablelogic/go-whisper/pkg/whisper/api"
18+
version "github.com/mutablelogic/go-whisper/pkg/whisper/version"
1819
)
1920

2021
func main() {
@@ -45,6 +46,16 @@ func main() {
4546
os.Exit(-1)
4647
}
4748

49+
// Print version
50+
if version.GitSource != "" {
51+
log.Println(name, version.GitSource)
52+
} else {
53+
log.Println(name)
54+
}
55+
if version.GitTag != "" {
56+
log.Println("Version:", version.GitTag)
57+
}
58+
4859
// Create a whisper service
4960
log.Println("Storing models at", dir)
5061
opts := []whisper.Opt{
@@ -77,7 +88,7 @@ func main() {
7788
api.RegisterEndpoints(flags.Endpoint(), mux, whisper)
7889

7990
// Create a new HTTP server
80-
log.Println("List address", flags.Listen())
91+
log.Println("Listen address", flags.Listen())
8192
server, err := httpserver.Config{
8293
Listen: flags.Listen(),
8394
Router: mux,

doc/API.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,9 +144,20 @@ event: ok
144144

145145
### Translation
146146

147-
This is the same as transcription (above) except that the `language` parameter is not optional, and should be the language to translate the audio into.
147+
This is the same as transcription (above) except that the `language` parameter is always set to 'en', to translate the audio into English.
148148

149149
```html
150150
POST /v1/audio/translations
151151
POST /v1/audio/translations?stream={bool}
152152
```
153+
154+
### Diarization
155+
156+
To diarize an English-language audio file, use the following endpoint:
157+
158+
```html
159+
POST /v1/audio/diarize
160+
POST /v1/audio/diarize?stream={bool}
161+
```
162+
163+
The segments returned include a "speaker_turn" field which indicates that the segment is a new speaker. Diarization requires a separate download of a [diarization model](https://huggingface.co/akashmjn/tinydiarize-whisper.cpp).
File renamed without changes.
File renamed without changes.

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ require (
88
github.com/djthorpe/go-tablewriter v0.0.8
99
github.com/go-audio/wav v1.1.0
1010
github.com/mutablelogic/go-client v1.0.9
11-
github.com/mutablelogic/go-media v1.6.10
11+
github.com/mutablelogic/go-media v1.6.11
1212
github.com/mutablelogic/go-server v1.4.15
1313
github.com/stretchr/testify v1.9.0
1414
)

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6T
2626
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
2727
github.com/mutablelogic/go-client v1.0.9 h1:Eh4sjQOFDldP/L3IizqkcOD3WigZR+u1VaHTUM4ujYw=
2828
github.com/mutablelogic/go-client v1.0.9/go.mod h1:VLyB8j8IBJSK/FXvvqhmq93PRWDKkyLu8R7V2Vudb6A=
29-
github.com/mutablelogic/go-media v1.6.10 h1:LJCNGiAJHFETtATbktTpe38lquUDiLjgeaWXsi1fzI8=
30-
github.com/mutablelogic/go-media v1.6.10/go.mod h1:HulNT0yyH63a3FRlbuzNDakhOypYrmtFVkHEXZjDgAY=
29+
github.com/mutablelogic/go-media v1.6.11 h1:czwRvuWIaqDArZrHv0e7nEIjXJkCbnNWkrQzkTOs96w=
30+
github.com/mutablelogic/go-media v1.6.11/go.mod h1:HulNT0yyH63a3FRlbuzNDakhOypYrmtFVkHEXZjDgAY=
3131
github.com/mutablelogic/go-server v1.4.15 h1:jOvVdDmVK+PGCMBAk5atKHVonnccwy/b4dWwWFAOTso=
3232
github.com/mutablelogic/go-server v1.4.15/go.mod h1:9nenPAohKu8bFoRgwHJh+3s8h0kLFjUAb8KZvT1TQNU=
3333
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

pkg/whisper/api/register.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func RegisterEndpoints(base string, mux *http.ServeMux, whisper *whisper.Whisper
7070

7171
switch r.Method {
7272
case http.MethodPost:
73-
TranscribeFile(r.Context(), whisper, w, r, true)
73+
TranscribeFile(r.Context(), whisper, w, r, Translate)
7474
default:
7575
httpresponse.Error(w, http.StatusMethodNotAllowed)
7676
}
@@ -84,7 +84,21 @@ func RegisterEndpoints(base string, mux *http.ServeMux, whisper *whisper.Whisper
8484

8585
switch r.Method {
8686
case http.MethodPost:
87-
TranscribeFile(r.Context(), whisper, w, r, false)
87+
TranscribeFile(r.Context(), whisper, w, r, Transcribe)
88+
default:
89+
httpresponse.Error(w, http.StatusMethodNotAllowed)
90+
}
91+
})
92+
93+
// Diarize: POST /v1/audio/diarize
94+
// Transcribes audio into the input language - language parameter should be set to the source
95+
// language of the audio. Output speaker parts.
96+
mux.HandleFunc(joinPath(base, "audio/diarize"), func(w http.ResponseWriter, r *http.Request) {
97+
defer r.Body.Close()
98+
99+
switch r.Method {
100+
case http.MethodPost:
101+
TranscribeFile(r.Context(), whisper, w, r, Diarize)
88102
default:
89103
httpresponse.Error(w, http.StatusMethodNotAllowed)
90104
}

0 commit comments

Comments
 (0)