Skip to content

Commit a76d738

Browse files
authored
Merge pull request #60 from mutablelogic/v1
Added the beginning of the API
2 parents eacdcfb + 3e5ad11 commit a76d738

File tree

9 files changed

+197
-19
lines changed

9 files changed

+197
-19
lines changed

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ ifeq ($(GGML_CUDA),1)
3030
endif
3131

3232
# Targets
33-
all: whisper
33+
all: whisper api
3434

3535
# Generate the pkg-config files
3636
generate: mkdir go-tidy
@@ -42,6 +42,11 @@ whisper: mkdir generate go-tidy libwhisper libggml
4242
@echo "Building whisper"
4343
@PKG_CONFIG_PATH=${ROOT_PATH}/${BUILD_DIR} ${GO} build ${BUILD_FLAGS} -o ${BUILD_DIR}/whisper ./cmd/whisper
4444

45+
# Make api
46+
api: mkdir go-tidy
47+
@echo "Building api"
48+
@${GO} build ${BUILD_FLAGS} -o ${BUILD_DIR}/api ./cmd/api
49+
4550
# Build docker container
4651
docker: docker-dep submodule
4752
@echo build docker image: ${BUILD_TAG} for ${OS}/${ARCH}

cmd/api/main.go

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"os"
6+
"path/filepath"
7+
"syscall"
8+
9+
// Packages
10+
kong "github.com/alecthomas/kong"
11+
tablewriter "github.com/djthorpe/go-tablewriter"
12+
opt "github.com/mutablelogic/go-client"
13+
ctx "github.com/mutablelogic/go-server/pkg/context"
14+
client "github.com/mutablelogic/go-whisper/pkg/client"
15+
)
16+
17+
////////////////////////////////////////////////////////////////////////////////
18+
// TYPES
19+
20+
type Globals struct {
21+
Url string `name:"url" help:"URL of whisper service (can be set from WHISPER_URL env)" default:"${WHISPER_URL}"`
22+
Debug bool `name:"debug" help:"Enable debug output"`
23+
24+
// Writer, service and context
25+
writer *tablewriter.Writer
26+
client *client.Client
27+
ctx context.Context
28+
}
29+
30+
type CLI struct {
31+
Globals
32+
33+
Ping PingCmd `cmd help:"Ping the whisper service"`
34+
}
35+
36+
////////////////////////////////////////////////////////////////////////////////
37+
// GLOBALS
38+
39+
// defaultEndpoint is the service URL used when WHISPER_URL is not set in
// the environment.
const defaultEndpoint = "http://localhost:8080/api/v1"
42+
43+
////////////////////////////////////////////////////////////////////////////////
44+
// MAIN
45+
46+
func main() {
47+
// The name of the executable
48+
name, err := os.Executable()
49+
if err != nil {
50+
panic(err)
51+
} else {
52+
name = filepath.Base(name)
53+
}
54+
55+
// Create a cli parser
56+
cli := CLI{}
57+
cmd := kong.Parse(&cli,
58+
kong.Name(name),
59+
kong.Description("speech transcription and translation service client"),
60+
kong.UsageOnError(),
61+
kong.ConfigureHelp(kong.HelpOptions{Compact: true}),
62+
kong.Vars{
63+
"WHISPER_URL": envOrDefault("WHISPER_URL", defaultEndpoint),
64+
},
65+
)
66+
67+
// Set whisper client options
68+
opts := []opt.ClientOpt{}
69+
if cli.Globals.Debug {
70+
opts = append(opts, opt.OptTrace(os.Stderr, true))
71+
}
72+
73+
// Create a whisper client
74+
client, err := client.New(cli.Globals.Url, opts...)
75+
if err != nil {
76+
cmd.FatalIfErrorf(err)
77+
return
78+
} else {
79+
cli.Globals.client = client
80+
}
81+
82+
// Create a tablewriter object with text output
83+
writer := tablewriter.New(os.Stdout, tablewriter.OptOutputText())
84+
cli.Globals.writer = writer
85+
86+
// Create a context
87+
cli.Globals.ctx = ctx.ContextForSignal(os.Interrupt, syscall.SIGQUIT)
88+
89+
// Run the command
90+
if err := cmd.Run(&cli.Globals); err != nil {
91+
cmd.FatalIfErrorf(err)
92+
}
93+
}
94+
95+
// envOrDefault returns the value of the environment variable name, or def
// when the variable is unset or set to the empty string.
func envOrDefault(name, def string) string {
	value := os.Getenv(name)
	if value == "" {
		return def
	}
	return value
}

cmd/api/ping.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package main
2+
3+
type PingCmd struct{}
4+
5+
func (cmd *PingCmd) Run(ctx *Globals) error {
6+
if err := ctx.client.Ping(ctx.ctx); err != nil {
7+
return err
8+
}
9+
return ctx.writer.Write("OK")
10+
}

cmd/whisper/transcribe.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ func (cmd *TranscribeCmd) Run(ctx *Globals) error {
3636
defer f.Close()
3737

3838
// Create a segmenter - read segments based on requested segment size
39-
segmenter, err := segmenter.New(f, 0, whisper.SampleRate)
39+
// TODO
40+
segmenter, err := segmenter.NewReader(f, 0, whisper.SampleRate)
4041
if err != nil {
4142
return err
4243
}

etc/Dockerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASE_TAG=0.0.10-4-g6421fd2
1+
ARG BASE_TAG=1.0.0
22
ARG BASE_DEV_CONTAINER=ghcr.io/mutablelogic/cuda-dev:${BASE_TAG}
33
ARG BASE_RUN_CONTAINER=ghcr.io/mutablelogic/cuda-rt:${BASE_TAG}
44
ARG CUDA_DOCKER_ARCH=all
@@ -14,7 +14,7 @@ ARG ARCH
1414
ARG OS
1515

1616
RUN apt-get -y update \
17-
&& apt-get -y install software-properties-common curl libgomp1 \
17+
&& apt-get -y install software-properties-common curl \
1818
&& add-apt-repository -y ppa:ubuntuhandbook1/ffmpeg6 \
1919
&& apt-get -y update \
2020
&& apt-get -y install libavformat-dev libavcodec-dev libavdevice-dev libavfilter-dev libavutil-dev libswscale-dev libswresample-dev
@@ -35,12 +35,12 @@ RUN make -j$(nproc)
3535
# Setup runtime container
3636
FROM ${BASE_RUN_CONTAINER} AS runtime
3737
RUN apt-get -y update \
38-
&& apt-get -y install software-properties-common libgomp1 \
38+
&& apt-get -y install software-properties-common \
3939
&& add-apt-repository -y ppa:ubuntuhandbook1/ffmpeg6 \
4040
&& apt-get -y update \
4141
&& apt-get -y install libavformat60 libavcodec60 libavdevice60 libavfilter9 libavutil58 libswscale7 libswresample4
4242
COPY --from=build --chmod=755 /app/build/whisper /usr/local/bin/whisper
43-
COPY --from=build /app/build/whisper /usr/local/bin/whisper
43+
COPY --from=build --chmod=755 /app/build/api /usr/local/bin/api
4444
COPY --chmod=755 etc/entrypoint.sh .
4545

4646
# Entrypoint when running the server

pkg/api/transcribe.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func TranscribeFile(ctx context.Context, service *whisper.Whisper, w http.Respon
101101
defer f.Close()
102102

103103
// Create a segmenter - read segments based on requested segment size
104-
segmenter, err := segmenter.New(f, req.SegmentDur(), whisper.SampleRate)
104+
segmenter, err := segmenter.NewReader(f, req.SegmentDur(), whisper.SampleRate)
105105
if err != nil {
106106
httpresponse.Error(w, http.StatusBadRequest, err.Error())
107107
return

pkg/client/client.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ func New(endpoint string, opts ...client.ClientOpt) (*Client, error) {
3434
}
3535
}
3636

37+
///////////////////////////////////////////////////////////////////////////////
38+
// PING
39+
40+
func (c *Client) Ping(ctx context.Context) error {
41+
return c.DoWithContext(ctx, client.MethodGet, nil, client.OptPath("health"))
42+
}
43+
3744
///////////////////////////////////////////////////////////////////////////////
3845
// MODELS
3946

pkg/segmenter/segmenter.go

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,15 @@ type SegmentFunc func(time.Duration, []float32) error
3131
//////////////////////////////////////////////////////////////////////////////
3232
// LIFECYCLE
3333

34-
// Create a new segmenter for a specific "dur" duration of samples with
35-
// a reader r. If dur is zero then no segmenting is performed, the whole
34+
// Create a new segmenter with a reader r which segments raw audio of 'dur'
35+
// length. If dur is zero then no segmenting is performed, the whole
3636
// audio file is read, which could cause some memory issues.
37+
//
3738
// The sample rate is the number of samples per second.
39+
//
3840
// At the moment, the audio format is auto-detected, but there should be
3941
// a way to specify the audio format.
40-
func New(r io.Reader, dur time.Duration, sample_rate int) (*Segmenter, error) {
42+
func NewReader(r io.Reader, dur time.Duration, sample_rate int) (*Segmenter, error) {
4143
segmenter := new(Segmenter)
4244

4345
// Check arguments
@@ -136,12 +138,3 @@ func (s *Segmenter) Decode(ctx context.Context, fn SegmentFunc) error {
136138
// Return success
137139
return nil
138140
}
139-
140-
// Return the file duration from the file or timestamp
141-
func (s *Segmenter) Duration() time.Duration {
142-
if s.reader != nil {
143-
return s.reader.Duration()
144-
} else {
145-
return s.ts + time.Duration(len(s.buf))*time.Second/time.Duration(s.sample_rate)
146-
}
147-
}

pkg/segmenter/silence.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package segmenter
2+
3+
import (
4+
"math"
5+
"time"
6+
// Packages
7+
)
8+
9+
////////////////////////////////////////////////////////////////////////////////
10+
// TYPES
11+
12+
// silence is a silence detector and audio booster for raw samples.
// Typical values are gain=20, threshold=0.003, timeout=2s.
type silence struct {
	// Gain is the boost applied to every sample, in decibels
	Gain float64
	// Threshold is the frame energy above which a frame counts as sound
	Threshold float64
	// Timeout is how long the energy must stay at or below the threshold
	// before recording stops
	Timeout time.Duration

	// t is the wall-clock time of the most recent frame above the
	// threshold; it is the zero time while in the silent state
	t time.Time
	// r reports whether frames should currently be recorded
	r bool
}
23+
24+
////////////////////////////////////////////////////////////////////////////////
25+
// PUBLIC METHODS
26+
27+
// Increase gain and compute energy of a frame of audio data, return true
28+
// if the frame of data should be recorded, false if it should be ignored
29+
func (s *silence) Process(data []float32) bool {
30+
energy := process(data, float32(math.Pow(10, s.Gain/20.0)))
31+
32+
// Compute the gain
33+
if energy > s.Threshold {
34+
if s.t.IsZero() {
35+
// Transition from silence to recording
36+
s.r = true
37+
}
38+
s.t = time.Now()
39+
} else if !s.t.IsZero() {
40+
if time.Since(s.t) > s.Timeout {
41+
// Transition from recording to silence
42+
s.t = time.Time{}
43+
s.r = false
44+
}
45+
}
46+
return s.r
47+
}
48+
49+
////////////////////////////////////////////////////////////////////////////////
50+
// PRIVATE METHODS
51+
52+
// process multiplies every sample of data by gain (modifying data in place)
// and returns the energy of the boosted frame, computed as the sum of the
// squared samples divided by the square root of the number of samples.
//
// An empty frame has an energy of zero; without this guard the result would
// be 0/0, which is NaN.
func process(data []float32, gain float32) float64 {
	if len(data) == 0 {
		return 0
	}
	energy := float64(0)
	for i := range data {
		data[i] *= gain
		sample := float64(data[i])
		energy += sample * sample
	}
	return energy / math.Sqrt(float64(len(data)))
}

0 commit comments

Comments
 (0)