Skip to content

Commit 2cd4a17

Browse files
committed
Added text to speech
1 parent e967c37 commit 2cd4a17

File tree

2 files changed

+42
-3
lines changed

2 files changed

+42
-3
lines changed

cmd/api/openai.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ var (
3737
openaiSystemPrompt string
3838
openaiPrompt string
3939
openaiLanguage string
40+
openaiExt string
41+
openaiSpeed float64
4042
)
4143

4244
///////////////////////////////////////////////////////////////////////////////
@@ -59,6 +61,7 @@ func openaiRegister(flags *Flags) {
5961
// TODO flags.Float(openaiName, "temperature", 0, "Sampling temperature to use, between 0.0 and 2.0")
6062
flags.String(openaiName, "prompt", "", "An optional text to guide the model's style or continue a previous audio segment")
6163
//flags.String(openaiName, "language", "", "The language of the input audio in ISO-639-1 format")
64+
flags.Float(openaiName, "speed", 0, "The speed of the generated audio")
6265

6366
// Register commands
6467
flags.Register(Cmd{
@@ -72,6 +75,7 @@ func openaiRegister(flags *Flags) {
7275
{Name: "chat", Call: openaiChat, Description: "Create a chat completion", MinArgs: 1, Syntax: "<text>..."},
7376
{Name: "transcribe", Call: openaiTranscribe, Description: "Transcribes audio into the input language", MinArgs: 1, MaxArgs: 1, Syntax: "<filename>"},
7477
{Name: "translate", Call: openaiTranslate, Description: "Translates audio into English", MinArgs: 1, MaxArgs: 1, Syntax: "<filename>"},
78+
{Name: "say", Call: openaiTextToSpeech, Description: "Text to speech", MinArgs: 2, Syntax: "<voice-id> <text>..."},
7579
},
7680
})
7781
}
@@ -96,6 +100,7 @@ func openaiParse(flags *Flags, opts ...client.ClientOpt) error {
96100
openaiSystemPrompt = flags.GetString("system")
97101
openaiPrompt = flags.GetString("prompt")
98102
openaiLanguage = flags.GetString("language")
103+
openaiExt = flags.GetOutExt()
99104

100105
if temp, err := flags.GetValue("temperature"); err == nil {
101106
t := temp.(float64)
@@ -117,6 +122,9 @@ func openaiParse(flags *Flags, opts ...client.ClientOpt) error {
117122
v := count.(uint64)
118123
openaiCount = &v
119124
}
125+
if speed, err := flags.GetValue("speed"); err == nil {
126+
openaiSpeed = speed.(float64)
127+
}
120128

121129
// Return success
122130
return nil
@@ -274,6 +282,7 @@ func openaiTranscribe(ctx context.Context, w *tablewriter.Writer, args []string)
274282

275283
func openaiTranslate(ctx context.Context, w *tablewriter.Writer, args []string) error {
276284
opts := []openai.Opt{}
285+
277286
if openaiModel != "" {
278287
opts = append(opts, openai.OptModel(openaiModel))
279288
}
@@ -303,3 +312,33 @@ func openaiTranslate(ctx context.Context, w *tablewriter.Writer, args []string)
303312
// Write output
304313
return w.Write(transcription)
305314
}
315+
316+
func openaiTextToSpeech(ctx context.Context, w *tablewriter.Writer, args []string) error {
317+
opts := []openai.Opt{}
318+
319+
// Set response format
320+
if openaiResponseFormat != "" {
321+
opts = append(opts, openai.OptResponseFormat(openaiResponseFormat))
322+
} else if openaiExt != "" {
323+
opts = append(opts, openai.OptResponseFormat(openaiExt))
324+
}
325+
326+
// Set other options
327+
if openaiSpeed > 0 {
328+
opts = append(opts, openai.OptSpeed(float32(openaiSpeed)))
329+
}
330+
331+
// The text to speak
332+
voice := args[0]
333+
text := strings.Join(args[1:], " ")
334+
335+
// Request -> Response
336+
if n, err := openaiClient.TextToSpeech(ctx, w.Output(), voice, text, opts...); err != nil {
337+
return err
338+
} else {
339+
openaiClient.Debugf("wrote %v bytes", n)
340+
}
341+
342+
// Return success
343+
return nil
344+
}

pkg/openai/audio.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func (r reqTranscribe) String() string {
8181

8282
// Creates audio for the given text, outputs to the writer and returns
8383
// the number of bytes written
84-
func (c *Client) Speech(w io.Writer, voice, text string, opts ...Opt) (int64, error) {
84+
func (c *Client) TextToSpeech(ctx context.Context, w io.Writer, voice, text string, opts ...Opt) (int64, error) {
8585
var request reqSpeech
8686
var response respSpeech
8787

@@ -101,11 +101,11 @@ func (c *Client) Speech(w io.Writer, voice, text string, opts ...Opt) (int64, er
101101
// Make a response object, write the data
102102
if payload, err := client.NewJSONRequest(request); err != nil {
103103
return 0, err
104-
} else if err := c.Do(payload, &response, client.OptPath("audio/speech")); err != nil {
104+
} else if err := c.DoWithContext(ctx, payload, &response, client.OptPath("audio/speech")); err != nil {
105105
return 0, err
106106
}
107107

108-
// Return the mimetype of the response
108+
// Return the number of bytes written
109109
return response.bytes, nil
110110
}
111111

0 commit comments

Comments
 (0)