Skip to content

Commit 8800125

Browse files
committed
Updated audio
1 parent 6b45998 commit 8800125

File tree

9 files changed

+219
-32
lines changed

9 files changed

+219
-32
lines changed

7283d94c6a7e5bc91e7875ccf51a96d3.m2a

25.8 KB
Binary file not shown.

README.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ flag. For example, to have the model generate a caption for the image in the fil
443443
the following command:
444444

445445
```bash
446-
llm complete --model gpt-4o --file picture.png "Explain this image"
446+
llm complete --file picture.png "Explain this image"
447447
```
448448

449449
### Generate an image
@@ -456,7 +456,22 @@ the following command:
456456
llm complete --model dall-e-3 --format image "A picture of a cat"
457457
```
458458

459-
It will write the file in the current working directory.
459+
Flags `--size`, `--quality` and `--style` can be used to specify the image parameters. It will write the image
460+
file in the current working directory.
461+
462+
### Convert text to speech
463+
464+
To have a model generate speech from text:
465+
466+
```bash
467+
cat book.txt | llm complete --model tts-1 --format mp3 --voice coral
468+
```
469+
470+
It will write the audio file in the current working directory. You can currently choose from
471+
the following audio formats and voices:
472+
473+
* Formats: `--format mp3`, `--format opus`, `--format aac`, `--format flac`, `--format wav`, `--format pcm`
474+
* Voices: `--voice alloy`, `--voice ash`, `--voice coral`, `--voice echo`, `--voice fable`, `--voice onyx`, `--voice nova`, `--voice sage`, `--voice shimmer`
460475

461476
## Contributing & Distribution
462477

attachment.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ type AttachmentMeta struct {
2323
ExpiresAt uint64 `json:"expires_at,omitempty"`
2424
Caption string `json:"transcript,omitempty"`
2525
Data []byte `json:"data"`
26+
Type string `json:"type"`
2627
}
2728

2829
// OpenAI image metadata
@@ -57,19 +58,23 @@ func NewAttachmentWithImage(image *ImageMeta) *Attachment {
5758

5859
// ReadAttachment returns an attachment from a reader object.
5960
// It is the responsibility of the caller to close the reader.
60-
func ReadAttachment(r io.Reader) (*Attachment, error) {
61-
var filename string
61+
func ReadAttachment(r io.Reader, mimetype ...string) (*Attachment, error) {
62+
var filename, typ string
6263
data, err := io.ReadAll(r)
6364
if err != nil {
6465
return nil, err
6566
}
6667
if f, ok := r.(*os.File); ok {
6768
filename = f.Name()
6869
}
70+
if len(mimetype) > 0 {
71+
typ = mimetype[0]
72+
}
6973
return &Attachment{
7074
meta: &AttachmentMeta{
7175
Filename: filename,
7276
Data: data,
77+
Type: typ,
7378
},
7479
}, nil
7580
}
@@ -176,6 +181,11 @@ func (a *Attachment) Caption() string {
176181
// on the data and/or filename extension. Returns an empty string if
177182
// there is no data or filename
178183
func (a *Attachment) Type() string {
184+
// If there's a mimetype set, use this
185+
if a.meta != nil && a.meta.Type != "" {
186+
return a.meta.Type
187+
}
188+
179189
// If there's no data or filename, return empty
180190
if len(a.Data()) == 0 && a.Filename() == "" {
181191
return ""
@@ -191,9 +201,9 @@ func (a *Attachment) Type() string {
191201
}
192202

193203
// Mimetype based on filename
194-
if a.Filename() != "" {
204+
if a.meta != nil && a.meta.Filename != "" {
195205
// Detect mimetype from extension
196-
mimetype = mime.TypeByExtension(filepath.Ext(a.Filename()))
206+
mimetype = mime.TypeByExtension(filepath.Ext(a.meta.Filename))
197207
}
198208

199209
// Return the default mimetype

cmd/llm/complete.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@ package main
22

33
import (
44
"context"
5+
"encoding/json"
56
"fmt"
67
"io"
78
"os"
89
"strings"
910

1011
// Packages
1112
llm "github.com/mutablelogic/go-llm"
12-
"github.com/mutablelogic/go-llm/pkg/openai"
13+
openai "github.com/mutablelogic/go-llm/pkg/openai"
1314
)
1415

1516
////////////////////////////////////////////////////////////////////////////////
@@ -89,14 +90,23 @@ func (cmd *CompleteCmd) Run(globals *Globals) error {
8990
completion, err := model.Completion(ctx, string(prompt), opts...)
9091
if err != nil {
9192
return err
93+
} else if completion == nil {
94+
return llm.ErrInternalServerError.Withf("No completion returned")
9295
}
9396

9497
// Print the completion - text
9598
if cmd.NoStream {
9699
fmt.Println(completion.Text(0))
100+
} else {
101+
fmt.Println()
97102
}
98103

99104
// Output completion attachments
105+
type Result struct {
106+
Filename string `json:"filename"`
107+
Caption string `json:"caption,omitempty"`
108+
}
109+
var out []Result
100110
for i := 0; i < completion.Num(); i++ {
101111
attachment := completion.Attachment(i)
102112
if attachment == nil {
@@ -113,9 +123,21 @@ func (cmd *CompleteCmd) Run(globals *Globals) error {
113123

114124
if _, err := f.Write(attachment.Data()); err != nil {
115125
return err
116-
} else {
117-
fmt.Printf("%q written to %s\n", attachment.Caption(), attachment.Filename())
118126
}
127+
128+
out = append(out, Result{
129+
Filename: attachment.Filename(),
130+
Caption: attachment.Caption(),
131+
})
132+
}
133+
134+
// Print the completion - attachments
135+
if len(out) > 0 {
136+
data, err := json.MarshalIndent(out, "", " ")
137+
if err != nil {
138+
return err
139+
}
140+
fmt.Println(string(data))
119141
}
120142

121143
// Return success

opt_format.go

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import "strings"
88
const (
99
mimeTypeText = "text/plain"
1010
mimeTypeJSON = "application/json"
11-
mimeTypeJpeg = "image/jpeg"
1211
mimeTypeMP3 = "audio/mpeg"
1312
mimeTypeOpus = "audio/opus"
1413
mimeTypeAAC = "audio/aac"
@@ -25,21 +24,20 @@ var (
2524
mimeTypeJSON: "json_object",
2625
"json": "json_object",
2726
"json_object": "json_object",
28-
mimeTypeJpeg: "image",
29-
"jpeg": "image",
3027
"image": "image",
31-
mimeTypeMP3: "mp3",
32-
mimeTypeOpus: "opus",
33-
mimeTypeAAC: "aac",
34-
mimeTypeFLAC: "flac",
35-
mimeTypeWAV: "wav",
36-
mimeTypePCM: "pcm",
37-
"mp3": "mp3",
38-
"opus": "opus",
39-
"aac": "aac",
40-
"flac": "flac",
41-
"wav": "wav",
42-
"pcm": "pcm",
28+
mimeTypeMP3: "audio",
29+
mimeTypeOpus: "audio",
30+
mimeTypeAAC: "audio",
31+
mimeTypeFLAC: "audio",
32+
mimeTypeWAV: "audio",
33+
mimeTypePCM: "audio",
34+
"audio": "audio",
35+
"mp3": "audio",
36+
"opus": "audio",
37+
"aac": "audio",
38+
"flac": "audio",
39+
"wav": "audio",
40+
"pcm": "audio",
4341
}
4442
audioValues = []string{
4543
"mp3", "opus", "aac", "flac", "wav", "pcm",

pkg/openai/audio.go

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,99 @@
11
package openai
2+
3+
import (
4+
"context"
5+
"io"
6+
7+
// Packages
8+
client "github.com/mutablelogic/go-client"
9+
llm "github.com/mutablelogic/go-llm"
10+
)
11+
12+
///////////////////////////////////////////////////////////////////////////////
13+
// PRIVATE METHODS
14+
15+
type reqAudioCompletion struct {
16+
Model string `json:"model"`
17+
Input string `json:"input"`
18+
Voice string `json:"voice"`
19+
Speed float64 `json:"speed,omitempty"`
20+
ResponseFormat string `json:"response_format,omitempty"`
21+
}
22+
23+
type responseAudio struct {
24+
audio *llm.Attachment
25+
}
26+
27+
// Send a completion request with text for text-to-speech
28+
func (model *model) audioCompletion(ctx context.Context, input string, opt *llm.Opts) (llm.Completion, error) {
29+
// Request
30+
req, err := client.NewJSONRequest(reqAudioCompletion{
31+
Model: model.Name(),
32+
Input: input,
33+
Voice: optVoice(opt),
34+
Speed: optSpeed(opt),
35+
ResponseFormat: optAudioFormat(opt),
36+
})
37+
if err != nil {
38+
return nil, err
39+
}
40+
41+
// Response
42+
var response responseAudio
43+
if err := model.DoWithContext(ctx, req, &response, client.OptPath("audio", "speech")); err != nil {
44+
return nil, err
45+
}
46+
47+
return &response, nil
48+
}
49+
50+
func (resp *responseAudio) Unmarshal(mimetype string, r io.Reader) error {
51+
// Unmarshal the response
52+
attachment, err := llm.ReadAttachment(r, mimetype)
53+
if err != nil {
54+
return err
55+
} else {
56+
resp.audio = attachment
57+
}
58+
return nil
59+
}
60+
61+
///////////////////////////////////////////////////////////////////////////////
62+
// COMPLETION
63+
64+
// Return the number of completions
65+
func (r *responseAudio) Num() int {
66+
return 1
67+
}
68+
69+
// Return message for a specific completion
70+
func (r *responseAudio) Choice(index int) llm.Completion {
71+
if index != 0 {
72+
return nil
73+
}
74+
return r
75+
}
76+
77+
// Return the role of the completion
78+
func (r *responseAudio) Role() string {
79+
return "assistant"
80+
}
81+
82+
// Unsupported
83+
func (r *responseAudio) Text(index int) string {
84+
return ""
85+
}
86+
87+
// Return media content for a specific completion
88+
func (r *responseAudio) Attachment(index int) *llm.Attachment {
89+
if index != 0 {
90+
return nil
91+
} else {
92+
return r.audio
93+
}
94+
}
95+
96+
// Unsupported
97+
func (r *responseAudio) ToolCalls(index int) []llm.ToolCall {
98+
return nil
99+
}

pkg/openai/image.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,6 @@ func (model *model) imageCompletion(ctx context.Context, prompt string, opt *llm
6666
return &response, nil
6767
}
6868

69-
// Send a completion request with text for a text-to-speech completion
70-
// TODO
71-
func (model *model) audioCompletion(ctx context.Context, prompt string, opt *llm.Opts) (llm.Completion, error) {
72-
return nil, llm.ErrNotImplemented
73-
}
74-
7569
///////////////////////////////////////////////////////////////////////////////
7670
// PRIVATE METHODS
7771

pkg/openai/opt.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,17 @@ func WithAudio(voice, format string) llm.Opt {
133133
}
134134
}
135135

136+
// Parameters for speech output
137+
func WithAudioSpeed(v float64) llm.Opt {
138+
return func(o *llm.Opts) error {
139+
if v < 0.25 || v > 4.0 {
140+
return llm.ErrBadParameter.With("speed")
141+
}
142+
o.Set("speed", v)
143+
return nil
144+
}
145+
}
146+
136147
// Parameters for image output
137148
func WithSize(v string) llm.Opt {
138149
return func(o *llm.Opts) error {
@@ -264,9 +275,14 @@ func optPrediction(opts *llm.Opts) *Content {
264275
}
265276

266277
func optAudio(opts *llm.Opts) *Audio {
267-
if v, ok := opts.Get("audio").(*Audio); ok {
278+
v, ok := opts.Get("audio").(*Audio)
279+
if ok {
268280
return v
269281
}
282+
if v == nil {
283+
opts.Set("audio", NewAudio("ash", "mp3"))
284+
return optAudio(opts)
285+
}
270286
return nil
271287
}
272288

pkg/openai/opt_audio.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
package openai
22

3-
import "strings"
3+
import (
4+
"strings"
5+
6+
"github.com/mutablelogic/go-llm"
7+
)
48

59
///////////////////////////////////////////////////////////////////////////////
610
// TYPES
@@ -11,6 +15,9 @@ type Audio struct {
1115

1216
// Supported formats: wav, mp3, flac, opus, or pcm16
1317
Format string `json:"format"`
18+
19+
// Return the speed
20+
Speed float64 `json:"speed,omitempty"`
1421
}
1522

1623
///////////////////////////////////////////////////////////////////////////////
@@ -24,3 +31,30 @@ func NewAudio(voice, format string) *Audio {
2431
}
2532
return &Audio{Voice: voice, Format: format}
2633
}
34+
35+
///////////////////////////////////////////////////////////////////////////////
36+
// PRIVATE METHODS
37+
38+
func optVoice(opts *llm.Opts) string {
39+
if audio := optAudio(opts); audio != nil {
40+
return audio.Voice
41+
} else {
42+
return ""
43+
}
44+
}
45+
46+
func optSpeed(opts *llm.Opts) float64 {
47+
if audio := optAudio(opts); audio != nil {
48+
return audio.Speed
49+
} else {
50+
return 1.0
51+
}
52+
}
53+
54+
func optAudioFormat(opts *llm.Opts) string {
55+
if audio := optAudio(opts); audio != nil {
56+
return audio.Format
57+
} else {
58+
return "mp3"
59+
}
60+
}

0 commit comments

Comments
 (0)