Skip to content

Commit 7499250

Browse files
committed
Added some more options and optimized output for my tastes
- Add a system prompt (and an option to change it)
- Use a fixed seed and a temperature of 0
- Add the possibility to force a one-sentence answer (if needed)
- Allow optional use of the chat API
1 parent 0d93391 commit 7499250

File tree

1 file changed

+37
-16
lines changed

1 file changed

+37
-16
lines changed

main.go

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ import (
1414
)
1515

1616
type args struct {
17-
Path string `arg:"positional,required" help:"Path to an image or a directory with images"`
18-
DryRun bool `arg:"--dry-run,-n" help:"Don't write captions as .txt (stripping the original extension)"`
19-
StartCaption string `arg:"--start,-s" help:"Start the caption with this (image of Leela the dog,)"`
20-
EndCaption string `arg:"--end,-e" help:"End the caption with this (in the style of 'something')"`
21-
Prompt string `arg:"--prompt,-p" help:"The prompt to use" default:"Please describe the content and style of this image in detail. Answer only with one sentence that is starting with \"A ...\""`
22-
Model string `arg:"--model,-m" help:"The model that will be used (must be a vision model like \"llava\")" default:"x/llama3.2-vision"`
23-
Force bool `arg:"--force,-f" help:"Also process the image if a file with .txt extension exists"`
17+
Path string `arg:"positional,required" help:"Path to an image or a directory with images"`
18+
DryRun bool `arg:"--dry-run,-n" help:"Don't write captions as .txt (stripping the original extension)"`
19+
StartCaption string `arg:"--start,-s" help:"Start the caption with this (image of Leela the dog,)"`
20+
EndCaption string `arg:"--end,-e" help:"End the caption with this (in the style of 'something')"`
21+
Prompt string `arg:"--prompt,-p" help:"The prompt to use" default:"Please describe the content and style of this image in detail. Answer only with one sentence that is starting with \"A ...\""`
22+
ForceOneSentence bool `arg:"--force-one-sentence" help:"Stops generation after the first period (.)"`
23+
UseChatAPI bool `arg:"--use-chat-api,-c" help:"Use the chat API instead of the generate API"`
24+
System string `arg:"--system" help:"The system prompt that will be used (does not work with chat API)" default:"Analyse images in a neutral way. Describe foreground, background and style in detail."`
25+
Model string `arg:"--model,-m" help:"The model that will be used (must be a vision model like \"llava\")" default:"x/llama3.2-vision"`
26+
Force bool `arg:"--force,-f" help:"Also process the image if a file with .txt extension exists"`
2427
}
2528

2629
const appName = "capollama"
@@ -32,17 +35,32 @@ func (args) Version() string {
3235
return appName + " " + fullVersion
3336
}
3437

35-
func GenerateWithImage(ol *api.Client, model, prompt, imagePath string) (string, error) {
38+
func options(args args) map[string]any {
39+
opts := map[string]any{
40+
"num_predict": 200,
41+
"temperature": 0,
42+
"seed": 1,
43+
}
44+
if args.ForceOneSentence {
45+
opts["stop"] = []string{"."}
46+
47+
}
48+
return opts
49+
}
50+
51+
func GenerateWithImage(ol *api.Client, model string, prompt string, options map[string]any, system string, imagePath string) (string, error) {
3652
// First, convert the image to base64
3753
imgData, err := os.ReadFile(imagePath)
3854
if err != nil {
3955
return "", fmt.Errorf("failed to read image: %w", err)
4056
}
4157

4258
req := &api.GenerateRequest{
43-
Model: model,
44-
Prompt: prompt,
45-
Images: []api.ImageData{imgData},
59+
Model: model,
60+
Prompt: prompt,
61+
Images: []api.ImageData{imgData},
62+
Options: options,
63+
System: system,
4664
}
4765

4866
ctx := context.Background()
@@ -59,8 +77,7 @@ func GenerateWithImage(ol *api.Client, model, prompt, imagePath string) (string,
5977
return response.String(), nil
6078
}
6179

62-
/*
63-
func ChatWithImage(ol *api.Client, model, prompt, imagePath string) (string, error) {
80+
func ChatWithImage(ol *api.Client, model string, prompt string, options map[string]any, imagePath string) (string, error) {
6481
// First, convert the image to base64
6582
imageData, err := os.ReadFile(imagePath)
6683
if err != nil {
@@ -77,6 +94,7 @@ func ChatWithImage(ol *api.Client, model, prompt, imagePath string) (string, err
7794
req := &api.ChatRequest{
7895
Model: model,
7996
Messages: []api.Message{msg},
97+
Options: options,
8098
}
8199

82100
var response strings.Builder
@@ -91,7 +109,6 @@ func ChatWithImage(ol *api.Client, model, prompt, imagePath string) (string, err
91109
}
92110
return response.String(), nil
93111
}
94-
*/
95112

96113
// ProcessImages walks through a given path and processes image files
97114
func ProcessImages(path string, processFunc func(imagePath, rootDir string)) error {
@@ -163,8 +180,12 @@ func main() {
163180
}
164181
}
165182

166-
captionText, err := GenerateWithImage(ol, args.Model, args.Prompt, path)
167-
//captionText, err := ChatWithImage(ol, args.Model, args.Prompt, path)
183+
var captionText string
184+
if args.UseChatAPI {
185+
captionText, err = ChatWithImage(ol, args.Model, args.Prompt, options(args), path)
186+
} else {
187+
captionText, err = GenerateWithImage(ol, args.Model, args.Prompt, options(args), args.System, path)
188+
}
168189
if err != nil {
169190
log.Fatalf("Aborting because of %v", err)
170191
}

0 commit comments

Comments
 (0)