Skip to content

Commit 6516b45

Browse files
committed
Update GenAIVision Example
- create NNGenAIVision class - add vision prompt support for Llama model
1 parent 773aaec commit 6516b45

File tree

3 files changed

+310
-185
lines changed

3 files changed

+310
-185
lines changed

Arduino_package/hardware/libraries/NeuralNetwork/examples/GenAIVision/GenAIVision.ino

Lines changed: 22 additions & 185 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,41 @@
11
/*
22
3-
This sketch shows the example of image prompts openAI Vision and Gemini Vision API
3+
This sketch shows an example of sending image prompts to vision-capable GenAI APIs.
44
5-
openAI Vision
5+
openAI platform - openAI vision
66
https://platform.openai.com/docs/guides/vision
77
8-
Gemini Vision
8+
Google AI Studio - Gemini vision
99
https://ai.google.dev/gemini-api/docs/vision
1010
11+
GroqCloud - Llama vision
12+
https://console.groq.com/docs/overview
13+
1114
Example Guide: TBD
1215
1316
Credit : ChungYi Fu (Kaohsiung, Taiwan)
1417
1518
*/
1619

17-
String openAI_key = ""; // paste your generated openAI key here
18-
String Gemini_key = ""; // paste your generated Gemini key here
20+
String openAI_key = ""; // paste your generated openAI API key here
21+
String Gemini_key = ""; // paste your generated Gemini API key here
22+
String Llama_key = ""; // paste your generated GroqCloud API key (used for Llama vision) here
1923
char wifi_ssid[] = "Network_SSID5"; // change to your network SSID
2024
char wifi_pass[] = "Password"; // change to your network password
2125

2226
#include <WiFi.h>
23-
WiFiSSLClient client;
24-
#include <ArduinoJson.h>
25-
#include "Base64.h"
27+
#include "NNGenAIVision.h"
2628
#include "VideoStream.h"
29+
WiFiSSLClient client;
30+
NNGenAIVision llm;
2731
VideoSetting config(768, 768, CAM_FPS, VIDEO_JPEG, 1);
2832
#define CHANNEL 0
2933

3034
uint32_t img_addr = 0;
3135
uint32_t img_len = 0;
3236

37+
String prompt = "Please describe the image, and if there is a text, please summarize the content";
38+
3339
void initWiFi()
3440
{
3541
for (int i = 0; i < 2; i++) {
@@ -59,175 +65,6 @@ void initWiFi()
5965
}
6066
}
6167

62-
String SendStillToOpenaiVision(String key, String message, bool capture)
63-
{
64-
const char *myDomain = "api.openai.com";
65-
String getResponse = "", Feedback = "";
66-
Serial.println("Connect to " + String(myDomain));
67-
if (client.connect(myDomain, 443)) {
68-
Serial.println("Connection successful");
69-
if (capture) {
70-
Camera.getImage(0, &img_addr, &img_len);
71-
}
72-
uint8_t *fbBuf = (uint8_t *)img_addr;
73-
size_t fbLen = img_len;
74-
75-
char *input = (char *)fbBuf;
76-
char output[base64_enc_len(3)];
77-
String imageFile = "data:image/jpeg;base64,";
78-
for (int i = 0; i < fbLen; i++) {
79-
base64_encode(output, (input++), 3);
80-
if (i % 3 == 0) {
81-
imageFile += String(output);
82-
}
83-
}
84-
String Data = "{\"model\": \"gpt-4o-mini\", \"messages\": [{\"role\": \"user\",\"content\": [{ \"type\": \"text\", \"text\": \"" + message + "\"},{\"type\": \"image_url\", \"image_url\": {\"url\": \"" + imageFile + "\"}}]}]}";
85-
86-
client.println("POST /v1/chat/completions HTTP/1.1");
87-
client.println("Host: " + String(myDomain));
88-
client.println("Authorization: Bearer " + key);
89-
client.println("Content-Type: application/json; charset=utf-8");
90-
client.println("Content-Length: " + String(Data.length()));
91-
client.println("Connection: close");
92-
client.println();
93-
94-
int Index;
95-
for (Index = 0; Index < Data.length(); Index = Index + 1024) {
96-
client.print(Data.substring(Index, Index + 1024));
97-
}
98-
99-
int waitTime = 10000;
100-
long startTime = millis();
101-
boolean state = false;
102-
boolean markState = false;
103-
while ((startTime + waitTime) > millis()) {
104-
Serial.print(".");
105-
delay(100);
106-
while (client.available()) {
107-
char c = client.read();
108-
if (String(c) == "{") {
109-
markState = true;
110-
}
111-
if (state == true && markState == true) {
112-
Feedback += String(c);
113-
}
114-
if (c == '\n') {
115-
if (getResponse.length() == 0) {
116-
state = true;
117-
}
118-
getResponse = "";
119-
} else if (c != '\r') {
120-
getResponse += String(c);
121-
}
122-
startTime = millis();
123-
}
124-
if (Feedback.length() > 0) {
125-
break;
126-
}
127-
}
128-
Serial.println();
129-
client.stop();
130-
131-
JsonObject obj;
132-
DynamicJsonDocument doc(4096);
133-
deserializeJson(doc, Feedback);
134-
obj = doc.as<JsonObject>();
135-
getResponse = obj["choices"][0]["message"]["content"].as<String>();
136-
if (getResponse == "null") {
137-
getResponse = obj["error"]["message"].as<String>();
138-
}
139-
} else {
140-
getResponse = "Connected to " + String(myDomain) + " failed.";
141-
Serial.println("Connected to " + String(myDomain) + " failed.");
142-
}
143-
144-
return getResponse;
145-
}
146-
147-
String SendStillToGeminiVision(String key, String message, bool capture)
148-
{
149-
const char *myDomain = "generativelanguage.googleapis.com";
150-
String getResponse = "", Feedback = "";
151-
Serial.println("Connect to " + String(myDomain));
152-
if (client.connect(myDomain, 443)) {
153-
Serial.println("Connection successful");
154-
if (capture) {
155-
Camera.getImage(0, &img_addr, &img_len);
156-
}
157-
uint8_t *fbBuf = (uint8_t *)img_addr;
158-
size_t fbLen = img_len;
159-
160-
char *input = (char *)fbBuf;
161-
char output[base64_enc_len(3)];
162-
String imageFile = "";
163-
for (int i = 0; i < fbLen; i++) {
164-
base64_encode(output, (input++), 3);
165-
if (i % 3 == 0) {
166-
imageFile += String(output);
167-
}
168-
}
169-
String Data = "{\"contents\": [{\"parts\": [{\"text\": \"" + message + "\"}, {\"inline_data\": {\"mime_type\":\"image/jpeg\",\"data\":\"" + imageFile + "\"}}]}]}";
170-
171-
client.println("POST /v1beta/models/gemini-1.5-flash-latest:generateContent?key=" + key + " HTTP/1.1");
172-
client.println("Host: " + String(myDomain));
173-
client.println("Content-Type: application/json; charset=utf-8");
174-
client.println("Content-Length: " + String(Data.length()));
175-
client.println("Connection: close");
176-
client.println();
177-
178-
int Index;
179-
for (Index = 0; Index < Data.length(); Index = Index + 1024) {
180-
client.print(Data.substring(Index, Index + 1024));
181-
}
182-
183-
int waitTime = 10000;
184-
long startTime = millis();
185-
boolean state = false;
186-
boolean markState = false;
187-
while ((startTime + waitTime) > millis()) {
188-
Serial.print(".");
189-
delay(100);
190-
while (client.available()) {
191-
char c = client.read();
192-
if (String(c) == "{") {
193-
markState = true;
194-
}
195-
if (state == true && markState == true) {
196-
Feedback += String(c);
197-
}
198-
if (c == '\n') {
199-
if (getResponse.length() == 0) {
200-
state = true;
201-
}
202-
getResponse = "";
203-
} else if (c != '\r') {
204-
getResponse += String(c);
205-
}
206-
startTime = millis();
207-
}
208-
if (Feedback.length() > 0) {
209-
break;
210-
}
211-
}
212-
Serial.println();
213-
client.stop();
214-
215-
JsonObject obj;
216-
DynamicJsonDocument doc(4096);
217-
deserializeJson(doc, Feedback);
218-
obj = doc.as<JsonObject>();
219-
getResponse = obj["candidates"][0]["content"]["parts"][0]["text"].as<String>();
220-
if (getResponse == "null") {
221-
getResponse = obj["error"]["message"].as<String>();
222-
}
223-
} else {
224-
getResponse = "Connected to " + String(myDomain) + " failed.";
225-
Serial.println("Connected to " + String(myDomain) + " failed.");
226-
}
227-
228-
return getResponse;
229-
}
230-
23168
void setup()
23269
{
23370
Serial.begin(115200);
@@ -244,15 +81,15 @@ void setup()
24481

24582
// Vision prompt using same taken image
24683
Camera.getImage(0, &img_addr, &img_len);
247-
Serial.println((SendStillToOpenaiVision(openAI_key, "Please describe the image, and if there is text, please summarize the content", 0)));
248-
Serial.println((SendStillToGeminiVision(Gemini_key, "Please describe the image, and if there is text, please summarize the content", 0)));
24984

250-
/*
251-
// Vision prompt using different image
252-
Serial.println((SendStillToOpenaiVision(openAI_key, "Please describe the image, and if there is text, please summarize the content", 1)));
253-
delay(5000);
254-
Serial.println((SendStillToGeminiVision(Gemini_key, "Please describe the image, and if there is text, please summarize the content", 1)));
255-
*/
85+
// openAI vision prompt
86+
// Serial.println((llm.openaivision(openAI_key, prompt, img_addr, img_len, client)));
87+
88+
// Gemini vision prompt
89+
// Serial.println((llm.geminivision(Gemini_key, prompt, img_addr, img_len, client)));
90+
91+
// Llama vision prompt
92+
Serial.println((llm.llamavision(Llama_key, prompt, img_addr, img_len, client)));
25693
}
25794

25895
void loop()

0 commit comments

Comments
 (0)