Skip to content

[Vertex AI] Add snippets for multi-modal audio and PDF inputs #14750

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
89 changes: 88 additions & 1 deletion FirebaseVertexAI/Tests/Unit/Snippets/MultimodalSnippets.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,28 @@ import XCTest
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
final class MultimodalSnippets: XCTestCase {
let bundle = BundleTestUtil.bundle()
lazy var model = VertexAI.vertexAI().generativeModel(modelName: "gemini-1.5-flash")
lazy var model = VertexAI.vertexAI().generativeModel(modelName: "gemini-2.0-flash")
/// Location of the `animals.mp4` sample video inside the test bundle, looked up on first access.
lazy var videoURL = {
  if let resolved = bundle.url(forResource: "animals", withExtension: "mp4") {
    return resolved
  }
  // The snippets cannot run without this resource, so abort with a descriptive message.
  fatalError("Video file animals.mp4 not found in Resources.")
}()

/// Location of the `hello-world.mp3` sample audio inside the test bundle, looked up on first
/// access.
lazy var audioURL = {
  if let resolved = bundle.url(forResource: "hello-world", withExtension: "mp3") {
    return resolved
  }
  // The snippets cannot run without this resource, so abort with a descriptive message.
  fatalError("Audio file hello-world.mp3 not found in Resources.")
}()

/// Location of the `gemini-report.pdf` sample document inside the test bundle, looked up on first
/// access.
lazy var pdfURL = {
  if let resolved = bundle.url(forResource: "gemini-report", withExtension: "pdf") {
    return resolved
  }
  // The snippets cannot run without this resource, so abort with a descriptive message.
  fatalError("PDF file gemini-report.pdf not found in Resources.")
}()

/// Configures the default `FirebaseApp` used by the snippets before each test runs.
///
/// - Throws: Any error raised by `FirebaseApp.configureDefaultAppForSnippets()`.
override func setUpWithError() throws {
  try FirebaseApp.configureDefaultAppForSnippets()
}
Expand All @@ -42,6 +56,8 @@ final class MultimodalSnippets: XCTestCase {
await FirebaseApp.deleteDefaultAppForSnippets()
}

// MARK: - Image Input

#if canImport(UIKit)
func testMultimodalOneImageNonStreaming() async throws {
guard let image = UIImage(systemName: "bicycle") else { fatalError() }
Expand Down Expand Up @@ -98,6 +114,8 @@ final class MultimodalSnippets: XCTestCase {
}
#endif // canImport(UIKit)

// MARK: - Video Input

func testMultimodalVideoNonStreaming() async throws {
// Provide the video as `Data` with the appropriate MIME type
let video = try InlineDataPart(data: Data(contentsOf: videoURL), mimeType: "video/mp4")
Expand Down Expand Up @@ -125,4 +143,73 @@ final class MultimodalSnippets: XCTestCase {
}
}
}

// MARK: - Audio Input

/// Sends the sample audio file together with a transcription prompt in a single
/// `generateContent` request and prints the text of the response.
func testMultiModalAudioNonStreaming() async throws {
  // Read the audio file into memory and tag the bytes with their MIME type
  let audioData = try Data(contentsOf: audioURL)
  let audio = InlineDataPart(data: audioData, mimeType: "audio/mpeg")

  // Text instruction that accompanies the audio
  let prompt = "Transcribe what's said in this audio recording."

  // Request the complete response in one call, passing the audio followed by the prompt
  let response = try await model.generateContent(audio, prompt)

  // The response may not contain any text; print a placeholder in that case
  guard let text = response.text else {
    print("No text in response.")
    return
  }
  print(text)
}

/// Sends the sample audio file together with a transcription prompt through
/// `generateContentStream` and prints the text of each chunk as it is received.
func testMultiModalAudioStreaming() async throws {
  // Read the audio file into memory and tag the bytes with their MIME type
  let audioData = try Data(contentsOf: audioURL)
  let audio = InlineDataPart(data: audioData, mimeType: "audio/mpeg")

  // Text instruction that accompanies the audio
  let prompt = "Transcribe what's said in this audio recording."

  // Open a streaming request, passing the audio followed by the prompt
  let contentStream = try model.generateContentStream(audio, prompt)

  // Print every chunk that carries text; chunks without text are skipped
  for try await chunk in contentStream {
    guard let text = chunk.text else { continue }
    print(text)
  }
}

// MARK: - Document Input

/// Sends the sample PDF together with a summarization prompt through `generateContentStream`
/// and prints the text of each chunk as it is received.
func testMultiModalPDFStreaming() async throws {
  // Read the PDF into memory and tag the bytes with their MIME type
  let pdfData = try Data(contentsOf: pdfURL)
  let pdf = InlineDataPart(data: pdfData, mimeType: "application/pdf")

  // Text instruction that accompanies the PDF file
  let prompt = "Summarize the important results in this report."

  // Open a streaming request, passing the PDF file followed by the prompt
  let contentStream = try model.generateContentStream(pdf, prompt)

  // Print every chunk that carries text; chunks without text are skipped
  for try await chunk in contentStream {
    guard let text = chunk.text else { continue }
    print(text)
  }
}

/// Sends the sample PDF together with a summarization prompt in a single `generateContent`
/// request and prints the text of the response.
func testMultiModalPDFNonStreaming() async throws {
  // Read the PDF into memory and tag the bytes with their MIME type
  let pdfData = try Data(contentsOf: pdfURL)
  let pdf = InlineDataPart(data: pdfData, mimeType: "application/pdf")

  // Text instruction that accompanies the PDF file
  let prompt = "Summarize the important results in this report."

  // Request the complete response in one call, passing the PDF file followed by the prompt
  let response = try await model.generateContent(pdf, prompt)

  // The response may not contain any text; print a placeholder in that case
  guard let text = response.text else {
    print("No text in response.")
    return
  }
  print(text)
}
}
Loading