Gemini video tools (#3012)

Mustafa-Esoofally · ysolanky · dirkbrnd · web-flow · commit 300bb0d45cc3 · 2025-05-05T20:38:21.000+02:00
## Summary

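Adds a `generate_video` tool to `GeminiTools` (default model
`veo-2.0-generate-001`) and lets `VideoArtifact` carry inline `content`
and a `mime_type`, making its `url` optional. `GeminiTools` gains
`vertexai`, `project_id` and `location` parameters because video
generation requires Vertex AI mode. Also adds a cookbook example and unit
tests for `generate_video`, plus formatting-only cleanups in unrelated
files.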

(If applicable, issue number: #____)

## Type of change

- [ ] Bug fix
- [x] New feature
- [ ] Breaking change
- [x] Improvement
- [ ] Model update
- [ ] Other:

---

## Checklist

- [x] Code complies with style guidelines
- [x] Ran format/validation scripts (`./scripts/format.sh` and
`./scripts/validate.sh`)
- [x] Self-review completed
- [x] Documentation updated (comments, docstrings)
- [x] Examples and guides: Relevant cookbook examples have been included
or updated (if applicable)
- [x] Tested in clean environment
- [ ] Tests added/updated (if applicable)

---

## Additional Notes

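Video generation only works in Vertex AI mode: create the toolkit with
`GeminiTools(vertexai=True)` and set the `GOOGLE_CLOUD_PROJECT` and
`GOOGLE_CLOUD_LOCATION` environment variables (or pass `project_id` and
`location` explicitly).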

---------

Co-authored-by: Yash Pratap Solanky <101447028+ysolanky@users.noreply.github.com>
Co-authored-by: Dirk Brand <dirkbrnd@gmail.com>
diff --git a/cookbook/storage/mongodb_storage/mongodb_storage_for_team.py b/cookbook/storage/mongodb_storage/mongodb_storage_for_team.py
@@ -57,7 +57,7 @@ class Article(BaseModel):
     markdown=True,
     debug_mode=True,
     show_members_responses=True,
-    add_member_tools_to_system_message=False
+    add_member_tools_to_system_message=False,
 )
 
 hn_team.print_response("Write an article about the top 2 stories on hackernews")
diff --git a/cookbook/tools/models/__init__.py b/cookbook/tools/models/__init__.py
diff --git a/cookbook/tools/models/gemini_video_generation.py b/cookbook/tools/models/gemini_video_generation.py
@@ -0,0 +1,33 @@
+"""🔧 Example: Using the GeminiTools Toolkit for Video Generation
+
+An Agent using the Gemini video generation tool.
+
+Video generation only works with Vertex AI.
+Make sure you have set the GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables.
+
+Example prompts to try:
+- "Generate a 5-second video of a kitten playing a piano"
+- "Create a short looping animation of a neon city skyline at dusk"
+
+Run `pip install google-genai agno` to install the necessary dependencies.
+"""
+
+from agno.agent import Agent
+from agno.models.openai import OpenAIChat
+from agno.tools.models.gemini import GeminiTools
+from agno.utils.media import save_base64_data
+
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[GeminiTools(vertexai=True)],  # Video Generation only works on VertexAI mode
+    show_tool_calls=True,
+    debug_mode=True,
+)
+
+agent.print_response(
+    "create a video of a cat driving at top speed",
+)
+response = agent.run_response
+if response.videos:
+    for video in response.videos:
+        save_base64_data(video.content, f"tmp/cat_driving_{video.id}.mp4")
diff --git a/libs/agno/agno/document/reader/firecrawl_reader.py b/libs/agno/agno/document/reader/firecrawl_reader.py
@@ -11,19 +11,26 @@
 except ImportError:
     raise ImportError("The `firecrawl` package is not installed. Please install it via `pip install firecrawl-py`.")
 
+
 @dataclass
 class FirecrawlReader(Reader):
     api_key: Optional[str] = None
     params: Optional[Dict] = None
     mode: Literal["scrape", "crawl"] = "scrape"
-    
-    def __init__(self, api_key: Optional[str] = None, params: Optional[Dict] = None, mode: Literal["scrape", "crawl"] = "scrape", *args, **kwargs) -> None:
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        params: Optional[Dict] = None,
+        mode: Literal["scrape", "crawl"] = "scrape",
+        *args,
+        **kwargs,
+    ) -> None:
         super().__init__(*args, **kwargs)
         self.api_key = api_key
         self.params = params
         self.mode = mode
 
-
     def scrape(self, url: str) -> List[Document]:
         """
         Scrapes a website and returns a list of documents.
diff --git a/libs/agno/agno/media.py b/libs/agno/agno/media.py
@@ -11,7 +11,9 @@ class Media(BaseModel):
 
 
 class VideoArtifact(Media):
-    url: str  # Remote location for file
+    url: Optional[str] = None  # Remote location for file (if no inline content)
+    content: Optional[Union[str, bytes]] = None  # type: ignore
+    mime_type: Optional[str] = None  # MIME type of the video content
     eta: Optional[str] = None
     length: Optional[str] = None
 
diff --git a/libs/agno/agno/models/meta/llama.py b/libs/agno/agno/models/meta/llama.py
@@ -152,7 +152,7 @@ def request_kwargs(self) -> Dict[str, Any]:
         # Add tools
         if self._tools is not None and len(self._tools) > 0:
             request_params["tools"] = self._tools
-            
+
             # Fix optional parameters where the "type" is [<type>, null]
             for tool in request_params["tools"]:  # type: ignore
                 if "parameters" in tool["function"] and "properties" in tool["function"]["parameters"]:  # type: ignore
diff --git a/libs/agno/agno/models/meta/llama_openai.py b/libs/agno/agno/models/meta/llama_openai.py
@@ -33,7 +33,7 @@ class LlamaOpenAI(OpenAILike):
 
     api_key: Optional[str] = getenv("LLAMA_API_KEY")
     base_url: Optional[str] = "https://api.llama.com/compat/v1/"
-    
+
     # Request parameters
     max_completion_tokens: Optional[int] = None
     repetition_penalty: Optional[float] = None
@@ -47,8 +47,7 @@ class LlamaOpenAI(OpenAILike):
 
     supports_native_structured_outputs: bool = False
     supports_json_schema_outputs: bool = True
-    
-    
+
     @property
     def request_kwargs(self) -> Dict[str, Any]:
         """
@@ -76,7 +75,7 @@ def request_kwargs(self) -> Dict[str, Any]:
         # Add tools
         if self._tools is not None and len(self._tools) > 0:
             request_params["tools"] = self._tools
-            
+
             # Fix optional parameters where the "type" is [<type>, null]
             for tool in request_params["tools"]:  # type: ignore
                 if "parameters" in tool["function"] and "properties" in tool["function"]["parameters"]:  # type: ignore
diff --git a/libs/agno/agno/tools/models/gemini.py b/libs/agno/agno/tools/models/gemini.py
@@ -1,44 +1,67 @@
 import base64
+import time
 from os import getenv
-from typing import Optional
+from typing import Any, Optional
 from uuid import uuid4
 
 from agno.agent import Agent
-from agno.media import ImageArtifact
+from agno.media import ImageArtifact, VideoArtifact
 from agno.tools import Toolkit
-from agno.utils.log import log_debug, log_error
+from agno.utils.log import log_debug, log_error, log_info
 
 try:
     from google.genai import Client
+    from google.genai.types import GenerateImagesResponse, GenerateVideosOperation
 except (ModuleNotFoundError, ImportError):
     raise ImportError("`google-genai` not installed. Please install using `pip install google-genai`")
 
 
 class GeminiTools(Toolkit):
-    """Tools for interacting with Google Gemini API (including Imagen for images)"""
+    """Tools for interacting with Google Gemini API"""
 
     def __init__(
         self,
         api_key: Optional[str] = None,
+        vertexai: bool = False,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
         image_generation_model: str = "imagen-3.0-generate-002",
+        video_generation_model: str = "veo-2.0-generate-001",
         **kwargs,
     ):
-        super().__init__(name="gemini_tools", tools=[self.generate_image], **kwargs)
+        super().__init__(name="gemini_tools", tools=[self.generate_image, self.generate_video], **kwargs)
 
+        # Set mode and credentials: use only provided vertexai parameter
+        self.vertexai = vertexai
+        self.project_id = project_id
+        self.location = location
+
+        # Load API key from argument or environment
         self.api_key = api_key or getenv("GOOGLE_API_KEY")
-        if not self.api_key:
-            raise ValueError(
-                "GOOGLE_API_KEY not set. Please provide api_key or set the GOOGLE_API_KEY environment variable."
-            )
+        if not self.vertexai and not self.api_key:
+            log_error("GOOGLE_API_KEY not set. Please set the GOOGLE_API_KEY environment variable.")
+            raise ValueError("GOOGLE_API_KEY not set. Please provide api_key or set the environment variable.")
+
+        # Prepare client parameters
+        client_params: dict[str, Any] = {}
+        if self.vertexai:
+            log_info("Using Vertex AI API")
+            client_params["vertexai"] = True
+            client_params["project"] = self.project_id or getenv("GOOGLE_CLOUD_PROJECT")
+            client_params["location"] = self.location or getenv("GOOGLE_CLOUD_LOCATION")
+        else:
+            log_info("Using Gemini API")
+            client_params["api_key"] = self.api_key
 
         try:
-            self.client = Client(api_key=self.api_key)
+            self.client = Client(**client_params)
             log_debug("Google GenAI Client created successfully.")
         except Exception as e:
             log_error(f"Failed to create Google GenAI Client: {e}", exc_info=True)
             raise ValueError(f"Failed to create Google GenAI Client. Error: {e}")
 
         self.image_model = image_generation_model
+        self.video_model = video_generation_model
 
     def generate_image(
         self,
@@ -54,40 +77,89 @@ def generate_image(
         """
 
         try:
-            response = self.client.models.generate_images(
+            response: GenerateImagesResponse = self.client.models.generate_images(
                 model=self.image_model,
                 prompt=prompt,
             )
 
             log_debug("DEBUG: Raw Gemini API response")
 
-            image_bytes = None
-            actual_mime_type = "image/png"
+            # Extract image bytes
+            image_bytes = response.generated_images[0].image.image_bytes
+            for generated_image in response.generated_images:
+                image_bytes = generated_image.image.image_bytes
+                if not image_bytes:
+                    log_error("No valid image data extracted.")
+                    return "Failed to generate image: No valid image data extracted."
+                base64_encoded_image_bytes = base64.b64encode(image_bytes)
+                actual_mime_type = "image/png"
+
+                media_id = str(uuid4())
+                agent.add_image(
+                    ImageArtifact(
+                        id=media_id,
+                        content=base64_encoded_image_bytes,
+                        original_prompt=prompt,
+                        mime_type=actual_mime_type,
+                    )
+                )
+                log_debug(f"Successfully generated image {media_id} with model {self.image_model}")
+            return "Image generated successfully"
 
-            if response.generated_images and response.generated_images[0].image.image_bytes:
-                image_bytes = response.generated_images[0].image.image_bytes
-            else:
-                log_error("No image data found in the response structure.")
-                return "Failed to generate image: No valid image data extracted."
+        except Exception as e:
+            log_error(f"Failed to generate image: Client or method not available ({e})")
+            return f"Failed to generate image: Client or method not available ({e})"
 
-            if image_bytes is None:
-                log_error("image_bytes is None after extraction.")
-                return "Failed to generate image: No valid image data extracted."
+    def generate_video(
+        self,
+        agent: Agent,
+        prompt: str,
+    ) -> str:
+        """Generate one or more videos from a text prompt and attach them to the agent.
+
+        Submits a generation request, polls the returned operation every 5 seconds
+        until it reports done, then base64-encodes each returned video and adds it
+        to the agent as a `VideoArtifact`.
+
+        Args:
+            agent (Agent): The agent that receives the generated video artifacts.
+            prompt (str): The text prompt to generate the video from.
+        Returns:
+            str: A message indicating success or failure.
+        """
+        # Video generation requires Vertex AI mode.
+        if not self.vertexai:
+            log_error("Video generation requires Vertex AI mode. Please enable Vertex AI mode.")
+            # NOTE(review): this toolkit only reads the `vertexai` constructor argument;
+            # the environment-variable hint below is not backed by code in this class - confirm.
+            return (
+                "Video generation requires Vertex AI mode. "
+                "Please set `vertexai=True` or environment variable `GOOGLE_GENAI_USE_VERTEXAI=true`."
+            )
 
-            base64_encoded_image_bytes = base64.b64encode(image_bytes)
+        from google.genai.types import GenerateVideosConfig
 
-            media_id = str(uuid4())
-            agent.add_image(
-                ImageArtifact(
-                    id=media_id,
-                    content=base64_encoded_image_bytes,
-                    original_prompt=prompt,
-                    mime_type=actual_mime_type,
-                )
+        try:
+            operation: GenerateVideosOperation = self.client.models.generate_videos(
+                model=self.video_model,
+                prompt=prompt,
+                config=GenerateVideosConfig(
+                    enhance_prompt=True,
+                ),
             )
-            log_debug(f"Successfully generated image {media_id} with model {self.image_model}")
-            return f"Image generated successfully with ID: {media_id}"
 
+            # Generation is asynchronous: poll the operation until it completes.
+            while not operation.done:
+                time.sleep(5)
+                operation = self.client.operations.get(operation=operation)
+
+            # A finished operation can still carry no videos; report that instead
+            # of claiming success with nothing attached to the agent.
+            result = operation.result
+            generated_videos = (result.generated_videos if result is not None else None) or []
+
+            added_count = 0
+            for video in generated_videos:
+                generated_video = video.video
+                if generated_video is None or not generated_video.video_bytes:
+                    log_error("Skipping generated video without inline video bytes.")
+                    continue
+
+                media_id = str(uuid4())
+                encoded_video = base64.b64encode(generated_video.video_bytes).decode("utf-8")
+
+                agent.add_video(
+                    VideoArtifact(
+                        id=media_id,
+                        content=encoded_video,
+                        original_prompt=prompt,
+                        mime_type=generated_video.mime_type,
+                    )
+                )
+                log_debug(f"Successfully generated video {media_id} with model {self.video_model}")
+                added_count += 1
+
+            if added_count == 0:
+                log_error("No valid video data extracted.")
+                return "Failed to generate video: No valid video data extracted."
+            return "Video generated successfully"
         except Exception as e:
-            log_error(f"Failed to generate image: Client or method not available ({e})")
-            return f"Failed to generate image: Client or method not available ({e})"
+            log_error(f"Failed to generate video: {e}")
+            return f"Failed to generate video: {e}"
diff --git a/libs/agno/tests/unit/tools/models/test_gemini.py b/libs/agno/tests/unit/tools/models/test_gemini.py
@@ -6,7 +6,7 @@
 import pytest
 
 from agno.agent import Agent
-from agno.media import ImageArtifact
+from agno.media import ImageArtifact, VideoArtifact
 from agno.tools.models.gemini import GeminiTools
 
 
@@ -173,3 +173,63 @@ def test_generate_image_no_image_bytes(mock_gemini_tools, mock_agent, mock_faile
         prompt=prompt,
     )
     mock_agent.add_image.assert_not_called()
+
+
+# Tests for generate_video method
+def test_generate_video_requires_vertexai(mock_gemini_tools, mock_agent):
+    """Test video generation when vertexai mode is disabled."""
+    prompt = "A sample video prompt"
+    result = mock_gemini_tools.generate_video(mock_agent, prompt)
+    expected = (
+        "Video generation requires Vertex AI mode. "
+        "Please set `vertexai=True` or environment variable `GOOGLE_GENAI_USE_VERTEXAI=true`."
+    )
+    assert result == expected
+    mock_agent.add_video.assert_not_called()
+
+
+@pytest.fixture
+def mock_video_operation():
+    """Fixture for a completed video generation operation."""
+    op = MagicMock()
+    op.done = True
+    video = MagicMock()
+    video.video_bytes = b"fake_video_bytes"
+    video.mime_type = "video/mp4"
+    op.result = MagicMock(generated_videos=[MagicMock(video=video)])
+    return op
+
+
+def test_generate_video_success(mock_gemini_tools, mock_agent, mock_video_operation):
+    """Test successful video generation: one VideoArtifact is added and the fixed success message is returned."""
+    mock_gemini_tools.vertexai = True
+    mock_gemini_tools.client.models.generate_videos.return_value = mock_video_operation
+    prompt = "A sample video prompt"
+    with patch("agno.tools.models.gemini.uuid4", return_value=UUID("87654321-4321-8765-4321-876543214321")):
+        result = mock_gemini_tools.generate_video(mock_agent, prompt)
+        expected_id = "87654321-4321-8765-4321-876543214321"
+        assert result == "Video generated successfully"  # the media ID lives on the artifact, not in the message
+        assert mock_gemini_tools.client.models.generate_videos.called
+        call_args = mock_gemini_tools.client.models.generate_videos.call_args
+        assert call_args.kwargs["model"] == mock_gemini_tools.video_model
+        assert call_args.kwargs["prompt"] == prompt
+        mock_agent.add_video.assert_called_once()
+        added = mock_agent.add_video.call_args[0][0]
+        assert isinstance(added, VideoArtifact)
+        assert added.id == expected_id
+        assert added.original_prompt == prompt
+        assert added.mime_type == "video/mp4"
+        import base64
+
+        expected_content = base64.b64encode(b"fake_video_bytes").decode("utf-8")
+        assert added.content == expected_content
+
+
+def test_generate_video_exception(mock_gemini_tools, mock_agent):
+    """Test video generation when API raises exception."""
+    mock_gemini_tools.vertexai = True
+    mock_gemini_tools.client.models.generate_videos.side_effect = Exception("API error")
+    prompt = "A sample video prompt"
+    result = mock_gemini_tools.generate_video(mock_agent, prompt)
+    assert result == "Failed to generate video: API error"
+    mock_agent.add_video.assert_not_called()

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ class Article(BaseModel):`
`57`	`57`	`markdown=True,`
`58`	`58`	`debug_mode=True,`
`59`	`59`	`show_members_responses=True,`
`60`		`- add_member_tools_to_system_message=False`
	`60`	`+ add_member_tools_to_system_message=False,`
`61`	`61`	`)`
`62`	`62`
`63`	`63`	`hn_team.print_response("Write an article about the top 2 stories on hackernews")`