Added support for Google-specific arguments for video analysis (#2110)

Sumered · DouweM · web-flow · commit 036428cb45f5 · 2025-07-07T13:45:00.000-06:00
Co-authored-by: Douwe Maan <douwe@pydantic.dev>
diff --git a/pydantic_ai_slim/pydantic_ai/messages.py b/pydantic_ai_slim/pydantic_ai/messages.py
@@ -99,6 +99,13 @@ class FileUrl(ABC):
     * If False, the URL is sent directly to the model and no download is performed.
     """
 
+    vendor_metadata: dict[str, Any] | None = None
+    """Vendor-specific metadata for the file.
+
+    Supported by:
+    - `GoogleModel`: `VideoUrl.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
+    """
+
     @property
     @abstractmethod
     def media_type(self) -> str:
@@ -263,6 +270,13 @@ class BinaryContent:
     media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str
     """The media type of the binary data."""
 
+    vendor_metadata: dict[str, Any] | None = None
+    """Vendor-specific metadata for the file.
+
+    Supported by:
+    - `GoogleModel`: `BinaryContent.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
+    """
+
     kind: Literal['binary'] = 'binary'
     """Type identifier, this is available on all parts as a discriminator."""
 
diff --git a/pydantic_ai_slim/pydantic_ai/models/google.py b/pydantic_ai_slim/pydantic_ai/models/google.py
@@ -55,6 +55,7 @@
         GenerateContentConfigDict,
         GenerateContentResponse,
         HttpOptionsDict,
+        MediaResolution,
         Part,
         PartDict,
         SafetySettingDict,
@@ -120,6 +121,12 @@ class GoogleModelSettings(ModelSettings, total=False):
     See the [Gemini API docs](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/add-labels-to-api-calls) for use cases and limitations.
     """
 
+    google_video_resolution: MediaResolution
+    """The video resolution to use for the model.
+
+    See <https://ai.google.dev/api/generate-content#MediaResolution> for more information.
+    """
+
 
 @dataclass(init=False)
 class GoogleModel(Model):
@@ -291,6 +298,7 @@ async def _generate_content(
             safety_settings=model_settings.get('google_safety_settings'),
             thinking_config=model_settings.get('google_thinking_config'),
             labels=model_settings.get('google_labels'),
+            media_resolution=model_settings.get('google_video_resolution'),
             tools=cast(ToolListUnionDict, tools),
             tool_config=tool_config,
             response_mime_type=response_mime_type,
@@ -398,9 +406,15 @@ async def _map_user_prompt(self, part: UserPromptPart) -> list[PartDict]:
                 elif isinstance(item, BinaryContent):
                     # NOTE: The type from Google GenAI is incorrect, it should be `str`, not `bytes`.
                     base64_encoded = base64.b64encode(item.data).decode('utf-8')
-                    content.append({'inline_data': {'data': base64_encoded, 'mime_type': item.media_type}})  # type: ignore
+                    inline_data_dict = {'inline_data': {'data': base64_encoded, 'mime_type': item.media_type}}
+                    if item.vendor_metadata:
+                        inline_data_dict['video_metadata'] = item.vendor_metadata
+                    content.append(inline_data_dict)  # type: ignore
                 elif isinstance(item, VideoUrl) and item.is_youtube:
-                    content.append({'file_data': {'file_uri': item.url, 'mime_type': item.media_type}})
+                    file_data_dict = {'file_data': {'file_uri': item.url, 'mime_type': item.media_type}}
+                    if item.vendor_metadata:
+                        file_data_dict['video_metadata'] = item.vendor_metadata
+                    content.append(file_data_dict)  # type: ignore
                 elif isinstance(item, FileUrl):
                     if self.system == 'google-gla' or item.force_download:
                         downloaded_item = await download_item(item, data_format='base64')
diff --git a/pydantic_ai_slim/pyproject.toml b/pydantic_ai_slim/pyproject.toml
@@ -64,7 +64,7 @@ logfire = ["logfire>=3.11.0"]
 openai = ["openai>=1.76.0"]
 cohere = ["cohere>=5.13.11; platform_system != 'Emscripten'"]
 vertexai = ["google-auth>=2.36.0", "requests>=2.32.2"]
-google = ["google-genai>=1.15.0"]
+google = ["google-genai>=1.24.0"]
 anthropic = ["anthropic>=0.52.0"]
 groq = ["groq>=0.19.0"]
 mistral = ["mistralai>=1.2.5"]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -10,6 +10,7 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from datetime import datetime
+from functools import cached_property
 from pathlib import Path
 from types import ModuleType
 from typing import TYPE_CHECKING, Any, Callable
@@ -19,7 +20,7 @@
 from _pytest.assertion.rewrite import AssertionRewritingHook
 from pytest_mock import MockerFixture
 from typing_extensions import TypeAlias
-from vcr import VCR
+from vcr import VCR, request as vcr_request
 
 import pydantic_ai.models
 from pydantic_ai.messages import BinaryContent
@@ -194,6 +195,29 @@ def pytest_recording_configure(config: Any, vcr: VCR):
 
     vcr.register_serializer('yaml', json_body_serializer)
 
+    def method_matcher(r1: vcr_request.Request, r2: vcr_request.Request) -> None:
+        if r1.method.upper() != r2.method.upper():
+            raise AssertionError(f'{r1.method} != {r2.method}')
+
+    vcr.register_matcher('method', method_matcher)
+
+
+@pytest.fixture(autouse=True)
+def mock_vcr_aiohttp_content(mocker: MockerFixture):
+    try:
+        from vcr.stubs import aiohttp_stubs
+    except ImportError:
+        return
+
+    # google-genai calls `self.response_stream.content.readline()` where `self.response_stream` is a `MockClientResponse`,
+    # which creates a new `MockStream` each time instead of returning the same one, resulting in the readline cursor not being respected.
+    # So we turn `content` into a cached property to return the same one each time.
+    # VCR issue: https://github.com/kevin1024/vcrpy/issues/927. Once that's resolved, we can remove this patch.
+    cached_content = cached_property(aiohttp_stubs.MockClientResponse.content.fget)  # type: ignore
+    cached_content.__set_name__(aiohttp_stubs.MockClientResponse, 'content')
+    mocker.patch('vcr.stubs.aiohttp_stubs.MockClientResponse.content', new=cached_content)
+    mocker.patch('vcr.stubs.aiohttp_stubs.MockStream.set_exception', return_value=None)
+
 
 @pytest.fixture(scope='module')
 def vcr_config():
diff --git a/tests/models/cassettes/test_google/test_google_model_video_as_binary_content_input_with_vendor_metadata.yaml b/tests/models/cassettes/test_google/test_google_model_video_as_binary_content_input_with_vendor_metadata.yaml
diff --git a/tests/models/cassettes/test_google/test_google_model_youtube_video_url_input_with_vendor_metadata.yaml b/tests/models/cassettes/test_google/test_google_model_youtube_video_url_input_with_vendor_metadata.yaml
@@ -0,0 +1,73 @@
+interactions:
+- request:
+    headers:
+      content-type:
+      - application/json
+    method: post
+    parsed_body:
+      contents:
+      - parts:
+        - text: Explain me this video
+        - fileData:
+            fileUri: https://youtu.be/lCdaVNyHtjU
+            mimeType: video/mp4
+          videoMetadata:
+            fps: 0.2
+        role: user
+      generationConfig: {}
+      systemInstruction:
+        parts:
+        - text: You are a helpful chatbot.
+        role: user
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
+  response:
+    headers:
+      alt-svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      content-length:
+      - '2759'
+      content-type:
+      - application/json; charset=UTF-8
+      server-timing:
+      - gfet4t7; dur=11467
+      transfer-encoding:
+      - chunked
+      vary:
+      - Origin
+      - X-Origin
+      - Referer
+    parsed_body:
+      candidates:
+      - avgLogprobs: -0.4793745385795377
+        content:
+          parts:
+          - text: |-
+              Okay, based on the image, here's what I can infer:
+
+              *   **A camera monitor is mounted on top of a camera.**
+              *   **The monitor's screen is on, displaying a view of the rocky mountains.**
+              *   **This setting suggests a professional video shoot.**
+
+              If you'd like a more detailed explanation, please provide additional information about the video.
+          role: model
+        finishReason: STOP
+      modelVersion: gemini-2.0-flash
+      responseId: ldpraPqBM6HshMIPgsi60QI
+      usageMetadata:
+        candidatesTokenCount: 459
+        candidatesTokensDetails:
+        - modality: TEXT
+          tokenCount: 459
+        promptTokenCount: 4605
+        promptTokensDetails:
+        - modality: TEXT
+          tokenCount: 10
+        - modality: AUDIO
+          tokenCount: 1475
+        - modality: VIDEO
+          tokenCount: 3120
+        totalTokenCount: 5064
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/models/test_google.py b/tests/models/test_google.py
@@ -420,6 +420,23 @@ async def test_google_model_video_as_binary_content_input(
 """)
 
 
+async def test_google_model_video_as_binary_content_input_with_vendor_metadata(
+    allow_model_requests: None, video_content: BinaryContent, google_provider: GoogleProvider
+):
+    m = GoogleModel('gemini-2.0-flash', provider=google_provider)
+    agent = Agent(m, system_prompt='You are a helpful chatbot.')
+    video_content.vendor_metadata = {'start_offset': '2s', 'end_offset': '10s'}
+
+    result = await agent.run(['Explain me this video', video_content])
+    assert result.output == snapshot("""\
+Okay, I can describe what is visible in the image.
+
+The image shows a camera setup in an outdoor setting. The camera is mounted on a tripod and has an external monitor attached to it. The monitor is displaying a scene that appears to be a desert landscape with rocky formations and mountains in the background. The foreground and background of the overall image, outside of the camera monitor, is also a blurry, desert landscape. The colors in the background are warm and suggest either sunrise, sunset, or reflected light off the rock formations.
+
+It looks like someone is either reviewing footage on the monitor, or using it as an aid for framing the shot.\
+""")
+
+
 async def test_google_model_image_url_input(allow_model_requests: None, google_provider: GoogleProvider):
     m = GoogleModel('gemini-2.0-flash', provider=google_provider)
     agent = Agent(m, system_prompt='You are a helpful chatbot.')
@@ -454,6 +471,32 @@ async def test_google_model_video_url_input(allow_model_requests: None, google_p
 """)
 
 
+async def test_google_model_youtube_video_url_input_with_vendor_metadata(
+    allow_model_requests: None, google_provider: GoogleProvider
+):
+    m = GoogleModel('gemini-2.0-flash', provider=google_provider)
+    agent = Agent(m, system_prompt='You are a helpful chatbot.')
+
+    result = await agent.run(
+        [
+            'Explain me this video',
+            VideoUrl(
+                url='https://youtu.be/lCdaVNyHtjU',
+                vendor_metadata={'fps': 0.2},
+            ),
+        ]
+    )
+    assert result.output == snapshot("""\
+Okay, based on the image, here's what I can infer:
+
+*   **A camera monitor is mounted on top of a camera.**
+*   **The monitor's screen is on, displaying a view of the rocky mountains.**
+*   **This setting suggests a professional video shoot.**
+
+If you'd like a more detailed explanation, please provide additional information about the video.\
+""")
+
+
 async def test_google_model_document_url_input(allow_model_requests: None, google_provider: GoogleProvider):
     m = GoogleModel('gemini-2.0-flash', provider=google_provider)
     agent = Agent(m, system_prompt='You are a helpful chatbot.')
diff --git a/tests/test_agent.py b/tests/test_agent.py
@@ -2829,7 +2829,10 @@ def test_binary_content_all_messages_json():
             {
                 'parts': [
                     {
-                        'content': ['Hello', {'data': 'SGVsbG8=', 'media_type': 'text/plain', 'kind': 'binary'}],
+                        'content': [
+                            'Hello',
+                            {'data': 'SGVsbG8=', 'media_type': 'text/plain', 'vendor_metadata': None, 'kind': 'binary'},
+                        ],
                         'timestamp': IsStr(),
                         'part_kind': 'user-prompt',
                     }
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -2829,7 +2829,10 @@ def test_binary_content_all_messages_json():`
`2829`	`2829`	`{`
`2830`	`2830`	`'parts': [`
`2831`	`2831`	`{`
`2832`		`- 'content': ['Hello', {'data': 'SGVsbG8=', 'media_type': 'text/plain', 'kind': 'binary'}],`
	`2832`	`+ 'content': [`
	`2833`	`+ 'Hello',`
	`2834`	`+ {'data': 'SGVsbG8=', 'media_type': 'text/plain', 'vendor_metadata': None, 'kind': 'binary'},`
	`2835`	`+ ],`
`2833`	`2836`	`'timestamp': IsStr(),`
`2834`	`2837`	`'part_kind': 'user-prompt',`
`2835`	`2838`	`}`