diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
index 1c90e9facb..535994221d 100644
--- a/docs/source/en/package_reference/inference_types.md
+++ b/docs/source/en/package_reference/inference_types.md
@@ -197,6 +197,18 @@ This part of the lib is still under development and will be improved in future releases.
 
 
 
+## image_to_video
+
+[[autodoc]] huggingface_hub.ImageToVideoInput
+
+[[autodoc]] huggingface_hub.ImageToVideoOutput
+
+[[autodoc]] huggingface_hub.ImageToVideoParameters
+
+[[autodoc]] huggingface_hub.ImageToVideoTargetSize
+
+
+
 ## object_detection
 
 [[autodoc]] huggingface_hub.ObjectDetectionBoundingBox
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
index 3746086ed2..84dda55956 100644
--- a/docs/source/ko/package_reference/inference_types.md
+++ b/docs/source/ko/package_reference/inference_types.md
@@ -196,6 +196,18 @@ rendered properly in your Markdown viewer.
 
 
 
+## image_to_video[[huggingface_hub.ImageToVideoInput]]
+
+[[autodoc]] huggingface_hub.ImageToVideoInput
+
+[[autodoc]] huggingface_hub.ImageToVideoOutput
+
+[[autodoc]] huggingface_hub.ImageToVideoParameters
+
+[[autodoc]] huggingface_hub.ImageToVideoTargetSize
+
+
+
 ## object_detection[[huggingface_hub.ObjectDetectionBoundingBox]]
 
 [[autodoc]] huggingface_hub.ObjectDetectionBoundingBox
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
index c58b3e4aca..bd46c3e807 100644
--- a/src/huggingface_hub/__init__.py
+++ b/src/huggingface_hub/__init__.py
@@ -372,6 +372,10 @@
         "ImageToTextInput",
         "ImageToTextOutput",
         "ImageToTextParameters",
+        "ImageToVideoInput",
+        "ImageToVideoOutput",
+        "ImageToVideoParameters",
+        "ImageToVideoTargetSize",
         "ObjectDetectionBoundingBox",
         "ObjectDetectionInput",
         "ObjectDetectionOutputElement",
@@ -660,6 +664,10 @@
     "ImageToTextInput",
     "ImageToTextOutput",
     "ImageToTextParameters",
+    "ImageToVideoInput",
+    "ImageToVideoOutput",
+    "ImageToVideoParameters",
+    "ImageToVideoTargetSize",
     "InferenceApi",
     "InferenceClient",
     "InferenceEndpoint",
@@ -1370,6 +1378,10 @@ def __dir__():
         ImageToTextInput,  # noqa: F401
         ImageToTextOutput,  # noqa: F401
         ImageToTextParameters,  # noqa: F401
+        ImageToVideoInput,  # noqa: F401
+        ImageToVideoOutput,  # noqa: F401
+        ImageToVideoParameters,  # noqa: F401
+        ImageToVideoTargetSize,  # noqa: F401
         ObjectDetectionBoundingBox,  # noqa: F401
         ObjectDetectionInput,  # noqa: F401
         ObjectDetectionOutputElement,  # noqa: F401
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
index 63f6a653d6..bfffc0ae3b 100644
--- a/src/huggingface_hub/inference/_generated/types/__init__.py
+++ b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -85,6 +85,7 @@
     ImageToTextOutput,
     ImageToTextParameters,
 )
+from .image_to_video import ImageToVideoInput, ImageToVideoOutput, ImageToVideoParameters, ImageToVideoTargetSize
 from .object_detection import (
     ObjectDetectionBoundingBox,
     ObjectDetectionInput,
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_video.py b/src/huggingface_hub/inference/_generated/types/image_to_video.py
new file mode 100644
index 0000000000..92192a2a05
--- /dev/null
+++ b/src/huggingface_hub/inference/_generated/types/image_to_video.py
@@ -0,0 +1,60 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#  - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#  - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ImageToVideoTargetSize(BaseInferenceType):
+    """The size in pixel of the output video frames."""
+
+    height: int
+    width: int
+
+
+@dataclass_with_extra
+class ImageToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Image To Video"""
+
+    guidance_scale: Optional[float] = None
+    """For diffusion models. A higher guidance scale value encourages the model to generate
+    videos closely linked to the text prompt at the expense of lower image quality.
+    """
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in video generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    video at the expense of slower inference.
+    """
+    prompt: Optional[str] = None
+    """The text prompt to guide the video generation."""
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+    target_size: Optional[ImageToVideoTargetSize] = None
+    """The size in pixel of the output video frames."""
+
+
+@dataclass_with_extra
+class ImageToVideoInput(BaseInferenceType):
+    """Inputs for Image To Video inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
+    parameters: Optional[ImageToVideoParameters] = None
+    """Additional inference parameters for Image To Video"""
+
+
+@dataclass_with_extra
+class ImageToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Image To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
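
For reviewers, a minimal sketch (not part of the diff) of how the new `ImageToVideo*` types compose once this change lands. It uses only the dataclasses defined in the new `image_to_video.py` above; the file path `cat.png` and all parameter values are illustrative placeholders, and the use of `dataclasses.asdict` for serialization is an assumption based on the `dataclass_with_extra` decorator these types share with the other generated types.

```python
import base64
from dataclasses import asdict

from huggingface_hub import (
    ImageToVideoInput,
    ImageToVideoParameters,
    ImageToVideoTargetSize,
)

# Base64-encode a local image, as ImageToVideoInput.inputs expects when
# `parameters` are also sent ("cat.png" is a placeholder path).
with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# Build a typed request payload from the generated dataclasses.
payload = ImageToVideoInput(
    inputs=image_b64,
    parameters=ImageToVideoParameters(
        prompt="The cat slowly turns its head",
        negative_prompt="blurry, low quality",
        num_frames=25,
        num_inference_steps=20,
        guidance_scale=7.5,
        seed=42,
        target_size=ImageToVideoTargetSize(height=512, width=512),
    ),
)

# Assumption: the generated types behave as regular dataclasses, so asdict()
# yields a nested, JSON-serializable dict of the request body.
print(asdict(payload))
```

One design detail worth noting: `num_frames` is typed `Optional[float]` rather than `Optional[int]` because the codegen mirrors the JSON schema verbatim, and the spec declares the field as a generic `number`.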