
Commit 590c27a

Commit message: wan
1 parent 8db9073 commit 590c27a

File tree

5 files changed, +206 -526 lines changed


docs/source/en/api/loaders/lora.md

Lines changed: 5 additions & 1 deletion
@@ -98,4 +98,8 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse

 ## LoraBaseMixin

-[[autodoc]] loaders.lora_base.LoraBaseMixin
+[[autodoc]] loaders.lora_base.LoraBaseMixin
+
+## WanLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
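
Note: the new `WanLoraLoaderMixin` entry documents LoRA support for the Wan pipelines. As a rough usage sketch (the model and LoRA repository names below are placeholders, not taken from this commit):

```py
import torch
from diffusers import WanPipeline

# WanPipeline inherits WanLoraLoaderMixin, so LoRA weights are loaded directly on the pipeline
# (the checkpoint repository is an assumption; any Wan text-to-video checkpoint works)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
)

# hypothetical LoRA repository and adapter name, shown only to illustrate the mixin's API
pipeline.load_lora_weights("path/to/a-wan-lora", adapter_name="example-lora")
pipeline.set_adapters("example-lora", 0.9)
```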

docs/source/en/api/pipelines/cogvideox.md

Lines changed: 31 additions & 49 deletions
@@ -26,19 +26,32 @@
 You can find all the original CogVideoX checkpoints under the [CogVideoX](https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce) collection.

 > [!TIP]
-> Click on the CogVideoX models in the right sidebar for more examples of how to use CogVideoX for other video generation tasks.
+> Click on the CogVideoX models in the right sidebar for more examples of other video generation tasks.

 The example below demonstrates how to generate a video optimized for memory or inference speed.

 <hfoptions id="usage">
 <hfoption id="memory">

+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The quantized CogVideoX 5B model below requires ~16GB of VRAM.
+
 ```py
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

+# quantize weights to int8 with torchao
+quantization_config = TorchAoConfig("int8wo")
+transformer = CogVideoXTransformer3DModel.from_pretrained(
+    "THUDM/CogVideoX-5b",
+    subfolder="transformer",
+    quantization_config=quantization_config,
+    torch_dtype=torch.bfloat16,
+)
+
 # fp8 layerwise weight-casting
 transformer = CogVideoXTransformer3DModel.from_pretrained(
     "THUDM/CogVideoX-5b",
@@ -60,10 +73,13 @@ pipeline.to("cuda")
 # model-offloading
 pipeline.enable_model_cpu_offload()

-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
+prompt = """
+A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea.
+The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse.
+Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood,
+with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.
+"""
+
 video = pipeline(
     prompt=prompt,
     guidance_scale=6,
@@ -72,45 +88,6 @@ video = pipeline(
 export_to_video(video, "output.mp4", fps=8)
 ```

-Reduce memory usage even more if necessary by quantizing a model to a lower precision data type.
-
-```py
-import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel, TorchAoConfig
-from diffusers.utils import export_to_video
-
-# quantize weights to int8 with torchao
-quantization_config = TorchAoConfig("int8wo")
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-# fp8 layerwise weight-casting
-transformer.enable_layerwise_casting(
-    storage_dtype=torch.float8_e4m3fn,
-    compute_dtype=torch.bfloat16
-)
-
-pipeline = CogVideoXPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipeline.to("cuda")
-
-# model-offloading
-pipeline.enable_model_cpu_offload()
-
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-export_to_video(video, "output.mp4", fps=8)
-```
-
 </hfoption>
 <hfoption id="inference speed">

@@ -119,7 +96,6 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast
 ```py
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
-from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

 pipeline = CogVideoXPipeline.from_pretrained(
@@ -133,10 +109,13 @@ pipeline.transformer = torch.compile(
     pipeline.transformer, mode="max-autotune", fullgraph=True
 )

-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
+prompt = """
+A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea.
+The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse.
+Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood,
+with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.
+"""
+
 video = pipeline(
     prompt=prompt,
     guidance_scale=6,
@@ -186,8 +165,11 @@ export_to_video(video, "output.mp4", fps=8)
 ).frames[0]
 export_to_video(video, "output.mp4", fps=16)
 ```
+
 - The text-to-video (T2V) checkpoints work best with a resolution of 1360x768 because that was the resolution it was pretrained on.
+
 - The image-to-video (I2V) checkpoints work with multiple resolutions. The width can vary from 768 to 1360, but the height must be 768. Both height and width must be divisible by 16.
+
 - Both T2V and I2V checkpoints work best with 81 and 161 frames. It is recommended to export the generated video at 16fps.

 ## CogVideoXPipeline
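
Note: as merged, the memory example above uses `TorchAoConfig` without importing it and then loads the transformer a second time for layerwise casting. A self-contained sketch of just the torchao int8 path, with the missing import added (an illustration, not the exact documented snippet):

```py
import torch
from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel, TorchAoConfig
from diffusers.utils import export_to_video

# quantize the transformer weights to int8 with torchao
quantization_config = TorchAoConfig("int8wo")
transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-5b",
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)

pipeline = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)

# offload idle components to the CPU to reduce peak VRAM
pipeline.enable_model_cpu_offload()

prompt = "A detailed wooden toy ship with intricately carved masts and sails glides over a plush, blue carpet that mimics the waves of the sea."
video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
export_to_video(video, "output.mp4", fps=8)
```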

docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 12 additions & 7 deletions
@@ -20,7 +20,7 @@

 # HunyuanVideo

-[HunyuanVideo](https://huggingface.co/papers/2412.03603) is a 13B diffusion transformer model designed to be competitive with closed-source video foundation models and enable wider community access. This model uses a "dual-stream to single-stream" architecture to separately process the video and text tokens first, before concatenating and feeding them to the transformer to fuse the multimodal information. A pretrained multimodal large language model (MLLM) is used as the encoder because it has better image-text alignment, better image detail description and reasoning, and it can be used as a zero-shot learner if system instructions are added to user prompts. Finally, HunyuanVideo uses a 3D causal variational autoencoder to more efficiently process video data at the original resolution and frame rate.
+[HunyuanVideo](https://huggingface.co/papers/2412.03603) is a 13B parameter diffusion transformer model designed to be competitive with closed-source video foundation models and enable wider community access. This model uses a "dual-stream to single-stream" architecture to separately process the video and text tokens first, before concatenating and feeding them to the transformer to fuse the multimodal information. A pretrained multimodal large language model (MLLM) is used as the encoder because it has better image-text alignment, better image detail description and reasoning, and it can be used as a zero-shot learner if system instructions are added to user prompts. Finally, HunyuanVideo uses a 3D causal variational autoencoder to more efficiently process video data at the original resolution and frame rate.

 You can find all the original HunyuanVideo checkpoints under the [Tencent](https://huggingface.co/tencent) organization.

@@ -32,12 +32,16 @@ The example below demonstrates how to generate a video optimized for memory or i
 <hfoptions id="usage">
 <hfoption id="memory">

+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The quantized HunyuanVideo model below requires ~14GB of VRAM.
+
 ```py
 import torch
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video

-# quantization
+# quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
@@ -52,7 +56,7 @@ pipeline = HunyuanVideoPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )

-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()

@@ -71,7 +75,7 @@ import torch
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video

-# quantization
+# quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
@@ -86,7 +90,7 @@ pipeline = HunyuanVideoPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )

-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()

@@ -132,10 +136,11 @@ export_to_video(video, "output.mp4", fps=15)
 pipeline.load_lora_weights("https://huggingface.co/lucataco/hunyuan-steamboat-willie-10", adapter_name="steamboat-willie")
 pipeline.set_adapters("steamboat-willie", 0.9)

-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()

+# use "In the style of SWR" to trigger the LoRA
 prompt = """
 In the style of SWR. A black and white animated scene featuring a fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys.
 """
@@ -150,7 +155,7 @@ export_to_video(video, "output.mp4", fps=15)
 | text encoder dtype | `torch.float16` |
 | transformer dtype | `torch.bfloat16` |
 | vae dtype | `torch.float16` |
-| `num_frames` | 4 * k + 1 |
+| `num_frames (k)` | 4 * `k` + 1 |

 - Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos, and try higher `shift` values (`7.0` to `12.0`) for higher resolution images.
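
Note: a short sketch translating the recommended dtype table and the `4 * k + 1` frame rule above into code (the prompt, step count, and frame count are placeholder values):

```py
import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# transformer in bfloat16; the rest of the pipeline (text encoder, vae) in float16
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16
)
pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
pipeline.vae.enable_tiling()

# num_frames should follow 4 * k + 1, for example 61 frames for k = 15
video = pipeline(
    prompt="A fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys.",
    num_frames=61,
    num_inference_steps=30,
).frames[0]
export_to_video(video, "output.mp4", fps=15)
```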

docs/source/en/api/pipelines/ltx_video.md

Lines changed: 22 additions & 59 deletions
@@ -20,18 +20,22 @@

 # LTX-Video

-[LTX-Video](https://huggingface.co/Lightricks/LTX-Video) is a diffusion transformer designed for fast and real-time generation of high-resolution videos from text and images. The main feature of LTX-Video is the Video-VAE. The Video-VAE has a higher pixel to latent compression ratio (1:192) which enables more efficient video data processing and faster generation speed. To support and prevent the finer details from being lost during generation, the Video-VAE decoder performs the latent to pixel conversion *and* the last denoising step.
+[LTX-Video](https://huggingface.co/Lightricks/LTX-Video) is a diffusion transformer designed for fast and real-time generation of high-resolution videos from text and images. The main feature of LTX-Video is the Video-VAE. The Video-VAE has a higher pixel to latent compression ratio (1:192) which enables more efficient video data processing and faster generation speed. To support and prevent finer details from being lost during generation, the Video-VAE decoder performs the latent to pixel conversion *and* the last denoising step.

 You can find all the original LTX-Video checkpoints under the [Lightricks](https://huggingface.co/Lightricks) organization.

 > [!TIP]
-> Click on the LTX-Video models in the right sidebar for more examples of how to use LTX-Video for other video generation tasks.
+> Click on the LTX-Video models in the right sidebar for more examples of other video generation tasks.

 The example below demonstrates how to generate a video optimized for memory or inference speed.

 <hfoptions id="usage">
 <hfoption id="memory">

+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The LTX-Video model below requires ~10GB of VRAM.
+
 ```py
 import torch
 from diffusers import LTXPipeline, LTXVideoTransformer3DModel
@@ -58,7 +62,9 @@ pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_d
 apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
 apply_group_offloading(pipeline.vae, onload_device=onload_device, offload_type="leaf_level")

-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+prompt = """
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+"""
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

 video = pipeline(
@@ -74,55 +80,6 @@ video = pipeline(
 export_to_video(video, "output.mp4", fps=24)
 ```

-</details>
-
-Reduce memory usage even more if necessary by quantizing a model to a lower precision data type.
-
-```py
-import torch
-from diffusers.utils import export_to_video
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, LTXVideoTransformer3DModel, LTXPipeline
-from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel
-
-# quantize weights to int8 with bitsandbytes
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-text_encoder = T5EncoderModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="text_encoder",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-
-quantization_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
-transformer = LTXVideoTransformer3DModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-
-pipeline = LTXPipeline.from_pretrained(
-    "Lightricks/LTX-Video",
-    text_encoder=text_en,
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-video = pipeline(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=768,
-    height=512,
-    num_frames=161,
-    decode_timestep=0.03,
-    decode_noise_scale=0.025,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output.mp4", fps=24)
-```
-
 </hfoption>
 <hfoption id="inference speed">

@@ -143,7 +100,9 @@ pipeline.transformer = torch.compile(
     pipeline.transformer, mode="max-autotune", fullgraph=True
 )

-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+prompt = """
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+"""
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

 video = pipeline(
@@ -164,24 +123,27 @@ export_to_video(video, "output.mp4", fps=24)

 ## Notes

-- LTX-Video supports LoRAs with [`~LTXVideoLoraLoaderMixin.load_lora_weights`].
+- LTX-Video supports LoRAs with [`~loaders.LTXVideoLoraLoaderMixin.load_lora_weights`].

 ```py
 import torch
 from diffusers import LTXConditionPipeline
-from diffusers.utils import export_to_video
+from diffusers.utils import export_to_video, load_image

 pipeline = LTXConditionPipeline.from_pretrained(
     "Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16
 )

 pipeline.load_lora_weights("Lightricks/LTX-Video-Cakeify-LoRA", adapter_name="cakeify")
-pipeline.set_adapters("cakeify", 0.9)
+pipeline.set_adapters("cakeify")

-prompt = "CAKEIFY a person using a knife to cut a cake shaped like a pair of cowboy boots"
+# use "CAKEIFY" to trigger the LoRA
+prompt = "CAKEIFY a person using a knife to cut a cake shaped like a cereal box"
+image = load_image("https://i5.walmartimages.com/asr/c0463def-4995-47a7-9486-294fff8cf9fc.f9779f3fc4c621cf1fe86465af1d2ecd.jpeg")

 video = pipeline(
     prompt=prompt,
+    image=image,
     width=768,
     height=512,
     num_frames=161,
@@ -191,7 +153,8 @@ export_to_video(video, "output.mp4", fps=24)
 ).frames[0]
 export_to_video(video, "output.mp4", fps=24)
 ```
-- LTX-Video supports loading from single files, such as [GGUF checkpoints](../../quantization/gguf), with [`FromOriginalModelMixin.from_single_file`] or [`FromSingleFileMixin.from_single_file`].
+
+- LTX-Video supports loading from single files, such as [GGUF checkpoints](../../quantization/gguf), with [`loaders.FromOriginalModelMixin.from_single_file`] or [`loaders.FromSingleFileMixin.from_single_file`].

 ```py
 import torch
@@ -206,7 +169,7 @@ export_to_video(video, "output.mp4", fps=24)
 pipeline = LTXPipeline.from_pretrained(
     "Lightricks/LTX-Video",
     transformer=transformer,
-    torch_dtype=bfloat16
+    torch_dtype=torch.bfloat16
 )
 ```
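
Note: the single-file hunk above only shows part of the snippet. A fuller sketch of GGUF loading with `from_single_file` (the GGUF repository and filename are assumptions, not part of this commit):

```py
import torch
from diffusers import LTXPipeline, LTXVideoTransformer3DModel, GGUFQuantizationConfig
from diffusers.utils import export_to_video

# load a GGUF-quantized transformer from a single file
# (the repository and filename below are assumptions)
transformer = LTXVideoTransformer3DModel.from_single_file(
    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

pipeline = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
pipeline.enable_model_cpu_offload()

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair."
video = pipeline(
    prompt=prompt,
    num_frames=161,
    decode_timestep=0.03,
    decode_noise_scale=0.025,
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```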
