Skip to content

Commit 1ec8c2e

Browse files
committed
add hunyuanimage
1 parent 8f6e9c9 commit 1ec8c2e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+6342
-414
lines changed

docs/diffusers/_toctree.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,8 @@
269269
title: FluxTransformer2DModel
270270
- local: api/models/hunyuan_transformer2d
271271
title: HunyuanDiT2DModel
272+
- local: api/models/hunyuanimage_transformer_2d
273+
title: HunyuanImageTransformer2DModel
272274
- local: api/models/hunyuan_video_transformer_3d
273275
title: HunyuanVideoTransformer3DModel
274276
- local: api/models/latte_transformer3d
@@ -333,6 +335,10 @@
333335
title: AutoencoderKLCogVideoX
334336
- local: api/models/autoencoderkl_cosmos
335337
title: AutoencoderKLCosmos
338+
- local: api/models/autoencoder_kl_hunyuanimage
339+
title: AutoencoderKLHunyuanImage
340+
- local: api/models/autoencoder_kl_hunyuanimage_refiner
341+
title: AutoencoderKLHunyuanImageRefiner
336342
- local: api/models/autoencoder_kl_hunyuan_video
337343
title: AutoencoderKLHunyuanVideo
338344
- local: api/models/autoencoderkl_ltx_video
@@ -429,6 +435,8 @@
429435
title: HiDream-I1
430436
- local: api/pipelines/framepack
431437
title: Framepack
438+
- local: api/pipelines/hunyuanimage21
439+
title: HunyuanImage2.1
432440
- local: api/pipelines/hunyuandit
433441
title: Hunyuan-DiT
434442
- local: api/pipelines/hunyuan_video
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4+
the License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License. -->
11+
12+
# AutoencoderKLHunyuanImage
13+
14+
The 2D variational autoencoder (VAE) model with KL loss used in [HunyuanImage2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1).
15+
16+
The model can be loaded with the following code snippet.
17+
18+
```python
19+
import mindspore as ms

from mindone.diffusers import AutoencoderKLHunyuanImage
20+
21+
vae = AutoencoderKLHunyuanImage.from_pretrained("hunyuanvideo-community/HunyuanImage-2.1-Diffusers", subfolder="vae", mindspore_dtype=ms.bfloat16)
22+
```
23+
24+
::: mindone.diffusers.AutoencoderKLHunyuanImage
25+
26+
::: mindone.diffusers.models.autoencoders.vae.DecoderOutput
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4+
the License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License. -->
11+
12+
# AutoencoderKLHunyuanImageRefiner
13+
14+
The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanImage2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) for its refiner pipeline.
15+
16+
The model can be loaded with the following code snippet.
17+
18+
```python
19+
import mindspore as ms

from mindone.diffusers import AutoencoderKLHunyuanImageRefiner
20+
21+
vae = AutoencoderKLHunyuanImageRefiner.from_pretrained("hunyuanvideo-community/HunyuanImage-2.1-Refiner-Diffusers", subfolder="vae", mindspore_dtype=ms.bfloat16)
22+
```
23+
24+
::: mindone.diffusers.AutoencoderKLHunyuanImageRefiner
25+
26+
::: mindone.diffusers.models.autoencoders.vae.DecoderOutput
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4+
the License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License. -->
11+
12+
# HunyuanImageTransformer2DModel
13+
14+
A Diffusion Transformer model for [HunyuanImage2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1).
15+
16+
The model can be loaded with the following code snippet.
17+
18+
```python
19+
import mindspore as ms

from mindone.diffusers import HunyuanImageTransformer2DModel
20+
21+
transformer = HunyuanImageTransformer2DModel.from_pretrained("hunyuanvideo-community/HunyuanImage-2.1-Diffusers", subfolder="transformer", mindspore_dtype=ms.bfloat16)
22+
```
23+
24+
::: mindone.diffusers.HunyuanImageTransformer2DModel
25+
26+
::: mindone.diffusers.models.modeling_outputs.Transformer2DModelOutput
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License. -->
14+
15+
# HunyuanImage2.1
16+
17+
18+
HunyuanImage-2.1 is a 17B text-to-image model that is capable of generating 2K (2048 x 2048) resolution images.
19+
20+
HunyuanImage-2.1 comes in the following variants:
21+
22+
| model type | model id |
23+
|:----------:|:--------:|
24+
| HunyuanImage-2.1 | [hunyuanvideo-community/HunyuanImage-2.1-Diffusers](https://huggingface.co/hunyuanvideo-community/HunyuanImage-2.1-Diffusers) |
25+
| HunyuanImage-2.1-Distilled | [hunyuanvideo-community/HunyuanImage-2.1-Distilled-Diffusers](https://huggingface.co/hunyuanvideo-community/HunyuanImage-2.1-Distilled-Diffusers) |
26+
| HunyuanImage-2.1-Refiner | [hunyuanvideo-community/HunyuanImage-2.1-Refiner-Diffusers](https://huggingface.co/hunyuanvideo-community/HunyuanImage-2.1-Refiner-Diffusers) |
27+
28+
> [!TIP]
29+
> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
30+
31+
## HunyuanImage-2.1
32+
33+
HunyuanImage-2.1 applies [Adaptive Projected Guidance (APG)](https://huggingface.co/papers/2410.02416) combined with Classifier-Free Guidance (CFG) in the denoising loop. `HunyuanImagePipeline` has a `guider` component (read more about [Guider](../modular_diffusers/guiders.md)) and does not take a `guidance_scale` parameter at runtime. To change guider-related parameters, e.g., `guidance_scale`, you can update the `guider` configuration instead.
34+
35+
```python
36+
import mindspore as ms
37+
from mindone.diffusers import HunyuanImagePipeline
38+
39+
pipe = HunyuanImagePipeline.from_pretrained(
40+
"hunyuanvideo-community/HunyuanImage-2.1-Diffusers",
41+
mindspore_dtype=ms.bfloat16
42+
)
43+
```
44+
45+
You can inspect the `guider` object:
46+
47+
```py
48+
>>> pipe.guider
49+
AdaptiveProjectedMixGuidance {
50+
"_class_name": "AdaptiveProjectedMixGuidance",
51+
"_diffusers_version": "0.36.0.dev0",
52+
"adaptive_projected_guidance_momentum": -0.5,
53+
"adaptive_projected_guidance_rescale": 10.0,
54+
"adaptive_projected_guidance_scale": 10.0,
55+
"adaptive_projected_guidance_start_step": 5,
56+
"enabled": true,
57+
"eta": 0.0,
58+
"guidance_rescale": 0.0,
59+
"guidance_scale": 3.5,
60+
"start": 0.0,
61+
"stop": 1.0,
62+
"use_original_formulation": false
63+
}
64+
65+
State:
66+
step: None
67+
num_inference_steps: None
68+
timestep: None
69+
count_prepared: 0
70+
enabled: True
71+
num_conditions: 2
72+
momentum_buffer: None
73+
is_apg_enabled: False
74+
is_cfg_enabled: True
75+
```
76+
77+
To update the guider with a different configuration, use the `new()` method. For example, to generate an image with `guidance_scale=5.0` while keeping all other default guidance parameters:
78+
79+
```py
80+
import mindspore as ms
81+
from mindone.diffusers import HunyuanImagePipeline
82+
83+
pipe = HunyuanImagePipeline.from_pretrained(
84+
"hunyuanvideo-community/HunyuanImage-2.1-Diffusers",
85+
mindspore_dtype=ms.bfloat16
86+
)
87+
88+
# Update the guider configuration
89+
pipe.guider = pipe.guider.new(guidance_scale=5.0)
90+
91+
prompt = (
92+
"A cute, cartoon-style anthropomorphic penguin plush toy with fluffy fur, standing in a painting studio, "
93+
"wearing a red knitted scarf and a red beret with the word 'Tencent' on it, holding a paintbrush with a "
94+
"focused expression as it paints an oil painting of the Mona Lisa, rendered in a photorealistic photographic style."
95+
)
96+
97+
image = pipe(
98+
prompt=prompt,
99+
num_inference_steps=50,
100+
height=2048,
101+
width=2048,
102+
).images[0]
103+
image.save("image.png")
104+
```
105+
106+
107+
## HunyuanImage-2.1-Distilled
108+
109+
Use `distilled_guidance_scale` with the guidance-distilled checkpoint:
110+
111+
```py
112+
import mindspore as ms
113+
from mindone.diffusers import HunyuanImagePipeline
114+
pipe = HunyuanImagePipeline.from_pretrained("hunyuanvideo-community/HunyuanImage-2.1-Distilled-Diffusers", mindspore_dtype=ms.bfloat16)
115+
116+
prompt = (
117+
"A cute, cartoon-style anthropomorphic penguin plush toy with fluffy fur, standing in a painting studio, "
118+
"wearing a red knitted scarf and a red beret with the word 'Tencent' on it, holding a paintbrush with a "
119+
"focused expression as it paints an oil painting of the Mona Lisa, rendered in a photorealistic photographic style."
120+
)
121+
122+
out = pipe(
123+
prompt,
124+
num_inference_steps=8,
125+
distilled_guidance_scale=3.25,
126+
height=2048,
127+
width=2048,
128+
).images[0]
130+
131+
```
132+
133+
134+
::: mindone.diffusers.HunyuanImagePipeline
135+
136+
::: mindone.diffusers.HunyuanImageRefinerPipeline
137+
138+
::: mindone.diffusers.pipelines.hunyuan_image.pipeline_output.HunyuanImagePipelineOutput

mindone/diffusers/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
"configuration_utils": ["ConfigMixin"],
1818
"guiders": [
1919
"AdaptiveProjectedGuidance",
20+
"AdaptiveProjectedMixGuidance",
2021
"AutoGuidance",
22+
"BaseGuidance",
2123
"ClassifierFreeGuidance",
2224
"ClassifierFreeZeroStarGuidance",
2325
"FrequencyDecoupledGuidance",
@@ -61,6 +63,8 @@
6163
"AutoencoderKLAllegro",
6264
"AutoencoderKLCogVideoX",
6365
"AutoencoderKLCosmos",
66+
"AutoencoderKLHunyuanImage",
67+
"AutoencoderKLHunyuanImageRefiner",
6468
"AutoencoderKLHunyuanVideo",
6569
"AutoencoderKLLTXVideo",
6670
"AutoencoderKLMagvit",
@@ -90,6 +94,7 @@
9094
"HunyuanDiT2DControlNetModel",
9195
"HunyuanDiT2DModel",
9296
"HunyuanDiT2DMultiControlNetModel",
97+
"HunyuanImageTransformer2DModel",
9398
"HunyuanVideoFramepackTransformer3DModel",
9499
"HunyuanVideoTransformer3DModel",
95100
"I2VGenXLUNet",
@@ -204,6 +209,8 @@
204209
"HunyuanDiTControlNetPipeline",
205210
"HunyuanDiTPAGPipeline",
206211
"HunyuanDiTPipeline",
212+
"HunyuanImagePipeline",
213+
"HunyuanImageRefinerPipeline",
207214
"HunyuanSkyreelsImageToVideoPipeline",
208215
"HunyuanVideoFramepackPipeline",
209216
"HunyuanVideoImageToVideoPipeline",
@@ -411,7 +418,9 @@
411418
from .configuration_utils import ConfigMixin
412419
from .guiders import (
413420
AdaptiveProjectedGuidance,
421+
AdaptiveProjectedMixGuidance,
414422
AutoGuidance,
423+
BaseGuidance,
415424
ClassifierFreeGuidance,
416425
ClassifierFreeZeroStarGuidance,
417426
FrequencyDecoupledGuidance,
@@ -442,6 +451,8 @@
442451
AutoencoderKLAllegro,
443452
AutoencoderKLCogVideoX,
444453
AutoencoderKLCosmos,
454+
AutoencoderKLHunyuanImage,
455+
AutoencoderKLHunyuanImageRefiner,
445456
AutoencoderKLHunyuanVideo,
446457
AutoencoderKLLTXVideo,
447458
AutoencoderKLMagvit,
@@ -471,6 +482,7 @@
471482
HunyuanDiT2DControlNetModel,
472483
HunyuanDiT2DModel,
473484
HunyuanDiT2DMultiControlNetModel,
485+
HunyuanImageTransformer2DModel,
474486
HunyuanVideoFramepackTransformer3DModel,
475487
HunyuanVideoTransformer3DModel,
476488
I2VGenXLUNet,
@@ -596,6 +608,8 @@
596608
HunyuanDiTControlNetPipeline,
597609
HunyuanDiTPAGPipeline,
598610
HunyuanDiTPipeline,
611+
HunyuanImagePipeline,
612+
HunyuanImageRefinerPipeline,
599613
HunyuanSkyreelsImageToVideoPipeline,
600614
HunyuanVideoFramepackPipeline,
601615
HunyuanVideoImageToVideoPipeline,

mindone/diffusers/guiders/__init__.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,15 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from typing import Union
16-
15+
from ..utils import logging
1716
from .adaptive_projected_guidance import AdaptiveProjectedGuidance
17+
from .adaptive_projected_guidance_mix import AdaptiveProjectedMixGuidance
1818
from .auto_guidance import AutoGuidance
1919
from .classifier_free_guidance import ClassifierFreeGuidance
2020
from .classifier_free_zero_star_guidance import ClassifierFreeZeroStarGuidance
2121
from .frequency_decoupled_guidance import FrequencyDecoupledGuidance
22+
from .guider_utils import BaseGuidance
2223
from .perturbed_attention_guidance import PerturbedAttentionGuidance
2324
from .skip_layer_guidance import SkipLayerGuidance
2425
from .smoothed_energy_guidance import SmoothedEnergyGuidance
2526
from .tangential_classifier_free_guidance import TangentialClassifierFreeGuidance
26-
27-
GuiderType = Union[
28-
AdaptiveProjectedGuidance,
29-
AutoGuidance,
30-
ClassifierFreeGuidance,
31-
ClassifierFreeZeroStarGuidance,
32-
FrequencyDecoupledGuidance,
33-
PerturbedAttentionGuidance,
34-
SkipLayerGuidance,
35-
SmoothedEnergyGuidance,
36-
TangentialClassifierFreeGuidance,
37-
]

0 commit comments

Comments
 (0)