
Commit 3d085a2

addressed PR comments
1 parent 6d01ea0 commit 3d085a2


4 files changed: +14 −78 lines


docs/source/en/api/models/controlnet_sana.md

Lines changed: 1 addition & 6 deletions
@@ -24,17 +24,12 @@ The original codebase can be found at [NVlabs/Sana](https://github.com/NVlabs/Sana)
 ## Loading from the original format
 By default the [`SanaControlNetModel`] should be loaded with [`~ModelMixin.from_pretrained`]
 ```py
-from diffusers import SanaControlNetModel, SanaControlNetPipeline
+from diffusers import SanaControlNetModel
 import torch
 
 controlnet = SanaControlNetModel.from_pretrained(
     "ishan24/Sana_600M_1024px_ControlNet_diffusers",
 )
-pipe = SanaControlNetPipeline.from_pretrained(
-    "Efficient-Large-Model/Sana_600M_1024px_diffusers",
-    controlnet=controlnet,
-)
-pipe.to('cuda')
 ```
 
 ## SanaControlNetModel
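
The model doc now stops at loading the ControlNet itself; wiring it into a pipeline is left to the pipeline docs. A minimal sketch of that standalone load, assuming the same checkpoint; the explicit `torch_dtype` and the `.dtype` check are illustrative additions rather than part of the documented snippet:

```py
import torch
from diffusers import SanaControlNetModel

# Load only the ControlNet weights; no pipeline is needed at this point.
# torch_dtype is optional and shown here purely as an example.
controlnet = SanaControlNetModel.from_pretrained(
    "ishan24/Sana_600M_1024px_ControlNet_diffusers",
    torch_dtype=torch.float16,
)
print(controlnet.dtype)  # torch.float16
```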

docs/source/en/api/pipelines/controlnet_sana.md

Lines changed: 1 addition & 4 deletions
@@ -42,12 +42,9 @@ pipe = SanaControlNetPipeline.from_pretrained(
     "Efficient-Large-Model/Sana_600M_1024px_diffusers",
     variant="fp16",
     controlnet=controlnet,
-    torch_dtype=torch.float16,
+    torch_dtype={'default': torch.bfloat16, 'transformer': torch.float16},
 )
-
 pipe.to('cuda')
-pipe.vae.to(torch.bfloat16)
-pipe.text_encoder.to(torch.bfloat16)
 
 cond_image = load_image(
     "https://huggingface.co/ishan24/Sana_600M_1024px_ControlNet_diffusers/resolve/main/hed_example.png"

src/diffusers/pipelines/sana/pipeline_sana_controlnet.py

Lines changed: 11 additions & 9 deletions
@@ -367,13 +367,7 @@ def encode_prompt(
             prompt_embeds = prompt_embeds[0][:, select_index]
             prompt_attention_mask = prompt_attention_mask[:, select_index]
 
-        if self.transformer is not None:
-            dtype = self.transformer.dtype
-        elif self.text_encoder is not None:
-            dtype = self.text_encoder.dtype
-        else:
-            dtype = None
-
+        dtype = self.text_encoder.dtype
         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
 
         bs_embed, seq_len, _ = prompt_embeds.shape
@@ -406,6 +400,8 @@ def encode_prompt(
             negative_prompt_embeds = negative_prompt_embeds[0]
 
         if do_classifier_free_guidance:
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
 
@@ -956,6 +952,7 @@ def __call__(
             height, width = control_image.shape[-2:]
 
             control_image = self.vae.encode(control_image).latent
+            control_image = control_image.to(self.vae.dtype)
             control_image = control_image * self.vae.config.scaling_factor
 
         else:
@@ -992,12 +989,14 @@ def __call__(
                     continue
 
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                latent_model_input = latent_model_input.to(prompt_embeds.dtype)
 
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
+                timestep = t.expand(latent_model_input.shape[0])
 
                 # controlnet(s) inference
+                latent_model_input = latent_model_input.to(dtype=self.controlnet.dtype)
+                prompt_embeds = prompt_embeds.to(dtype=self.controlnet.dtype)
+                control_image = control_image.to(dtype=self.controlnet.dtype)
                 controlnet_block_samples = self.controlnet(
                     latent_model_input,
                     encoder_hidden_states=prompt_embeds,
@@ -1010,6 +1009,9 @@ def __call__(
                 )[0]
 
                 # predict noise model_output
+                latent_model_input = latent_model_input.to(dtype=self.transformer.dtype)
+                prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
+                controlnet_block_samples = controlnet_block_samples.to(dtype=self.transformer.dtype)
                 noise_pred = self.transformer(
                     latent_model_input,
                     encoder_hidden_states=prompt_embeds,
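
The pattern in these hunks is to cast each tensor to the dtype of the module that consumes it next, once before the controlnet forward pass and again before the transformer forward pass, so the two models can run in different precisions. A minimal, self-contained illustration using plain `nn.Linear` stand-ins (not diffusers code; diffusers models expose the equivalent information via `module.dtype`):

```py
import torch
import torch.nn as nn

# Stand-ins for components loaded in different precisions.
controlnet = nn.Linear(8, 8).to(torch.bfloat16)
transformer = nn.Linear(8, 8).to(torch.float16)

x = torch.randn(2, 8)  # fp32 latents straight from the scheduler

# Without these casts, mixing fp32/bf16/fp16 inputs and weights raises a
# dtype-mismatch RuntimeError, so each input is cast to the dtype of the
# module that will run next.
ctrl_out = controlnet(x.to(dtype=controlnet.weight.dtype))
noise_pred = transformer(ctrl_out.to(dtype=transformer.weight.dtype))
print(noise_pred.dtype)  # torch.float16
```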

tests/pipelines/sana/test_sana_controlnet.py

Lines changed: 1 addition & 59 deletions
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import gc
 import inspect
 import unittest
 
@@ -27,12 +26,8 @@
     SanaControlNetPipeline,
     SanaTransformer2DModel,
 )
-from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
-    backend_empty_cache,
     enable_full_determinism,
-    require_torch_accelerator,
-    slow,
     torch_device,
 )
 
@@ -79,6 +74,7 @@ def get_dummy_components(self):
             sample_size=32,
         )
 
+        torch.manual_seed(0)
         transformer = SanaTransformer2DModel(
             patch_size=1,
             in_channels=4,
@@ -329,57 +325,3 @@ def test_inference_batch_single_identical(self):
     def test_float16_inference(self):
         # Requires higher tolerance as model seems very sensitive to dtype
         super().test_float16_inference(expected_max_diff=0.08)
-
-
-@slow
-@require_torch_accelerator
-class SanaPipelineIntegrationTests(unittest.TestCase):
-    prompt = "A painting of a squirrel eating a burger."
-
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def test_sana_1024(self):
-        generator = torch.Generator("cpu").manual_seed(0)
-        controlnet = SanaControlNetModel.from_pretrained(
-            "ishan24/Sana_600M_1024px_ControlNet_diffusers", torch_dtype=torch.float16
-        )
-
-        pipe = SanaControlNetPipeline.from_pretrained(
-            "Efficient-Large-Model/Sana_600M_1024px_diffusers",
-            variant="fp16",
-            torch_dtype=torch.float16,
-            controlnet=controlnet,
-        )
-        pipe.vae.to(torch.bfloat16)
-        pipe.text_encoder.to(torch.bfloat16)
-        pipe.enable_model_cpu_offload(device=torch_device)
-        control_image = load_image(
-            "https://huggingface.co/ishan24/Sana_600M_1024px_ControlNet_diffusers/resolve/main/hed_example.png"
-        )
-
-        image = pipe(
-            prompt=self.prompt,
-            height=1024,
-            width=1024,
-            generator=generator,
-            num_inference_steps=20,
-            output_type="np",
-            control_image=control_image,
-        ).images[0]
-
-        image = image.flatten()
-        output_slice = np.concatenate((image[:16], image[-16:]))
-
-        # fmt: off
-        expected_slice = np.array([0.0427, 0.0789, 0.0662, 0.0464, 0.082, 0.0574, 0.0535, 0.0886, 0.0647, 0.0549, 0.0872, 0.0605, 0.0593, 0.0942, 0.0674, 0.0581, 0.0076, 0.0168, 0.0027, 0.0063, 0.0159, 0.0, 0.0071, 0.0198, 0.0034, 0.0105, 0.0212, 0.0, 0.0, 0.0166, 0.0042, 0.0125])
-        # fmt: on
-
-        self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-4))
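
Beyond deleting the slow integration test (and its now-unused imports), the only functional addition here is the `torch.manual_seed(0)` call before the dummy transformer is built, which pins its random initialization independently of the components created earlier. A minimal sketch of that pattern; the component names are placeholders, not the test's real modules:

```py
import torch
import torch.nn as nn

def get_dummy_components():
    torch.manual_seed(0)
    vae = nn.Linear(4, 4)          # stand-in for the dummy VAE

    # Re-seed so the transformer's init does not depend on how many random
    # draws the earlier components consumed.
    torch.manual_seed(0)
    transformer = nn.Linear(4, 4)  # stand-in for the dummy transformer
    return {"vae": vae, "transformer": transformer}
```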
