@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import gc
 import unittest

 import numpy as np
@@ -32,7 +33,13 @@
     WanAnimateTransformer3DModel,
 )

-from ...testing_utils import enable_full_determinism
+from ...testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin

@@ -75,21 +82,30 @@ def get_dummy_components(self):
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

         torch.manual_seed(0)
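+        # Tiny motion-encoder channel sizes keep the dummy transformer small and fast.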
+        channel_sizes = {"4": 16, "8": 16, "16": 16}
         transformer = WanAnimateTransformer3DModel(
             patch_size=(1, 2, 2),
             num_attention_heads=2,
             attention_head_dim=12,
             in_channels=36,
+            latent_channels=16,
             out_channels=16,
             text_dim=32,
             freq_dim=256,
             ffn_dim=32,
             num_layers=2,
             cross_attn_norm=True,
             qk_norm="rms_norm_across_heads",
-            rope_max_seq_len=32,
             image_dim=4,
-            pos_embed_seq_len=2 * (4 * 4 + 1),
+            rope_max_seq_len=32,
+            motion_encoder_channel_sizes=channel_sizes,
+            motion_encoder_size=16,
+            motion_style_dim=8,
+            motion_dim=4,
+            motion_encoder_dim=16,
+            face_encoder_hidden_dim=16,
+            face_encoder_num_heads=2,
+            inject_face_latents_blocks=2,
         )

         torch.manual_seed(0)
@@ -127,27 +143,29 @@ def get_dummy_inputs(self, device, seed=0):
         num_frames = 17
         height = 16
         width = 16
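+        # The face video gets its own spatial size, independent of the output resolution.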
+        face_height = 16
+        face_width = 16

-        pose_video = [Image.new("RGB", (height, width))] * num_frames
-        face_video = [Image.new("RGB", (height, width))] * num_frames
         image = Image.new("RGB", (height, width))
+        pose_video = [Image.new("RGB", (height, width))] * num_frames
+        face_video = [Image.new("RGB", (face_height, face_width))] * num_frames

         inputs = {
             "image": image,
             "pose_video": pose_video,
             "face_video": face_video,
             "prompt": "dance monkey",
             "negative_prompt": "negative",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 1.0,
             "height": height,
             "width": width,
-            "num_frames": num_frames,
-            "mode": "animation",
-            "num_frames_for_temporal_guidance": 1,
-            "max_sequence_length": 16,
+            "segment_frame_length": 77,  # TODO: can we set this to num_frames?
+            "num_inference_steps": 2,
+            "mode": "animate",
+            "prev_segment_conditioning_frames": 1,
+            "generator": generator,
+            "guidance_scale": 1.0,
             "output_type": "pt",
+            "max_sequence_length": 16,
         }
         return inputs

@@ -168,6 +186,26 @@ def test_inference(self):
         max_diff = np.abs(video - expected_video).max()
         self.assertLessEqual(max_diff, 1e10)

+    def test_inference_replacement(self):
+        """Test the pipeline in replacement mode with background and mask videos."""
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["mode"] = "replace"
+        num_frames = 17
+        height = 16
+        width = 16
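+        # "replace" mode additionally requires a background video and a mask video.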
+        inputs["background_video"] = [Image.new("RGB", (height, width))] * num_frames
+        inputs["mask_video"] = [Image.new("RGB", (height, width))] * num_frames
+
+        video = pipe(**inputs).frames[0]
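+        # frames[0] is (num_frames, channels, height, width) for "pt" output.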
+        self.assertEqual(video.shape, (17, 3, 16, 16))
+
     def test_inference_with_single_reference_image(self):
         """Test inference with a single reference image for additional context."""
         device = "cpu"
@@ -200,46 +238,22 @@ def test_inference_with_multiple_reference_image(self):
     def test_attention_slicing_forward_pass(self):
         pass

-    @unittest.skip("Errors out because passing multiple prompts at once is not yet supported by this pipeline.")
-    def test_encode_prompt_works_in_isolation(self):
-        pass

-    @unittest.skip("Batching is not yet supported with this pipeline")
-    def test_inference_batch_consistent(self):
-        pass
+@slow
+@require_torch_accelerator
+class WanAnimatePipelineIntegrationTests(unittest.TestCase):
+    prompt = "A painting of a squirrel eating a burger."

-    @unittest.skip("Batching is not yet supported with this pipeline")
-    def test_inference_batch_single_identical(self):
-        return super().test_inference_batch_single_identical()
+    def setUp(self):
+        super().setUp()
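+        # Release leftover host and accelerator memory before the test runs.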
+        gc.collect()
+        backend_empty_cache(torch_device)

-    @unittest.skip(
-        "AutoencoderKLWan encoded latents are always in FP32. This test is not designed to handle mixed dtype inputs"
-    )
-    def test_float16_inference(self):
-        pass
+    def tearDown(self):
+        super().tearDown()
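+        # Clean up again afterwards so subsequent tests start from an empty cache.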
+        gc.collect()
+        backend_empty_cache(torch_device)

-    @unittest.skip(
-        "AutoencoderKLWan encoded latents are always in FP32. This test is not designed to handle mixed dtype inputs"
-    )
-    def test_save_load_float16(self):
+    @unittest.skip("TODO: test needs to be implemented")
+    def test_wan_animate(self):
         pass
-
-    def test_inference_replacement_mode(self):
-        """Test the pipeline in replacement mode with background and mask videos."""
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["mode"] = "replacement"
-        num_frames = 17
-        height = 16
-        width = 16
-        inputs["background_video"] = [Image.new("RGB", (height, width))] * num_frames
-        inputs["mask_video"] = [Image.new("RGB", (height, width))] * num_frames
-
-        video = pipe(**inputs).frames[0]
-        self.assertEqual(video.shape, (17, 3, 16, 16))