
Commit 5b4dad3

fix mllama ut (#1735)
1 parent 5b72c93 commit 5b4dad3


4 files changed: 46 additions & 80 deletions

mindnlp/transformers/generation/candidate_generator.py

Lines changed: 8 additions & 1 deletion
@@ -412,9 +412,16 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en
         model_kwargs[mask_key] = mask[:, :mask_length_diff]
     elif mask_length_diff > 0:
         model_kwargs[mask_key] = ops.cat([mask, ops.ones((mask.shape[0], mask_length_diff), dtype=mask.dtype)], dim=-1)
+    if "cross_attention_mask" in model_kwargs:
+        # Mllama case is special and has another mask for cross attention model
+        cross_mask = model_kwargs["cross_attention_mask"]
+        if mask_length_diff < 0:
+            model_kwargs["cross_attention_mask"] = cross_mask[:, :mask_length_diff]
+        elif mask_length_diff > 0:
+            new_mask = cross_mask[:, -1:, :, :].tile((1, mask_length_diff, 1, 1))
+            model_kwargs["cross_attention_mask"] = ops.cat([cross_mask, new_mask], dim=1)
     return model_kwargs

-
 def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
     """Expands or crops the model's token_type_ids for decoding purposes, to the defined length"""
     if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:

mindnlp/transformers/generation/utils.py

Lines changed: 1 addition & 1 deletion
@@ -2388,7 +2388,7 @@ def _dola_decoding(
         this_peer_finished = False

         # prepare layers for DoLa decoding
-        final_layer = self.config.num_hidden_layers
+        final_layer = self.config.get_text_config().num_hidden_layers
         # if the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer,
         # as the early exit from word embeddings will become identity function
         # if the model is really shallow (<=2 layers), we use the 1st layer if it's not the final layer and the 0-th
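
MllamaConfig is a composite config, so num_hidden_layers lives on the nested text config rather than on the top-level object; reading it through get_text_config() keeps DoLa decoding working for both plain text models and multimodal ones. A rough standalone sketch of the lookup difference, using made-up stand-in classes rather than the real config API:

class StubTextConfig:
    def __init__(self, num_hidden_layers):
        self.num_hidden_layers = num_hidden_layers

class StubMultimodalConfig:
    # composite config: the layer count only exists on the nested text config
    def __init__(self, text_config):
        self.text_config = text_config

    def get_text_config(self):
        return self.text_config

config = StubMultimodalConfig(StubTextConfig(num_hidden_layers=40))
# config.num_hidden_layers would raise AttributeError on this stub,
# while the indirection works for composite and text-only configs alike:
final_layer = config.get_text_config().num_hidden_layers
print(final_layer)  # 40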

mindnlp/transformers/models/mllama/modeling_mllama.py

Lines changed: 3 additions & 1 deletion
@@ -883,7 +883,7 @@ class MllamaPreTrainedModel(PreTrainedModel):
     _supports_cache_class = True
     _supports_static_cache = False
     # _supports_sdpa = True
-    # _supports_quantized_cache = True
+    _supports_quantized_cache = False

     def _init_weights(self, module):
         std = self.config.get_text_config().initializer_range
@@ -1515,6 +1515,8 @@ def prepare_inputs_for_generation(


 class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
+    _supports_quantized_cache = False  # quant cache not supported in encoder-decoder setting
+
     def __init__(self, config: MllamaConfig):
         super().__init__(config)
         self.vocab_size = config.text_config.vocab_size
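
Setting _supports_quantized_cache = False on both the base class and MllamaForConditionalGeneration relies on ordinary class-attribute lookup: any generation code that checks the flag on a Mllama class or instance now sees False and skips the quantized-cache path. A tiny hypothetical illustration of that lookup, not the actual GenerationMixin check:

class Base:
    _supports_quantized_cache = True   # hypothetical default on a base class

class Model(Base):
    _supports_quantized_cache = False  # subclass override takes precedence

def may_use_quantized_cache(model):
    # instance -> class -> MRO lookup; a missing attribute defaults to False
    return getattr(model, "_supports_quantized_cache", False)

print(may_use_quantized_cache(Model()))  # False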

tests/ut/transformers/models/mllama/test_modeling_mllama.py

Lines changed: 34 additions & 77 deletions
@@ -131,14 +131,6 @@ def setUp(self):
         self.model_tester = MllamaText2TextModelTester(self)
         self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True)

-    @unittest.skip(reason="The outputs don't match, no idea why")
-    def test_beam_search_low_memory(self):
-        pass
-
-    @unittest.skip(reason="Quanto test is borken")
-    def test_generate_with_quant_cache(self):
-        pass
-

 class MllamaVisionText2TextModelTester:
     def __init__(
@@ -201,6 +193,7 @@ def __init__(
         self.image_size = 224
         self.max_num_images = 1
         self.max_image_tiles = 4
+        self.image_length = 904

     def get_config(self):
         return MllamaConfig(
@@ -319,86 +312,50 @@ def test_inputs_embeds_matches_input_ids(self):
         out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
         self.assertTrue(ops.allclose(out_embeds, out_ids))

-    @unittest.skip(reason="Static cache not supported")
-    def test_static_cache_matches_dynamic(self):
-        # TypeError: list indices must be integers or slices, not tuple
-        # TODO: @raushan, please look into this for new cache format
-        pass
-
-    @unittest.skip(reason="Mllama has dynamic control flow which is not yet supported by compile")
-    def test_generate_compile_fullgraph(self):
-        pass
+    def _check_attentions_for_generate(
+        self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1
+    ):
+        # Mllama has cross attention layers and those have a different shape than normal attention layers
+        self.assertIsInstance(attentions, tuple)
+        self.assertListEqual(
+            [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions)
+        )
+        self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups)
+        cross_attention_layers = self.model_tester.text_config["cross_attention_layers"]
+        for idx, iter_attentions in enumerate(attentions):
+            tgt_len = min_length + idx if not use_cache else 1
+            src_len = min_length + idx
+            expected_shape = (
+                batch_size * num_beam_groups,
+                config.num_attention_heads,
+                tgt_len,
+                src_len,
+            )
+            expected_shape_cross = (
+                batch_size * num_beam_groups,
+                config.num_attention_heads,
+                tgt_len,
+                self.model_tester.image_length,
+            )
+            expected_shapes = [
+                expected_shape if layer_idx not in cross_attention_layers else expected_shape_cross
+                for layer_idx in range(len(iter_attentions))
+            ]
+            self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes)

-    @unittest.skip(reason="The outputs don't match, no idea why")
-    def test_beam_search_low_memory(self):
-        pass

-    @unittest.skip(reason="Mllama is not yet supported by compile")
-    def test_sdpa_can_compile_dynamic(self):
-        # TODO: look into this, AttributeError("'tensor' object has no attribute '__pow__'")
-        # relevant issue: https://github.com/pytorch/pytorch/issues/133166
+    @unittest.skip(reason="The test itself is broken") # TODO @zucchini-nlp
+    def test_generate_with_quant_cache(self):
         pass

     @unittest.skip(reason="The test itself is broken") # TODO @zucchini-nlp
-    def test_generate_with_quant_cache(self):
+    def test_beam_search_low_memory(self):
         pass

     @unittest.skip(reason="AssertionError: Items in the second set but not the first: might be a setting issue")
     def test_model_parallelism(self):
         pass

-    @unittest.skip(reason="Failing test, need to fix")
-    def test_compile_cuda_graph_time(self):
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_torch_compile_fullgraph(self):
-        pass
-
-    @unittest.skip(reason="Device side assert triggered")
-    def test_assisted_decoding_with_num_logits_to_keep(self):
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_beam_sample_generate_dict_output():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_beam_search_generate_dict_output():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_constrained_beam_search_generate_dict_output():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_dola_decoding_sample():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_generate_methods_with_num_logits_to_keep():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_greedy_generate_dict_outputs():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_group_beam_search_generate_dict_output():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_model_parallel_beam_search():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_new_cache_format_2():
-        pass
-
-    @unittest.skip(reason="Failing test, need to fix")
-    def test_sample_generate_dict_output():
-        pass
-

 @require_mindspore
 class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
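
The new _check_attentions_for_generate override accounts for Mllama's cross-attention layers, whose attention maps attend over the flattened vision sequence (image_length = 904 in the tester) instead of the growing text sequence. A short sketch of how the expected shapes are assembled per generation step, with made-up sizes for illustration:

# Illustrative sizes only; the real values come from the model tester and config.
batch_size, num_heads = 2, 4
min_length, use_cache = 5, True
cross_attention_layers = [3, 8]   # layer indices that use cross attention
image_length = 904                # flattened vision sequence length
num_layers = 10

for idx in range(3):              # three decoding steps
    tgt_len = 1 if use_cache else min_length + idx
    src_len = min_length + idx
    self_attn_shape = (batch_size, num_heads, tgt_len, src_len)
    cross_attn_shape = (batch_size, num_heads, tgt_len, image_length)
    expected = [
        cross_attn_shape if layer_idx in cross_attention_layers else self_attn_shape
        for layer_idx in range(num_layers)
    ]
    print(idx, expected[0], expected[3])  # self-attention vs cross-attention shape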
