Skip to content

Commit 36d1e6a

Browse files
authored
(feat) Example MMaDA: update performance in readme (#1377)
* print text token throughputs * print image generation throughputs * update readme * revert zero.py get_optimizer_param_tuples * update performance of zero2 training
1 parent 86e88c0 commit 36d1e6a

File tree

5 files changed

+62
-11
lines changed

5 files changed

+62
-11
lines changed

examples/mmada/README.md

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,12 @@ Here is the development plan of the project:
4040

4141
| MindSpore | Ascend Driver | Firmware | CANN toolkit/kernel |
4242
|:---------:|:-------------:|:-----------:|:-------------------:|
43-
| 2.6.0 | 24.1.RC3 | 7.6.0.1.220 | 8.0.RC3.beta1 |
43+
| 2.6.0/2.7.0 | 24.1.RC3.b080 | 7.5.T11.0.B088 | 8.1.RC1 |
4444

4545
</div>
4646

4747
1. Install
48-
[CANN 8.0.RC3.beta1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.beta1)
48+
[CANN 8.1.RC1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.1.RC1)
4949
and MindSpore according to the [official instructions](https://www.mindspore.cn/install).
5050
2. Install requirements
5151
```shell
@@ -98,7 +98,7 @@ python generate.py
9898

9999
### 2. MultiModal Generation
100100

101-
For multiModal generation, please run:
101+
For multimodal generation, please run:
102102
```
103103
python3 inference_mmu.py config=configs/mmada_demo.yaml mmu_image_root=./mmu_validation question='Please describe this image in detail.'
104104
```
@@ -109,10 +109,28 @@ The outputs are stored locally.
109109
For text-to-image generation, please run:
110110
```
111111
python3 inference_t2i.py config=configs/mmada_demo.yaml batch_size=1 validation_prompts_file=validation_prompts/text2image_prompts.txt guidance_scale=3.5 generation_timesteps=15
112-
mode='t2i'
113112
```
114113
The outputs are stored locally.
115114
115+
### Performance
116+
117+
The following experiments are tested on Ascend Atlas 800T A2 machines with mindspore **2.7.0** under **pynative** mode:
118+
119+
| model | # card(s) | batch size | task | throughput (token/s) |
120+
|:-:|:-:|:-:|:-:|:-:|
121+
| MMaDA-8B-Base | 1 | 1 | text generation | 12.56 |
122+
| MMaDA-8B-Base | 1 | 1 | mmu generation | 13.48 |
123+
| MMaDA-8B-Base | 1 | 1 | text-to-image generation| 167.50 |
124+
125+
The following experiments are tested on Ascend Atlas 800T A2 machines with mindspore **2.6.0** under **pynative** mode:
126+
127+
| model | # card(s) | batch size | task | throughput (token/s) |
128+
|:-:|:-:|:-:|:-:|:-:|
129+
| MMaDA-8B-Base | 1 | 1 | text generation | 12.53 |
130+
| MMaDA-8B-Base | 1 | 1 | mmu generation | 13.50 |
131+
| MMaDA-8B-Base | 1 | 1 | text-to-image generation| 168.60 |
132+
133+
116134
## 🔧 Training
117135
118136
@@ -164,6 +182,21 @@ msrun --bind_core=True --worker_num=8 --local_worker_num=8 --master_port=9000 --
164182
python training/train_mmada_stage2.py config=configs/mmada_finetune_artwork.yaml
165183
```
166184

185+
### Performance
186+
187+
The following experiments are tested on Ascend Atlas 800T A2 machines with mindspore **2.7.0** under **pynative** mode:
188+
189+
| model | # card(s) | batch size | parallelism |task | per batch time (seconds) |
190+
|:-:|:-:|:-:|:-:|:-:|:-:|
191+
| MMaDA-8B-Base | 8 | 4 | zero2 | finetune | 1.29 |
192+
193+
The following experiments are tested on Ascend Atlas 800T A2 machines with mindspore **2.6.0** under **pynative** mode:
194+
195+
| model | # card(s) | batch size | parallelism | task | per batch time (seconds) |
196+
|:-:|:-:|:-:|:-:|:-:|:-:|
197+
| MMaDA-8B-Base | 8 | 4 | zero2 | finetune | 1.30 |
198+
199+
167200

168201
## 🤝 Acknowledgments
169202

examples/mmada/generate.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ def main():
168168
remasking="low_confidence",
169169
)
170170
print(f"Inference time: {time() - infer_start:.3f}s")
171+
print(f"Throughput: {out.shape[1] / (time() - infer_start):.3f} token/s")
171172
print(tokenizer.batch_decode(out[:, input_ids.shape[1] :], skip_special_tokens=True))
172173

173174

examples/mmada/inference_mmu.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# limitations under the License.
1818

1919
import os
20+
from time import time
2021

2122
os.environ["TOKENIZERS_PARALLELISM"] = "true"
2223
os.environ["SAFETENSORS_WEIGHTS_NAME"] = "pytorch_model.safetensors" # vq_model
@@ -122,6 +123,8 @@ def draw_caption_on_image(
122123
responses = ["" for i in range(len(file_list))]
123124
images = []
124125
config.question = config.question.split(" *** ")
126+
127+
throughputs = []
125128
for i, file_name in enumerate(tqdm(file_list)):
126129
image_path = os.path.join(config.mmu_image_root, file_name)
127130
image_ori = Image.open(image_path).convert("RGB")
@@ -152,10 +155,12 @@ def draw_caption_on_image(
152155
],
153156
dim=1,
154157
)
158+
infer_start = time()
155159
output_ids = model.mmu_generate(input_ids, max_new_tokens=1024, steps=512, block_length=1024)
156160
text = uni_prompting.text_tokenizer.batch_decode(
157161
output_ids[:, input_ids.shape[1] :], skip_special_tokens=True
158162
)
163+
throughputs.append(output_ids.shape[1] / (time() - infer_start))
159164
print(text[0])
160165
responses[i] += text[0]
161166

@@ -169,3 +174,4 @@ def draw_caption_on_image(
169174
draw_caption_on_image(pil_images, responses, output_dir, file_list=file_list)
170175

171176
print("Generated captions are saved in", output_dir)
177+
print(f"Average throughput: {np.mean(throughputs):.3f} token/s")

examples/mmada/inference_t2i.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# limitations under the License.
1818

1919
import os
20+
from time import time
2021

2122
os.environ["TOKENIZERS_PARALLELISM"] = "true"
2223
import numpy as np
@@ -116,6 +117,8 @@ def draw_caption_on_image(
116117
with open(config.dataset.params.validation_prompts_file, "r") as f:
117118
validation_prompts = f.read().splitlines()
118119
output_images, output_responses = [], []
120+
print("Generating images with batch size: ", config.training.batch_size)
121+
throughputs = []
119122
for step in tqdm(range(0, len(validation_prompts), config.training.batch_size)):
120123
prompts = validation_prompts[step : step + config.training.batch_size]
121124

@@ -133,7 +136,7 @@ def draw_caption_on_image(
133136
mask_schedule = get_mask_schedule(schedule, **args)
134137
else:
135138
mask_schedule = get_mask_schedule(config.training.get("mask_schedule", "cosine"))
136-
139+
infer_start = time()
137140
gen_token_ids = model.t2i_generate(
138141
input_ids=input_ids,
139142
uncond_input_ids=uncond_input_ids,
@@ -153,6 +156,7 @@ def draw_caption_on_image(
153156
images = vq_model.decode_code(gen_token_ids)
154157
output_images.append(images)
155158
output_responses.extend(prompts)
159+
throughputs.append(gen_token_ids.shape[1] / config.training.batch_size / (time() - infer_start))
156160

157161
images = mint.cat(output_images, dim=0)
158162
images = mint.clamp((images + 1.0) / 2.0, min=0.0, max=1.0)
@@ -162,5 +166,5 @@ def draw_caption_on_image(
162166
output_dir = "./inference_t2i_outputs/"
163167
os.makedirs(output_dir, exist_ok=True)
164168
draw_caption_on_image(pil_images, output_responses, output_dir)
165-
169+
print(f"Average throughput: {np.mean(throughputs):.3f} token/s")
166170
print("Generated images are saved in ", output_dir)

mindone/trainers/zero.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -271,12 +271,19 @@ def split_param(self, param):
271271

272272
def get_optimizer_param_tuples(self):
273273
param_tuples = []
274-
for attr in self.optimizer.__dict__:
275-
if isinstance(getattr(self.optimizer, attr), ms.ParameterTuple):
276-
if attr in ["_parameters", "parameters"]:
274+
if ms.get_context("mode") == ms.PYNATIVE_MODE:
275+
for name in self.optimizer._params_list:
276+
if name in ["_parameters", "parameters"]:
277277
continue
278-
_logger.debug(f"Add optimizer param_tuples {attr}")
279-
param_tuples.append(getattr(self.optimizer, attr))
278+
_logger.debug(f"Add optimizer param_tuples {name}")
279+
param_tuples.append(getattr(self.optimizer, name))
280+
else:
281+
for attr in self.optimizer.__dict__:
282+
if isinstance(getattr(self.optimizer, attr), ms.ParameterTuple):
283+
if attr in ["_parameters", "parameters"]:
284+
continue
285+
_logger.debug(f"Add optimizer param_tuples {attr}")
286+
param_tuples.append(getattr(self.optimizer, attr))
280287
return param_tuples
281288

282289
def dump_params_split_info(self, params_split_info):

0 commit comments

Comments
 (0)