diff --git a/examples/opensora_hpcai/README.md b/examples/opensora_hpcai/README.md index 410c3f145e..24a3008c3e 100644 --- a/examples/opensora_hpcai/README.md +++ b/examples/opensora_hpcai/README.md @@ -27,6 +27,8 @@ This repository is built on the models and code released by HPC-AI Tech. We are | mindspore | ascend driver | firmware | cann toolkit/kernel | |:---------:|:-------------:|:-----------:|:-------------------:| | 2.5.0 | 24.0.0 | 7.5.0.3.220 | 8.0.0.beta1 | +| 2.6.0 | 24.1.rc3 | 7.7.0.1.238 | 8.1.RC1 | +| 2.7.0 | 24.1.rc3 | 7.7.0.1.238 | 8.2.RC1 | @@ -413,7 +415,7 @@ Experiments are conducted on Ascend Atlas 800T A2 machines with MindSpore 2.5.0 ```shell # OSv1.2 -python scripts/inference.py --config configs/opensora-v1-2/inference/sample_iv2v.yaml --ckpt_path /path/to/your/opensora-v1-1.ckpt +python scripts/inference.py --config configs/opensora-v1-2/inference/sample_iv2v.yaml --ckpt_path /path/to/your/opensora-v1-2.ckpt # OSv1.1 python scripts/inference.py --config configs/opensora-v1-1/inference/sample_iv2v.yaml --ckpt_path /path/to/your/opensora-v1-1.ckpt ``` @@ -438,13 +440,13 @@ python scripts/inference.py --config configs/opensora-v1-1/inference/sample_t2v. We evaluate the inference performance of text-to-video generation by measuring the average sampling time per step and the total sampling time of a video. -All experiments are tested on Ascend Atlas 800T A2 machines with mindspore 2.3.1 graph mode. +All experiments were run on Ascend Atlas 800T A2 machines with MindSpore 2.7.0 in graph mode. 
| model name | cards | batch size | resolution | jit level | precision | scheduler | step | graph compile | s/step | s/video | recipe | |:-----------:|:-----:|:----------:|:------------:|:---------:|:---------:|:---------:|:----:|:-------------:|:------:|:-------:|:-------------------------------------------------------:| -| STDiT2-XL/2 | 1 | 1 | 16x640x360 | O0 | bf16 | DDPM | 100 | 1~2 mins | 1.56 | 156.00 | [yaml](configs/opensora-v1-1/inference/sample_t2v.yaml) | -| STDiT3-XL/2 | 1 | 1 | 51x720x1280 | O0 | bf16 | RFlow | 30 | 1~2 mins | 5.88 | 176.40 | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) | -| STDiT3-XL/2 | 1 | 1 | 102x720x1280 | O0 | bf16 | RFlow | 30 | 1~2 min | 13.71 | 411.30 | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) | +| STDiT2-XL/2 | 1 | 1 | 16x640x360 | O0 | bf16 | DDPM | 100 | 1~2 mins | 1.56 | 156.0 | [yaml](configs/opensora-v1-1/inference/sample_t2v.yaml) | +| STDiT3-XL/2 | 1 | 1 | 51x720x1280 | O0 | bf16 | RFlow | 30 | 1~2 mins | 4.83 | 155.4 | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) | +| STDiT3-XL/2 | 1 | 1 | 102x720x1280 | O0 | bf16 | RFlow | 30 | 1~2 mins | 8.81 | 286.9 | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) | @@ -778,13 +780,14 @@ Here ✅ means that the data is seen during training, and 🆗 means although no We evaluate the training performance of Open-Sora v1.2 on the MixKit dataset with high-resolution videos (1080P, duration 12s to 100s). -All experiments are tested on Ascend Atlas 800T A2 machines with mindspore 2.3.1 graph mode. 
-| model name | cards | batch size | resolution | precision | sink | jit level | graph compile | s/step | recipe | -| :--: | :--: | :--: | :--: | :--: | :--: | :--: |:--: | :--: | :--: | -| STDiT3-XL/2 | 8 | 1 | 51x720x1280| bf16 | ON | O1 | 12 mins | 14.23 | [yaml](configs/opensora-v1-2/train/train_720x1280x51.yaml) -| STDiT3-XL/2 | 8 | dynamic | stage 1 | bf16 | OFF | O1 | 22 mins | 13.17 | [yaml](configs/opensora-v1-2/train/train_stage1_ms.yaml) -| STDiT3-XL/2 | 8 | dynamic | stage 2 | bf16 | OFF | O1 | 22 mins | 31.04 | [yaml](configs/opensora-v1-2/train/train_stage2_ms.yaml) -| STDiT3-XL/2 | 8 | dynamic | stage 3 | bf16 | OFF | O1 | 22 mins | 31.17 | [yaml](configs/opensora-v1-2/train/train_stage3_ms.yaml) +All experiments were run on Ascend Atlas 800T A2 machines with MindSpore 2.7.0 in graph mode. + +| model name | cards | batch size | resolution | precision | jit level | graph compile | s/step | recipe | +|:-----------:|:-----:|:----------:|:-----------:|:---------:|:---------:|:-------------:|:------:|:----------------------------------------------------------:| +| STDiT3-XL/2 | 8 | 1 | 51x720x1280 | bf16 | O1 | 100 s | 11.24 | [yaml](configs/opensora-v1-2/train/train_720x1280x51.yaml) | +| STDiT3-XL/2 | 8 | dynamic | stage 1 | bf16 | O1 | 14 mins | 13.17 | [yaml](configs/opensora-v1-2/train/train_stage1_ms.yaml) | +| STDiT3-XL/2 | 8 | dynamic | stage 2 | bf16 | O1 | 14 mins | 26.04 | [yaml](configs/opensora-v1-2/train/train_stage2_ms.yaml) | +| STDiT3-XL/2 | 8 | dynamic | stage 3 | bf16 | O1 | 14 mins | 27.83 | [yaml](configs/opensora-v1-2/train/train_stage3_ms.yaml) | Note that the step time of dynamic training can be influenced by the resolution and duration distribution of the source videos. 
diff --git a/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml b/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml index 01432dc103..577715e302 100644 --- a/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml +++ b/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml @@ -28,7 +28,7 @@ loop: 1 condition_frame_length: 5 # ms -jit_level: O0 +jit_level: O1 captions: - "Snow falling over multiple houses and trees on winter landscape against night sky. christmas festivity and celebration concept" diff --git a/examples/opensora_hpcai/opensora/models/layers/blocks.py b/examples/opensora_hpcai/opensora/models/layers/blocks.py index e84edd22ad..b082a9eedf 100644 --- a/examples/opensora_hpcai/opensora/models/layers/blocks.py +++ b/examples/opensora_hpcai/opensora/models/layers/blocks.py @@ -177,7 +177,7 @@ def construct(self, x, cond, mask=None): # 2+: mask adaptation for multi-head attention if mask is not None: # flip mask, since ms FA treats 1 as discard, 0 as retain. - mask = 1 - mask.to(ms.int32) + mask = 1 - mask # 3. attn compute if self.enable_flash_attention: @@ -266,7 +266,7 @@ def construct(self, x: Tensor, cond: Tensor, mask: Optional[Tensor] = None) -> T # 2+: mask adaptation for multi-head attention if mask is not None: # flip mask, since ms FA treats 1 as discard, 0 as retain. - mask = 1 - mask.to(ms.int32) + mask = 1 - mask # 3. attn compute if self.enable_flash_attention: @@ -274,7 +274,7 @@ def construct(self, x: Tensor, cond: Tensor, mask: Optional[Tensor] = None) -> T # (b n_k) -> (b 1 1 n_k), will be broadcast according to qk sim, e.g. (b num_heads n_q n_k) mask = mask[:, None, None, :] # (b 1 1 n_k) -> (b 1 n_q n_k) - mask = self.repeat_interleave(mask.to(ms.int32), int(q.shape[1]), axis=-2) + mask = self.repeat_interleave(mask, int(q.shape[1]), -2) x = self.flash_attention(q, k, v, mask=mask) # FA attn_mask def: retention and 1 indicates discard. 
Input tensor of shape :math:`(B, N1, S1, S2)`, `(B, 1, S1, S2)` `(S1, S2)` @@ -384,7 +384,7 @@ def construct(self, x, mask=None, freqs_cis: Optional[Tensor] = None): # mask process if mask is not None: - mask = 1 - mask.to(ms.int32) + mask = 1 - mask if self.enable_flash_attention: if mask is not None: @@ -500,8 +500,8 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine: bool = True, self.gamma = Parameter(initializer("ones", normalized_shape, dtype=dtype)) self.beta = Parameter(initializer("zeros", normalized_shape, dtype=dtype)) else: - self.gamma = ops.ones(normalized_shape, dtype=dtype) - self.beta = ops.zeros(normalized_shape, dtype=dtype) + self.gamma = Tensor(np.ones(normalized_shape, dtype=np.float32)) + self.beta = Tensor(np.zeros(normalized_shape, dtype=np.float32)) def construct(self, x: Tensor): normalized_shape = x.shape[-1:] @@ -592,10 +592,7 @@ def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None): self.norm_final = LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) # (1152, 4*8) self.linear = nn.Dense(hidden_size, num_patch * out_channels, has_bias=True) - # self.scale_shift_table = Parameter((ops.randn(2, hidden_size, dtype=ms.float32) / hidden_size**0.5).astype(ms.float32)) - self.scale_shift_table = Parameter( - ms.Tensor((np.random.randn(2, hidden_size) / hidden_size**0.5), dtype=ms.float32) - ) + self.scale_shift_table = Parameter(np.random.randn(2, hidden_size).astype(np.float32) / hidden_size**0.5) self.out_channels = out_channels self.d_t = d_t self.d_s = d_s @@ -614,11 +611,13 @@ def construct( T = self.d_t if S is None: S = self.d_s - shift, scale = self.chunk(self.scale_shift_table[None] + t[:, None], 2, 1) + + scale_shift_table = self.scale_shift_table.to(x.dtype) + shift, scale = self.chunk(scale_shift_table[None] + t[:, None], 2, 1) x = t2i_modulate(self.norm_final(x), shift, scale) if frames_mask is not None: - shift_zero, scale_zero = self.chunk(self.scale_shift_table[None] + t0[:, 
None], 2, 1) + shift_zero, scale_zero = self.chunk(scale_shift_table[None] + t0[:, None], 2, 1) x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero) x = t_mask_select(frames_mask, x, x_zero, T, S) @@ -639,9 +638,9 @@ def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU, tok in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0 ) - y_embedding = ops.randn(token_num, in_channels) / in_channels**0.5 + y_embedding = np.random.randn(token_num, in_channels).astype(np.float32) / in_channels**0.5 # just for token dropping replacement, not learnable - self.y_embedding = Parameter(Tensor(y_embedding, dtype=ms.float32), requires_grad=False) + self.y_embedding = Parameter(y_embedding, requires_grad=False) self.uncond_prob = uncond_prob @@ -656,9 +655,7 @@ def token_drop(self, caption, force_drop_ids=None): # manually expand dims to avoid infer-shape bug in ms2.3 daily caption = ops.where( - drop_ids[:, None, None, None], - self.y_embedding[None, None, :, :].to(caption.dtype), - caption, + drop_ids[:, None, None, None], self.y_embedding[None, None, :, :].to(caption.dtype), caption ) return caption diff --git a/examples/opensora_hpcai/opensora/models/layers/operation_selector.py b/examples/opensora_hpcai/opensora/models/layers/operation_selector.py index 269f3966c3..41b5d929a7 100644 --- a/examples/opensora_hpcai/opensora/models/layers/operation_selector.py +++ b/examples/opensora_hpcai/opensora/models/layers/operation_selector.py @@ -1,6 +1,6 @@ import mindspore as ms from mindspore import mint, ops -from mindspore.ops.function.array_func import chunk_ext, repeat_interleave_ext +from mindspore.ops.function.array_func import chunk_ext use_dynamic_ops = False @@ -15,50 +15,13 @@ def check_dynamic_mode(): return use_dynamic_ops -def repeat_interleave_ext_v2(input, repeats, axis=None): - # A more efficient implementation for replacing mint.repeat_interleave_ext - if isinstance(repeats, 
ms.Tensor): - if repeats.ndim > 1: - raise ValueError(f"repeats must be int, but get Tensor and ndim > 1, repeats.ndim {repeats.ndim}") - else: - repeats = int(repeats) - if isinstance(repeats, (tuple, list)): - if len(repeats) > 1: - raise ValueError(f"repeats must be int, but get list and len > 1, len(repeats) {len(repeats)}") - else: - repeats = repeats[0] - if not isinstance(repeats, int): - raise ValueError(f"repeats must be int, but get {repeats}") - if axis is None: - input = input.reshape[-1] - axis = 0 - - if not isinstance(axis, int): - raise ValueError(f"axis must be int, but get {axis}") - axis = axis + input.ndim if axis < 0 else axis - x_shape = input.shape - tile_axis = [1] - y_shape = list(x_shape) - y_shape[axis] = -1 - for i in range(1, input.ndim + 1): - if i == axis + 1: - tile_axis.append(repeats) - else: - tile_axis.append(1) - input = ops.expand_dims(input, axis + 1) - - return mint.tile(input, tuple(tile_axis)).reshape(y_shape) - - def get_repeat_interleave_op(): mode = ms.get_context("mode") if (mode == 0) and (not check_dynamic_mode()): # provide better performance for static shape in graph mode return ops.repeat_interleave else: - # FIXME: check overflow for v2 - # return repeat_interleave_ext_v2 - return repeat_interleave_ext + return mint.repeat_interleave def get_chunk_op(): diff --git a/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py b/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py index 304481d2ee..406b996eb0 100644 --- a/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py +++ b/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py @@ -28,7 +28,7 @@ def rotate_half(x: Tensor) -> Tensor: def apply_rotary_emb(freqs: Parameter, t: Tensor, scale: float = 1.0, seq_dim: int = -2) -> Tensor: # FIXME: start_index is always 0 in OS1.2 and ops.concat doesn't support empty elements. 
OS1.x future versions may need start_index > 0 # t, t_right = t[..., start_index:end_index], t[..., end_index:] - t = (t * freqs.cos().astype(t.dtype) * scale) + (rotate_half(t) * freqs.sin().astype(t.dtype) * scale) + t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale) return t @@ -139,7 +139,7 @@ def get_axial_freqs(self, *dims): raise NotImplementedError def construct(self, t: Tensor, seq_len=None, offset=0) -> Tensor: - freqs = t.astype(self.freqs.dtype)[..., None] * self.freqs + freqs = t[..., None] * self.freqs.to(t.dtype) return self.repeat_interleave(freqs, 2, -1) # ... n -> ... (n r), r = 2 diff --git a/examples/opensora_hpcai/opensora/models/stdit/stdit3.py b/examples/opensora_hpcai/opensora/models/stdit/stdit3.py index d5a1ac4032..12f7b406de 100644 --- a/examples/opensora_hpcai/opensora/models/stdit/stdit3.py +++ b/examples/opensora_hpcai/opensora/models/stdit/stdit3.py @@ -102,13 +102,14 @@ def construct( ) -> Tensor: # prepare modulate parameters B, N, C = x.shape + scale_shift_table = self.scale_shift_table.to(x.dtype) shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.chunk( - self.scale_shift_table[None] + t.reshape(B, 6, -1), 6, 1 + scale_shift_table[None] + t.reshape(B, 6, -1), 6, 1 ) # frames mask branch shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = self.chunk( - self.scale_shift_table[None] + t0.reshape(B, 6, -1), 6, 1 + scale_shift_table[None] + t0.reshape(B, 6, -1), 6, 1 ) # modulate (attention) diff --git a/examples/opensora_hpcai/opensora/models/vae/losses.py b/examples/opensora_hpcai/opensora/models/vae/losses.py index 0ba8a13b4b..cb62bba67c 100644 --- a/examples/opensora_hpcai/opensora/models/vae/losses.py +++ b/examples/opensora_hpcai/opensora/models/vae/losses.py @@ -106,7 +106,6 @@ def construct(self, x: ms.Tensor, global_step: ms.Tensor = -1, weights: ms.Tenso weights: sample weights global_step: global training step """ - print("D--: x shape: ", 
x.shape) # 3d vae forward, get posterior (mean, logvar) and recons # x -> VAE2d-Enc -> x_z -> TemporalVAE-Enc -> z ~ posterior -> TempVAE-Dec -> x_z_rec -> VAE2d-Dec -> x_rec x_rec, x_z_rec, z, posterior_mean, posterior_logvar, x_z = self.autoencoder(x) diff --git a/examples/opensora_hpcai/opensora/models/vae/vae.py b/examples/opensora_hpcai/opensora/models/vae/vae.py index e30b941e1d..64f20624a8 100644 --- a/examples/opensora_hpcai/opensora/models/vae/vae.py +++ b/examples/opensora_hpcai/opensora/models/vae/vae.py @@ -117,12 +117,11 @@ def __init__( self.scale_factor = 0.18215 @staticmethod - def rearrange_in(x): - B, C, T, H, W = x.shape - # (b c t h w) -> (b t c h w) - x = ops.transpose(x, (0, 2, 1, 3, 4)) + def rearrange_in(x, transpose: bool = True): + if transpose: # (b c t h w) -> (b t c h w) + x = ops.transpose(x, (0, 2, 1, 3, 4)) + B, T, C, H, W = x.shape x = ops.reshape(x, (B * T, C, H, W)) - return x @staticmethod @@ -147,8 +146,8 @@ def encode(self, x): # is_video = (x.ndim == 5) B = x.shape[0] - # B C T H W -> (B T) C H W - x = self.rearrange_in(x) + # B T C H W -> (B T) C H W + x = self.rearrange_in(x, transpose=False) pad_num = None if self.micro_batch_parallel: diff --git a/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py b/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py index e14d17a79a..29396e3d71 100644 --- a/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py +++ b/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py @@ -142,15 +142,12 @@ def construct( - assume model input/output shape: (b c f h w) unet2d input/output shape: (b c h w) """ - print("x shape: ", x.shape) - with no_grad(): # 1. get image/video latents z using vae - # (b f c h w) -> (b c f h w) - x = ops.transpose(x, (0, 2, 1, 3, 4)) - if not self.video_emb_cached: x = self.get_latents(x) + else: + x = ops.transpose(x, (0, 2, 1, 3, 4)) # 2. 
get conditions if not self.text_emb_cached: diff --git a/examples/opensora_hpcai/opensora/utils/model_utils.py b/examples/opensora_hpcai/opensora/utils/model_utils.py index f657d8afea..103bcb9905 100644 --- a/examples/opensora_hpcai/opensora/utils/model_utils.py +++ b/examples/opensora_hpcai/opensora/utils/model_utils.py @@ -9,18 +9,16 @@ from mindspore import Model as MSModel from mindspore import Parameter, context, load_checkpoint -from mindspore.nn import GraphCell, GroupNorm, SiLU # GELU +from mindspore.nn import GraphCell, GroupNorm from mindspore.train.callback import _CallbackManager from ..models.layers.blocks import Attention, LayerNorm, LlamaRMSNorm, PositionEmbedding2D, SinusoidalEmbedding -# SORA's whitelist (FP32) operators -WHITELIST_OPS = [ +# SORA's blacklist (FP32) operators for O2 AMP level +BLACKLIST_OPS = [ LayerNorm, Attention, LlamaRMSNorm, - SiLU, - # GELU, GroupNorm, PositionEmbedding2D, SinusoidalEmbedding, diff --git a/examples/opensora_hpcai/scripts/infer_t5.py b/examples/opensora_hpcai/scripts/infer_t5.py index d0a6129fed..221a919b8c 100644 --- a/examples/opensora_hpcai/scripts/infer_t5.py +++ b/examples/opensora_hpcai/scripts/infer_t5.py @@ -153,7 +153,7 @@ def main(args): # print(captions) text_tokens, mask = text_encoder.get_text_tokens_and_mask(captions, return_tensor=True) - text_emb = text_encoder(text_tokens, mask) + text_emb = text_encoder(text_tokens, mask).last_hidden_state end_time = time.time() time_cost = end_time - start_time @@ -197,7 +197,7 @@ def main(args): ns = len(batch_prompts) batch_text_tokens, batch_mask = text_encoder.get_text_tokens_and_mask(batch_prompts, return_tensor=True) - batch_text_emb = text_encoder(batch_text_tokens, batch_mask) + batch_text_emb = text_encoder(batch_text_tokens, batch_mask).last_hidden_state # save result batch_mask = batch_mask.asnumpy().astype(np.uint8) diff --git a/examples/opensora_hpcai/scripts/inference.py b/examples/opensora_hpcai/scripts/inference.py index 
757a2b5a54..a3714b0069 100644 --- a/examples/opensora_hpcai/scripts/inference.py +++ b/examples/opensora_hpcai/scripts/inference.py @@ -26,7 +26,7 @@ from opensora.pipelines import InferPipeline, InferPipelineFiTLike from opensora.utils.amp import auto_mixed_precision from opensora.utils.cond_data import get_references, read_captions_from_csv, read_captions_from_txt -from opensora.utils.model_utils import WHITELIST_OPS, _check_cfgs_in_parser, str2bool +from opensora.utils.model_utils import BLACKLIST_OPS, _check_cfgs_in_parser, str2bool from opensora.utils.util import IMG_FPS, apply_mask_strategy, process_mask_strategies, process_prompts from mindone.data.data_split import distribute_samples @@ -279,7 +279,7 @@ def main(args): if args.dtype in ["fp16", "bf16"]: latte_model = auto_mixed_precision( - latte_model, amp_level=args.amp_level, dtype=dtype_map[args.dtype], custom_fp32_cells=WHITELIST_OPS + latte_model, amp_level=args.amp_level, dtype=dtype_map[args.dtype], custom_fp32_cells=BLACKLIST_OPS ) if args.ckpt_path: diff --git a/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh b/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh index 1ce963a95e..f69856d715 100644 --- a/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh +++ b/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh @@ -3,7 +3,6 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export MS_ENABLE_NUMA=0 # plot memory usage, feature/model: 1 export MS_MEMORY_STATISTIC=0 -export MS_DATASET_SINK_QUEUE=4 # operation/graph fusion for dynamic shape export MS_DEV_ENABLE_KERNEL_PACKET=on @@ -13,7 +12,7 @@ export GLOG_v=2 output_dir=outputs/OSv1.2_720p_51 -msrun --bind_core=True --worker_num=8 --local_worker_num=8 --log_dir=$output_dir \ +msrun --bind_core=True --master_port=8200 --worker_num=8 --local_worker_num=8 --log_dir=$output_dir \ python scripts/train.py \ --mode=0 \ --jit_level O1 \ @@ -22,7 +21,6 @@ msrun --bind_core=True --worker_num=8 
--local_worker_num=8 --log_dir=$output_dir --video_folder datasets/mixkit-100videos/mixkit \ --text_embed_folder datasets/mixkit-100videos/t5_emb_300 \ --use_parallel True \ - --dataset_sink_mode=True \ --num_parallel_workers=4 \ --prefetch_size=4 \ --enable_flash_attention=True \ diff --git a/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh b/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh index 13a92392a0..c0f9506990 100644 --- a/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh +++ b/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh @@ -3,7 +3,6 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export MS_ENABLE_NUMA=0 # plot memory usage, feature/model: 1 export MS_MEMORY_STATISTIC=0 -export MS_DATASET_SINK_QUEUE=4 # operation/graph fusion for dynamic shape export MS_DEV_ENABLE_KERNEL_PACKET=on @@ -23,7 +22,6 @@ python scripts/train.py \ --csv_path datasets/mixkit-100videos/video_caption_train.csv \ --video_folder datasets/mixkit-100videos/mixkit \ --text_embed_folder datasets/mixkit-100videos/t5_emb_300 \ ---dataset_sink_mode=False \ --num_parallel_workers=2 \ --prefetch_size=2 \ --enable_flash_attention=True \ diff --git a/examples/opensora_hpcai/scripts/train.py b/examples/opensora_hpcai/scripts/train.py index 5318464c19..354729d108 100644 --- a/examples/opensora_hpcai/scripts/train.py +++ b/examples/opensora_hpcai/scripts/train.py @@ -39,7 +39,7 @@ from opensora.utils.callbacks import EMAEvalSwapCallback, PerfRecorderCallback from opensora.utils.ema import EMA, save_ema_ckpts from opensora.utils.metrics import BucketLoss -from opensora.utils.model_utils import WHITELIST_OPS, Model +from opensora.utils.model_utils import BLACKLIST_OPS, Model from opensora.utils.resume import flush_from_cache, get_resume_ckpt, get_resume_states, resume_train_net, save_train_net from mindone.trainers.callback import EvalSaveCallback, OverflowMonitor, ProfilerCallbackEpoch, StopAtStepCallback @@ -467,7 +467,7 @@ def 
main(args): latte_model, amp_level=args.amp_level, dtype=dtype_map[args.dtype], - custom_fp32_cells=WHITELIST_OPS, + custom_fp32_cells=BLACKLIST_OPS, ) # load checkpoint if args.pretrained_model_path: diff --git a/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py b/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py index 38b560afa7..9af85ab21e 100644 --- a/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py +++ b/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py @@ -4,7 +4,7 @@ from opensora.acceleration.parallel_states import create_parallel_group, get_sequence_parallel_group from opensora.models.stdit.stdit3 import STDiT3 from opensora.utils.amp import auto_mixed_precision -from opensora.utils.model_utils import WHITELIST_OPS +from opensora.utils.model_utils import BLACKLIST_OPS import mindspore as ms import mindspore.nn as nn @@ -82,7 +82,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float32): non_dist_model, amp_level="O2", dtype=model_dtype, - custom_fp32_cells=WHITELIST_OPS, + custom_fp32_cells=BLACKLIST_OPS, ) # sequence parallel model @@ -95,7 +95,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float32): dist_model, amp_level="O2", dtype=model_dtype, - custom_fp32_cells=WHITELIST_OPS, + custom_fp32_cells=BLACKLIST_OPS, ) for (_, w0), (_, w1) in zip(non_dist_model.parameters_and_names(), dist_model.parameters_and_names()): diff --git a/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py b/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py index c64679687d..b2cf656e01 100644 --- a/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py +++ b/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py @@ -4,7 +4,7 @@ from opensora.acceleration.parallel_states import create_parallel_group from opensora.models.vae.vae import OpenSoraVAE_V1_2 from opensora.utils.amp import auto_mixed_precision -from opensora.utils.model_utils import WHITELIST_OPS +from 
opensora.utils.model_utils import BLACKLIST_OPS import mindspore as ms import mindspore.nn as nn @@ -58,7 +58,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float16): non_dist_model, amp_level="O2", dtype=model_dtype, - custom_fp32_cells=WHITELIST_OPS, + custom_fp32_cells=BLACKLIST_OPS, ) non_dist_model.set_train(False) @@ -72,7 +72,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float16): dist_model, amp_level="O2", dtype=model_dtype, - custom_fp32_cells=WHITELIST_OPS, + custom_fp32_cells=BLACKLIST_OPS, ) dist_model.set_train(False) diff --git a/mindone/trainers/callback.py b/mindone/trainers/callback.py index ee1ec9d860..ccb8ffbb4c 100755 --- a/mindone/trainers/callback.py +++ b/mindone/trainers/callback.py @@ -442,11 +442,13 @@ def on_train_step_end(self, run_context): cur_step = cb_params.cur_step_num if cur_step == self.end_step: self.profiler.stop() - self.profiler.analyse() - _logger.info(f"finish analyzing profiler in step range [{self.start_step}, {self.end_step}]") + _logger.info(f"Finished profiling in step range [{self.start_step}, {self.end_step}]") if self.exit_after_analyze: run_context.request_stop() + def on_train_end(self, run_context): + self.profiler.analyse() + class ProfilerCallbackEpoch(Callback): def __init__(self, start_epoch, stop_epoch, output_dir="./profiler_data"): @@ -466,4 +468,6 @@ def on_train_epoch_end(self, run_context): epoch_num = cb_params.cur_epoch_num if epoch_num == self.stop_epoch: self.profiler.stop() - self.profiler.analyse() + + def on_train_end(self, run_context): + self.profiler.analyse()