diff --git a/examples/opensora_hpcai/README.md b/examples/opensora_hpcai/README.md
index 410c3f145e..24a3008c3e 100644
--- a/examples/opensora_hpcai/README.md
+++ b/examples/opensora_hpcai/README.md
@@ -27,6 +27,8 @@ This repository is built on the models and code released by HPC-AI Tech. We are
 | mindspore | ascend driver |  firmware   | cann toolkit/kernel |
 |:---------:|:-------------:|:-----------:|:-------------------:|
 |   2.5.0   |    24.0.0     | 7.5.0.3.220 |     8.0.0.beta1     |
+|   2.6.0   |   24.1.rc3    | 7.7.0.1.238 |       8.1.RC1       |
+|   2.7.0   |   24.1.rc3    | 7.7.0.1.238 |       8.2.RC1       |
 
 
 
@@ -413,7 +415,7 @@ Experiments are conducted on Ascend Atlas 800T A2 machines with MindSpore 2.5.0
 
 ```shell
 # OSv1.2
-python scripts/inference.py --config configs/opensora-v1-2/inference/sample_iv2v.yaml --ckpt_path /path/to/your/opensora-v1-1.ckpt
+python scripts/inference.py --config configs/opensora-v1-2/inference/sample_iv2v.yaml --ckpt_path /path/to/your/opensora-v1-2.ckpt
 # OSv1.1
 python scripts/inference.py --config configs/opensora-v1-1/inference/sample_iv2v.yaml --ckpt_path /path/to/your/opensora-v1-1.ckpt
 ```
@@ -438,13 +440,13 @@ python scripts/inference.py --config configs/opensora-v1-1/inference/sample_t2v.
 
 We evaluate the inference performance of text-to-video generation by measuring the average sampling time per step and the total sampling time of a video.
 
-All experiments are tested on Ascend Atlas 800T A2 machines with mindspore 2.3.1 graph mode.
+All experiments were run on Ascend Atlas 800T A2 machines with MindSpore 2.7.0 in graph mode.
 
 | model name  | cards | batch size |  resolution  | jit level | precision | scheduler | step | graph compile | s/step | s/video |                         recipe                          |
 |:-----------:|:-----:|:----------:|:------------:|:---------:|:---------:|:---------:|:----:|:-------------:|:------:|:-------:|:-------------------------------------------------------:|
-| STDiT2-XL/2 |   1   |     1      |  16x640x360  |    O0     |   bf16    |   DDPM    | 100  |   1~2 mins    |  1.56  | 156.00  | [yaml](configs/opensora-v1-1/inference/sample_t2v.yaml) |
-| STDiT3-XL/2 |   1   |     1      | 51x720x1280  |    O0     |   bf16    |   RFlow   |  30  |   1~2 mins    |  5.88  | 176.40  | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) |
-| STDiT3-XL/2 |   1   |     1      | 102x720x1280 |    O0     |   bf16    |   RFlow   |  30  |    1~2 min    | 13.71  | 411.30  | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) |
+| STDiT2-XL/2 |   1   |     1      |  16x640x360  |    O0     |   bf16    |   DDPM    | 100  |   1~2 mins    |  1.56  |  156.0  | [yaml](configs/opensora-v1-1/inference/sample_t2v.yaml) |
+| STDiT3-XL/2 |   1   |     1      | 51x720x1280  |    O0     |   bf16    |   RFlow   |  30  |   1~2 mins    |  4.83  |  155.4  | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) |
+| STDiT3-XL/2 |   1   |     1      | 102x720x1280 |    O0     |   bf16    |   RFlow   |  30  |   1~2 mins    |  8.81  |  286.9  | [yaml](configs/opensora-v1-2/inference/sample_t2v.yaml) |
 
 </details>
 
@@ -778,13 +780,14 @@ Here ✅ means that the data is seen during training, and 🆗 means although no
 
 We evaluate the training performance of Open-Sora v1.2 on the MixKit dataset with high-resolution videos (1080P, duration 12s to 100s).
 
-All experiments are tested on Ascend Atlas 800T A2 machines with mindspore 2.3.1 graph mode.
-| model name   | cards  | batch size | resolution | precision  | sink      | jit level | graph compile |  s/step | recipe |
-| :--:         | :--:   | :--:       | :--:       | :--:       | :--:      | :--:      |:--:          | :--:       | :--:   |
-| STDiT3-XL/2  |  8     | 1          | 51x720x1280| bf16       | ON      | O1        |    12 mins   | 14.23   | [yaml](configs/opensora-v1-2/train/train_720x1280x51.yaml)
-| STDiT3-XL/2  |  8     | dynamic    | stage 1 | bf16       |   OFF    | O1        |      22 mins   | 13.17   | [yaml](configs/opensora-v1-2/train/train_stage1_ms.yaml)
-| STDiT3-XL/2  |  8     | dynamic    | stage 2 | bf16       |   OFF    | O1        |     22 mins     | 31.04   | [yaml](configs/opensora-v1-2/train/train_stage2_ms.yaml)
-| STDiT3-XL/2  |  8     | dynamic    | stage 3 | bf16       |   OFF    | O1        |     22 mins     | 31.17   | [yaml](configs/opensora-v1-2/train/train_stage3_ms.yaml)
+All experiments were run on Ascend Atlas 800T A2 machines with MindSpore 2.7.0 in graph mode.
+
+| model name  | cards | batch size | resolution  | precision | jit level | graph compile | s/step |                           recipe                           |
+|:-----------:|:-----:|:----------:|:-----------:|:---------:|:---------:|:-------------:|:------:|:----------------------------------------------------------:|
+| STDiT3-XL/2 |   8   |     1      | 51x720x1280 |   bf16    |    O1     |     100 s     | 11.24  | [yaml](configs/opensora-v1-2/train/train_720x1280x51.yaml) |
+| STDiT3-XL/2 |   8   |  dynamic   |   stage 1   |   bf16    |    O1     |    14 mins    | 13.17  |  [yaml](configs/opensora-v1-2/train/train_stage1_ms.yaml)  |
+| STDiT3-XL/2 |   8   |  dynamic   |   stage 2   |   bf16    |    O1     |    14 mins    | 26.04  |  [yaml](configs/opensora-v1-2/train/train_stage2_ms.yaml)  |
+| STDiT3-XL/2 |   8   |  dynamic   |   stage 3   |   bf16    |    O1     |    14 mins    | 27.83  |  [yaml](configs/opensora-v1-2/train/train_stage3_ms.yaml)  |
 
 Note that the step time of dynamic training can be influenced by the resolution and duration distribution of the source videos.
 
diff --git a/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml b/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml
index 01432dc103..577715e302 100644
--- a/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml
+++ b/examples/opensora_hpcai/configs/opensora-v1-2/inference/sample_t2v.yaml
@@ -28,7 +28,7 @@ loop: 1
 condition_frame_length: 5
 
 # ms
-jit_level: O0
+jit_level: O1
 
 captions:
   - "Snow falling over multiple houses and trees on winter landscape against night sky. christmas festivity and celebration concept"
diff --git a/examples/opensora_hpcai/opensora/models/layers/blocks.py b/examples/opensora_hpcai/opensora/models/layers/blocks.py
index e84edd22ad..b082a9eedf 100644
--- a/examples/opensora_hpcai/opensora/models/layers/blocks.py
+++ b/examples/opensora_hpcai/opensora/models/layers/blocks.py
@@ -177,7 +177,7 @@ def construct(self, x, cond, mask=None):
         # 2+: mask adaptation for multi-head attention
         if mask is not None:
             # flip mask, since ms FA treats 1 as discard, 0 as retain.
-            mask = 1 - mask.to(ms.int32)
+            mask = 1 - mask
 
         # 3. attn compute
         if self.enable_flash_attention:
@@ -266,7 +266,7 @@ def construct(self, x: Tensor, cond: Tensor, mask: Optional[Tensor] = None) -> T
         # 2+: mask adaptation for multi-head attention
         if mask is not None:
             # flip mask, since ms FA treats 1 as discard, 0 as retain.
-            mask = 1 - mask.to(ms.int32)
+            mask = 1 - mask
 
         # 3. attn compute
         if self.enable_flash_attention:
@@ -274,7 +274,7 @@ def construct(self, x: Tensor, cond: Tensor, mask: Optional[Tensor] = None) -> T
                 # (b n_k) -> (b 1 1 n_k), will be broadcast according to qk sim, e.g. (b num_heads n_q n_k)
                 mask = mask[:, None, None, :]
                 # (b 1 1 n_k) -> (b 1 n_q n_k)
-                mask = self.repeat_interleave(mask.to(ms.int32), int(q.shape[1]), axis=-2)
+                mask = self.repeat_interleave(mask, int(q.shape[1]), -2)
             x = self.flash_attention(q, k, v, mask=mask)
 
             # FA attn_mask def: retention and 1 indicates discard. Input tensor of shape :math:`(B, N1, S1, S2)`, `(B, 1, S1, S2)` `(S1, S2)`
@@ -384,7 +384,7 @@ def construct(self, x, mask=None, freqs_cis: Optional[Tensor] = None):
 
         # mask process
         if mask is not None:
-            mask = 1 - mask.to(ms.int32)
+            mask = 1 - mask
 
         if self.enable_flash_attention:
             if mask is not None:
@@ -500,8 +500,8 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine: bool = True,
             self.gamma = Parameter(initializer("ones", normalized_shape, dtype=dtype))
             self.beta = Parameter(initializer("zeros", normalized_shape, dtype=dtype))
         else:
-            self.gamma = ops.ones(normalized_shape, dtype=dtype)
-            self.beta = ops.zeros(normalized_shape, dtype=dtype)
+            self.gamma = Tensor(np.ones(normalized_shape, dtype=np.float32))
+            self.beta = Tensor(np.zeros(normalized_shape, dtype=np.float32))
 
     def construct(self, x: Tensor):
         normalized_shape = x.shape[-1:]
@@ -592,10 +592,7 @@ def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None):
         self.norm_final = LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         # (1152, 4*8)
         self.linear = nn.Dense(hidden_size, num_patch * out_channels, has_bias=True)
-        # self.scale_shift_table = Parameter((ops.randn(2, hidden_size, dtype=ms.float32) / hidden_size**0.5).astype(ms.float32))
-        self.scale_shift_table = Parameter(
-            ms.Tensor((np.random.randn(2, hidden_size) / hidden_size**0.5), dtype=ms.float32)
-        )
+        self.scale_shift_table = Parameter(np.random.randn(2, hidden_size).astype(np.float32) / hidden_size**0.5)
         self.out_channels = out_channels
         self.d_t = d_t
         self.d_s = d_s
@@ -614,11 +611,13 @@ def construct(
             T = self.d_t
         if S is None:
             S = self.d_s
-        shift, scale = self.chunk(self.scale_shift_table[None] + t[:, None], 2, 1)
+
+        scale_shift_table = self.scale_shift_table.to(x.dtype)
+        shift, scale = self.chunk(scale_shift_table[None] + t[:, None], 2, 1)
         x = t2i_modulate(self.norm_final(x), shift, scale)
 
         if frames_mask is not None:
-            shift_zero, scale_zero = self.chunk(self.scale_shift_table[None] + t0[:, None], 2, 1)
+            shift_zero, scale_zero = self.chunk(scale_shift_table[None] + t0[:, None], 2, 1)
             x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero)
             x = t_mask_select(frames_mask, x, x_zero, T, S)
 
@@ -639,9 +638,9 @@ def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU, tok
             in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0
         )
 
-        y_embedding = ops.randn(token_num, in_channels) / in_channels**0.5
+        y_embedding = np.random.randn(token_num, in_channels).astype(np.float32) / in_channels**0.5
         # just for token dropping replacement, not learnable
-        self.y_embedding = Parameter(Tensor(y_embedding, dtype=ms.float32), requires_grad=False)
+        self.y_embedding = Parameter(y_embedding, requires_grad=False)
 
         self.uncond_prob = uncond_prob
 
@@ -656,9 +655,7 @@ def token_drop(self, caption, force_drop_ids=None):
 
         # manually expand dims to avoid infer-shape bug in ms2.3 daily
         caption = ops.where(
-            drop_ids[:, None, None, None],
-            self.y_embedding[None, None, :, :].to(caption.dtype),
-            caption,
+            drop_ids[:, None, None, None], self.y_embedding[None, None, :, :].to(caption.dtype), caption
         )
 
         return caption
diff --git a/examples/opensora_hpcai/opensora/models/layers/operation_selector.py b/examples/opensora_hpcai/opensora/models/layers/operation_selector.py
index 269f3966c3..41b5d929a7 100644
--- a/examples/opensora_hpcai/opensora/models/layers/operation_selector.py
+++ b/examples/opensora_hpcai/opensora/models/layers/operation_selector.py
@@ -1,6 +1,6 @@
 import mindspore as ms
 from mindspore import mint, ops
-from mindspore.ops.function.array_func import chunk_ext, repeat_interleave_ext
+from mindspore.ops.function.array_func import chunk_ext
 
 use_dynamic_ops = False
 
@@ -15,50 +15,13 @@ def check_dynamic_mode():
     return use_dynamic_ops
 
 
-def repeat_interleave_ext_v2(input, repeats, axis=None):
-    # A more efficient implementation for replacing mint.repeat_interleave_ext
-    if isinstance(repeats, ms.Tensor):
-        if repeats.ndim > 1:
-            raise ValueError(f"repeats must be int, but get Tensor and ndim > 1, repeats.ndim {repeats.ndim}")
-        else:
-            repeats = int(repeats)
-    if isinstance(repeats, (tuple, list)):
-        if len(repeats) > 1:
-            raise ValueError(f"repeats must be int, but get list and len > 1, len(repeats) {len(repeats)}")
-        else:
-            repeats = repeats[0]
-    if not isinstance(repeats, int):
-        raise ValueError(f"repeats must be int, but get {repeats}")
-    if axis is None:
-        input = input.reshape[-1]
-        axis = 0
-
-    if not isinstance(axis, int):
-        raise ValueError(f"axis must be int, but get {axis}")
-    axis = axis + input.ndim if axis < 0 else axis
-    x_shape = input.shape
-    tile_axis = [1]
-    y_shape = list(x_shape)
-    y_shape[axis] = -1
-    for i in range(1, input.ndim + 1):
-        if i == axis + 1:
-            tile_axis.append(repeats)
-        else:
-            tile_axis.append(1)
-    input = ops.expand_dims(input, axis + 1)
-
-    return mint.tile(input, tuple(tile_axis)).reshape(y_shape)
-
-
 def get_repeat_interleave_op():
     mode = ms.get_context("mode")
     if (mode == 0) and (not check_dynamic_mode()):
         # provide better performance for static shape in graph mode
         return ops.repeat_interleave
     else:
-        # FIXME: check overflow for v2
-        # return repeat_interleave_ext_v2
-        return repeat_interleave_ext
+        return mint.repeat_interleave
 
 
 def get_chunk_op():
diff --git a/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py b/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py
index 304481d2ee..406b996eb0 100644
--- a/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py
+++ b/examples/opensora_hpcai/opensora/models/layers/rotary_embedding.py
@@ -28,7 +28,7 @@ def rotate_half(x: Tensor) -> Tensor:
 def apply_rotary_emb(freqs: Parameter, t: Tensor, scale: float = 1.0, seq_dim: int = -2) -> Tensor:
     # FIXME: start_index is always 0 in OS1.2 and ops.concat doesn't support empty elements. OS1.x future versions may need start_index > 0
     # t, t_right = t[..., start_index:end_index], t[..., end_index:]
-    t = (t * freqs.cos().astype(t.dtype) * scale) + (rotate_half(t) * freqs.sin().astype(t.dtype) * scale)
+    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
 
     return t
 
@@ -139,7 +139,7 @@ def get_axial_freqs(self, *dims):
         raise NotImplementedError
 
     def construct(self, t: Tensor, seq_len=None, offset=0) -> Tensor:
-        freqs = t.astype(self.freqs.dtype)[..., None] * self.freqs
+        freqs = t[..., None] * self.freqs.to(t.dtype)
         return self.repeat_interleave(freqs, 2, -1)  # ... n -> ... (n r), r = 2
 
 
diff --git a/examples/opensora_hpcai/opensora/models/stdit/stdit3.py b/examples/opensora_hpcai/opensora/models/stdit/stdit3.py
index d5a1ac4032..12f7b406de 100644
--- a/examples/opensora_hpcai/opensora/models/stdit/stdit3.py
+++ b/examples/opensora_hpcai/opensora/models/stdit/stdit3.py
@@ -102,13 +102,14 @@ def construct(
     ) -> Tensor:
         # prepare modulate parameters
         B, N, C = x.shape
+        scale_shift_table = self.scale_shift_table.to(x.dtype)
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.chunk(
-            self.scale_shift_table[None] + t.reshape(B, 6, -1), 6, 1
+            scale_shift_table[None] + t.reshape(B, 6, -1), 6, 1
         )
 
         # frames mask branch
         shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = self.chunk(
-            self.scale_shift_table[None] + t0.reshape(B, 6, -1), 6, 1
+            scale_shift_table[None] + t0.reshape(B, 6, -1), 6, 1
         )
 
         # modulate (attention)
diff --git a/examples/opensora_hpcai/opensora/models/vae/losses.py b/examples/opensora_hpcai/opensora/models/vae/losses.py
index 0ba8a13b4b..cb62bba67c 100644
--- a/examples/opensora_hpcai/opensora/models/vae/losses.py
+++ b/examples/opensora_hpcai/opensora/models/vae/losses.py
@@ -106,7 +106,6 @@ def construct(self, x: ms.Tensor, global_step: ms.Tensor = -1, weights: ms.Tenso
         weights: sample weights
         global_step: global training step
         """
-        print("D--: x shape: ", x.shape)
         # 3d vae forward, get posterior (mean, logvar) and recons
         # x -> VAE2d-Enc -> x_z -> TemporalVAE-Enc -> z ~ posterior -> TempVAE-Dec -> x_z_rec -> VAE2d-Dec -> x_rec
         x_rec, x_z_rec, z, posterior_mean, posterior_logvar, x_z = self.autoencoder(x)
diff --git a/examples/opensora_hpcai/opensora/models/vae/vae.py b/examples/opensora_hpcai/opensora/models/vae/vae.py
index e30b941e1d..64f20624a8 100644
--- a/examples/opensora_hpcai/opensora/models/vae/vae.py
+++ b/examples/opensora_hpcai/opensora/models/vae/vae.py
@@ -117,12 +117,11 @@ def __init__(
         self.scale_factor = 0.18215
 
     @staticmethod
-    def rearrange_in(x):
-        B, C, T, H, W = x.shape
-        # (b c t h w) -> (b t c h w)
-        x = ops.transpose(x, (0, 2, 1, 3, 4))
+    def rearrange_in(x, transpose: bool = True):
+        if transpose:  # (b c t h w) -> (b t c h w)
+            x = ops.transpose(x, (0, 2, 1, 3, 4))
+        B, T, C, H, W = x.shape
         x = ops.reshape(x, (B * T, C, H, W))
-
         return x
 
     @staticmethod
@@ -147,8 +146,8 @@ def encode(self, x):
         # is_video = (x.ndim == 5)
 
         B = x.shape[0]
-        # B C T H W -> (B T) C H W
-        x = self.rearrange_in(x)
+        # B T C H W -> (B T) C H W
+        x = self.rearrange_in(x, transpose=False)
 
         pad_num = None
         if self.micro_batch_parallel:
diff --git a/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py b/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py
index e14d17a79a..29396e3d71 100644
--- a/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py
+++ b/examples/opensora_hpcai/opensora/pipelines/train_pipeline.py
@@ -142,15 +142,12 @@ def construct(
             - assume model input/output shape: (b c f h w)
                 unet2d input/output shape: (b c h w)
         """
-        print("x shape: ", x.shape)
-
         with no_grad():
             # 1. get image/video latents z using vae
-            # (b f c h w) -> (b c f h w)
-            x = ops.transpose(x, (0, 2, 1, 3, 4))
-
             if not self.video_emb_cached:
                 x = self.get_latents(x)
+            else:
+                x = ops.transpose(x, (0, 2, 1, 3, 4))
 
             # 2. get conditions
             if not self.text_emb_cached:
diff --git a/examples/opensora_hpcai/opensora/utils/model_utils.py b/examples/opensora_hpcai/opensora/utils/model_utils.py
index f657d8afea..103bcb9905 100644
--- a/examples/opensora_hpcai/opensora/utils/model_utils.py
+++ b/examples/opensora_hpcai/opensora/utils/model_utils.py
@@ -9,18 +9,16 @@
 
 from mindspore import Model as MSModel
 from mindspore import Parameter, context, load_checkpoint
-from mindspore.nn import GraphCell, GroupNorm, SiLU  # GELU
+from mindspore.nn import GraphCell, GroupNorm
 from mindspore.train.callback import _CallbackManager
 
 from ..models.layers.blocks import Attention, LayerNorm, LlamaRMSNorm, PositionEmbedding2D, SinusoidalEmbedding
 
-# SORA's whitelist (FP32) operators
-WHITELIST_OPS = [
+# Open-Sora's AMP blacklist: cells kept in FP32 when auto mixed precision runs at the O2 level
+BLACKLIST_OPS = [
     LayerNorm,
     Attention,
     LlamaRMSNorm,
-    SiLU,
-    # GELU,
     GroupNorm,
     PositionEmbedding2D,
     SinusoidalEmbedding,
diff --git a/examples/opensora_hpcai/scripts/infer_t5.py b/examples/opensora_hpcai/scripts/infer_t5.py
index d0a6129fed..221a919b8c 100644
--- a/examples/opensora_hpcai/scripts/infer_t5.py
+++ b/examples/opensora_hpcai/scripts/infer_t5.py
@@ -153,7 +153,7 @@ def main(args):
             # print(captions)
 
             text_tokens, mask = text_encoder.get_text_tokens_and_mask(captions, return_tensor=True)
-            text_emb = text_encoder(text_tokens, mask)
+            text_emb = text_encoder(text_tokens, mask).last_hidden_state
 
             end_time = time.time()
             time_cost = end_time - start_time
@@ -197,7 +197,7 @@ def main(args):
             ns = len(batch_prompts)
 
             batch_text_tokens, batch_mask = text_encoder.get_text_tokens_and_mask(batch_prompts, return_tensor=True)
-            batch_text_emb = text_encoder(batch_text_tokens, batch_mask)
+            batch_text_emb = text_encoder(batch_text_tokens, batch_mask).last_hidden_state
 
             # save result
             batch_mask = batch_mask.asnumpy().astype(np.uint8)
diff --git a/examples/opensora_hpcai/scripts/inference.py b/examples/opensora_hpcai/scripts/inference.py
index 757a2b5a54..a3714b0069 100644
--- a/examples/opensora_hpcai/scripts/inference.py
+++ b/examples/opensora_hpcai/scripts/inference.py
@@ -26,7 +26,7 @@
 from opensora.pipelines import InferPipeline, InferPipelineFiTLike
 from opensora.utils.amp import auto_mixed_precision
 from opensora.utils.cond_data import get_references, read_captions_from_csv, read_captions_from_txt
-from opensora.utils.model_utils import WHITELIST_OPS, _check_cfgs_in_parser, str2bool
+from opensora.utils.model_utils import BLACKLIST_OPS, _check_cfgs_in_parser, str2bool
 from opensora.utils.util import IMG_FPS, apply_mask_strategy, process_mask_strategies, process_prompts
 
 from mindone.data.data_split import distribute_samples
@@ -279,7 +279,7 @@ def main(args):
 
     if args.dtype in ["fp16", "bf16"]:
         latte_model = auto_mixed_precision(
-            latte_model, amp_level=args.amp_level, dtype=dtype_map[args.dtype], custom_fp32_cells=WHITELIST_OPS
+            latte_model, amp_level=args.amp_level, dtype=dtype_map[args.dtype], custom_fp32_cells=BLACKLIST_OPS
         )
 
     if args.ckpt_path:
diff --git a/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh b/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh
index 1ce963a95e..f69856d715 100644
--- a/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh
+++ b/examples/opensora_hpcai/scripts/run/run_train_os1.2_720x1280x51.sh
@@ -3,7 +3,6 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export MS_ENABLE_NUMA=0
 # plot memory usage, feature/model: 1
 export MS_MEMORY_STATISTIC=0
-export MS_DATASET_SINK_QUEUE=4
 
 # operation/graph fusion for dynamic shape
 export MS_DEV_ENABLE_KERNEL_PACKET=on
@@ -13,7 +12,7 @@ export GLOG_v=2
 
 output_dir=outputs/OSv1.2_720p_51
 
-msrun --bind_core=True --worker_num=8 --local_worker_num=8 --log_dir=$output_dir  \
+msrun --bind_core=True --master_port=8200 --worker_num=8 --local_worker_num=8 --log_dir=$output_dir  \
 	python scripts/train.py \
 	--mode=0 \
     --jit_level O1 \
@@ -22,7 +21,6 @@ msrun --bind_core=True --worker_num=8 --local_worker_num=8 --log_dir=$output_dir
     --video_folder datasets/mixkit-100videos/mixkit \
     --text_embed_folder  datasets/mixkit-100videos/t5_emb_300 \
   --use_parallel True \
-  --dataset_sink_mode=True \
   --num_parallel_workers=4 \
   --prefetch_size=4 \
   --enable_flash_attention=True \
diff --git a/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh b/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh
index 13a92392a0..c0f9506990 100644
--- a/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh
+++ b/examples/opensora_hpcai/scripts/run/run_train_os1.2_stage2.sh
@@ -3,7 +3,6 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export MS_ENABLE_NUMA=0
 # plot memory usage, feature/model: 1
 export MS_MEMORY_STATISTIC=0
-export MS_DATASET_SINK_QUEUE=4
 
 # operation/graph fusion for dynamic shape
 export MS_DEV_ENABLE_KERNEL_PACKET=on
@@ -23,7 +22,6 @@ python scripts/train.py \
 --csv_path datasets/mixkit-100videos/video_caption_train.csv \
 --video_folder datasets/mixkit-100videos/mixkit \
 --text_embed_folder  datasets/mixkit-100videos/t5_emb_300 \
---dataset_sink_mode=False \
 --num_parallel_workers=2 \
 --prefetch_size=2 \
 --enable_flash_attention=True \
diff --git a/examples/opensora_hpcai/scripts/train.py b/examples/opensora_hpcai/scripts/train.py
index 5318464c19..354729d108 100644
--- a/examples/opensora_hpcai/scripts/train.py
+++ b/examples/opensora_hpcai/scripts/train.py
@@ -39,7 +39,7 @@
 from opensora.utils.callbacks import EMAEvalSwapCallback, PerfRecorderCallback
 from opensora.utils.ema import EMA, save_ema_ckpts
 from opensora.utils.metrics import BucketLoss
-from opensora.utils.model_utils import WHITELIST_OPS, Model
+from opensora.utils.model_utils import BLACKLIST_OPS, Model
 from opensora.utils.resume import flush_from_cache, get_resume_ckpt, get_resume_states, resume_train_net, save_train_net
 
 from mindone.trainers.callback import EvalSaveCallback, OverflowMonitor, ProfilerCallbackEpoch, StopAtStepCallback
@@ -467,7 +467,7 @@ def main(args):
                 latte_model,
                 amp_level=args.amp_level,
                 dtype=dtype_map[args.dtype],
-                custom_fp32_cells=WHITELIST_OPS,
+                custom_fp32_cells=BLACKLIST_OPS,
             )
     # load checkpoint
     if args.pretrained_model_path:
diff --git a/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py b/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py
index 38b560afa7..9af85ab21e 100644
--- a/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py
+++ b/examples/opensora_hpcai/tests/test_stdit3_sequence_parallelism.py
@@ -4,7 +4,7 @@
 from opensora.acceleration.parallel_states import create_parallel_group, get_sequence_parallel_group
 from opensora.models.stdit.stdit3 import STDiT3
 from opensora.utils.amp import auto_mixed_precision
-from opensora.utils.model_utils import WHITELIST_OPS
+from opensora.utils.model_utils import BLACKLIST_OPS
 
 import mindspore as ms
 import mindspore.nn as nn
@@ -82,7 +82,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float32):
             non_dist_model,
             amp_level="O2",
             dtype=model_dtype,
-            custom_fp32_cells=WHITELIST_OPS,
+            custom_fp32_cells=BLACKLIST_OPS,
         )
 
     # sequence parallel model
@@ -95,7 +95,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float32):
             dist_model,
             amp_level="O2",
             dtype=model_dtype,
-            custom_fp32_cells=WHITELIST_OPS,
+            custom_fp32_cells=BLACKLIST_OPS,
         )
 
     for (_, w0), (_, w1) in zip(non_dist_model.parameters_and_names(), dist_model.parameters_and_names()):
diff --git a/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py b/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py
index c64679687d..b2cf656e01 100644
--- a/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py
+++ b/examples/opensora_hpcai/tests/test_vae_1_2_parallelism.py
@@ -4,7 +4,7 @@
 from opensora.acceleration.parallel_states import create_parallel_group
 from opensora.models.vae.vae import OpenSoraVAE_V1_2
 from opensora.utils.amp import auto_mixed_precision
-from opensora.utils.model_utils import WHITELIST_OPS
+from opensora.utils.model_utils import BLACKLIST_OPS
 
 import mindspore as ms
 import mindspore.nn as nn
@@ -58,7 +58,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float16):
             non_dist_model,
             amp_level="O2",
             dtype=model_dtype,
-            custom_fp32_cells=WHITELIST_OPS,
+            custom_fp32_cells=BLACKLIST_OPS,
         )
     non_dist_model.set_train(False)
 
@@ -72,7 +72,7 @@ def run_model(mode: int = 0, model_dtype: ms.dtype = ms.float16):
             dist_model,
             amp_level="O2",
             dtype=model_dtype,
-            custom_fp32_cells=WHITELIST_OPS,
+            custom_fp32_cells=BLACKLIST_OPS,
         )
     dist_model.set_train(False)
 
diff --git a/mindone/trainers/callback.py b/mindone/trainers/callback.py
index ee1ec9d860..ccb8ffbb4c 100755
--- a/mindone/trainers/callback.py
+++ b/mindone/trainers/callback.py
@@ -442,11 +442,13 @@ def on_train_step_end(self, run_context):
         cur_step = cb_params.cur_step_num
         if cur_step == self.end_step:
             self.profiler.stop()
-            self.profiler.analyse()
-            _logger.info(f"finish analyzing profiler in step range [{self.start_step}, {self.end_step}]")
+            _logger.info(f"Finished profiling in step range [{self.start_step}, {self.end_step}]")
             if self.exit_after_analyze:
                 run_context.request_stop()
 
+    def on_train_end(self, run_context):
+        self.profiler.analyse()
+
 
 class ProfilerCallbackEpoch(Callback):
     def __init__(self, start_epoch, stop_epoch, output_dir="./profiler_data"):
@@ -466,4 +468,6 @@ def on_train_epoch_end(self, run_context):
         epoch_num = cb_params.cur_epoch_num
         if epoch_num == self.stop_epoch:
             self.profiler.stop()
-            self.profiler.analyse()
+
+    def on_train_end(self, run_context):
+        self.profiler.analyse()