@@ -1,5 +1,4 @@
- # coding=utf-8
- # Copyright 2024 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+ # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,23 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

- import os
+ """This module provides utilities for flash attention in Transformers models."""
+

+ import os
import inspect
- import mindspore
from typing import Optional, Tuple
+ import mindspore
from mindnlp.core import ops
from ..utils import logging
-
- logger = logging.get_logger(__name__)
- flash_attn_func = None
-
from .integrations.npu_flash_attention import index_first_axis, pad_input, unpad_input
from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_func
from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_varlen_func


- if flash_attn_func:
+ logger = logging.get_logger(__name__)
+
+
+ if flash_attn_func is not None:
    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
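
The reordered setup above also guards the signature probe behind `flash_attn_func is not None`, so the module imports cleanly even when the NPU kernel cannot be loaded. A minimal sketch of that feature-detection pattern, assuming a stand-in `fake_attn` function in place of the real `npu_flash_attn_func` binding:

```python
import inspect

# Stand-in for an optionally available attention kernel. In the real module this is
# `npu_flash_attn_func` imported as `flash_attn_func`, or left as None when unavailable.
def fake_attn(q, k, v, dropout_p=0.0, causal=False, window_size=(-1, -1)):
    return q  # placeholder body, illustration only

flash_attn_func = fake_attn  # would be None if the import had failed

# Probe the signature once at import time, mirroring the guarded check in the diff:
# only touch `inspect.signature` when the function actually exists.
_flash_supports_window_size = (
    flash_attn_func is not None
    and "window_size" in inspect.signature(flash_attn_func).parameters
)
print(_flash_supports_window_size)  # True for this stand-in
```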
@@ -285,7 +285,7 @@ def _flash_attention_forward(
    else:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.
        causal = is_causal and query_length != 1
-
+
    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
    use_sliding_windows = (
        _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
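
For reference, when `use_sliding_windows` comes out true the kernel is typically handed a symmetric attention window through its `window_size` keyword. A rough sketch with illustrative values (the `(sliding_window, sliding_window)` tuple follows the upstream flash-attention convention and is not quoted from the patched file):

```python
# Illustrative values; in the real function these come from the model config and inputs.
_flash_supports_window_size = True
sliding_window = 4096   # local-attention span, e.g. a Mistral-style config value
key_seq_len = 8192      # plays the role of key_states.shape[1] in the diff above

use_sliding_windows = (
    _flash_supports_window_size and sliding_window is not None and key_seq_len > sliding_window
)

flash_kwargs = {}
if use_sliding_windows:
    # (left, right) window: each query attends only to the previous `sliding_window`
    # keys; (-1, -1) would mean no restriction.
    flash_kwargs["window_size"] = (sliding_window, sliding_window)

print(flash_kwargs)  # {'window_size': (4096, 4096)}
```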
@@ -299,7 +299,7 @@ def _flash_attention_forward(

    if softcap is not None:
        flash_kwargs["softcap"] = softcap
-
+
    # PEFT possibly silently casts tensors to fp32, this potentially reconverts to correct dtype or is a no op
    query_states, key_states, value_states = fa_peft_integration_check(
        query_states, key_states, value_states, target_dtype
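
The `fa_peft_integration_check` call handles the case where PEFT silently upcasts activations to float32, which the fused kernels generally refuse. A minimal sketch of that idea, with a hypothetical `_recast_if_needed` helper standing in for the real function (not its actual signature):

```python
import mindspore
from mindspore import ops

def _recast_if_needed(*tensors, target_dtype=mindspore.float16):
    """Hypothetical stand-in: if an upstream wrapper upcast a tensor to float32,
    cast it back to the dtype the attention kernel expects; otherwise pass through."""
    return tuple(
        t.astype(target_dtype) if t.dtype == mindspore.float32 else t for t in tensors
    )

# (batch, seq, heads, head_dim) dummies; pretend PEFT upcast the queries.
q = ops.ones((2, 8, 4, 64), mindspore.float32)
k = ops.ones((2, 8, 4, 64), mindspore.float16)
v = ops.ones((2, 8, 4, 64), mindspore.float16)

q, k, v = _recast_if_needed(q, k, v, target_dtype=mindspore.float16)
print(q.dtype, k.dtype, v.dtype)  # all float16 afterwards
```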
@@ -312,7 +312,7 @@ def _flash_attention_forward(
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
+
        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
@@ -327,7 +327,7 @@ def _flash_attention_forward(
            **flash_kwargs,
        )
        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-
+
    # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
    # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
    # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
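
The varlen path shown above consumes cumulative sequence lengths (`cu_seqlens_q`, `cu_seqlens_k`) and the flattened token indices produced while unpadding. A small self-contained sketch of how those quantities can be derived from a padding mask, in plain NumPy with names chosen for illustration (the real module obtains them through `unpad_input` / `index_first_axis`):

```python
import numpy as np

# A (batch, seq_len) padding mask: 1 marks a real token, 0 marks padding.
attention_mask = np.array([
    [1, 1, 1, 0, 0],
    [1, 1, 1, 1, 1],
    [1, 0, 0, 0, 0],
], dtype=np.int32)

seqlens_in_batch = attention_mask.sum(axis=-1)        # [3, 5, 1]
indices = np.nonzero(attention_mask.flatten())[0]     # positions of the 9 real tokens
max_seqlen_in_batch = int(seqlens_in_batch.max())     # 5

# Cumulative lengths with a leading zero, as varlen kernels expect: example i owns
# rows cu_seqlens[i]:cu_seqlens[i + 1] of the packed (total_tokens, heads, dim) tensor.
cu_seqlens = np.concatenate([[0], np.cumsum(seqlens_in_batch)]).astype(np.int32)

print(indices)             # [ 0  1  2  5  6  7  8  9 10]
print(cu_seqlens)          # [0 3 8 9]
print(max_seqlen_in_batch) # 5
```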