_CONFIG_FOR_DOC = "MiniCPMConfig"


def rms_layernorm(hidden: mindspore.Tensor, weight: mindspore.Tensor, eps: float):
    """
    Args:
        hidden (mindspore.Tensor): The input tensor to be normalized.
        weight (mindspore.Tensor): The weight tensor applied to the normalized input.
        eps (float): A small value added to the variance to avoid division by zero.

    Returns:
        mindspore.Tensor: The input normalized by its root mean square and scaled element-wise by 'weight'.

    Raises:
        ValueError: If the 'hidden' tensor or 'weight' tensor is not of type mindspore.Tensor.
        TypeError: If the 'eps' parameter is not of type float.
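# Illustrative aside, not part of the original file: a pure-Python sketch of the arithmetic an
# RMS layer norm of this kind performs (normalize by the root mean square, then scale by the
# weight). The helper name `_rms_layernorm_reference` is hypothetical.
def _rms_layernorm_reference(hidden, weight, eps):
    import math
    variance = sum(h * h for h in hidden) / len(hidden)  # mean of squares over the last dim
    inv_rms = 1.0 / math.sqrt(variance + eps)
    return [h * inv_rms * w for h, w in zip(hidden, weight)]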
@@ -67,11 +68,10 @@ def rms_layernorm(hidden: mindspore.Tensor, weight: mindspore.Tensor, eps: float


class MiniCPMRMSNorm(nn.Module):
    """
    MiniCPMRMSNorm is a custom layer normalization module designed to mimic the functionality of T5LayerNorm.
    It performs RMS-based layer normalization on the input hidden states using the provided weight and epsilon value.

    Parameters:
        hidden_size (int): The size of the hidden states being normalized.
        eps (float, optional): A small value added to the variance to prevent division by zero. Default is 1e-06.
@@ -87,6 +87,7 @@ class MiniCPMRMSNorm(nn.Module):
        __init__: Initializes the MiniCPMRMSNorm instance with the given hidden size and epsilon.
        forward: Applies RMS-based layer normalization on the input hidden states using the weight and epsilon.
    """

    def __init__(self, hidden_size, eps=1e-6):
        """
        MiniCPMRMSNorm is equivalent to T5LayerNorm
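        # Illustrative aside, not part of the original file: presumably __init__ stores a
        # learnable per-channel `weight` of shape (hidden_size,) plus `eps`, and forward
        # delegates to rms_layernorm(hidden_states, self.weight, eps); both bodies are
        # elided in this excerpt, so treat the details as assumptions.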
@@ -117,7 +118,6 @@ def forward(self, hidden_states):


class MiniCPMRotaryEmbedding(nn.Module):
    """
    MiniCPMRotaryEmbedding is a class that represents a rotary positional embedding layer for neural networks.
    It inherits from nn.Module and provides methods for initializing the embedding layer, setting cosine and sine cache,
@@ -128,6 +128,7 @@ class MiniCPMRotaryEmbedding(nn.Module):
    cosine and sine values for positional embeddings.
    The forward method generates the positional embeddings based on the input data and the specified sequence length.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        """
        Initializes a new instance of the MiniCPMRotaryEmbedding class.
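# Illustrative aside, not part of the original file: a pure-Python sketch of how a rotary
# cos/sin cache is typically built (inverse frequencies base**(-2i/dim), an outer product with
# the positions, and the two halves duplicated). The helper name `_rope_cache_reference` is
# hypothetical.
def _rope_cache_reference(dim, seq_len, base=10000.0):
    import math
    inv_freq = [1.0 / (base ** (2 * i / dim)) for i in range(dim // 2)]
    cos_rows, sin_rows = [], []
    for t in range(seq_len):
        freqs = [t * f for f in inv_freq]
        emb = freqs + freqs  # mirrors concatenating freqs with itself along the last dim
        cos_rows.append([math.cos(v) for v in emb])
        sin_rows.append([math.sin(v) for v in emb])
    return cos_rows, sin_rows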
@@ -212,6 +213,7 @@ def forward(self, x, seq_len=None):

class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
    """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
        """
        Initializes an instance of MiniCPMLinearScalingRotaryEmbedding.
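        # Illustrative aside, not part of the original file: in the standard linear-scaling
        # variant this class follows, the only change to the cached cos/sin tables is dividing
        # the position index by scaling_factor (roughly t = t / scaling_factor) before the
        # outer product with inv_freq, which stretches the usable context window.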
@@ -260,6 +262,7 @@ def _set_cos_sin_cache(self, seq_len, dtype):

class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
        """
        Initializes a new instance of the MiniCPMDynamicNTKScalingRotaryEmbedding class.
@@ -302,7 +305,7 @@ def _set_cos_sin_cache(self, seq_len, dtype):

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (ops.arange(0, self.dim, 2).float() / self.dim))
            self.inv_freq = inv_freq
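            # Illustrative aside, not part of the original file: with base=10000, dim=64,
            # max_position_embeddings=2048, scaling_factor=1.0 and seq_len=4096, the factor is
            # (1.0 * 4096 / 2048) - 0 = 2.0 raised to 64 / 62 (about 1.032), so the effective
            # base grows from 10000 to roughly 20450, flattening inv_freq for long sequences.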
@@ -316,6 +319,7 @@ def _set_cos_sin_cache(self, seq_len, dtype):
        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    # x1 = x[..., : x.shape[-1] // 2]
@@ -358,8 +362,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
    return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
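# Illustrative aside, not part of the original file: a pure-Python sketch of the rotation
# applied above. rotate_half moves the second half of the vector in front of the first with a
# sign flip, and the embedding is q * cos + rotate_half(q) * sin. The helper name
# `_apply_rope_reference` is hypothetical.
def _apply_rope_reference(vec, cos_row, sin_row):
    half = len(vec) // 2
    rotated = [-v for v in vec[half:]] + list(vec[:half])  # rotate_half on a plain list
    return [v * c + r * s for v, r, c, s in zip(vec, rotated, cos_row, sin_row)]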


class MiniCPMMLP(nn.Module):
    """
    MiniCPMMLP is a neural network model that implements a specific variant of a Multi-Layer Perceptron (MLP)
    architecture for deep learning tasks.
@@ -385,6 +389,7 @@ class MiniCPMMLP(nn.Module):
    Returns:
        down_proj: The output tensor resulting from the forward pass computation of the MiniCPMMLP model.
    """

    def __init__(self, config):
        """
        Initializes a MiniCPMMLP object with the provided configuration.
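        # Illustrative aside, not part of the original file: assuming the LLaMA-style gated
        # MLP that the down_proj name suggests, forward presumably computes
        # down_proj(act_fn(gate_proj(x)) * up_proj(x)); the method bodies are elided here.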
@@ -458,6 +463,7 @@ def repeat_kv(hidden_states: mindspore.Tensor, n_rep: int) -> mindspore.Tensor:

class MiniCPMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
        """
        Initializes an instance of the MiniCPMAttention class.
@@ -594,14 +600,14 @@ def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).swapaxes(1, 2)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
        '''
        This method forwards the MiniCPMAttention layer.
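        # Illustrative aside, not part of the original file (the body is elided): a typical
        # shape flow for this kind of attention is hidden_states (bsz, q_len, hidden_size)
        # projected to per-head queries/keys/values of shape (bsz, num_heads, q_len, head_dim),
        # rotary embeddings applied, scores = Q @ K^T / sqrt(head_dim) plus attention_mask,
        # softmax, multiply by V, then merge heads back to (bsz, q_len, hidden_size).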
@@ -730,7 +736,6 @@ def forward(


class MiniCPMDecoderLayer(nn.Module):
    """
    MiniCPMDecoderLayer represents a single layer of the MiniCPM (Minimalist Conditional Pretrained Model) decoder.
    This class is responsible for processing input hidden states through self-attention mechanism and MLP
@@ -767,6 +772,7 @@ class MiniCPMDecoderLayer(nn.Module):
    If 'padding_mask' is passed as a keyword argument in kwargs, a deprecation warning will be issued.
    It is recommended to use 'attention_mask' instead.
    """

    def __init__(self, config: MiniCPMConfig, layer_idx: int):
        """
        Initializes a new instance of MiniCPMDecoderLayer.
@@ -796,14 +802,14 @@ def __init__(self, config: MiniCPMConfig, layer_idx: int):
        self.num_hidden_layers = config.num_hidden_layers

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]:
        """
        Args:
@@ -858,7 +864,6 @@ def forward(


class MiniCPMPreTrainedModel(PreTrainedModel):
    """
    Represents a pre-trained mini version of CPM (Code-PM) model for various NLP tasks.
    This class inherits from PreTrainedModel and provides functionality to initialize weights for different types
@@ -916,6 +921,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
    Args:
        config: MiniCPMConfig
    """

    def __init__(self, config: MiniCPMConfig):
        """
        Initializes a MiniCPMModel instance with the provided configuration.
@@ -995,16 +1001,16 @@ def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Constructs the MiniCPMModel.
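        # Illustrative aside, not part of the original file (the body is elided): a typical
        # decoder-only forward pass resolves input_ids or inputs_embeds to embeddings, builds
        # position_ids and a causal attention mask, runs the stack of MiniCPMDecoderLayer
        # modules while optionally collecting hidden states, attentions and the KV cache,
        # applies the final norm, and returns a tuple or a BaseModelOutputWithPast.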
@@ -1299,17 +1305,17 @@ def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
@@ -1389,7 +1395,7 @@ def forward(
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """
        Prepare inputs for generation.
@@ -1428,7 +1434,7 @@ def prepare_inputs_for_generation(
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
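                # Illustrative aside, not part of the original file: with past_length=4, an
                # attention_mask of length 7 and input_ids of length 5, the first branch keeps
                # the last 7 - 4 = 3 tokens of input_ids; with past_length=4 and input_ids of
                # length 6, this branch presumably drops the 4 already-cached tokens (its body
                # is elided in this excerpt).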
@@ -1437,19 +1443,19 @@ def prepare_inputs_for_generation(

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().int().cumsum(-1) - 1
            position_ids = position_ids.masked_fill(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1]:]
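                # Illustrative aside, not part of the original file: a left-padded mask
                # [[0, 1, 1, 1]] gives cumsum(-1) - 1 = [[-1, 0, 1, 2]], masked_fill sets the
                # padded slot to 1, yielding [[1, 0, 1, 2]], and when a cache is present only
                # the trailing columns matching the new input_ids are kept.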

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
@@ -1524,10 +1530,10 @@ def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "u
            history = []
        if logits_processor:
            gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                          "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        else:
            gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                          "temperature": temperature, "logits_processor": logits_processor, **kwargs}

        history.append({"role": role, "content": query})
        history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
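        # Illustrative aside, not part of the original file: a hypothetical call to this chat
        # helper (argument values are assumptions, and the return value is elided in this
        # excerpt; it presumably includes the decoded response and the updated history):
        #     response, history = model.chat(tokenizer, "Hello!", history=[], role="user",
        #                                    max_length=4096, do_sample=True, top_p=0.8)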
@@ -1544,7 +1550,6 @@ def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "u


class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
    """
    MiniCPMForSequenceClassification is a Python class that represents a fine-tuning model for sequence classification
    tasks based on the MiniCPM architecture. It inherits from the MiniCPMPreTrainedModel class and provides methods for
@@ -1584,6 +1589,7 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
    This class inherits from MiniCPMPreTrainedModel and extends its functionality to support sequence
    classification tasks.
    """

    def __init__(self, config):
        """
        Initializes a new instance of the MiniCPMForSequenceClassification class.
@@ -1640,17 +1646,17 @@ def set_input_embeddings(self, new_embeddings):
        self.model.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        Args:
@@ -1723,6 +1729,7 @@ def forward(
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    'MiniCPMModel',
    'MiniCPMPreTrainedModel',