4 changes: 0 additions & 4 deletions llm/inference/llama3/run_llama3.py
@@ -1,7 +1,5 @@
import mindspore
from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
from mindspore._c_expression import _framework_profiler_step_start
from mindspore._c_expression import _framework_profiler_step_end

model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"

@@ -28,7 +26,6 @@
tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# _framework_profiler_step_start()
outputs = model.generate(
input_ids,
max_new_tokens=20,
@@ -38,6 +35,5 @@
# temperature=0.6,
# top_p=0.9,
)
# _framework_profiler_step_end()
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))
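
With the manual profiler hooks removed from the example script, profiling is now toggled through the MS_ENABLE_RUNTIME_PROFILER environment variable read inside generate() (see the change to mindnlp/transformers/generation/utils.py below). A minimal sketch of enabling it without touching the script, assuming the flag is re-read on each generate() call:

    # Sketch only: enable the runtime profiler via the environment instead of manual hooks.
    import os
    os.environ["MS_ENABLE_RUNTIME_PROFILER"] = "1"  # set before calling model.generate()
    outputs = model.generate(input_ids, max_new_tokens=5)  # keep generation short; profiler output grows quickly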
52 changes: 51 additions & 1 deletion mindnlp/transformers/generation/utils.py
@@ -13,18 +13,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=not-callable
# pylint: disable=not-callable, no-name-in-module
"""generation mixin"""
import copy
import inspect
import warnings
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import mindspore
from mindspore._c_expression import _framework_profiler_step_start
from mindspore._c_expression import _framework_profiler_step_end

from mindnlp.core import nn, ops, no_grad
from mindnlp.core.nn import functional as F
from ...utils.testing_utils import parse_flag_from_env

from ..cache_utils import (
Cache,
@@ -156,6 +161,7 @@ class GenerateDecoderOnlyOutput(ModelOutput):
attentions: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[mindspore.Tensor]]]] = None
average_infer_time: Optional[float] = None


@dataclass
@@ -208,6 +214,7 @@ class GenerateEncoderDecoderOutput(ModelOutput):
cross_attentions: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[mindspore.Tensor]]]] = None
average_infer_time: Optional[float] = None


@dataclass
@@ -1642,6 +1649,7 @@ def generate(
- [`~generation.GenerateEncoderDecoderOutput`],
- [`~generation.GenerateBeamEncoderDecoderOutput`]
"""
_run_profiler = parse_flag_from_env('MS_ENABLE_RUNTIME_PROFILER', False)
# 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
self._validate_model_class()
tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria
@@ -1850,6 +1858,11 @@ def generate(
prepared_stopping_criteria = self._get_stopping_criteria(
generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs
)

if _run_profiler:
_framework_profiler_step_start()
logger.warning('Enabling the profiler produces large output files. Set `max_length` or `max_new_tokens` to a small value (less than 10 is recommended).')

# 10. go into different generation modes
if generation_mode == GenerationMode.ASSISTED_GENERATION:
if generation_config.num_return_sequences > 1:
@@ -2107,6 +2120,9 @@ def typeerror():
**model_kwargs,
)

if _run_profiler:
_framework_profiler_step_end()

# Convert to legacy cache if needed
if use_dynamic_cache_by_default and generation_config.return_legacy_cache:
if isinstance(result, ModelOutput) and hasattr(result, "past_key_values"):
@@ -2902,9 +2918,14 @@ def _sample(
unfinished_sequences = ops.ones(batch_size, dtype=mindspore.int64)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)

time_record = []
_record_time = parse_flag_from_env('INFERENCE_TIME_RECORD', False)

while self._has_unfinished_sequences(
this_peer_finished, synced_gpus, cur_len=cur_len, max_length=max_length
):
if _record_time:
infer_start = time.time()
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

@@ -2971,10 +2992,20 @@ def _sample(
this_peer_finished = unfinished_sequences.max() == 0
cur_len += 1

if _record_time:
infer_stop = time.time()
time_record.append(infer_stop - infer_start)
# This is needed to properly delete outputs.logits which may be very large for first iteration
# Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
del outputs

average_infer_time = None
if time_record:
time_record.pop(0)
average_infer_time = sum(time_record) / len(time_record)
print(f'average inference time is: {average_infer_time}')
print(f'inference time record: {time_record}')

if streamer is not None:
streamer.end()

@@ -2990,6 +3021,7 @@ def _sample(
cross_attentions=cross_attentions,
decoder_hidden_states=decoder_hidden_states,
past_key_values=model_kwargs.get("past_key_values"),
average_infer_time=average_infer_time
)
else:
return GenerateDecoderOnlyOutput(
@@ -2999,6 +3031,7 @@ def _sample(
attentions=decoder_attentions,
hidden_states=decoder_hidden_states,
past_key_values=model_kwargs.get("past_key_values"),
average_infer_time=average_infer_time
)
else:
return input_ids
@@ -3130,7 +3163,13 @@ def _beam_search(

decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder


time_record = []
_record_time = parse_flag_from_env('INFERENCE_TIME_RECORD', False)

while self._has_unfinished_sequences(this_peer_finished, synced_gpus):
if _record_time:
infer_start = time.time()
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

# prepare variable output controls (note: some models won't accept all output controls)
@@ -3295,6 +3334,17 @@ def replace_negative_indices(next_tokens):
if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
this_peer_finished = True

if _record_time:
infer_stop = time.time()
time_record.append(infer_stop - infer_start)

average_infer_time = None
if time_record:
time_record.pop(0)
average_infer_time = sum(time_record) / len(time_record)
print(f'average inference time is: {average_infer_time}')
print(f'inference time record: {time_record}')

sequence_outputs = beam_scorer.finalize(
input_ids,
beam_scores,
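
The timing added to _sample and _beam_search records one wall-clock measurement per decoding step and drops the first measurement before averaging, since the first step typically includes graph compilation and other warm-up costs. A standalone sketch of the same bookkeeping, with illustrative names that are not part of the patch:

    import time

    def average_step_time(step_fn, num_steps):
        # Time each decoding step; discard the warm-up step before averaging.
        time_record = []
        for _ in range(num_steps):
            start = time.time()
            step_fn()
            time_record.append(time.time() - start)
        if len(time_record) > 1:
            time_record.pop(0)  # first step is dominated by compilation / warm-up
        return sum(time_record) / len(time_record) if time_record else None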
3 changes: 2 additions & 1 deletion scripts/run_pynative_profile.sh
@@ -1 +1,2 @@
export MS_ENABLE_RUNTIME_PROFILER=1
export MS_ENABLE_RUNTIME_PROFILER=1
# export INFERENCE_TIME_RECORD=1
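
Both environment variables are interpreted inside generate() by parse_flag_from_env from mindnlp's testing utilities. A rough sketch of the expected behaviour, assuming it follows the common convention of treating truthy strings as enabled (the actual implementation may differ):

    import os

    def parse_flag_from_env(key, default=False):
        # Sketch only: return the default when unset, otherwise accept common truthy spellings.
        value = os.environ.get(key)
        if value is None:
            return default
        return value.strip().lower() in ("1", "true", "yes", "on")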
17 changes: 16 additions & 1 deletion tests/ut/transformers/models/bert/test_modeling_bert.py
@@ -677,4 +677,19 @@ def test_sdpa_ignored_mask(self):
res_sdpa = model_sdpa(**inp, past_key_values=pkv)
self.assertTrue(
ops.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-3, rtol=1e-3)
)
)

@slow
def test_inference_time(self):
import time
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
infer_time = []
with no_grad():
for i in range(20):
s = time.time()
output = model(input_ids, attention_mask=attention_mask)[0]
t = time.time()
infer_time.append(t - s)
print(infer_time)
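
The list printed above includes the first forward pass, which is usually dominated by compilation. If a single summary figure is wanted, something along these lines could be appended to the test (illustrative only, not part of the patch):

    import statistics
    steady_state = infer_time[1:]  # drop the warm-up pass
    print(f"mean: {statistics.mean(steady_state):.4f}s, median: {statistics.median(steady_state):.4f}s")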
29 changes: 24 additions & 5 deletions tests/ut/transformers/models/clip/test_modeling_clip.py
@@ -12,18 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch CLIP model."""
"""Testing suite for the MindSpore CLIP model."""

import inspect
import os
import tempfile
import unittest
from typing import Optional, Tuple

import numpy as np
import requests
from parameterized import parameterized
from pytest import mark

from mindnlp.transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from mindnlp.utils.testing_utils import (
@@ -647,3 +643,26 @@ def test_inference(self):
print(outputs.logits_per_image)

self.assertTrue(ops.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))

@slow
def test_inference_time(self):
import time
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

image = prepare_img()
inputs = processor(
text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="ms"
)

infer_time = []
# forward pass
with no_grad():
for i in range(20):
s = time.time()
outputs = model(**inputs)
t = time.time()
infer_time.append(t - s)

print(infer_time)
11 changes: 11 additions & 0 deletions tests/ut/transformers/models/mixtral/test_modeling_mixtral.py
@@ -446,3 +446,14 @@ def test_small_model_logits_batched(self):
self.assertTrue(np.allclose(logits[0, :3, :3].half().asnumpy(), EXPECTED_LOGITS_LEFT.asnumpy(), atol=1e-3, rtol=1e-3))
self.assertTrue(np.allclose(logits[0, -3:, -3:].half().asnumpy(), EXPECTED_LOGITS_LEFT_UNPADDED.asnumpy(), atol=1e-3, rtol=1e-3))
self.assertTrue(np.allclose(logits[1, -3:, -3:].half().asnumpy(), EXPECTED_LOGITS_RIGHT_UNPADDED.asnumpy(), atol=1e-3, rtol=1e-3))

@slow
@require_mindspore
def test_small_model_generate_time(self):
model_id = "hf-internal-testing/Mixtral-tiny"
dummy_input = mindspore.Tensor([[0, 1, 0], [0, 1, 0]])

model = MixtralForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16)
# TODO: might need to tweak it in case the logits do not match on our daily runners
# these logits have been obtained with the original megablocks implementation.
model.generate(dummy_input, max_new_tokens=20)
11 changes: 11 additions & 0 deletions tests/ut/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py
@@ -553,3 +553,14 @@ def test_speculative_generation(self):

del model
gc.collect()

@slow
def test_model_a2_7b_generation_time(self):
EXPECTED_TEXT_COMPLETION = """To be or not to be, that is the question. This is the question that has been asked by many people over the"""
prompt = "To be or not to"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False, mirror='modelscope')
model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", mirror='modelscope')
input_ids = tokenizer.encode(prompt, return_tensors="ms")

# greedy generation outputs
generated_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
22 changes: 22 additions & 0 deletions tests/ut/transformers/models/t5/test_modeling_t5.py
@@ -1458,6 +1458,28 @@ def test_contrastive_search_t5(self):
)


@slow
def test_translation_inference_time(self):
model = self.model # google-t5/t5-base
tok = self.tokenizer
use_task_specific_params(model, "translation_en_to_fr")

en_text = (
' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of'
" countless generations of stars: the oldest stars are seen as blue dots. "
)

input_ids = tok.encode(model.config.prefix + en_text, return_tensors="ms")

output = model.generate(
input_ids=input_ids,
max_new_tokens=50,
do_sample=False,
)
print(output)


@require_mindspore
class TestAsymmetricT5(unittest.TestCase):
def build_model_and_check_forward_pass(self, **kwargs):