System Info
os: Linux
gpu: A100

Reproduction
import gc
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model name
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
def stream_generate(prompt, max_new_tokens=512):
    # `model` and `tokenizer` are module-level globals set in __main__
    # Build the chat messages
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Prepare the model inputs
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Create the streamer
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # Run generation in a background thread
    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Stream the generated text as it is produced
    print("Generated text: ", end="", flush=True)
    for new_text in streamer:
        print(new_text, end="", flush=True)
    print("\n")
    thread.join()
def init_model(model_name):
    from transformers import BitsAndBytesConfig

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda:0",
        attn_implementation="flash_attention_2",
        torch_dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            # llm_int8_threshold=5.0,
            llm_int8_skip_modules=["lm_head"],
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        ),
    )
    return model, tokenizer
if __name__ == "__main__":
    # Load the model and tokenizer
    model, tokenizer = init_model(MODEL_NAME)
    # Test streaming generation
    prompt = "请用中文介绍一下大语言模型。"
    stream_generate(prompt)
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    # Load a second checkpoint and generate again
    model, tokenizer = init_model("prithivMLmods/Qwen-UMLS-7B-Instruct")
    prompt = "请详细解释一下Transformer架构。"
    stream_generate(prompt)
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    # Load a third checkpoint and generate again
    model, tokenizer = init_model("Qwen/Qwen2.5-7B-Instruct")
    prompt = "请详细解释一下你是谁。"
    stream_generate(prompt)
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    print("done")
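To see where the leftover memory sits, a small diagnostic can be printed after each cleanup step. This is only a sketch using standard PyTorch calls; report is a helper name made up here for illustration:

def report(tag):
    # Memory currently held by live PyTorch tensors
    allocated = torch.cuda.memory_allocated() / 1024**3
    # Memory reserved by PyTorch's caching allocator (allocated + cached)
    reserved = torch.cuda.memory_reserved() / 1024**3
    # Free/total device memory as the driver sees it (includes the CUDA
    # context and anything held outside PyTorch's allocator)
    free, total = torch.cuda.mem_get_info()
    print(f"[{tag}] allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB, "
          f"device in use={(total - free) / 1024**3:.2f} GiB")

Calling report(...) right after each del / gc.collect() / torch.cuda.empty_cache() sequence shows whether the unreleased memory is still tracked by PyTorch (reserved) or lives outside its allocator. transformers also exposes model.get_memory_footprint(), which helps separate the quantized weights from everything else loaded alongside them.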
Expected behavior
After the first model is loaded, deleting it and calling gc.collect() plus torch.cuda.empty_cache() does not release the GPU memory it occupied. The additional GPU memory taken by the second and third model loads, however, is released normally by the same cleanup.
What can be done to fully reclaim the GPU memory?
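For reference, a more aggressive teardown is sometimes suggested; this is only a sketch built from standard PyTorch calls, and whether it actually returns the first load's memory is exactly the question here:

del model
del tokenizer
gc.collect()
# Wait for any queued kernels (e.g. the background generation thread) to finish
torch.cuda.synchronize()
# Return cached blocks from PyTorch's allocator to the driver
torch.cuda.empty_cache()
# Release memory held for CUDA IPC (tensors shared across processes)
torch.cuda.ipc_collect()

Even then, nvidia-smi is unlikely to drop back to zero for the process: the first load also pays one-time, per-process costs (most notably the CUDA context, plus whatever flash-attention and bitsandbytes initialize on first use) that torch.cuda.empty_cache() cannot release, and that memory only comes back when the process exits. Running each load in a separate subprocess would sidestep this, but that is an assumption rather than a confirmed fix.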