diff --git a/hack/optimize/nvidia/Dockerfile b/hack/optimize/nvidia/Dockerfile
new file mode 100644
index 000000000..31811bfb6
--- /dev/null
+++ b/hack/optimize/nvidia/Dockerfile
@@ -0,0 +1,27 @@
+# Use NVIDIA CUDA base image
+FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
+
+# Install dependencies
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3.10-dev \
+    python3.10-distutils \
+    curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install pip
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+
+# Set up Python environment and install torch with CUDA support
+WORKDIR /app
+RUN python3.10 -m pip install --upgrade pip && \
+    python3.10 -m pip install --no-cache-dir torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
+
+# Install additional dependencies
+COPY requirements.txt .
+RUN python3.10 -m pip install --no-cache-dir -r requirements.txt
+
+# Copy scripts into the container
+COPY train_bert_nvidia.py /app/train_bert_nvidia.py
+COPY infer_bert_nvidia.py /app/infer_bert_nvidia.py
+
diff --git a/hack/optimize/nvidia/infer_bert_nvidia.py b/hack/optimize/nvidia/infer_bert_nvidia.py
new file mode 100644
index 000000000..bf5b0a043
--- /dev/null
+++ b/hack/optimize/nvidia/infer_bert_nvidia.py
@@ -0,0 +1,57 @@
+import os
+import time
+import torch
+from transformers import BertForPreTraining, BertTokenizer
+from torch.utils.data import DataLoader, TensorDataset
+
+def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
+    sentences = [
+        f"This is a dummy sentence number {i}" for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    return TensorDataset(
+        tokenized_inputs.input_ids,
+        tokenized_inputs.attention_mask
+    )
+
+def inference_bert(model, tokenizer, batch_sizes, device):
+    model = model.to(device)
+    model.eval()
+
+    dataset = create_dummy_data(tokenizer)
+    for batch_size in batch_sizes:
+        try:
+            inference_dataloader = DataLoader(dataset, batch_size=batch_size)
+            start_time = time.time()
+            with torch.no_grad():
+                for batch in inference_dataloader:
+                    inputs, masks = batch
+                    inputs, masks = inputs.to(device), masks.to(device)
+                    outputs = model(input_ids=inputs, attention_mask=masks)
+            end_time = time.time()
+            print(f"Batch Size: {batch_size} Inference time: {end_time - start_time:.2f} seconds")
+            break
+        except RuntimeError as e:
+            if 'out of memory' in str(e).lower():
+                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
+                torch.cuda.empty_cache()
+                continue
+            else:
+                raise e
+
+def main():
+    device = torch.device('cuda')
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    model = BertForPreTraining.from_pretrained("bert-base-uncased")
+    batch_sizes = [1024, 512, 256, 128, 64, 32, 16, 8]
+    inference_bert(model, tokenizer, batch_sizes, device)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/hack/optimize/nvidia/requirements.txt b/hack/optimize/nvidia/requirements.txt
new file mode 100644
index 000000000..2a36640ee
--- /dev/null
+++ b/hack/optimize/nvidia/requirements.txt
@@ -0,0 +1,3 @@
+transformers==4.29
+numpy==1.23
+pynvml
diff --git a/hack/optimize/nvidia/train_bert_nvidia.py b/hack/optimize/nvidia/train_bert_nvidia.py
new file mode 100644
index 000000000..969ec4a6e
--- /dev/null
+++ b/hack/optimize/nvidia/train_bert_nvidia.py
@@ -0,0 +1,75 @@
+import os
+import time
+import torch
+from transformers import BertForPreTraining, BertTokenizer
+from torch.utils.data import DataLoader, TensorDataset
+
+def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
+    sentences = [
+        f"This is a dummy sentence number {i}" for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    labels = tokenized_inputs.input_ids.detach().clone()
+    next_sentence_labels = torch.randint(0, 2, (num_samples,))
+    return TensorDataset(
+        tokenized_inputs.input_ids,
+        tokenized_inputs.attention_mask,
+        labels,
+        next_sentence_labels,
+    )
+
+def train_bert(model, tokenizer, batch_sizes, device):
+    model = model.to(device)
+    model.train()
+
+    dataset = create_dummy_data(tokenizer)
+    for batch_size in batch_sizes:
+        try:
+            train_dataloader = DataLoader(dataset, batch_size=batch_size)
+            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
+            for _ in range(2):
+                for batch in train_dataloader:
+                    optimizer.zero_grad()
+                    inputs, masks, labels, next_sentence_labels = batch
+                    inputs, masks, labels, next_sentence_labels = (
+                        inputs.to(device),
+                        masks.to(device),
+                        labels.to(device),
+                        next_sentence_labels.to(device),
+                    )
+                    outputs = model(
+                        input_ids=inputs,
+                        attention_mask=masks,
+                        labels=labels,
+                        next_sentence_label=next_sentence_labels,
+                    )
+                    loss = outputs.loss
+                    loss.backward()
+                    optimizer.step()
+                break
+            print(f"Batch Size: {batch_size} Training complete.")
+            break
+        except RuntimeError as e:
+            if 'out of memory' in str(e).lower():
+                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
+                torch.cuda.empty_cache()
+                continue
+            else:
+                raise e
+
+def main():
+    device = torch.device('cuda')
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    model = BertForPreTraining.from_pretrained("bert-base-uncased")
+    batch_sizes = [1024, 512, 256, 128, 64, 32, 16, 8]
+    train_bert(model, tokenizer, batch_sizes, device)
+
+if __name__ == "__main__":
+    main()