From af9fda0d21a52e9538ff4deed221eb2f5ef0a588 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Wed, 26 Jun 2024 21:15:29 +0000 Subject: [PATCH 01/16] Add python training script, requirements.txt (dependencies), and dockerfile for the e2e BERT training task --- .../bert-training/Dockerfile.bert-training | 40 +++++++ .../images/bert-training/requirements.txt | 3 + e2e2/test/images/bert-training/train.py | 100 ++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100644 e2e2/test/images/bert-training/Dockerfile.bert-training create mode 100644 e2e2/test/images/bert-training/requirements.txt create mode 100644 e2e2/test/images/bert-training/train.py diff --git a/e2e2/test/images/bert-training/Dockerfile.bert-training b/e2e2/test/images/bert-training/Dockerfile.bert-training new file mode 100644 index 000000000..2b1f1eb0c --- /dev/null +++ b/e2e2/test/images/bert-training/Dockerfile.bert-training @@ -0,0 +1,40 @@ +# Use the NVIDIA CUDA runtime as a parent image +FROM nvidia/cuda:12.5.0-devel-ubuntu22.04 + +# Set environment variable to disable interactive prompts +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python 3.11 +RUN apt-get update && apt-get install -y \ + software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y \ + python3.11 \ + python3.11-dev \ + python3.11-distutils \ + python3-pip && \ + rm -rf /var/lib/apt/lists/* + +# Create a symbolic link to use python3.11 as python +RUN ln -sf /usr/bin/python3.11 /usr/bin/python + +# Set the working directory in the container +WORKDIR /app + +# Copy only the necessary files into the container at /app +COPY train.py /app/ +COPY requirements.txt /app/ + +# Install any needed packages specified in requirements.txt +RUN python -m pip install --upgrade pip && \ + pip install --no-cache-dir -r requirements.txt + +# Install OpenMPI +RUN apt-get update && \ + apt-get install -y openmpi-bin openmpi-common libopenmpi-dev && \ + rm -rf /var/lib/apt/lists/* + +# Set environment variables for OpenMPI +ENV PATH="/usr/lib64/openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}" \ No newline at end of file diff --git a/e2e2/test/images/bert-training/requirements.txt b/e2e2/test/images/bert-training/requirements.txt new file mode 100644 index 000000000..a9831ed72 --- /dev/null +++ b/e2e2/test/images/bert-training/requirements.txt @@ -0,0 +1,3 @@ +torch==2.3 +transformers==4.29 +numpy==1.23 \ No newline at end of file diff --git a/e2e2/test/images/bert-training/train.py b/e2e2/test/images/bert-training/train.py new file mode 100644 index 000000000..b3251c4d4 --- /dev/null +++ b/e2e2/test/images/bert-training/train.py @@ -0,0 +1,100 @@ +import os +import time +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +from transformers import BertForPreTraining, BertTokenizer +from torch.utils.data import DataLoader, TensorDataset +import numpy as np + +def create_dummy_data(tokenizer, num_samples=100, max_length=128): + # Create dummy input data + sentences = ["This is a dummy sentence number {}".format(i) for i in range(num_samples)] + tokenized_inputs = tokenizer(sentences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt") + labels = tokenized_inputs.input_ids.detach().clone() + + # MLM task: randomly mask some tokens + mlm_probability = 0.15 + input_ids, labels = mask_tokens(tokenized_inputs.input_ids, tokenizer, mlm_probability) + + # NSP task: create dummy pairs + 
next_sentence_labels = torch.randint(0, 2, (num_samples,)) + + return TensorDataset(input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels) + +def mask_tokens(inputs, tokenizer, mlm_probability): + labels = inputs.clone() + probability_matrix = torch.full(labels.shape, mlm_probability) + special_tokens_mask = [ + tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) + + return inputs, labels + +def setup(rank, world_size): + os.environ['MASTER_ADDR'] = os.environ['MASTER_ADDR'] # Kubernetes sets this + os.environ['MASTER_PORT'] = os.environ['MASTER_PORT'] # Kubernetes sets this + dist.init_process_group("nccl", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + print(f"Process {rank} initialized, using GPU {rank}") + +def cleanup(): + dist.destroy_process_group() + +def train_bert(rank, world_size, model, tokenizer): + setup(rank, world_size) + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForPreTraining.from_pretrained('bert-base-uncased').to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + dataset = create_dummy_data(tokenizer) + train_sampler = torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=world_size, rank=rank) + train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=8) + + optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001) + criterion = torch.nn.CrossEntropyLoss() + + start_time = time.time() + + for epoch in range(1): # Short run for testing + ddp_model.train() + for batch in train_dataloader: + optimizer.zero_grad() + inputs, masks, labels, next_sentence_labels = batch + inputs, masks, labels, next_sentence_labels = inputs.to(rank), masks.to(rank), labels.to(rank), next_sentence_labels.to(rank) + outputs = ddp_model(input_ids=inputs, attention_mask=masks, labels=labels, next_sentence_label=next_sentence_labels) + loss = outputs.loss + loss.backward() + optimizer.step() + + end_time = time.time() + training_time = end_time - start_time + throughput = len(dataset) / training_time + + metrics = { + "training_time": training_time, + "throughput": throughput, + } + + print(f"Process {rank} - Training time: {training_time:.2f} seconds") + print(f"Process {rank} - Throughput: {throughput:.2f} samples/second") + + cleanup() + +def main(): + # Pre-download model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForPreTraining.from_pretrained('bert-base-uncased') + + rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + train_bert(rank, world_size, model, tokenizer) + +if __name__ == "__main__": + main() \ No newline at end of file From 104fa93bc7240bf67d0d9a83933a023a68d75fe5 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Wed, 26 Jun 2024 21:19:15 +0000 Subject: [PATCH 02/16] Add github action to build bert-testing image on PR --- .github/workflows/ci.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1247571d2..8d8e6fc96 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -30,4 +30,9 @@ jobs: runs-on: 
ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
\ No newline at end of file
+    - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
+  build-bert-training:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - run: docker build --file e2e2/test/images/bert-training/Dockerfile.bert-training .

From 477f67216326be229216af6f8e26d2604bebd737 Mon Sep 17 00:00:00 2001
From: Matt Johnson
Date: Wed, 26 Jun 2024 21:49:48 +0000
Subject: [PATCH 03/16] Specify directory the BERT training image should be built in for the github action

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8d8e6fc96..56b376120 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -35,4 +35,4 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - run: docker build --file e2e2/test/images/bert-training/Dockerfile.bert-training .
+    - run: docker build --file e2e2/test/images/bert-training/Dockerfile.bert-training e2e2/test/images/bert-training

From fb7d18f38c0e7445fc0d6af9d5ab420ad4d9d5d3 Mon Sep 17 00:00:00 2001
From: Matt Johnson
Date: Thu, 27 Jun 2024 00:38:23 +0000
Subject: [PATCH 04/16] Add default values and include in docker env for MASTER_ADDR and MASTER_PORT

---
 e2e2/test/images/bert-training/Dockerfile.bert-training | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/e2e2/test/images/bert-training/Dockerfile.bert-training b/e2e2/test/images/bert-training/Dockerfile.bert-training
index 2b1f1eb0c..038766e3d 100644
--- a/e2e2/test/images/bert-training/Dockerfile.bert-training
+++ b/e2e2/test/images/bert-training/Dockerfile.bert-training
@@ -4,6 +4,10 @@ FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
 # Set environment variable to disable interactive prompts
 ENV DEBIAN_FRONTEND=noninteractive
 
+# Set default values for MASTER_ADDR and MASTER_PORT
+ENV MASTER_ADDR=127.0.0.1
+ENV MASTER_PORT=12355
+
 # Install Python 3.11
 RUN apt-get update && apt-get install -y \
     software-properties-common && \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get update && \
     apt-get install -y \
     python3.11 \
     python3.11-dev \
     python3.11-distutils \
     python3-pip && \
     rm -rf /var/lib/apt/lists/*

From b5aedc7276c79c1d6cd93b3f95e7b6b986352a94 Mon Sep 17 00:00:00 2001
From: Matt Johnson
Date: Thu, 27 Jun 2024 00:39:36 +0000
Subject: [PATCH 05/16] Slightly change env var value retrieval. Also ran a formatter to pretty it up.
--- e2e2/test/images/bert-training/train.py | 85 +++++++++++++++++-------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/e2e2/test/images/bert-training/train.py b/e2e2/test/images/bert-training/train.py index b3251c4d4..da23f69eb 100644 --- a/e2e2/test/images/bert-training/train.py +++ b/e2e2/test/images/bert-training/train.py @@ -1,4 +1,5 @@ import os +import json import time import torch import torch.distributed as dist @@ -7,28 +8,45 @@ from torch.utils.data import DataLoader, TensorDataset import numpy as np + def create_dummy_data(tokenizer, num_samples=100, max_length=128): # Create dummy input data - sentences = ["This is a dummy sentence number {}".format(i) for i in range(num_samples)] - tokenized_inputs = tokenizer(sentences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt") + sentences = [ + "This is a dummy sentence number {}".format(i) for i in range(num_samples) + ] + tokenized_inputs = tokenizer( + sentences, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt", + ) labels = tokenized_inputs.input_ids.detach().clone() # MLM task: randomly mask some tokens mlm_probability = 0.15 - input_ids, labels = mask_tokens(tokenized_inputs.input_ids, tokenizer, mlm_probability) + input_ids, labels = mask_tokens( + tokenized_inputs.input_ids, tokenizer, mlm_probability + ) # NSP task: create dummy pairs next_sentence_labels = torch.randint(0, 2, (num_samples,)) - return TensorDataset(input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels) + return TensorDataset( + input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels + ) + def mask_tokens(inputs, tokenizer, mlm_probability): labels = inputs.clone() probability_matrix = torch.full(labels.shape, mlm_probability) special_tokens_mask = [ - tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) + for val in labels.tolist() ] - probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + probability_matrix.masked_fill_( + torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0 + ) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 # We only compute loss on masked tokens @@ -36,25 +54,35 @@ def mask_tokens(inputs, tokenizer, mlm_probability): return inputs, labels + def setup(rank, world_size): - os.environ['MASTER_ADDR'] = os.environ['MASTER_ADDR'] # Kubernetes sets this - os.environ['MASTER_PORT'] = os.environ['MASTER_PORT'] # Kubernetes sets this - dist.init_process_group("nccl", rank=rank, world_size=world_size) + master_addr = os.environ["MASTER_ADDR"] + master_port = os.environ["MASTER_PORT"] + dist.init_process_group( + "nccl", + init_method=f"tcp://{master_addr}:{master_port}", + rank=rank, + world_size=world_size, + ) torch.cuda.set_device(rank) print(f"Process {rank} initialized, using GPU {rank}") + def cleanup(): dist.destroy_process_group() + def train_bert(rank, world_size, model, tokenizer): setup(rank, world_size) - - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForPreTraining.from_pretrained('bert-base-uncased').to(rank) + + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertForPreTraining.from_pretrained("bert-base-uncased").to(rank) ddp_model = DDP(model, device_ids=[rank]) dataset = create_dummy_data(tokenizer) - train_sampler = 
torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=world_size, rank=rank) + train_sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank + ) train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=8) optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001) @@ -67,8 +95,18 @@ def train_bert(rank, world_size, model, tokenizer): for batch in train_dataloader: optimizer.zero_grad() inputs, masks, labels, next_sentence_labels = batch - inputs, masks, labels, next_sentence_labels = inputs.to(rank), masks.to(rank), labels.to(rank), next_sentence_labels.to(rank) - outputs = ddp_model(input_ids=inputs, attention_mask=masks, labels=labels, next_sentence_label=next_sentence_labels) + inputs, masks, labels, next_sentence_labels = ( + inputs.to(rank), + masks.to(rank), + labels.to(rank), + next_sentence_labels.to(rank), + ) + outputs = ddp_model( + input_ids=inputs, + attention_mask=masks, + labels=labels, + next_sentence_label=next_sentence_labels, + ) loss = outputs.loss loss.backward() optimizer.step() @@ -77,24 +115,21 @@ def train_bert(rank, world_size, model, tokenizer): training_time = end_time - start_time throughput = len(dataset) / training_time - metrics = { - "training_time": training_time, - "throughput": throughput, - } - print(f"Process {rank} - Training time: {training_time:.2f} seconds") print(f"Process {rank} - Throughput: {throughput:.2f} samples/second") cleanup() + def main(): # Pre-download model and tokenizer - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForPreTraining.from_pretrained('bert-base-uncased') + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertForPreTraining.from_pretrained("bert-base-uncased") - rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) + world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) train_bert(rank, world_size, model, tokenizer) + if __name__ == "__main__": - main() \ No newline at end of file + main() From 7f9480b9efe324912d4cdcaaa46b0ceaa2dc30eb Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Fri, 28 Jun 2024 06:08:35 +0000 Subject: [PATCH 06/16] Update bert training dockerfile to include amazon specific packages for MPI, NCCL, and EFA. 
--- .../bert-training/Dockerfile.bert-training | 64 +++++++++++++++++-- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/e2e2/test/images/bert-training/Dockerfile.bert-training b/e2e2/test/images/bert-training/Dockerfile.bert-training index 038766e3d..9e0638ecd 100644 --- a/e2e2/test/images/bert-training/Dockerfile.bert-training +++ b/e2e2/test/images/bert-training/Dockerfile.bert-training @@ -34,11 +34,61 @@ COPY requirements.txt /app/ RUN python -m pip install --upgrade pip && \ pip install --no-cache-dir -r requirements.txt -# Install OpenMPI -RUN apt-get update && \ - apt-get install -y openmpi-bin openmpi-common libopenmpi-dev && \ - rm -rf /var/lib/apt/lists/* +ARG EFA_INSTALLER_VERSION=latest +# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0 +ARG AWS_OFI_NCCL_VERSION=1.9.1 +ARG NCCL_TESTS_VERSION=master + +# Install necessary dependencies and remove old ones +RUN apt-get update -y && \ + apt-get remove -y --allow-change-held-packages \ + libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev && \ + rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx /etc/ld.so.conf.d/hpcx.conf && \ + ldconfig && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + sudo git gcc vim kmod openssh-client openssh-server build-essential \ + wget curl autoconf libtool gdb automake python3-distutils cmake \ + apt-utils devscripts debhelper libsubunit-dev check pkg-config libhwloc-dev + +# SSH configuration +RUN mkdir -p /var/run/sshd && \ + sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +# Set environment variables for OpenMPI and CUDA +ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH +ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH + +# Install EFA +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +# Install NCCL +RUN apt-key del 7fa2af80 && \ + curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2 + +# Install AWS-OFI-NCCL plugin +RUN export OPAL_PREFIX="" && \ + git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ + cd /opt/aws-ofi-nccl && \ + git checkout v${AWS_OFI_NCCL_VERSION}-aws && \ + ./autogen.sh && \ + ./configure --prefix=/opt/aws-ofi-nccl/install --with-libfabric=/opt/amazon/efa/ --with-cuda=/usr/local/cuda --with-mpi=/opt/amazon/openmpi/ && \ + make && make install + +# Set default values for MASTER_ADDR and MASTER_PORT for local testing +ENV MASTER_ADDR=127.0.0.1 +ENV MASTER_PORT=12355 -# Set environment variables for OpenMPI -ENV PATH="/usr/lib64/openmpi/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}" \ No newline at end of file +# Set environment variables for NCCL 
and clean up +ENV NCCL_PROTO simple +RUN rm -rf /var/lib/apt/lists/* +# Ensure NCCL library is found first +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH From 19613e1b51bb463d1088d772d66b7dfa569c5452 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Tue, 16 Jul 2024 16:16:16 +0000 Subject: [PATCH 07/16] Change Dockerfile.bert-training file name to just Dockerfile --- .../images/bert-training/{Dockerfile.bert-training => Dockerfile} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename e2e2/test/images/bert-training/{Dockerfile.bert-training => Dockerfile} (100%) diff --git a/e2e2/test/images/bert-training/Dockerfile.bert-training b/e2e2/test/images/bert-training/Dockerfile similarity index 100% rename from e2e2/test/images/bert-training/Dockerfile.bert-training rename to e2e2/test/images/bert-training/Dockerfile From 974da505a57c0595adf09043df89ef9c43905f82 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Tue, 16 Jul 2024 16:21:45 +0000 Subject: [PATCH 08/16] Update git workflow to use new Dockerfile path since the name was updated --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 56b376120..782d07c43 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -35,4 +35,4 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - run: docker build --file e2e2/test/images/bert-training/Dockerfile.bert-training e2e2/test/images/bert-training + - run: docker build --file e2e2/test/images/bert-training/Dockerfile e2e2/test/images/bert-training From 5b4ae1a8af6ad21f783fe87c343b55573ebff4b2 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Tue, 16 Jul 2024 17:07:07 +0000 Subject: [PATCH 09/16] Update Docker image to use Python version 3.10.12 and build from source to be consistent with the other test images --- e2e2/test/images/bert-training/Dockerfile | 66 ++++++++++++++++++----- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/e2e2/test/images/bert-training/Dockerfile b/e2e2/test/images/bert-training/Dockerfile index 9e0638ecd..1ec44468a 100644 --- a/e2e2/test/images/bert-training/Dockerfile +++ b/e2e2/test/images/bert-training/Dockerfile @@ -8,20 +8,58 @@ ENV DEBIAN_FRONTEND=noninteractive ENV MASTER_ADDR=127.0.0.1 ENV MASTER_PORT=12355 -# Install Python 3.11 -RUN apt-get update && apt-get install -y \ - software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get update && \ - apt-get install -y \ - python3.11 \ - python3.11-dev \ - python3.11-distutils \ - python3-pip && \ - rm -rf /var/lib/apt/lists/* - -# Create a symbolic link to use python3.11 as python -RUN ln -sf /usr/bin/python3.11 /usr/bin/python +# Dependency version numbers +ARG PYTHON=python3.10 +ARG PYTHON_VERSION=3.10.12 +ARG PIP=pip3 + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libopencv-dev \ + software-properties-common \ + wget \ + unzip \ + vim \ + pkg-config \ + gdb \ + lcov \ + libbz2-dev \ + zlib1g-dev \ + openssl \ + libssl-dev \ + libsqlite3-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + libncurses-dev \ + tk-dev \ + libffi-dev \ + libcap-dev \ + gnupg2 \ + gpg-agent \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install Python +RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ + && tar -xzf Python-$PYTHON_VERSION.tgz \ + && cd 
Python-$PYTHON_VERSION \ + && ./configure --enable-shared --prefix=/usr/local \ + && make -j $(nproc) && make install \ + && cd .. && rm -rf ../Python-$PYTHON_VERSION* \ + && ln -s /usr/local/bin/pip3 /usr/bin/pip \ + && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ + && ${PIP} --no-cache-dir install --upgrade \ + pip \ + setuptools # Set the working directory in the container WORKDIR /app From fa8d244d4cf7456c87957166d739ac431f773353 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Tue, 16 Jul 2024 17:34:03 +0000 Subject: [PATCH 10/16] Remove extra line --- .github/workflows/ci.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1e39c442c..b79a575b0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,7 +31,6 @@ jobs: steps: - uses: actions/checkout@v3 - run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference - build-bert-training: runs-on: ubuntu-latest steps: From f87ba65096893bb966fd6a9eebc020832f938fd5 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Thu, 18 Jul 2024 19:58:04 +0000 Subject: [PATCH 11/16] Had been setting MASTER_ADDR and MASTER_PORT env vars twice. Removed duplicate --- e2e2/test/images/bert-training/Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/e2e2/test/images/bert-training/Dockerfile b/e2e2/test/images/bert-training/Dockerfile index 1ec44468a..c00896c68 100644 --- a/e2e2/test/images/bert-training/Dockerfile +++ b/e2e2/test/images/bert-training/Dockerfile @@ -121,10 +121,6 @@ RUN export OPAL_PREFIX="" && \ ./configure --prefix=/opt/aws-ofi-nccl/install --with-libfabric=/opt/amazon/efa/ --with-cuda=/usr/local/cuda --with-mpi=/opt/amazon/openmpi/ && \ make && make install -# Set default values for MASTER_ADDR and MASTER_PORT for local testing -ENV MASTER_ADDR=127.0.0.1 -ENV MASTER_PORT=12355 - # Set environment variables for NCCL and clean up ENV NCCL_PROTO simple RUN rm -rf /var/lib/apt/lists/* From 7af6b138a5e4daee404adb0a9e11cbc82eb57afa Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Thu, 18 Jul 2024 19:58:21 +0000 Subject: [PATCH 12/16] Set each process to a GPU via local rank instead of overall rank --- e2e2/test/images/bert-training/train.py | 44 ++++++++++++++----------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/e2e2/test/images/bert-training/train.py b/e2e2/test/images/bert-training/train.py index da23f69eb..bf4d4762b 100644 --- a/e2e2/test/images/bert-training/train.py +++ b/e2e2/test/images/bert-training/train.py @@ -1,5 +1,4 @@ import os -import json import time import torch import torch.distributed as dist @@ -55,7 +54,7 @@ def mask_tokens(inputs, tokenizer, mlm_probability): return inputs, labels -def setup(rank, world_size): +def setup(rank, world_size, local_rank): master_addr = os.environ["MASTER_ADDR"] master_port = os.environ["MASTER_PORT"] dist.init_process_group( @@ -64,26 +63,22 @@ def setup(rank, world_size): rank=rank, world_size=world_size, ) - torch.cuda.set_device(rank) - print(f"Process {rank} initialized, using GPU {rank}") + torch.cuda.set_device(local_rank) + print(f"Process {rank} initialized, using GPU {local_rank}") def cleanup(): dist.destroy_process_group() -def train_bert(rank, world_size, model, tokenizer): - setup(rank, world_size) +def train_bert(rank, world_size, local_rank, model, tokenizer): + setup(rank, world_size, local_rank) - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = 
BertForPreTraining.from_pretrained("bert-base-uncased").to(rank) - ddp_model = DDP(model, device_ids=[rank]) + model = model.to(local_rank) + ddp_model = DDP(model, device_ids=[local_rank]) dataset = create_dummy_data(tokenizer) - train_sampler = torch.utils.data.distributed.DistributedSampler( - dataset, num_replicas=world_size, rank=rank - ) - train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=8) + train_dataloader = DataLoader(dataset, batch_size=8) optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001) criterion = torch.nn.CrossEntropyLoss() @@ -96,10 +91,10 @@ def train_bert(rank, world_size, model, tokenizer): optimizer.zero_grad() inputs, masks, labels, next_sentence_labels = batch inputs, masks, labels, next_sentence_labels = ( - inputs.to(rank), - masks.to(rank), - labels.to(rank), - next_sentence_labels.to(rank), + inputs.to(local_rank), + masks.to(local_rank), + labels.to(local_rank), + next_sentence_labels.to(local_rank), ) outputs = ddp_model( input_ids=inputs, @@ -122,13 +117,22 @@ def train_bert(rank, world_size, model, tokenizer): def main(): + rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) + world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) + + # TODO: Consider parameterizing for nodes of any GPU count + num_gpus_per_node = 8 # Adjust this based on your setup + local_rank = rank % num_gpus_per_node + + print(f"Process started for rank {rank} with local rank {local_rank}") + # Pre-download model and tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") model = BertForPreTraining.from_pretrained("bert-base-uncased") - rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) - world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) - train_bert(rank, world_size, model, tokenizer) + print(f"successfully downloaded model and tokenizer for rank: {rank}") + + train_bert(rank, world_size, local_rank, model, tokenizer) if __name__ == "__main__": From 1f5b1c9bc62ab61de6ca18db275dc0827658fc96 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Fri, 19 Jul 2024 21:53:11 +0000 Subject: [PATCH 13/16] Change comment describing section in dockerfile --- e2e2/test/images/bert-training/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e2/test/images/bert-training/Dockerfile b/e2e2/test/images/bert-training/Dockerfile index c00896c68..ccbd04e36 100644 --- a/e2e2/test/images/bert-training/Dockerfile +++ b/e2e2/test/images/bert-training/Dockerfile @@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive ENV MASTER_ADDR=127.0.0.1 ENV MASTER_PORT=12355 -# Dependency version numbers +# Python ependency version numbers ARG PYTHON=python3.10 ARG PYTHON_VERSION=3.10.12 ARG PIP=pip3 From 4a8e0ecc3e12f83795390be156de78129c2ea4a2 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Tue, 23 Jul 2024 19:00:03 +0000 Subject: [PATCH 14/16] parameterize number of gpus per node in Dockerfile and train.py --- e2e2/test/images/bert-training/Dockerfile | 3 +++ e2e2/test/images/bert-training/train.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/e2e2/test/images/bert-training/Dockerfile b/e2e2/test/images/bert-training/Dockerfile index ccbd04e36..6990d0778 100644 --- a/e2e2/test/images/bert-training/Dockerfile +++ b/e2e2/test/images/bert-training/Dockerfile @@ -8,6 +8,9 @@ ENV DEBIAN_FRONTEND=noninteractive ENV MASTER_ADDR=127.0.0.1 ENV MASTER_PORT=12355 +# Set default number of GPUs per node +ENV NUM_GPUS_PER_NODE=8 + # Python ependency version numbers ARG PYTHON=python3.10 ARG PYTHON_VERSION=3.10.12 diff --git 
a/e2e2/test/images/bert-training/train.py b/e2e2/test/images/bert-training/train.py index bf4d4762b..116fc3108 100644 --- a/e2e2/test/images/bert-training/train.py +++ b/e2e2/test/images/bert-training/train.py @@ -120,8 +120,8 @@ def main(): rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) - # TODO: Consider parameterizing for nodes of any GPU count - num_gpus_per_node = 8 # Adjust this based on your setup + + num_gpus_per_node = int(os.environ["NUM_GPUS_PER_NODE"]) local_rank = rank % num_gpus_per_node print(f"Process started for rank {rank} with local rank {local_rank}") From 01d8270705bfc5f49c35a62f3dafef26843dad17 Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Wed, 31 Jul 2024 22:12:14 +0000 Subject: [PATCH 15/16] formatting in train.py --- e2e2/test/images/bert-training/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/e2e2/test/images/bert-training/train.py b/e2e2/test/images/bert-training/train.py index 116fc3108..0f9b5447d 100644 --- a/e2e2/test/images/bert-training/train.py +++ b/e2e2/test/images/bert-training/train.py @@ -120,7 +120,6 @@ def main(): rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) - num_gpus_per_node = int(os.environ["NUM_GPUS_PER_NODE"]) local_rank = rank % num_gpus_per_node From f000ec650058283f7bb463b0cb00c23a6c6dbb6f Mon Sep 17 00:00:00 2001 From: Matt Johnson Date: Fri, 11 Oct 2024 12:48:48 +0000 Subject: [PATCH 16/16] Add nvidia batch optimization scripts for both training and inference --- hack/optimize/nvidia/Dockerfile | 27 ++++++++ hack/optimize/nvidia/infer_bert_nvidia.py | 57 +++++++++++++++++ hack/optimize/nvidia/requirements.txt | 3 + hack/optimize/nvidia/train_bert_nvidia.py | 75 +++++++++++++++++++++++ 4 files changed, 162 insertions(+) create mode 100644 hack/optimize/nvidia/Dockerfile create mode 100644 hack/optimize/nvidia/infer_bert_nvidia.py create mode 100644 hack/optimize/nvidia/requirements.txt create mode 100644 hack/optimize/nvidia/train_bert_nvidia.py diff --git a/hack/optimize/nvidia/Dockerfile b/hack/optimize/nvidia/Dockerfile new file mode 100644 index 000000000..31811bfb6 --- /dev/null +++ b/hack/optimize/nvidia/Dockerfile @@ -0,0 +1,27 @@ +# Use NVIDIA CUDA base image +FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 + +# Install dependencies +RUN apt-get update && apt-get install -y \ + python3.10 \ + python3.10-dev \ + python3.10-distutils \ + curl && \ + rm -rf /var/lib/apt/lists/* + +# Install pip +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 + +# Set up Python environment and install torch with CUDA support +WORKDIR /app +RUN python3.10 -m pip install --upgrade pip && \ + python3.10 -m pip install --no-cache-dir torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html + +# Install additional dependencies +COPY requirements.txt . 
+RUN python3.10 -m pip install --no-cache-dir -r requirements.txt + +# Copy scripts into the container +COPY train_bert_nvidia.py /app/train_bert_nvidia.py +COPY infer_bert_nvidia.py /app/infer_bert_nvidia.py + diff --git a/hack/optimize/nvidia/infer_bert_nvidia.py b/hack/optimize/nvidia/infer_bert_nvidia.py new file mode 100644 index 000000000..bf5b0a043 --- /dev/null +++ b/hack/optimize/nvidia/infer_bert_nvidia.py @@ -0,0 +1,57 @@ +import os +import time +import torch +from transformers import BertForPreTraining, BertTokenizer +from torch.utils.data import DataLoader, TensorDataset + +def create_dummy_data(tokenizer, num_samples=1000, max_length=128): + sentences = [ + f"This is a dummy sentence number {i}" for i in range(num_samples) + ] + tokenized_inputs = tokenizer( + sentences, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt", + ) + return TensorDataset( + tokenized_inputs.input_ids, + tokenized_inputs.attention_mask + ) + +def inference_bert(model, tokenizer, batch_sizes, device): + model = model.to(device) + model.eval() + + dataset = create_dummy_data(tokenizer) + for batch_size in batch_sizes: + try: + inference_dataloader = DataLoader(dataset, batch_size=batch_size) + start_time = time.time() + with torch.no_grad(): + for batch in inference_dataloader: + inputs, masks = batch + inputs, masks = inputs.to(device), masks.to(device) + outputs = model(input_ids=inputs, attention_mask=masks) + end_time = time.time() + print(f"Batch Size: {batch_size} Inference time: {end_time - start_time:.2f} seconds") + break + except RuntimeError as e: + if 'out of memory' in str(e).lower(): + print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.") + torch.cuda.empty_cache() + continue + else: + raise e + +def main(): + device = torch.device('cuda') + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertForPreTraining.from_pretrained("bert-base-uncased") + batch_sizes = [1024, 512, 256, 128, 64, 32, 16, 8] + inference_bert(model, tokenizer, batch_sizes, device) + +if __name__ == "__main__": + main() + diff --git a/hack/optimize/nvidia/requirements.txt b/hack/optimize/nvidia/requirements.txt new file mode 100644 index 000000000..2a36640ee --- /dev/null +++ b/hack/optimize/nvidia/requirements.txt @@ -0,0 +1,3 @@ +transformers==4.29 +numpy==1.23 +pynvml diff --git a/hack/optimize/nvidia/train_bert_nvidia.py b/hack/optimize/nvidia/train_bert_nvidia.py new file mode 100644 index 000000000..969ec4a6e --- /dev/null +++ b/hack/optimize/nvidia/train_bert_nvidia.py @@ -0,0 +1,75 @@ +import os +import time +import torch +from transformers import BertForPreTraining, BertTokenizer +from torch.utils.data import DataLoader, TensorDataset + +def create_dummy_data(tokenizer, num_samples=1000, max_length=128): + sentences = [ + f"This is a dummy sentence number {i}" for i in range(num_samples) + ] + tokenized_inputs = tokenizer( + sentences, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt", + ) + labels = tokenized_inputs.input_ids.detach().clone() + next_sentence_labels = torch.randint(0, 2, (num_samples,)) + return TensorDataset( + tokenized_inputs.input_ids, + tokenized_inputs.attention_mask, + labels, + next_sentence_labels, + ) + +def train_bert(model, tokenizer, batch_sizes, device): + model = model.to(device) + model.train() + + dataset = create_dummy_data(tokenizer) + for batch_size in batch_sizes: + try: + train_dataloader = DataLoader(dataset, batch_size=batch_size) + optimizer = 
torch.optim.AdamW(model.parameters(), lr=0.001)
+            for _ in range(2):
+                for batch in train_dataloader:
+                    optimizer.zero_grad()
+                    inputs, masks, labels, next_sentence_labels = batch
+                    inputs, masks, labels, next_sentence_labels = (
+                        inputs.to(device),
+                        masks.to(device),
+                        labels.to(device),
+                        next_sentence_labels.to(device),
+                    )
+                    outputs = model(
+                        input_ids=inputs,
+                        attention_mask=masks,
+                        labels=labels,
+                        next_sentence_label=next_sentence_labels,
+                    )
+                    loss = outputs.loss
+                    loss.backward()
+                    optimizer.step()
+                    break
+            print(f"Batch Size: {batch_size} Training complete.")
+            break
+        except RuntimeError as e:
+            if 'out of memory' in str(e).lower():
+                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
+                torch.cuda.empty_cache()
+                continue
+            else:
+                raise e
+
+def main():
+    device = torch.device('cuda')
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    model = BertForPreTraining.from_pretrained("bert-base-uncased")
+    batch_sizes = [1024, 512, 256, 128, 64, 32, 16, 8]
+    train_bert(model, tokenizer, batch_sizes, device)
+
+if __name__ == "__main__":
+    main()
+
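The optimization image added in the last patch is not wired into any CI job in this series, so a quick local smoke test is the likely way to exercise it. A minimal sketch, assuming an image tag of bert-nvidia-optimize and a GPU host with the NVIDIA container toolkit installed (both assumptions, not part of the patches):

  # Build the batch-size sweep image from the new directory
  docker build -t bert-nvidia-optimize -f hack/optimize/nvidia/Dockerfile hack/optimize/nvidia

  # Run the training sweep, then the inference sweep; each walks down the batch-size list on OOM
  docker run --rm --gpus all bert-nvidia-optimize python3.10 /app/train_bert_nvidia.py
  docker run --rm --gpus all bert-nvidia-optimize python3.10 /app/infer_bert_nvidia.py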