Add Batch Optimization Scripts for NVIDIA Instances #498

Open
wants to merge 23 commits into base: main

Changes from all commits (23 commits)
af9fda0
Add python training script, requirements.txt (dependencies), and dock…
mattcjo Jun 26, 2024
104fa93
Add github action to build bert-testing image on PR
mattcjo Jun 26, 2024
477f672
Specify directory the BERT training image should be built in for the …
mattcjo Jun 26, 2024
fb7d18f
Add default values and include in docker env for MASTER_ADDR and MAST…
mattcjo Jun 27, 2024
b5aedc7
Slightly change env var value retrieval. Also ran a formatter to pret…
mattcjo Jun 27, 2024
7f9480b
Update bert training dockerfile to include amazon specific packages f…
mattcjo Jun 28, 2024
19613e1
Change Dockerfile.bert-training file name to just Dockerfile
mattcjo Jul 16, 2024
974da50
Update git workflow to use new Dockerfile path since the name was upd…
mattcjo Jul 16, 2024
5b4ae1a
Update Docker image to use Python version 3.10.12 and build from sour…
mattcjo Jul 16, 2024
6bc3ef4
Merge remote-tracking branch 'upstream/main'
mattcjo Jul 16, 2024
fa8d244
Remove extra line
mattcjo Jul 16, 2024
f87ba65
Had been setting MASTER_ADDR and MASTER_PORT env vars twice. Removed …
mattcjo Jul 18, 2024
7af6b13
Set each process to a GPU via local rank instead of overall rank
mattcjo Jul 18, 2024
1a3ad52
Merge remote-tracking branch 'upstream/main'
mattcjo Jul 18, 2024
1f5b1c9
Change comment describing section in dockerfile
mattcjo Jul 19, 2024
b67026c
Merge branch 'aws:main' into main
mattcjo Jul 23, 2024
4a8e0ec
parameterize number of gpus per node in Dockerfile and train.py
mattcjo Jul 23, 2024
60ddc02
Merge remote-tracking branch 'upstream/main'
mattcjo Jul 31, 2024
01d8270
formatting in train.py
mattcjo Jul 31, 2024
21fd336
Merge remote-tracking branch 'upstream/main'
mattcjo Aug 7, 2024
f250ede
Merge branch 'aws:main' into main
mattcjo Aug 30, 2024
f000ec6
Add nvidia batch optimization scripts for both training and inference
mattcjo Oct 11, 2024
1907a82
Merge branch 'aws:main' into batch-optimization-nvidia
mattcjo Oct 11, 2024
27 changes: 27 additions & 0 deletions hack/optimize/nvidia/Dockerfile
@@ -0,0 +1,27 @@
# Use NVIDIA CUDA base image
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Install dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3.10-distutils \
    curl && \
    rm -rf /var/lib/apt/lists/*

# Install pip
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10

# Set up Python environment and install torch with CUDA support
WORKDIR /app
RUN python3.10 -m pip install --upgrade pip && \
    python3.10 -m pip install --no-cache-dir torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html

# Install additional dependencies
COPY requirements.txt .
RUN python3.10 -m pip install --no-cache-dir -r requirements.txt

# Copy scripts into the container
COPY train_bert_nvidia.py /app/train_bert_nvidia.py
COPY infer_bert_nvidia.py /app/infer_bert_nvidia.py
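
A note on usage (not part of the diff): the image installs torch with CUDA support but defines no ENTRYPOINT or CMD, so the scripts are invoked explicitly inside the container. A minimal sketch, assuming the container is started with GPU access (for example via the NVIDIA container runtime), that confirms torch can see the GPUs before launching the benchmarks:

# cuda_check.py -- hypothetical sanity check, not included in this PR.
# Assumes the container was started with GPU access (e.g. NVIDIA container runtime).
import torch

if not torch.cuda.is_available():
    raise SystemExit("CUDA is not visible inside the container")

print(f"Visible GPUs: {torch.cuda.device_count()}")
print(f"Device 0: {torch.cuda.get_device_name(0)}")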

57 changes: 57 additions & 0 deletions hack/optimize/nvidia/infer_bert_nvidia.py
@@ -0,0 +1,57 @@
import os
import time
import torch
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
    sentences = [
        f"This is a dummy sentence number {i}" for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return TensorDataset(
        tokenized_inputs.input_ids,
        tokenized_inputs.attention_mask
    )

def inference_bert(model, tokenizer, batch_sizes, device):
    model = model.to(device)
    model.eval()

    dataset = create_dummy_data(tokenizer)
    for batch_size in batch_sizes:
        try:
            inference_dataloader = DataLoader(dataset, batch_size=batch_size)
            start_time = time.time()
            with torch.no_grad():
                for batch in inference_dataloader:
                    inputs, masks = batch
                    inputs, masks = inputs.to(device), masks.to(device)
                    outputs = model(input_ids=inputs, attention_mask=masks)
            end_time = time.time()
            print(f"Batch Size: {batch_size} Inference time: {end_time - start_time:.2f} seconds")
            break
        except RuntimeError as e:
            if 'out of memory' in str(e).lower():
                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
                torch.cuda.empty_cache()
                continue
            else:
                raise e

def main():
    device = torch.device('cuda')
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")
    batch_sizes = [1024, 512, 256, 128, 64, 32, 16, 8]
    inference_bert(model, tokenizer, batch_sizes, device)

if __name__ == "__main__":
    main()
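
The inference loop is timed with time.time(). Because CUDA kernel launches are asynchronous, the last batch's work may still be in flight when the end timestamp is taken, so the reported seconds can undercount GPU time. A minimal sketch (hypothetical, not part of this PR) of event-based timing that synchronizes before reading the result:

# timing_sketch.py -- hypothetical helper, not part of this PR.
# Times a GPU operation with CUDA events; assumes a CUDA device is available.
import torch

def time_cuda(fn):
    """Return the elapsed GPU time of fn() in seconds, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    fn()
    end.record()
    torch.cuda.synchronize()  # wait for all queued work before reading the timer
    return start.elapsed_time(end) / 1000.0  # elapsed_time returns milliseconds

# Illustrative use: elapsed = time_cuda(lambda: model(input_ids=inputs, attention_mask=masks))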

3 changes: 3 additions & 0 deletions hack/optimize/nvidia/requirements.txt
@@ -0,0 +1,3 @@
transformers==4.29
numpy==1.23
pynvml
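
pynvml is pinned here but not imported by either script; the batch-size search relies solely on catching out-of-memory RuntimeErrors. A minimal sketch (hypothetical, not part of this PR) of how pynvml could report device memory headroom alongside the benchmark output, assuming a single visible GPU at index 0:

# gpu_mem_report.py -- hypothetical helper, not part of this PR.
# Uses pynvml (pinned above) to report memory usage on GPU 0.
import pynvml

def report_gpu_memory(device_index=0):
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        used_gib = info.used / 1024**3
        total_gib = info.total / 1024**3
        print(f"GPU {device_index}: {used_gib:.2f} GiB used of {total_gib:.2f} GiB")
    finally:
        pynvml.nvmlShutdown()

if __name__ == "__main__":
    report_gpu_memory()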
75 changes: 75 additions & 0 deletions hack/optimize/nvidia/train_bert_nvidia.py
@@ -0,0 +1,75 @@
import os
import time
import torch
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
    sentences = [
        f"This is a dummy sentence number {i}" for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = tokenized_inputs.input_ids.detach().clone()
    next_sentence_labels = torch.randint(0, 2, (num_samples,))
    return TensorDataset(
        tokenized_inputs.input_ids,
        tokenized_inputs.attention_mask,
        labels,
        next_sentence_labels,
    )

def train_bert(model, tokenizer, batch_sizes, device):
    model = model.to(device)
    model.train()

    dataset = create_dummy_data(tokenizer)
    for batch_size in batch_sizes:
        try:
            train_dataloader = DataLoader(dataset, batch_size=batch_size)
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
            for _ in range(2):
                for batch in train_dataloader:
                    optimizer.zero_grad()
                    inputs, masks, labels, next_sentence_labels = batch
                    inputs, masks, labels, next_sentence_labels = (
                        inputs.to(device),
                        masks.to(device),
                        labels.to(device),
                        next_sentence_labels.to(device),
                    )
                    outputs = model(
                        input_ids=inputs,
                        attention_mask=masks,
                        labels=labels,
                        next_sentence_label=next_sentence_labels,
                    )
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()
                    break
            print(f"Batch Size: {batch_size} Training complete.")
            break
        except RuntimeError as e:
            if 'out of memory' in str(e).lower():
                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
                torch.cuda.empty_cache()
                continue
            else:
                raise e

def main():
    device = torch.device('cuda')
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")
    batch_sizes = [1024, 512, 256, 128, 64, 32, 16, 8]
    train_bert(model, tokenizer, batch_sizes, device)

if __name__ == "__main__":
    main()
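
Each script prints the largest batch size that completes without an out-of-memory error and then stops; the value is only logged, not returned. A hedged sketch (hypothetical refactor, not part of this PR) of the same search factored into a helper that returns the winning batch size so it could be reused by both the training and inference paths:

# find_max_batch_size.py -- hypothetical refactor of the OOM search above, not part of this PR.
import torch

def find_max_batch_size(run_step, batch_sizes):
    """Call run_step(batch_size) from largest to smallest; return the first size that fits."""
    for batch_size in sorted(batch_sizes, reverse=True):
        try:
            run_step(batch_size)
            return batch_size
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                torch.cuda.empty_cache()
                continue
            raise
    return None

# Illustrative use: max_bs = find_max_batch_size(lambda bs: train_bert(model, tokenizer, [bs], device), batch_sizes)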
