Skip to content

Commit 672acef

Browse files
0x2b3bfa0restyled-commitscasperdcl
authored
Test with GPU and m+t4 machines (#533)
* Test with GPU and m+t4 machines * Restyled by whitespace * Use `Standard_NC4as_T4_v3` on AKS * Bump default disk size on iterative_task resource * Ditto for tests * Fix `az` NGC machine image version * Migrate Google Cloud GPU images to official * Remove redundant `grep` * Bump default disk to 50 GB * Ditto * Try Azure DSVM GPU images * Try which providers support unset disk size * Restyled by gofmt * Make disk size optional * Whoops! * Restyled by gofmt * Use disk_size > 0 everywhere * Fix `gcp` derp * Fix GCP GPU machines * Use `yes` because... why not? * Indent back script * Simplify test script error handling * Remove redundant `Storage` requirement * Avoid `mkdir` error if directory exists * Improve test verbosity & fail fast * Upgrade AWS DLAMI to CUDA 11.3 * Keep it simple * Test `m+k80` to see if `k8s` breaks * Fix `k8s` storage size * Restyled by gofmt * Ahem, ahem * Use `t4` again * Remove `k8s` granular GPU selectors * Fix last `k8s` issues 🤞 * Revert cluster instance change * Delete linux_amd64 * Avoid mkdir errors * Update task/k8s/resources/resource_job.go Co-authored-by: Casper da Costa-Luis <casper.dcl@physics.org> * docs: auto-disk_size * docs: nvidia images * nvidia description Co-authored-by: Restyled.io <commits@restyled.io> Co-authored-by: Casper da Costa-Luis <casper.dcl@physics.org>
1 parent b9cd04a commit 672acef

File tree

14 files changed

+56
-32
lines changed

14 files changed

+56
-32
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ resource "iterative_task" "example" {
7777
cloud = "aws" # or any of: gcp, az, k8s
7878
machine = "m" # medium. Or any of: l, xl, m+k80, xl+v100, ...
7979
spot = 0 # auto-price. Default -1 to disable, or >0 for hourly USD limit
80-
disk_size = 30 # GB
80+
disk_size = -1 # GB. Default -1 for automatic
8181
8282
storage {
8383
workdir = "." # default blank (don't upload)

docs/guides/getting-started.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ resource "iterative_task" "example" {
3838
cloud = "aws" # or any of: gcp, az, k8s
3939
machine = "m" # medium. Or any of: l, xl, m+k80, xl+v100, ...
4040
spot = 0 # auto-price. Default -1 to disable, or >0 for hourly USD limit
41-
disk_size = 30 # GB
41+
disk_size = -1 # GB. Default -1 for automatic
4242

4343
storage {
4444
workdir = "." # default blank (don't upload)

docs/resources/task.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ resource "iterative_task" "example" {
1515
machine = "m" # medium. Or any of: l, xl, m+k80, xl+v100, ...
1616
image = "ubuntu" # or "nvidia", ...
1717
region = "us-west" # or "us-east", "eu-west", ...
18-
disk_size = 30 # GB
18+
disk_size = -1 # GB. Default -1 for automatic
1919
spot = 0 # auto-price. Default -1 to disable, or >0 for hourly USD limit
2020
parallelism = 1
2121
timeout = 24*60*60 # max 24h before forced termination
@@ -56,7 +56,7 @@ resource "iterative_task" "example" {
5656

5757
- `region` - (Optional) [Cloud region/zone](#cloud-region) to run the task on.
5858
- `machine` - (Optional) See [Machine Types](#machine-type) below.
59-
- `disk_size` - (Optional) Size of the ephemeral machine storage in GB.
59+
- `disk_size` - (Optional) Size of the ephemeral machine storage in GB. `-1`: automatic based on `image`.
6060
- `spot` - (Optional) Spot instance price. `-1`: disabled, `0`: automatic price, any other positive number: maximum bidding price in USD per hour (above which the instance is terminated until the price drops).
6161
- `image` - (Optional) [Machine image](#machine-image) to run the task with.
6262
- `parallelism` - (Optional) Number of machines to be launched in parallel.
@@ -169,7 +169,7 @@ In addition to generic types, it's possible to specify any machine type supporte
169169
The Iterative Provider offers some common machine images which are roughly the same for all supported clouds.
170170

171171
- `ubuntu` - Official [Ubuntu LTS](https://wiki.ubuntu.com/LTS) image (currently 20.04).
172-
- `nvidia` - Official [NVIDIA NGC](https://docs.nvidia.com/ngc/ngc-deploy-public-cloud)-based images, typically needing `disk_size = 32` GB or more.
172+
- `nvidia` - Official Ubuntu LTS with NVIDIA GPU drivers and CUDA toolkit (currently 11.3).
173173

174174
### Cloud-specific
175175

iterative/resource_task.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func resourceTask() *schema.Resource {
5555
Type: schema.TypeInt,
5656
ForceNew: true,
5757
Optional: true,
58-
Default: 30,
58+
Default: -1,
5959
},
6060
"spot": {
6161
Type: schema.TypeFloat,

task/aws/resources/data_source_image.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ func (i *Image) Read(ctx context.Context) error {
3434
image := i.Identifier
3535
images := map[string]string{
3636
"ubuntu": "ubuntu@099720109477:x86_64:*ubuntu/images/hvm-ssd/ubuntu-focal-20.04*",
37-
"nvidia": "ubuntu@898082745236:x86_64:Deep Learning AMI GPU CUDA 11.2.1 (Ubuntu 20.04) 20220306",
37+
"nvidia": "ubuntu@898082745236:x86_64:Deep Learning AMI GPU CUDA 11.3.1 (Ubuntu 20.04) 20220303",
3838
}
3939
if val, ok := images[image]; ok {
4040
image = val

task/aws/resources/resource_launch_template.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ func (l *LaunchTemplate) Create(ctx context.Context) error {
8686
Ebs: &types.LaunchTemplateEbsBlockDeviceRequest{
8787
DeleteOnTermination: aws.Bool(true),
8888
Encrypted: aws.Bool(false),
89-
VolumeSize: aws.Int32(int32(l.Attributes.Size.Storage)),
9089
VolumeType: types.VolumeType("gp2"),
9190
},
9291
},
@@ -110,6 +109,10 @@ func (l *LaunchTemplate) Create(ctx context.Context) error {
110109
},
111110
}
112111

112+
if size := l.Attributes.Size.Storage; size > 0 {
113+
input.LaunchTemplateData.BlockDeviceMappings[0].Ebs.VolumeSize = aws.Int32(int32(size))
114+
}
115+
113116
if _, err := l.Client.Services.EC2.CreateLaunchTemplate(ctx, &input); err != nil {
114117
var e smithy.APIError
115118
if errors.As(err, &e) && e.ErrorCode() == "InvalidLaunchTemplateName.AlreadyExistsException" {

task/az/resources/resource_virtual_machine_scale_set.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func (v *VirtualMachineScaleSet) Create(ctx context.Context) error {
8585
image := v.Attributes.Environment.Image
8686
images := map[string]string{
8787
"ubuntu": "ubuntu@Canonical:0001-com-ubuntu-server-focal:20_04-lts:latest",
88-
"nvidia": "ubuntu@nvidia:ngc_base_image_version_b:gen2_21-11-0:latest#plan",
88+
"nvidia": "ubuntu@microsoft-dsvm:ubuntu-2004:2004-gen2:latest",
8989
}
9090
if val, ok := images[image]; ok {
9191
image = val
@@ -145,7 +145,6 @@ func (v *VirtualMachineScaleSet) Create(ctx context.Context) error {
145145
OsDisk: &compute.VirtualMachineScaleSetOSDisk{
146146
Caching: compute.CachingTypesReadWrite,
147147
CreateOption: compute.DiskCreateOptionTypesFromImage,
148-
DiskSizeGB: to.Int32Ptr(int32(v.Attributes.Size.Storage)),
149148
ManagedDisk: &compute.VirtualMachineScaleSetManagedDiskParameters{
150149
StorageAccountType: compute.StorageAccountTypesStandardLRS,
151150
},
@@ -192,6 +191,10 @@ func (v *VirtualMachineScaleSet) Create(ctx context.Context) error {
192191
},
193192
}
194193

194+
if size := v.Attributes.Size.Storage; size > 0 {
195+
settings.VirtualMachineScaleSetProperties.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB = to.Int32Ptr(int32(size))
196+
}
197+
195198
if plan == "#plan" {
196199
settings.Plan = &compute.Plan{
197200
Publisher: to.StringPtr(publisher),

task/common/machine/script.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ fi
102102
103103
rclone copy "$RCLONE_REMOTE/data" /tmp/tpi-task
104104
105+
yes | /etc/profile.d/install-driver-prompt.sh # for GCP GPU machines
106+
105107
sudo systemctl daemon-reload
106108
sudo systemctl enable tpi-task.service --now
107109
sudo systemctl disable --now apt-daily.timer

task/gcp/resources/data_source_image.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ func (i *Image) Read(ctx context.Context) error {
3232
image := i.Identifier
3333
images := map[string]string{
3434
"ubuntu": "ubuntu@ubuntu-os-cloud/ubuntu-2004-lts",
35-
"nvidia": "ubuntu@nvidia-ngc-public/nvidia-gpu-cloud-image-20211105",
35+
"nvidia": "ubuntu@deeplearning-platform-release/common-cu113-ubuntu-2004",
3636
}
3737
if val, ok := images[image]; ok {
3838
image = val

task/gcp/resources/resource_instance_template.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ func (i *InstanceTemplate) Create(ctx context.Context) error {
131131
Mode: "READ_WRITE",
132132
InitializeParams: &compute.AttachedDiskInitializeParams{
133133
SourceImage: i.Dependencies.Image.Resource.SelfLink,
134-
DiskSizeGb: int64(i.Attributes.Size.Storage),
135134
DiskType: "pd-balanced",
136135
},
137136
},
@@ -171,6 +170,10 @@ func (i *InstanceTemplate) Create(ctx context.Context) error {
171170
},
172171
}
173172

173+
if size := i.Attributes.Size.Storage; size > 0 {
174+
definition.Properties.Disks[0].InitializeParams.DiskSizeGb = int64(size)
175+
}
176+
174177
insertOperation, err := i.Client.Services.Compute.InstanceTemplates.Insert(i.Client.Credentials.ProjectID, definition).Do()
175178
if err != nil {
176179
if strings.HasSuffix(err.Error(), "alreadyExists") {

0 commit comments

Comments
 (0)