Skip to content

Commit 596925a

Browse files
authored
test: Update L0_io to work with only 2 GPUs instead of requiring 4 GPUs (#7947)
1 parent f7fe649 commit 596925a

File tree

2 files changed

+93
-3
lines changed

2 files changed

+93
-3
lines changed

qa/L0_io/gen_libtorch_model.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/python
2+
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
import os
import sys

import torch
import torch.nn as nn
30+
31+
32+
class SumModule(nn.Module):
    """Add two tensors after moving both of them to a fixed device.

    The device the inputs end up on is printed on every forward pass.
    """

    def __init__(self, device):
        """Remember ``device``, the target handed to ``Tensor.to`` in forward."""
        super().__init__()
        self.device = device

    def forward(self, INPUT0, INPUT1):
        """Return ``INPUT0 + INPUT1`` computed on ``self.device``."""
        lhs = INPUT0.to(self.device)
        rhs = INPUT1.to(self.device)
        # Report where each operand actually lives after the move.
        message = "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
            lhs.device, rhs.device
        )
        print(message)
        return lhs + rhs
46+
47+
48+
class DiffModule(nn.Module):
    """Subtract two tensors after moving both of them to a fixed device.

    The device the inputs end up on is printed on every forward pass.
    """

    def __init__(self, device):
        """Remember ``device``, the target handed to ``Tensor.to`` in forward."""
        super().__init__()
        self.device = device

    def forward(self, INPUT0, INPUT1):
        """Return ``INPUT0 - INPUT1`` computed on ``self.device``."""
        target = self.device
        minuend = INPUT0.to(target)
        subtrahend = INPUT1.to(target)
        # Report where each operand actually lives after the move.
        print(
            "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
                minuend.device, subtrahend.device
            )
        )
        return minuend - subtrahend
62+
63+
64+
class TestModel(nn.Module):
    """Two-branch model whose branches run on independently chosen devices.

    ``layer1`` is a SumModule bound to ``device0`` and ``layer2`` is a
    DiffModule bound to ``device1``; both receive the same pair of inputs.
    """

    def __init__(self, device0, device1):
        """Build the sum branch on ``device0`` and the diff branch on ``device1``."""
        super().__init__()
        self.device0 = device0
        self.device1 = device1

        self.layer1 = SumModule(device0)
        self.layer2 = DiffModule(device1)

    def forward(self, INPUT0, INPUT1):
        """Return the tuple ``(INPUT0 + INPUT1, INPUT0 - INPUT1)``."""
        total = self.layer1(INPUT0, INPUT1)
        delta = self.layer2(INPUT0, INPUT1)
        return total, delta
77+
78+
79+
def main():
    """Script the two TestModel variants and save them under ``models/``.

    Writes ``models/<model_name>/1/model.pt`` for:
      * ``libtorch_multi_gpu``    - sum branch on cuda:1, diff branch on cuda:0
      * ``libtorch_multi_device`` - sum branch on cpu,    diff branch on cuda:1

    Exits with status 1 when fewer than two CUDA devices are visible.
    """
    if torch.cuda.device_count() < 2:
        print("Need at least 2 GPUs to run this test")
        # sys.exit instead of the bare exit() helper, which is only installed
        # by the site module and is meant for interactive sessions.
        sys.exit(1)

    # model name -> (device for the sum branch, device for the diff branch)
    model_devices = {
        "libtorch_multi_gpu": ("cuda:1", "cuda:0"),
        "libtorch_multi_device": ("cpu", "cuda:1"),
    }

    for model_name, (device0, device1) in model_devices.items():
        version_dir = os.path.join("models", model_name, "1")
        # Do not rely on the caller having created the version directory.
        os.makedirs(version_dir, exist_ok=True)

        scripted_model = torch.jit.script(TestModel(device0, device1))
        scripted_model.save(os.path.join(version_dir, "model.pt"))


if __name__ == "__main__":
    main()

qa/L0_io/test.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
#
44
# Redistribution and use in source and binary forms, with or without
55
# modification, are permitted provided that the following conditions
@@ -38,7 +38,8 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then
3838
REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
3939
fi
4040

41-
export CUDA_VISIBLE_DEVICES=0,1,2,3
41+
# This test requires at least 2 GPUs to cover the host-to-device (h2d) and device-to-device (d2d) transfer combinations
42+
export CUDA_VISIBLE_DEVICES=0,1
4243

4344
IO_TEST_UTIL=./memory_alloc
4445
CLIENT_LOG="./client.log"
@@ -147,7 +148,6 @@ cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \
147148

148149
# prepare libtorch multi-device and multi-gpu models
149150
cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/.
150-
cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py
151151
mkdir -p $MODELSDIR/libtorch_multi_device/1
152152
mkdir -p $MODELSDIR/libtorch_multi_gpu/1
153153
cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.

0 commit comments

Comments
 (0)