
Commit 502e1fc

Update on "[3/3] Add build rule and test for Graph in nativert"
We split the large PR for adding Graph.h and Graph.cpp to nativert into 3 smaller PRs:

1. Add header file
2. Add source file
3. **Add test and build rules**

Torch Native Runtime RFC: pytorch/rfcs#72

Four classes have been introduced: `Graph`, `Node`, `Value`, `Type`.

- `Type` represents the kind of a `Value`.
- `Value` represents a single symbolic value; it can be of any kind defined in `Type`. Values are the inputs and outputs of a `Node`.
- `Node` represents a single unit of execution, typically a PyTorch op.
- `Graph` represents a model's computation graph and is designed to facilitate transformation and analysis.

Differential Revision: [D75495273](https://our.internmc.facebook.com/intern/diff/D75495273/)

[ghstack-poisoned]
2 parents e4edea8 + 48246d0 commit 502e1fc
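For orientation, here is a minimal sketch of how the four classes described in the commit message might relate. The names and members below are illustrative assumptions only, not the actual `Graph.h` API added by this stack:

```cpp
#include <memory>
#include <string>
#include <vector>

// Illustrative sketch only; not the nativert implementation.
enum class Type { Tensor, SymInt, TensorList, None };

struct Value {
  std::string name;
  Type type;            // the kind of this symbolic value
};

struct Node {
  std::string target;             // typically a PyTorch op, e.g. "aten.add.Tensor"
  std::vector<Value*> inputs;     // values consumed by this unit of execution
  std::vector<Value*> outputs;    // values it produces
};

struct Graph {
  std::vector<std::unique_ptr<Value>> values;  // owns all symbolic values
  std::vector<std::unique_ptr<Node>> nodes;    // units of execution, in order
  std::vector<Value*> inputs;                  // graph-level inputs
  std::vector<Value*> outputs;                 // graph-level outputs
};
```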


163 files changed: +2,704 −1,677 lines


.ci/pytorch/test.sh

Lines changed: 13 additions & 13 deletions
@@ -820,16 +820,7 @@ test_inductor_torchbench_smoketest_perf() {
   done
 }
 
-test_inductor_get_core_number() {
-  if [[ "${TEST_CONFIG}" == *aarch64* ]]; then
-    echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
-  else
-    echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
-  fi
-}
-
 test_inductor_set_cpu_affinity(){
-  #set jemalloc
   JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
   export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
   export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
@@ -841,14 +832,23 @@ test_inductor_set_cpu_affinity(){
     export KMP_AFFINITY=granularity=fine,compact,1,0
     export KMP_BLOCKTIME=1
   fi
-  cores=$(test_inductor_get_core_number)
-  # Set number of cores to 16 on Aarch64 for performance runs.
+
+  # Use nproc here instead of lscpu because it takes into account cgroups slice
+  cpus=$(nproc)
+  thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}')
+  cores=$((cpus / thread_per_core))
+
+  # Set number of cores to 16 on aarch64 for performance runs
   if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
     cores=16
   fi
   export OMP_NUM_THREADS=$cores
-  end_core=$((cores-1))
-  export TASKSET="taskset -c 0-$end_core"
+
+  # Handle cgroups slice start and end CPU
+  start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))')
+  # Leaving one physical CPU for other tasks
+  end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core))
+  export TASKSET="taskset -c $start_cpu-$end_cpu"
 }
 
 test_inductor_torchbench_cpu_smoketest_perf(){
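The new affinity logic above derives the taskset range from the process scheduler affinity mask (via `os.sched_getaffinity(0)`) instead of from raw `lscpu` topology, so cgroup and taskset restrictions are respected. A minimal standalone C++ sketch of the same idea, assuming a Linux host (illustration only, not part of the change):

```cpp
// Illustration: compute the allowed-CPU range from the process affinity mask,
// analogous to the min/max sched_getaffinity calls in the script above.
#include <sched.h>
#include <cstdio>

int main() {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
    std::perror("sched_getaffinity");
    return 1;
  }
  int first = -1, last = -1, allowed = 0;
  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
    if (CPU_ISSET(cpu, &mask)) {
      if (first < 0) first = cpu;
      last = cpu;
      ++allowed;
    }
  }
  // Roughly what the script exports as TASKSET="taskset -c $start_cpu-$end_cpu"
  // (the script additionally leaves one physical core free at the top end).
  std::printf("%d CPUs allowed; taskset -c %d-%d\n", allowed, first, last);
  return 0;
}
```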

.github/requirements/pip-requirements-macOS.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 boto3==1.35.42
-cmake==3.25.*
+cmake==3.27.*
 expecttest==0.3.0
 fbscribelogger==0.1.7
 filelock==3.6.0

.github/workflows/inductor-rocm-mi300.yml

Lines changed: 4 additions & 4 deletions
@@ -38,12 +38,12 @@ jobs:
       opt_out_experiments: lf
 
   linux-jammy-rocm-py3_10-inductor-build:
-    name: rocm-py3.10-inductor
+    name: rocm-py3.10-inductor-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       test-matrix: |
         { include: [
@@ -56,11 +56,11 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    name: rocm-py3.10-inductor
+    name: rocm-py3.10-inductor-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs: linux-jammy-rocm-py3_10-inductor-build
     with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }}
     secrets: inherit

.github/workflows/periodic-rocm-mi300.yml

Lines changed: 4 additions & 4 deletions
@@ -50,12 +50,12 @@ jobs:
       curr_ref_type: ${{ github.ref_type }}
 
   linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       test-matrix: |
         { include: [
@@ -69,13 +69,13 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs:
       - linux-jammy-rocm-py3_10-build
       - target-determination
     with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
     secrets: inherit

.github/workflows/rocm-mi300.yml

Lines changed: 4 additions & 4 deletions
@@ -38,12 +38,12 @@ jobs:
 
   linux-jammy-rocm-py3_10-build:
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       sync-tag: rocm-build
       test-matrix: |
@@ -61,13 +61,13 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs:
       - linux-jammy-rocm-py3_10-build
       - target-determination
     with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
     secrets: inherit

.lintrunner.toml

Lines changed: 0 additions & 13 deletions
@@ -1160,12 +1160,6 @@ exclude_patterns = [
     'torch/_inductor/autoheuristic/artifacts/**',
     # These files are all grandfathered in, feel free to remove from this list
     # as necessary
-    'test/_nvfuser/__init__.py',
-    'test/_nvfuser/test_dynamo.py',
-    'test/_nvfuser/test_python_frontend.py',
-    'test/_nvfuser/test_torchscript.py',
-    'test/delete.py',
-    'test/expect/__init__.py',
     'test/quantization/__init__.py',
     'test/quantization/core/__init__.py',
     'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py',
@@ -1322,12 +1316,6 @@ exclude_patterns = [
     'torch/_export/passes/const_prop_pass.py',
     'torch/_export/passes/functionalize_side_effectful_ops_pass.py',
     'torch/_export/passes/replace_sym_size_ops_pass.py',
-    'torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py',
-    'torch/_export/serde/__init__.py',
-    'torch/_export/serde/schema.py',
-    'torch/_export/serde/serialize.py',
-    'torch/_export/serde/upgrade.py',
-    'torch/_export/trace.py',
     'torch/testing/_internal/__init__.py',
     'torch/testing/_internal/autocast_test_lists.py',
     'torch/testing/_internal/autograd_function_db.py',
@@ -1444,7 +1432,6 @@ exclude_patterns = [
     'torch/utils/throughput_benchmark.py',
     'torch/utils/viz/__init__.py',
     'torch/utils/viz/_cycles.py',
-    'torch/utils/weak.py',
 ]
 init_command = [
     'python3',

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -1,11 +1,12 @@
-cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
 # cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
 
 # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
 # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
 # one is detected as "AppleClang".
 cmake_policy(SET CMP0010 NEW)
 cmake_policy(SET CMP0025 NEW)
+cmake_policy(SET CMP0126 OLD)
 
 # Enables CMake to set LTO on compilers other than Intel.
 cmake_policy(SET CMP0069 NEW)

WORKSPACE

Lines changed: 6 additions & 0 deletions
@@ -184,6 +184,12 @@ new_local_repository(
     path = "third_party/nlohmann",
 )
 
+new_local_repository(
+    name = "moodycamel",
+    build_file = "//third_party:moodycamel.BUILD",
+    path = "third_party/concurrentqueue",
+)
+
 new_local_repository(
     name = "tensorpipe",
     build_file = "//third_party:tensorpipe.BUILD",

aten/src/ATen/cuda/CUDADataType.h

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
       return CUDA_R_64I;
     case c10::ScalarType::BFloat16:
       return CUDA_R_16BF;
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 60300
     case c10::ScalarType::Float8_e4m3fn:
       return CUDA_R_8F_E4M3;
     case c10::ScalarType::Float8_e5m2:

aten/src/ATen/cuda/CUDAGraph.cpp

Lines changed: 3 additions & 3 deletions
@@ -139,7 +139,7 @@ void CUDAGraph::capture_end() {
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
   // cudaGraphInstantiateWithFlags
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
-#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
+#if !defined(USE_ROCM) || ROCM_VERSION >= 60200
   int version = 0;
   AT_CUDA_CHECK(cudaDriverGetVersion(&version));
   if (version < 11040) {
@@ -154,7 +154,7 @@ void CUDAGraph::capture_end() {
 #endif
   //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
   //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
-#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
+#if !defined(USE_ROCM) || ROCM_VERSION >= 60200
   } else {
     AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                 graph_,
@@ -216,7 +216,7 @@ void CUDAGraph::enable_debug_mode() {
 }
 
 void CUDAGraph::debug_dump(const std::string& debug_path) {
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
   if (_cuda_graphs_debug) {
     TORCH_WARN("DEBUG: calling debug_dump()");
     if (has_graph_) {

aten/src/ATen/cuda/Exceptions.h

Lines changed: 1 addition & 5 deletions
@@ -117,15 +117,11 @@ constexpr const char* _cusolver_backend_suggestion = \
     "linear algebra operators with other supported backends. " \
     "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";
 
-// When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
 // When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
 #define TORCH_CUSOLVER_CHECK(EXPR) \
   do { \
     cusolverStatus_t __err = EXPR; \
-    if ((CUDA_VERSION < 11500 && \
-         __err == CUSOLVER_STATUS_EXECUTION_FAILED) || \
-        (CUDA_VERSION >= 11500 && \
-         __err == CUSOLVER_STATUS_INVALID_VALUE)) { \
+    if (__err == CUSOLVER_STATUS_INVALID_VALUE) { \
      TORCH_CHECK_LINALG( \
          false, \
          "cusolver error: ", \

aten/src/ATen/cuda/cub.cuh

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
 #endif
 }
 
-# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)
+# if defined(CUDA_VERSION) || defined(USE_ROCM)
 
 template<typename T>
 struct BlockPrefixCallbackOp

aten/src/ATen/cuda/detail/LazyNVRTC.cpp

Lines changed: 0 additions & 2 deletions
@@ -146,10 +146,8 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
 NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
 NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
 NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
 NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
 NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
-#endif
 NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
 _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
 NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ namespace at::cuda {
 AT_FORALL_NVRTC_BASE(_)
 #endif
 
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
+#if defined(CUDA_VERSION)
 #define AT_FORALL_NVRTC(_) \
   AT_FORALL_NVRTC_EXTENDED(_) \
   _(nvrtcGetCUBINSize) \

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 3 additions & 12 deletions
@@ -359,7 +359,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
   c10::MaybeOwned<Tensor> self_;
   if (&result != &self) {
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
     // Strangely, if mat2 has only 1 row or column, we get
     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
@@ -495,15 +495,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   }
 #else
   auto activation_epilogue = activation_to_gemm_and_blas_arg(activation);
-#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11080))
-  // GELU is not supported (and does not compile!) prior
-  // to CUDA 11.4. Have observed accuracy issues with
-  // GELU epilogue in 11.4; disabling the GELU epilogue
-  // path for CUDA version < 11.8.
-  if (activation == Activation::GELU)
-    activation_epilogue = cuda::blas::GEMMAndBiasActivationEpilogue::None;
-#endif
-
   bool okay = true;
   if (is_float_output_with_half_input) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(
@@ -646,7 +637,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   // gating activation_to_gemm_and_blas_arg above; here we are manually
   // performing a post-GELU because we weren't able to use the GELU
   // epilogue above.
-#if !(defined(CUDA_VERSION) && CUDA_VERSION >= 11080) && !defined(USE_ROCM)
+#if !defined(CUDA_VERSION) && !defined(USE_ROCM)
   if (useLtInterface && activation == Activation::GELU) {
     at::gelu_(const_cast<Tensor&>(*args.result), "tanh");
   }
@@ -1017,7 +1008,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result)
 
   TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");
 
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
   cublasCommonArgs args(self, mat2, result);
 
   at::cuda::blas::int8_gemm(

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 2 additions & 2 deletions
@@ -345,8 +345,8 @@ static inline void launch_vectorized_kernel(
     auto output_calc = TrivialOffsetCalculator<1>();
     auto loader = memory::LoadWithoutCast();
     auto storer = memory::StoreWithoutCast();
-    int64_t grid_unrolled = (N + io_block_work_size<io_size>() - 1) / io_block_work_size<io_size>();
-    unrolled_elementwise_kernel<func_t, array_t, elems_per_thread<io_size>()>
+    int64_t grid_unrolled = (N + elementwise_block_work_size() - 1) / elementwise_block_work_size();
+    unrolled_elementwise_kernel<func_t, array_t, elementwise_thread_work_size()>
         <<<grid_unrolled, num_threads(), 0, stream>>>(
             N, f, data, input_calc, output_calc, loader, storer);
     C10_CUDA_KERNEL_LAUNCH_CHECK();

aten/src/ATen/native/cuda/MixedDtypesLinear.cu

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/cuda/CUDAUtils.h>
 
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
 // Doesn't work on ROCm or Windows yet
 // TODO: Add compiler warning? Add PyTorch config flag?
 #else
@@ -20,7 +20,7 @@
 #include <ATen/native/cuda/cutlass_extensions/gemm/threadblock/default_mma.h>
 #endif
 
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
 // Doesn't work on ROCm or Windows yet
 #else
 #define CUTLASS_STATUS_CHECK(status) \
@@ -32,7 +32,7 @@
 
 namespace at::native {
 
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
 // Doesn't work on ROCm or Windows yet or old compiler
 #else
 template<typename ElementInputA, typename ElementInputB, typename EpilogueTag>
@@ -198,7 +198,7 @@ _mixed_dtypes_linear(const Tensor& input, const Tensor& weight,
                      const Tensor& scale,
                      const std::optional<Tensor>& bias_opt,
                      const std::optional<std::string_view> activation_opt) {
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
   TORCH_CHECK(false, "_mixed_dtypes_linear: not compiled for this platform");
   return Tensor{};
 #else

aten/src/ATen/native/cuda/Nonzero.cu

Lines changed: 1 addition & 1 deletion
@@ -300,7 +300,7 @@ void nonzero_static_cuda_out_impl(
     int64_t size,
     int64_t fill_value,
     Tensor& out) {
-# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
 
   Tensor self_contiguous_ = self.contiguous();
   // see comment in nonzero_cuda_out_impl on reqs for out