
Commit 502e1fc

Update on "[3/3] Add build rule and test for Graph in nativert"
We split the large PR for adding Graph.h and Graph.cpp to nativert into 3 smaller PRs:

1. Add header file
2. Add source file
3. **Add test and build rules**

Torch Native Runtime RFC: pytorch/rfcs#72

Four classes have been introduced: `Graph`, `Node`, `Value`, `Type`.

- `Type` represents the kind of a `Value`.
- `Value` represents a single symbolic value; it can be of any kind defined in `Type`. Values are the inputs and outputs of a `Node`.
- `Node` represents a single unit of execution, typically a PyTorch op.
- `Graph` represents a model's computation graph and is designed to facilitate transformation and analysis.

Differential Revision: [D75495273](https://our.internmc.facebook.com/intern/diff/D75495273/)

[ghstack-poisoned]
2 parents e4edea8 + 48246d0 commit 502e1fc
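For orientation, here is a minimal sketch of how the four classes described in the commit message might relate. The names and members below are illustrative assumptions only, not the actual `Graph.h` API added by this stack:

```cpp
#include <memory>
#include <string>
#include <vector>

// Illustrative sketch only; not the nativert implementation.
enum class Type { Tensor, SymInt, TensorList, None };

struct Value {
  std::string name;
  Type type;            // the kind of this symbolic value
};

struct Node {
  std::string target;             // typically a PyTorch op, e.g. "aten.add.Tensor"
  std::vector<Value*> inputs;     // values consumed by this unit of execution
  std::vector<Value*> outputs;    // values it produces
};

struct Graph {
  std::vector<std::unique_ptr<Value>> values;  // owns all symbolic values
  std::vector<std::unique_ptr<Node>> nodes;    // units of execution, in order
  std::vector<Value*> inputs;                  // graph-level inputs
  std::vector<Value*> outputs;                 // graph-level outputs
};
```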


163 files changed: +2,704 −1,677 lines


.ci/pytorch/test.sh

Lines changed: 13 additions & 13 deletions
@@ -820,16 +820,7 @@ test_inductor_torchbench_smoketest_perf() {
   done
 }
 
-test_inductor_get_core_number() {
-  if [[ "${TEST_CONFIG}" == *aarch64* ]]; then
-    echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
-  else
-    echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
-  fi
-}
-
 test_inductor_set_cpu_affinity(){
-  #set jemalloc
   JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
   export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
   export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
@@ -841,14 +832,23 @@ test_inductor_set_cpu_affinity(){
     export KMP_AFFINITY=granularity=fine,compact,1,0
     export KMP_BLOCKTIME=1
   fi
-  cores=$(test_inductor_get_core_number)
-  # Set number of cores to 16 on Aarch64 for performance runs.
+
+  # Use nproc here instead of lscpu because it takes into account cgroups slice
+  cpus=$(nproc)
+  thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}')
+  cores=$((cpus / thread_per_core))
+
+  # Set number of cores to 16 on aarch64 for performance runs
   if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
     cores=16
   fi
   export OMP_NUM_THREADS=$cores
-  end_core=$((cores-1))
-  export TASKSET="taskset -c 0-$end_core"
+
+  # Handle cgroups slice start and end CPU
+  start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))')
+  # Leaving one physical CPU for other tasks
+  end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core))
+  export TASKSET="taskset -c $start_cpu-$end_cpu"
 }
 
 test_inductor_torchbench_cpu_smoketest_perf(){
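The new affinity logic above derives the taskset range from the process scheduler affinity mask (via `os.sched_getaffinity(0)`) instead of from raw `lscpu` topology, so cgroup and taskset restrictions are respected. A minimal standalone C++ sketch of the same idea, assuming a Linux host (illustration only, not part of the change):

```cpp
// Illustration: compute the allowed-CPU range from the process affinity mask,
// analogous to the min/max sched_getaffinity calls in the script above.
#include <sched.h>
#include <cstdio>

int main() {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
    std::perror("sched_getaffinity");
    return 1;
  }
  int first = -1, last = -1, allowed = 0;
  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
    if (CPU_ISSET(cpu, &mask)) {
      if (first < 0) first = cpu;
      last = cpu;
      ++allowed;
    }
  }
  // Roughly what the script exports as TASKSET="taskset -c $start_cpu-$end_cpu"
  // (the script additionally leaves one physical core free at the top end).
  std::printf("%d CPUs allowed; taskset -c %d-%d\n", allowed, first, last);
  return 0;
}
```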

.github/requirements/pip-requirements-macOS.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 boto3==1.35.42
-cmake==3.25.*
+cmake==3.27.*
 expecttest==0.3.0
 fbscribelogger==0.1.7
 filelock==3.6.0

.github/workflows/inductor-rocm-mi300.yml

Lines changed: 4 additions & 4 deletions
@@ -38,12 +38,12 @@ jobs:
       opt_out_experiments: lf
 
   linux-jammy-rocm-py3_10-inductor-build:
-    name: rocm-py3.10-inductor
+    name: rocm-py3.10-inductor-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       test-matrix: |
         { include: [
@@ -56,11 +56,11 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    name: rocm-py3.10-inductor
+    name: rocm-py3.10-inductor-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs: linux-jammy-rocm-py3_10-inductor-build
     with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }}
     secrets: inherit

.github/workflows/periodic-rocm-mi300.yml

Lines changed: 4 additions & 4 deletions
@@ -50,12 +50,12 @@ jobs:
       curr_ref_type: ${{ github.ref_type }}
 
   linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       test-matrix: |
         { include: [
@@ -69,13 +69,13 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs:
       - linux-jammy-rocm-py3_10-build
       - target-determination
     with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
     secrets: inherit

.github/workflows/rocm-mi300.yml

Lines changed: 4 additions & 4 deletions
@@ -38,12 +38,12 @@ jobs:
 
   linux-jammy-rocm-py3_10-build:
     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       sync-tag: rocm-build
       test-matrix: |
@@ -61,13 +61,13 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
     uses: ./.github/workflows/_rocm-test.yml
     needs:
       - linux-jammy-rocm-py3_10-build
       - target-determination
     with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
     secrets: inherit

.lintrunner.toml

Lines changed: 0 additions & 13 deletions
@@ -1160,12 +1160,6 @@ exclude_patterns = [
     'torch/_inductor/autoheuristic/artifacts/**',
     # These files are all grandfathered in, feel free to remove from this list
     # as necessary
-    'test/_nvfuser/__init__.py',
-    'test/_nvfuser/test_dynamo.py',
-    'test/_nvfuser/test_python_frontend.py',
-    'test/_nvfuser/test_torchscript.py',
-    'test/delete.py',
-    'test/expect/__init__.py',
     'test/quantization/__init__.py',
     'test/quantization/core/__init__.py',
     'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py',
@@ -1322,12 +1316,6 @@ exclude_patterns = [
     'torch/_export/passes/const_prop_pass.py',
     'torch/_export/passes/functionalize_side_effectful_ops_pass.py',
     'torch/_export/passes/replace_sym_size_ops_pass.py',
-    'torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py',
-    'torch/_export/serde/__init__.py',
-    'torch/_export/serde/schema.py',
-    'torch/_export/serde/serialize.py',
-    'torch/_export/serde/upgrade.py',
-    'torch/_export/trace.py',
     'torch/testing/_internal/__init__.py',
     'torch/testing/_internal/autocast_test_lists.py',
     'torch/testing/_internal/autograd_function_db.py',
@@ -1444,7 +1432,6 @@ exclude_patterns = [
     'torch/utils/throughput_benchmark.py',
     'torch/utils/viz/__init__.py',
     'torch/utils/viz/_cycles.py',
-    'torch/utils/weak.py',
 ]
 init_command = [
     'python3',

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -1,11 +1,12 @@
-cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
 # cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
 
 # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
 # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
 # one is detected as "AppleClang".
 cmake_policy(SET CMP0010 NEW)
 cmake_policy(SET CMP0025 NEW)
+cmake_policy(SET CMP0126 OLD)
 
 # Enables CMake to set LTO on compilers other than Intel.
 cmake_policy(SET CMP0069 NEW)

WORKSPACE

Lines changed: 6 additions & 0 deletions
@@ -184,6 +184,12 @@ new_local_repository(
     path = "third_party/nlohmann",
 )
 
+new_local_repository(
+    name = "moodycamel",
+    build_file = "//third_party:moodycamel.BUILD",
+    path = "third_party/concurrentqueue",
+)
+
 new_local_repository(
     name = "tensorpipe",
     build_file = "//third_party:tensorpipe.BUILD",

aten/src/ATen/cuda/CUDADataType.h

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
       return CUDA_R_64I;
     case c10::ScalarType::BFloat16:
       return CUDA_R_16BF;
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 60300
     case c10::ScalarType::Float8_e4m3fn:
       return CUDA_R_8F_E4M3;
     case c10::ScalarType::Float8_e5m2:

aten/src/ATen/cuda/CUDAGraph.cpp

Lines changed: 3 additions & 3 deletions
@@ -139,7 +139,7 @@ void CUDAGraph::capture_end() {
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
   // cudaGraphInstantiateWithFlags
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
-#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
+#if !defined(USE_ROCM) || ROCM_VERSION >= 60200
   int version = 0;
   AT_CUDA_CHECK(cudaDriverGetVersion(&version));
   if (version < 11040) {
@@ -154,7 +154,7 @@ void CUDAGraph::capture_end() {
 #endif
   //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
   //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
-#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
+#if !defined(USE_ROCM) || ROCM_VERSION >= 60200
   } else {
     AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                 graph_,
@@ -216,7 +216,7 @@ void CUDAGraph::enable_debug_mode() {
 }
 
 void CUDAGraph::debug_dump(const std::string& debug_path) {
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
   if (_cuda_graphs_debug) {
     TORCH_WARN("DEBUG: calling debug_dump()");
     if (has_graph_) {

aten/src/ATen/cuda/Exceptions.h

Lines changed: 1 addition & 5 deletions
@@ -117,15 +117,11 @@ constexpr const char* _cusolver_backend_suggestion = \
     "linear algebra operators with other supported backends. " \
     "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";
 
-// When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
 // When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
 #define TORCH_CUSOLVER_CHECK(EXPR) \
   do { \
     cusolverStatus_t __err = EXPR; \
-    if ((CUDA_VERSION < 11500 && \
-         __err == CUSOLVER_STATUS_EXECUTION_FAILED) || \
-        (CUDA_VERSION >= 11500 && \
-         __err == CUSOLVER_STATUS_INVALID_VALUE)) { \
+    if (__err == CUSOLVER_STATUS_INVALID_VALUE) { \
      TORCH_CHECK_LINALG( \
          false, \
          "cusolver error: ", \

aten/src/ATen/cuda/cub.cuh

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
 #endif
 }
 
-# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)
+# if defined(CUDA_VERSION) || defined(USE_ROCM)
 
 template<typename T>
 struct BlockPrefixCallbackOp

aten/src/ATen/cuda/detail/LazyNVRTC.cpp

Lines changed: 0 additions & 2 deletions
@@ -146,10 +146,8 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
 NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
 NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
 NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
 NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
 NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
-#endif
 NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
 _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
 NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ namespace at::cuda {
 AT_FORALL_NVRTC_BASE(_)
 #endif
 
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
+#if defined(CUDA_VERSION)
 #define AT_FORALL_NVRTC(_) \
   AT_FORALL_NVRTC_EXTENDED(_) \
   _(nvrtcGetCUBINSize) \

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 3 additions & 12 deletions
@@ -359,7 +359,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
   c10::MaybeOwned<Tensor> self_;
   if (&result != &self) {
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
     // Strangely, if mat2 has only 1 row or column, we get
     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
@@ -495,15 +495,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   }
 #else
   auto activation_epilogue = activation_to_gemm_and_blas_arg(activation);
-#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11080))
-  // GELU is not supported (and does not compile!) prior
-  // to CUDA 11.4. Have observed accuracy issues with
-  // GELU epilogue in 11.4; disabling the GELU epilogue
-  // path for CUDA version < 11.8.
-  if (activation == Activation::GELU)
-    activation_epilogue = cuda::blas::GEMMAndBiasActivationEpilogue::None;
-#endif
-
   bool okay = true;
   if (is_float_output_with_half_input) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(
@@ -646,7 +637,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   // gating activation_to_gemm_and_blas_arg above; here we are manually
   // performing a post-GELU because we weren't able to use the GELU
   // epilogue above.
-#if !(defined(CUDA_VERSION) && CUDA_VERSION >= 11080) && !defined(USE_ROCM)
+#if !defined(CUDA_VERSION) && !defined(USE_ROCM)
   if (useLtInterface && activation == Activation::GELU) {
     at::gelu_(const_cast<Tensor&>(*args.result), "tanh");
   }
@@ -1017,7 +1008,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result)
 
   TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");
 
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
   cublasCommonArgs args(self, mat2, result);
 
   at::cuda::blas::int8_gemm(

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 2 additions & 2 deletions
@@ -345,8 +345,8 @@ static inline void launch_vectorized_kernel(
     auto output_calc = TrivialOffsetCalculator<1>();
     auto loader = memory::LoadWithoutCast();
     auto storer = memory::StoreWithoutCast();
-    int64_t grid_unrolled = (N + io_block_work_size<io_size>() - 1) / io_block_work_size<io_size>();
-    unrolled_elementwise_kernel<func_t, array_t, elems_per_thread<io_size>()>
+    int64_t grid_unrolled = (N + elementwise_block_work_size() - 1) / elementwise_block_work_size();
+    unrolled_elementwise_kernel<func_t, array_t, elementwise_thread_work_size()>
         <<<grid_unrolled, num_threads(), 0, stream>>>(
             N, f, data, input_calc, output_calc, loader, storer);
     C10_CUDA_KERNEL_LAUNCH_CHECK();

aten/src/ATen/native/cuda/MixedDtypesLinear.cu

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/cuda/CUDAUtils.h>
 
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
 // Doesn't work on ROCm or Windows yet
 // TODO: Add compiler warning? Add PyTorch config flag?
 #else
@@ -20,7 +20,7 @@
 #include <ATen/native/cuda/cutlass_extensions/gemm/threadblock/default_mma.h>
 #endif
 
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
 // Doesn't work on ROCm or Windows yet
 #else
 #define CUTLASS_STATUS_CHECK(status) \
@@ -32,7 +32,7 @@
 
 namespace at::native {
 
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
 // Doesn't work on ROCm or Windows yet or old compiler
 #else
 template<typename ElementInputA, typename ElementInputB, typename EpilogueTag>
@@ -198,7 +198,7 @@ _mixed_dtypes_linear(const Tensor& input, const Tensor& weight,
                      const Tensor& scale,
                      const std::optional<Tensor>& bias_opt,
                      const std::optional<std::string_view> activation_opt) {
-#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
+#if defined(USE_ROCM) || defined(_MSC_VER)
   TORCH_CHECK(false, "_mixed_dtypes_linear: not compiled for this platform");
   return Tensor{};
 #else

aten/src/ATen/native/cuda/Nonzero.cu

Lines changed: 1 addition & 1 deletion
@@ -300,7 +300,7 @@ void nonzero_static_cuda_out_impl(
     int64_t size,
     int64_t fill_value,
     Tensor& out) {
-# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)
+#if defined(CUDA_VERSION) || defined(USE_ROCM)
 
   Tensor self_contiguous_ = self.contiguous();
   // see comment in nonzero_cuda_out_impl on reqs for out