pytorch
diff --git a/‎torchao/experimental/kernels/cpu/Utils.cmake
Lines changed: 56 additions & 0 deletions b/‎torchao/experimental/kernels/cpu/Utils.cmake
Lines changed: 56 additions & 0 deletions
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
Lines changed: 13 additions & 0 deletions b/‎torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
Lines changed: 13 additions & 0 deletions
diff --git a/‎torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight-impl.h
Lines changed: 33 additions & 42 deletions b/‎torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight-impl.h
Lines changed: 33 additions & 42 deletions
diff --git a/‎torchao/experimental/kernels/cpu/linear/examples/CMakeLists.txt
Lines changed: 6 additions & 21 deletions b/‎torchao/experimental/kernels/cpu/linear/examples/CMakeLists.txt
Lines changed: 6 additions & 21 deletions
diff --git a/‎torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/CMakeLists.txt
Lines changed: 7 additions & 19 deletions b/‎torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/CMakeLists.txt
Lines changed: 7 additions & 19 deletions
diff --git a/‎torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/build_custom_op.sh
Lines changed: 1 addition & 0 deletions b/‎torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/build_custom_op.sh
Lines changed: 1 addition & 0 deletions
diff --git a/‎torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/torch_custom_op.cpp
Lines changed: 0 additions & 6 deletions b/‎torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/torch_custom_op.cpp
Lines changed: 0 additions & 6 deletions
diff --git a/‎torchao/experimental/kernels/cpu/parallel-aten-impl.h
Lines changed: 28 additions & 0 deletions b/‎torchao/experimental/kernels/cpu/parallel-aten-impl.h
Lines changed: 28 additions & 0 deletions
diff --git a/‎torchao/experimental/kernels/cpu/parallel-impl.h
Lines changed: 0 additions & 61 deletions b/‎torchao/experimental/kernels/cpu/parallel-impl.h
Lines changed: 0 additions & 61 deletions
@@ -0,0 +1,56 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Configures `target_name` to build against one of the torchao parallel backends.
+#
+# Arguments:
+#   target_name              - existing target that receives the compile
+#                              definitions and link libraries (all PRIVATE).
+#   torchao_parallel_backend - backend name, matched case-insensitively. One of:
+#                              aten_openmp, openmp, pthreadpool, single_threaded,
+#                              test_dummy.
+#
+# Any other backend name stops configuration with a FATAL_ERROR.
+#
+# Example:
+#   target_link_torchao_parallel_backend(my_target "openmp")
+function(target_link_torchao_parallel_backend target_name torchao_parallel_backend)
+    # Quoted so that an empty argument reaches the FATAL_ERROR branch below
+    # instead of failing inside string(TOUPPER) with an arity error.
+    string(TOUPPER "${torchao_parallel_backend}" TORCHAO_PARALLEL_BACKEND_TOUPPER)
+    if(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "ATEN_OPENMP")
+        message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=ATEN_OPENMP")
+
+        # OpenMP compile flag; Apple toolchains get the -Xclang form instead.
+        set(_OMP_CXX_COMPILE_FLAGS "-fopenmp")
+        if (APPLE)
+            set(_OMP_CXX_COMPILE_FLAGS "-Xclang -fopenmp")
+        endif()
+
+        # A function has its own variable scope, so the flag is written to the
+        # caller's scope with PARENT_SCOPE.
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_OMP_CXX_COMPILE_FLAGS}" PARENT_SCOPE)
+
+        find_package(Torch REQUIRED)
+        # NOTE: directory-scoped; applies to every target in the calling directory.
+        include_directories("${TORCH_INCLUDE_DIRS}")
+        target_link_libraries(${target_name} PRIVATE "${TORCH_LIBRARIES}")
+
+        target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_ATEN=1 AT_PARALLEL_OPENMP=1 INTRA_OP_PARALLEL=1)
+        # Links the libomp found under TORCH_INSTALL_PREFIX (presumably provided
+        # by find_package(Torch) -- confirm against the Torch CMake package).
+        target_link_libraries(${target_name} PRIVATE "${TORCH_INSTALL_PREFIX}/lib/libomp${CMAKE_SHARED_LIBRARY_SUFFIX}")
+
+    elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "OPENMP")
+        message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=OPENMP.  You must set the CMake variable OpenMP_ROOT to the OMP library location before compiling.  Do not use this option if Torch was built with OPENMP; use ATEN_OPENMP instead.")
+        find_package(OpenMP REQUIRED)
+        target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_OPENMP=1)
+        target_link_libraries(${target_name} PRIVATE OpenMP::OpenMP_CXX)
+
+    elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "PTHREADPOOL")
+        message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=PTHREADPOOL")
+        # pthreadpool is fetched at configure time.
+        # NOTE(review): GIT_TAG tracks a moving branch; consider pinning a commit.
+        include(FetchContent)
+        FetchContent_Declare(pthreadpool
+            GIT_REPOSITORY https://github.com/Maratyszcza/pthreadpool.git
+            GIT_TAG master)
+
+        FetchContent_MakeAvailable(
+            pthreadpool)
+
+        target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_PTHREADPOOL=1)
+        target_link_libraries(${target_name} PRIVATE pthreadpool)
+
+    elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "SINGLE_THREADED")
+        message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=SINGLE_THREADED")
+        target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_SINGLE_THREADED=1)
+
+    elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "TEST_DUMMY")
+        message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=TEST_DUMMY")
+        target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_TEST_DUMMY=1)
+
+    else()
+        # Report the value that was actually passed in (the function parameter),
+        # not a like-named variable that the caller may or may not have set.
+        message(FATAL_ERROR "Unknown TORCHAO_PARALLEL_BACKEND: ${torchao_parallel_backend}. Please choose one of: aten_openmp, openmp, pthreadpool, single_threaded, test_dummy.")
+    endif()
+endfunction()
@@ -0,0 +1,13 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# kernel_aarch64: library built from the aarch64 CPU kernel sources
+# (reduction, quantization and valpacking). No library type is given, so it
+# follows BUILD_SHARED_LIBS. Source paths are rooted at TORCHAO_LIBRARIES,
+# which the including project must define.
+add_library(kernel_aarch64)
+target_sources(
+  kernel_aarch64
+  PRIVATE
+    ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
+    ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
+    ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
+    ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
+)
@@ -57,12 +57,8 @@ void pack_weight_data_operator(
   int nc = std::min(n, tiling_params.nc_by_nr * ukernel_config.nr);
   int num_nc_panels = (n + nc - 1) / nc;
 
-  torchao::parallel_for(0, num_nc_panels, 1, [&](int64_t begin, int64_t end) {
-    // TODO(T200106949): decide how to handle at::parallel_for not respecting
-    // user-supplied grain_size
-    assert(end == begin + 1);
-
-    int nc_tile_idx = begin;
+  torchao::parallel_1d(0, num_nc_panels, [&](int64_t idx) {
+    int nc_tile_idx = idx;
     int n_idx = nc_tile_idx * nc;
     int nc_tile_size = std::min(nc, n - n_idx);
 
@@ -178,12 +174,8 @@ void linear_operator_with_tile_schedule_policy_single_mc_parallel_nc(
         group_size,
         activations + activations_offset);
 
-    torchao::parallel_for(0, num_nc_panels, 1, [&](int64_t begin, int64_t end) {
-      // TODO(T200106949): decide how to handle at::parallel_for not respecting
-      // user-supplied grain_size
-      assert(end == begin + 1);
-
-      int nc_tile_idx = begin;
+    torchao::parallel_1d(0, num_nc_panels, [&](int64_t idx) {
+      int nc_tile_idx = idx;
       int n_idx = nc_tile_idx * nc;
       int nc_tile_size = std::min(nc, n - n_idx);
 
@@ -234,8 +226,8 @@ void linear_operator_with_tile_schedule_policy_parallel_mc_parallel_nc(
   int activation_data_size =
       ukernel_config.activation_data_size_fn(mr, k, group_size);
 
-  torchao::parallel_for(0, num_mc_panels, 1, [&](int64_t begin, int64_t end) {
-    int mc_tile_idx = begin;
+  torchao::parallel_1d(0, num_mc_panels, [&](int64_t idx) {
+    int mc_tile_idx = idx;
     int m_idx = mc_tile_idx * mc;
     int mc_tile_size = std::min(mc, m - m_idx);
     int activations_offset = m_idx * k;
@@ -249,34 +241,33 @@ void linear_operator_with_tile_schedule_policy_parallel_mc_parallel_nc(
         activations + activations_offset);
   });
 
-  torchao::parallel_for(
-      0, num_mc_panels * num_nc_panels, 1, [&](int64_t begin, int64_t end) {
-        int mc_tile_idx = begin / num_nc_panels;
-        int m_idx = mc_tile_idx * mc;
-        int mc_tile_size = std::min(mc, m - m_idx);
-
-        int nc_tile_idx = begin % num_nc_panels;
-        int n_idx = nc_tile_idx * nc;
-        int nc_tile_size = std::min(nc, n - n_idx);
-
-        int activation_data_offset = (m_idx / mr) * activation_data_size;
-        int output_offset = m_idx * n + n_idx;
-        int weight_data_offset = (n_idx / nr) * weight_data_size;
-        int bias_offset = m_idx;
-
-        ukernel_config.kernel_fn(
-            output + output_offset,
-            /*output_m_stride=*/n,
-            /*m=*/mc_tile_size,
-            /*n=*/nc_tile_size,
-            k,
-            group_size,
-            /*weight_data=*/(char*)weight_data + weight_data_offset,
-            /*activation_data=*/activation_data_buffer + activation_data_offset,
-            /*bias=*/bias + bias_offset,
-            clamp_min,
-            clamp_max);
-      });
+  torchao::parallel_1d(0, num_mc_panels * num_nc_panels, [&](int64_t idx) {
+    int mc_tile_idx = idx / num_nc_panels;
+    int m_idx = mc_tile_idx * mc;
+    int mc_tile_size = std::min(mc, m - m_idx);
+
+    int nc_tile_idx = idx % num_nc_panels;
+    int n_idx = nc_tile_idx * nc;
+    int nc_tile_size = std::min(nc, n - n_idx);
+
+    int activation_data_offset = (m_idx / mr) * activation_data_size;
+    int output_offset = m_idx * n + n_idx;
+    int weight_data_offset = (n_idx / nr) * weight_data_size;
+    int bias_offset = m_idx;
+
+    ukernel_config.kernel_fn(
+        output + output_offset,
+        /*output_m_stride=*/n,
+        /*m=*/mc_tile_size,
+        /*n=*/nc_tile_size,
+        k,
+        group_size,
+        /*weight_data=*/(char*)weight_data + weight_data_offset,
+        /*activation_data=*/activation_data_buffer + activation_data_offset,
+        /*bias=*/bias + bias_offset,
+        clamp_min,
+        clamp_max);
+  });
 }
 } // namespace internal
 
 
@@ -16,38 +16,23 @@ include(CMakePrintHelpers)
 message("TORCHAO_LIBRARIES: ${TORCHAO_LIBRARIES}")
 include_directories(${TORCHAO_LIBRARIES})
 
-add_library(
-  torchao_dep
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
-)
-
+add_subdirectory(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64 ${CMAKE_CURRENT_BINARY_DIR}/kernel_aarch64)
 
 add_executable(separate_function_wrappers separate_function_wrappers.cpp)
 target_link_libraries(
   separate_function_wrappers
     PRIVATE
-    torchao_dep
+    kernel_aarch64
 )
 
 add_executable(stateful_class_wrapper stateful_class_wrapper.cpp)
 target_link_libraries(
   stateful_class_wrapper
     PRIVATE
-    torchao_dep
+    kernel_aarch64
 )
 
+include(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/Utils.cmake)
 
-find_package(OpenMP)
-if(OpenMP_CXX_FOUND)
-  target_link_libraries(separate_function_wrappers PUBLIC OpenMP::OpenMP_CXX)
-  target_compile_definitions(separate_function_wrappers PRIVATE TORCHAO_PARALLEL_OMP=1)
-
-  target_link_libraries(stateful_class_wrapper PUBLIC OpenMP::OpenMP_CXX)
-  target_compile_definitions(stateful_class_wrapper PRIVATE TORCHAO_PARALLEL_OMP=1)
-else()
-  target_compile_definitions(separate_function_wrappers PRIVATE TORCHAO_PARALLEL_SINGLE_THREADED=1)
-  target_compile_definitions(stateful_class_wrapper PRIVATE TORCHAO_PARALLEL_SINGLE_THREADED=1)
-endif()
+target_link_torchao_parallel_backend(stateful_class_wrapper "openmp")
+target_link_torchao_parallel_backend(separate_function_wrappers "openmp")
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-project(examples)
+project(torch_custom_op)
 
 cmake_minimum_required(VERSION 3.19)
 set(CMAKE_CXX_STANDARD 17)
@@ -16,27 +16,15 @@ include(CMakePrintHelpers)
 message("TORCHAO_LIBRARIES: ${TORCHAO_LIBRARIES}")
 include_directories(${TORCHAO_LIBRARIES})
 
-add_library(
-  torchao_dep
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
-  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
-)
-
-include(FetchContent)
-FetchContent_Declare(pthreadpool
-        GIT_REPOSITORY https://github.com/Maratyszcza/pthreadpool.git
-        GIT_TAG master)
-FetchContent_MakeAvailable(
-  pthreadpool)
+add_subdirectory(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64 ${CMAKE_CURRENT_BINARY_DIR}/kernel_aarch64)
 
 find_package(Torch REQUIRED)
-message("TORCH_INCLUDE_DIRS: ${TORCH_INCLUDE_DIRS}")
 include_directories("${TORCH_INCLUDE_DIRS}")
 
 add_library(torch_custom_op SHARED torch_custom_op.cpp)
 target_link_libraries(torch_custom_op PRIVATE "${TORCH_LIBRARIES}")
-target_link_libraries(torch_custom_op PRIVATE torchao_dep)
-target_compile_definitions(torch_custom_op PRIVATE TORCHAO_PARALLEL_PTHREADPOOL=1)
-target_link_libraries(torch_custom_op PRIVATE pthreadpool)
+target_link_libraries(torch_custom_op PRIVATE kernel_aarch64)
+
+include(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/Utils.cmake)
+set(TORCHAO_PARALLEL_BACKEND "ATEN_OPENMP" CACHE STRING "Choose parallel backend to use for torchao parallelism (aten_openmp, openmp, pthreadpool, single_threaded)")
+target_link_torchao_parallel_backend(torch_custom_op "${TORCHAO_PARALLEL_BACKEND}")
@@ -13,6 +13,7 @@ echo "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}"
 export CMAKE_OUT=/tmp/cmake-out/torch_ao/examples/torch_custom_op
 cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} \
     -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
+    -DTORCHAO_PARALLEL_BACKEND="aten_openmp" \
     -S ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/linear/examples/torch_custom_op \
     -B ${CMAKE_OUT}
 cmake --build  ${CMAKE_OUT}
@@ -50,8 +50,6 @@ at::Tensor pack_weights_without_zeros_cpu(
   auto pack_weight_tiling_params = get_default_pack_weight_data_tiling_params(
       ukernel_config, n, /*target_panels_per_thread=*/1);
 
-  torchao::set_num_threads(torch::get_num_threads());
-
   auto packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
   at::Tensor packed_weights =
@@ -117,8 +115,6 @@ at::Tensor pack_weights_with_zeros_cpu(
   auto pack_weight_tiling_params = get_default_pack_weight_data_tiling_params(
       ukernel_config, n, /*target_panels_per_thread=*/1);
 
-  torchao::set_num_threads(torch::get_num_threads());
-
   auto packed_weight_data_size =
       get_packed_weight_data_size(ukernel_config, n, k, group_size);
   at::Tensor packed_weights =
@@ -227,8 +223,6 @@ at::Tensor linear_cpu(
   auto linear_scheduling_policy =
       LinearTileSchedulingPolicy::single_mc_parallel_nc;
 
-  torchao::set_num_threads(torch::get_num_threads());
-
   auto activation_data_buffer_size = get_activation_data_buffer_size(
       ukernel_config,
       linear_tiling_params,
 
@@ -0,0 +1,28 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include <torch/library.h>
+#include <torch/torch.h>
+#include <ATen/Parallel.h>
+
+// Calls f(idx) once for every idx in [begin, end), distributing the indices
+// through at::parallel_for.
+// F is a callable taking a single index, e.g. [&](int64_t idx) { ... }.
+template <typename F>
+void torchao::parallel_1d(const int64_t begin, const int64_t end, const F& f) {
+  // A grain size of 1 is requested, but the sub-range handed to the callback
+  // may still contain several indices, so every index in it is visited.
+  at::parallel_for(
+      begin,
+      end,
+      /*grain_size=*/1,
+      [&](int64_t chunk_begin, int64_t chunk_end) {
+        for (int64_t idx = chunk_begin; idx < chunk_end; idx++) {
+          f(idx);
+        }
+      });
+}
+
+// Forwards the requested thread count to torch::set_num_threads.
+// Marked inline because this definition lives in a header: without it,
+// including the header from more than one translation unit produces
+// duplicate-symbol link errors.
+inline void torchao::set_num_threads(int num_threads) {
+  torch::set_num_threads(num_threads);
+}
+
+// Returns the thread count reported by torch::get_num_threads.
+// Marked inline for the same header-definition reason as set_num_threads.
+inline int torchao::get_num_threads() {
+  return torch::get_num_threads();
+}