
Commit 1d7d50e

cyyever authored and facebook-github-bot committed
Fix performance issues identified by clang-tidy (pytorch#4444)
Summary:
Pull Request resolved: pytorch#4444

X-link: facebookresearch/FBGEMM#1506

Fix performance issues identified by clang-tidy. The most notable change is adding references to asmjit classes.

Pull Request resolved: pytorch#4442

Reviewed By: gchalump

Differential Revision: D77746424

Pulled By: q10

fbshipit-source-id: a452c82f5fe1f57d7a271b495973f97b8bb82ac9
1 parent c409ff6 commit 1d7d50e

27 files changed (+210, -213 lines)
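The bulk of the diff applies two clang-tidy performance fixes: passing read-only, copy-expensive parameters by const reference (the performance-unnecessary-value-param check) and replacing runtime type tests with compile-time ones. A minimal sketch of the const-reference pattern, for context only (not FBGEMM code):

#include <iostream>
#include <string>

// Pass by value: the string is copied on every call.
static void printByValue(std::string name) {
  std::cout << name << '\n';
}

// Pass by const reference: the callee reads the caller's object directly,
// which is what clang-tidy suggests for read-only non-trivial parameters.
static void printByRef(const std::string& name) {
  std::cout << name << '\n';
}

int main() {
  const std::string who = "fbgemm";
  printByValue(who); // makes a copy
  printByRef(who);   // no copy
  return 0;
}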

.clang-tidy

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,8 @@ modernize*,
 -modernize-use-ranges,
 -modernize-use-integer-sign-comparison
 -modernize-use-nodiscard,
+performance*,
+-performance-avoid-endl
 '
 CheckOptions:
   - key: facebook-cuda-safe-api-call-check.HandlerName
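A note on the one check excluded above: performance-avoid-endl warns that std::endl writes '\n' and also flushes the stream. These benchmarks keep their endl prints deliberately, so the check is disabled rather than rewriting every print statement. A small illustration of the difference:

#include <iostream>

int main() {
  std::cout << "flushed immediately" << std::endl; // '\n' plus an explicit flush
  std::cout << "possibly buffered\n";              // '\n' only; flushed later
  return 0;
}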

bench/EmbeddingQuantizeBenchmark.cc

Lines changed: 4 additions & 4 deletions
@@ -29,7 +29,7 @@ static void performance_test() {
   constexpr int NWARMUP = 4;
   constexpr int NITER = 256;
 
-  if (is_same_v<T, float16>) {
+  if constexpr (is_same_v<T, float16>) {
     cout << "With scale and bias as float16" << endl;
   } else {
     cout << "With scale and bias as float" << endl;
@@ -38,7 +38,7 @@ static void performance_test() {
        << "cols" << "," << setw(16) << "elems_per_usec" << "," << setw(10)
        << "GB/Sec" << endl;
   std::vector<int> bit_rates;
-  if (is_same_v<T, float16>) {
+  if constexpr (is_same_v<T, float16>) {
     bit_rates = {2, 4, 8};
   } else {
     // float
@@ -52,7 +52,7 @@ static void performance_test() {
 
   int out_emb_cols = colSize;
 
-  if (is_same<T, float16>::value) {
+  if constexpr (is_same_v<T, float16>) {
     int elements_per_byte = 8 / bit_rate;
     out_emb_cols = (colSize + elements_per_byte - 1) / elements_per_byte;
   }
@@ -63,7 +63,7 @@ static void performance_test() {
 
   duration = measureWithWarmup(
       [&]() {
-        is_same<T, float16>::value
+        is_same_v<T, float16>
             ? FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
                   bit_rate,
                   inpVec.data(),
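The hunks above swap a runtime if for if constexpr on conditions known at compile time: the untaken branch is discarded during template instantiation, so no dead branch is compiled or executed. A self-contained sketch of the pattern, using stand-in types rather than FBGEMM's float16:

#include <iostream>
#include <type_traits>

template <typename T>
void report() {
  // The condition depends only on T, so it is evaluated at compile time;
  // only the matching branch is instantiated for each T.
  if constexpr (std::is_same_v<T, float>) {
    std::cout << "float path\n";
  } else {
    std::cout << "other path\n";
  }
}

int main() {
  report<float>();  // prints "float path"
  report<double>(); // prints "other path"
  return 0;
}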

bench/EmbeddingQuantizeFloatToFloatOrHalfBenchmark.cc

Lines changed: 3 additions & 3 deletions
@@ -29,7 +29,7 @@ static void performance_test() {
   constexpr int NWARMUP = 4;
   constexpr int NITER = 256;
 
-  if (is_same_v<T, float16>) {
+  if constexpr (is_same_v<T, float16>) {
     cout << "With result as float16" << endl;
   } else {
     cout << "With result as float" << endl;
@@ -44,15 +44,15 @@ static void performance_test() {
 
   int out_emb_cols = colSize;
 
-  if (is_same<T, float16>::value) {
+  if constexpr (is_same_v<T, float16>) {
     out_emb_cols /= 2;
   }
   int outVecSize = rowSize * (out_emb_cols + 2 * sizeof(T));
   aligned_vector<T> outVec(outVecSize);
 
   double duration = 0.0f;
 
-  int constexpr kNumRepeats = is_same<T, float16>::value ? 16 : 32;
+  int constexpr kNumRepeats = is_same_v<T, float16> ? 16 : 32;
 
   duration = measureWithWarmup(
       [&]() {

bench/EmbeddingSpMDM8BitBenchmark.cc

Lines changed: 8 additions & 14 deletions
@@ -12,7 +12,6 @@
 #endif
 #include <algorithm>
 #include <cassert>
-#include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <iomanip>
@@ -262,10 +261,10 @@ static int run_benchmark(
   for (size_t i = 0; i < output.size(); ++i) {
     float tmp1 = 0;
     float tmp2 = 0;
-    if constexpr (std::is_same<OutType, float>::value) {
+    if constexpr (std::is_same_v<OutType, float>) {
       tmp1 = output[i];
       tmp2 = output_ref[i];
-    } else if constexpr (std::is_same<OutType, uint16_t>::value) {
+    } else if constexpr (std::is_same_v<OutType, uint16_t>) {
       if (is_bf16_out) {
         tmp1 = cpu_bf162float(output[i]);
         tmp2 = cpu_bf162float(output_ref[i]);
@@ -289,9 +288,9 @@ static int run_benchmark(
 #pragma omp barrier
 #endif
   if (fbgemm_get_thread_num() == 0) {
-    if constexpr (std::is_same<OutType, float>::value) {
+    if constexpr (std::is_same_v<OutType, float>) {
       cout << "out type fp32";
-    } else if constexpr (std::is_same<OutType, uint16_t>::value) {
+    } else if constexpr (std::is_same_v<OutType, uint16_t>) {
       if (is_bf16_out) {
         cout << "out type bf16";
       } else {
@@ -340,22 +339,17 @@ static int run_benchmark(
 }
 
 int main() {
-  int batch_size;
-  int num_rows;
-  int embedding_dim;
-  int average_len;
-
   bool stress_multi_threading = false;
 
   vector<vector<int>> inputs(GetInputs_());
   benchmarkTimes.resize(fbgemm_get_max_threads());
 
   for (auto& input : inputs) {
     assert(input.size() > 3);
-    batch_size = input[0];
-    num_rows = input[1];
-    embedding_dim = input[2];
-    average_len = input[3];
+    int batch_size = input[0];
+    int num_rows = input[1];
+    int embedding_dim = input[2];
+    int average_len = input[3];
 
     cout << "batch size" << setw(6) << batch_size << setw(10) << "num rows"
          << setw(16) << num_rows << setw(10) << "emb dim" << setw(6)
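The final hunk above narrows variable scope: the four shape variables move from function scope into the loop body, so each iteration gets fresh, locally scoped values. A reduced sketch of the same refactor (the shape numbers here are made up):

#include <cassert>
#include <vector>

int main() {
  // Hypothetical shapes: {batch_size, num_rows, embedding_dim, average_len}.
  std::vector<std::vector<int>> inputs = {{10, 4000, 32, 100}, {10, 4000, 64, 100}};
  for (auto& input : inputs) {
    assert(input.size() > 3);
    // Declared at first use: no stale values can leak across iterations.
    int batch_size = input[0];
    int num_rows = input[1];
    int embedding_dim = input[2];
    int average_len = input[3];
    (void)batch_size; (void)num_rows; (void)embedding_dim; (void)average_len;
  }
  return 0;
}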

bench/EmbeddingSpMDMBenchmark.cc

Lines changed: 5 additions & 5 deletions
@@ -64,8 +64,8 @@ static void run_benchmark(
 
   vector<float> embedding_table(num_rows * embedding_dim);
   normal_distribution<float> embedding_distribution;
-  for (size_t i = 0; i < embedding_table.size(); ++i) {
-    embedding_table[i] = embedding_distribution(generator);
+  for (float& i : embedding_table) {
+    i = embedding_distribution(generator);
   }
   vector<float16> embedding_table_fp16;
   vector<bfloat16> embedding_table_bf16;
@@ -235,15 +235,15 @@ static void run_benchmark(
       prefetch ? 16 : 0,
       /*is_weight_positional=*/false,
       /*use_offsets=*/true,
-      /*isbf16=*/true);
+      /*is_bf16_out=*/true);
   auto kernel_bf16_i64 = GenerateEmbeddingSpMDM<bfloat16, int64_t>(
       embedding_dim,
       has_weight,
       normalize_by_lengths,
       prefetch ? 16 : 0,
       /*is_weight_positional=*/false,
-      /*is_weight_positional=*/true,
-      /*isbf16=*/true);
+      /*use_offsets=*/true,
+      /*is_bf16_out=*/true);
 
   vector<float>& output = has_weight ? output_slws : output_sls;
   for (bool flush_cache : {false, true}) {
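The first hunk above converts an index loop into a range-based for, which drops the index bookkeeping and repeated operator[] calls when the index is only used for element access. A standalone version of the pattern:

#include <random>
#include <vector>

int main() {
  std::vector<float> embedding_table(1024);
  std::default_random_engine generator;
  std::normal_distribution<float> embedding_distribution;
  // Range-based for: each element is bound by reference, no index needed.
  for (float& v : embedding_table) {
    v = embedding_distribution(generator);
  }
  return 0;
}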

bench/EmbeddingSpMDMNBit2Benchmark.cc

Lines changed: 10 additions & 15 deletions
@@ -143,10 +143,10 @@ static void print_benchmark_results() {
       << "asmjit b/w (GB/s), asmjit effective b/w (GB/s), asmjit time, "
       << "autovec b/w (GB/s), autovec effective b/w (GB/s), autovec time, "
       << "ref b/w (GB/s), ref effective b/w (GB/s), ref time, "
-      << "asmjit speedup ratio, autovec speedup ratio" << std::endl;
-  for (size_t i = 0; i < benchmarks.size(); ++i) {
-    BenchmarkSpec& spec = benchmarks[i].first;
-    BenchmarkResult& res = benchmarks[i].second;
+      << "asmjit speedup ratio, autovec speedup ratio" << endl;
+  for (auto& benchmark : benchmarks) {
+    BenchmarkSpec& spec = benchmark.first;
+    BenchmarkResult& res = benchmark.second;
     float asmjit_speedup = res.ref_bw > 0.0 ? res.asmjit_bw / res.ref_bw : 0;
     float autovec_speedup = res.ref_bw > 0.0 ? res.autovec_bw / res.ref_bw : 0;
     std::cout << spec.bit_rate << ", " << spec.batch_size << ", "
@@ -158,7 +158,7 @@ static void print_benchmark_results() {
               << res.asmjit_time << ", " << res.autovec_bw << ", "
               << res.autovec_eff_bw << ", " << res.autovec_time << ", "
               << res.ref_bw << ", " << res.ref_eff_bw << ", " << res.ref_time
-              << ", " << asmjit_speedup << ", " << autovec_speedup << std::endl;
+              << ", " << asmjit_speedup << ", " << autovec_speedup << endl;
   }
 }
 
@@ -457,7 +457,7 @@ static int run_benchmark(
     find_benchmark_record(spec).set_asmjit_result(
         bytes / 1e9 / t, bytes_padded / 1e9 / t, t);
   } else {
-    std::cerr << "Bad kern_type parameter: " << kern_type << std::endl;
+    std::cerr << "Bad kern_type parameter: " << kern_type << endl;
     assert(false);
   }
   if (!success) {
@@ -469,20 +469,15 @@
 }
 
 static void sweep_benchmark(KernelType kern_type) {
-  int batch_size;
-  int num_rows;
-  int embedding_dim;
-  int average_len;
-
   vector<vector<int>> inputs(GetInputs_());
 
   for (int bit_rate : {4, 2}) {
     for (auto& input : inputs) {
       assert(input.size() > 3);
-      batch_size = input[0];
-      num_rows = input[1];
-      embedding_dim = input[2];
-      average_len = input[3];
+      int batch_size = input[0];
+      int num_rows = input[1];
+      int embedding_dim = input[2];
+      int average_len = input[3];
 
       auto run_benchmark_with_above_shape = [&](bool use_32_bit_indices,
                                                 bool prefetch) {

bench/EmbeddingSpMDMNBitBenchmark.cc

Lines changed: 1 addition & 1 deletion
@@ -485,7 +485,7 @@ static int run_benchmark(
 #ifndef OUT_TYPE_FLOAT16
       cout << ", asmjit speedup, " << t_ref / t;
 #endif
-      cout << std::endl;
+      cout << endl;
     } // flush_cache
   } // has_weight
   return 0;

include/fbgemm/Fbgemm.h

Lines changed: 5 additions & 5 deletions
@@ -379,7 +379,7 @@ class FBGEMM_API PackAMatrix final
   /**
    * @brief Print the packed block.
    */
-  void printPackedMatrix(std::string name);
+  void printPackedMatrix(const std::string& name);
 
  private:
   matrix_op_t trans_;
@@ -464,7 +464,7 @@ class FBGEMM_API PackBMatrix final
    * @brief Print the packed block.
    */
   void printPackedMatrix(
-      std::string name,
+      const std::string& name,
       const BlockingFactors* params = nullptr);
 
   /**
@@ -745,7 +745,7 @@ class FBGEMM_API PackAWithIm2Col
   /**
    * @brief Print the packed block.
    */
-  void printPackedMatrix(std::string name);
+  void printPackedMatrix(const std::string& name);
 
   /**
    * @return Size of row offset buffer in number of elements
@@ -835,7 +835,7 @@ class FBGEMM_API PackAWithRowOffset final
   /**
    * @brief Print the packed block.
    */
-  void printPackedMatrix(std::string name);
+  void printPackedMatrix(const std::string& name);
 
   /**
    * @return size of row offset buffer in number of elements
@@ -927,7 +927,7 @@ class FBGEMM_API PackAWithQuantRowOffset final
   /**
    * @brief Print the packed block.
    */
-  void printPackedMatrix(std::string name);
+  void printPackedMatrix(const std::string& name);
 
   /**
    * @return Size of row offset buffer in number of elements

src/DirectConv.h

Lines changed: 15 additions & 15 deletions
@@ -80,8 +80,8 @@ class DirectConvCodeGenBase {
       x86::Emitter* a,
       int rowRegs,
       int colRegs,
-      x86::Gp C_Offset,
-      x86::Gp ldcReg,
+      const x86::Gp& C_Offset,
+      const x86::Gp& ldcReg,
       bool accum);
 
   /**
@@ -93,9 +93,9 @@ class DirectConvCodeGenBase {
       x86::Emitter* a,
       int rowRegs,
       int colRegs,
-      x86::Gp C_offset,
-      x86::Gp o1XocReg,
-      x86::Gp ldcReg,
+      const x86::Gp& C_offset,
+      const x86::Gp& o1XocReg,
+      const x86::Gp& ldcReg,
       bool accum);
 
   /**
@@ -167,9 +167,9 @@ class DirectConvCodeGenBase {
   template <inst_set_t instSet>
   void genComputeBlock(
       x86::Emitter* a,
-      x86::Gp buffer_A,
-      x86::Gp buffer_B,
-      x86::Gp B_pf,
+      const x86::Gp& buffer_A,
+      const x86::Gp& buffer_B,
+      const x86::Gp& B_pf,
       int rowRegs,
       int colRegs,
       int lda);
@@ -179,9 +179,9 @@ class DirectConvCodeGenBase {
   template <inst_set_t instSet>
   void genComputeBlockDirectConv(
       x86::Emitter* a,
-      x86::Gp buffer_A,
-      x86::Gp buffer_B,
-      x86::Gp B_pf,
+      const x86::Gp& buffer_A,
+      const x86::Gp& buffer_B,
+      const x86::Gp& B_pf,
       int rowRegs,
       int colRegs,
       int strideXich);
@@ -192,10 +192,10 @@ class DirectConvCodeGenBase {
   template <inst_set_t instSet>
   void genComputeBlockDirectConvTrans(
       x86::Emitter* a,
-      x86::Gp buffer_A,
-      x86::Gp buffer_B,
-      x86::Gp icReg,
-      x86::Gp C_offset,
+      const x86::Gp& buffer_A,
+      const x86::Gp& buffer_B,
+      const x86::Gp& icReg,
+      const x86::Gp& C_offset,
       int rowRegs,
       int colRegs);
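These are the "references to asmjit classes" the summary mentions: x86::Gp is asmjit's general-purpose-register operand, and these helpers only read it, so taking it by const reference saves a copy per call. A reduced sketch with a stand-in struct (not the real asmjit type or FBGEMM signatures):

// RegOperand stands in for a small-but-nontrivial operand class
// such as asmjit::x86::Gp.
struct RegOperand {
  int id = 0;
};

// Before: the operand is copied into the parameter on every call.
static int useByValue(RegOperand reg) { return reg.id; }

// After: a const reference reads the caller's object directly.
static int useByRef(const RegOperand& reg) { return reg.id; }

int main() {
  RegOperand r{7};
  return useByValue(r) - useByRef(r); // 0
}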

src/EmbeddingSpMDM.cc

Lines changed: 8 additions & 8 deletions
@@ -235,15 +235,15 @@ GenEmbeddingSpMDMLookup<
     offsetType,
     outType,
     ROWWISE_SPARSE>::jit_embedding_kernel {
-  bool is_8bit_in = std::is_same_v<inType, uint8_t>;
-  bool is_16bit_in = std::is_same_v<inType, uint16_t>;
-  bool is_16bit_out = std::is_same_v<outType, uint16_t>;
+  constexpr bool is_8bit_in = std::is_same_v<inType, uint8_t>;
+  constexpr bool is_16bit_in = std::is_same_v<inType, uint16_t>;
+  constexpr bool is_16bit_out = std::is_same_v<outType, uint16_t>;
   bool is_fp16_in = is_16bit_in && !is_bf16_in;
   bool is_fp16_out = is_16bit_out && !is_bf16_out;
 
   // TODO: Make this tunable
   int pref_dist = prefetch;
-  bool areIndices64b = std::is_same_v<indxType, int64_t>;
+  constexpr bool areIndices64b = std::is_same_v<indxType, int64_t>;
 
   asmjit::CodeHolder code;
   code.init(runtime().environment());
@@ -576,15 +576,15 @@ GenEmbeddingSpMDMLookup<
   a->jl(LoopDataIndexEnd);
 
   // Array out of bound check
-  if (areIndices64b) {
+  if constexpr (areIndices64b) {
     a->mov(scratchReg1_, x86::qword_ptr(indices));
   } else {
     a->mov(scratchReg1_.r32(), x86::dword_ptr(indices));
   }
   if (!scale_bias_last) {
     // When scale_bias_last == false, assume this is for table batched
     // embedding (TBE) that can get -1 for pruned rows.
-    if (areIndices64b) {
+    if constexpr (areIndices64b) {
       a->cmp(scratchReg1_, static_cast<asmjit::Imm>(-1));
     } else {
       a->cmp(scratchReg1_.r32(), static_cast<asmjit::Imm>(-1));
@@ -623,7 +623,7 @@ GenEmbeddingSpMDMLookup<
   a->cmp(scratchReg2_, index_size);
   a->jge(pref_dist_reset_start);
 
-  if (areIndices64b) {
+  if constexpr (areIndices64b) {
     a->mov(
         scratchReg2_,
         x86::qword_ptr(indices, pref_dist * sizeof(indxType)));
@@ -638,7 +638,7 @@ GenEmbeddingSpMDMLookup<
   a->bind(pref_dist_reset_start);
   // things are not okay just get the current row
   // this can be improved to getting the max dist row.
-  if (areIndices64b) {
+  if constexpr (areIndices64b) {
     a->mov(scratchReg2_, x86::qword_ptr(indices));
   } else {
     a->mov(scratchReg2_.r32(), x86::dword_ptr(indices));
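Marking these booleans constexpr is what permits the later if constexpr branches: the index width is fixed by the indxType template parameter, so each instantiation of the kernel generator compiles only its own 64-bit or 32-bit index-loading path. A simplified sketch of that structure:

#include <cstdint>
#include <iostream>
#include <type_traits>

template <typename indxType>
void emitIndexLoad() {
  // Known at compile time for each instantiation.
  constexpr bool areIndices64b = std::is_same_v<indxType, int64_t>;
  if constexpr (areIndices64b) {
    std::cout << "emit 64-bit index load\n";
  } else {
    std::cout << "emit 32-bit index load\n";
  }
}

int main() {
  emitIndexLoad<int64_t>();
  emitIndexLoad<int32_t>();
  return 0;
}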
