pytorch
diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
Lines changed: 6 additions & 6 deletions
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
Lines changed: 14 additions & 10 deletions
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
Lines changed: 4 additions & 4 deletions
diff --git a/src/DirectConv.h b/src/DirectConv.h
Lines changed: 4 additions & 4 deletions
diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc
Lines changed: 8 additions & 8 deletions
diff --git a/src/EmbeddingSpMDMAutovec.cc b/src/EmbeddingSpMDMAutovec.cc
Lines changed: 2 additions & 2 deletions
diff --git a/src/EmbeddingSpMDMNBit.cc b/src/EmbeddingSpMDMNBit.cc
Lines changed: 13 additions & 13 deletions
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
Lines changed: 1 addition & 1 deletion
@@ -230,42 +230,42 @@ static void performance_test(
   const int NITER = repetitions;
 
   string header = "MB, IC, OC, ";
-  if (SPATIAL_DIM == 3) {
+  if constexpr (SPATIAL_DIM == 3) {
     header += "IT, ";
   }
   if (SPATIAL_DIM > 1) {
     header += "IH, ";
   }
   header += "IW, G, ";
-  if (SPATIAL_DIM == 3) {
+  if constexpr (SPATIAL_DIM == 3) {
     header += "KT, ";
   }
   if (SPATIAL_DIM > 1) {
     header += "KH, ";
   }
   header += "KW, ";
-  if (SPATIAL_DIM == 3) {
+  if constexpr (SPATIAL_DIM == 3) {
     header += "stride_t, ";
   }
   if (SPATIAL_DIM > 1) {
     header += "stride_h, ";
   }
   header += "stride_w, ";
-  if (SPATIAL_DIM == 3) {
+  if constexpr (SPATIAL_DIM == 3) {
     header += "pad_t, ";
   }
   if (SPATIAL_DIM > 1) {
     header += "pad_h, ";
   }
   header += "pad_w, ";
-  if (SPATIAL_DIM == 3) {
+  if constexpr (SPATIAL_DIM == 3) {
     header += "dilation_t, ";
   }
   if (SPATIAL_DIM > 1) {
     header += "dilation_h, ";
   }
   header += "dilation_w, ";
-  if (SPATIAL_DIM == 3) {
+  if constexpr (SPATIAL_DIM == 3) {
     header += "output_padding_t, ";
   }
   if (SPATIAL_DIM > 1) {
 
@@ -81,19 +81,20 @@ ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE, outT, inT, nextOPType>::f(
       block.col_size <= ncol_per_group &&
       "ReQuantizeOutput should be called at most 1 group at a time.");
   int g = block.col_start / ncol_per_group;
-  if (instSet == inst_set_t::anyarch || !std::is_same<outT, uint8_t>::value) {
+  if constexpr (
+      instSet == inst_set_t::anyarch || !std::is_same<outT, uint8_t>::value) {
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
       for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
         inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
         if (Aq_zero_point_) {
           raw -= Aq_zero_point_ * q_col_offsets_[j];
         }
         int Bq_zero_point_idx;
-        if (Q_GRAN == QuantizationGranularity::TENSOR) {
+        if constexpr (Q_GRAN == QuantizationGranularity::TENSOR) {
           Bq_zero_point_idx = 0;
-        } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+        } else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
           Bq_zero_point_idx = g;
-        } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        } else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
           Bq_zero_point_idx = j;
         } else {
           assert(false && "unknown quantization granularity");
@@ -123,7 +124,8 @@ ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE, outT, inT, nextOPType>::f(
             std::min(255l, rounded));
       }
     }
-  } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
+  } else if constexpr (
+      instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
     bool b_symmetric =
         (Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) ||
         q_row_offsets_ == nullptr;
@@ -211,19 +213,20 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
       block.col_size <= ncol_per_group &&
       "ReQuantizeOutput should be called at most 1 group at a time.");
   int g = block.col_start / ncol_per_group;
-  if (instSet == inst_set_t::anyarch || !std::is_same<outT, float>::value) {
+  if constexpr (
+      instSet == inst_set_t::anyarch || !std::is_same<outT, float>::value) {
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
       for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
         inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
         if (Aq_zero_point_) {
           raw -= Aq_zero_point_ * q_col_offsets_[j];
         }
         int Bq_zero_point_idx;
-        if (Q_GRAN == QuantizationGranularity::TENSOR) {
+        if constexpr (Q_GRAN == QuantizationGranularity::TENSOR) {
           Bq_zero_point_idx = 0;
-        } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+        } else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
           Bq_zero_point_idx = g;
-        } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        } else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
           Bq_zero_point_idx = j;
         } else {
           assert(false && "unknown quantization granularity");
@@ -242,7 +245,8 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
         }
       }
     }
-  } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
+  } else if constexpr (
+      instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
     bool b_symmetric =
         (Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) ||
         q_row_offsets_ == nullptr;
 
@@ -267,7 +267,7 @@ bool isValidBlockingFactor(const BlockingFactors* const param) {
   constexpr bool is_16bit = std::is_same<accT, int16_t>::value;
   static const auto iset = fbgemmInstructionSet();
 
-  if (is_32bit) {
+  if constexpr (is_32bit) {
     if (param->ROW_INTERLEAVE != 4)
       return false;
 
@@ -278,7 +278,7 @@ bool isValidBlockingFactor(const BlockingFactors* const param) {
       if (param->NR_MIN != 8 || param->NR % param->NR_MIN)
         return false;
     }
-  } else if (is_16bit) {
+  } else if constexpr (is_16bit) {
     if (param->ROW_INTERLEAVE != 2)
       return false;
 
@@ -296,11 +296,11 @@ bool isValidBlockingFactor(const BlockingFactors* const param) {
   if (param->NCB % param->NR)
     return false;
   if (isZmm(iset)) {
-    if (is_32bit) {
+    if constexpr (is_32bit) {
       // Zmm register usage for C
       if (param->MR * (param->NR / param->NR_MIN) > 28)
         return false;
-    } else if (is_16bit) {
+    } else if constexpr (is_16bit) {
       // Zmm register usage for C + one row for loading B
       if ((param->MR * (param->NR / param->NR_MIN) +
            (param->NR / param->NR_MIN)) > 28)
 
@@ -124,13 +124,13 @@ class DirectConvCodeGenBase {
         << "_NC-" + std::to_string(nc) << "_NCB-" + std::to_string(NCB)
         << "_KCB-" + std::to_string(KCB) << "_MR-" + std::to_string(MR)
         << "_NR-" + std::to_string(NR);
-    if (instSet == inst_set_t::avx512_vnni) {
+    if constexpr (instSet == inst_set_t::avx512_vnni) {
       oss << "_avx512vnni";
-    } else if (instSet == inst_set_t::avx512) {
+    } else if constexpr (instSet == inst_set_t::avx512) {
       oss << "_avx512";
-    } else if (instSet == inst_set_t::avx512_ymm) {
+    } else if constexpr (instSet == inst_set_t::avx512_ymm) {
       oss << "_avx512_ymm";
-    } else if (instSet == inst_set_t::avx2) {
+    } else if constexpr (instSet == inst_set_t::avx2) {
       oss << "_avx2";
     }
     oss << ".txt";
 
@@ -354,7 +354,7 @@ GenEmbeddingSpMDMLookup<
         asmjit::FuncFrame frame;
         frame.init(func);
 
-        if (instSet == inst_set_t::avx2) {
+        if constexpr (instSet == inst_set_t::avx2) {
           frame.setDirtyRegs(
               asmjit::RegGroup::kVec,
               asmjit::Support::bitMask(0, 1, 2, 3, 4, 5, 6, 7) |
@@ -468,7 +468,7 @@ GenEmbeddingSpMDMLookup<
         }
 
         if (remainder) {
-          if (instSet == inst_set_t::avx2) {
+          if constexpr (instSet == inst_set_t::avx2) {
             a->vmovups(
                 mask_vreg,
                 x86::ymmword_ptr(
@@ -524,7 +524,7 @@ GenEmbeddingSpMDMLookup<
 
           // OK to use vreg0 because it's for out_vreg used in the main loop
           vec_reg_t temp_vreg(0);
-          if (instSet == inst_set_t::avx2) {
+          if constexpr (instSet == inst_set_t::avx2) {
             a->mov(scratchReg1_, 1);
             a->cvtsi2ss(vlen_inv_vreg.xmm(), scratchReg1_);
             a->cvtsi2ss(temp_vreg.xmm(), lengths_R_);
@@ -752,7 +752,7 @@ GenEmbeddingSpMDMLookup<
               a->vfmadd231ps(out_vreg, src_vreg, scale_vreg);
             } else if (is_16bit_in) {
               if (remainder && vec_idx + v == num_vec_regs_per_block - 1) {
-                if (instSet == inst_set_t::avx2) {
+                if constexpr (instSet == inst_set_t::avx2) {
                   if (remainder % 2 == 0) {
                     a->vmaskmovps(src_vreg.xmm(), mask_fp16_vreg, src_addr);
                   } else {
@@ -819,7 +819,7 @@ GenEmbeddingSpMDMLookup<
               }
               if (has_weight) {
                 if (remainder && vec_idx + v == num_vec_regs_per_block - 1) {
-                  if (instSet == inst_set_t::avx2) {
+                  if constexpr (instSet == inst_set_t::avx2) {
                     a->vfmadd231ps(out_vreg, w_vreg, src_vreg);
                   } else {
                     a->k(x86::k(1)).vfmadd231ps(out_vreg, w_vreg, src_addr);
@@ -829,7 +829,7 @@ GenEmbeddingSpMDMLookup<
                 }
               } else {
                 if (remainder && vec_idx + v == num_vec_regs_per_block - 1) {
-                  if (instSet == inst_set_t::avx2) {
+                  if constexpr (instSet == inst_set_t::avx2) {
                     a->vaddps(out_vreg, out_vreg, src_vreg);
                   } else {
                     a->k(x86::k(1)).vaddps(out_vreg, out_vreg, src_addr);
@@ -864,7 +864,7 @@ GenEmbeddingSpMDMLookup<
 
             if constexpr (std::is_same_v<outType, float>) {
               if (remainder && vec_idx + v == num_vec_regs_per_block - 1) {
-                if (instSet == inst_set_t::avx2) {
+                if constexpr (instSet == inst_set_t::avx2) {
                   a->vmaskmovps(dst_addr, mask_vreg, out_vreg.ymm());
                 } else {
                   a->k(x86::k(1)).vmovups(dst_addr, out_vreg);
@@ -874,7 +874,7 @@ GenEmbeddingSpMDMLookup<
               }
             } else {
               // fp16/bf16 output
-              if (instSet == inst_set_t::avx2) {
+              if constexpr (instSet == inst_set_t::avx2) {
                 // round nearest with no exception
                 if (is_fp16_out) {
                   a->vcvtps2ph(out_vreg.xmm(), out_vreg, 8);
 
@@ -736,9 +736,9 @@ static bool ALWAYS_INLINE EmbeddingSpMDMRowWiseSparse_autovec(
     float* out,
     const bool is_weight_positional,
     const bool use_offsets) {
-  bool is8bit = std::is_same_v<InType, uint8_t>;
+  constexpr bool is8bit = std::is_same_v<InType, uint8_t>;
 
-  if (is8bit) {
+  if constexpr (is8bit) {
     // block_size is the number of elements and fused_block_size is the size
     // of an entire row, including scale and bias.
     const auto scale_bias_offset = 2 * sizeof(float);
 
@@ -285,7 +285,7 @@ GenEmbeddingSpMDMNBitLookup<
         ++reg_id;
         x86::Gp scratchReg2_ = a->gpz(reg_id); // 14 or 15
         x86::Gp scratchReg3_;
-        if (instSet == inst_set_t::avx2) {
+        if constexpr (instSet == inst_set_t::avx2) {
           scratchReg3_ = a->zax();
         }
 
@@ -470,7 +470,7 @@ GenEmbeddingSpMDMNBitLookup<
         unroll_factor = unroll_factor / 4 * 4;
 
         if (remainder) {
-          if (instSet == inst_set_t::avx2) {
+          if constexpr (instSet == inst_set_t::avx2) {
             a->vmovups(
                 mask_vreg,
                 x86::ymmword_ptr(
@@ -496,7 +496,7 @@ GenEmbeddingSpMDMNBitLookup<
         }
 
         if (remainder_32bit_granularity) {
-          if (instSet == inst_set_t::avx2) {
+          if constexpr (instSet == inst_set_t::avx2) {
             a->lea(
                 x86::rsp,
                 x86::dword_ptr(
@@ -548,7 +548,7 @@ GenEmbeddingSpMDMNBitLookup<
           a->jl(IfLengthsEnd);
 
           vec_reg_t temp_vreg0(0);
-          if (instSet == inst_set_t::avx2) {
+          if constexpr (instSet == inst_set_t::avx2) {
             a->mov(scratchReg1_, 1);
             a->cvtsi2ss(vlen_inv_vreg.xmm(), scratchReg1_);
             a->cvtsi2ss(temp_vreg0.xmm(), lengths_R_);
@@ -755,7 +755,7 @@ GenEmbeddingSpMDMNBitLookup<
             if (bit_rate == 4) {
               if (num_vec_regs_per_block - (vec_idx + v) < 4 &&
                   remainder_32bit_granularity) {
-                if (instSet == inst_set_t::avx512) {
+                if constexpr (instSet == inst_set_t::avx512) {
                   a->k(x86::k(2)).vmovups(src_vreg.ymm(), src_addr);
                 } else {
                   a->vpmaskmovd(src_vreg.xmm(), mask2_vreg.xmm(), src_addr);
@@ -765,7 +765,7 @@ GenEmbeddingSpMDMNBitLookup<
                 a->vpmovzxbw(src_vreg, src_addr);
               }
               a->vpslld(temp_vreg, src_vreg, asmjit::Imm(4));
-              if (instSet == inst_set_t::avx512) {
+              if constexpr (instSet == inst_set_t::avx512) {
                 a->vpord(src_vreg, src_vreg, temp_vreg);
                 a->vpandd(src_vreg, src_vreg, extract_mask_vreg);
               } else {
@@ -776,7 +776,7 @@ GenEmbeddingSpMDMNBitLookup<
             } else {
               if (num_vec_regs_per_block - (vec_idx + v) < 4 &&
                   remainder_32bit_granularity) {
-                if (instSet == inst_set_t::avx512) {
+                if constexpr (instSet == inst_set_t::avx512) {
                   a->k(x86::k(2)).vmovups(src_vreg.xmm(), src_addr);
                   a->vpmovzxbd(src_vreg, src_vreg.xmm());
                 } else {
@@ -788,13 +788,13 @@ GenEmbeddingSpMDMNBitLookup<
               }
               a->vpslld(temp_vreg, src_vreg, 2 * 8 + 2);
               a->vpslld(temp2_vreg, src_vreg, 8 + 4);
-              if (instSet == inst_set_t::avx512) {
+              if constexpr (instSet == inst_set_t::avx512) {
                 a->vpord(temp_vreg, temp_vreg, temp2_vreg);
               } else {
                 a->vpor(temp_vreg.ymm(), temp_vreg.ymm(), temp2_vreg.ymm());
               }
               a->vpslld(temp2_vreg, src_vreg, 6);
-              if (instSet == inst_set_t::avx512) {
+              if constexpr (instSet == inst_set_t::avx512) {
                 a->vpord(temp_vreg, temp_vreg, temp2_vreg);
                 a->vpord(src_vreg, temp_vreg, src_vreg);
                 a->vpandd(src_vreg, src_vreg, extract_mask_vreg);
@@ -817,11 +817,11 @@ GenEmbeddingSpMDMNBitLookup<
               if (i == 0) {
                 a->vpmovsxbd(temp_vreg, src_vreg.xmm());
                 // this is only needed for avx2
-                if (instSet == inst_set_t::avx2) {
+                if constexpr (instSet == inst_set_t::avx2) {
                   a->vmovups(temp2_vreg, src_vreg);
                 }
               } else {
-                if (instSet == inst_set_t::avx512) {
+                if constexpr (instSet == inst_set_t::avx512) {
                   // We could've used avx512_ymm for clock frequency advantage,
                   // if there's an instruction to extract a 64-bit portion from
                   // a YMM as an XMM register.
@@ -868,7 +868,7 @@ GenEmbeddingSpMDMNBitLookup<
 
             if constexpr (std::is_same_v<outType, float>) {
               if (remainder && vec_idx + v == num_vec_regs_per_block - 1) {
-                if (instSet == inst_set_t::avx512) {
+                if constexpr (instSet == inst_set_t::avx512) {
                   a->k(x86::k(1)).vmovups(dst_addr, out_vreg);
                 } else {
                   a->vmaskmovps(dst_addr, mask_vreg, out_vreg.ymm());
@@ -878,7 +878,7 @@ GenEmbeddingSpMDMNBitLookup<
               }
             } else {
               // 16-bit output
-              if (instSet == inst_set_t::avx2) {
+              if constexpr (instSet == inst_set_t::avx2) {
                 if (is_bf16_out) {
                   a->vpaddd(out_vreg, out_vreg, ones_vreg);
                   a->vpsrld(out_vreg, out_vreg, 16);
 
@@ -209,7 +209,7 @@ void fbgemmPacked(
 
 template <int SPATIAL_DIM>
 bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
-  if (SPATIAL_DIM == 1)
+  if constexpr (SPATIAL_DIM == 1)
     return false;
 
   int C_per_G = conv_p.IC / conv_p.G;
Original file line number	Diff line number	Diff line change
`@@ -230,42 +230,42 @@ static void performance_test(`
`230`	`230`	`const int NITER = repetitions;`
`231`	`231`
`232`	`232`	`string header = "MB, IC, OC, ";`
`233`		`- if (SPATIAL_DIM == 3) {`
	`233`	`+ if constexpr (SPATIAL_DIM == 3) {`
`234`	`234`	`header += "IT, ";`
`235`	`235`	`}`
`236`	`236`	`if (SPATIAL_DIM > 1) {`
`237`	`237`	`header += "IH, ";`
`238`	`238`	`}`
`239`	`239`	`header += "IW, G, ";`
`240`		`- if (SPATIAL_DIM == 3) {`
	`240`	`+ if constexpr (SPATIAL_DIM == 3) {`
`241`	`241`	`header += "KT, ";`
`242`	`242`	`}`
`243`	`243`	`if (SPATIAL_DIM > 1) {`
`244`	`244`	`header += "KH, ";`
`245`	`245`	`}`
`246`	`246`	`header += "KW, ";`
`247`		`- if (SPATIAL_DIM == 3) {`
	`247`	`+ if constexpr (SPATIAL_DIM == 3) {`
`248`	`248`	`header += "stride_t, ";`
`249`	`249`	`}`
`250`	`250`	`if (SPATIAL_DIM > 1) {`
`251`	`251`	`header += "stride_h, ";`
`252`	`252`	`}`
`253`	`253`	`header += "stride_w, ";`
`254`		`- if (SPATIAL_DIM == 3) {`
	`254`	`+ if constexpr (SPATIAL_DIM == 3) {`
`255`	`255`	`header += "pad_t, ";`
`256`	`256`	`}`
`257`	`257`	`if (SPATIAL_DIM > 1) {`
`258`	`258`	`header += "pad_h, ";`
`259`	`259`	`}`
`260`	`260`	`header += "pad_w, ";`
`261`		`- if (SPATIAL_DIM == 3) {`
	`261`	`+ if constexpr (SPATIAL_DIM == 3) {`
`262`	`262`	`header += "dilation_t, ";`
`263`	`263`	`}`
`264`	`264`	`if (SPATIAL_DIM > 1) {`
`265`	`265`	`header += "dilation_h, ";`
`266`	`266`	`}`
`267`	`267`	`header += "dilation_w, ";`
`268`		`- if (SPATIAL_DIM == 3) {`
	`268`	`+ if constexpr (SPATIAL_DIM == 3) {`
`269`	`269`	`header += "output_padding_t, ";`
`270`	`270`	`}`
`271`	`271`	`if (SPATIAL_DIM > 1) {`