Improve validation of input tensors to guard against the case where inputs come from different devices (#1615)

pls331 · facebook-github-bot · commit 3da46da1e529 · 2023-02-27T11:34:54.000-08:00
Summary: Pull Request resolved: #1615 Reviewed By: jianyuh, houseroad Differential Revision: D43564925 fbshipit-source-id: 9d8db49df76889e56f70ebb3fb4984c292186edc
diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu
@@ -620,20 +620,20 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
     const int64_t fp8_exponent_bias
 ) {
     TENSOR_ON_CUDA_GPU(dev_weights);
-    TENSOR_ON_CUDA_GPU(uvm_weights);
-    TENSOR_ON_CUDA_GPU(weights_placements);
-    TENSOR_ON_CUDA_GPU(weights_offsets);
-    TENSOR_ON_CUDA_GPU(weights_tys);
+    TENSORS_ON_SAME_DEVICE(uvm_weights, dev_weights);
+    TENSORS_ON_SAME_DEVICE(weights_placements, dev_weights);
+    TENSORS_ON_SAME_DEVICE(weights_offsets, dev_weights);
+    TENSORS_ON_SAME_DEVICE(weights_tys, dev_weights);
     {% if not nobag %}
-    TENSOR_ON_CUDA_GPU(D_offsets);
+    TENSORS_ON_SAME_DEVICE(D_offsets, dev_weights);
     {% endif %}
-    TENSOR_ON_CUDA_GPU(indices);
-    TENSOR_ON_CUDA_GPU(offsets);
+    TENSORS_ON_SAME_DEVICE(indices, dev_weights);
+    TENSORS_ON_SAME_DEVICE(offsets, dev_weights);
     {% if weighted %}
-    TENSOR_EMPTY_OR_ON_CUDA_GPU(indice_weights);
+    TENSORS_EMPTY_OR_ON_SAME_DEVICE(indice_weights, dev_weights);
     {% endif %}
-    TENSOR_EMPTY_OR_ON_CUDA_GPU(lxu_cache_weights);
-    TENSOR_EMPTY_OR_ON_CUDA_GPU(lxu_cache_locations);
+    TENSORS_EMPTY_OR_ON_SAME_DEVICE(lxu_cache_weights, dev_weights);
+    TENSORS_EMPTY_OR_ON_SAME_DEVICE(lxu_cache_locations, dev_weights);
 
     at::cuda::OptionalCUDAGuard device_guard;
     device_guard.set_index(dev_weights.get_device());
diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h
@@ -111,6 +111,12 @@ inline bool torch_tensor_empty_or_on_cpu_check(
       #x " must be empty or a CUDA tensor; it is currently on device ", \
       torch_tensor_device_name(x))
 
+#define TENSORS_EMPTY_OR_ON_SAME_DEVICE(x, y)                           \
+  TORCH_CHECK( /* passes if x is empty or on the same device as y */    \
+      torch_tensor_on_same_device_check(x, y) || ((x).numel() == 0),    \
+      #x " must be empty or on the same device as " #y "; it is on ",   \
+      torch_tensor_device_name(x))
+
 #define TENSORS_ON_SAME_DEVICE(x, y)                                       \
   TORCH_CHECK(                                                             \
       torch_tensor_on_same_device_check(x, y),                             \