Commit 6691bb6

Add top-p

1 parent 760675a commit 6691bb6

10 files changed: 105 additions & 1 deletion

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 9 additions & 0 deletions
@@ -818,6 +818,12 @@ xla::Shape GetTensorShape(const at::Tensor& tensor,
   return CreateComputationShapeFromTensor(tensor, &device);
 }

+at::Tensor TopPMask(const at::Tensor& input, float p, int64_t dim) {
+  auto result = tensor_methods::topp_mask(bridge::GetXlaTensor(input), p, dim,
+                                          /*stable=*/false);
+  return bridge::AtenFromXlaTensor(std::move(result));
+}
+
 py::dict GetMemoryInfo(const std::string& device_str) {
   runtime::ComputationClient::MemoryInfo mem_info;
   {
@@ -3008,6 +3014,9 @@ void InitXlaModuleBindings(py::module m) {
       [](std::string name, std::shared_ptr<const runtime::PjRtPlugin> plugin) {
         runtime::RegisterPjRtPlugin(name, plugin);
       });
+  m.def("_xla_topp_mask", [](const at::Tensor& input, float p, int64_t dim) {
+    return TopPMask(input, p, dim);
+  });
   py::class_<runtime::PjRtPlugin, PyPjRtPlugin,
              std::shared_ptr<runtime::PjRtPlugin>>(m, "PjRtPlugin")
       .def(py::init<>())
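
This exposes the op to Python as torch_xla._XLAC._xla_topp_mask, forwarding to tensor_methods::topp_mask with stable=false hard-coded. A minimal sketch of a direct call (hypothetical usage against a build of this commit; the input must already live on an XLA device so bridge::GetXlaTensor can resolve it, and since the lowering further down is still a stub, the result is not yet a real mask):

import torch
import torch_xla  # provides the _XLAC extension module
import torch_xla.core.xla_model as xm

logits = torch.randn(4, 16, device=xm.xla_device())
# Arguments: input tensor, p (nucleus mass), dim (axis to mask along).
mask = torch_xla._XLAC._xla_topp_mask(logits, 0.9, -1)

The experimental wrapper at the end of this commit (torch_xla/experimental/topp_mask.py) is the friendlier entry point; it validates p and defaults dim to the last axis.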

torch_xla/csrc/ops/topp_mask.cpp

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+#include "torch_xla/csrc/ops/topp_mask.h"
+
+#include "torch_xla/csrc/lowering_context.h"
+#include "torch_xla/csrc/ops/xla_ops.h"
+#include "torch_xla/csrc/xla_lower_util.h"
+
+namespace torch_xla {
+
+TopPMask::TopPMask(const torch::lazy::Value& input, float p, int64_t dim,
+                   bool stable)
+    : XlaNode(xla_topp_mask, {input}, GetXlaShape(input),
+              /*num_outputs=*/1, torch::lazy::MHash(p, dim, stable)),
+      p_(p),
+      dim_(dim),
+      stable_(stable) {}
+
+torch::lazy::NodePtr TopPMask::Clone(torch::lazy::OpList operands) const {
+  return torch_xla::MakeNode<TopPMask>(operands.at(0), p_, dim_, stable_);
+}
+
+XlaOpVector TopPMask::Lower(LoweringContext* loctx) const {
+  xla::XlaOp input = loctx->GetOutputOp(operand(0));
+  xla::XlaOp output = CreateTopPMask(input, p_, dim_, stable_);
+  return ReturnOp(output, loctx);
+}
+
+std::string TopPMask::ToString() const {
+  std::stringstream ss;
+  ss << XlaNode::ToString() << ", p=" << p_ << ", dim=" << dim_
+     << ", stable=" << stable_;
+  return ss.str();
+}
+
+}  // namespace torch_xla

torch_xla/csrc/ops/topp_mask.h

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+#ifndef XLA_TORCH_XLA_CSRC_OPS_TOPP_MASK_H_
+#define XLA_TORCH_XLA_CSRC_OPS_TOPP_MASK_H_
+
+#include "torch_xla/csrc/ir.h"
+
+namespace torch_xla {
+
+class TopPMask : public XlaNode {
+ public:
+  TopPMask(const torch::lazy::Value& input, float p, int64_t dim, bool stable);
+
+  std::string ToString() const override;
+
+  torch::lazy::NodePtr Clone(torch::lazy::OpList operands) const override;
+
+  XlaOpVector Lower(LoweringContext* loctx) const override;
+
+  float p() const { return p_; }
+
+  int64_t dim() const { return dim_; }
+
+  bool stable() const { return stable_; }
+
+ private:
+  float p_;
+  int64_t dim_;
+  bool stable_;
+};
+
+}  // namespace torch_xla
+
+#endif  // XLA_TORCH_XLA_CSRC_OPS_TOPP_MASK_H_

torch_xla/csrc/ops/xla_ops.cpp

Lines changed: 1 addition & 0 deletions
@@ -40,5 +40,6 @@ const OpKindWrapper xla_update_slice("xla::update_slice");
 const OpKindWrapper xla_custom_sharding("xla::custom_sharding");
 const OpKindWrapper xla_tpu_custom_call("xla::tpu_custom_call");
 const OpKindWrapper xla_gpu_custom_call("xla::gpu_custom_call");
+const OpKindWrapper xla_topp_mask("xla::topp_mask");

 }  // namespace torch_xla

torch_xla/csrc/ops/xla_ops.h

Lines changed: 2 additions & 1 deletion
@@ -65,7 +65,8 @@ extern const OpKindWrapper xla_update_slice;
 extern const OpKindWrapper xla_custom_sharding;
 extern const OpKindWrapper xla_tpu_custom_call;
 extern const OpKindWrapper xla_gpu_custom_call;
+extern const OpKindWrapper xla_topp_mask;

 }  // namespace torch_xla

-#endif  // XLA_TORCH_XLA_CSRC_OPS_XLA_OPS_H_
\ No newline at end of file
+#endif  // XLA_TORCH_XLA_CSRC_OPS_XLA_OPS_H_

torch_xla/csrc/tensor_methods.cpp

Lines changed: 9 additions & 0 deletions
@@ -132,6 +132,7 @@
 #include "torch_xla/csrc/ops/threshold.h"
 #include "torch_xla/csrc/ops/threshold_backward.h"
 #include "torch_xla/csrc/ops/topk.h"
+#include "torch_xla/csrc/ops/topp_mask.h"
 #include "torch_xla/csrc/ops/tpu_custom_call.h"
 #include "torch_xla/csrc/ops/triangular_solve.h"
 #include "torch_xla/csrc/ops/uniform.h"
@@ -3438,6 +3439,14 @@ std::tuple<XLATensorPtr, XLATensorPtr> topk(const XLATensorPtr& input,
   return std::make_tuple(t1, t2);
 }

+XLATensorPtr topp_mask(const XLATensorPtr& input, float p, int64_t dim,
+                       bool stable) {
+  return input->CreateFrom(torch_xla::MakeNode<TopPMask>(
+      input->GetIrValue(), p,
+      torch::lazy::GetCanonicalDimensionIndex(dim, input->shape().get().rank()),
+      stable));
+}
+
 XLATensorPtr trace(const XLATensorPtr& input) {
   auto input_shape_ref = input->shape();
   XLA_CHECK_EQ((*input_shape_ref).rank(), 2)
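
Note that tensor_methods::topp_mask canonicalizes dim with torch::lazy::GetCanonicalDimensionIndex before building the IR node, so a negative axis yields the same node (and hash) as its positive equivalent. A hypothetical Python rendering of that canonicalization, for illustration only:

def canonical_dim(dim, rank):
  # Wrap a negative axis once and bounds-check: valid dims lie in [-rank, rank).
  assert -rank <= dim < rank, "dim out of range"
  return dim + rank if dim < 0 else dim

assert canonical_dim(-1, 3) == 2  # last axis of a rank-3 tensor
assert canonical_dim(1, 3) == 1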

torch_xla/csrc/tensor_methods.h

Lines changed: 3 additions & 0 deletions
@@ -973,6 +973,9 @@ std::tuple<XLATensorPtr, XLATensorPtr> topk(const XLATensorPtr& input,
                                             bool largest, bool sorted,
                                             bool stable);

+XLATensorPtr topp_mask(const XLATensorPtr& input, float p, int64_t dim,
+                       bool stable);
+
 // Returns the sum of the elements of the diagonal of the input 2-D matrix.
 XLATensorPtr trace(const XLATensorPtr& input);
torch_xla/csrc/xla_lower_util.cpp

Lines changed: 5 additions & 0 deletions
@@ -390,6 +390,11 @@ std::vector<xla::XlaOp> CreateTopK(xla::XlaOp input, int64_t k, int64_t dim,
                                        xla::PrimitiveType::S64))};
 }

+xla::XlaOp CreateTopPMask(xla::XlaOp input, float p, int64_t dim, bool stable) {
+  // TODO: implement
+  return input;
+}
+
 xla::XlaOp CreateMatMul(xla::XlaOp lhs, xla::XlaOp rhs) {
   // Expand cases in https://pytorch.org/docs/stable/torch.html#torch.matmul
   xla::Shape lhs_shape = ShapeHelper::ShapeOfXlaOp(lhs);
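
CreateTopPMask is intentionally left as a stub in this commit. For reference, here is a sketch of the semantics the lowering would presumably reproduce — top-p (nucleus) masking, which keeps the smallest set of entries whose softmax mass reaches p — written as an eager PyTorch reference rather than XLA builder calls; the function name and exact formulation are illustrative, not part of this commit:

import torch

def topp_mask_reference(logits, p, dim=-1):
  # Probabilities sorted along `dim`, largest first.
  probs = torch.softmax(logits, dim=dim)
  sorted_probs, sorted_idx = probs.sort(dim=dim, descending=True)
  # Mass strictly before each position; keep a position while that mass is
  # still < p, so the kept set is the smallest one whose total reaches p.
  cum = sorted_probs.cumsum(dim=dim)
  keep_sorted = (cum - sorted_probs) < p
  # Scatter the sorted mask back to the original element order.
  keep = torch.zeros_like(probs).scatter(dim, sorted_idx,
                                         keep_sorted.to(probs.dtype))
  return keep > 0

An XLA lowering would express the same computation with sort, cumulative-sum, and select ops; the stable flag threaded through these signatures would make the sort deterministic for tied logits.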

torch_xla/csrc/xla_lower_util.h

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,8 @@ std::vector<xla::XlaOp> CreateKthValue(xla::XlaOp input, int64_t k, int64_t dim,
 std::vector<xla::XlaOp> CreateTopK(xla::XlaOp input, int64_t k, int64_t dim,
                                    bool largest, bool stable);

+xla::XlaOp CreateTopPMask(xla::XlaOp input, float p, int64_t dim, bool stable);
+
 xla::XlaOp CreateMatMul(xla::XlaOp lhs, xla::XlaOp rhs);

 xla::XlaOp BuildMatMul(xla::XlaOp lhs, xla::XlaOp rhs, xla::XlaOp bias);

torch_xla/experimental/topp_mask.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import torch_xla
+
+def topp_mask(logits, p, dim=None):
+  assert 0 <= p <= 1.0, "p must be in [0, 1]."
+  if dim is None:
+    dim = -1
+  return torch_xla._XLAC._xla_topp_mask(logits, p, dim)
+
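
A hypothetical end-to-end use of this wrapper (assuming the module is importable from the path above; as noted, CreateTopPMask is still a TODO stub at this commit, so the call compiles and runs but returns its input unchanged):

import torch
import torch_xla.core.xla_model as xm
from torch_xla.experimental.topp_mask import topp_mask

device = xm.xla_device()
logits = torch.randn(2, 32000, device=device)
mask = topp_mask(logits, p=0.9)  # dim defaults to the last axis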
