Commit b18284b

[Serving] Enable GPU Sampling (#2368)
enable gpu sampling
1 parent 135419e · commit b18284b

4 files changed: +15 −6 lines changed

cpp/serve/sampler/gpu_sampler.cc

Lines changed: 4 additions & 2 deletions
@@ -545,7 +545,8 @@ class GPUSampler : public SamplerObj {
     if (!need_top_p && !need_prob_values) {
       // - Short path: If top_p and prob values are not needed, we directly sample from multinomial.
       SyncCopyStream(device_, compute_stream_, copy_stream_);
-      if (flashinfer_multinomial_sample_func_ != nullptr) {
+      if (device_.device_type == DLDeviceType::kDLCUDA &&
+          flashinfer_multinomial_sample_func_ != nullptr) {
         sampled_token_ids_device =
             sampled_token_ids_device_.CreateView({sample_indices_device->shape[0]}, dtype_i32_);
         (*flashinfer_multinomial_sample_func_)(probs_on_device, uniform_samples_device,
@@ -588,7 +589,8 @@ class GPUSampler : public SamplerObj {
                                              uniform_samples_device, sample_indices_device, top_p_device);
     } else {
       // - Sample without top_p.
-      if (flashinfer_multinomial_sample_func_ != nullptr) {
+      if (device_.device_type == DLDeviceType::kDLCUDA &&
+          flashinfer_multinomial_sample_func_ != nullptr) {
         sampled_token_ids_device =
             sampled_token_ids_device_.CreateView({sample_indices_device->shape[0]}, dtype_i32_);
         (*flashinfer_multinomial_sample_func_)(probs_on_device, uniform_samples_device,
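Note on the two hunks above: the FlashInfer sampling function is a CUDA-only fast path, so it is now additionally gated on the device type; on other GPU-sampling devices (Vulkan after this change) the sampler takes the non-FlashInfer branch instead. A minimal sketch of that decision in Python terms, where pick_multinomial_path is an illustrative helper rather than anything in the codebase:

# Illustrative sketch only: mirrors the device gate added in gpu_sampler.cc using
# TVM's Python device handles. `pick_multinomial_path` is hypothetical.
import tvm

def pick_multinomial_path(device: tvm.runtime.Device, has_flashinfer_func: bool) -> str:
    """Describe which multinomial sampling path the GPU sampler would take."""
    if device.device_type == tvm.cuda().device_type and has_flashinfer_func:
        return "flashinfer fast path"     # CUDA-only packed function
    return "generic sampling kernel"      # used on Vulkan, or on CUDA without FlashInfer

print(pick_multinomial_path(tvm.cuda(0), has_flashinfer_func=True))    # flashinfer fast path
print(pick_multinomial_path(tvm.vulkan(0), has_flashinfer_func=True))  # generic sampling kernel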

cpp/serve/sampler/sampler.h

Lines changed: 2 additions & 1 deletion
@@ -140,7 +140,8 @@ class Sampler : public ObjectRef {
 
   /*! \brief Check if the given device supports GPU sampling. */
   static bool SupportGPUSampler(Device device) {
-    return device.device_type == DLDeviceType::kDLCUDA;
+    return device.device_type == DLDeviceType::kDLCUDA ||
+           device.device_type == DLDeviceType::kDLVulkan;
   }
 
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Sampler, ObjectRef, SamplerObj);
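With this change Sampler::SupportGPUSampler reports true for Vulkan devices as well as CUDA. A rough Python rendering of the same predicate, using TVM's device handles (the function name below is illustrative):

# Illustrative only: Python equivalent of Sampler::SupportGPUSampler after this
# commit. DLPack device-type codes: kDLCUDA == 2, kDLVulkan == 7.
import tvm

def support_gpu_sampler(device: tvm.runtime.Device) -> bool:
    return device.device_type in (tvm.cuda().device_type, tvm.vulkan().device_type)

assert support_gpu_sampler(tvm.cuda(0))
assert support_gpu_sampler(tvm.vulkan(0))
assert not support_gpu_sampler(tvm.cpu(0))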

python/mlc_llm/compiler_pass/attach_sampler.py

Lines changed: 8 additions & 3 deletions
@@ -28,7 +28,7 @@ def __init__(self, target: tvm.target.Target, variable_bounds: Dict[str, int]):
 
     def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
         """Entrypoint"""
-        if str(self.target.kind) != "cuda":
+        if str(self.target.kind) not in ["cuda", "vulkan"]:
             # Only enable GPU sampling for CUDA.
             return mod
 
@@ -87,7 +87,11 @@ def _attach_multinomial_sampling_func(bb: relax.BlockBuilder):
             name="sample_indices",
         )
         result_tensor = nn.multinomial_from_uniform(  # pylint:disable=too-many-function-args
-            probs_tensor, uniform_samples_tensor, sample_indices_tensor, "int32"
+            probs_tensor,
+            uniform_samples_tensor,
+            sample_indices_tensor,
+            "int32",
+            name="nn_multinomial_from_uniform",
         )
         result = bb.emit(
             relax.call_pure_packed(
@@ -97,7 +101,8 @@ def _attach_multinomial_sampling_func(bb: relax.BlockBuilder):
                 sinfo_args=sample_indices.struct_info,  # pylint: disable=no-member
             )
         )
-        gv = bb.emit_func_output(result)
+        output = bb.emit_output(result)
+        gv = bb.emit_func_output(output)
         return gv
 
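Two things change in this pass: the target gate now admits Vulkan in addition to CUDA, and the multinomial sampling call is given an explicit name and routed through bb.emit_output before the function output. A small sketch of the broadened target check in isolation (the helper name is illustrative, not part of the pass):

# Illustrative only: the same target-kind check transform_module now applies
# before attaching the GPU sampling functions.
import tvm

def gpu_sampling_enabled(target: tvm.target.Target) -> bool:
    return str(target.kind) in ["cuda", "vulkan"]

print(gpu_sampling_enabled(tvm.target.Target("cuda")))    # True
print(gpu_sampling_enabled(tvm.target.Target("vulkan")))  # True
print(gpu_sampling_enabled(tvm.target.Target("llvm")))    # False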

python/mlc_llm/compiler_pass/pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -121,6 +121,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I
                 _DebugDump("debug-phase1.py", debug_dump, show_meta=False),
                 # Phase 2. Lowering to TIR, inherited TVM Relax's official "zero" pipeline
                 _LogProgress("Lowering to TVM TIR kernels"),
+                tvm.relax.backend.DispatchSampling(),
                 tvm.relax.backend.DispatchSortScan(),
                 tvm.relax.transform.LegalizeOps(),
                 tvm.relax.transform.AnnotateTIROpPattern(),
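The pipeline gains a single pass: tvm.relax.backend.DispatchSampling() now runs just before the sort/scan dispatch and legalization passes, so Relax sampling ops can be dispatched to backend-specific lowerings during phase 2. A sketch of that ordering run as a standalone sequence, assuming mod is the Relax IRModule produced by the earlier phases:

# Illustrative only: the phase-2 lowering order used by pipeline.py after this
# change, expressed as a standalone Sequential over an existing Relax IRModule.
import tvm

lowering = tvm.transform.Sequential(
    [
        tvm.relax.backend.DispatchSampling(),   # newly added: lower sampling ops
        tvm.relax.backend.DispatchSortScan(),
        tvm.relax.transform.LegalizeOps(),
        tvm.relax.transform.AnnotateTIROpPattern(),
    ]
)
# mod = lowering(mod)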
