fix: L0_sequence_batcher_cudashm (#7852)

oandreeva-nv · web-flow · commit 83d0e30845c9 · 2024-12-04T11:46:36.000-08:00
diff --git a/src/test/sequence/CMakeLists.txt b/src/test/sequence/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -43,7 +43,7 @@ add_library(
   TritonSequenceBackend::triton-sequence-backend ALIAS triton-sequence-backend
 )
 
-target_compile_features(triton-sequence-backend PRIVATE cxx_std_11)
+target_compile_features(triton-sequence-backend PRIVATE cxx_std_17)
 target_compile_options(
   triton-sequence-backend PRIVATE
   $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
diff --git a/src/test/sequence/src/sequence.cc b/src/test/sequence/src/sequence.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -847,9 +847,15 @@ TRITONBACKEND_ModelInstanceExecute(
     if (input_memory_type == TRITONSERVER_MEMORY_GPU) {
       ipbuffer_vec.resize(input_element_cnt);
       ipbuffer_int = ipbuffer_vec.data();
-      cudaMemcpy(
-          const_cast<int32_t*>(ipbuffer_int), input_buffer, input_byte_size,
-          cudaMemcpyDeviceToHost);
+      LOG_IF_CUDA_ERROR(
+          cudaMemcpyAsync(
+              const_cast<int32_t*>(ipbuffer_int), input_buffer, input_byte_size,
+              cudaMemcpyDeviceToHost, instance_state->CudaStream()),
+          "failed to copy buffer from Device to Host");
+
+      LOG_IF_CUDA_ERROR(
+          cudaStreamSynchronize(instance_state->CudaStream()),
+          "failed to perform synchronization on cuda stream");
     } else {
       ipbuffer_int = reinterpret_cast<const int32_t*>(input_buffer);
     }
@@ -939,9 +945,15 @@ TRITONBACKEND_ModelInstanceExecute(
         }
 
         if (output_memory_type == TRITONSERVER_MEMORY_GPU) {
-          cudaMemcpy(
-              output_buffer, const_cast<int32_t*>(obuffer_int),
-              buffer_byte_size, cudaMemcpyHostToDevice);
+          LOG_IF_CUDA_ERROR(
+              cudaMemcpyAsync(
+                  output_buffer, const_cast<int32_t*>(obuffer_int),
+                  buffer_byte_size, cudaMemcpyHostToDevice,
+                  instance_state->CudaStream()),
+              "failed to copy buffer from Host to Device");
+          LOG_IF_CUDA_ERROR(
+              cudaStreamSynchronize(instance_state->CudaStream()),
+              "failed to perform synchronization on cuda stream");
         }
       }
     }

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
	`1`	`+# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
`2`	`2`	`#`
`3`	`3`	`# Redistribution and use in source and binary forms, with or without`
`4`	`4`	`# modification, are permitted provided that the following conditions`
`@@ -43,7 +43,7 @@ add_library(`
`43`	`43`	`TritonSequenceBackend::triton-sequence-backend ALIAS triton-sequence-backend`
`44`	`44`	`)`
`45`	`45`
`46`		`-target_compile_features(triton-sequence-backend PRIVATE cxx_std_11)`
	`46`	`+target_compile_features(triton-sequence-backend PRIVATE cxx_std_17)`
`47`	`47`	`target_compile_options(`
`48`	`48`	`triton-sequence-backend PRIVATE`
`49`	`49`	`$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:`