Skip to content

Commit caef132

Browse files
author
Jakub Szuppe
committed
Merge branch 'update_rocprim_docs' into 'master'
Update rocPRIM docs See merge request amd/rocPRIM!176
2 parents acdfad7 + 3a2e9ad commit caef132

12 files changed

+71
-19
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ cmake_minimum_required(VERSION 3.5.1 FATAL_ERROR)
2626
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")
2727

2828
# rocPRIM project
29-
project(rocprim VERSION 0.3.2.0 LANGUAGES CXX)
29+
project(rocprim VERSION 1.0.0.0 LANGUAGES CXX)
3030

3131
# CMake modules
3232
list(APPEND CMAKE_MODULE_PATH

README.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
# Warning (pre-production state)
2-
3-
rocPRIM is in its pre-production state and should be used for development purposes only.
4-
51
# rocPRIM
62

73
The rocPRIM is a header-only library providing HIP and HC parallel primitives for developing

rocprim/include/rocprim/device/config_types.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,13 @@ struct default_config { };
3939
/// \brief Configuration of particular kernels launched by device-level operation
4040
///
4141
/// \tparam BlockSize - number of threads in a block.
42-
/// \tparam ItemsPerThread - number of items in processed by each thread.
42+
/// \tparam ItemsPerThread - number of items processed by each thread.
4343
template<unsigned int BlockSize, unsigned int ItemsPerThread>
4444
struct kernel_config
4545
{
46-
/// value of BlockSize - number of threads in a block.
46+
/// \brief Number of threads in a block.
4747
static constexpr unsigned int block_size = BlockSize;
48-
/// value of ItemsPerThread - number of items in processed by each thread.
48+
/// \brief Number of items processed by each thread.
4949
static constexpr unsigned int items_per_thread = ItemsPerThread;
5050
};
5151

rocprim/include/rocprim/device/device_radix_sort_config.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,14 @@ template<
5454
>
5555
struct radix_sort_config
5656
{
57-
#ifndef DOXYGEN_SHOULD_SKIP_THIS
57+
/// \brief Number of bits in long iterations.
5858
static constexpr unsigned int long_radix_bits = LongRadixBits;
59+
/// \brief Number of bits in short iterations.
5960
static constexpr unsigned int short_radix_bits = ShortRadixBits;
61+
/// \brief Configuration of digits scan kernel.
6062
using scan = ScanConfig;
63+
/// \brief Configuration of radix sort kernel.
6164
using sort = SortConfig;
62-
#endif
6365
};
6466

6567
namespace detail

rocprim/include/rocprim/device/device_reduce_by_key_config.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ template<
4343
>
4444
struct reduce_by_key_config
4545
{
46+
/// \brief Configuration of carry-outs scan kernel.
4647
using scan = ScanConfig;
48+
/// \brief Configuration of the main reduce-by-key kernel.
4749
using reduce = ReduceConfig;
4850
};
4951

rocprim/include/rocprim/device/device_reduce_config.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ BEGIN_ROCPRIM_NAMESPACE
3838
/// \brief Configuration of device-level reduce primitives.
3939
///
4040
/// \tparam BlockSize - number of threads in a block.
41-
/// \tparam ItemsPerThread - number of items in processed by each thread.
41+
/// \tparam ItemsPerThread - number of items processed by each thread.
4242
/// \tparam BlockReduceMethod - algorithm for block reduce.
4343
template<
4444
unsigned int BlockSize,
@@ -47,9 +47,11 @@ template<
4747
>
4848
struct reduce_config
4949
{
50+
/// \brief Number of threads in a block.
5051
static constexpr unsigned int block_size = BlockSize;
52+
/// \brief Number of items processed by each thread.
5153
static constexpr unsigned int items_per_thread = ItemsPerThread;
52-
54+
/// \brief Algorithm for block reduce.
5355
static constexpr block_reduce_algorithm block_reduce_method = BlockReduceMethod;
5456
};
5557

rocprim/include/rocprim/device/device_run_length_encode_config.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ template<
4545
>
4646
struct run_length_encode_config
4747
{
48+
/// \brief Configuration of device-level reduce-by-key operation.
4849
using reduce_by_key = ReduceByKeyConfig;
50+
/// \brief Configuration of device-level select operation.
4951
using select = SelectConfig;
5052
};
5153

rocprim/include/rocprim/device/device_scan_config.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ BEGIN_ROCPRIM_NAMESPACE
4040
/// \brief Configuration of device-level scan primitives.
4141
///
4242
/// \tparam BlockSize - number of threads in a block.
43-
/// \tparam ItemsPerThread - number of items in processed by each thread.
43+
/// \tparam ItemsPerThread - number of items processed by each thread.
4444
/// \tparam BlockLoadMethod - method for loading input values.
4545
/// \tparam BlockStoreMethod - method for storing values.
4646
/// \tparam BlockScanMethod - algorithm for block scan.
@@ -53,11 +53,15 @@ template<
5353
>
5454
struct scan_config
5555
{
56+
/// \brief Number of threads in a block.
5657
static constexpr unsigned int block_size = BlockSize;
58+
/// \brief Number of items processed by each thread.
5759
static constexpr unsigned int items_per_thread = ItemsPerThread;
58-
60+
/// \brief Method for loading input values.
5961
static constexpr block_load_method block_load_method = BlockLoadMethod;
62+
/// \brief Method for storing values.
6063
static constexpr block_store_method block_store_method = BlockStoreMethod;
64+
/// \brief Algorithm for block scan.
6165
static constexpr block_scan_algorithm block_scan_method = BlockScanMethod;
6266
};
6367

rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,12 @@ template<
5252
>
5353
struct segmented_radix_sort_config
5454
{
55-
#ifndef DOXYGEN_SHOULD_SKIP_THIS
55+
/// \brief Number of bits in long iterations.
5656
static constexpr unsigned int long_radix_bits = LongRadixBits;
57+
/// \brief Number of bits in short iterations.
5758
static constexpr unsigned int short_radix_bits = ShortRadixBits;
59+
/// \brief Configuration of radix sort kernel.
5860
using sort = SortConfig;
59-
#endif
6061
};
6162

6263
namespace detail

rocprim/include/rocprim/device/device_select_config.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ BEGIN_ROCPRIM_NAMESPACE
3939
/// \brief Configuration of device-level select operation.
4040
///
4141
/// \tparam BlockSize - number of threads in a block.
42-
/// \tparam ItemsPerThread - number of items in processed by each thread.
42+
/// \tparam ItemsPerThread - number of items processed by each thread.
4343
/// \tparam ValueBlockLoadMethod - method for loading input values.
4444
/// \tparam FlagBlockLoadMethod - method for loading flag values.
4545
/// \tparam BlockScanMethod - algorithm for block scan.
@@ -52,12 +52,15 @@ template<
5252
>
5353
struct select_config
5454
{
55+
/// \brief Number of threads in a block.
5556
static constexpr unsigned int block_size = BlockSize;
57+
/// \brief Number of items processed by each thread.
5658
static constexpr unsigned int items_per_thread = ItemsPerThread;
57-
59+
/// \brief Method for loading input values.
5860
static constexpr block_load_method value_block_load_method = ValueBlockLoadMethod;
61+
/// \brief Method for loading flag values.
5962
static constexpr block_load_method flag_block_load_method = FlagBlockLoadMethod;
60-
63+
/// \brief Algorithm for block scan.
6164
static constexpr block_scan_algorithm block_scan_method = BlockScanMethod;
6265
};
6366

rocprim/include/rocprim/warp/warp_reduce.hpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,25 @@ class warp_reduce
323323
base_type::reduce(input, output, valid_items, storage, reduce_op);
324324
}
325325

326+
/// \brief Performs head-segmented reduction across threads in a logical warp.
327+
///
328+
/// \tparam Flag - type of head flags. Must be contextually convertible to \p bool.
329+
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
330+
/// is rocprim::plus<T>.
331+
///
332+
/// \param [in] input - thread input value.
333+
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
334+
/// \param [in] flag - thread head flag, \p true flags mark beginnings of segments.
335+
/// \param [in] storage - reference to a temporary storage object of type storage_type.
336+
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
337+
/// The signature of the function should be equivalent to the following:
338+
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
339+
/// <tt>const &</tt>, but the function object must not modify the objects passed to it.
340+
///
341+
/// \par Storage reusage
342+
/// Synchronization barrier should be placed before \p storage is reused
343+
/// or repurposed: \p __syncthreads() in HIP, \p tile_barrier::wait() in HC, or
344+
/// universal rocprim::syncthreads().
326345
template<class Flag, class BinaryFunction = ::rocprim::plus<T>>
327346
ROCPRIM_DEVICE inline
328347
void head_segmented_reduce(T input,
@@ -334,6 +353,25 @@ class warp_reduce
334353
base_type::head_segmented_reduce(input, output, flag, storage, reduce_op);
335354
}
336355

356+
/// \brief Performs tail-segmented reduction across threads in a logical warp.
357+
///
358+
/// \tparam Flag - type of tail flags. Must be contextually convertible to \p bool.
359+
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
360+
/// is rocprim::plus<T>.
361+
///
362+
/// \param [in] input - thread input value.
363+
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
364+
/// \param [in] flag - thread tail flag, \p true flags mark ends of segments.
365+
/// \param [in] storage - reference to a temporary storage object of type storage_type.
366+
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
367+
/// The signature of the function should be equivalent to the following:
368+
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
369+
/// <tt>const &</tt>, but the function object must not modify the objects passed to it.
370+
///
371+
/// \par Storage reusage
372+
/// Synchronization barrier should be placed before \p storage is reused
373+
/// or repurposed: \p __syncthreads() in HIP, \p tile_barrier::wait() in HC, or
374+
/// universal rocprim::syncthreads().
337375
template<class Flag, class BinaryFunction = ::rocprim::plus<T>>
338376
ROCPRIM_DEVICE inline
339377
void tail_segmented_reduce(T input,

rocprim/include/rocprim/warp/warp_scan.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,12 +734,14 @@ class warp_scan
734734
return base_type::broadcast(input, src_lane, storage);
735735
}
736736

737+
#ifndef DOXYGEN_SHOULD_SKIP_THIS
737738
protected:
738739
ROCPRIM_DEVICE inline
739740
void to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage)
740741
{
741742
return base_type::to_exclusive(inclusive_input, exclusive_output, storage);
742743
}
744+
#endif
743745
};
744746

745747
END_ROCPRIM_NAMESPACE

0 commit comments

Comments
 (0)