@@ -23,6 +23,21 @@ THE SOFTWARE.
23
23
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
24
24
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
25
25
26
+ // --------------------- CONSTANTS ------------------------
27
+ // add neutral elements
28
+ __constant int neutral_add_int = 0 ;
29
+ __constant float neutral_add_float = 0 ;
30
+ __constant float3 neutral_add_float3 = (float3 )(0.0 , 0.0 , 0.0 );
31
+ // max neutral elements
32
+ __constant int neutral_max_int = INT_MIN ;
33
+ __constant float neutral_max_float = FLT_MIN ;
34
+ __constant float3 neutral_max_float3 = (float3 )(FLT_MIN , FLT_MIN , FLT_MIN );
35
+ // min neutral elements
36
+ __constant int neutral_min_int = INT_MAX ;
37
+ __constant float neutral_min_float = FLT_MAX ;
38
+ __constant float3 neutral_min_float3 = (float3 )(FLT_MAX , FLT_MAX , FLT_MAX );
39
+
40
+ __constant float epsilon = .00001f ;
26
41
27
42
// --------------------- HELPERS ------------------------
28
43
//#define INT_MAX 0x7FFFFFFF
@@ -1492,3 +1507,196 @@ __kernel void segmented_distribute_part_sum_int_nocut(
1492
1507
}
1493
1508
}
1494
1509
}
1510
+
1511
+ // --------------------- ATOMIC OPERTIONS ------------------------
1512
+
1513
+ #define DEFINE_ATOMIC (operation )\
1514
+ inline void atomic_##operation##_float(volatile __global float* addr, float value)\
1515
+ {\
1516
+ union{\
1517
+ unsigned int u32;\
1518
+ float f32;\
1519
+ } next, expected, current;\
1520
+ current.f32 = *addr;\
1521
+ do{\
1522
+ expected.f32 = current.f32;\
1523
+ next.f32 = operation(expected.f32, value);\
1524
+ current.u32 = atomic_cmpxchg((volatile __global unsigned int *)addr,\
1525
+ expected.u32, next.u32);\
1526
+ } while (current.u32 != expected.u32);\
1527
+ }
1528
+
1529
+ #define DEFINE_ATOMIC_FLOAT3 (operation )\
1530
+ inline void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\
1531
+ {\
1532
+ volatile __global float* p = (volatile __global float*)addr;\
1533
+ atomic_##operation##_float(p, value.x);\
1534
+ atomic_##operation##_float(p + 1, value.y);\
1535
+ atomic_##operation##_float(p + 2, value.z);\
1536
+ }
1537
+
1538
+ inline void atomic_max_int (volatile __global int * addr , int value )
1539
+ {
1540
+ atomic_max (addr , value );
1541
+ }
1542
+
1543
+ inline void atomic_min_int (volatile __global int * addr , int value )
1544
+ {
1545
+ atomic_min (addr , value );
1546
+ }
1547
+
1548
+ // --------------------- HELPERS ------------------------
1549
+
1550
+ #define DEFINE_ASSIGN_OPERATOR (type )\
1551
+ inline void assign_##type(__local type* addr, type value)\
1552
+ {\
1553
+ *addr = value;\
1554
+ }
1555
+
1556
+ inline void assign_float3 (__local float3 * addr , float3 value )
1557
+ {
1558
+ (* addr ).xyz = value .xyz ;
1559
+ }
1560
+
1561
+ inline int divide_int (int dividend , int divider )
1562
+ {
1563
+ return dividend / (divider != 0 ? divider : 1 );
1564
+ }
1565
+
1566
+ inline float divide_float (float dividend , float divider )
1567
+ {
1568
+ return dividend / (fabs (divider ) > epsilon ? divider : 1.f );
1569
+ }
1570
+
1571
+ inline float3 divide_float3 (float3 dividend , float3 divider )
1572
+ {
1573
+ return (float3 )(divide_float (dividend .x , divider .x ),
1574
+ divide_float (dividend .y , divider .y ),
1575
+ divide_float (dividend .z , divider .z ));
1576
+ }
1577
+
1578
+ // --------------------- REDUCTION ------------------------
1579
+
1580
+ #define DEFINE_REDUCTION (bin_op , type )\
1581
+ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
1582
+ int buf_count,\
1583
+ __local type* shared_mem,\
1584
+ __global type* out)\
1585
+ {\
1586
+ int global_id = get_global_id(0);\
1587
+ int group_id = get_group_id(0);\
1588
+ int local_id = get_local_id(0);\
1589
+ int group_size = get_local_size(0);\
1590
+ \
1591
+ if (global_id < buf_count)\
1592
+ {\
1593
+ assign_##type(shared_mem + local_id, buffer[global_id]);\
1594
+ }\
1595
+ else\
1596
+ {\
1597
+ assign_##type(shared_mem + local_id, neutral_##bin_op##_##type);\
1598
+ }\
1599
+ \
1600
+ if (global_id == 0)\
1601
+ {\
1602
+ *out = neutral_##bin_op##_##type;\
1603
+ }\
1604
+ \
1605
+ barrier(CLK_LOCAL_MEM_FENCE);\
1606
+ for (int i = group_size / 2; i > 0; i >>= 1)\
1607
+ {\
1608
+ if (local_id < i)\
1609
+ {\
1610
+ assign_##type(shared_mem + local_id,\
1611
+ bin_op(shared_mem[local_id], shared_mem[local_id + i]));\
1612
+ }\
1613
+ barrier(CLK_LOCAL_MEM_FENCE);\
1614
+ }\
1615
+ \
1616
+ if (local_id == 0)\
1617
+ {\
1618
+ atomic_##bin_op##_##type(out, shared_mem[0]);\
1619
+ }\
1620
+ }
1621
+
1622
+ // --------------------- NORMALIZATION ------------------------
1623
+
1624
+ #define DEFINE_BUFFER_NORMALIZATION (type )\
1625
+ __kernel void buffer_normalization_##type(__global type* input,\
1626
+ __global type* output,\
1627
+ int buffer_count,\
1628
+ __local type* shared_mem,\
1629
+ __global type* auxiliary_buf)\
1630
+ {\
1631
+ int global_id = get_global_id(0);\
1632
+ int group_id = get_group_id(0);\
1633
+ int local_id = get_local_id(0);\
1634
+ int group_size = get_local_size(0);\
1635
+ \
1636
+ __local type* min_buffer = shared_mem;\
1637
+ __local type* max_buffer = shared_mem + group_size;\
1638
+ \
1639
+ if (global_id < buffer_count)\
1640
+ {\
1641
+ min_buffer[local_id] = input[global_id];\
1642
+ max_buffer[local_id] = input[global_id];\
1643
+ }\
1644
+ else\
1645
+ {\
1646
+ min_buffer[local_id] = neutral_min_##type;\
1647
+ max_buffer[local_id] = neutral_max_##type;\
1648
+ }\
1649
+ \
1650
+ if (global_id == 0)\
1651
+ {\
1652
+ auxiliary_buf[0] = neutral_min_##type;\
1653
+ auxiliary_buf[1] = neutral_max_##type;\
1654
+ }\
1655
+ \
1656
+ barrier(CLK_LOCAL_MEM_FENCE);\
1657
+ \
1658
+ for (int i = group_size / 2; i > 0; i >>= 1)\
1659
+ {\
1660
+ if (local_id < i)\
1661
+ {\
1662
+ assign_##type(min_buffer + local_id,\
1663
+ min(min_buffer[local_id], min_buffer[local_id + i]));\
1664
+ assign_##type(max_buffer + local_id,\
1665
+ max(max_buffer[local_id], max_buffer[local_id + i]));\
1666
+ }\
1667
+ barrier(CLK_LOCAL_MEM_FENCE);\
1668
+ }\
1669
+ \
1670
+ if (local_id == 0)\
1671
+ {\
1672
+ atomic_min_##type(auxiliary_buf, min_buffer[0]);\
1673
+ atomic_max_##type(auxiliary_buf + 1, max_buffer[0]);\
1674
+ }\
1675
+ \
1676
+ barrier(CLK_LOCAL_MEM_FENCE);\
1677
+ type diff = auxiliary_buf[1] - auxiliary_buf[0];\
1678
+ if (global_id < buffer_count)\
1679
+ {\
1680
+ output[global_id] = divide_##type(input[global_id], diff);\
1681
+ }\
1682
+ }
1683
+
1684
+ // Do not change the order
1685
+ DEFINE_ATOMIC (min )
1686
+ DEFINE_ATOMIC (max )
1687
+ DEFINE_ATOMIC_FLOAT3 (min )
1688
+ DEFINE_ATOMIC_FLOAT3 (max )
1689
+
1690
+ DEFINE_ASSIGN_OPERATOR (int )
1691
+ DEFINE_ASSIGN_OPERATOR (float )
1692
+
1693
+ DEFINE_REDUCTION (min , int )
1694
+ DEFINE_REDUCTION (min , float )
1695
+ DEFINE_REDUCTION (min , float3 )
1696
+ DEFINE_REDUCTION (max , int )
1697
+ DEFINE_REDUCTION (max , float )
1698
+ DEFINE_REDUCTION (max , float3 )
1699
+
1700
+ DEFINE_BUFFER_NORMALIZATION (int )
1701
+ DEFINE_BUFFER_NORMALIZATION (float )
1702
+ DEFINE_BUFFER_NORMALIZATION (float3 )
0 commit comments