@@ -68,9 +68,9 @@ struct ArithmeticConfiguration
68
68
// Quotient of the level-1 input count by the number of items each level-1 invocation handles.
NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1;
69
69
70
70
// Extra elements folded into the level-1 channel stride: SubgroupSize-1 when LevelCount==3, otherwise 0.
// NOTE(review): assumes conditional_value<Cond,T,A,B>::value yields A when Cond holds and B otherwise -
// the helper is defined outside this view, confirm.
NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value<LevelCount==3 ,uint16_t,SubgroupSize-1 ,0 >::value;
71
- NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value<LevelCount==3 ,uint16_t,VirtualInvocationsAtLevel1+__padding ,SubgroupSize>::value;
71
// Level-1 channel stride: VirtualInvocationsAtLevel1 when LevelCount==3, otherwise SubgroupSize,
// with __padding added after the selection. Under the conditional_value assumption above,
// __padding is 0 unless LevelCount==3, so the addition only changes the 3-level case.
+ NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value<LevelCount==3 ,uint16_t,VirtualInvocationsAtLevel1,SubgroupSize>::value + __padding ;
72
72
// Level-2 channel stride: SubgroupSize when LevelCount==3, otherwise 0 (same conditional_value assumption).
NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value<LevelCount==3 ,uint16_t,SubgroupSize,0 >::value;
73
- using ChannelStride = tuple<integral_constant<uint16_t,__channelStride_1>,integral_constant<uint16_t,__channelStride_2> >;
73
// Compile-time list of strides, read elsewhere in this struct as tuple_element<level,ChannelStride>::type::value.
// Element 0 carries __padding; presumably it is only a filler so that elements 1 and 2 line up with
// levels 1 and 2 - confirm that no caller instantiates the lookup with level 0.
+ using ChannelStride = tuple<integral_constant<uint16_t,__padding>,integral_constant<uint16_t, __channelStride_1>,integral_constant<uint16_t,__channelStride_2> >; // we don't use stride 0
74
74
75
75
// user specified the shared mem size of Scalars
76
76
NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value<LevelCount==1 ,uint16_t,
@@ -101,17 +101,17 @@ struct ArithmeticConfiguration
101
101
{
102
102
const uint16_t ItemsPerNextInvocation = tuple_element<level,ItemsPerInvocation>::type::value;
103
103
const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t (1u));
104
- const uint16_t outInvocation = virtualSubgroupID/ ItemsPerNextInvocation;
104
+ const uint16_t outInvocation = virtualSubgroupID / ItemsPerNextInvocation;
105
105
const uint16_t localOffset = outChannel * tuple_element<level,ChannelStride>::type::value + outInvocation;
106
106
107
107
if (level==2 )
108
108
{
109
- const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize- uint16_t (1u)) * ItemsPerNextInvocation ;
109
+ const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t (1u)) * ItemsPerInvocation_1 ;
110
110
return baseOffset + localOffset;
111
111
}
112
112
else
113
113
{
114
- const uint16_t paddingOffset = virtualSubgroupID/ (SubgroupSize* ItemsPerInvocation_1);
114
+ const uint16_t paddingOffset = virtualSubgroupID / (SubgroupSize * ItemsPerInvocation_1);
115
115
return localOffset + paddingOffset;
116
116
}
117
117
}
@@ -128,11 +128,11 @@ struct ArithmeticConfiguration
128
128
static uint16_t sharedLoadIndex (const uint16_t invocationIndex, const uint16_t component)
129
129
{
130
130
const uint16_t localOffset = component * tuple_element<level,ChannelStride>::type::value + invocationIndex;
131
- const uint16_t paddingOffset = invocationIndex/ SubgroupSize;
131
+ const uint16_t paddingOffset = invocationIndex / SubgroupSize;
132
132
133
133
if (level==2 )
134
134
{
135
- const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize- uint16_t (1u)) * ItemsPerInvocation_1;
135
+ const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t (1u)) * ItemsPerInvocation_1;
136
136
return baseOffset + localOffset + paddingOffset;
137
137
}
138
138
else
0 commit comments