@@ -143,22 +143,26 @@ TEST_F(Sum4D, CodeOuterBand) {
143
143
EXPECT_GT (posSync4, posC);
144
144
}
145
145
146
- TEST_F (Sum4D, CodeBeforeThreadMapping) {
147
- auto declarations = {" __shared__ float32 _A_0[16][16][16][1];" ,
148
- " __shared__ float32 _B_0[16][16][16][1];" ,
149
- " __shared__ float32 _C_0[16][16][16][1];" };
146
+ /*
147
+ * Check code when promotion is performed above the mapping to threads.
148
+ * Note that the copying code is not mapped to threads because
149
+ * promoteEverythingAt does not call mapCopiesToThreads.
150
+ */
151
+ TEST_F (Sum4D, CodeAboveThreadMapping) {
152
+ auto declarations = {" __shared__ float32 _A_0[16][16][16][16];" ,
153
+ " __shared__ float32 _B_0[16][16][16][16];" ,
154
+ " __shared__ float32 _C_0[16][16][16][16];" };
150
155
auto copyA =
151
- " _A_0[c4][c5][c6][0 ] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3]; " ;
156
+ " _A_0[c4][c5][c6][c7 ] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7] " ;
152
157
auto copyB =
153
- " _B_0[c4][c5][c6][0 ] = B[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3]; " ;
158
+ " _B_0[c4][c5][c6][c7 ] = B[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7] " ;
154
159
auto compute =
155
- " _C_0[c4][c5][c6][0 ] = (_A_0[c4][c5][c6][0 ] + _B_0[c4][c5][c6][0 ]);" ;
160
+ " _C_0[c4][c5][c6][t0 ] = (_A_0[c4][c5][c6][t0 ] + _B_0[c4][c5][c6][t0 ]);" ;
156
161
auto copyC =
157
- " C[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3 ] = _C_0[c4][c5][c6][0 ];" ;
162
+ " C[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7 ] = _C_0[c4][c5][c6][c7 ];" ;
158
163
auto sync = " __syncthreads()" ;
159
164
160
- auto code =
161
- emitCode ({256 , 128 , 192 , 224 }, {16 , 16 , 16 , 16 }, {0 , 0 , 0 , 0 , 0 , 0 });
165
+ auto code = emitCode ({256 , 128 , 192 , 224 }, {16 , 16 , 16 , 16 }, {0 , 0 , 0 , 0 });
162
166
163
167
// Order of copies may be arbitrary, but syncs must be inserted before and
164
168
// after
0 commit comments