Skip to content

Commit 6ca9ffa

Browse files
authored
Merge pull request #4655 from yamazakimitsufumi/update_2d_thread_distribution
Expanding the scope of 2D thread distribution to improve multi-threaded DGEMM performance
2 parents b45a78c + 51ab190 commit 6ca9ffa

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

driver/level3/level3_thread.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
826826
if (nthreads_m * nthreads_n > args -> nthreads) {
827827
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
828828
}
829+
/* nthreads_m and nthreads_n are adjusted so that the submatrix */
830+
/* to be handled by each thread preferably becomes a square matrix */
831+
/* by minimizing an objective function 'n * nthreads_m + m * nthreads_n'. */
832+
/* The objective function comes from the sum of the partition sizes in m and n: */
833+
/* (n / nthreads_n) + (m / nthreads_m) */
834+
/* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */
835+
while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) {
836+
nthreads_m /= 2;
837+
nthreads_n *= 2;
838+
}
829839
}
830840

831841
/* Execute serial or parallel computation */

0 commit comments

Comments
 (0)