Skip to content

Commit 36c2589

Browse files
authored
Merge pull request OpenMathLib#5355 from tetsuzo-usui/add_parallel_laed3
Improve [SD]SYEVD performance by parallelizing [SD]LAED3
2 parents a06bcf8 + 14107e3 commit 36c2589

File tree

12 files changed

+564
-11
lines changed

12 files changed

+564
-11
lines changed

cmake/lapack.cmake

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ set(SCLAUX
1111
la_constants.f90
1212
sbdsdc.f
1313
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f
14-
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f
14+
slaed0.f slaed1.f slaed2.f slaed4.f slaed5.f slaed6.f
1515
slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f
1616
slagts.f slamrg.f slanst.f
1717
slapy2.f slapy3.f slarnv.f
@@ -31,7 +31,7 @@ set(DZLAUX
3131
dbdsdc.f
3232
dbdsvdx.f dstevx.f dstein.f
3333
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
34-
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f
34+
dlaed0.f dlaed1.f dlaed2.f dlaed4.f dlaed5.f dlaed6.f
3535
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f
3636
dlagts.f dlamrg.f dlanst.f
3737
dlapy2.f dlapy3.f dlarnv.f
@@ -517,7 +517,7 @@ set(SCLAUX
517517
scombssq.c sbdsvdx.c sstevx.c sstein.c
518518
sbdsdc.c
519519
sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c
520-
slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c
520+
slaed0.c slaed1.c slaed2.c slaed4.c slaed5.c slaed6.c
521521
slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c
522522
slagts.c slamrg.c slanst.c
523523
slapy2.c slapy3.c slarnv.c
@@ -536,7 +536,7 @@ set(DZLAUX
536536
dbdsdc.c
537537
dbdsvdx.c dstevx.c dstein.c
538538
dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c
539-
dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c
539+
dlaed0.c dlaed1.c dlaed2.c dlaed4.c dlaed5.c dlaed6.c
540540
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c
541541
dlagts.c dlamrg.c dlanst.c
542542
dlapy2.c dlapy3.c dlarnv.c

common_lapack.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,4 +439,9 @@ blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdo
439439
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
440440
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
441441

442+
blasint slaed3_single(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *);
443+
blasint dlaed3_single(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *);
444+
blasint slaed3_parallel(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *);
445+
blasint dlaed3_parallel(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *);
446+
442447
#endif

common_macro.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3035,6 +3035,8 @@ typedef struct {
30353035
#define NEG_TCOPY DNEG_TCOPY
30363036
#define LARF_L DLARF_L
30373037
#define LARF_R DLARF_R
3038+
#define LAED3_SINGLE dlaed3_single
3039+
#define LAED3_PARALLEL dlaed3_parallel
30383040
#else
30393041
#define GETF2 SGETF2
30403042
#define GETRF SGETRF
@@ -3056,6 +3058,8 @@ typedef struct {
30563058
#define NEG_TCOPY SNEG_TCOPY
30573059
#define LARF_L SLARF_L
30583060
#define LARF_R SLARF_R
3061+
#define LAED3_SINGLE slaed3_single
3062+
#define LAED3_PARALLEL slaed3_parallel
30593063
#endif
30603064
#else
30613065
#ifdef XDOUBLE

interface/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ if (NOT NO_LAPACK)
221221
GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3)
222222
GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3)
223223
endif()
224+
GenerateNamedObjects("lapack/laed3.c" "" "" 0 "" "" 0 1)
224225
endif ()
225226

226227
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)

interface/Makefile

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,8 +429,8 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
429429
SLAPACKOBJS = \
430430
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
431431
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
432-
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
433-
432+
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) \
433+
slaed3.$(SUFFIX)
434434

435435
#DLAPACKOBJS = \
436436
# dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
@@ -440,8 +440,8 @@ SLAPACKOBJS = \
440440
DLAPACKOBJS = \
441441
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
442442
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
443-
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
444-
443+
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) \
444+
dlaed3.$(SUFFIX)
445445

446446
QLAPACKOBJS = \
447447
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
@@ -2365,6 +2365,11 @@ zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c
23652365
xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c
23662366
$(CC) -c $(CFLAGS) $< -o $(@F)
23672367

2368+
slaed3.$(SUFFIX) slaed3.$(PSUFFIX) : lapack/laed3.c
2369+
$(CC) -c $(CFLAGS) $< -o $(@F)
2370+
2371+
dlaed3.$(SUFFIX) dlaed3.$(PSUFFIX) : lapack/laed3.c
2372+
$(CC) -c $(CFLAGS) $< -o $(@F)
23682373

23692374
############# BLAS EXTENSIONS #####################################
23702375

interface/lapack/laed3.c

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/***************************************************************************
2+
Copyright (c) 2025, The OpenBLAS Project
3+
All rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions are
7+
met:
8+
9+
1. Redistributions of source code must retain the above copyright
10+
notice, this list of conditions and the following disclaimer.
11+
12+
2. Redistributions in binary form must reproduce the above copyright
13+
notice, this list of conditions and the following disclaimer in
14+
the documentation and/or other materials provided with the
15+
distribution.
16+
3. Neither the name of the OpenBLAS project nor the names of
17+
its contributors may be used to endorse or promote products
18+
derived from this software without specific prior written
19+
permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
*****************************************************************************/
32+
33+
#include <stdio.h>
34+
#include "common.h"
35+
36+
/* Routine name reported in argument-error messages: "SLAED3" by default,
   "DLAED3" when the file is compiled with DOUBLE defined. */
#ifndef DOUBLE
#define ERROR_NAME "SLAED3"
#else
#define ERROR_NAME "DLAED3"
#endif
41+
42+
/* ===================================================================== */
43+
/*
 * Interface entry point for LAED3 (the exported symbol is chosen by NAME).
 *
 * Checks k, n and ldq; on a bad value it calls xerbla with ERROR_NAME and the
 * 1-based argument position, stores the negated position in *Info and
 * returns.  Otherwise *Info is cleared and, unless k is zero, every argument
 * is forwarded unchanged to LAED3_SINGLE or LAED3_PARALLEL.
 *
 * Argument errors reported:
 *   *Info = -1   k < 0
 *   *Info = -2   n < k
 *   *Info = -6   ldq < 1 or ldq < n
 *
 * The worker routine receives the same Info pointer and may overwrite it.
 * The function's own return value is always 0.
 */
int NAME(blasint *k, blasint *n, blasint *n1, FLOAT *d,
         FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda,
         FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w,
         FLOAT *s, blasint *Info)
{
  blasint num_k    = *k;
  blasint order    = *n;
  blasint lead_dim = *ldq;
  blasint bad_arg  = 0;

  /* Validate the inputs; the first failing check wins. */
  if (num_k < 0)
    bad_arg = 1;
  else if (order < num_k)
    bad_arg = 2;
  else if (lead_dim < 1 || lead_dim < order)
    bad_arg = 6;

  if (bad_arg != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &bad_arg, sizeof(ERROR_NAME) - 1);
    *Info = - bad_arg;
    return 0;
  }

  /* Nothing to do when k is zero. */
  *Info = 0;
  if (num_k == 0) return 0;

#ifdef SMP
  /* Threaded build: take the parallel path unless exactly one thread is
     reported as available. */
  if (num_cpu_avail(4) != 1) {
    LAED3_PARALLEL(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info);
    return 0;
  }
#endif

  LAED3_SINGLE(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info);

  return 0;
}

lapack-netlib/SRC/Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \
8585
../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o
8686

8787
ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" ""
88-
SCLAUX = la_constants.o \
88+
SCLAUX_O = la_constants.o \
8989
sbdsvdx.o sstevx.o sstein.o \
9090
sbdsdc.o \
9191
sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \
@@ -106,7 +106,7 @@ SCLAUX = la_constants.o \
106106
endif
107107

108108
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
109-
DZLAUX = la_constants.o\
109+
DZLAUX_O = la_constants.o\
110110
dcombssq.o \
111111
dbdsvdx.o dstevx.o dstein.o \
112112
dbdsdc.o \
@@ -572,6 +572,8 @@ endif
572572

573573
# filter out optimized codes from OpenBLAS
574574
ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o
575+
SCL_AUX_OBJS = slaed3.o
576+
DZL_AUX_OBJS = dlaed3.o
575577

576578
SLAPACKOBJS = \
577579
sgetrf.o sgetrs.o spotrf.o sgetf2.o \
@@ -598,6 +600,8 @@ ZLAPACKOBJS = \
598600
zsymv.o zsyr.o zspmv.o zspr.o
599601

600602
ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O))
603+
SCLAUX = $(filter-out $(SCL_AUX_OBJS),$(SCLAUX_O))
604+
DZLAUX = $(filter-out $(DZL_AUX_OBJS),$(DZLAUX_O))
601605
SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O))
602606
DLASRC = $(filter-out $(DLAPACKOBJS),$(DLASRC_O))
603607
CLASRC = $(filter-out $(CLAPACKOBJS),$(CLASRC_O))

lapack/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ set(LAPACK_SOURCES
1010
potrf/potrf_L_single.c
1111
lauum/lauum_U_single.c
1212
lauum/lauum_L_single.c
13+
laed3/laed3_single.c
1314
)
1415

1516
# add a 'z' to filename for complex version
@@ -79,6 +80,7 @@ if (USE_THREAD)
7980
lauum/lauum_L_parallel.c
8081
potrf/potrf_U_parallel.c
8182
potrf/potrf_L_parallel.c
83+
laed3/laed3_parallel.c
8284
)
8385

8486
# this has a z version

lapack/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ TOPDIR = ..
22
include ../Makefile.system
33

44
#SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs
5-
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs
5+
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs laed3
66

77
FLAMEDIRS = laswp getf2 potf2 lauu2 trti2
88

lapack/laed3/Makefile

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Build rules for the LAED3 objects: one single-threaded object per precision,
# plus a threaded object per precision when SMP is defined.
TOPDIR = ../..
include ../../Makefile.system

# Variants compiled for each precision.
laed3_kinds = single
ifdef SMP
laed3_kinds += parallel
endif

SBLASOBJS = $(laed3_kinds:%=slaed3_%.$(SUFFIX))
DBLASOBJS = $(laed3_kinds:%=dlaed3_%.$(SUFFIX))

# Drop the single-precision objects only when neither BUILD_SINGLE nor
# BUILD_DOUBLE is set; drop the double-precision objects unless BUILD_DOUBLE
# is exactly 1.
ifeq ($(or $(BUILD_SINGLE),$(BUILD_DOUBLE)),)
SBLASOBJS =
endif
ifneq ($(BUILD_DOUBLE),1)
DBLASOBJS =
endif

# Regular objects (CFLAGS). Each static pattern rule maps
# <p>laed3_<kind>.<suffix> to laed3_<kind>.c; precision is selected with
# -UDOUBLE (single) or -DDOUBLE (double).
slaed3_single.$(SUFFIX) slaed3_parallel.$(SUFFIX) : slaed3_%.$(SUFFIX) : laed3_%.c
	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F)

dlaed3_single.$(SUFFIX) dlaed3_parallel.$(SUFFIX) : dlaed3_%.$(SUFFIX) : laed3_%.c
	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F)

# Same sources compiled with PFLAGS for the $(PSUFFIX) objects.
slaed3_single.$(PSUFFIX) slaed3_parallel.$(PSUFFIX) : slaed3_%.$(PSUFFIX) : laed3_%.c
	$(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F)

dlaed3_single.$(PSUFFIX) dlaed3_parallel.$(PSUFFIX) : dlaed3_%.$(PSUFFIX) : laed3_%.c
	$(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F)

include ../../Makefile.tail

0 commit comments

Comments
 (0)