1
1
/***************************************************************************
2
- Copyright (c) 2024, The OpenBLAS Project
2
+ Copyright (c) 2024, 2025 The OpenBLAS Project
3
3
All rights reserved.
4
4
5
5
Redistribution and use in source and binary forms, with or without
@@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
56
56
BLASLONG ix ,iy ;
57
57
BLASLONG j ;
58
58
FLOAT * a_ptr ;
59
+ FLOAT * y_ptr ;
59
60
FLOAT temp ;
60
61
61
62
iy = 0 ;
62
63
63
64
if (inc_x == 1 ) {
64
- BLASLONG width = (n + 3 - 1 ) / 3 ;
65
+ BLASLONG width = n / 3 ;
66
+ BLASLONG sve_size = SV_COUNT ();
67
+ svbool_t pg_true = SV_TRUE ();
68
+ svbool_t pg = SV_WHILE (0 , m % sve_size );
65
69
66
70
FLOAT * a0_ptr = a + lda * width * 0 ;
67
71
FLOAT * a1_ptr = a + lda * width * 1 ;
@@ -72,67 +76,79 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
72
76
FLOAT * y2_ptr = y + inc_y * width * 2 ;
73
77
74
78
for (j = 0 ; j < width ; j ++ ) {
75
- svbool_t pg00 = ((j + width * 0 ) < n ) ? SV_TRUE () : svpfalse ();
76
- svbool_t pg01 = ((j + width * 1 ) < n ) ? SV_TRUE () : svpfalse ();
77
- svbool_t pg02 = ((j + width * 2 ) < n ) ? SV_TRUE () : svpfalse ();
78
-
79
79
SV_TYPE temp00_vec = SV_DUP (0.0 );
80
80
SV_TYPE temp01_vec = SV_DUP (0.0 );
81
81
SV_TYPE temp02_vec = SV_DUP (0.0 );
82
82
83
83
i = 0 ;
84
- BLASLONG sve_size = SV_COUNT ();
85
84
while ((i + sve_size * 1 - 1 ) < m ) {
86
- SV_TYPE x0_vec = svld1_vnum ( SV_TRUE () , x + i , 0 );
85
+ SV_TYPE x0_vec = svld1 ( pg_true , x + i );
87
86
88
- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
89
- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
90
- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
87
+ SV_TYPE a00_vec = svld1 ( pg_true , a0_ptr + i );
88
+ SV_TYPE a01_vec = svld1 ( pg_true , a1_ptr + i );
89
+ SV_TYPE a02_vec = svld1 ( pg_true , a2_ptr + i );
91
90
92
- temp00_vec = svmla_m ( pg00 , temp00_vec , a00_vec , x0_vec );
93
- temp01_vec = svmla_m ( pg01 , temp01_vec , a01_vec , x0_vec );
94
- temp02_vec = svmla_m ( pg02 , temp02_vec , a02_vec , x0_vec );
91
+ temp00_vec = svmla_x ( pg_true , temp00_vec , a00_vec , x0_vec );
92
+ temp01_vec = svmla_x ( pg_true , temp01_vec , a01_vec , x0_vec );
93
+ temp02_vec = svmla_x ( pg_true , temp02_vec , a02_vec , x0_vec );
95
94
96
95
i += sve_size * 1 ;
97
96
}
98
97
99
98
if (i < m ) {
100
- svbool_t pg0 = SV_WHILE (i + sve_size * 0 , m );
101
-
102
- pg00 = svand_z (SV_TRUE (), pg0 , pg00 );
103
- pg01 = svand_z (SV_TRUE (), pg0 , pg01 );
104
- pg02 = svand_z (SV_TRUE (), pg0 , pg02 );
99
+ SV_TYPE x0_vec = svld1 (pg , x + i );
105
100
106
- SV_TYPE x0_vec = svld1_vnum (pg0 , x + i , 0 );
101
+ SV_TYPE a00_vec = svld1 (pg , a0_ptr + i );
102
+ SV_TYPE a01_vec = svld1 (pg , a1_ptr + i );
103
+ SV_TYPE a02_vec = svld1 (pg , a2_ptr + i );
107
104
108
- SV_TYPE a00_vec = svld1_vnum (pg00 , a0_ptr + i , 0 );
109
- SV_TYPE a01_vec = svld1_vnum (pg01 , a1_ptr + i , 0 );
110
- SV_TYPE a02_vec = svld1_vnum (pg02 , a2_ptr + i , 0 );
111
-
112
- temp00_vec = svmla_m (pg00 , temp00_vec , a00_vec , x0_vec );
113
- temp01_vec = svmla_m (pg01 , temp01_vec , a01_vec , x0_vec );
114
- temp02_vec = svmla_m (pg02 , temp02_vec , a02_vec , x0_vec );
105
+ temp00_vec = svmla_m (pg , temp00_vec , a00_vec , x0_vec );
106
+ temp01_vec = svmla_m (pg , temp01_vec , a01_vec , x0_vec );
107
+ temp02_vec = svmla_m (pg , temp02_vec , a02_vec , x0_vec );
115
108
}
116
109
117
- if ((j + width * 0 ) < n ) {
118
- temp = svaddv (SV_TRUE (), temp00_vec );
119
- y0_ptr [iy ] += alpha * temp ;
120
- }
121
- if ((j + width * 1 ) < n ) {
122
- temp = svaddv (SV_TRUE (), temp01_vec );
123
- y1_ptr [iy ] += alpha * temp ;
124
- }
125
- if ((j + width * 2 ) < n ) {
126
- temp = svaddv (SV_TRUE (), temp02_vec );
127
- y2_ptr [iy ] += alpha * temp ;
128
- }
110
+ y0_ptr [iy ] += alpha * svaddv (pg_true , temp00_vec );
111
+ y1_ptr [iy ] += alpha * svaddv (pg_true , temp01_vec );
112
+ y2_ptr [iy ] += alpha * svaddv (pg_true , temp02_vec );
113
+
129
114
iy += inc_y ;
130
115
131
116
a0_ptr += lda ;
132
117
a1_ptr += lda ;
133
118
a2_ptr += lda ;
134
119
}
135
120
121
+ a_ptr = a2_ptr ;
122
+ y_ptr = y2_ptr ;
123
+ for (j = width * 3 ; j < n ; j ++ ) {
124
+ SV_TYPE temp_vec = SV_DUP (0.0 );
125
+
126
+ i = 0 ;
127
+ while ((i + sve_size * 1 - 1 ) < m ) {
128
+ SV_TYPE x_vec = svld1 (pg_true , x + i );
129
+
130
+ SV_TYPE a_vec = svld1 (pg_true , a_ptr + i );
131
+
132
+ temp_vec = svmla_x (pg_true , temp_vec , a_vec , x_vec );
133
+
134
+ i += sve_size * 1 ;
135
+ }
136
+
137
+ if (i < m ) {
138
+ SV_TYPE x_vec = svld1 (pg , x + i );
139
+
140
+ SV_TYPE a_vec = svld1 (pg , a_ptr + i );
141
+
142
+ temp_vec = svmla_m (pg , temp_vec , a_vec , x_vec );
143
+ }
144
+
145
+ y_ptr [iy ] += alpha * svaddv (pg_true , temp_vec );
146
+
147
+ iy += inc_y ;
148
+
149
+ a_ptr += lda ;
150
+ }
151
+
136
152
return (0 );
137
153
}
138
154
0 commit comments