@@ -52,17 +52,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
52
52
BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y ,
53
53
FLOAT * buffer )
54
54
{
55
- BLASLONG i ;
56
- BLASLONG ix , iy ;
57
- BLASLONG j ;
58
- FLOAT * a_ptr ;
55
+ BLASLONG i , j ;
56
+ BLASLONG ix = 0 ;
57
+ BLASLONG iy ;
58
+ FLOAT * a_ptr = a ;
59
59
FLOAT temp ;
60
60
61
- ix = 0 ;
62
- a_ptr = a ;
63
-
64
61
if (inc_y == 1 ) {
65
- BLASLONG width = (n + 3 - 1 ) / 3 ;
62
+ BLASLONG width = n / 3 ; // Only process full 3-column blocks
63
+ BLASLONG sve_size = SV_COUNT ();
64
+ svbool_t pg_full = SV_TRUE ();
65
+ svbool_t pg_tail = SV_WHILE (0 , m % sve_size );
66
66
67
67
FLOAT * a0_ptr = a_ptr + lda * width * 0 ;
68
68
FLOAT * a1_ptr = a_ptr + lda * width * 1 ;
@@ -73,57 +73,75 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
73
73
FLOAT * x2_ptr = x + inc_x * width * 2 ;
74
74
75
75
for (j = 0 ; j < width ; j ++ ) {
76
- svbool_t pg00 = (( j + width * 0 ) < n ) ? SV_TRUE () : svpfalse ( );
77
- svbool_t pg01 = (( j + width * 1 ) < n ) ? SV_TRUE () : svpfalse ( );
78
- svbool_t pg02 = (( j + width * 2 ) < n ) ? SV_TRUE () : svpfalse ( );
76
+ SV_TYPE temp0_vec = SV_DUP ( alpha * x0_ptr [ ix ] );
77
+ SV_TYPE temp1_vec = SV_DUP ( alpha * x1_ptr [ ix ] );
78
+ SV_TYPE temp2_vec = SV_DUP ( alpha * x2_ptr [ ix ] );
79
79
80
- SV_TYPE temp0_vec = ((j + width * 0 ) < n ) ? SV_DUP (alpha * x0_ptr [ix ]) : SV_DUP (0.0 );
81
- SV_TYPE temp1_vec = ((j + width * 1 ) < n ) ? SV_DUP (alpha * x1_ptr [ix ]) : SV_DUP (0.0 );
82
- SV_TYPE temp2_vec = ((j + width * 2 ) < n ) ? SV_DUP (alpha * x2_ptr [ix ]) : SV_DUP (0.0 );
83
80
i = 0 ;
84
- BLASLONG sve_size = SV_COUNT ();
85
- while ((i + sve_size * 1 - 1 ) < m ) {
86
- SV_TYPE y0_vec = svld1_vnum (SV_TRUE (), y + i , 0 );
81
+ while ((i + sve_size - 1 ) < m ) {
82
+ SV_TYPE y0_vec = svld1 (pg_full , y + i );
87
83
88
- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
89
- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
90
- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
84
+ SV_TYPE a00_vec = svld1 ( pg_full , a0_ptr + i );
85
+ SV_TYPE a01_vec = svld1 ( pg_full , a1_ptr + i );
86
+ SV_TYPE a02_vec = svld1 ( pg_full , a2_ptr + i );
91
87
92
- y0_vec = svmla_m ( pg00 , y0_vec , temp0_vec , a00_vec );
93
- y0_vec = svmla_m ( pg01 , y0_vec , temp1_vec , a01_vec );
94
- y0_vec = svmla_m ( pg02 , y0_vec , temp2_vec , a02_vec );
88
+ y0_vec = svmla_x ( pg_full , y0_vec , temp0_vec , a00_vec );
89
+ y0_vec = svmla_x ( pg_full , y0_vec , temp1_vec , a01_vec );
90
+ y0_vec = svmla_x ( pg_full , y0_vec , temp2_vec , a02_vec );
95
91
96
- svst1_vnum ( SV_TRUE () , y + i , 0 , y0_vec );
97
- i += sve_size * 1 ;
92
+ svst1 ( pg_full , y + i , y0_vec );
93
+ i += sve_size ;
98
94
}
99
95
100
96
if (i < m ) {
101
- svbool_t pg0 = SV_WHILE (i + sve_size * 0 , m );
102
-
103
- pg00 = svand_z (SV_TRUE (), pg0 , pg00 );
104
- pg01 = svand_z (SV_TRUE (), pg0 , pg01 );
105
- pg02 = svand_z (SV_TRUE (), pg0 , pg02 );
97
+ SV_TYPE y0_vec = svld1 (pg_tail , y + i );
106
98
107
- SV_TYPE y0_vec = svld1_vnum (pg0 , y + i , 0 );
99
+ SV_TYPE a00_vec = svld1 (pg_tail , a0_ptr + i );
100
+ SV_TYPE a01_vec = svld1 (pg_tail , a1_ptr + i );
101
+ SV_TYPE a02_vec = svld1 (pg_tail , a2_ptr + i );
108
102
109
- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
110
- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
111
- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
103
+ y0_vec = svmla_m ( pg_tail , y0_vec , temp0_vec , a00_vec );
104
+ y0_vec = svmla_m ( pg_tail , y0_vec , temp1_vec , a01_vec );
105
+ y0_vec = svmla_m ( pg_tail , y0_vec , temp2_vec , a02_vec );
112
106
113
- y0_vec = svmla_m (pg00 , y0_vec , temp0_vec , a00_vec );
114
- y0_vec = svmla_m (pg01 , y0_vec , temp1_vec , a01_vec );
115
- y0_vec = svmla_m (pg02 , y0_vec , temp2_vec , a02_vec );
116
-
117
- svst1_vnum (pg0 , y + i , 0 , y0_vec );
107
+ svst1 (pg_tail , y + i , y0_vec );
118
108
}
119
109
a0_ptr += lda ;
120
110
a1_ptr += lda ;
121
111
a2_ptr += lda ;
122
112
ix += inc_x ;
123
113
}
114
+ // Handle remaining n % 3 columns
115
+ for (j = width * 3 ; j < n ; j ++ ) {
116
+ FLOAT * a_col = a + j * lda ;
117
+ temp = alpha * x [j * inc_x ];
118
+ SV_TYPE temp_vec = SV_DUP (temp );
119
+
120
+ i = 0 ;
121
+ while ((i + sve_size - 1 ) < m ) {
122
+ SV_TYPE y_vec = svld1 (pg_full , y + i );
123
+
124
+ SV_TYPE a_vec = svld1 (pg_full , a_col + i );
125
+
126
+ y_vec = svmla_x (pg_full , y_vec , temp_vec , a_vec );
127
+
128
+ svst1 (pg_full , y + i , y_vec );
129
+ i += sve_size ;
130
+ }
131
+ if (i < m ) {
132
+ SV_TYPE y_vec = svld1 (pg_tail , y + i );
133
+
134
+ SV_TYPE a_vec = svld1 (pg_tail , a_col + i );
135
+
136
+ y_vec = svmla_m (pg_tail , y_vec , temp_vec , a_vec );
137
+
138
+ svst1 (pg_tail , y + i , y_vec );
139
+ }
140
+ }
124
141
return (0 );
125
142
}
126
143
144
+ // Fallback scalar loop
127
145
for (j = 0 ; j < n ; j ++ ) {
128
146
temp = alpha * x [ix ];
129
147
iy = 0 ;
0 commit comments