@@ -52,17 +52,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
52
52
BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y ,
53
53
FLOAT * buffer )
54
54
{
55
- BLASLONG i ;
56
- BLASLONG ix , iy ;
57
- BLASLONG j ;
58
- FLOAT * a_ptr ;
55
+ BLASLONG i , j ;
56
+ BLASLONG ix = 0 ;
57
+ BLASLONG iy ;
58
+ FLOAT * a_ptr = a ;
59
59
FLOAT temp ;
60
60
61
- ix = 0 ;
62
- a_ptr = a ;
63
-
64
61
if (inc_y == 1 ) {
65
- BLASLONG width = (n + 3 - 1 ) / 3 ;
62
+ BLASLONG width = n / 3 ; // Only process full 3-column blocks
63
+ BLASLONG sve_size = SV_COUNT ();
64
+ svbool_t pg_full = SV_TRUE ();
66
65
67
66
FLOAT * a0_ptr = a_ptr + lda * width * 0 ;
68
67
FLOAT * a1_ptr = a_ptr + lda * width * 1 ;
@@ -73,57 +72,79 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
73
72
FLOAT * x2_ptr = x + inc_x * width * 2 ;
74
73
75
74
for (j = 0 ; j < width ; j ++ ) {
76
- svbool_t pg00 = (( j + width * 0 ) < n ) ? SV_TRUE () : svpfalse ( );
77
- svbool_t pg01 = (( j + width * 1 ) < n ) ? SV_TRUE () : svpfalse ( );
78
- svbool_t pg02 = (( j + width * 2 ) < n ) ? SV_TRUE () : svpfalse ( );
75
+ SV_TYPE temp0_vec = SV_DUP ( alpha * x0_ptr [ ix ] );
76
+ SV_TYPE temp1_vec = SV_DUP ( alpha * x1_ptr [ ix ] );
77
+ SV_TYPE temp2_vec = SV_DUP ( alpha * x2_ptr [ ix ] );
79
78
80
- SV_TYPE temp0_vec = ((j + width * 0 ) < n ) ? SV_DUP (alpha * x0_ptr [ix ]) : SV_DUP (0.0 );
81
- SV_TYPE temp1_vec = ((j + width * 1 ) < n ) ? SV_DUP (alpha * x1_ptr [ix ]) : SV_DUP (0.0 );
82
- SV_TYPE temp2_vec = ((j + width * 2 ) < n ) ? SV_DUP (alpha * x2_ptr [ix ]) : SV_DUP (0.0 );
83
79
i = 0 ;
84
- BLASLONG sve_size = SV_COUNT ();
85
- while ((i + sve_size * 1 - 1 ) < m ) {
86
- SV_TYPE y0_vec = svld1_vnum (SV_TRUE (), y + i , 0 );
80
+ while ((i + sve_size - 1 ) < m ) {
81
+ SV_TYPE y0_vec = svld1 (pg_full , y + i );
87
82
88
- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
89
- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
90
- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
83
+ SV_TYPE a00_vec = svld1 ( pg_full , a0_ptr + i );
84
+ SV_TYPE a01_vec = svld1 ( pg_full , a1_ptr + i );
85
+ SV_TYPE a02_vec = svld1 ( pg_full , a2_ptr + i );
91
86
92
- y0_vec = svmla_m ( pg00 , y0_vec , temp0_vec , a00_vec );
93
- y0_vec = svmla_m ( pg01 , y0_vec , temp1_vec , a01_vec );
94
- y0_vec = svmla_m ( pg02 , y0_vec , temp2_vec , a02_vec );
87
+ y0_vec = svmla_x ( pg_full , y0_vec , temp0_vec , a00_vec );
88
+ y0_vec = svmla_x ( pg_full , y0_vec , temp1_vec , a01_vec );
89
+ y0_vec = svmla_x ( pg_full , y0_vec , temp2_vec , a02_vec );
95
90
96
- svst1_vnum ( SV_TRUE () , y + i , 0 , y0_vec );
97
- i += sve_size * 1 ;
91
+ svst1 ( pg_full , y + i , y0_vec );
92
+ i += sve_size ;
98
93
}
99
94
100
95
if (i < m ) {
101
- svbool_t pg0 = SV_WHILE (i + sve_size * 0 , m );
102
-
103
- pg00 = svand_z (SV_TRUE (), pg0 , pg00 );
104
- pg01 = svand_z (SV_TRUE (), pg0 , pg01 );
105
- pg02 = svand_z (SV_TRUE (), pg0 , pg02 );
96
+ svbool_t pg_tail = SV_WHILE (i , m );
106
97
107
- SV_TYPE y0_vec = svld1_vnum ( pg0 , y + i , 0 );
98
+ SV_TYPE y0_vec = svld1 ( pg_tail , y + i );
108
99
109
- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
110
- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
111
- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
100
+ SV_TYPE a00_vec = svld1 ( pg_tail , a0_ptr + i );
101
+ SV_TYPE a01_vec = svld1 ( pg_tail , a1_ptr + i );
102
+ SV_TYPE a02_vec = svld1 ( pg_tail , a2_ptr + i );
112
103
113
- y0_vec = svmla_m (pg00 , y0_vec , temp0_vec , a00_vec );
114
- y0_vec = svmla_m (pg01 , y0_vec , temp1_vec , a01_vec );
115
- y0_vec = svmla_m (pg02 , y0_vec , temp2_vec , a02_vec );
104
+ y0_vec = svmla_m (pg_tail , y0_vec , temp0_vec , a00_vec );
105
+ y0_vec = svmla_m (pg_tail , y0_vec , temp1_vec , a01_vec );
106
+ y0_vec = svmla_m (pg_tail , y0_vec , temp2_vec , a02_vec );
116
107
117
- svst1_vnum ( pg0 , y + i , 0 , y0_vec );
108
+ svst1 ( pg_tail , y + i , y0_vec );
118
109
}
119
110
a0_ptr += lda ;
120
111
a1_ptr += lda ;
121
112
a2_ptr += lda ;
122
113
ix += inc_x ;
123
114
}
115
+ // Handle remaining n % 3 columns
116
+ for (j = width * 3 ; j < n ; j ++ ) {
117
+ FLOAT * a_col = a + j * lda ;
118
+ temp = alpha * x [j * inc_x ];
119
+ SV_TYPE temp_vec = SV_DUP (temp );
120
+
121
+ i = 0 ;
122
+ while ((i + sve_size - 1 ) < m ) {
123
+ SV_TYPE y_vec = svld1 (pg_full , y + i );
124
+
125
+ SV_TYPE a_vec = svld1 (pg_full , a_col + i );
126
+
127
+ y_vec = svmla_x (pg_full , y_vec , temp_vec , a_vec );
128
+
129
+ svst1 (pg_full , y + i , y_vec );
130
+ i += sve_size ;
131
+ }
132
+ if (i < m ) {
133
+ svbool_t pg_tail = SV_WHILE (i , m );
134
+
135
+ SV_TYPE y_vec = svld1 (pg_tail , y + i );
136
+
137
+ SV_TYPE a_vec = svld1 (pg_tail , a_col + i );
138
+
139
+ y_vec = svmla_m (pg_tail , y_vec , temp_vec , a_vec );
140
+
141
+ svst1 (pg_tail , y + i , y_vec );
142
+ }
143
+ }
124
144
return (0 );
125
145
}
126
146
147
+ // Fallback scalar loop
127
148
for (j = 0 ; j < n ; j ++ ) {
128
149
temp = alpha * x [ix ];
129
150
iy = 0 ;
0 commit comments