@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

__asm__ __volatile__
(
- "vbroadcastss (%2 ), %%xmm12 \n\t" // x0
- "vbroadcastss 4(%2 ), %%xmm13 \n\t" // x1
- "vbroadcastss 8(%2 ), %%xmm14 \n\t" // x2
- "vbroadcastss 12(%2 ), %%xmm15 \n\t" // x3
- "vbroadcastss 16(%2 ), %%xmm0 \n\t" // x4
- "vbroadcastss 20(%2 ), %%xmm1 \n\t" // x5
- "vbroadcastss 24(%2 ), %%xmm2 \n\t" // x6
- "vbroadcastss 28(%2 ), %%xmm3 \n\t" // x7
+ "vbroadcastss (%3 ), %%xmm12 \n\t" // x0
+ "vbroadcastss 4(%3 ), %%xmm13 \n\t" // x1
+ "vbroadcastss 8(%3 ), %%xmm14 \n\t" // x2
+ "vbroadcastss 12(%3 ), %%xmm15 \n\t" // x3
+ "vbroadcastss 16(%3 ), %%xmm0 \n\t" // x4
+ "vbroadcastss 20(%3 ), %%xmm1 \n\t" // x5
+ "vbroadcastss 24(%3 ), %%xmm2 \n\t" // x6
+ "vbroadcastss 28(%3 ), %%xmm3 \n\t" // x7

"vbroadcastss (%9), %%xmm8 \n\t" // alpha

@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"

- "vfmaddps %%xmm4, (%4 ,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%5 ,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6 ,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%7 ,%0,4), %%xmm15, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5 ,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%6 ,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7 ,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%8 ,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"

- "vfmaddps %%xmm4, (%4,%8 ,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%5,%8 ,4), %%xmm1 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%8 ,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%7,%8 ,4), %%xmm3 , %%xmm5 \n\t"
- "addq $4 , %8 \n\t"
+ "vfmaddps %%xmm4, (%5,%2 ,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%6,%2 ,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%2 ,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%8,%2 ,4), %%xmm3 , %%xmm5 \n\t"
+ "addq $4 , %2 \n\t"

"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
- "vfmaddps -16(%3 ,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
+ "vfmaddps -16(%4 ,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
- "vmovups %%xmm6, -16(%3 ,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm6, -16(%4 ,%0,4) \n\t" // 4 * y

"2: \n\t"

@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"

- "vfmaddps %%xmm4, (%4 ,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4 ,%0,4), %%xmm12, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%5 ,%0,4), %%xmm13, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5 ,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6 ,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6 ,%0,4), %%xmm14, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%7 ,%0,4), %%xmm15, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7 ,%0,4), %%xmm15, %%xmm5 \n\t"
-
- "vfmaddps %%xmm4, (%4,%8 ,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%8 ,4), %%xmm0 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%5,%8 ,4), %%xmm1 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%8 ,4), %%xmm1 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%8 ,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%8 ,4), %%xmm2 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%7,%8 ,4), %%xmm3 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%8 ,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5 ,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5 ,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6 ,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6 ,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7 ,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7 ,%0,4), %%xmm14, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%8 ,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8 ,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%5,%2 ,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%2 ,4), %%xmm0 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%2 ,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%2 ,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%2 ,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%2 ,4), %%xmm2 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%8,%2 ,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%2 ,4), %%xmm3 , %%xmm5 \n\t"

- "vfmaddps (%3 ,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
- "vfmaddps 16(%3 ,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
- "vmovups %%xmm4, (%3 ,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5, 16(%3 ,%0,4) \n\t" // 4 * y
+ "vfmaddps (%4 ,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%4 ,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vmovups %%xmm4, (%4 ,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, 16(%4 ,%0,4) \n\t" // 4 * y

"addq $8 , %0 \n\t"
- "addq $8 , %8 \n\t"
+ "addq $8 , %2 \n\t"

"subq $8 , %1 \n\t"

@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"

- "prefetcht0 192(%4,%0,4) \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12 , %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13 , %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+ "prefetcht0 192(%8,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
- "vfmaddps %%xmm5, 16(%7 ,%0,4), %%xmm15, %%xmm5 \n\t"
-
- "vfmaddps %%xmm6, 32(%4 ,%0,4), %%xmm12, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4 ,%0,4), %%xmm12, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%5 ,%0,4), %%xmm13, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5 ,%0,4), %%xmm13, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%6 ,%0,4), %%xmm14, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6 ,%0,4), %%xmm14, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%7 ,%0,4), %%xmm15, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7 ,%0,4), %%xmm15, %%xmm7 \n\t"
-
- "prefetcht0 192(%4,%8 ,4) \n\t"
- "vfmaddps %%xmm4, (%4,%8 ,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%8 ,4), %%xmm0 , %%xmm5 \n\t"
- "prefetcht0 192(%5,%8 ,4) \n\t"
- "vfmaddps %%xmm4, (%5,%8 ,4), %%xmm1 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%8 ,4), %%xmm1 , %%xmm5 \n\t"
- "prefetcht0 192(%6,%8 ,4) \n\t"
- "vfmaddps %%xmm4, (%6,%8 ,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%8 ,4), %%xmm2 , %%xmm5 \n\t"
- "prefetcht0 192(%7,%8 ,4) \n\t"
- "vfmaddps %%xmm4, (%7,%8 ,4), %%xmm3 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%8 ,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmaddps %%xmm5, 16(%8 ,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm6, 32(%5 ,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5 ,%0,4), %%xmm12, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6 ,%0,4), %%xmm13, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6 ,%0,4), %%xmm13, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7 ,%0,4), %%xmm14, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7 ,%0,4), %%xmm14, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%8 ,%0,4), %%xmm15, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%8 ,%0,4), %%xmm15, %%xmm7 \n\t"
+
+ "prefetcht0 192(%5,%2 ,4) \n\t"
+ "vfmaddps %%xmm4, (%5,%2 ,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%2 ,4), %%xmm0 , %%xmm5 \n\t"
+ "prefetcht0 192(%6,%2 ,4) \n\t"
+ "vfmaddps %%xmm4, (%6,%2 ,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%2 ,4), %%xmm1 , %%xmm5 \n\t"
+ "prefetcht0 192(%7,%2 ,4) \n\t"
+ "vfmaddps %%xmm4, (%7,%2 ,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%2 ,4), %%xmm2 , %%xmm5 \n\t"
+ "prefetcht0 192(%8,%2 ,4) \n\t"
+ "vfmaddps %%xmm4, (%8,%2 ,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%2 ,4), %%xmm3 , %%xmm5 \n\t"

- "vfmaddps %%xmm6, 32(%4,%8 ,4), %%xmm0 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4,%8 ,4), %%xmm0 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%5,%8 ,4), %%xmm1 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5,%8 ,4), %%xmm1 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%6,%8 ,4), %%xmm2 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6,%8 ,4), %%xmm2 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%7,%8 ,4), %%xmm3 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%8 ,4), %%xmm3 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%5,%2 ,4), %%xmm0 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%2 ,4), %%xmm0 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%2 ,4), %%xmm1 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%2 ,4), %%xmm1 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%2 ,4), %%xmm2 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%2 ,4), %%xmm2 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%8,%2 ,4), %%xmm3 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%8,%2 ,4), %%xmm3 , %%xmm7 \n\t"

- "vfmaddps (%3 ,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
- "vfmaddps 16(%3 ,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
- "vfmaddps 32(%3 ,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
- "vfmaddps 48(%3 ,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
+ "vfmaddps (%4 ,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%4 ,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vfmaddps 32(%4 ,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
+ "vfmaddps 48(%4 ,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"

"addq $16, %0 \n\t"
- "vmovups %%xmm4,-64(%3 ,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5,-48(%3 ,%0,4) \n\t" // 4 * y
- "addq $16, %8 \n\t"
- "vmovups %%xmm6,-32(%3 ,%0,4) \n\t" // 4 * y
- "vmovups %%xmm7,-16(%3 ,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm4,-64(%4 ,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5,-48(%4 ,%0,4) \n\t" // 4 * y
+ "addq $16, %2 \n\t"
+ "vmovups %%xmm6,-32(%4 ,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm7,-16(%4 ,%0,4) \n\t" // 4 * y

"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

:
"+r" (i ), // 0
- "+r" (n ) // 1
+ "+r" (n ), // 1
+ "+r" (lda4 ) // 2
:
- "r" (x ), // 2
- "r" (y ), // 3
- "r" (ap [0 ]), // 4
- "r" (ap [1 ]), // 5
- "r" (ap [2 ]), // 6
- "r" (ap [3 ]), // 7
- "r" (lda4 ), // 8
+ "r" (x ), // 3
+ "r" (y ), // 4
+ "r" (ap [0 ]), // 5
+ "r" (ap [1 ]), // 6
+ "r" (ap [2 ]), // 7
+ "r" (ap [3 ]), // 8
"r" (alpha ) // 9
: "cc" ,
"%xmm0" , "%xmm1" ,
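The operand renumbering above follows from the last hunk: the loop increments lda4 inside the asm ("addq $4 , %2" and friends), so lda4 has to be declared as a read-write output operand ("+r") rather than an input-only "r", and the remaining inputs shift from %2-%8 to %3-%8. What follows is a minimal, hypothetical sketch of that constraint pattern, not code from this commit; the function, values, and comments are illustrative only.

#include <stdio.h>

/* Hypothetical example (not from OpenBLAS): sums four floats while the asm
 * itself advances the pointer and decrements the counter, so both of those,
 * like lda4 in the kernel above, are declared "+r" (read-write). */
static float sum4(const float *p)
{
	float s = 0.0f;
	long  n = 4;

	__asm__ __volatile__
	(
	"1:                     \n\t"
	"addss   (%1), %0       \n\t"   /* s += *p                 */
	"addq    $4  , %1       \n\t"   /* p advances inside asm   */
	"subq    $1  , %2       \n\t"   /* n-- inside asm          */
	"jnz     1b             \n\t"
	: "+x" (s), "+r" (p), "+r" (n)  /* all read-write operands */
	:
	: "cc", "memory"
	);
	return s;
}

int main(void)
{
	float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
	printf("%f\n", sum4(v));        /* prints 10.000000 */
	return 0;
}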