Skip to content

Commit a177448

Browse files
kmvijay authored and gopherbot committed
runtime: improve performance of s390x memclr
The memclr routine for the s390x architecture is now implemented with vector operations, and loop unrolling is used for larger sizes.

goos: linux
goarch: s390x
pkg: runtime
| old.txt | new_final.txt |
| sec/op | sec/op vs base |
Memclr/5 2.485n ± 5% 2.421n ± 0% -2.54% (p=0.000 n=10)
Memclr/16 3.037n ± 2% 2.969n ± 0% -2.26% (p=0.001 n=10)
Memclr/64 9.623n ± 0% 4.455n ± 1% -53.70% (p=0.000 n=10)
Memclr/256 3.347n ± 3% 3.312n ± 4% ~ (p=0.670 n=10)
Memclr/4096 15.53n ± 0% 15.54n ± 0% +0.06% (p=0.000 n=10)
Memclr/65536 329.8n ± 2% 228.4n ± 0% -30.74% (p=0.000 n=10)
Memclr/1M 13.09µ ± 0% 12.78µ ± 0% -2.34% (p=0.000 n=10)
Memclr/4M 52.33µ ± 0% 51.16µ ± 0% -2.24% (p=0.000 n=10)
Memclr/8M 104.6µ ± 0% 102.3µ ± 0% -2.20% (p=0.000 n=10)
Memclr/16M 209.4µ ± 0% 204.9µ ± 0% -2.17% (p=0.000 n=10)
Memclr/64M 977.8µ ± 0% 967.8µ ± 0% -1.02% (p=0.000 n=10)
MemclrUnaligned/0_5 3.398n ± 0% 3.657n ± 0% +7.62% (p=0.000 n=10)
MemclrUnaligned/0_16 3.957n ± 0% 3.958n ± 0% ~ (p=0.325 n=10)
MemclrUnaligned/0_64 11.550n ± 0% 5.139n ± 0% -55.51% (p=0.000 n=10)
MemclrUnaligned/0_256 4.288n ± 0% 4.025n ± 4% -6.14% (p=0.000 n=10)
MemclrUnaligned/0_4096 15.53n ± 0% 15.53n ± 0% ~ (p=1.000 n=10)
MemclrUnaligned/0_65536 318.3n ± 1% 233.9n ± 0% -26.52% (p=0.000 n=10)
MemclrUnaligned/1_5 3.398n ± 0% 3.657n ± 0% +7.62% (p=0.000 n=10)
MemclrUnaligned/1_16 3.965n ± 0% 3.969n ± 0% +0.10% (p=0.000 n=10)
MemclrUnaligned/1_64 11.550n ± 0% 5.109n ± 0% -55.76% (p=0.000 n=10)
MemclrUnaligned/1_256 4.385n ± 0% 4.174n ± 1% -4.80% (p=0.000 n=10)
MemclrUnaligned/1_4096 26.23n ± 0% 26.24n ± 0% +0.04% (p=0.005 n=10)
MemclrUnaligned/1_65536 570.5n ± 0% 401.3n ± 0% -29.66% (p=0.000 n=10)
MemclrUnaligned/4_5 3.398n ± 0% 3.657n ± 0% +7.62% (p=0.000 n=10)
MemclrUnaligned/4_16 3.965n ± 0% 3.973n ± 1% +0.19% (p=0.000 n=10)
MemclrUnaligned/4_64 11.550n ± 0% 5.131n ± 0% -55.58% (p=0.000 n=10)
MemclrUnaligned/4_256 4.419n ± 0% 4.187n ± 1% -5.25% (p=0.000 n=10)
MemclrUnaligned/4_4096 26.23n ± 0% 26.24n ± 0% +0.04% (p=0.011 n=10)
MemclrUnaligned/4_65536 570.5n ± 0% 401.2n ± 0% -29.67% (p=0.000 n=10)
MemclrUnaligned/7_5 3.397n ± 0% 3.657n ± 0% +7.65% (p=0.000 n=10)
MemclrUnaligned/7_16 3.965n ± 0% 3.969n ± 0% +0.10% (p=0.000 n=10)
MemclrUnaligned/7_64 11.550n ± 0% 5.120n ± 0% -55.67% (p=0.000 n=10)
MemclrUnaligned/7_256 4.407n ± 0% 4.188n ± 2% -4.99% (p=0.000 n=10)
MemclrUnaligned/7_4096 26.24n ± 0% 26.24n ± 0% ~ (p=1.000 n=10)
MemclrUnaligned/7_65536 570.8n ± 0% 401.3n ± 0% -29.69% (p=0.000 n=10)
MemclrUnaligned/0_1M 13.08µ ± 0% 12.81µ ± 0% -2.06% (p=0.000 n=10)
MemclrUnaligned/0_4M 52.28µ ± 0% 51.13µ ± 0% -2.21% (p=0.000 n=10)
MemclrUnaligned/0_8M 104.6µ ± 0% 102.3µ ± 0% -2.18% (p=0.000 n=10)
MemclrUnaligned/0_16M 209.5µ ± 0% 204.8µ ± 0% -2.24% (p=0.000 n=10)
MemclrUnaligned/0_64M 977.7µ ± 0% 969.1µ ± 0% -0.88% (p=0.000 n=10)
MemclrUnaligned/1_1M 17.49µ ± 0% 16.04µ ± 0% -8.32% (p=0.000 n=10)
MemclrUnaligned/1_4M 69.92µ ± 0% 64.13µ ± 0% -8.28% (p=0.000 n=10)
MemclrUnaligned/1_8M 139.8µ ± 0% 128.2µ ± 0% -8.32% (p=0.000 n=10)
MemclrUnaligned/1_16M 279.9µ ± 0% 256.1µ ± 0% -8.50% (p=0.000 n=10)
MemclrUnaligned/1_64M 1.250m ± 0% 1.216m ± 0% -2.73% (p=0.000 n=10)
MemclrUnaligned/4_1M 17.50µ ± 0% 16.04µ ± 0% -8.33% (p=0.000 n=10)
MemclrUnaligned/4_4M 69.93µ ± 0% 64.12µ ± 0% -8.30% (p=0.000 n=10)
MemclrUnaligned/4_8M 139.8µ ± 0% 128.2µ ± 0% -8.32% (p=0.000 n=10)
MemclrUnaligned/4_16M 280.2µ ± 0% 256.2µ ± 0% -8.55% (p=0.000 n=10)
MemclrUnaligned/4_64M 1.250m ± 0% 1.216m ± 0% -2.73% (p=0.000 n=10)
MemclrUnaligned/7_1M 17.50µ ± 0% 16.04µ ± 0% -8.35% (p=0.000 n=10)
MemclrUnaligned/7_4M 69.92µ ± 0% 64.13µ ± 0% -8.28% (p=0.000 n=10)
MemclrUnaligned/7_8M 139.8µ ± 0% 128.2µ ± 0% -8.34% (p=0.000 n=10)
MemclrUnaligned/7_16M 279.6µ ± 0% 256.2µ ± 0% -8.35% (p=0.000 n=10)
MemclrUnaligned/7_64M 1.250m ± 0% 1.216m ± 0% -2.73% (p=0.000 n=10)
MemclrRange/1K_2K 1.053µ ± 0% 1.020µ ± 1% -3.09% (p=0.000 n=10)
MemclrRange/2K_8K 1.552µ ± 0% 1.570µ ± 12% ~ (p=0.137 n=10)
MemclrRange/4K_16K 1.283µ ± 0% 1.250µ ± 0% -2.61% (p=0.000 n=10)
MemclrRange/160K_228K 20.62µ ± 0% 19.86µ ± 0% -3.70% (p=0.000 n=10)
MemclrKnownSize1 1.732n ± 0% 1.732n ± 0% ~ (p=1.000 n=10)
MemclrKnownSize2 1.925n ± 34% 1.967n ± 8% ~ (p=0.080 n=10)
MemclrKnownSize4 1.808n ± 3% 1.732n ± 0% -4.20% (p=0.000 n=10)
MemclrKnownSize8 2.002n ± 9% 1.773n ± 5% -11.46% (p=0.000 n=10)
MemclrKnownSize16 2.880n ± 5% 2.461n ± 5% -14.53% (p=0.000 n=10)
MemclrKnownSize32 8.082n ± 0% 2.838n ± 5% -64.88% (p=0.000 n=10)
MemclrKnownSize64 8.083n ± 0% 4.960n ± 4% -38.63% (p=0.000 n=10)
MemclrKnownSize112 8.082n ± 0% 5.533n ± 1% -31.53% (p=0.000 n=10)
MemclrKnownSize128 8.082n ± 0% 5.534n ± 1% -31.54% (p=0.000 n=10)
MemclrKnownSize192 8.082n ± 0% 6.833n ± 2% -15.45% (p=0.000 n=10)
MemclrKnownSize248 8.082n ± 0% 7.165n ± 1% -11.34% (p=0.000 n=10)
MemclrKnownSize256 2.995n ± 6% 3.226n ± 4% +7.70% (p=0.006 n=10)
MemclrKnownSize512 3.356n ± 8% 3.595n ± 3% +7.14% (p=0.007 n=10)
MemclrKnownSize1024 4.664n ± 0% 4.665n ± 0% ~ (p=0.426 n=10)
MemclrKnownSize4096 15.80n ± 4% 15.15n ± 0% ~ (p=0.449 n=10)
MemclrKnownSize512KiB 6.543µ ± 0% 6.380µ ± 0% -2.48% (p=0.000 n=10)
geomean 327.2n 286.6n -12.42%

Change-Id: I0f8450743e2f7e736c5ff96a316a8b5d98b27222
Reviewed-on: https://go-review.googlesource.com/c/go/+/662475
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent ac341b8 commit a177448

File tree

1 file changed

+120
-13
lines changed

1 file changed

+120
-13
lines changed

src/runtime/memclr_s390x.s

Lines changed: 120 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,13 +11,13 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT|NOFRAME,$0-16
1111
MOVD ptr+0(FP), R4
1212
MOVD n+8(FP), R5
1313

14+
CMPBGE R5, $32, clearge32
15+
1416
start:
1517
CMPBLE R5, $3, clear0to3
1618
CMPBLE R5, $7, clear4to7
1719
CMPBLE R5, $11, clear8to11
1820
CMPBLE R5, $15, clear12to15
19-
CMP R5, $32
20-
BGE clearmt32
2121
MOVD $0, 0(R4)
2222
MOVD $0, 8(R4)
2323
ADD $16, R4
@@ -102,23 +102,130 @@ clear15:
102102
MOVB $0, 14(R4)
103103
RET
104104

105-
clearmt32:
105+
clearge32:
106+
CMP R5, $4096
107+
BLT clear256Bto4KB
108+
109+
// For size >= 4KB, XC is loop unrolled 16 times (4KB = 256B * 16)
110+
clearge4KB:
111+
XC $256, 0(R4), 0(R4)
112+
ADD $256, R4
113+
ADD $-256, R5
114+
XC $256, 0(R4), 0(R4)
115+
ADD $256, R4
116+
ADD $-256, R5
117+
XC $256, 0(R4), 0(R4)
118+
ADD $256, R4
119+
ADD $-256, R5
120+
XC $256, 0(R4), 0(R4)
121+
ADD $256, R4
122+
ADD $-256, R5
123+
XC $256, 0(R4), 0(R4)
124+
ADD $256, R4
125+
ADD $-256, R5
126+
XC $256, 0(R4), 0(R4)
127+
ADD $256, R4
128+
ADD $-256, R5
129+
XC $256, 0(R4), 0(R4)
130+
ADD $256, R4
131+
ADD $-256, R5
132+
XC $256, 0(R4), 0(R4)
133+
ADD $256, R4
134+
ADD $-256, R5
135+
XC $256, 0(R4), 0(R4)
136+
ADD $256, R4
137+
ADD $-256, R5
138+
XC $256, 0(R4), 0(R4)
139+
ADD $256, R4
140+
ADD $-256, R5
141+
XC $256, 0(R4), 0(R4)
142+
ADD $256, R4
143+
ADD $-256, R5
144+
XC $256, 0(R4), 0(R4)
145+
ADD $256, R4
146+
ADD $-256, R5
147+
XC $256, 0(R4), 0(R4)
148+
ADD $256, R4
149+
ADD $-256, R5
150+
XC $256, 0(R4), 0(R4)
151+
ADD $256, R4
152+
ADD $-256, R5
153+
XC $256, 0(R4), 0(R4)
154+
ADD $256, R4
155+
ADD $-256, R5
156+
XC $256, 0(R4), 0(R4)
157+
ADD $256, R4
158+
ADD $-256, R5
159+
CMP R5, $4096
160+
BGE clearge4KB
161+
162+
clear256Bto4KB:
106163
CMP R5, $256
107-
BLT clearlt256
164+
BLT clear32to255
108165
XC $256, 0(R4), 0(R4)
109166
ADD $256, R4
110167
ADD $-256, R5
111-
BR clearmt32
112-
clearlt256:
168+
BR clear256Bto4KB
169+
170+
clear32to255:
113171
CMPBEQ R5, $0, done
114-
ADD $-1, R5
115-
EXRL $memclr_exrl_xc<>(SB), R5
116-
done:
172+
CMPBLT R5, $32, start
173+
CMPBEQ R5, $32, clear32
174+
CMPBLE R5, $64, clear33to64
175+
CMP R5, $128
176+
BLE clear65to128
177+
CMP R5, $255
178+
BLE clear129to255
179+
180+
clear32:
181+
VZERO V1
182+
VST V1, 0(R4)
183+
VST V1, 16(R4)
117184
RET
118185

119-
// DO NOT CALL - target for exrl (execute relative long) instruction.
120-
TEXT memclr_exrl_xc<>(SB),NOSPLIT|NOFRAME,$0-0
121-
XC $1, 0(R4), 0(R4)
122-
MOVD $0, 0(R0)
186+
clear33to64:
187+
VZERO V1
188+
VST V1, 0(R4)
189+
VST V1, 16(R4)
190+
ADD $-32, R5
191+
VST V1, 0(R4)(R5)
192+
VST V1, 16(R4)(R5)
193+
RET
194+
195+
clear65to128:
196+
VZERO V1
197+
VST V1, 0(R4)
198+
VST V1, 16(R4)
199+
VST V1, 32(R4)
200+
VST V1, 48(R4)
201+
ADD $-64, R5
202+
VST V1, 0(R4)(R5)
203+
VST V1, 16(R4)(R5)
204+
VST V1, 32(R4)(R5)
205+
VST V1, 48(R4)(R5)
206+
RET
207+
208+
clear129to255:
209+
VZERO V1
210+
VST V1, 0(R4)
211+
VST V1, 16(R4)
212+
VST V1, 32(R4)
213+
VST V1, 48(R4)
214+
VST V1, 64(R4)
215+
VST V1, 80(R4)
216+
VST V1, 96(R4)
217+
VST V1, 112(R4)
218+
ADD $-128, R5
219+
VST V1, 0(R4)(R5)
220+
VST V1, 16(R4)(R5)
221+
VST V1, 32(R4)(R5)
222+
VST V1, 48(R4)(R5)
223+
VST V1, 64(R4)(R5)
224+
VST V1, 80(R4)(R5)
225+
VST V1, 96(R4)(R5)
226+
VST V1, 112(R4)(R5)
227+
RET
228+
229+
done:
123230
RET
124231

0 commit comments

Comments
 (0)