Skip to content

Commit 63dcc7b

Browse files
rolandshoemaker authored and gopherbot committed
crypto/sha1: add sha-ni AMD64 implementation
Based on the Intel docs.

Provides a ~44% speed-up compared to the AVX implementation and a ~57% speed-up compared to the generic AMD64 assembly implementation.

                      │ /usr/local/google/home/bracewell/sha1-avx.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │
                      │ sec/op                                          │ sec/op          vs base                              │
Hash8Bytes/New-24       157.60n ± 0%    92.51n ± 0%   -41.30% (p=0.000 n=20)
Hash8Bytes/Sum-24       147.00n ± 0%    85.06n ± 0%   -42.14% (p=0.000 n=20)
Hash320Bytes/New-24     625.3n ± 0%     276.7n ± 0%   -55.75% (p=0.000 n=20)
Hash320Bytes/Sum-24     626.2n ± 0%     272.4n ± 0%   -56.51% (p=0.000 n=20)
Hash1K/New-24           1206.5n ± 0%    692.2n ± 0%   -42.63% (p=0.000 n=20)
Hash1K/Sum-24           1210.0n ± 0%    688.2n ± 0%   -43.13% (p=0.000 n=20)
Hash8K/New-24           7.744µ ± 0%     4.920µ ± 0%   -36.46% (p=0.000 n=20)
Hash8K/Sum-24           7.737µ ± 0%     4.913µ ± 0%   -36.50% (p=0.000 n=20)
geomean                 971.5n          536.1n        -44.81%

                      │ /usr/local/google/home/bracewell/sha1-avx.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │
                      │ B/s                                             │ B/s             vs base                              │
Hash8Bytes/New-24       48.41Mi ± 0%    82.47Mi ± 0%    +70.37% (p=0.000 n=20)
Hash8Bytes/Sum-24       51.90Mi ± 0%    89.70Mi ± 0%    +72.82% (p=0.000 n=20)
Hash320Bytes/New-24     488.0Mi ± 0%    1103.0Mi ± 0%   +126.01% (p=0.000 n=20)
Hash320Bytes/Sum-24     487.4Mi ± 0%    1120.5Mi ± 0%   +129.91% (p=0.000 n=20)
Hash1K/New-24           809.6Mi ± 0%    1410.8Mi ± 0%   +74.26% (p=0.000 n=20)
Hash1K/Sum-24           806.9Mi ± 0%    1419.1Mi ± 0%   +75.86% (p=0.000 n=20)
Hash8K/New-24           1008.9Mi ± 0%   1588.0Mi ± 0%   +57.40% (p=0.000 n=20)
Hash8K/Sum-24           1009.8Mi ± 0%   1590.1Mi ± 0%   +57.47% (p=0.000 n=20)
geomean                 375.8Mi         680.9Mi         +81.20%

                      │ /usr/local/google/home/bracewell/sha1-amd64.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │
                      │ sec/op                                            │ sec/op          vs base                              │
Hash8Bytes/New-24       153.90n ± 0%    92.51n ± 0%   -39.89% (p=0.000 n=20)
Hash8Bytes/Sum-24       145.90n ± 0%    85.06n ± 0%   -41.70% (p=0.000 n=20)
Hash320Bytes/New-24     666.8n ± 0%     276.7n ± 0%   -58.50% (p=0.000 n=20)
Hash320Bytes/Sum-24     660.3n ± 0%     272.4n ± 0%   -58.75% (p=0.000 n=20)
Hash1K/New-24           1810.5n ± 0%    692.2n ± 0%   -61.77% (p=0.000 n=20)
Hash1K/Sum-24           1806.0n ± 0%    688.2n ± 0%   -61.90% (p=0.000 n=20)
Hash8K/New-24           13.509µ ± 0%    4.920µ ± 0%   -63.58% (p=0.000 n=20)
Hash8K/Sum-24           13.515µ ± 0%    4.913µ ± 0%   -63.65% (p=0.000 n=20)
geomean                 1.248µ          536.1n        -57.05%

                      │ /usr/local/google/home/bracewell/sha1-amd64.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │
                      │ B/s                                               │ B/s             vs base                              │
Hash8Bytes/New-24       49.57Mi ± 0%    82.47Mi ± 0%    +66.37% (p=0.000 n=20)
Hash8Bytes/Sum-24       52.29Mi ± 0%    89.70Mi ± 0%    +71.52% (p=0.000 n=20)
Hash320Bytes/New-24     457.7Mi ± 0%    1103.0Mi ± 0%   +140.97% (p=0.000 n=20)
Hash320Bytes/Sum-24     462.2Mi ± 0%    1120.5Mi ± 0%   +142.45% (p=0.000 n=20)
Hash1K/New-24           539.4Mi ± 0%    1410.8Mi ± 0%   +161.57% (p=0.000 n=20)
Hash1K/Sum-24           540.7Mi ± 0%    1419.1Mi ± 0%   +162.44% (p=0.000 n=20)
Hash8K/New-24           578.4Mi ± 0%    1588.0Mi ± 0%   +174.57% (p=0.000 n=20)
Hash8K/Sum-24           578.1Mi ± 0%    1590.1Mi ± 0%   +175.07% (p=0.000 n=20)
geomean                 292.4Mi         680.9Mi         +132.86%

Change-Id: Ife90386ba410a80c2e6222c1fe4df2368c4e12b2
Reviewed-on: https://go-review.googlesource.com/c/go/+/642157
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Auto-Submit: Roland Shoemaker <roland@golang.org>
Reviewed-by: Neal Patel <nealpatel@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 40b19b5 commit 63dcc7b

File tree

4 files changed

+381
-1
lines changed

4 files changed

+381
-1
lines changed

src/crypto/sha1/_asm/sha1block_amd64_asm.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ func main() {
2525
ConstraintExpr("!purego")
2626
blockAMD64()
2727
blockAVX2()
28+
blockSHANI()
2829
Generate()
2930
}
3031

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
import (
8+
"fmt"
9+
10+
. "github.com/mmcloughlin/avo/build"
11+
. "github.com/mmcloughlin/avo/operand"
12+
. "github.com/mmcloughlin/avo/reg"
13+
)
14+
15+
// Implement the SHA-1 block function using the Intel(R) SHA extensions
16+
// (SHA1RNDS4, SHA1NEXTE, SHA1MSG1, and SHA1MSG2). This implementation requires
17+
// the AVX, SHA, SSE2, SSE4.1, and SSSE3 extensions.
18+
//
19+
// Reference:
20+
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
21+
// Algorithm on Intel® Architecture Processors", July 2013
22+
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
23+
24+
func blockSHANI() {
25+
Implement("blockSHANI")
26+
27+
digest := Load(Param("dig"), RDI)
28+
data := Load(Param("p").Base(), RSI)
29+
len := Load(Param("p").Len(), RDX)
30+
31+
abcd := XMM()
32+
msg0, msg1, msg2, msg3 := XMM(), XMM(), XMM(), XMM()
33+
e0, e1 := XMM(), XMM()
34+
shufMask := XMM()
35+
36+
CMPQ(len, Imm(0))
37+
JEQ(LabelRef("done"))
38+
ADDQ(data, len)
39+
40+
stackPtr := GP64()
41+
{
42+
Comment("Allocate space on the stack for saving ABCD and E0, and align it to 16 bytes")
43+
local := AllocLocal(32 + 16)
44+
LEAQ(local.Offset(15), stackPtr)
45+
tmp := GP64()
46+
MOVQ(U64(15), tmp)
47+
NOTQ(tmp)
48+
ANDQ(tmp, stackPtr)
49+
}
50+
e0_save := Mem{Base: stackPtr}
51+
abcd_save := Mem{Base: stackPtr}.Offset(16)
52+
53+
Comment("Load initial hash state")
54+
PINSRD(Imm(3), Mem{Base: digest}.Offset(16), e0)
55+
VMOVDQU(Mem{Base: digest}, abcd)
56+
PAND(upperMask(), e0)
57+
PSHUFD(Imm(0x1b), abcd, abcd)
58+
59+
VMOVDQA(flipMask(), shufMask)
60+
61+
Label("loop")
62+
63+
Comment("Save ABCD and E working values")
64+
VMOVDQA(e0, e0_save)
65+
VMOVDQA(abcd, abcd_save)
66+
67+
Comment("Rounds 0-3")
68+
VMOVDQU(Mem{Base: data}, msg0)
69+
PSHUFB(shufMask, msg0)
70+
PADDD(msg0, e0)
71+
VMOVDQA(abcd, e1)
72+
SHA1RNDS4(Imm(0), e0, abcd)
73+
74+
Comment("Rounds 4-7")
75+
VMOVDQU(Mem{Base: data}.Offset(16), msg1)
76+
PSHUFB(shufMask, msg1)
77+
SHA1NEXTE(msg1, e1)
78+
VMOVDQA(abcd, e0)
79+
SHA1RNDS4(Imm(0), e1, abcd)
80+
SHA1MSG1(msg1, msg0)
81+
82+
Comment("Rounds 8-11")
83+
VMOVDQU(Mem{Base: data}.Offset(16*2), msg2)
84+
PSHUFB(shufMask, msg2)
85+
SHA1NEXTE(msg2, e0)
86+
VMOVDQA(abcd, e1)
87+
SHA1RNDS4(Imm(0), e0, abcd)
88+
SHA1MSG1(msg2, msg1)
89+
PXOR(msg2, msg0)
90+
91+
// Rounds 12 through 67 use the same repeated pattern, with e0 and e1 ping-ponging
92+
// back and forth, and each of the msg temporaries moving up one every four rounds.
93+
msgs := []VecVirtual{msg3, msg0, msg1, msg2}
94+
for i := range 14 {
95+
Comment(fmt.Sprintf("Rounds %d-%d", 12+(i*4), 12+(i*4)+3))
96+
a, b := e1, e0
97+
if i == 0 {
98+
VMOVDQU(Mem{Base: data}.Offset(16*3), msg3)
99+
PSHUFB(shufMask, msg3)
100+
}
101+
if i%2 == 1 {
102+
a, b = e0, e1
103+
}
104+
imm := uint64((12 + i*4) / 20)
105+
106+
SHA1NEXTE(msgs[i%4], a)
107+
VMOVDQA(abcd, b)
108+
SHA1MSG2(msgs[i%4], msgs[(1+i)%4])
109+
SHA1RNDS4(Imm(imm), a, abcd)
110+
SHA1MSG1(msgs[i%4], msgs[(3+i)%4])
111+
PXOR(msgs[i%4], msgs[(2+i)%4])
112+
}
113+
114+
Comment("Rounds 68-71")
115+
SHA1NEXTE(msg1, e1)
116+
VMOVDQA(abcd, e0)
117+
SHA1MSG2(msg1, msg2)
118+
SHA1RNDS4(Imm(3), e1, abcd)
119+
PXOR(msg1, msg3)
120+
121+
Comment("Rounds 72-75")
122+
SHA1NEXTE(msg2, e0)
123+
VMOVDQA(abcd, e1)
124+
SHA1MSG2(msg2, msg3)
125+
SHA1RNDS4(Imm(3), e0, abcd)
126+
127+
Comment("Rounds 76-79")
128+
SHA1NEXTE(msg3, e1)
129+
VMOVDQA(abcd, e0)
130+
SHA1RNDS4(Imm(3), e1, abcd)
131+
132+
Comment("Add saved E and ABCD")
133+
SHA1NEXTE(e0_save, e0)
134+
PADDD(abcd_save, abcd)
135+
136+
Comment("Check if we are done, if not return to the loop")
137+
ADDQ(Imm(64), data)
138+
CMPQ(data, len)
139+
JNE(LabelRef("loop"))
140+
141+
Comment("Write the hash state back to digest")
142+
PSHUFD(Imm(0x1b), abcd, abcd)
143+
VMOVDQU(abcd, Mem{Base: digest})
144+
PEXTRD(Imm(3), e0, Mem{Base: digest}.Offset(16))
145+
146+
Label("done")
147+
RET()
148+
}
149+
150+
func flipMask() Mem {
151+
mask := GLOBL("shuffle_mask", RODATA)
152+
// 0x000102030405060708090a0b0c0d0e0f
153+
DATA(0x00, U64(0x08090a0b0c0d0e0f))
154+
DATA(0x08, U64(0x0001020304050607))
155+
return mask
156+
}
157+
158+
func upperMask() Mem {
159+
mask := GLOBL("upper_mask", RODATA)
160+
// 0xFFFFFFFF000000000000000000000000
161+
DATA(0x00, U64(0x0000000000000000))
162+
DATA(0x08, U64(0xFFFFFFFF00000000))
163+
return mask
164+
}

src/crypto/sha1/sha1block_amd64.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,16 @@ func blockAVX2(dig *digest, p []byte)
1414
//go:noescape
1515
func blockAMD64(dig *digest, p []byte)
1616

17+
// blockSHANI is declared without a Go body; the generator under _asm emits
// its implementation via Implement("blockSHANI"). It takes the digest to
// update and the input bytes p.
//
//go:noescape
18+
func blockSHANI(dig *digest, p []byte)
19+
1720
// useAVX2 is true when the CPU reports AVX, AVX2, BMI1 and BMI2.
var useAVX2 = cpu.X86.HasAVX && cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2
21+
// useSHANI is true when the CPU reports AVX, SHA, SSE4.1 and SSSE3. block
// checks it before useAVX2, so blockSHANI is preferred when both are set.
var useSHANI = cpu.X86.HasAVX && cpu.X86.HasSHA && cpu.X86.HasSSE41 && cpu.X86.HasSSSE3
1822

1923
func block(dig *digest, p []byte) {
20-
if useAVX2 && len(p) >= 256 {
24+
if useSHANI {
25+
blockSHANI(dig, p)
26+
} else if useAVX2 && len(p) >= 256 {
2127
// blockAVX2 calculates sha1 for 2 block per iteration
2228
// it also interleaves precalculation for next block.
2329
// So it may read up-to 192 bytes past end of p

0 commit comments

Comments
 (0)