Skip to content

Commit 95ed0e3

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 83a62fe + ca66f90 commit 95ed0e3

File tree

116 files changed

+1249
-717
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

116 files changed

+1249
-717
lines changed

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 121 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -142,45 +142,6 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
142142
a.Index = i
143143
}
144144

145-
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
146-
// See runtime/mkduff.go.
147-
const (
148-
dzBlocks = 16 // number of MOV/ADD blocks
149-
dzBlockLen = 4 // number of clears per block
150-
dzBlockSize = 23 // size of instructions in a single block
151-
dzMovSize = 5 // size of single MOV instruction w/ offset
152-
dzLeaqSize = 4 // size of single LEAQ instruction
153-
dzClearStep = 16 // number of bytes cleared by each MOV instruction
154-
)
155-
156-
func duffStart(size int64) int64 {
157-
x, _ := duff(size)
158-
return x
159-
}
160-
func duffAdj(size int64) int64 {
161-
_, x := duff(size)
162-
return x
163-
}
164-
165-
// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
166-
// required to use the duffzero mechanism for a block of the given size.
167-
func duff(size int64) (int64, int64) {
168-
if size < 32 || size > 1024 || size%dzClearStep != 0 {
169-
panic("bad duffzero size")
170-
}
171-
steps := size / dzClearStep
172-
blocks := steps / dzBlockLen
173-
steps %= dzBlockLen
174-
off := dzBlockSize * (dzBlocks - blocks)
175-
var adj int64
176-
if steps != 0 {
177-
off -= dzLeaqSize
178-
off -= dzMovSize * steps
179-
adj -= dzClearStep * (dzBlockLen - steps)
180-
}
181-
return off, adj
182-
}
183-
184145
func getgFromTLS(s *ssagen.State, r int16) {
185146
// See the comments in cmd/internal/obj/x86/obj6.go
186147
// near CanUse1InsnTLS for a detailed explanation of these instructions.
@@ -1104,20 +1065,110 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
11041065
zero16(off + n - 16)
11051066
}
11061067

1107-
case ssa.OpAMD64DUFFCOPY:
1108-
p := s.Prog(obj.ADUFFCOPY)
1109-
p.To.Type = obj.TYPE_ADDR
1110-
p.To.Sym = ir.Syms.Duffcopy
1111-
if v.AuxInt%16 != 0 {
1112-
v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
1068+
case ssa.OpAMD64LoweredMove:
1069+
dstReg := v.Args[0].Reg()
1070+
srcReg := v.Args[1].Reg()
1071+
if dstReg == srcReg {
1072+
break
1073+
}
1074+
tmpReg := int16(x86.REG_X14)
1075+
n := v.AuxInt
1076+
if n < 16 {
1077+
v.Fatalf("Move too small %d", n)
1078+
}
1079+
// move 16 bytes from srcReg+off to dstReg+off.
1080+
move16 := func(off int64) {
1081+
move16(s, srcReg, dstReg, tmpReg, off)
1082+
}
1083+
1084+
// Generate copying instructions.
1085+
var off int64
1086+
for n >= 16 {
1087+
move16(off)
1088+
off += 16
1089+
n -= 16
1090+
}
1091+
if n != 0 {
1092+
// use partially overlapped read/write.
1093+
// TODO: use smaller operations when we can?
1094+
move16(off + n - 16)
1095+
}
1096+
1097+
case ssa.OpAMD64LoweredMoveLoop:
1098+
dstReg := v.Args[0].Reg()
1099+
srcReg := v.Args[1].Reg()
1100+
if dstReg == srcReg {
1101+
break
1102+
}
1103+
countReg := v.RegTmp()
1104+
tmpReg := int16(x86.REG_X14)
1105+
n := v.AuxInt
1106+
loopSize := int64(64)
1107+
if n < 3*loopSize {
1108+
// - a loop count of 0 won't work.
1109+
// - a loop count of 1 is useless.
1110+
// - a loop count of 2 is a code size ~tie
1111+
// 4 instructions to implement the loop
1112+
// 4 instructions in the loop body
1113+
// vs
1114+
// 8 instructions in the straightline code
1115+
// Might as well use straightline code.
1116+
v.Fatalf("ZeroLoop size too small %d", n)
1117+
}
1118+
// move 16 bytes from srcReg+off to dstReg+off.
1119+
move16 := func(off int64) {
1120+
move16(s, srcReg, dstReg, tmpReg, off)
1121+
}
1122+
1123+
// Put iteration count in a register.
1124+
// MOVL $n, countReg
1125+
p := s.Prog(x86.AMOVL)
1126+
p.From.Type = obj.TYPE_CONST
1127+
p.From.Offset = n / loopSize
1128+
p.To.Type = obj.TYPE_REG
1129+
p.To.Reg = countReg
1130+
cntInit := p
1131+
1132+
// Copy loopSize bytes starting at srcReg to dstReg.
1133+
for i := range loopSize / 16 {
1134+
move16(i * 16)
1135+
}
1136+
// ADDQ $loopSize, srcReg
1137+
p = s.Prog(x86.AADDQ)
1138+
p.From.Type = obj.TYPE_CONST
1139+
p.From.Offset = loopSize
1140+
p.To.Type = obj.TYPE_REG
1141+
p.To.Reg = srcReg
1142+
// ADDQ $loopSize, dstReg
1143+
p = s.Prog(x86.AADDQ)
1144+
p.From.Type = obj.TYPE_CONST
1145+
p.From.Offset = loopSize
1146+
p.To.Type = obj.TYPE_REG
1147+
p.To.Reg = dstReg
1148+
// DECL countReg
1149+
p = s.Prog(x86.ADECL)
1150+
p.To.Type = obj.TYPE_REG
1151+
p.To.Reg = countReg
1152+
// Jump to loop header if we're not done yet.
1153+
// JNE head
1154+
p = s.Prog(x86.AJNE)
1155+
p.To.Type = obj.TYPE_BRANCH
1156+
p.To.SetTarget(cntInit.Link)
1157+
1158+
// Multiples of the loop size are now done.
1159+
n %= loopSize
1160+
1161+
// Copy any fractional portion.
1162+
var off int64
1163+
for n >= 16 {
1164+
move16(off)
1165+
off += 16
1166+
n -= 16
1167+
}
1168+
if n != 0 {
1169+
// Use partially-overlapping copy.
1170+
move16(off + n - 16)
11131171
}
1114-
p.To.Offset = 14 * (64 - v.AuxInt/16)
1115-
// 14 and 64 are magic constants. 14 is the number of bytes to encode:
1116-
// MOVUPS (SI), X0
1117-
// ADDQ $16, SI
1118-
// MOVUPS X0, (DI)
1119-
// ADDQ $16, DI
1120-
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
11211172

11221173
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
11231174
if v.Type.IsMemory() {
@@ -1709,3 +1760,21 @@ func zero16(s *ssagen.State, reg int16, off int64) {
17091760
p.To.Reg = reg
17101761
p.To.Offset = off
17111762
}
1763+
1764+
// move 16 bytes from src+off to dst+off using temporary register tmp.
1765+
func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
1766+
// MOVUPS off(srcReg), tmpReg
1767+
// MOVUPS tmpReg, off(dstReg)
1768+
p := s.Prog(x86.AMOVUPS)
1769+
p.From.Type = obj.TYPE_MEM
1770+
p.From.Reg = src
1771+
p.From.Offset = off
1772+
p.To.Type = obj.TYPE_REG
1773+
p.To.Reg = tmp
1774+
p = s.Prog(x86.AMOVUPS)
1775+
p.From.Type = obj.TYPE_REG
1776+
p.From.Reg = tmp
1777+
p.To.Type = obj.TYPE_MEM
1778+
p.To.Reg = dst
1779+
p.To.Offset = off
1780+
}

src/cmd/compile/internal/loong64/ssa.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
10651065
{Type: obj.TYPE_CONST, Offset: int64((v.AuxInt >> 0) & 0x1f)},
10661066
})
10671067

1068+
case ssa.OpLOONG64ADDshiftLLV:
1069+
// ADDshiftLLV Rarg0, Rarg1, $shift
1070+
// ALSLV $shift, Rarg1, Rarg0, Rtmp
1071+
p := s.Prog(v.Op.Asm())
1072+
p.From.Type = obj.TYPE_CONST
1073+
p.From.Offset = v.AuxInt
1074+
p.Reg = v.Args[1].Reg()
1075+
p.AddRestSourceReg(v.Args[0].Reg())
1076+
p.To.Type = obj.TYPE_REG
1077+
p.To.Reg = v.Reg()
1078+
10681079
case ssa.OpClobber, ssa.OpClobberReg:
10691080
// TODO: implement for clobberdead experiment. Nop is ok for now.
10701081
default:

src/cmd/compile/internal/ssa/_gen/AMD64.rules

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -264,24 +264,6 @@
264264
(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
265265
(Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem)
266266

267-
(Move [32] dst src mem) =>
268-
(Move [16]
269-
(OffPtr <dst.Type> dst [16])
270-
(OffPtr <src.Type> src [16])
271-
(Move [16] dst src mem))
272-
273-
(Move [48] dst src mem) =>
274-
(Move [32]
275-
(OffPtr <dst.Type> dst [16])
276-
(OffPtr <src.Type> src [16])
277-
(Move [16] dst src mem))
278-
279-
(Move [64] dst src mem) =>
280-
(Move [32]
281-
(OffPtr <dst.Type> dst [32])
282-
(OffPtr <src.Type> src [32])
283-
(Move [32] dst src mem))
284-
285267
(Move [3] dst src mem) =>
286268
(MOVBstore [2] dst (MOVBload [2] src mem)
287269
(MOVWstore dst (MOVWload src mem) mem))
@@ -310,28 +292,19 @@
310292
(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
311293
(MOVQstore dst (MOVQload src mem) mem))
312294

313-
// Adjust moves to be a multiple of 16 bytes.
314-
(Move [s] dst src mem)
315-
&& s > 16 && s%16 != 0 && s%16 <= 8 =>
316-
(Move [s-s%16]
317-
(OffPtr <dst.Type> dst [s%16])
318-
(OffPtr <src.Type> src [s%16])
319-
(MOVQstore dst (MOVQload src mem) mem))
320-
(Move [s] dst src mem)
321-
&& s > 16 && s%16 != 0 && s%16 > 8 =>
322-
(Move [s-s%16]
323-
(OffPtr <dst.Type> dst [s%16])
324-
(OffPtr <src.Type> src [s%16])
325-
(MOVOstore dst (MOVOload src mem) mem))
326-
327-
// Medium copying uses a duff device.
328-
(Move [s] dst src mem)
329-
&& s > 64 && s <= 16*64 && s%16 == 0
330-
&& logLargeCopy(v, s) =>
331-
(DUFFCOPY [s] dst src mem)
295+
// Copying up to 192 bytes uses straightline code.
296+
(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
297+
298+
// Copying up to ~1KB uses a small loop.
299+
(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
332300

333301
// Large copying uses REP MOVSQ.
334-
(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) =>
302+
(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 =>
303+
(Move [s-s%8]
304+
(OffPtr <dst.Type> dst [s%8])
305+
(OffPtr <src.Type> src [s%8])
306+
(MOVQstore dst (MOVQload src mem) mem))
307+
(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) =>
335308
(REPMOVSQ dst src (MOVQconst [s/8]) mem)
336309

337310
// Lowering Zero instructions

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -939,20 +939,38 @@ func init() {
939939
// arg0 = destination pointer
940940
// arg1 = source pointer
941941
// arg2 = mem
942-
// auxint = # of bytes to copy, must be multiple of 16
942+
// auxint = # of bytes to copy
943943
// returns memory
944944
{
945-
name: "DUFFCOPY",
945+
name: "LoweredMove",
946946
aux: "Int64",
947947
argLength: 3,
948948
reg: regInfo{
949-
inputs: []regMask{buildReg("DI"), buildReg("SI")},
950-
clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
949+
inputs: []regMask{gp, gp},
950+
clobbers: buildReg("X14"), // uses X14 as a temporary
951951
},
952-
clobberFlags: true,
953-
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
954-
//faultOnNilArg1: true,
955-
unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
952+
faultOnNilArg0: true,
953+
faultOnNilArg1: true,
954+
},
955+
// arg0 = destination pointer
956+
// arg1 = source pointer
957+
// arg2 = mem
958+
// auxint = # of bytes to copy
959+
// returns memory
960+
{
961+
name: "LoweredMoveLoop",
962+
aux: "Int64",
963+
argLength: 3,
964+
reg: regInfo{
965+
inputs: []regMask{gp, gp},
966+
clobbers: buildReg("X14"), // uses X14 as a temporary
967+
clobbersArg0: true,
968+
clobbersArg1: true,
969+
},
970+
clobberFlags: true,
971+
faultOnNilArg0: true,
972+
faultOnNilArg1: true,
973+
needIntTemp: true,
956974
},
957975

958976
// arg0 = destination pointer

src/cmd/compile/internal/ssa/_gen/LOONG64.rules

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -951,6 +951,10 @@
951951
(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
952952
(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)
953953

954+
// absorb NEGV into branches
955+
(EQZ (NEGV x) yes no) => (EQZ x yes no)
956+
(NEZ (NEGV x) yes no) => (NEZ x yes no)
957+
954958
// Convert branch with zero to more optimal branch zero.
955959
(BEQ (MOVVconst [0]) cond yes no) => (EQZ cond yes no)
956960
(BEQ cond (MOVVconst [0]) yes no) => (EQZ cond yes no)

src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,8 @@ func init() {
577577
// is $hint and bit[41:5] is $n.
578578
{name: "PRELD", argLength: 2, aux: "Int64", reg: preldreg, asm: "PRELD", hasSideEffects: true},
579579
{name: "PRELDX", argLength: 2, aux: "Int64", reg: preldreg, asm: "PRELDX", hasSideEffects: true},
580+
581+
{name: "ADDshiftLLV", argLength: 2, aux: "Int64", reg: gp21, asm: "ALSLV"}, // arg0 + arg1<<auxInt, the value of auxInt should be in the range [1, 4].
580582
}
581583

582584
blocks := []blockData{

src/cmd/compile/internal/ssa/config.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ func (c *Config) buildRecipes(arch string) {
566566
}
567567
case "loong64":
568568
// - multiply is 4 cycles.
569-
// - add/sub/shift are 1 cycle.
569+
// - add/sub/shift/alsl are 1 cycle.
570570
// On loong64, using a multiply also needs to load the constant into a register.
571571
// TODO: figure out a happy medium.
572572
mulCost = 45
@@ -601,6 +601,15 @@ func (c *Config) buildRecipes(arch string) {
601601
return m.Block.NewValue1I(m.Pos, OpLOONG64SLLVconst, m.Type, int64(i), x)
602602
})
603603
}
604+
605+
// ADDshiftLLV
606+
for i := 1; i < 5; i++ {
607+
c := 10
608+
r(1, 1<<i, c,
609+
func(m, x, y *Value) *Value {
610+
return m.Block.NewValue2I(m.Pos, OpLOONG64ADDshiftLLV, m.Type, int64(i), x, y)
611+
})
612+
}
604613
}
605614

606615
c.mulRecipes = map[int64]mulRecipe{}
@@ -718,7 +727,7 @@ func (c *Config) buildRecipes(arch string) {
718727
// Currently:
719728
// len(c.mulRecipes) == 5984 on arm64
720729
// 680 on amd64
721-
// 5984 on loong64
730+
// 9738 on loong64
722731
// This function takes ~2.5ms on arm64.
723732
//println(len(c.mulRecipes))
724733
}

0 commit comments

Comments
 (0)