@@ -142,45 +142,6 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
 	a.Index = i
 }
 
-// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
-// See runtime/mkduff.go.
-const (
-	dzBlocks    = 16 // number of MOV/ADD blocks
-	dzBlockLen  = 4  // number of clears per block
-	dzBlockSize = 23 // size of instructions in a single block
-	dzMovSize   = 5  // size of single MOV instruction w/ offset
-	dzLeaqSize  = 4  // size of single LEAQ instruction
-	dzClearStep = 16 // number of bytes cleared by each MOV instruction
-)
-
-func duffStart(size int64) int64 {
-	x, _ := duff(size)
-	return x
-}
-func duffAdj(size int64) int64 {
-	_, x := duff(size)
-	return x
-}
-
-// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
-// required to use the duffzero mechanism for a block of the given size.
-func duff(size int64) (int64, int64) {
-	if size < 32 || size > 1024 || size%dzClearStep != 0 {
-		panic("bad duffzero size")
-	}
-	steps := size / dzClearStep
-	blocks := steps / dzBlockLen
-	steps %= dzBlockLen
-	off := dzBlockSize * (dzBlocks - blocks)
-	var adj int64
-	if steps != 0 {
-		off -= dzLeaqSize
-		off -= dzMovSize * steps
-		adj -= dzClearStep * (dzBlockLen - steps)
-	}
-	return off, adj
-}
-
 func getgFromTLS(s *ssagen.State, r int16) {
 	// See the comments in cmd/internal/obj/x86/obj6.go
 	// near CanUse1InsnTLS for a detailed explanation of these instructions.
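
For reference, the deleted duff helper mapped a clear size onto an entry offset into runtime·duffzero plus a pointer pre-adjustment: clearing fewer than the maximum 1024 bytes means entering the routine partway through, and entering mid-block also requires pre-adjusting the destination pointer. A minimal standalone sketch of that arithmetic, with worked examples (the sizes and printed values are mine, derived from the constants above):

package main

import "fmt"

// Constants as in the deleted block above.
const (
	dzBlocks    = 16 // number of MOV/ADD blocks
	dzBlockLen  = 4  // number of clears per block
	dzBlockSize = 23 // size of instructions in a single block
	dzMovSize   = 5  // size of single MOV instruction w/ offset
	dzLeaqSize  = 4  // size of single LEAQ instruction
	dzClearStep = 16 // number of bytes cleared by each MOV instruction
)

// duff mirrors the deleted helper (size-range checks omitted): it returns
// the entry offset into duffzero and the pointer adjustment for a clear
// of the given size.
func duff(size int64) (off, adj int64) {
	steps := size / dzClearStep
	blocks := steps / dzBlockLen
	steps %= dzBlockLen
	off = dzBlockSize * (dzBlocks - blocks)
	if steps != 0 {
		// Enter mid-block: skip the block's LEAQ and its unused MOVs,
		// and back the pointer up to compensate for the skipped clears.
		off -= dzLeaqSize
		off -= dzMovSize * steps
		adj -= dzClearStep * (dzBlockLen - steps)
	}
	return off, adj
}

func main() {
	// 96 bytes = 6 MOVUPS clears = 1 full block + 2 extra steps:
	// off = 23*(16-1) - 4 - 5*2 = 331, adj = -16*(4-2) = -32.
	fmt.Println(duff(96)) // 331 -32
	// 1024 bytes runs all 16 blocks from the top.
	fmt.Println(duff(1024)) // 0 0
}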
@@ -1104,20 +1065,110 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			zero16(off + n - 16)
 		}
 
-	case ssa.OpAMD64DUFFCOPY:
-		p := s.Prog(obj.ADUFFCOPY)
-		p.To.Type = obj.TYPE_ADDR
-		p.To.Sym = ir.Syms.Duffcopy
-		if v.AuxInt%16 != 0 {
-			v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
+	case ssa.OpAMD64LoweredMove:
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		tmpReg := int16(x86.REG_X14)
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Move too small %d", n)
+		}
+		// move 16 bytes from srcReg+off to dstReg+off.
+		move16 := func(off int64) {
+			move16(s, srcReg, dstReg, tmpReg, off)
+		}
+
+		// Generate copying instructions.
+		var off int64
+		for n >= 16 {
+			move16(off)
+			off += 16
+			n -= 16
+		}
+		if n != 0 {
+			// use partially overlapped read/write.
+			// TODO: use smaller operations when we can?
+			move16(off + n - 16)
+		}
+
+	case ssa.OpAMD64LoweredMoveLoop:
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		countReg := v.RegTmp()
+		tmpReg := int16(x86.REG_X14)
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie:
+			//     4 instructions to implement the loop
+			//     4 instructions in the loop body
+			//   vs
+			//     8 instructions in the straightline code.
+			//   Might as well use straightline code.
1116+ v .Fatalf ("ZeroLoop size too small %d" , n )
+		}
+		// move 16 bytes from srcReg+off to dstReg+off.
+		move16 := func(off int64) {
+			move16(s, srcReg, dstReg, tmpReg, off)
+		}
+
+		// Put iteration count in a register.
+		//   MOVL $n, countReg
+		p := s.Prog(x86.AMOVL)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Copy loopSize bytes starting at srcReg to dstReg.
+		for i := range loopSize / 16 {
+			move16(i * 16)
+		}
+		// ADDQ $loopSize, srcReg
+		p = s.Prog(x86.AADDQ)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = srcReg
+		// ADDQ $loopSize, dstReg
+		p = s.Prog(x86.AADDQ)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = dstReg
+		// DECL countReg
+		p = s.Prog(x86.ADECL)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		// Jump to loop header if we're not done yet.
+		//   JNE head
+		p = s.Prog(x86.AJNE)
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		// Copy any fractional portion.
+		var off int64
+		for n >= 16 {
+			move16(off)
+			off += 16
+			n -= 16
+		}
+		if n != 0 {
+			// Use partially-overlapping copy.
+			move16(off + n - 16)
 		}
-		p.To.Offset = 14 * (64 - v.AuxInt/16)
-		// 14 and 64 are magic constants. 14 is the number of bytes to encode:
-		//	MOVUPS	(SI), X0
-		//	ADDQ	$16, SI
-		//	MOVUPS	X0, (DI)
-		//	ADDQ	$16, DI
-		// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
 
 	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
 		if v.Type.IsMemory() {
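
For comparison, the deleted DUFFCOPY lowering did not emit the copy inline: it jumped 14*(64 - AuxInt/16) bytes into runtime·duffcopy, so e.g. a 128-byte copy entered at offset 14*56 = 784 and ran only the last 8 of the 64 four-instruction blocks. The new OpAMD64LoweredMove case emits the MOVUPS pairs directly. (One subtlety: inside the `move16 := func(off int64)` literal, the name move16 still refers to the package-level helper added at the bottom of this diff, because a short variable declaration is not in scope within its own initializer.) The chunk layout is easiest to see in a standalone sketch; moveChunks is a hypothetical name of mine, not part of the commit:

package main

import "fmt"

// moveChunks mirrors the chunking in the OpAMD64LoweredMove case: one
// 16-byte MOVUPS load/store pair per returned offset. n must be >= 16.
func moveChunks(n int64) []int64 {
	var offs []int64
	var off int64
	for n >= 16 {
		offs = append(offs, off)
		off += 16
		n -= 16
	}
	if n != 0 {
		// Overlapping tail: re-copies a few already-copied bytes,
		// which is harmless because the source is left unmodified.
		offs = append(offs, off+n-16)
	}
	return offs
}

func main() {
	fmt.Println(moveChunks(40)) // [0 16 24]: bytes 24-31 are copied twice
	fmt.Println(moveChunks(64)) // [0 16 32 48]
}

Finishing with one overlapping 16-byte chunk trades a few redundant stores for never needing 8/4/2/1-byte tail moves, which is the trade-off the TODO in the case body alludes to.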
@@ -1709,3 +1760,21 @@ func zero16(s *ssagen.State, reg int16, off int64) {
 	p.To.Reg = reg
 	p.To.Offset = off
 }
+
+// move 16 bytes from src+off to dst+off using temporary register tmp.
+func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
+	// MOVUPS off(srcReg), tmpReg
+	// MOVUPS tmpReg, off(dstReg)
+	p := s.Prog(x86.AMOVUPS)
+	p.From.Type = obj.TYPE_MEM
+	p.From.Reg = src
+	p.From.Offset = off
+	p.To.Type = obj.TYPE_REG
+	p.To.Reg = tmp
+	p = s.Prog(x86.AMOVUPS)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = tmp
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = dst
+	p.To.Offset = off
+}
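
Putting the pieces together, here is a worked example of the loop lowering (the 200-byte size is hypothetical; LoweredMoveLoop requires n >= 192): the count register gets 200/64 = 3, each iteration issues four move16 calls and then advances SI and DI by 64, and the remaining 200 % 64 = 8 bytes become a single overlapping move16(8 - 16) relative to the advanced registers, i.e. bytes 184-199 of the original buffers. A sketch (mine) that mirrors the emitted plan:

package main

import "fmt"

// moveLoopPlan mirrors OpAMD64LoweredMoveLoop: it returns the absolute
// offsets of the 16-byte chunks copied for a move of n bytes (n >= 3*64).
func moveLoopPlan(n int64) []int64 {
	const loopSize int64 = 64
	var offs []int64
	count := n / loopSize
	for it := range count {
		base := it * loopSize // the ADDQs advance src/dst each iteration
		for i := range loopSize / 16 {
			offs = append(offs, base+i*16)
		}
	}
	base := count * loopSize
	n %= loopSize
	var off int64
	for n >= 16 {
		offs = append(offs, base+off)
		off += 16
		n -= 16
	}
	if n != 0 {
		offs = append(offs, base+off+n-16) // overlapping tail
	}
	return offs
}

func main() {
	// Three 64-byte iterations cover [0,192); the 8-byte remainder is one
	// overlapping chunk at offset 184.
	fmt.Println(moveLoopPlan(200)) // [0 16 ... 176 184]
}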