Skip to content

Commit 320754f

Browse files
Merge pull request #384 from calc84maniac/popcnt-optimizations
Optimize popcount implementations
2 parents 11f4715 + 26940ae commit 320754f

File tree

9 files changed

+256
-85
lines changed

9 files changed

+256
-85
lines changed

src/crt/bpopcnt.src

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,30 @@
33
section .text
44
public __bpopcnt
55
__bpopcnt:
; Population count of the 8-bit value in A.
; I: A=value to popcount
; O: A=number of set bits in the input
; In this diff hunk, lines marked N- are the removed implementation (which
; saved/restored BC) and lines marked N+ are its replacement (which saves/
; restores HL instead).
6-
push bc
7-
ld c,a ; c=(A|B|C|D|E|F|G|H)
8-
and a,$aa ; a=(A|0|C|0|E|0|G|0)
9-
cpl ; a=(~A|1|~C|1|~E|1|~G|1)
10-
rrca ; a=(1|~A|1|~C|1|~E|1|~G), cf=1
11-
adc a,c ; a=(A+B|C+D|E+F|G+H)
12-
ld b,a ; b=(A+B|C+D|E+F|G+H)
13-
and a,$33 ; a=(00|C+D|00|G+H)
14-
ld c,a ; c=(00|C+D|00|G+H)
15-
xor a,b ; a=(A+B|00|E+F|00)
16-
rrca
17-
rrca ; a=(00|A+B|00|E+F)
18-
add a,c ; a=(A+B+C+D|E+F+G+H)
19-
ld c,a ; c=(A+B+C+D|E+F+G+H)
20-
rrca
21-
rrca
22-
rrca
23-
rrca ; a=(E+F+G+H|A+B+C+D)
24-
add a,c ; a=(A+B+C+D+E+F+G+H|A+B+C+D+E+F+G+H)
25-
and a,$f ; a=A+B+C+D+E+F+G+H
26-
pop bc
6+
push hl
; Place the input in L with H cleared, so each doubling of HL moves one more
; bit of the input from L into H.
7+
ld l, a
8+
ld h, 0
9+
10+
; Multiplying HL by 2 increases H by H+(L>>7).
11+
; Use A to track these values of H to cancel from each iteration.
12+
; On the first iteration H is 0, so skip subtracting it.
13+
add hl, hl
14+
sub a, h
15+
add hl, hl
16+
sub a, h
17+
add hl, hl
18+
sub a, h
19+
add hl, hl
20+
sub a, h
21+
add hl, hl
22+
sub a, h
23+
add hl, hl
24+
sub a, h
25+
add hl, hl
26+
sub a, h
27+
; Note that the value of H after the 8th shift would be equivalent to the
28+
; initial value of A, so instead of shifting and adding H to A at the end,
29+
; simply offset from the initial value of A from the beginning.
; Net effect: with x the input, H after the k-th doubling is x>>(8-k), so the
; seven subtractions leave A = x-(x>>7)-(x>>6)-...-(x>>1) = popcount(x).
30+
31+
pop hl
2732
ret

src/crt/ipopcnt.src

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,17 @@
33
section .text
44
public __ipopcnt
55
__ipopcnt:
; Population count over the three bytes of HL (L, H and HLU).
; I: HL=value to popcount
; O: A=popcount; HL is saved on entry and restored before returning
; In this diff hunk, lines marked N- are the removed implementation (a jump
; into __lpopcnt.hijack1 with B=3) and lines marked N+ are its replacement.
6-
push bc
7-
ld b, 3
8-
jp __lpopcnt.hijack1
6+
push hl
7+
; Calculate 8-popcount(L)-popcount(HLU), and set HLU=H, L=0
8+
call __popcnt_common_init_full
9+
; Subtract output carry and H (which will be added back in)
10+
sbc a, h
11+
; Accumulate popcount(L)-popcount(HLU)+H-L=H-popcount(HLU)
; (L is 0 at this point, so the popcount(L) and L terms vanish; HLU now holds
; the original H byte.)
12+
call __popcnt_common_iter
13+
; Subtract final value from 8, and accumulate output carry
; (cpl followed by adc a,9 computes 8-A+cf modulo 256)
14+
cpl
15+
adc a, 9
16+
pop hl
17+
ret
918

10-
extern __lpopcnt.hijack1
19+
extern __popcnt_common_init_full, __popcnt_common_iter

src/crt/ipopcnt_fast.src

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
assume adl=1
2+
3+
section .text
4+
public __ipopcnt_fast
5+
__ipopcnt_fast:
6+
; Inlined implementation of __ipopcnt
7+
; Destroys: HL
; I: HL=value to popcount (bytes L, H and HLU)
; O: A=result
; No registers are pushed; HL is shifted out entirely by the 16 doublings below.
8+
; Replace L with its complement and start A at ~L-2*H.
ld a, l
9+
cpl
10+
ld l, a
11+
sub a, h
12+
sub a, h
13+
14+
; First group: seven doublings, each followed by subtracting the new H and the
; carry (the bit just shifted out of HLU).
add hl, hl
15+
sbc a, h
16+
add hl, hl
17+
sbc a, h
18+
add hl, hl
19+
sbc a, h
20+
add hl, hl
21+
sbc a, h
22+
add hl, hl
23+
sbc a, h
24+
add hl, hl
25+
sbc a, h
26+
add hl, hl
27+
sbc a, h
28+
29+
; Second group: from the 8th doubling onward L is 0, so each sbc a,l subtracts
; only the carry, i.e. the bit just shifted out of the top of HL.
add hl, hl
30+
sbc a, l
31+
add hl, hl
32+
sbc a, l
33+
add hl, hl
34+
sbc a, l
35+
add hl, hl
36+
sbc a, l
37+
add hl, hl
38+
sbc a, l
39+
add hl, hl
40+
sbc a, l
41+
add hl, hl
42+
sbc a, l
43+
add hl, hl
44+
sbc a, l
45+
46+
; Final doubling produces the last carry; cpl does not change the carry flag,
; so cpl + adc a,9 computes 8-A+cf modulo 256.
add hl, hl
47+
cpl
48+
adc a, 9
49+
ret

src/crt/llpopcnt_fast.src

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
assume adl=1
2+
3+
section .text
4+
public __llpopcnt_fast
5+
__llpopcnt_fast:
; Sums two __lpopcnt_fast results: one over the incoming E and HL, and one
; over the remaining bytes of DE together with B and C.
; I: HL, DE, B, C=value to popcount
; O: A=sum of the two partial counts
6+
; Destroys: HL, DE
7+
; First partial count (of E and the three bytes of HL) is returned in A.
call __lpopcnt_fast
8+
; Move the not-yet-counted bytes into place: HL takes DE, then L=B replaces
; the already-counted E byte, and E=C.
ex de, hl
9+
ld l, b
10+
ld e, c
11+
; Keep the first partial count in D across the second call
; (relies on __lpopcnt_fast leaving D intact; its header lists only HL as destroyed).
ld d, a
12+
call __lpopcnt_fast
13+
add a, d
14+
ret
15+
16+
extern __lpopcnt_fast

src/crt/lpopcnt.src

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,32 +3,22 @@
33
section .text
44
public __lpopcnt
55
__lpopcnt:
; Population count over E and the three bytes of HL (L, H and HLU).
; I: E, HL=value to popcount
; O: A=popcount; HL is saved on entry and restored before returning
; In this diff hunk, lines marked N- are the removed loop-based implementation
; (including the .hijack1 entry point, which also saved/restored BC) and lines
; marked N+ are its replacement.
6-
push bc
7-
ld b, 4
8-
public __lpopcnt.hijack1
9-
.hijack1:
106
push hl
11-
xor a, a
12-
ld c, a
13-
.loop:
14-
add hl, hl
15-
adc a, c
16-
add hl, hl
17-
adc a, c
18-
add hl, hl
19-
adc a, c
20-
add hl, hl
21-
adc a, c
22-
add hl, hl
23-
adc a, c
24-
add hl, hl
25-
adc a, c
26-
add hl, hl
27-
adc a, c
28-
add hl, hl
29-
adc a, c
30-
ld l, e
31-
djnz .loop
7+
; Calculate 8-popcount(L)-popcount(HLU), and set HLU=H
8+
call __popcnt_common_init_full
9+
; Save the current count in H
10+
ld h, a
11+
; Prepare to accumulate 8-popcount(E)-popcount(HLU)
; (L receives the complement of E; HLU holds the original H byte.)
12+
ld a, e
13+
cpl
14+
ld l, a
15+
; Subtract output carry and an additional H, the adjusted call adds H*2-L
16+
sbc a, h
17+
call __popcnt_common_iter_adjusted
18+
; Subtract final value from 16, and accumulate output carry
; (cpl followed by adc a,17 computes 16-A+cf modulo 256)
19+
cpl
20+
adc a, 17
3221
pop hl
33-
pop bc
3422
ret
23+
24+
extern __popcnt_common_init_full, __popcnt_common_iter_adjusted

src/crt/lpopcnt_fast.src

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
assume adl=1
2+
3+
section .text
4+
public __lpopcnt_fast
5+
__lpopcnt_fast:
6+
; Inlined implementation of __lpopcnt
7+
; Destroys: HL
; I: E, HL=value to popcount
; O: A=result
; No registers are pushed; E is only read.
8+
; Replace L with its complement and start A at ~L-2*H.
ld a, l
9+
cpl
10+
ld l, a
11+
sub a, h
12+
sub a, h
13+
14+
; First pass: seven doublings, each followed by subtracting the new H and the
; carry (the bit just shifted out of HLU).
add hl, hl
15+
sbc a, h
16+
add hl, hl
17+
sbc a, h
18+
add hl, hl
19+
sbc a, h
20+
add hl, hl
21+
sbc a, h
22+
add hl, hl
23+
sbc a, h
24+
add hl, hl
25+
sbc a, h
26+
add hl, hl
27+
sbc a, h
28+
29+
; The 8th doubling leaves the original H byte in HLU and its carry pending.
; The running count moves to H, L takes the complement of E, and the pending
; carry is consumed by the sbc (the loads and cpl do not alter the carry flag).
add hl, hl
30+
ld h, a
31+
ld a, e
32+
cpl
33+
ld l, a
34+
sbc a, h
35+
36+
; Second pass: same doubling/subtract chain over the new HL contents.
add hl, hl
37+
sbc a, h
38+
add hl, hl
39+
sbc a, h
40+
add hl, hl
41+
sbc a, h
42+
add hl, hl
43+
sbc a, h
44+
add hl, hl
45+
sbc a, h
46+
add hl, hl
47+
sbc a, h
48+
add hl, hl
49+
sbc a, h
50+
51+
; Final doubling produces the last carry; cpl + adc a,17 computes 16-A+cf
; modulo 256.
add hl, hl
52+
cpl
53+
adc a, 17
54+
ret

src/crt/popcnt_common.src

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
assume adl=1
2+
3+
section .text
4+
public __popcnt_common_init_full
5+
__popcnt_common_init_full:
6+
; Calculates a combined popcount of L and HLU.
7+
; Note the carry flag must be combined with A on return.
8+
; I: L,HLU=values to popcount
9+
; O: A-cf=8-popcount(L)-popcount(HLU), HLU=H, L=0
; Falls through into __popcnt_common_init with A=L.
10+
ld a, l
11+
12+
public __popcnt_common_init
13+
__popcnt_common_init:
14+
; Same as above, but input is in A and HLU.
; (The incoming L is overwritten below.)
15+
16+
; Input value in A is inverted into L since HLU bits will be subtracted,
17+
; calculating 8-popcount(A) to match the sign of accumulated bits.
18+
cpl
19+
ld l, a
20+
; Subtract an additional factor of H to cancel it out,
21+
; since the code below is subtracting it by a factor of 0xFF.
22+
sub a, h
; Falls through into __popcnt_common_iter.
23+
24+
public __popcnt_common_iter
25+
__popcnt_common_iter:
26+
; Calculates a popcount accumulation.
27+
; Note the carry flag must be combined with A on return.
28+
; I: HLU,L=values to popcount, A=current accumulation
29+
; O: A-cf=A+popcount(L)-popcount(HLU)+H-L, HLU=H, L=0
30+
sub a, h
; Falls through into __popcnt_common_iter_adjusted.
31+
32+
public __popcnt_common_iter_adjusted
33+
__popcnt_common_iter_adjusted:
34+
; Same as above, but an additional H is accumulated into A.
35+
; This is the same popcount technique as described in __bpopcnt,
36+
; but bits shifted out from HLU are additionally subtracted.
; Each sbc a,h subtracts the H produced by the preceding doubling plus the
; carry, i.e. the bit that doubling shifted out of the top of HL.
37+
add hl, hl
38+
sbc a, h
39+
add hl, hl
40+
sbc a, h
41+
add hl, hl
42+
sbc a, h
43+
add hl, hl
44+
sbc a, h
45+
add hl, hl
46+
sbc a, h
47+
add hl, hl
48+
sbc a, h
49+
add hl, hl
50+
sbc a, h
51+
; Sets the carry flag to the final bit from HLU, which the caller handles.
; After this 8th doubling the original H byte is in HLU and L is 0.
52+
add hl, hl
53+
ret

src/crt/spopcnt.src

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,18 @@
33
section .text
44
public __spopcnt
55
__spopcnt:
; Population count of the L and H bytes of HL.
; I: HL=value to popcount (HLU is not counted)
; O: A=popcount; HL is restored before returning
; In this diff hunk, lines marked N- are the removed implementation (two calls
; to __bpopcnt) and lines marked N+ are its replacement.
6+
; Set HLU=H and H=L, while saving HL on the stack
67
push hl
7-
ld a,l
8-
call __bpopcnt
9-
ld l,a
10-
ld a,h
11-
call __bpopcnt
12-
add a,l
8+
; dec sp moves the 3-byte window exchanged by ex (sp),hl down by one byte, so
; the L and H bytes just pushed are read back one position higher in HL.
; The byte loaded into L is not meaningful; __popcnt_common_init overwrites L.
dec sp
9+
ex (sp), hl
10+
; Calculate 8-popcount(H)-popcount(HLU)
; (here H and HLU hold the original L and H bytes respectively)
11+
ld a, h
12+
call __popcnt_common_init
13+
; Subtract final value from 8, and accumulate output carry
; (cpl followed by adc a,9 computes 8-A+cf modulo 256)
14+
cpl
15+
adc a, 9
; pop hl reloads the original HL stored by ex (sp),hl; inc sp then undoes the
; earlier dec sp.
1316
pop hl
17+
inc sp
1418
ret
1519

16-
extern __bpopcnt
20+
extern __popcnt_common_init

src/crt/spopcnt_fast.src

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
section .text
44
public __spopcnt_fast
55
__spopcnt_fast:
6-
push hl
7-
xor a, a
6+
; Inlined implementation of __spopcnt
7+
; Destroys: HL
88
add hl, hl
99
add hl, hl
1010
add hl, hl
@@ -13,37 +13,28 @@ __spopcnt_fast:
1313
add hl, hl
1414
add hl, hl
1515
add hl, hl
16+
17+
ld a, h
18+
ld h, l
19+
cpl
20+
ld l, a
21+
1622
add hl, hl
17-
adc a, l
18-
add hl, hl
19-
adc a, l
20-
add hl, hl
21-
adc a, l
22-
add hl, hl
23-
adc a, l
24-
add hl, hl
25-
adc a, l
26-
add hl, hl
27-
adc a, l
28-
add hl, hl
29-
adc a, l
30-
add hl, hl
31-
adc a, l
32-
add hl, hl
33-
adc a, l
23+
sbc a, h
3424
add hl, hl
35-
adc a, l
25+
sbc a, h
3626
add hl, hl
37-
adc a, l
27+
sbc a, h
3828
add hl, hl
39-
adc a, l
29+
sbc a, h
4030
add hl, hl
41-
adc a, l
31+
sbc a, h
4232
add hl, hl
43-
adc a, l
33+
sbc a, h
4434
add hl, hl
45-
adc a, l
35+
sbc a, h
36+
4637
add hl, hl
47-
adc a, l
48-
pop hl
38+
cpl
39+
adc a, 9
4940
ret

0 commit comments

Comments
 (0)