Skip to content

Commit 61ea6b4

Browse files
committed
Add COUNT_FLIP offset in SIMD
1 parent 2367fd6 commit 61ea6b4

File tree

4 files changed

+130
-112
lines changed

4 files changed

+130
-112
lines changed

src/bit.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,13 @@ typedef union {
186186
#endif
187187
} V8DI;
188188

189+
#ifdef hasSSE2
190+
/* Union overlaying one __m128i with an array of four unsigned ints, so the
   same storage can be used as a whole vector (v4) or read one element at a
   time (ui[]). Only defined when hasSSE2 is set.
   NOTE(review): assumes unsigned int is 32 bits so that ui[4] exactly covers
   the 128-bit v4 - confirm for all supported targets. */
typedef union V4SI {
191+
unsigned int ui[4];
192+
__m128i v4;
193+
} V4SI;
194+
#endif
195+
189196
/* Define function attributes directive when available */
190197

191198
#if (defined(_MSC_VER) || defined(__clang__)) && defined(hasSSE2)

src/count_last_flip_neon.c

Lines changed: 79 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -494,26 +494,6 @@ enum {
494494
LF32 = 3063
495495
};
496496

497-
const unsigned short cf_ofs_d[2][64] = {{
498-
0, 0, RF32, RF43, RF54, RF65, CF86, CF87, // RF76 -> CF86
499-
0, 0, RF32, RF43, RF54, RF65, CF86, LF76, // RF31 -> 0, RF42..RF75 -> RF32..RF65
500-
RF30, RF41, RF52, RF63, RF74, CF85, LF75, LF65,
501-
RF40, RF51, RF62, RF73, CF84, LF74, LF64, LF54,
502-
RF50, RF61, RF72, CF83, LF73, LF63, LF53, LF43,
503-
RF60, RF71, CF82, LF72, LF62, LF52, LF42, LF32,
504-
RF70, CF81, LF60, LF50, LF40, LF30, 0, 0, // LF71..LF41 -> LF60..LF30, LF31 -> 0
505-
CF80, CF81, LF60, LF50, LF40, LF30, 0, 0 // LF70 -> CF81
506-
}, {
507-
CF80, RF70, RF60, RF50, RF40, RF30, 0, 0,
508-
CF81, CF81, RF71, RF61, RF51, RF41, 0, 0, // LF70 -> CF81, RF31 -> 0
509-
LF60, LF60, CF82, RF72, RF62, RF52, RF32, RF32, // LF71 -> LF60, RF42 -> RF32
510-
LF50, LF50, LF72, CF83, RF73, RF63, RF43, RF43, // LF61 -> LF50, RF53 -> RF43
511-
LF40, LF40, LF62, LF73, CF84, RF74, RF54, RF54, // LF51 -> LF40, RF64 -> RF54
512-
LF30, LF30, LF52, LF63, LF74, CF85, RF65, RF65, // LF41 -> LF30, RF75 -> RF65
513-
0, 0, LF42, LF53, LF64, LF75, CF86, CF86, // LF31 -> 0, RF76 -> CF86
514-
0, 0, LF32, LF43, LF54, LF65, LF76, CF87
515-
}};
516-
517497
#ifdef HAS_CPU_64
518498
/* bit masks for diagonal lines (interleaved) */
519499
const uint64x2_t mask_dvhd[64][2] = {
@@ -676,23 +656,22 @@ int last_flip(int pos, unsigned long long P)
676656
const uint64x2_t dmask = { 0x0808040402020101, 0x8080404020201010 };
677657

678658
PP = vreinterpretq_u64_u8(vzip1q_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(PP)));
679-
II = vandq_u64(PP, mask_dvhd[pos][0]); // 2 dirs interleaved
680-
t = vaddvq_u16(vreinterpretq_u16_u64(II));
659+
t = vaddvq_u16(vreinterpretq_u16_u64(vandq_u64(PP, mask_dvhd[pos][0]))); // 2 dirs interleaved
681660
n_flips = (uint8_t) COUNT_FLIP_X[t & 0xFF];
682-
n_flips += COUNT_FLIP_X[t >> 8];
661+
n_flips += (uint8_t) COUNT_FLIP_X[t >> 8];
683662
II = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask);
684663
t = vaddvq_u16(vreinterpretq_u16_u64(II));
685-
n_flips += COUNT_FLIP_Y[t & 0xFF];
664+
n_flips += (uint8_t) COUNT_FLIP_Y[t & 0xFF];
686665
n_flips += (uint8_t) COUNT_FLIP_Y[t >> 8];
687666

688667
#else // Neon kindergarten
689-
const uint64x2_t dmask = { 0x1020408001020408, 0x1020408001020408 };
668+
const uint32x4_t dmask = { 0x01020408, 0x10204080, 0x01020408, 0x10204080 };
690669

691670
II = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_u64(vandq_u64(PP, mask_dvhd[pos][0])))));
692671
n_flips = (uint8_t) COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(II), 0)];
693672
n_flips += (uint8_t) COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(II), 2)];
694673
II = vreinterpretq_u64_s8(vnegq_s8(vreinterpretq_s8_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1])))));
695-
II = vpaddlq_u32(vmulq_u32(vreinterpretq_u32_u64(dmask), vreinterpretq_u32_u64(II)));
674+
II = vpaddlq_u32(vmulq_u32(dmask, vreinterpretq_u32_u64(II)));
696675
n_flips += (uint8_t) COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 3)];
697676
n_flips += (uint8_t) COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 11)];
698677
#endif
@@ -715,12 +694,13 @@ int last_flip(int pos, unsigned long long P)
715694
int board_score_neon_1(uint64x1_t P, int alpha, int pos)
716695
{
717696
int score = 2 * vaddv_u8(vcnt_u8(vreinterpret_u8_u64(P))) - SCORE_MAX + 2; // = (bit_count(P) + 1) - (SCORE_MAX - 1 - bit_count(P))
718-
uint_fast8_t n_flips:
697+
unsigned int t0, t1;
698+
uint_fast8_t n_flips;
719699
uint_fast16_t op_flip, m;
720700
const uint16_t *COUNT_FLIP_X = COUNT_FLIP + (pos & 7) * 256;
721701
const uint16_t *COUNT_FLIP_Y = COUNT_FLIP + (pos >> 3) * 256;
722702
uint64x2_t PP = vdupq_lane_u64(P, 0);
723-
uint64x2_t I0, I1;
703+
uint64x2_t II;
724704
static const uint16_t o_mask[64] = {
725705
0xff01, 0x7f03, 0x3f07, 0x1f0f, 0x0f1f, 0x073f, 0x037f, 0x01ff,
726706
0xfe03, 0xff07, 0x7f0f, 0x3f1f, 0x1f3f, 0x0f7f, 0x07ff, 0x03fe,
@@ -731,32 +711,28 @@ int board_score_neon_1(uint64x1_t P, int alpha, int pos)
731711
0xc07f, 0xe0ff, 0xf0fe, 0xf8fc, 0xfcf8, 0xfef0, 0xffe0, 0x7fc0,
732712
0x80ff, 0xc0fe, 0xe0fc, 0xf0f8, 0xf8f0, 0xfce0, 0xfec0, 0xff80
733713
};
734-
735-
// n_flips = last_flip(pos, P);
736714
#ifdef HAS_CPU_64 // vaddvq
737-
unsigned int t0, t1;
738715
const uint64x2_t dmask = { 0x0808040402020101, 0x8080404020201010 };
739716

740717
PP = vreinterpretq_u64_u8(vzip1q_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(PP)));
741-
I0 = vandq_u64(PP, mask_dvhd[pos][0]); // 2 dirs interleaved
742-
t0 = vaddvq_u16(vreinterpretq_u16_u64(I0));
718+
t0 = vaddvq_u16(vreinterpretq_u16_u64(vandq_u64(PP, mask_dvhd[pos][0]))); // 2 dirs interleaved
743719
n_flips = (uint8_t) COUNT_FLIP_X[t0 & 0xFF];
744-
op_flip = (uint8_t) COUNT_FLIP_X[t0 >> 8];
745-
I1 = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask);
746-
t1 = vaddvq_u16(vreinterpretq_u16_u64(I1));
747-
op_flip += (uint8_t) COUNT_FLIP_Y[t1 & 0xFF];
748-
n_flips += (uint8_t) COUNT_FLIP_Y[t1 >> 8];
720+
op_flip = COUNT_FLIP_X[t0 >> 8];
721+
II = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask);
722+
t1 = vaddvq_u16(vreinterpretq_u16_u64(II));
723+
op_flip += COUNT_FLIP_Y[t1 & 0xFF];
724+
n_flips += (uint8_t) COUNT_FLIP_Y[t1 >>= 8];
749725

750726
#else // Neon kindergarten
751-
const uint64x2_t dmask = { 0x1020408001020408, 0x1020408001020408 };
752-
753-
I0 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_u64(vandq_u64(PP, mask_dvhd[pos][0])))));
754-
n_flips = (uint8_t) COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(I0), 0)];
755-
op_flip = (uint8_t) COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(I0), 2)];
756-
I1 = vreinterpretq_u64_s8(vnegq_s8(vreinterpretq_s8_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1])))));
757-
I1 = vpaddlq_u32(vmulq_u32(vreinterpretq_u32_u64(dmask), vreinterpretq_u32_u64(I1)));
758-
op_flip += (uint8_t) COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(I1), 3)];
759-
n_flips += (uint8_t) COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(I1), 11)];
727+
const uint32x4_t dmask = { 0x01020408, 0x10204080, 0x01020408, 0x10204080 };
728+
729+
II = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_u64(vandq_u64(PP, mask_dvhd[pos][0])))));
730+
n_flips = (uint8_t) COUNT_FLIP_X[t0 = vgetq_lane_u32(vreinterpretq_u32_u64(II), 0)];
731+
op_flip = COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(II), 2)];
732+
II = vreinterpretq_u64_s8(vnegq_s8(vreinterpretq_s8_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1])))));
733+
II = vpaddlq_u32(vmulq_u32(dmask, vreinterpretq_u32_u64(II)));
734+
op_flip += COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 3)];
735+
n_flips += (uint8_t) COUNT_FLIP_Y[t1 = vgetq_lane_u8(vreinterpretq_u8_u64(II), 11)];
760736
#endif
761737
n_flips += (uint8_t) op_flip;
762738
score += n_flips;
@@ -770,13 +746,9 @@ int board_score_neon_1(uint64x1_t P, int alpha, int pos)
770746
// n_flips = last_flip(pos, O);
771747
m = o_mask[pos]; // valid diagonal bits
772748
n_flips = op_flip >> 8;
773-
#ifdef HAS_CPU_64
774749
n_flips += (uint8_t) COUNT_FLIP_X[(t0 ^ m) & 0xFF];
775-
n_flips += (uint8_t) COUNT_FLIP_Y[(t1 ^ m) >> 8];
776-
#else
777-
n_flips = (uint8_t) COUNT_FLIP_X[vgetq_lane_u32(vreinterpretq_u32_u64(I0), 0) ^ (m & 0xFF)];
778-
n_flips += (uint8_t) COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(I1), 11) ^ (m >> 8)];
779-
#endif
750+
n_flips += (uint8_t) COUNT_FLIP_Y[t1 ^ (m >> 8)];
751+
780752
if (n_flips != 0)
781753
score = score2 - n_flips;
782754
}
@@ -786,39 +758,76 @@ int board_score_neon_1(uint64x1_t P, int alpha, int pos)
786758
}
787759

788760
#else // simul COUNT_FLIP TLU
761+
762+
#ifdef HAS_CPU_64
763+
/* Per-square pairs of base offsets into COUNT_FLIP, indexed [pos][k]
   (HAS_CPU_64 build only). board_score_neon_1 adds [pos][0] to (t0 & 0xFF)
   and [pos][1] to (t1 >> 8) to form two of its COUNT_FLIP indices.
   Entries are the named CFxx/RFxx/LFxx offset constants, or 0.
   NOTE(review): presumably the two entries belong to the two diagonal lines
   through pos - confirm against mask_dvhd. */
const unsigned short cf_ofs_d[64][2] = {
764+
{ 0, CF80 }, { 0, RF70 }, { CF82, RF60 }, { CF83, RF50 }, { CF84, RF40 }, { CF85, RF30 }, { CF86, 0 }, { CF87, 0 },
765+
{ 0, CF81 }, { 0, CF81 }, { CF82, RF71 }, { CF83, RF61 }, { CF84, RF51 }, { CF85, RF41 }, { CF86, 0 }, { LF76, 0 },
766+
{ RF30, CF82 }, { RF41, CF82 }, { RF52, CF82 }, { RF63, RF72 }, { RF74, RF62 }, { CF85, RF52 }, { LF75, CF82 }, { LF65, CF82 },
767+
{ RF40, CF83 }, { RF51, CF83 }, { RF62, LF72 }, { RF73, CF83 }, { CF84, RF73 }, { LF74, RF63 }, { LF64, CF83 }, { LF54, CF83 },
768+
{ RF50, CF84 }, { RF61, CF84 }, { RF72, LF62 }, { CF83, LF73 }, { LF73, CF84 }, { LF63, RF74 }, { LF53, CF84 }, { LF43, CF84 },
769+
{ RF60, CF85 }, { RF71, CF85 }, { CF82, LF52 }, { LF72, LF63 }, { LF62, LF74 }, { LF52, CF85 }, { LF42, CF85 }, { LF32, CF85 },
770+
{ RF70, 0 }, { CF81, 0 }, { CF82, LF42 }, { CF83, LF53 }, { CF84, LF64 }, { CF85, LF75 }, { 0, CF86 }, { 0, CF86 },
771+
{ CF80, 0 }, { CF81, 0 }, { CF82, LF32 }, { CF83, LF43 }, { CF84, LF54 }, { CF85, LF65 }, { 0, LF76 }, { 0, CF87 }
772+
};
773+
774+
#else
775+
/* Per-square vector of four COUNT_FLIP base offsets (build without
   HAS_CPU_64). board_score_neon_1 loads cf_ofs[pos] once and adds it with
   vaddq_u32 to each of its two index vectors: lanes 0 and 2 of the first sum
   and lanes 1 and 3 of the second sum are then used directly as COUNT_FLIP
   indices.
   NOTE(review): lanes 1 and 2 look like the row (pos >> 3) and column
   (pos & 7) offsets CF8n, and lanes 0 and 3 like the two diagonal offsets -
   confirm against the CFxx definitions. */
const uint32x4_t cf_ofs[64] = {
776+
{ 0, CF80, CF80, CF80 }, { 0, CF80, CF81, RF70 }, { CF82, CF80, CF82, RF60 }, { CF83, CF80, CF83, RF50 },
777+
{ CF84, CF80, CF84, RF40 }, { CF85, CF80, CF85, RF30 }, { CF86, CF80, CF86, 0 }, { CF87, CF80, CF87, 0 },
778+
{ 0, CF81, CF80, CF81 }, { 0, CF81, CF81, CF81 }, { CF82, CF81, CF82, RF71 }, { CF83, CF81, CF83, RF61 },
779+
{ CF84, CF81, CF84, RF51 }, { CF85, CF81, CF85, RF41 }, { CF86, CF81, CF86, 0 }, { LF76, CF81, CF87, 0 },
780+
{ RF30, CF82, CF80, CF82 }, { RF41, CF82, CF81, CF82 }, { RF52, CF82, CF82, CF82 }, { RF63, CF82, CF83, RF72 },
781+
{ RF74, CF82, CF84, RF62 }, { CF85, CF82, CF85, RF52 }, { LF75, CF82, CF86, CF82 }, { LF65, CF82, CF87, CF82 },
782+
{ RF40, CF83, CF80, CF83 }, { RF51, CF83, CF81, CF83 }, { RF62, CF83, CF82, LF72 }, { RF73, CF83, CF83, CF83 },
783+
{ CF84, CF83, CF84, RF73 }, { LF74, CF83, CF85, RF63 }, { LF64, CF83, CF86, CF83 }, { LF54, CF83, CF87, CF83 },
784+
{ RF50, CF84, CF80, CF84 }, { RF61, CF84, CF81, CF84 }, { RF72, CF84, CF82, LF62 }, { CF83, CF84, CF83, LF73 },
785+
{ LF73, CF84, CF84, CF84 }, { LF63, CF84, CF85, RF74 }, { LF53, CF84, CF86, CF84 }, { LF43, CF84, CF87, CF84 },
786+
{ RF60, CF85, CF80, CF85 }, { RF71, CF85, CF81, CF85 }, { CF82, CF85, CF82, LF52 }, { LF72, CF85, CF83, LF63 },
787+
{ LF62, CF85, CF84, LF74 }, { LF52, CF85, CF85, CF85 }, { LF42, CF85, CF86, CF85 }, { LF32, CF85, CF87, CF85 },
788+
{ RF70, CF86, CF80, 0 }, { CF81, CF86, CF81, 0 }, { CF82, CF86, CF82, LF42 }, { CF83, CF86, CF83, LF53 },
789+
{ CF84, CF86, CF84, LF64 }, { CF85, CF86, CF85, LF75 }, { 0, CF86, CF86, CF86 }, { 0, CF86, CF87, CF86 },
790+
{ CF80, CF87, CF80, 0 }, { CF81, CF87, CF81, 0 }, { CF82, CF87, CF82, LF32 }, { CF83, CF87, CF83, LF43 },
791+
{ CF84, CF87, CF84, LF54 }, { CF85, CF87, CF85, LF65 }, { 0, CF87, CF86, LF76 }, { 0, CF87, CF87, CF87 }
792+
};
793+
#endif
794+
789795
int board_score_neon_1(uint64x1_t P, int alpha, int pos)
790796
{
791797
uint_fast16_t op_flip;
792798
int p_flips, o_flips;
793799
int score = 2 * vaddv_u8(vcnt_u8(vreinterpret_u8_u64(P))) - 64 + 2; // = (bit_count(P) + 1) - (SCORE_MAX - 1 - bit_count(P))
794800
uint64x2_t PP = vdupq_lane_u64(P, 0);
795-
uint64x2_t I0, I1;
796-
797801
#ifdef HAS_CPU_64 // vaddvq
798802
unsigned int t0, t1;
799803
const uint64x2_t dmask = { 0x0808040402020101, 0x8080404020201010 };
804+
uint64x2_t II;
800805

801806
PP = vreinterpretq_u64_u8(vzip1q_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(PP)));
802-
I0 = vandq_u64(PP, mask_dvhd[pos][0]); // 2 dirs interleaved
803-
t0 = vaddvq_u16(vreinterpretq_u16_u64(I0));
804-
op_flip = COUNT_FLIP[cf_ofs_d[0][pos] + (t0 & 0xFF)];
807+
t0 = vaddvq_u16(vreinterpretq_u16_u64(vandq_u64(PP, mask_dvhd[pos][0]))); // 2 dirs interleaved
808+
op_flip = COUNT_FLIP[cf_ofs_d[pos][0] + (t0 & 0xFF)];
805809
op_flip += COUNT_FLIP[((pos & 7) * 256) + (t0 >> 8)];
806-
I1 = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask);
807-
t1 = vaddvq_u16(vreinterpretq_u16_u64(I1));
810+
II = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask);
811+
t1 = vaddvq_u16(vreinterpretq_u16_u64(II));
808812
op_flip += COUNT_FLIP[((pos & 0x38) << 5) + (t1 & 0xFF)];
809-
op_flip += COUNT_FLIP[cf_ofs_d[1][pos] + (t1 >> 8)];
813+
op_flip += COUNT_FLIP[cf_ofs_d[pos][1] + (t1 >> 8)];
810814

811815
#else // Neon kindergarten
812-
const uint64x2_t dmask = { 0x1020408001020408, 0x1020408001020408 };
813-
814-
I0 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_u64(vandq_u64(PP, mask_dvhd[pos][0])))));
815-
op_flip = COUNT_FLIP[cf_ofs_d[0][pos] + vgetq_lane_u32(vreinterpretq_u32_u64(I0), 0)];
816-
op_flip += COUNT_FLIP[((pos & 7) * 256) + vgetq_lane_u32(vreinterpretq_u32_u64(I0), 2)];
817-
I1 = vreinterpretq_u64_s8(vnegq_s8(vreinterpretq_s8_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1])))));
818-
I1 = vpaddlq_u32(vmulq_u32(vreinterpretq_u32_u64(dmask), vreinterpretq_u32_u64(I1)));
819-
op_flip += COUNT_FLIP[((pos & 0x38) << 5) + vgetq_lane_u8(vreinterpretq_u8_u64(I1), 3)];
820-
op_flip += COUNT_FLIP[cf_ofs_d[1][pos] + vgetq_lane_u8(vreinterpretq_u8_u64(I1), 11)];
816+
const uint32x4_t dmask = { 0x01020408, 0x10204080, 0x01020408, 0x10204080 };
817+
uint32x4_t cf_ofs_pos = cf_ofs[pos];
818+
uint32x4_t II;
819+
820+
II = vreinterpretq_u32_u64(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_u64(vandq_u64(PP, mask_dvhd[pos][0]))))));
821+
II = vaddq_u32(II, cf_ofs_pos);
822+
op_flip = COUNT_FLIP[vgetq_lane_u32(II, 0)];
823+
op_flip += COUNT_FLIP[vgetq_lane_u32(II, 2)];
824+
II = vreinterpretq_u32_s8(vnegq_s8(vreinterpretq_s8_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1])))));
825+
II = vreinterpretq_u32_u64(vshlq_n_u64(vpaddlq_u32(vmulq_u32(dmask, II)), 8)); // 000000dd******00 000000vv******00
826+
II = vaddq_u32(II, cf_ofs_pos);
827+
op_flip += COUNT_FLIP[vgetq_lane_u32(II, 1)];
828+
op_flip += COUNT_FLIP[vgetq_lane_u32(II, 3)];
821829
#endif
830+
822831
p_flips = op_flip & 0xFF;
823832
if (p_flips)
824833
return score + p_flips;

src/count_last_flip_sse.c

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -495,37 +495,6 @@ enum {
495495
LF32 = 3063
496496
};
497497

498-
const unsigned short cf_ofs_d[2][64] = {{
499-
#ifdef AVXLASTFLIP
500-
0, 0, RF30, RF40, RF50, RF60, RF70, CF80,
501-
0, 0, RF41, RF51, RF61, RF71, CF81, CF81,
502-
CF82, CF82, RF52, RF62, RF72, CF82, CF82, CF82,
503-
CF83, CF83, RF63, RF73, CF83, LF72, CF83, CF83,
504-
CF84, CF84, RF74, CF84, LF73, LF62, CF84, CF84,
505-
CF85, CF85, CF85, LF74, LF63, LF52, CF85, CF85,
506-
CF86, CF86, LF75, LF64, LF53, LF42, 0, 0,
507-
CF87, LF76, LF65, LF54, LF43, LF32, 0, 0
508-
#else
509-
0, 0, CF82, CF83, CF84, CF85, CF86, CF87,
510-
0, 0, CF82, CF83, CF84, CF85, CF86, LF76,
511-
RF30, RF41, RF52, RF63, RF74, CF85, LF75, LF65,
512-
RF40, RF51, RF62, RF73, CF84, LF74, LF64, LF54,
513-
RF50, RF61, RF72, CF83, LF73, LF63, LF53, LF43,
514-
RF60, RF71, CF82, LF72, LF62, LF52, LF42, LF32,
515-
RF70, CF81, CF82, CF83, CF84, CF85, 0, 0,
516-
CF80, CF81, CF82, CF83, CF84, CF85, 0, 0
517-
#endif
518-
}, {
519-
CF80, RF70, RF60, RF50, RF40, RF30, 0, 0,
520-
CF81, CF81, RF71, RF61, RF51, RF41, 0, 0,
521-
CF82, CF82, CF82, RF72, RF62, RF52, CF82, CF82,
522-
CF83, CF83, LF72, CF83, RF73, RF63, CF83, CF83,
523-
CF84, CF84, LF62, LF73, CF84, RF74, CF84, CF84,
524-
CF85, CF85, LF52, LF63, LF74, CF85, CF85, CF85,
525-
0, 0, LF42, LF53, LF64, LF75, CF86, CF86,
526-
0, 0, LF32, LF43, LF54, LF65, LF76, CF87
527-
}};
528-
529498
/* bit masks for diagonal lines */
530499
const V4DI mask_vdhd[64] = {
531500
{{ 0x0000000000000000, 0x00000000000000ff, 0x8040201008040201, 0x0101010101010101 }},
@@ -789,6 +758,40 @@ inline int vectorcall board_score_sse_1(__m128i OP, const int alpha, const int p
789758
}
790759

791760
#else
761+
762+
#ifdef AVXLASTFLIP
763+
/* Per-square pairs of base offsets into COUNT_FLIP, indexed [pos][k]
   (AVXLASTFLIP build only). board_score_sse_1 adds [pos][0] to (t & 0xFF)
   and [pos][1] to ((t >> 16) & 0xFF), where t is the TEST_EPI8_MASK32 result
   for mask_vdhd[pos].
   Entries are the named CFxx/RFxx/LFxx offset constants, or 0.
   NOTE(review): presumably one entry per diagonal direction through pos -
   confirm against the lane order of mask_vdhd. */
const unsigned short cf_ofs_d[64][2] = {
764+
{ 0, CF80 }, { 0, RF70 }, { RF30, RF60 }, { RF40, RF50 }, { RF50, RF40 }, { RF60, RF30 }, { RF70, 0 }, { CF80, 0 },
765+
{ 0, CF81 }, { 0, CF81 }, { RF41, RF71 }, { RF51, RF61 }, { RF61, RF51 }, { RF71, RF41 }, { CF81, 0 }, { CF81, 0 },
766+
{ CF82, CF82 }, { CF82, CF82 }, { RF52, CF82 }, { RF62, RF72 }, { RF72, RF62 }, { CF82, RF52 }, { CF82, CF82 }, { CF82, CF82 },
767+
{ CF83, CF83 }, { CF83, CF83 }, { RF63, LF72 }, { RF73, CF83 }, { CF83, RF73 }, { LF72, RF63 }, { CF83, CF83 }, { CF83, CF83 },
768+
{ CF84, CF84 }, { CF84, CF84 }, { RF74, LF62 }, { CF84, LF73 }, { LF73, CF84 }, { LF62, RF74 }, { CF84, CF84 }, { CF84, CF84 },
769+
{ CF85, CF85 }, { CF85, CF85 }, { CF85, LF52 }, { LF74, LF63 }, { LF63, LF74 }, { LF52, CF85 }, { CF85, CF85 }, { CF85, CF85 },
770+
{ CF86, 0 }, { CF86, 0 }, { LF75, LF42 }, { LF64, LF53 }, { LF53, LF64 }, { LF42, LF75 }, { 0, CF86 }, { 0, CF86 },
771+
{ CF87, 0 }, { LF76, 0 }, { LF65, LF32 }, { LF54, LF43 }, { LF43, LF54 }, { LF32, LF65 }, { 0, LF76 }, { 0, CF87 }
772+
};
773+
774+
#else
775+
/* Per-square vector of four COUNT_FLIP base offsets (build without
   AVXLASTFLIP). board_score_sse_1 adds cf_ofs[pos].v4 to the _mm_sad_epu8
   result with _mm_add_epi32, then reads element 0 (_mm_cvtsi128_si32) and the
   low 16 bits of element 2 (_mm_extract_epi16 index 4) as COUNT_FLIP indices.
   Element 1 is read as a scalar through ui[1] and added to (t & 0xFF).
   Element 3 is referenced only by a commented-out lookup in that function;
   the live code uses ((pos & 0x38) << 5) there instead.
   NOTE(review): elements 2 and 3 look like the column (pos & 7) and row
   (pos >> 3) offsets CF8n - confirm against the CFxx definitions. */
const V4SI cf_ofs[64] = {
776+
{ 0, CF80, CF80, CF80 }, { 0, RF70, CF81, CF80 }, { CF82, RF60, CF82, CF80 }, { CF83, RF50, CF83, CF80 },
777+
{ CF84, RF40, CF84, CF80 }, { CF85, RF30, CF85, CF80 }, { CF86, 0, CF86, CF80 }, { CF87, 0, CF87, CF80 },
778+
{ 0, CF81, CF80, CF81 }, { 0, CF81, CF81, CF81 }, { CF82, RF71, CF82, CF81 }, { CF83, RF61, CF83, CF81 },
779+
{ CF84, RF51, CF84, CF81 }, { CF85, RF41, CF85, CF81 }, { CF86, 0, CF86, CF81 }, { LF76, 0, CF87, CF81 },
780+
{ RF30, CF82, CF80, CF82 }, { RF41, CF82, CF81, CF82 }, { RF52, CF82, CF82, CF82 }, { RF63, RF72, CF83, CF82 },
781+
{ RF74, RF62, CF84, CF82 }, { CF85, RF52, CF85, CF82 }, { LF75, CF82, CF86, CF82 }, { LF65, CF82, CF87, CF82 },
782+
{ RF40, CF83, CF80, CF83 }, { RF51, CF83, CF81, CF83 }, { RF62, LF72, CF82, CF83 }, { RF73, CF83, CF83, CF83 },
783+
{ CF84, RF73, CF84, CF83 }, { LF74, RF63, CF85, CF83 }, { LF64, CF83, CF86, CF83 }, { LF54, CF83, CF87, CF83 },
784+
{ RF50, CF84, CF80, CF84 }, { RF61, CF84, CF81, CF84 }, { RF72, LF62, CF82, CF84 }, { CF83, LF73, CF83, CF84 },
785+
{ LF73, CF84, CF84, CF84 }, { LF63, RF74, CF85, CF84 }, { LF53, CF84, CF86, CF84 }, { LF43, CF84, CF87, CF84 },
786+
{ RF60, CF85, CF80, CF85 }, { RF71, CF85, CF81, CF85 }, { CF82, LF52, CF82, CF85 }, { LF72, LF63, CF83, CF85 },
787+
{ LF62, LF74, CF84, CF85 }, { LF52, CF85, CF85, CF85 }, { LF42, CF85, CF86, CF85 }, { LF32, CF85, CF87, CF85 },
788+
{ RF70, 0, CF80, CF86 }, { CF81, 0, CF81, CF86 }, { CF82, LF42, CF82, CF86 }, { CF83, LF53, CF83, CF86 },
789+
{ CF84, LF64, CF84, CF86 }, { CF85, LF75, CF85, CF86 }, { 0, CF86, CF86, CF86 }, { 0, CF86, CF87, CF86 },
790+
{ CF80, 0, CF80, CF87 }, { CF81, 0, CF81, CF87 }, { CF82, LF32, CF82, CF87 }, { CF83, LF43, CF83, CF87 },
791+
{ CF84, LF54, CF84, CF87 }, { CF85, LF65, CF85, CF87 }, { 0, LF76, CF86, CF87 }, { 0, CF87, CF87, CF87 }
792+
};
793+
#endif
794+
792795
// COUNT_LAST_FLIP_SSE - reasonably fast on all platforms
793796
inline int vectorcall board_score_sse_1(__m128i OP, const int alpha, const int pos)
794797
{
@@ -800,21 +803,24 @@ inline int vectorcall board_score_sse_1(__m128i OP, const int alpha, const int p
800803
int score = 2 * bit_count(P) - 64 + 2; // = (bit_count(P) + 1) - (SCORE_MAX - 1 - bit_count(P))
801804

802805
t = TEST_EPI8_MASK32(_mm256_broadcastq_epi64(OP), mask_vdhd[pos].v4);
803-
op_flip = COUNT_FLIP[cf_ofs_d[0][pos] + (t & 0xFF)];
806+
op_flip = COUNT_FLIP[cf_ofs_d[pos][0] + (t & 0xFF)];
804807
op_flip += COUNT_FLIP[(pos & 7) * 256 + ((P >> (pos & 0x38)) & 0xFF)];
805-
t >>= 16;
808+
op_flip += COUNT_FLIP[cf_ofs_d[pos][1] + ((t >> 16) & 0xFF)];
809+
op_flip += COUNT_FLIP[((pos & 0x38) << 5) + (t >> 24)];
806810

807811
#else
808812
int score = 2 * bit_count_si64(OP) - 64 + 2; // = (bit_count(P) + 1) - (SCORE_MAX - 1 - bit_count(P))
809813
__m128i P2 = _mm_unpacklo_epi64(OP, OP);
810814
__m128i II = _mm_sad_epu8(_mm_and_si128(P2, mask_vdhd[pos].v2[0]), _mm_setzero_si128());
811815

812-
op_flip = COUNT_FLIP[cf_ofs_d[0][pos] + _mm_cvtsi128_si32(II)];
813-
op_flip += COUNT_FLIP[(pos & 7) * 256 + _mm_extract_epi16(II, 4)];
816+
II = _mm_add_epi32(II, cf_ofs[pos].v4);
817+
op_flip = COUNT_FLIP[_mm_cvtsi128_si32(II)];
818+
op_flip += COUNT_FLIP[_mm_extract_epi16(II, 4)];
814819
t = TEST_EPI8_MASK16(P2, mask_vdhd[pos].v2[1]);
815-
#endif
816-
op_flip += COUNT_FLIP[cf_ofs_d[1][pos] + (t & 0xFF)];
820+
op_flip += COUNT_FLIP[cf_ofs[pos].ui[1] + (t & 0xFF)];
821+
// op_flip += COUNT_FLIP[cf_ofs[pos].ui[3] + (t >> 8)];
817822
op_flip += COUNT_FLIP[((pos & 0x38) << 5) + (t >> 8)];
823+
#endif
818824

819825
p_flips = op_flip & 0xFF;
820826
if (p_flips)

0 commit comments

Comments
 (0)