Skip to content

Commit 2367fd6

Browse files
committed
More COUNT_FLIP substitutions in bmi2
1 parent 2275ebf commit 2367fd6

File tree

2 files changed

+47
-45
lines changed

2 files changed

+47
-45
lines changed

src/count_last_flip_bmi2.c

Lines changed: 45 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -294,7 +294,7 @@ const uint16_t COUNT_FLIP[] = {
294294
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
295295
0x0600, 0x0600, 0x0600, 0x0600, 0x0600, 0x0600, 0x0600, 0x0600,
296296
0x0800, 0x0800, 0x0800, 0x0800, 0x0a00, 0x0a00, 0x0c00, 0x0000,
297-
// 7[0]: 2048
297+
// 7[0]: 2048 (bbbbbb*)
298298
0x0000, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0600,
299299
0x0006, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0800,
300300
0x0008, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0600,
@@ -303,7 +303,8 @@ const uint16_t COUNT_FLIP[] = {
303303
0x0006, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0800,
304304
0x0008, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0600,
305305
0x0006, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0000,
306-
// 7[2]: 2112
306+
// 7[1] (bbbbb*0) -> 6[0] (bbbbb*)
307+
// 7[2]: 2112 (bbbb*bb)
307308
0x0000, 0x0002, 0x0200, 0x0000, 0x0200, 0x0202, 0x0400, 0x0200,
308309
0x0002, 0x0004, 0x0202, 0x0002, 0x0400, 0x0402, 0x0600, 0x0400,
309310
0x0004, 0x0006, 0x0204, 0x0004, 0x0200, 0x0202, 0x0400, 0x0200,
@@ -312,7 +313,7 @@ const uint16_t COUNT_FLIP[] = {
312313
0x0002, 0x0004, 0x0202, 0x0002, 0x0400, 0x0402, 0x0600, 0x0400,
313314
0x0004, 0x0006, 0x0204, 0x0004, 0x0200, 0x0202, 0x0400, 0x0200,
314315
0x0002, 0x0004, 0x0202, 0x0002, 0x0000, 0x0002, 0x0200, 0x0000,
315-
// 7[3]: 2176
316+
// 7[3]: 2176 (bbb*bbb)
316317
0x0000, 0x0004, 0x0002, 0x0002, 0x0200, 0x0200, 0x0400, 0x0000,
317318
0x0200, 0x0204, 0x0202, 0x0202, 0x0400, 0x0400, 0x0600, 0x0200,
318319
0x0002, 0x0006, 0x0004, 0x0004, 0x0202, 0x0202, 0x0402, 0x0002,
@@ -321,7 +322,7 @@ const uint16_t COUNT_FLIP[] = {
321322
0x0200, 0x0204, 0x0202, 0x0202, 0x0400, 0x0400, 0x0600, 0x0200,
322323
0x0002, 0x0006, 0x0004, 0x0004, 0x0202, 0x0202, 0x0402, 0x0002,
323324
0x0000, 0x0004, 0x0002, 0x0002, 0x0200, 0x0200, 0x0400, 0x0000,
324-
// 7[4]: 2240
325+
// 7[4]: 2240 (bb*bbbb)
325326
0x0000, 0x0006, 0x0004, 0x0004, 0x0002, 0x0002, 0x0002, 0x0002,
326327
0x0200, 0x0200, 0x0200, 0x0200, 0x0400, 0x0400, 0x0600, 0x0000,
327328
0x0200, 0x0206, 0x0204, 0x0204, 0x0202, 0x0202, 0x0202, 0x0202,
@@ -330,72 +331,73 @@ const uint16_t COUNT_FLIP[] = {
330331
0x0202, 0x0202, 0x0202, 0x0202, 0x0402, 0x0402, 0x0602, 0x0002,
331332
0x0000, 0x0006, 0x0004, 0x0004, 0x0002, 0x0002, 0x0002, 0x0002,
332333
0x0200, 0x0200, 0x0200, 0x0200, 0x0400, 0x0400, 0x0600, 0x0000,
333-
// 6[0]: 2368
334+
// 7[5] (0*bbbbb) -> 8[5] (00*bbbbb)
335+
// 7[6] (*bbbbbb) -> 8[6] (0*bbbbbb)
336+
// 6[0]: 2304 (bbbbb*)
334337
0x0000, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0600,
335338
0x0006, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0800,
336339
0x0008, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0600,
337340
0x0006, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0000,
338-
// 6[2]: 2400
341+
// 6[1] (bbbb*0) -> 5[0] (bbbb*)
342+
// 6[2]: 2336 (bbb*bb)
339343
0x0000, 0x0002, 0x0200, 0x0000, 0x0200, 0x0202, 0x0400, 0x0200,
340344
0x0002, 0x0004, 0x0202, 0x0002, 0x0400, 0x0402, 0x0600, 0x0400,
341345
0x0004, 0x0006, 0x0204, 0x0004, 0x0200, 0x0202, 0x0400, 0x0200,
342346
0x0002, 0x0004, 0x0202, 0x0002, 0x0000, 0x0002, 0x0200, 0x0000,
343-
// 6[3]: 2432
347+
// 6[3]: 2368 (bb*bbb)
344348
0x0000, 0x0004, 0x0002, 0x0002, 0x0200, 0x0200, 0x0400, 0x0000,
345349
0x0200, 0x0204, 0x0202, 0x0202, 0x0400, 0x0400, 0x0600, 0x0200,
346350
0x0002, 0x0006, 0x0004, 0x0004, 0x0202, 0x0202, 0x0402, 0x0002,
347351
0x0000, 0x0004, 0x0002, 0x0002, 0x0200, 0x0200, 0x0400, 0x0000,
348-
// 6[5]: 2464
349-
0x0000, 0x0008, 0x0006, 0x0006, 0x0004, 0x0004, 0x0004, 0x0004,
350-
0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
351-
0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
352-
0x0400, 0x0400, 0x0400, 0x0400, 0x0600, 0x0600, 0x0800, 0x0000,
353-
// 5[0]: 2496
352+
// 6[4] (0*bbbb) -> 8[4] (000*bbbb)
353+
// 6[5] (*bbbbb) -> 8[5] (00*bbbbb)
354+
// 5[0]: 2400 (bbbb*)
354355
0x0000, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0600,
355356
0x0006, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0000,
356-
// 5[2]: 2512
357-
0x0000, 0x0002, 0x0200, 0x0000, 0x0200, 0x0202, 0x0400, 0x0200,
358-
0x0002, 0x0004, 0x0202, 0x0002, 0x0000, 0x0002, 0x0200, 0x0000,
359-
// 5[4]: 2528
360-
0x0000, 0x0006, 0x0004, 0x0004, 0x0002, 0x0002, 0x0002, 0x0002,
361-
0x0200, 0x0200, 0x0200, 0x0200, 0x0400, 0x0400, 0x0600, 0x0000,
362-
// 4[0]: 2544
357+
// 5[1] (bbb*0) -> 4[0] (bbb*)
358+
// 5[2]: 2416 (bb*bb)
359+
0x0000, 0x0002, 0x0200, 0x0000, 0x0200, 0x0202, 0x0400, 0x0200,
360+
0x0002, 0x0004, 0x0202, 0x0002, 0x0000, 0x0002, 0x0200, 0x0000,
361+
// 5[3] (0*bbb) -> 8[3] (0000*bbb)
362+
// 5[4] (*bbbb) -> 8[4] (000*bbbb)
363+
// 4[0]: 2432 (bbb*)
363364
0x0000, 0x0200, 0x0002, 0x0400, 0x0004, 0x0200, 0x0002, 0x0000,
364-
// 4[3]: 2552
365-
0x0000, 0x0004, 0x0002, 0x0002, 0x0200, 0x0200, 0x0400, 0x0000,
366-
// 3[0]: 2560
365+
// 4[1] (bb*0) -> 3[0] (bb*)
366+
// 4[2] (0*bb) -> 8[2] (00000*bb)
367+
// 4[3] (*bbb) -> 8[3] (0000*bbb)
368+
// 3[0]: 2440 (bb*)
367369
0x0000, 0x0200, 0x0002, 0x0000,
368-
// 3[2]: 2564
369-
0x0000, 0x0002, 0x0200, 0x0000,
370-
};
370+
// 3[1] (0*0) -> 0
371+
// 3[2] (*bb) -> 8[2] (00000*bb)
372+
}; // 2444
371373

372374
enum {
373375
CF80 = 0, CF81 = 256, CF82 = 512, CF83 = 768, CF84 = 1024, CF85 = 1280, CF86 = 1536, CF87 = 1792,
374376
CF70 = 2048, CF72 = 2112, CF73 = 2176, CF74 = 2240,
375-
CF60 = 2304, CF62 = 2336, CF63 = 2368, CF65 = 2400,
376-
CF50 = 2432, CF52 = 2448, CF54 = 2464,
377-
CF40 = 2480, CF43 = 2488,
378-
CF30 = 2496, CF32 = 2500
377+
CF60 = 2304, CF62 = 2336, CF63 = 2368,
378+
CF50 = 2400, CF52 = 2416,
379+
CF40 = 2432,
380+
CF30 = 2440
379381
};
380382

381383
const unsigned short cf_ofs_d[2][64] = {{
382384
0, 0, CF30, CF40, CF50, CF60, CF70, CF80,
383-
0, 0, CF30, CF40, CF50, CF60, CF81, CF70, // CF31 -> 0, CF41..CF71 -> CF30..CF60
384-
CF32, CF32, CF52, CF62, CF72, CF82, CF60, CF60, // CF42 -> CF32, CF71 -> CF60
385-
CF43, CF43, CF63, CF73, CF83, CF72, CF50, CF50, // CF53 -> CF43, CF61 -> CF50
386-
CF54, CF54, CF74, CF84, CF73, CF62, CF40, CF40, // CF64 -> CF54, CF51 -> CF40
387-
CF65, CF65, CF85, CF74, CF63, CF52, CF30, CF30, // CF75 -> CF65, CF41 -> CF30
388-
CF86, CF86, CF65, CF54, CF43, CF32, 0, 0, // CF31 -> 0, CF76 -> CF86, CF75..CF42 -> CF65..CF32
389-
CF87, CF86, CF65, CF54, CF43, CF32, 0, 0 // CF76 -> CF86
385+
0, 0, CF30, CF40, CF50, CF60, CF81, CF70,
386+
CF82, CF82, CF52, CF62, CF72, CF82, CF60, CF60,
387+
CF83, CF83, CF63, CF73, CF83, CF72, CF50, CF50,
388+
CF84, CF84, CF74, CF84, CF73, CF62, CF40, CF40,
389+
CF85, CF85, CF85, CF74, CF63, CF52, CF30, CF30,
390+
CF86, CF86, CF85, CF84, CF83, CF82, 0, 0,
391+
CF87, CF86, CF85, CF84, CF83, CF82, 0, 0
390392
}, {
391393
CF80, CF70, CF60, CF50, CF40, CF30, 0, 0,
392394
CF70, CF81, CF60, CF50, CF40, CF30, 0, 0,
393-
CF60, CF60, CF82, CF72, CF62, CF52, CF32, CF32,
394-
CF50, CF50, CF72, CF83, CF73, CF63, CF43, CF43,
395-
CF40, CF40, CF62, CF73, CF84, CF74, CF54, CF54,
396-
CF30, CF30, CF52, CF63, CF74, CF85, CF65, CF65,
397-
0, 0, CF32, CF43, CF54, CF65, CF86, CF86,
398-
0, 0, CF32, CF43, CF54, CF65, CF86, CF87
395+
CF60, CF60, CF82, CF72, CF62, CF52, CF82, CF82,
396+
CF50, CF50, CF72, CF83, CF73, CF63, CF83, CF83,
397+
CF40, CF40, CF62, CF73, CF84, CF74, CF84, CF84,
398+
CF30, CF30, CF52, CF63, CF74, CF85, CF85, CF85,
399+
0, 0, CF82, CF83, CF84, CF85, CF86, CF86,
400+
0, 0, CF82, CF83, CF84, CF85, CF86, CF87
399401
}};
400402

401403
/* bit masks for diagonal lines */

src/count_last_flip_sse.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -498,7 +498,7 @@ enum {
498498
const unsigned short cf_ofs_d[2][64] = {{
499499
#ifdef AVXLASTFLIP
500500
0, 0, RF30, RF40, RF50, RF60, RF70, CF80,
501-
0, 0, RF41, RF51, RF61, RF71, CF81, CF81,
501+
0, 0, RF41, RF51, RF61, RF71, CF81, CF81,
502502
CF82, CF82, RF52, RF62, RF72, CF82, CF82, CF82,
503503
CF83, CF83, RF63, RF73, CF83, LF72, CF83, CF83,
504504
CF84, CF84, RF74, CF84, LF73, LF62, CF84, CF84,
@@ -507,7 +507,7 @@ const unsigned short cf_ofs_d[2][64] = {{
507507
CF87, LF76, LF65, LF54, LF43, LF32, 0, 0
508508
#else
509509
0, 0, CF82, CF83, CF84, CF85, CF86, CF87,
510-
0, 0, CF82, CF83, CF84, CF85, CF86, LF76,
510+
0, 0, CF82, CF83, CF84, CF85, CF86, LF76,
511511
RF30, RF41, RF52, RF63, RF74, CF85, LF75, LF65,
512512
RF40, RF51, RF62, RF73, CF84, LF74, LF64, LF54,
513513
RF50, RF61, RF72, CF83, LF73, LF63, LF53, LF43,

0 commit comments

Comments
 (0)