@@ -451,8 +451,8 @@ TEST_F(MapperMemoryPromotionRAW, throwIfCopiesBelowThreads) {
451
451
}
452
452
453
453
class MatMulBias : public TestMapper {
454
- public :
455
- std::string emitCode (
454
+ protected :
455
+ std::unique_ptr<MappedScop> prepare (
456
456
const std::unordered_map<std::string, size_t >& parameters,
457
457
const CudaMappingOptions& mappingOptions) {
458
458
std::string tc = R"TC(
@@ -462,9 +462,45 @@ def fun(float(N,K) A, float(K,M) B, float(N,M) C) -> (O) {
462
462
}
463
463
)TC" ;
464
464
465
- auto mscop = makeMappedScop (tc, mappingOptions, parameters);
465
+ return makeMappedScop (tc, mappingOptions, parameters);
466
+ }
467
+
468
// Generates code for an already prepared mapped scop: calls codegen on it and
// returns the first element of the tuple that codegen produces.
+ std::string emitCode (const std::unique_ptr<MappedScop>& mscop) {
466
469
return std::get<0 >(mscop->codegen (" fun" ));
467
470
}
471
+
472
// Convenience overload: builds the mapped scop with prepare() from the given
// parameter values and mapping options, then forwards it to the emitCode
// overload that takes a mapped scop and returns that overload's result.
+ std::string emitCode (
473
+ const std::unordered_map<std::string, size_t >& parameters,
474
+ const CudaMappingOptions& mappingOptions) {
475
+ return emitCode (prepare (parameters, mappingOptions));
476
+ }
477
+
478
// Checks that `code` contains none of the three declaration patterns searched
// for below (one each for _A_0, _B_0 and _C_0). According to the failure
// messages, finding one means the corresponding tensor was promoted to
// registers although it should not have been. Each check is a separate
// EXPECT_TRUE, so a failing check does not stop the remaining ones.
+ void expectNoABCPromotion (const std::string& code) {
479
// std::string::find yields npos when the pattern does not occur in `code`.
+ auto aDeclPos = code.find (" float32 _A_0" );
480
+ auto bDeclPos = code.find (" float32 _B_0" );
481
+ auto cDeclPos = code.find (" float32 _C_0" );
482
+ EXPECT_TRUE (aDeclPos == std::string::npos)
483
+ << " tensor A promoted to register but has elements accessed "
484
+ << " by multiple threads" ;
485
+ EXPECT_TRUE (bDeclPos == std::string::npos)
486
+ << " tensor B promoted to register but has elements accessed "
487
+ << " by multiple threads" ;
488
+ EXPECT_TRUE (cDeclPos == std::string::npos)
489
+ << " tensor C promoted to register but has no reuse" ;
490
+ }
491
+
492
// Checks that `code` never indexes _O_0 with a first subscript that begins
// with a loop iterator name (assumed to start with "c") or with "t1".
// Both checks are non-fatal EXPECT_TRUE assertions on the result of
// std::string::find being npos.
+ void expectNoSymbolicSubscript (const std::string& code) {
493
+ // We don't know the exact name of the iterator, but it starts with c.
494
+ auto oWithIteratorPos = code.find (" _O_0[c" );
495
+ auto oWithThreadPos = code.find (" _O_0[t1" );
496
+
497
+ EXPECT_TRUE (oWithIteratorPos == std::string::npos)
498
+ << " accessing local arrays with iterators in subscripts makes "
499
+ << " these arrays placed in local memory instead of registers" ;
500
+ EXPECT_TRUE (oWithThreadPos == std::string::npos)
501
+ << " expected per-thread groups to be computed, i.e. thread "
502
+ << " identifiers should not appear in the subscripts" ;
503
+ }
468
504
};
469
505
470
506
TEST_F (MatMulBias, RegisterPromotion) {
@@ -482,8 +518,6 @@ TEST_F(MatMulBias, RegisterPromotion) {
482
518
483
519
auto originalAccPos =
484
520
code.find (" O[32 * b0 + c3][t0 + 32 * b1]" , copyToPos + 1 );
485
- auto cDeclPos = code.find (" float32 _C_0" );
486
- auto aDeclPos = code.find (" float32 _A_0" );
487
521
488
522
EXPECT_TRUE (declPos != std::string::npos) << " no declaration of the register" ;
489
523
EXPECT_TRUE (copyToPos != std::string::npos) << " expected copy to register" ;
@@ -492,10 +526,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
492
526
493
527
EXPECT_NE (originalAccPos, copyFromPos)
494
528
<< " global array reference is used in main computation" ;
495
- EXPECT_TRUE (cDeclPos == std::string::npos)
496
- << " tensor C promoted to register but has no reuse" ;
497
- EXPECT_TRUE (aDeclPos == std::string::npos)
498
- << " tensor A promoted to register but has elements accessed by multiple threads" ;
529
+
530
+ expectNoABCPromotion (code);
499
531
}
500
532
501
533
TEST_F (MatMulBias, RegisterPromotionSharedPreference) {
@@ -506,16 +538,93 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
506
538
.usePrivateMemory (true );
507
539
508
540
auto code = emitCode ({{" N" , 42 }, {" M" , 56 }, {" K" , 37 }}, mappingOptions);
509
- auto declPos = code.find (" float32 _O_0[1][1]" );
510
- auto cDeclPos = code.find (" float32 _C_0[1][1]" );
511
- auto aDeclPos = code.find (" float32 _A_0[1][1]" );
512
541
542
+ auto declPos = code.find (" float32 _O_0[1][1]" );
513
543
EXPECT_TRUE (declPos == std::string::npos)
514
544
<< " not expected promotion to register because promoted to shared" ;
515
- EXPECT_TRUE (cDeclPos == std::string::npos)
516
- << " tensor C promoted to register but has no reuse" ;
517
- EXPECT_TRUE (aDeclPos == std::string::npos)
518
- << " tensor A promoted to register but has elements accessed by multiple threads" ;
545
+
546
+ expectNoABCPromotion (code);
547
+ }
548
+
549
// Calls promoteToRegistersBelow by hand on the schedule root, using mapping
// options with unroll(512) and with shared and private memory usage switched
// off, then inspects the emitted code: a 4x1 declaration of _O_0 must be
// present, the A/B/C and symbolic-subscript helper checks must pass, and
// _O_0 must be referenced with each of the constant subscripts [0][0],
// [1][0], [2][0] and [3][0].
+ TEST_F (MatMulBias, RegistersAtRoot) {
550
+ // Disable automatic promotion to registers because we are going to call it
551
+ // manually. Require sufficient unrolling to actually hit registers.
552
+ auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions ()
553
+ .unroll (512 )
554
+ .useSharedMemory (false )
555
+ .usePrivateMemory (false );
556
+
557
+ auto mscop = prepare ({{" N" , 42 }, {" M" , 56 }, {" K" , 37 }}, mappingOptions);
558
+ promoteToRegistersBelow (*mscop, mscop->scop ().scheduleRoot ());
559
+ auto code = emitCode (mscop);
560
+
561
+ // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
562
+ // after tiling by 32.
563
+ auto oDeclPos = code.find (" float32 _O_0[4][1];" );
564
+ EXPECT_TRUE (oDeclPos != std::string::npos)
565
+ << " expected O to be promoted to registers" ;
566
+
567
+ expectNoABCPromotion (code);
568
+ expectNoSymbolicSubscript (code);
569
+
// One lookup per element of the first dimension of the expected 4x1 array.
570
+ auto o00Pos = code.find (" _O_0[0][0]" );
571
+ auto o10Pos = code.find (" _O_0[1][0]" );
572
+ auto o20Pos = code.find (" _O_0[2][0]" );
573
+ auto o30Pos = code.find (" _O_0[3][0]" );
574
+
575
+ EXPECT_TRUE (o00Pos != std::string::npos)
576
+ << " expected constant subscripts in _O_0" ;
577
+ EXPECT_TRUE (o10Pos != std::string::npos)
578
+ << " expected constant subscripts in _O_0" ;
579
+ EXPECT_TRUE (o20Pos != std::string::npos)
580
+ << " expected constant subscripts in _O_0" ;
581
+ EXPECT_TRUE (o30Pos != std::string::npos)
582
+ << " expected constant subscripts in _O_0" ;
583
+ }
584
+
585
// Counterpart of the root-level promotion test with unroll(1): calls
// promoteToRegistersBelow by hand on the schedule root and then expects the
// emitted code to contain no scalar _O_0 declaration, and the A/B/C and
// symbolic-subscript helper checks to pass.
+ TEST_F (MatMulBias, RegistersAtRootNotEnoughUnroll) {
586
+ // Disable automatic promotion to registers because we are going to call it
587
+ // manually. Require no unrolling so as to make promotion to registers
588
+ // invalid.
589
+ auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions ()
590
+ .unroll (1 )
591
+ .useSharedMemory (false )
592
+ .usePrivateMemory (false );
593
+
594
+ auto mscop = prepare ({{" N" , 42 }, {" M" , 56 }, {" K" , 37 }}, mappingOptions);
595
+ promoteToRegistersBelow (*mscop, mscop->scop ().scheduleRoot ());
596
+ auto code = emitCode (mscop);
// NOTE(review): only the scalar form of the declaration is searched for here;
// an array-shaped _O_0 declaration would not be caught by this pattern.
597
+ auto oDeclPos = code.find (" float32 _O_0;" );
598
+
599
+ EXPECT_TRUE (oDeclPos == std::string::npos)
600
+ << " not expected O to be promoted to registers" ;
601
+
602
+ expectNoABCPromotion (code);
603
+ expectNoSymbolicSubscript (code);
604
+ }
605
+
606
// Calls promoteToRegistersBelow by hand on the first Band node returned by a
// DFS post-order collection over the schedule tree, then expects the emitted
// code to declare _O_0 as a 1x1 array and the A/B/C and symbolic-subscript
// helper checks to pass.
+ TEST_F (MatMulBias, RegistersBelowFirstBand) {
607
+ using namespace polyhedral ::detail;
608
+
609
+ // Disable automatic promotion to registers because we are going to call it
610
+ // manually.
611
+ auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions ()
612
+ .useSharedMemory (false )
613
+ .usePrivateMemory (false );
614
+ auto mscop = prepare ({{" N" , 42 }, {" M" , 56 }, {" K" , 37 }}, mappingOptions);
615
+
616
+ auto nodes = ScheduleTree::collectDFSPostorder (
617
+ mscop->scop ().scheduleRoot (), ScheduleTreeType::Band);
// Fatal assertion: the test stops here if no Band node was collected, so the
// nodes[0] access below is never reached on an empty result.
618
+ ASSERT_GT (nodes.size (), 0u );
619
+ auto node = nodes[0 ];
620
+ promoteToRegistersBelow (*mscop, node);
621
+ auto code = emitCode (mscop);
622
+
623
+ auto oDeclPos = code.find (" float32 _O_0[1][1];" );
624
+ EXPECT_TRUE (oDeclPos != std::string::npos)
625
+ << " expected O to be promoted to registers" ;
626
+ expectNoABCPromotion (code);
627
+ expectNoSymbolicSubscript (code);
519
628
}
520
629
521
630
class Strided : public TestMapper {
0 commit comments