This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 3a36e60

Improve tclib::MappingOptions bindings
This commit adds the previously missing `usePrivateMemory` binding. Additionally, each setter on the Python side now returns the `MappingOptions` instance, so the methods become nicely chainable.
1 parent 58f2d0b commit 3a36e60
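
With the chainable setters, a set of mapping options can be built in one fluent expression. A minimal sketch of the usage this enables, based on the bindings touched in this commit; the particular option values and tensor sizes here are illustrative only, not recommendations:

```python
from tensor_comprehensions.tclib import MappingOptions

# Start from the naive options and refine them in a single chained expression;
# every setter below now returns the MappingOptions instance it was called on.
options = (MappingOptions('naive')
           .useSharedMemory(True)
           .usePrivateMemory(True)
           .scheduleFusionStrategy("Max")
           .tile((4, 8))
           .mapToThreads((32, 8))
           .mapToBlocks((64, 64))
           .unroll(16))

# The resulting object can be passed wherever MappingOptions is expected,
# e.g. compile(mm, "matmul", (mat1, mat2), options).
```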

2 files changed: +26 -7 lines changed

python/examples/tc_pybind_example.py

Lines changed: 5 additions & 5 deletions
@@ -86,7 +86,7 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 ################################################################################
 from tensor_comprehensions.tclib import compile
 
-executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
+executor = compile(mm, "matmul", (mat1, mat2), MappingOptions('naive'))
 outputs = executor.run((mat1, mat2), ())
 outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))
 time_tc(100,
@@ -108,7 +108,7 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 compilation_cache = CompilationCache(mm)
 # Compilation returns an allocated tuple of outputs with the proper shapes.
 # Allocation overhead is negligible compared to compilation overhead.
-compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
+compilation_cache.compile("matmul", (mat1, mat2), MappingOptions('naive'))
 # Run once without timing
 compilation_cache.unchecked_run("matmul", (mat1, mat2), ())
 # unchecked_run on tensors
@@ -136,7 +136,7 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 top1 = tuner.tune(
     "matmul",
     (mat1, mat2),
-    MappingOptions(),
+    MappingOptions('naive'),
     TunerConfig(threads = 8, pop_size = 25, generations = 3, devices = "0"))
 cache = MappingOptionsCache(unique_filename)
 top10 = cache.load(mm, "matmul", (mat1, mat2), 10)
@@ -213,7 +213,7 @@ def compileOrTune(self, name = "", force_reinforcement_tuning = False, inputs =
     "########################################################")
 
 if len(base_options_list) == 0:
-    mapping_options = MappingOptions()
+    mapping_options = MappingOptions('naive')
 else:
     mapping_options = base_options_list[0]
 
@@ -417,7 +417,7 @@ def compileOrTune(self, name = "", force_reinforcement_tuning = False, inputs =
     "########################################################")
 
 if len(base_options_list) == 0:
-    mapping_options = MappingOptions()
+    mapping_options = MappingOptions('naive')
 else:
     mapping_options = base_options_list[0]
 
tensor_comprehensions/pybinds/tclib.cc

Lines changed: 21 additions & 2 deletions
@@ -514,7 +514,11 @@ PYBIND11_MODULE(tclib, m) {
       "MappingOptions for a Tensor Comprehensions (TC)",
       py::module_local())
       .def(
-          py::init([]() {
+          py::init([](const std::string& optionsName) {
+            TC_CHECK_EQ(optionsName, "naive")
+                << "Naive options are the only constructible user-facing "
+                << "options. We recommended using the tuner to get better "
+                << "options or, alternatively, retrieving some from a cache.";
             return tc::CudaMappingOptions::makeNaiveMappingOptions();
           }),
           "Initialize naive CudaMappingOption")
@@ -543,6 +547,10 @@ PYBIND11_MODULE(tclib, m) {
           &tc::CudaMappingOptions::useSharedMemory,
           "Create block-local copies of data in shared memory when this can "
           "leverage data reuse or global memory access coalescing")
+      .def(
+          "usePrivateMemory",
+          &tc::CudaMappingOptions::usePrivateMemory,
+          "Create thread-local copies of data in private memory")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,
@@ -556,13 +564,15 @@ PYBIND11_MODULE(tclib, m) {
           "scheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
             instance.scheduleFusionStrategy(type);
+            return instance;
           },
           "Set up outerScheduleFusionStrategy and intraTileFusionStrategy "
           "to the given value")
       .def(
           "outerScheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
             instance.outerScheduleFusionStrategy(type);
+            return instance;
           },
           "Require TC to try and execute different TC expressions interleaved "
           "(Max), separately (Min)\n"
@@ -574,6 +584,7 @@ PYBIND11_MODULE(tclib, m) {
           "intraTileScheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
             instance.intraTileScheduleFusionStrategy(type);
+            return instance;
           },
           "Require TC to try and execute different TC expressions interleaved "
           "(Max), separately (Min)\n"
@@ -584,7 +595,10 @@ PYBIND11_MODULE(tclib, m) {
           "tile",
           // pybind11 has implicit conversion from tuple -> vector
           [](tc::CudaMappingOptions& instance,
-             std::vector<uint64_t>& tileSizes) { instance.tile(tileSizes); },
+             std::vector<uint64_t>& tileSizes) {
+            instance.tile(tileSizes);
+            return instance;
+          },
           "Perform loop tiling on the generated code with the given sizes. "
           "Independent of mapping to a\n"
           "grid of thread blocks")
@@ -593,6 +607,7 @@ PYBIND11_MODULE(tclib, m) {
           [](tc::CudaMappingOptions& instance,
              std::vector<uint64_t>& threadSizes) {
             instance.mapToThreads(threadSizes);
+            return instance;
           },
           "The configuration of CUDA block, i.e. the number of CUDA threads "
           "in each block along three\n"
@@ -604,6 +619,7 @@ PYBIND11_MODULE(tclib, m) {
           [](tc::CudaMappingOptions& instance,
              std::vector<uint64_t>& blockSizes) {
             instance.mapToBlocks(blockSizes);
+            return instance;
           },
           "The configuration of CUDA grid, i.e. the number of CUDA blocks "
           "along three dimensions. Must be\n"
@@ -613,13 +629,15 @@ PYBIND11_MODULE(tclib, m) {
           "matchLibraryCalls",
           [](tc::CudaMappingOptions& instance, bool match) {
             instance.matchLibraryCalls(match);
+            return instance;
           },
           "Replace computation patterns with calls to highly optimized "
           "libraries (such as CUB, CUTLASS) when possible")
       .def(
           "fixParametersBeforeScheduling",
           [](tc::CudaMappingOptions& instance, bool fix) {
             instance.fixParametersBeforeScheduling(fix);
+            return instance;
           },
           "Perform automatic loop scheduling taking into account specific "
           "tensor sizes.\n"
@@ -631,6 +649,7 @@ PYBIND11_MODULE(tclib, m) {
           "unroll",
           [](tc::CudaMappingOptions& instance, uint64_t factor) {
             instance.unroll(factor);
+            return instance;
           },
           "Perform loop unrolling on the generated code and produce at "
           "most the given number of statements");
