This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 65e42b5

Merge pull request #467 from nicolasvasilache/pr/more-python-bindings
More Python bindings
2 parents: ac2975a + 7cef7f6

2 files changed: +69 -60 lines

tc/proto/mapping_options.proto (6 additions, 7 deletions)

@@ -44,13 +44,12 @@ message MappingOptionsProto {
   optional TilingProto tiling = 4;
   // Unroll innermost loops to until at most "unroll" copies. If not provided,
   // do not unroll.
-  optional uint64 unroll = 7;
-  // Apply tiling to imperfectly nested loops if possible. Same tilesizes are
-  // used.
-  required bool tile_imperfectly_nested = 8;
-  // reserved 5, 6, 9 to 13; can only activate with proto3
-  // Reserved: 5, 6, 9-13 -> factored out into CudaMappingOptionsProto
-  required bool match_library_calls = 14;
+  optional uint64 unroll = 5;
+  // Apply tiling to imperfectly nested loops if possible.
+  // Same tile sizes are used.
+  required bool tile_imperfectly_nested = 6;
+  // Match library calls when possible (CUB reductions for now)
+  required bool match_library_calls = 7;
 }
 
 message CudaMappingOptionsProto {
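
The renumbering gives unroll, tile_imperfectly_nested, and match_library_calls the compact tags 5, 6, and 7 that the removed comment had marked reserved; note that abandoning the old tags 7, 8, and 14 changes the wire format, so options serialized before this commit will not round-trip. A minimal sketch of populating the renumbered message from Python, assuming a mapping_options_pb2 module generated by protoc from this file (the module name and import path are assumptions):

# Sketch only: `mapping_options_pb2` is the module protoc would generate
# from tc/proto/mapping_options.proto; its name and import path are assumed.
from google.protobuf import text_format
import mapping_options_pb2

opts = mapping_options_pb2.MappingOptionsProto()
opts.unroll = 128                    # optional uint64, now tag 5
opts.tile_imperfectly_nested = True  # required bool, now tag 6
opts.match_library_calls = True      # required bool, now tag 7

# Print the message in protobuf text format, mirroring what the new
# __str__ binding in tc.cc below does via TextFormat::PrintToString.
print(text_format.MessageToString(opts))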

tensor_comprehensions/pybinds/tc.cc (63 additions, 53 deletions)

@@ -287,24 +287,16 @@ PYBIND11_MODULE(tc, m) {
           " 5. group_conv\n"
           " 6. single_thread")
       .def(
-          "maxSharedMemory",
-          &tc::CudaMappingOptions::maxSharedMemory,
-          "The amount of shared memory to use, in bytes. If not provided, "
-          "TC will query the active GPU and use all available shared memory.")
-      .def(
-          "useSharedMemory",
-          &tc::CudaMappingOptions::useSharedMemory,
-          "Create block-local copies of data in shared memory when this can "
-          "leverage data reuse or global memory access coalescing")
-      .def(
-          "unrollCopyShared",
-          &tc::CudaMappingOptions::unrollCopyShared,
-          "Also unroll the copies to and from shared memory. If an unroll "
-          "value is not provided, has no effect")
-      .def(
-          "useReadOnlyCache",
-          &tc::CudaMappingOptions::useReadOnlyCache,
-          "Use the readonly cache (i.e. emit __ldg loads)")
+          "__str__",
+          [](tc::CudaMappingOptions& instance) {
+            std::string str;
+            google::protobuf::TextFormat::PrintToString(instance.proto(), &str);
+            return str;
+          },
+          "Returns the CudaMappingOptions as a human-readable string")
+      //
+      // Generic options
+      //
       .def(
           "scheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
@@ -334,20 +326,16 @@ PYBIND11_MODULE(tc, m) {
           "(Preserve3Coincident) by\n"
           "performing loop fusion and fission. Applies before tiling")
       .def(
-          "serialize",
-          [](tc::CudaMappingOptions& instance) {
-            std::string str = instance.toProtobufSerializedString();
-            return py::bytes(str);
-          },
-          "Serialize the options to a protobuf string")
-      .def(
-          "toString",
-          [](tc::CudaMappingOptions& instance) {
-            std::string str;
-            google::protobuf::TextFormat::PrintToString(instance.proto(), &str);
-            return str;
+          "fixParametersBeforeScheduling",
+          [](tc::CudaMappingOptions& instance, bool fix) {
+            instance.fixParametersBeforeScheduling(fix);
           },
-          "Returns the CudaMappingOptions as a human-readable string")
+          "Perform automatic loop scheduling taking into account specific "
+          "tensor sizes.\n"
+          "May produce faster kernels but significantly increases compilation "
+          "time.\n"
+          "Note that the mapping will be performed for specific tensor sizes "
+          "anyway")
       .def(
           "tile",
           // pybind11 has implicit conversion from list -> vector
@@ -356,6 +344,29 @@ PYBIND11_MODULE(tc, m) {
           "Perform loop tiling on the generated code with the given sizes. "
           "Independent of mapping to a\n"
           "grid of thread blocks")
+      .def(
+          "tile_imperfectly_nested",
+          [](tc::CudaMappingOptions& instance, bool tile) {
+            instance.tileImperfectlyNested(tile);
+          },
+          "Allow imperfectly nested loop tiling")
+      .def(
+          "unroll",
+          [](tc::CudaMappingOptions& instance, uint64_t factor) {
+            instance.unroll(factor);
+          },
+          "Perform loop unrolling on the generated code and produce at "
+          "most the given number of statements")
+      .def(
+          "matchLibraryCalls",
+          [](tc::CudaMappingOptions& instance, bool match) {
+            instance.matchLibraryCalls(match);
+          },
+          "Replace computation patterns with calls to highly optimized "
+          "libraries (such as CUB, CUTLASS) when possible")
+      //
+      // CUDA-specific options
+      //
       .def(
           "mapToThreads",
           [](tc::CudaMappingOptions& instance,
@@ -378,30 +389,29 @@ PYBIND11_MODULE(tc, m) {
           "within the range allowed by CUDA (maximum 2^31-1 for the first "
           "value and 65535 for the second and third)")
       .def(
-          "matchLibraryCalls",
-          [](tc::CudaMappingOptions& instance, bool match) {
-            instance.matchLibraryCalls(match);
-          },
-          "Replace computation patterns with calls to highly optimized "
-          "libraries (such as CUB, CUTLASS) when possible")
+          "useSharedMemory",
+          &tc::CudaMappingOptions::useSharedMemory,
+          "Create block-local copies of data in shared memory when this can "
+          "leverage data reuse or global memory access coalescing")
       .def(
-          "fixParametersBeforeScheduling",
-          [](tc::CudaMappingOptions& instance, bool fix) {
-            instance.fixParametersBeforeScheduling(fix);
-          },
-          "Perform automatic loop scheduling taking into account specific "
-          "tensor sizes.\n"
-          "May produce faster kernels but significantly increases compilation "
-          "time.\n"
-          "Note that the mapping will be performed for specific tensor sizes "
-          "anyway")
+          "usePrivateMemory",
+          &tc::CudaMappingOptions::usePrivateMemory,
+          "Use private memory (registers) when possible")
+
       .def(
-          "unroll",
-          [](tc::CudaMappingOptions& instance, uint64_t factor) {
-            instance.unroll(factor);
-          },
-          "Perform loop unrolling on the generated code and produce at "
-          "most the given number of statements");
+          "unrollCopyShared",
+          &tc::CudaMappingOptions::unrollCopyShared,
+          "Also unroll the copies to and from shared memory. If an unroll "
+          "value is not provided, has no effect")
+      .def(
+          "maxSharedMemory",
+          &tc::CudaMappingOptions::maxSharedMemory,
+          "The amount of shared memory to use, in bytes. If not provided, "
+          "TC will query the active GPU and use all available shared memory.")
+      .def(
+          "useReadOnlyCache",
+          &tc::CudaMappingOptions::useReadOnlyCache,
+          "Use the readonly cache (i.e. emit __ldg loads)");
 }
 
 } // namespace python
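
Taken together, the tc.cc changes regroup the CudaMappingOptions bindings into generic options followed by CUDA-specific ones, replace toString with a Pythonic __str__, add tile_imperfectly_nested and usePrivateMemory, and drop serialize. A hedged usage sketch, assuming the compiled tc extension module is importable and that CudaMappingOptions can be constructed from a preset name (the constructor is outside this diff, so that call is a guess):

# Sketch only: `tc` is the compiled pybind11 extension from this commit;
# the preset-name constructor below is a guess (not part of this diff).
import tc

opts = tc.CudaMappingOptions("naive")  # hypothetical preset constructor

# Generic options (first group in the diff).
opts.scheduleFusionStrategy("Max")
opts.fixParametersBeforeScheduling(True)
opts.tile([16, 16, 4])          # pybind11 converts the list to a vector
opts.tile_imperfectly_nested(True)
opts.unroll(128)                # at most 128 unrolled statements
opts.matchLibraryCalls(True)

# CUDA-specific options (second group in the diff).
opts.mapToThreads([32, 8])      # argument shape assumed; the binding is
                                # truncated in the diff shown above
opts.useSharedMemory(True)
opts.usePrivateMemory(True)
opts.unrollCopyShared(True)
opts.maxSharedMemory(48 * 1024) # bytes
opts.useReadOnlyCache(True)

# The new __str__ binding prints the underlying protobuf in text format.
print(opts)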
