@@ -287,24 +287,16 @@ PYBIND11_MODULE(tc, m) {
287
287
" 5. group_conv\n "
288
288
" 6. single_thread" )
289
289
.def (
290
- " maxSharedMemory" ,
291
- &tc::CudaMappingOptions::maxSharedMemory,
292
- " The amount of shared memory to use, in bytes. If not provided, "
293
- " TC will query the active GPU and use all available shared memory." )
294
- .def (
295
- " useSharedMemory" ,
296
- &tc::CudaMappingOptions::useSharedMemory,
297
- " Create block-local copies of data in shared memory when this can "
298
- " leverage data reuse or global memory access coalescing" )
299
- .def (
300
- " unrollCopyShared" ,
301
- &tc::CudaMappingOptions::unrollCopyShared,
302
- " Also unroll the copies to and from shared memory. If an unroll "
303
- " value is not provided, has no effect" )
304
- .def (
305
- " useReadOnlyCache" ,
306
- &tc::CudaMappingOptions::useReadOnlyCache,
307
- " Use the readonly cache (i.e. emit __ldg loads)" )
290
+ " __str__" ,
291
+ [](tc::CudaMappingOptions& instance) {
292
+ std::string str;
293
+ google::protobuf::TextFormat::PrintToString (instance.proto (), &str);
294
+ return str;
295
+ },
296
+ " Returns the CudaMappingOptions as a human-readable string" )
297
+ //
298
+ // Generic options
299
+ //
308
300
.def (
309
301
" scheduleFusionStrategy" ,
310
302
[](tc::CudaMappingOptions& instance, const std::string& type) {
@@ -334,20 +326,16 @@ PYBIND11_MODULE(tc, m) {
334
326
" (Preserve3Coincident) by\n "
335
327
" performing loop fusion and fission. Applies before tiling" )
336
328
.def (
337
- " serialize" ,
338
- [](tc::CudaMappingOptions& instance) {
339
- std::string str = instance.toProtobufSerializedString ();
340
- return py::bytes (str);
341
- },
342
- " Serialize the options to a protobuf string" )
343
- .def (
344
- " toString" ,
345
- [](tc::CudaMappingOptions& instance) {
346
- std::string str;
347
- google::protobuf::TextFormat::PrintToString (instance.proto (), &str);
348
- return str;
329
+ " fixParametersBeforeScheduling" ,
330
+ [](tc::CudaMappingOptions& instance, bool fix) {
331
+ instance.fixParametersBeforeScheduling (fix);
349
332
},
350
- " Returns the CudaMappingOptions as a human-readable string" )
333
+ " Perform automatic loop scheduling taking into account specific "
334
+ " tensor sizes.\n "
335
+ " May produce faster kernels but significantly increases compilation "
336
+ " time.\n "
337
+ " Note that the mapping will be performed for specific tensor sizes "
338
+ " anyway" )
351
339
.def (
352
340
" tile" ,
353
341
// pybind11 has implicit conversion from list -> vector
@@ -356,6 +344,29 @@ PYBIND11_MODULE(tc, m) {
356
344
" Perform loop tiling on the generated code with the given sizes. "
357
345
" Independent of mapping to a\n "
358
346
" grid of thread blocks" )
347
+ .def (
348
+ " tile_imperfectly_nested" ,
349
+ [](tc::CudaMappingOptions& instance, bool tile) {
350
+ instance.tileImperfectlyNested (tile);
351
+ },
352
+ " Allow imperfectly nested loop tiling" )
353
+ .def (
354
+ " unroll" ,
355
+ [](tc::CudaMappingOptions& instance, uint64_t factor) {
356
+ instance.unroll (factor);
357
+ },
358
+ " Perform loop unrolling on the generated code and produce at "
359
+ " most the given number of statements" )
360
+ .def (
361
+ " matchLibraryCalls" ,
362
+ [](tc::CudaMappingOptions& instance, bool match) {
363
+ instance.matchLibraryCalls (match);
364
+ },
365
+ " Replace computation patterns with calls to highly optimized "
366
+ " libraries (such as CUB, CUTLASS) when possible" )
367
+ //
368
+ // CUDA-specific options
369
+ //
359
370
.def (
360
371
" mapToThreads" ,
361
372
[](tc::CudaMappingOptions& instance,
@@ -378,30 +389,29 @@ PYBIND11_MODULE(tc, m) {
378
389
" within the range allowed by CUDA (maximum 2^31-1 for the first "
379
390
" value and 65535 for the second and third)" )
380
391
.def (
381
- " matchLibraryCalls" ,
382
- [](tc::CudaMappingOptions& instance, bool match) {
383
- instance.matchLibraryCalls (match);
384
- },
385
- " Replace computation patterns with calls to highly optimized "
386
- " libraries (such as CUB, CUTLASS) when possible" )
392
+ " useSharedMemory" ,
393
+ &tc::CudaMappingOptions::useSharedMemory,
394
+ " Create block-local copies of data in shared memory when this can "
395
+ " leverage data reuse or global memory access coalescing" )
387
396
.def (
388
- " fixParametersBeforeScheduling" ,
389
- [](tc::CudaMappingOptions& instance, bool fix) {
390
- instance.fixParametersBeforeScheduling (fix);
391
- },
392
- " Perform automatic loop scheduling taking into account specific "
393
- " tensor sizes.\n "
394
- " May produce faster kernels but significantly increases compilation "
395
- " time.\n "
396
- " Note that the mapping will be performed for specific tensor sizes "
397
- " anyway" )
397
+ " usePrivateMemory" ,
398
+ &tc::CudaMappingOptions::usePrivateMemory,
399
+ " Use private memory (registers) when possible" )
400
+
398
401
.def (
399
- " unroll" ,
400
- [](tc::CudaMappingOptions& instance, uint64_t factor) {
401
- instance.unroll (factor);
402
- },
403
- " Perform loop unrolling on the generated code and produce at "
404
- " most the given number of statements" );
402
+ " unrollCopyShared" ,
403
+ &tc::CudaMappingOptions::unrollCopyShared,
404
+ " Also unroll the copies to and from shared memory. If an unroll "
405
+ " value is not provided, has no effect" )
406
+ .def (
407
+ " maxSharedMemory" ,
408
+ &tc::CudaMappingOptions::maxSharedMemory,
409
+ " The amount of shared memory to use, in bytes. If not provided, "
410
+ " TC will query the active GPU and use all available shared memory." )
411
+ .def (
412
+ " useReadOnlyCache" ,
413
+ &tc::CudaMappingOptions::useReadOnlyCache,
414
+ " Use the readonly cache (i.e. emit __ldg loads)" );
405
415
}
406
416
407
417
} // namespace python
0 commit comments