
Commit 63da0fc
[C++ API] Step 1: drop manual and compilation caches
This is the first PR towards implementing the API proposed in #307. While refactoring the OptionsCache, we realized that the compilation caches may have been quite useful in the beginning, but are a premature optimization in the new API. Removing them has multiple benefits:

1. It simplifies the codebase: CRTP + inheritance + custom traversals and serialization are not fun to maintain.
2. It simplifies porting to the new API, where ExecutionEngine is dead.
3. The caches seem to add overhead in the context of cooperative groups.
4. In the new API we can cache executors, which removes the need to rely on compilation caches (see the sketch below).
5. The tuning and options caches are the expensive bit that needs to be memoized.

This seems corroborated by the [discussion](#339 (comment)). Note that some tests in test_tc_mapper need to be temporarily degraded, because we do not have a simple way to extract the generated code before the new API.
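To make point 4 concrete, here is a minimal sketch of executor-level memoization. Everything in it (`ExecutorCache`, `getOrCompile`, the `Executor` stand-in) is a hypothetical illustration under the assumptions of this commit message, not the actual TC API:

```cpp
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

// Stand-in for a compiled executor; the real one would wrap a loaded
// CUDA module plus its launch parameters.
struct Executor {
  std::string tcName;
};

class ExecutorCache {
 public:
  // Return the cached executor for (tcName, inputShapes); compile on miss.
  template <typename CompileFn>
  std::shared_ptr<Executor> getOrCompile(
      const std::string& tcName,
      const std::vector<std::vector<long>>& inputShapes,
      CompileFn compile) {
    std::lock_guard<std::mutex> lock(mtx_);
    std::string key = makeKey(tcName, inputShapes);
    auto it = executors_.find(key);
    if (it != executors_.end()) {
      return it->second; // hit: reuse the live executor, no recompilation
    }
    std::shared_ptr<Executor> exec = compile(); // miss: compile exactly once
    executors_.emplace(std::move(key), exec);
    return exec;
  }

 private:
  // Flatten the TC name and input shapes into a lookup key.
  static std::string makeKey(
      const std::string& tcName,
      const std::vector<std::vector<long>>& shapes) {
    std::string key = tcName;
    for (const auto& shape : shapes) {
      key += '/';
      for (long d : shape) {
        key += std::to_string(d) + 'x';
      }
    }
    return key;
  }

  std::mutex mtx_;
  std::map<std::string, std::shared_ptr<Executor>> executors_;
};
```

Keying on the TC name plus input shapes mirrors what the compilation caches keyed on, but the cached value is the live executor, so nothing needs to be re-serialized or re-compiled within a process.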
1 parent df36d16 commit 63da0fc

18 files changed: +39 -1540 lines

docs/source/framework/pytorch_integration/writing_layers.rst
Lines changed: 0 additions & 42 deletions

@@ -219,48 +219,6 @@ adopt whatever feels more convenient.
     inp = torch.ones(1, 1, 4, 4).cuda()
     out = avgpool(inp)
 
-
-Manually injecting external CUDA code
--------------------------------------
-
-If you have efficient external CUDA code that you want to use rather than
-the CUDA code that TC generates, you can inject your code easily. For this,
-you need to create a string containing the CUDA code you want to inject, and
-pass the name of the kernel and the CUDA code string to the :code:`tc.define`
-call. For example:
-
-.. code-block:: python
-
-    import tensor_comprehensions as tc
-    import torch
-    lang = """
-    def add(float(N) A, float(N) B) -> (output) {
-      output(n) = A(n) + B(n)
-    }
-    """
-
-    cuda_code = """
-    extern "C"{
-    __global__ void my_add(float* __restrict__ output, const float* __restrict__ A, const float* __restrict__ B)
-    {
-      int t = threadIdx.x;
-      output[t] = A[t] + B[t];
-    }
-    }
-    """
-
-    add = tc.define(lang, name="add", inject_kernel="my_add", cuda_code=cuda_code)
-    a, b = torch.randn(100).cuda(), torch.randn(100).cuda()
-    out = add(a, b, grid=[1, 1, 1], block=[100, 1, 1])
-
-.. note::
-
-    In such cases, please note that TC doesn't modify the injected CUDA kernel. It
-    simply runs the injected kernel as is, and TC does not guarantee its
-    performance. The user needs to specify the :code:`grid` and :code:`block` values
-    when running the layer, and TC will simply use those settings.
-
-
 Built-in Functions
 ------------------
 

tc/autotuner/genetic_autotuner.cc
Lines changed: 0 additions & 5 deletions

@@ -43,10 +43,8 @@ namespace {
 
 void enableOrLoadCache(const std::string& filename) {
   tc::OptionsCache::enableCache();
-  tc::CudaCache::enableCache();
   if (!filename.empty()) {
     tc::OptionsCache::loadCacheFromProtobuf(tc::makeOptionsFilename(filename));
-    tc::CudaCache::loadCacheFromProtobuf(tc::makeCudaFilename(filename));
   }
 }
 } // namespace
@@ -62,9 +60,6 @@ void GeneticAutotuner::storeCaches(const std::string& filename) {
   tc::OptionsCache::dumpCacheToProtobuf(tc::makeOptionsFilename(filename));
 
   tc::OptionsCache::getCache()->keepOnlyBestCandidates(1);
-  tc::removeFromCudaCacheEntriesNotInOptionsCache(
-      *tc::CudaCache::getCache(), *tc::OptionsCache::getCache());
-  tc::CudaCache::dumpCacheToProtobuf(tc::makeCudaFilename(filename));
 }
 }
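Point 5 of the commit message is why storeCaches above keeps only the options cache: tuning is the expensive step, and its results are what is worth memoizing. A rough sketch of that idea follows; `SimpleOptionsCache` and `TunedOptions` are illustrative names, not TC types:

```cpp
#include <chrono>
#include <map>
#include <string>

// One autotuning result: the serialized mapping options and the best
// runtime they achieved during tuning.
struct TunedOptions {
  std::string mappingOptions;
  std::chrono::microseconds bestRuntime;
};

class SimpleOptionsCache {
 public:
  // Record a tuning result, keeping only the best candidate per key,
  // analogous to keepOnlyBestCandidates(1) above.
  void record(const std::string& key, TunedOptions candidate) {
    auto it = best_.find(key);
    if (it == best_.end() || candidate.bestRuntime < it->second.bestRuntime) {
      best_[key] = std::move(candidate);
    }
  }

  // Look up previously tuned options so a later run can skip re-tuning.
  const TunedOptions* lookup(const std::string& key) const {
    auto it = best_.find(key);
    return it == best_.end() ? nullptr : &it->second;
  }

 private:
  std::map<std::string, TunedOptions> best_;
};
```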

tc/autotuner/genetic_autotuner_aten.cc
Lines changed: 0 additions & 1 deletion

@@ -21,7 +21,6 @@
 #include <thread>
 
 #include "tc/core/cuda/cuda.h"
-#include "tc/core/cuda/cuda_compilation_cache.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/flags.h"
 #include "tc/core/scope_guard.h"

tc/autotuner/genetic_tuning_harness.cc
Lines changed: 0 additions & 1 deletion

@@ -27,7 +27,6 @@
 #include "tc/autotuner/utils/printer.h"
 #include "tc/autotuner/utils/utils.h"
 #include "tc/core/cuda/cuda.h"
-#include "tc/core/cuda/cuda_compilation_cache.h"
 #include "tc/core/cuda/cuda_mapping_options_cpp_printer.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/execution_engine.h"

tc/benchmarks/benchmark_fixture.h
Lines changed: 0 additions & 2 deletions

@@ -260,8 +260,6 @@ struct Benchmark : public ::testing::Test {
 
     tc::OptionsCache::enableCache();
     tc::OptionsCache::loadCacheFromProtobuf(cacheFilename + ".options");
-    tc::CudaCache::enableCache();
-    tc::CudaCache::loadCacheFromProtobuf(tc::makeCudaFilename(cacheFilename));
     tc::FLAGS_tuner_gen_restore_number = 1;
 
     tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;

tc/core/cuda/cuda_compilation_cache.cc
Lines changed: 0 additions & 208 deletions

@@ -141,152 +141,6 @@ CachedEntryType* searchKernel(
 }
 } // namespace
 
-////////////////////////////////////////////////////////////////////////////////
-// CudaCache
-////////////////////////////////////////////////////////////////////////////////
-std::shared_ptr<CudaCache>& CudaCache::getGlobalSharedCache() {
-  static std::shared_ptr<CudaCache> cudaCache_;
-  return cudaCache_;
-}
-
-CudaCachedEntry::CudaCachedEntry(
-    const std::string& id,
-    const std::string& kernelSpecializedName,
-    const std::vector<int>& kernelParameters,
-    const Grid& grid,
-    const Block& block,
-    const CudaMappingOptions& mappingOptions,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs,
-    const std::string& cudaSource,
-    const std::string& deviceStr)
-    : key{id,
-          mappingOptions,
-          DLTensorToTensorInfoVector(inputs),
-          DLTensorToTensorInfoVector(outputs),
-          deviceStr,
-          git_version},
-      values{cudaSource, kernelSpecializedName, kernelParameters, grid, block} {
-}
-
-CudaCachedEntry::CudaCachedEntry(const CudaCacheEntryProto& buf)
-    : key{buf.id(),
-          CudaMappingOptions{buf.kernel_options()},
-          ProtoToTensorInfoVector(buf.inputs()),
-          ProtoToTensorInfoVector(buf.outputs()),
-          buf.device_str(),
-          buf.git_version()},
-      values{buf.cuda_source(),
-             buf.specialized_name(),
-             std::vector<int>{buf.parameters().begin(), buf.parameters().end()},
-             Grid(buf.grid_dims()),
-             Block(buf.block_dims())} {}
-
-CudaCache::CudaCache(const CudaCacheProto& buf) {
-  entries_.reserve(buf.entries_size());
-  for (const auto& entry_buf : buf.entries())
-    entries_.emplace_back(entry_buf);
-}
-
-void CudaCache::cacheKernel(CudaCachedEntry&& entry) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberCacheAttemps;
-  auto retrievedEntry = searchKernel(
-      entries_,
-      entry.key.id,
-      entry.key.mappingOptions,
-      entry.key.inputs,
-      entry.key.outputs);
-  if (retrievedEntry) {
-    if (retrievedEntry->values.cudaSource != entry.values.cudaSource or
-        retrievedEntry->values.grid != entry.values.grid or
-        retrievedEntry->values.block != entry.values.block) {
-      throw CacheEntrySameKeyDifferentValue(
-          "CudaCache::CacheKernel: a kernel matching the id, options and "
-          "inputs was previously cached with different cuda source or block "
-          "or grid dimensions.");
-    }
-    return;
-  }
-  entries_.emplace_back(std::move(entry));
-}
-
-std::unique_ptr<CudaCacheRetrievalResult> CudaCache::retrieveKernel(
-    const std::string& id,
-    const CudaMappingOptions& options,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs) const {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberAttemptedRetrievals;
-  auto entry = searchKernel(entries_, id, options, inputs, outputs);
-  if (not entry) {
-    return nullptr;
-  }
-  ++numberSuccessfulRetrievals;
-  return std::unique_ptr<CudaCacheRetrievalResult>(
-      new CudaCacheRetrievalResult{entry->values.cudaSource,
-                                   entry->values.kernelSpecializedName,
-                                   entry->values.kernelParameters,
-                                   entry->values.grid,
-                                   entry->values.block});
-}
-
-void CudaCache::removeEntriesNotInOptionsCache(const OptionsCache& oc) {
-  std::vector<CudaCachedEntry> newEntries;
-  for (const auto& entry : oc) {
-    for (const auto& options : entry.values) {
-      auto cudaEntry = searchKernel(
-          entries_,
-          entry.key.id,
-          options.mappingOptions,
-          entry.key.inputs,
-          entry.key.outputs);
-      if (cudaEntry) {
-        newEntries.push_back(std::move(*cudaEntry));
-      }
-    }
-  }
-  entries_ = std::move(newEntries);
-}
-
-CudaCacheProto CudaCache::toProtobuf() const {
-  CudaCacheProto buf;
-  auto* entriesBuf = buf.mutable_entries();
-  entriesBuf->Reserve(entries_.size());
-  std::transform(
-      entries_.begin(),
-      entries_.end(),
-      google::protobuf::RepeatedPtrFieldBackInserter(entriesBuf),
-      [](const CudaCachedEntry& entry) { return entry.toProtobuf(); });
-  return buf;
-}
-
-CudaCacheEntryProto CudaCachedEntry::toProtobuf() const {
-  CudaCacheEntryProto buf;
-  buf.set_id(key.id);
-  *buf.mutable_kernel_options() = key.mappingOptions.proto();
-  std::transform(
-      key.inputs.begin(),
-      key.inputs.end(),
-      google::protobuf::RepeatedPtrFieldBackInserter(buf.mutable_inputs()),
-      [](const detail::TensorInfo& input) { return input.toProtobuf(); });
-  std::transform(
-      key.outputs.begin(),
-      key.outputs.end(),
-      google::protobuf::RepeatedPtrFieldBackInserter(buf.mutable_outputs()),
-      [](const detail::TensorInfo& output) { return output.toProtobuf(); });
-  buf.set_device_str(key.deviceStr);
-  buf.set_git_version(key.gitVersion);
-
-  buf.set_cuda_source(values.cudaSource);
-  *buf.mutable_grid_dims() = values.grid.view.proto;
-  *buf.mutable_block_dims() = values.block.view.proto;
-  buf.set_specialized_name(values.kernelSpecializedName);
-  WriteProtobufArray(values.kernelParameters, buf.mutable_parameters());
-
-  return buf;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // OptionsCache
 ////////////////////////////////////////////////////////////////////////////////
@@ -566,66 +420,4 @@ void OptionsCache::keepOnlyBestCandidates(size_t numberToKeep) {
 }
 }
 }
-
-////////////////////////////////////////////////////////////////////////////////
-// ManualCudaCache
-////////////////////////////////////////////////////////////////////////////////
-std::shared_ptr<ManualCudaCache>& ManualCudaCache::getGlobalSharedCache() {
-  static std::shared_ptr<ManualCudaCache> manualCudaCache_;
-  return manualCudaCache_;
-}
-
-ManualCudaCachedEntry::ManualCudaCachedEntry(
-    const std::string& id,
-    const std::string& kernelSpecializedName,
-    const std::vector<int>& kernelParameters,
-    const Grid& grid,
-    const Block& block,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs,
-    const std::string& cudaSource,
-    const std::string& deviceStr)
-    : key{id,
-          DLTensorToTensorInfoVector(inputs),
-          DLTensorToTensorInfoVector(outputs),
-          deviceStr,
-          git_version},
-      values{cudaSource, kernelSpecializedName, kernelParameters, grid, block} {
-}
-
-void ManualCudaCache::cacheKernel(ManualCudaCachedEntry&& entry) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberCacheAttemps;
-  auto retrievedEntry =
-      searchKernel(entries_, entry.key.id, entry.key.inputs, entry.key.outputs);
-  if (retrievedEntry) {
-    retrievedEntry->values.grid = entry.values.grid;
-    retrievedEntry->values.block = entry.values.block;
-    retrievedEntry->values.cudaSource = entry.values.cudaSource;
-    retrievedEntry->values.kernelSpecializedName =
-        entry.values.kernelSpecializedName;
-    retrievedEntry->values.kernelParameters = entry.values.kernelParameters;
-    return;
-  }
-  entries_.emplace_back(std::move(entry));
-}
-
-std::unique_ptr<ManualCudaCacheRetrievalResult> ManualCudaCache::retrieveKernel(
-    const std::string& id,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs) const {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberAttemptedRetrievals;
-  auto entry = searchKernel(entries_, id, inputs, outputs);
-  if (not entry) {
-    return nullptr;
-  }
-  ++numberSuccessfulRetrievals;
-  return std::unique_ptr<ManualCudaCacheRetrievalResult>(
-      new ManualCudaCacheRetrievalResult{entry->values.cudaSource,
-                                         entry->values.kernelSpecializedName,
-                                         entry->values.kernelParameters,
-                                         entry->values.grid,
-                                         entry->values.block});
-}
 } // namespace tc
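One detail worth noting when reading the removed code above: the two caches had different conflict policies. CudaCache::cacheKernel threw CacheEntrySameKeyDifferentValue when an existing key mapped to a different source or different launch dimensions, while ManualCudaCache::cacheKernel silently overwrote the stored values. A minimal generic illustration of the two policies (a sketch, not TC code):

```cpp
#include <map>
#include <stdexcept>
#include <string>

// Strict policy (as in the removed CudaCache): inserting a different value
// under an existing key is an error.
void insertStrict(
    std::map<std::string, std::string>& cache,
    const std::string& key,
    const std::string& value) {
  auto [it, inserted] = cache.emplace(key, value);
  if (!inserted && it->second != value) {
    throw std::invalid_argument("same key, different value");
  }
}

// Overwrite policy (as in the removed ManualCudaCache): the latest manually
// injected kernel wins.
void insertOverwrite(
    std::map<std::string, std::string>& cache,
    const std::string& key,
    const std::string& value) {
  cache[key] = value; // insert or replace
}
```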
