
Commit 76ae999

Merge pull request #353 from nicolasvasilache/pr/c++-api-refactor-delete-compilation-cache
[C++ API] Step 1: drop manual and compilation caches
2 parents: fadace1 + 63da0fc

18 files changed: +39 −1540 lines

docs/source/framework/pytorch_integration/writing_layers.rst

Lines changed: 0 additions & 42 deletions

@@ -219,48 +219,6 @@ adopt whatever feels more convenient.
     inp = torch.ones(1, 1, 4, 4).cuda()
     out = avgpool(inp)
 
-
-Manually injecting external CUDA code
--------------------------------------
-
-If you have an external efficient CUDA code that you want to use rather than
-the CUDA code that TC generates, you can inject your code easily. For this,
-you need to create a string which has the CUDA code you want to inject and you
-need to pass the name of the kernel and the CUDA code string to the :code:`tc.define`
-call. For example:
-
-.. code-block:: python
-
-    import tensor_comprehensions as tc
-    import torch
-    lang = """
-    def add(float(N) A, float(N) B) -> (output) {
-        output(n) = A(n) + B(n)
-    }
-    """
-
-    cuda_code = """
-    extern "C"{
-    __global__ void my_add(float* __restrict__ output, const float* __restrict__ A, const float* __restrict B)
-    {
-        int t = threadIdx.x;
-        output[t] = A[t] + B[t];
-    }
-    }
-    """
-
-    add = tc.define(lang, name="add", inject_kernel="my_add", cuda_code=cuda_code)
-    a, b = torch.randn(100).cuda(), torch.randn(100).cuda()
-    out = add(a, b, grid=[1, 1, 1], block=[100, 1, 1])
-
-.. note::
-
-    In such cases, please note that TC doesn't modify the injected CUDA kernel. It will
-    simply run the kernel injected as is and TC will also not guarantee the performance
-    of the kernel. User needs to specify the :code:`grid` and :code:`block` values
-    when running the layer and TC will simply use those settings.
-
 Built-in Functions
 ------------------
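Since this commit deletes the inject_kernel/cuda_code path documented above, the only documented workflow left is the one where TC generates the CUDA code itself. For reference, a minimal sketch of that retained path, reusing the `add` definition from the deleted example; the exact `tc.define` call signature at this commit is an assumption based on the retained docs context, not something this diff shows:

    import tensor_comprehensions as tc
    import torch

    lang = """
    def add(float(N) A, float(N) B) -> (output) {
        output(n) = A(n) + B(n)
    }
    """

    # No inject_kernel=/cuda_code= arguments: TC generates the CUDA code.
    add = tc.define(lang, name="add")
    a, b = torch.randn(100).cuda(), torch.randn(100).cuda()
    # Depending on the TC version, mapping options may need to be passed
    # (or autotuned) explicitly; grid/block are no longer caller-supplied.
    out = add(a, b)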

tc/autotuner/genetic_autotuner.cc

Lines changed: 0 additions & 5 deletions

@@ -43,10 +43,8 @@ namespace {
 
 void enableOrLoadCache(const std::string& filename) {
   tc::OptionsCache::enableCache();
-  tc::CudaCache::enableCache();
   if (!filename.empty()) {
     tc::OptionsCache::loadCacheFromProtobuf(tc::makeOptionsFilename(filename));
-    tc::CudaCache::loadCacheFromProtobuf(tc::makeCudaFilename(filename));
   }
 }
 } // namespace
@@ -62,9 +60,6 @@ void GeneticAutotuner::storeCaches(const std::string& filename) {
   tc::OptionsCache::dumpCacheToProtobuf(tc::makeOptionsFilename(filename));
 
   tc::OptionsCache::getCache()->keepOnlyBestCandidates(1);
-  tc::removeFromCudaCacheEntriesNotInOptionsCache(
-      *tc::CudaCache::getCache(), *tc::OptionsCache::getCache());
-  tc::CudaCache::dumpCacheToProtobuf(tc::makeCudaFilename(filename));
 }
 }

tc/autotuner/genetic_autotuner_aten.cc

Lines changed: 0 additions & 1 deletion

@@ -21,7 +21,6 @@
 #include <thread>
 
 #include "tc/core/cuda/cuda.h"
-#include "tc/core/cuda/cuda_compilation_cache.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/flags.h"
 #include "tc/core/scope_guard.h"

tc/autotuner/genetic_tuning_harness.cc

Lines changed: 0 additions & 1 deletion

@@ -27,7 +27,6 @@
 #include "tc/autotuner/utils/printer.h"
 #include "tc/autotuner/utils/utils.h"
 #include "tc/core/cuda/cuda.h"
-#include "tc/core/cuda/cuda_compilation_cache.h"
 #include "tc/core/cuda/cuda_mapping_options_cpp_printer.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/execution_engine.h"

tc/benchmarks/benchmark_fixture.h

Lines changed: 0 additions & 2 deletions

@@ -260,8 +260,6 @@ struct Benchmark : public ::testing::Test {
 
   tc::OptionsCache::enableCache();
   tc::OptionsCache::loadCacheFromProtobuf(cacheFilename + ".options");
-  tc::CudaCache::enableCache();
-  tc::CudaCache::loadCacheFromProtobuf(tc::makeCudaFilename(cacheFilename));
   tc::FLAGS_tuner_gen_restore_number = 1;
 
   tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;

tc/core/cuda/cuda_compilation_cache.cc

Lines changed: 0 additions & 208 deletions

@@ -141,152 +141,6 @@ CachedEntryType* searchKernel(
 }
 } // namespace
 
-////////////////////////////////////////////////////////////////////////////////
-// CudaCache
-////////////////////////////////////////////////////////////////////////////////
-std::shared_ptr<CudaCache>& CudaCache::getGlobalSharedCache() {
-  static std::shared_ptr<CudaCache> cudaCache_;
-  return cudaCache_;
-}
-
-CudaCachedEntry::CudaCachedEntry(
-    const std::string& id,
-    const std::string& kernelSpecializedName,
-    const std::vector<int>& kernelParameters,
-    const Grid& grid,
-    const Block& block,
-    const CudaMappingOptions& mappingOptions,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs,
-    const std::string& cudaSource,
-    const std::string& deviceStr)
-    : key{id,
-          mappingOptions,
-          DLTensorToTensorInfoVector(inputs),
-          DLTensorToTensorInfoVector(outputs),
-          deviceStr,
-          git_version},
-      values{cudaSource, kernelSpecializedName, kernelParameters, grid, block} {
-}
-
-CudaCachedEntry::CudaCachedEntry(const CudaCacheEntryProto& buf)
-    : key{buf.id(),
-          CudaMappingOptions{buf.kernel_options()},
-          ProtoToTensorInfoVector(buf.inputs()),
-          ProtoToTensorInfoVector(buf.outputs()),
-          buf.device_str(),
-          buf.git_version()},
-      values{buf.cuda_source(),
-             buf.specialized_name(),
-             std::vector<int>{buf.parameters().begin(), buf.parameters().end()},
-             Grid(buf.grid_dims()),
-             Block(buf.block_dims())} {}
-
-CudaCache::CudaCache(const CudaCacheProto& buf) {
-  entries_.reserve(buf.entries_size());
-  for (const auto& entry_buf : buf.entries())
-    entries_.emplace_back(entry_buf);
-}
-
-void CudaCache::cacheKernel(CudaCachedEntry&& entry) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberCacheAttemps;
-  auto retrievedEntry = searchKernel(
-      entries_,
-      entry.key.id,
-      entry.key.mappingOptions,
-      entry.key.inputs,
-      entry.key.outputs);
-  if (retrievedEntry) {
-    if (retrievedEntry->values.cudaSource != entry.values.cudaSource or
-        retrievedEntry->values.grid != entry.values.grid or
-        retrievedEntry->values.block != entry.values.block) {
-      throw CacheEntrySameKeyDifferentValue(
-          "CudaCache::CacheKernel: a kernel matching the id, options and "
-          "inputs was previously cached with different cuda source or block "
-          "or grid dimensions.");
-    }
-    return;
-  }
-  entries_.emplace_back(std::move(entry));
-}
-
-std::unique_ptr<CudaCacheRetrievalResult> CudaCache::retrieveKernel(
-    const std::string& id,
-    const CudaMappingOptions& options,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs) const {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberAttemptedRetrievals;
-  auto entry = searchKernel(entries_, id, options, inputs, outputs);
-  if (not entry) {
-    return nullptr;
-  }
-  ++numberSuccessfulRetrievals;
-  return std::unique_ptr<CudaCacheRetrievalResult>(
-      new CudaCacheRetrievalResult{entry->values.cudaSource,
-                                   entry->values.kernelSpecializedName,
-                                   entry->values.kernelParameters,
-                                   entry->values.grid,
-                                   entry->values.block});
-}
-
-void CudaCache::removeEntriesNotInOptionsCache(const OptionsCache& oc) {
-  std::vector<CudaCachedEntry> newEntries;
-  for (const auto& entry : oc) {
-    for (const auto& options : entry.values) {
-      auto cudaEntry = searchKernel(
-          entries_,
-          entry.key.id,
-          options.mappingOptions,
-          entry.key.inputs,
-          entry.key.outputs);
-      if (cudaEntry) {
-        newEntries.push_back(std::move(*cudaEntry));
-      }
-    }
-  }
-  entries_ = std::move(newEntries);
-}
-
-CudaCacheProto CudaCache::toProtobuf() const {
-  CudaCacheProto buf;
-  auto* entriesBuf = buf.mutable_entries();
-  entriesBuf->Reserve(entries_.size());
-  std::transform(
-      entries_.begin(),
-      entries_.end(),
-      google::protobuf::RepeatedPtrFieldBackInserter(entriesBuf),
-      [](const CudaCachedEntry& entry) { return entry.toProtobuf(); });
-  return buf;
-}
-
-CudaCacheEntryProto CudaCachedEntry::toProtobuf() const {
-  CudaCacheEntryProto buf;
-  buf.set_id(key.id);
-  *buf.mutable_kernel_options() = key.mappingOptions.proto();
-  std::transform(
-      key.inputs.begin(),
-      key.inputs.end(),
-      google::protobuf::RepeatedPtrFieldBackInserter(buf.mutable_inputs()),
-      [](const detail::TensorInfo& input) { return input.toProtobuf(); });
-  std::transform(
-      key.outputs.begin(),
-      key.outputs.end(),
-      google::protobuf::RepeatedPtrFieldBackInserter(buf.mutable_outputs()),
-      [](const detail::TensorInfo& output) { return output.toProtobuf(); });
-  buf.set_device_str(key.deviceStr);
-  buf.set_git_version(key.gitVersion);
-
-  buf.set_cuda_source(values.cudaSource);
-  *buf.mutable_grid_dims() = values.grid.view.proto;
-  *buf.mutable_block_dims() = values.block.view.proto;
-  buf.set_specialized_name(values.kernelSpecializedName);
-  WriteProtobufArray(values.kernelParameters, buf.mutable_parameters());
-
-  return buf;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // OptionsCache
 ////////////////////////////////////////////////////////////////////////////////
@@ -566,66 +420,4 @@ void OptionsCache::keepOnlyBestCandidates(size_t numberToKeep) {
   }
 }
 }
-
-////////////////////////////////////////////////////////////////////////////////
-// ManualCudaCache
-////////////////////////////////////////////////////////////////////////////////
-std::shared_ptr<ManualCudaCache>& ManualCudaCache::getGlobalSharedCache() {
-  static std::shared_ptr<ManualCudaCache> manualCudaCache_;
-  return manualCudaCache_;
-}
-
-ManualCudaCachedEntry::ManualCudaCachedEntry(
-    const std::string& id,
-    const std::string& kernelSpecializedName,
-    const std::vector<int>& kernelParameters,
-    const Grid& grid,
-    const Block& block,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs,
-    const std::string& cudaSource,
-    const std::string& deviceStr)
-    : key{id,
-          DLTensorToTensorInfoVector(inputs),
-          DLTensorToTensorInfoVector(outputs),
-          deviceStr,
-          git_version},
-      values{cudaSource, kernelSpecializedName, kernelParameters, grid, block} {
-}
-
-void ManualCudaCache::cacheKernel(ManualCudaCachedEntry&& entry) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberCacheAttemps;
-  auto retrievedEntry =
-      searchKernel(entries_, entry.key.id, entry.key.inputs, entry.key.outputs);
-  if (retrievedEntry) {
-    retrievedEntry->values.grid = entry.values.grid;
-    retrievedEntry->values.block = entry.values.block;
-    retrievedEntry->values.cudaSource = entry.values.cudaSource;
-    retrievedEntry->values.kernelSpecializedName =
-        entry.values.kernelSpecializedName;
-    retrievedEntry->values.kernelParameters = entry.values.kernelParameters;
-    return;
-  }
-  entries_.emplace_back(std::move(entry));
-}
-
-std::unique_ptr<ManualCudaCacheRetrievalResult> ManualCudaCache::retrieveKernel(
-    const std::string& id,
-    const std::vector<const DLTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs) const {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ++numberAttemptedRetrievals;
-  auto entry = searchKernel(entries_, id, inputs, outputs);
-  if (not entry) {
-    return nullptr;
-  }
-  ++numberSuccessfulRetrievals;
-  return std::unique_ptr<ManualCudaCacheRetrievalResult>(
-      new ManualCudaCacheRetrievalResult{entry->values.cudaSource,
-                                         entry->values.kernelSpecializedName,
-                                         entry->values.kernelParameters,
-                                         entry->values.grid,
-                                         entry->values.block});
-}
 } // namespace tc
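For readers skimming the large deletion above: the removed CudaCache implements a guarded-insert cache, keyed by the TC id, mapping options, and input/output shapes, where inserting a different value under an existing key throws CacheEntrySameKeyDifferentValue and a miss on retrieval returns nullptr. A miniature of that pattern in Python (a hypothetical MiniCompilationCache for illustration, not TC API):

    from dataclasses import dataclass, field

    @dataclass
    class MiniCompilationCache:
        # Maps a hashable key (e.g. (tc_id, options, input_shapes)) to a
        # cached compilation result.
        entries: dict = field(default_factory=dict)

        def cache_kernel(self, key, value):
            existing = self.entries.get(key)
            if existing is not None:
                if existing != value:
                    # Mirrors CacheEntrySameKeyDifferentValue above.
                    raise ValueError("same key cached with a different value")
                return  # idempotent re-insert
            self.entries[key] = value

        def retrieve_kernel(self, key):
            # None on a miss, like the nullptr return above.
            return self.entries.get(key)

    cache = MiniCompilationCache()
    cache.cache_kernel(("add", "naive-options", "float(100)"), "kernel-v1")
    assert cache.retrieve_kernel(("add", "naive-options", "float(100)")) == "kernel-v1"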
