Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 384873a

Browse files
[C++ API] Consolidate autotuner/utils
This changeset consolidates autotuner/utils/* by moving the remaining functionality into autotuner/utils.cc|h. This prepares for the final refactoring step and reduces the delta in the last commit. It also helps centralize some current dependencies of autotuner/utils on cuda_mapping_options.h, cuda_compilation_cache.h, dlpack.h and canonicalize.h. These dependencies will go away in the last refactoring commit. Lastly, this commit removes some dead code related to the compilation and options caches, but it cannot remove all such code because some of it is still needed until we switch to the new API (in particular getOptionsAndMedianRuntimes, which will go away in a later commit).
1 parent 74a81c6 commit 384873a

13 files changed

+156
-214
lines changed

tc/autotuner/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ if (WITH_CUDA)
99
genetic_search.cc
1010
genetic_tuning_harness.cc
1111
parameters.cc
12-
utils/printer.cc
13-
utils/utils.cc)
12+
utils.cc)
1413

1514
target_include_directories(tc_autotuner PUBLIC ${PROJECT_SOURCE_DIR}/include)
1615
target_link_libraries(tc_autotuner PUBLIC ${ATEN_LIBRARIES} tc_cuda tc_proto)

tc/autotuner/genetic_autotuner.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <csignal>
2121
#include <thread>
2222

23+
#include "tc/autotuner/utils.h"
2324
#include "tc/core/cuda/cuda_compilation_cache.h"
2425
#include "tc/core/cuda/cuda_tc_executor.h"
2526
#include "tc/core/flags.h"

tc/autotuner/genetic_autotuner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#include <vector>
2121

2222
#include "tc/autotuner/genetic_tuning_harness.h"
23-
#include "tc/autotuner/utils/utils.h"
23+
#include "tc/autotuner/utils.h"
2424
#include "tc/core/execution_engine.h"
2525
#include "tc/lang/parser.h"
2626

tc/autotuner/genetic_autotuner_aten.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include "tc/aten/aten_compiler.h"
2323
#include "tc/autotuner/genetic_autotuner.h"
2424
#include "tc/autotuner/genetic_tuning_harness.h"
25-
#include "tc/autotuner/utils/utils.h"
25+
#include "tc/autotuner/utils.h"
2626
#include "tc/lang/parser.h"
2727

2828
namespace tc {

tc/autotuner/genetic_tuning_harness.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,7 @@
2424
#include <cuda_runtime_api.h>
2525
#include <glog/stl_logging.h>
2626

27-
#include "tc/autotuner/utils/printer.h"
28-
#include "tc/autotuner/utils/utils.h"
27+
#include "tc/autotuner/utils.h"
2928
#include "tc/core/cuda/cuda.h"
3029
#include "tc/core/cuda/cuda_mapping_options_cpp_printer.h"
3130
#include "tc/core/cuda/cuda_tc_executor.h"

tc/autotuner/genetic_tuning_harness.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
#include "tc/autotuner/genetic_search.h"
2626
#include "tc/autotuner/parameters.h"
27-
#include "tc/autotuner/utils/printer.h"
27+
#include "tc/autotuner/utils.h"
2828
#include "tc/core/utils/dlpack.h"
2929
#include "tc/lang/parser.h"
3030

tc/autotuner/utils/utils-inl.h renamed to tc/autotuner/utils-inl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* limitations under the License.
1515
*/
1616
#pragma once
17+
1718
#include <algorithm>
1819
#include <iterator>
1920
#include <type_traits>

tc/autotuner/utils/utils.cc renamed to tc/autotuner/utils.cc

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,36 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16+
#include "tc/autotuner/utils.h"
17+
1618
#include <algorithm>
1719
#include <cmath>
20+
#include <sstream>
21+
22+
#include <glog/stl_logging.h>
1823

19-
#include "tc/aten/aten_compiler.h"
20-
#include "tc/autotuner/utils/utils.h"
2124
#include "tc/core/cuda/cuda_compilation_cache.h"
25+
#include "tc/core/cuda/cuda_mapping_options.h"
26+
#include "tc/core/flags.h"
27+
#include "tc/core/utils/dlpack.h"
2228
#include "tc/core/utils/math.h"
29+
#include "tc/core/utils/time.h"
2330
#include "tc/lang/canonicalize.h"
2431

2532
namespace tc {
2633
namespace autotune {
2734

28-
namespace {
29-
std::vector<std::size_t> firstPowers2(std::size_t n) {
30-
std::vector<std::size_t> powers(n + 1);
35+
std::vector<std::size_t> powers2andCeilDivisors(std::size_t val) {
36+
auto numPowers = static_cast<std::size_t>(std::ceil(std::log2(val)));
37+
// 1. generate the first `numPowers + 1' powers of 2 (2^0 .. 2^numPowers)
38+
std::vector<std::size_t> res(numPowers + 1);
3139
std::size_t p = 1;
32-
std::generate(powers.begin(), powers.end(), [p]() mutable {
40+
std::generate(res.begin(), res.end(), [p]() mutable {
3341
auto old_p = p;
3442
p *= 2;
3543
return old_p;
3644
});
37-
return powers;
38-
}
39-
} // namespace
40-
41-
std::vector<std::size_t> powers2andCeilDivisors(std::size_t val) {
42-
auto res = firstPowers2(static_cast<std::size_t>(std::ceil(std::log2(val))));
45+
// 2. additionally insert ceil(val / powers2)
4346
res.reserve(res.size() * 2);
4447
for (std::size_t i = 0, s = res.size(); i < s; ++i) {
4548
if (res[i] > val) {
@@ -52,6 +55,11 @@ std::vector<std::size_t> powers2andCeilDivisors(std::size_t val) {
5255
return res;
5356
}
5457

58+
struct OptionsWithMedianTime {
59+
CudaMappingOptions options;
60+
Duration medianRuntime;
61+
};
62+
5563
std::vector<OptionsWithMedianTime> getOptionsAndMedianRuntimes(
5664
const lang::CanonicalTcString& id,
5765
const std::vector<const DLTensor*>& inputs,
@@ -109,5 +117,92 @@ llvm::Optional<CudaMappingOptions> getBestOptions(
109117
return llvm::Optional<CudaMappingOptions>{};
110118
}
111119

120+
void Printer::record(Duration runtime) {
121+
std::lock_guard<std::mutex> lock(runtimesMtx_);
122+
runtimes_.push_back(runtime);
123+
}
124+
125+
namespace {
126+
uint64_t toMicroseconds(const Duration& d) {
127+
return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
128+
}
129+
} // namespace
130+
131+
void Printer::printLoop() {
132+
while (true) {
133+
std::this_thread::sleep_for(std::chrono::seconds(1));
134+
135+
std::stringstream ss;
136+
ss << "Iteration " << iteration_;
137+
ss << "\tJobs(Compiled, Evaluated)/total ("
138+
<< std::min(total_, currentCompilationJob_.load()) << ", "
139+
<< std::min(total_, numEvaluations_.load()) << ")/" << total_;
140+
141+
{
142+
std::lock_guard<std::mutex> lock(runtimesMtx_);
143+
if (not runtimes_.empty()) {
144+
std::sort(runtimes_.begin(), runtimes_.end());
145+
auto best = toMicroseconds(runtimes_.front());
146+
auto median = toMicroseconds(runtimes_.at(runtimes_.size() / 2));
147+
auto worst = toMicroseconds(runtimes_.back());
148+
ss << " (best/median/worst)us: " << best << '/' << median << '/'
149+
<< worst;
150+
}
151+
}
152+
// XXX: platform specific erase current line and move cursor to beginning
153+
// of line. Currently works with python/C++ both.
154+
std::cout << "\u001b[2K\r" << ss.str() << std::flush;
155+
LOG_IF(INFO, FLAGS_debug_tuner) << "\u001b[2K\r" << ss.str() << std::endl;
156+
157+
if (stopPrinting_.load()) {
158+
// Print one more time to flush
159+
// XXX: platform specific erase current line and move cursor to begining
160+
// of line. Currently works with python/C++ both.
161+
std::cout << "\u001b[2K\r" << ss.str() << std::flush;
162+
LOG_IF(INFO, FLAGS_debug_tuner) << "\u001b[2K\r" << ss.str() << std::endl;
163+
// commit line so it does not get erased at the next iteration
164+
std::cerr << std::endl;
165+
return;
166+
}
167+
}
168+
}
169+
170+
Printer::Printer(
171+
size_t iteration,
172+
size_t total,
173+
const std::atomic_size_t& currentCompilationJob,
174+
const std::atomic_size_t& numEvaluations)
175+
: iteration_(iteration),
176+
printerThread_([this]() { printLoop(); }),
177+
total_(total),
178+
currentCompilationJob_(currentCompilationJob),
179+
numEvaluations_(numEvaluations) {}
180+
181+
Printer::~Printer() {
182+
stop();
183+
printerThread_.join();
184+
}
185+
186+
void Printer::stop() {
187+
stopPrinting_.store(true);
188+
}
189+
190+
void Printer::printAll() {
191+
auto runtimes = [this]() {
192+
std::lock_guard<std::mutex> lock(runtimesMtx_);
193+
std::sort(runtimes_.begin(), runtimes_.end());
194+
std::vector<uint64_t> runtimes;
195+
runtimes.reserve(runtimes_.size());
196+
std::transform(
197+
runtimes_.begin(),
198+
runtimes_.end(),
199+
std::back_inserter(runtimes),
200+
toMicroseconds);
201+
return runtimes;
202+
}();
203+
LOG_IF(INFO, FLAGS_debug_tuner)
204+
<< "\n [TUNER][ITERATION LOG] median times of each candidate (in us) "
205+
<< runtimes << std::endl;
206+
}
112207
} // namespace autotune
113208
} // namespace tc

tc/autotuner/utils/utils.h renamed to tc/autotuner/utils.h

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,21 @@
1414
* limitations under the License.
1515
*/
1616
#pragma once
17+
18+
#include <atomic>
19+
#include <iostream>
20+
#include <mutex>
21+
#include <numeric>
22+
#include <thread>
1723
#include <vector>
1824

19-
#include "tc/core/cuda/cuda.h"
25+
#include <llvm/ADT/Optional.h>
26+
27+
#include "tc/core/cuda/cuda_compilation_cache.h"
2028
#include "tc/core/cuda/cuda_mapping_options.h"
2129
#include "tc/core/utils/dlpack.h"
30+
#include "tc/core/utils/time.h"
2231
#include "tc/lang/canonicalize.h"
23-
#include "tc/lang/tree.h"
24-
25-
#include <llvm/ADT/Optional.h>
2632

2733
namespace tc {
2834
namespace autotune {
@@ -35,10 +41,6 @@ std::vector<std::size_t> powers2andCeilDivisors(std::size_t val);
3541
template <typename Vector, typename... Vectors>
3642
Vector mergeVectors(Vector&& v, Vectors&&... vs);
3743

38-
/// The following API allows interacting with the autotuner caches.
39-
/// Caches generally take arbitrary strings for keys.
40-
/// The autotuner uses a canonicalized TC expression to load / store into
41-
/// caches. Add a layer of type safety to interact with these.
4244
std::vector<CudaMappingOptions> restoreCandidates(
4345
const lang::CanonicalTcString& tc,
4446
const std::vector<const DLTensor*>& inputs,
@@ -49,16 +51,38 @@ llvm::Optional<CudaMappingOptions> getBestOptions(
4951
const std::vector<const DLTensor*>& inputs,
5052
const std::vector<const DLTensor*>& outputs);
5153

52-
struct OptionsWithMedianTime {
53-
CudaMappingOptions options;
54-
Duration medianRuntime;
55-
};
54+
/**
55+
* Helper class to pretty print autotuning progress
56+
*/
57+
class Printer {
58+
public:
59+
Printer(
60+
size_t iteration,
61+
size_t total,
62+
const std::atomic_size_t& currentCompilationJob,
63+
const std::atomic_size_t& numEvaluations);
64+
~Printer();
5665

57-
std::vector<OptionsWithMedianTime> getOptionsAndMedianRuntimes(
58-
const lang::CanonicalTcString& id,
59-
const std::vector<const DLTensor*>& inputs);
66+
void record(Duration runtime);
67+
void stop();
68+
69+
void printAll();
70+
71+
private:
72+
void printLoop();
6073

74+
size_t iteration_;
75+
std::vector<Duration> runtimes_;
76+
mutable std::mutex runtimesMtx_;
77+
78+
std::atomic_bool stopPrinting_{false};
79+
std::thread printerThread_;
80+
81+
const size_t total_;
82+
const std::atomic_size_t& currentCompilationJob_;
83+
const std::atomic_size_t& numEvaluations_;
84+
};
6185
} // namespace autotune
6286
} // namespace tc
6387

64-
#include "tc/autotuner/utils/utils-inl.h"
88+
#include "tc/autotuner/utils-inl.h"

0 commit comments

Comments
 (0)