Split examples

nicolasvasilache · nicolasvasilache · commit eef8d22f15f6 · 2018-03-29T06:35:59.000-06:00
This changeset splits the examples into small, simple, user-facing examples and longer, more complex benchmarks that perform full autotuning runs.
diff --git a/.jenkins/run_test.sh b/.jenkins/run_test.sh
@@ -12,8 +12,8 @@ source /etc/lsb-release
 
 if [[ "$DISTRIB_RELEASE" == 14.04 ]]; then
   echo "Running TC backend tests"
-  FILTER_OUT=example_MLP_model ./test.sh
-  ./build/examples/example_MLP_model --gtest_filter=-*2LUT*
+  FILTER_OUT=MLP_model ./test.sh
+  ./build/benchmarks/MLP_model --gtest_filter=-*2LUT*
   if [[ $(conda --version | wc -c) -ne 0 ]]; then
     source activate tc-env
     echo "Running TC PyTorch tests"
@@ -28,7 +28,7 @@ if [[ "$DISTRIB_RELEASE" == 16.04 ]]; then
     ./test_python/run_test.sh
   else
     echo "Running TC backend tests"
-    FILTER_OUT=example_MLP_model ./test.sh
-    ./build/examples/example_MLP_model --gtest_filter=-*2LUT*
+    FILTER_OUT=MLP_model ./test.sh
+    ./build/benchmarks/MLP_model --gtest_filter=-*2LUT*
   fi
 fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -258,8 +258,14 @@ endif()
 enable_testing()
 add_subdirectory(test)
 
-if (WITH_CAFFE2 AND WITH_CUDA)
+if (WITH_CUDA)
   add_subdirectory(examples)
 else()
-  message(STATUS "Not building examples, caffe2 or CUDA not available")
+  message(STATUS "Not building examples, CUDA not available")
+endif()
+
+if (WITH_CAFFE2 AND WITH_CUDA)
+  add_subdirectory(benchmarks)
+else()
+  message(STATUS "Not building benchmarks, caffe2 or CUDA not available")
 endif()
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -0,0 +1,48 @@
+include_directories(.)
+include_directories(..)
+include_directories(../../include)
+
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+# Needed to ensure reproducibility (proper cublas version) via call to
+# cublasGetVersion_v2.
+find_library(CUDA_CUBLAS_LIBRARIES cublas
+  PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+find_library(CUDA_CUDNN_LIBRARIES cudnn
+  PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64)
+
+################################################################################
+# Don't ask ... TODO: cleanup gtest
+################################################################################
+include_directories(${PROJECT_SOURCE_DIR}/third-party/googletest/googletest/include)
+set(GTEST_LIBS gtest gtest_main)
+
+################################################################################
+# Benchmarks
+################################################################################
+set(EXAMPLES_FILES
+  batchmatmul
+  group_convolution
+  tmm
+  MLP_model
+)
+foreach(i ${EXAMPLES_FILES})
+  add_executable(${i} ${i}.cc)
+  add_test(${i} ${i})
+  target_link_libraries(
+     ${i}
+     tc_autotuner
+     tc_core
+     tc_c2
+
+     ${CUDA_CUBLAS_LIBRARIES}
+     ${CUDA_CUDNN_LIBRARIES}
+     ${GTEST_LIBS}
+     ${GFLAGS_LIBRARIES}
+     ${GLOG_LIBRARIES}
+
+     ${ATEN_LIBRARIES}
+  )
+endforeach()
diff --git a/benchmarks/MLP_model.cc b/benchmarks/MLP_model.cc
@@ -28,7 +28,7 @@
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
-#include "example_fixture.h"
+#include "benchmark_fixture.h"
 
 #include "tc/c2/context.h"
 #include "tc/core/cuda/cuda.h"
diff --git a/benchmarks/batchmatmul.cc b/benchmarks/batchmatmul.cc
@@ -28,7 +28,7 @@
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
-#include "example_fixture.h"
+#include "benchmark_fixture.h"
 
 #include "tc/c2/context.h"
 #include "tc/core/cuda/cuda.h"
diff --git a/benchmarks/benchmark_fixture.h b/benchmarks/benchmark_fixture.h
diff --git a/benchmarks/group_convolution.cc b/benchmarks/group_convolution.cc
@@ -28,7 +28,7 @@
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
-#include "example_fixture.h"
+#include "benchmark_fixture.h"
 
 #include "tc/c2/context.h"
 #include "tc/core/cuda/cuda.h"
diff --git a/benchmarks/scripts/8.0.sh b/benchmarks/scripts/8.0.sh
diff --git a/benchmarks/scripts/9.0.sh b/benchmarks/scripts/9.0.sh
diff --git a/benchmarks/scripts/AUTOTUNER_COMMANDS b/benchmarks/scripts/AUTOTUNER_COMMANDS
@@ -0,0 +1,14 @@
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/batchmatmul --gtest_filter="*.TransposedBatchMatMul" --B=500 --K=26 --M=72 --N=26 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/TransposedBatchMatMul_B_500_K_26_M_72_N_26.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/batchmatmul --gtest_filter="*.TransposedBatchMatMul" --B=500 --K=26 --M=72 --N=26 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/TransposedBatchMatMul_B_500_K_26_M_72_N_26.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=4 --F=4 --W=56 --H=56 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_4_F_4_W_56_H_56_KW_3_KH_3.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=4 --F=4 --W=56 --H=56 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_4_F_4_W_56_H_56_KW_3_KH_3.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=8 --F=8 --W=28 --H=28 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_8_F_8_W_28_H_28_KW_3_KH_3.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=8 --F=8 --W=28 --H=28 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_8_F_8_W_28_H_28_KW_3_KH_3.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=16 --F=16 --W=14 --H=14 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_16_F_16_W_14_H_14_KW_3_KH_3.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=16 --F=16 --W=14 --H=14 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_16_F_16_W_14_H_14_KW_3_KH_3.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=32 --F=32 --W=7 --H=7 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_32_F_32_W_7_H_7_KW_3_KH_3.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/group_convolution --gtest_filter="*.GroupConvolution" --N=32 --G=32 --C=32 --F=32 --W=7 --H=7 --KW=3 --KH=3 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/GroupConvolution_N_32_G_32_C_32_F_32_W_7_H_7_KW_3_KH_3.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/tmm --gtest_filter="*.TransposedMatMul" --M=128 --K=32 --N=256 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/TransposedMatMul_M_128_K_32_N_256.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/tmm --gtest_filter="*.TransposedMatMul" --M=128 --K=32 --N=256 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/TransposedMatMul_M_128_K_32_N_256.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/tmm --gtest_filter="*.TransposedMatMul" --M=128 --K=1024 --N=1024 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/TransposedMatMul_M_128_K_1024_N_1024.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/tmm --gtest_filter="*.TransposedMatMul" --M=128 --K=1024 --N=1024 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/TransposedMatMul_M_128_K_1024_N_1024.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/tmm --gtest_filter="*.TransposedMatMul" --M=128 --K=4096 --N=16384 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/TransposedMatMul_M_128_K_4096_N_16384.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/tmm --gtest_filter="*.TransposedMatMul" --M=128 --K=4096 --N=16384 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/TransposedMatMul_M_128_K_4096_N_16384.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.1LUT" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/1LUT_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.1LUT" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/1LUT_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.2LUT" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/2LUT_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.2LUT" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/2LUT_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.C3" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/C3_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.C3" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/C3_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log 2>&1
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.MLP1" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/MLP1_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.MLP1" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/MLP1_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log 2>&1
+# MLP3 can get stuck in a variety of cases, putting it at the end
+echo CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.MLP3" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ > ${LOG_DIR}/autotuner/MLP3_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log && CUDA_LAUNCH_BLOCKING=1 ./build/benchmarks/MLP_model --gtest_filter="*.MLP3" --B=128 --D=64 --L1=50 --E1=10000000 --L2=50 --E2=10000000 --WX=1000 --WY=1024 --M=2000 --N=128 --O=64 --P=32 --Q=2 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_gpus="${TUNER_GPUS}" --save_tuner_proto_prefix=${LOG_DIR}/autotuner/ --tuner_gen_restore_from_proto=0 >> ${LOG_DIR}/autotuner/MLP3_B_128_D_64_L1_50_E1_10000000_L2_50_E2_10000000_WX_1000_WY_1024_M_2000_N_128_O_64_P_32_Q_2.log 2>&1
diff --git a/benchmarks/scripts/autotuner_parallel.sh b/benchmarks/scripts/autotuner_parallel.sh
@@ -13,15 +13,15 @@ export TUNER_GPUS=${TUNER_GPUS:="0,1"}
 export GPU_NAME=$(nvidia-smi -L | head -n 1 | cut -d'(' -f 1 | cut -d':' -f 2 | sed "s/ //g")
 
 export TC_PREFIX=$(git rev-parse --show-toplevel)
-export PREFIX=${TC_PREFIX}/examples/results_$(date +%m%d%y)/${GPU_NAME}
-export LOG_DIR=${TC_PREFIX}/examples/results_$(date +%m%d%y)/${GPU_NAME}/logs/${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}
+export PREFIX=${TC_PREFIX}/benchmarks/results_$(date +%m%d%y)/${GPU_NAME}
+export LOG_DIR=${TC_PREFIX}/benchmarks/results_$(date +%m%d%y)/${GPU_NAME}/logs/${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}
 
 mkdir -p ${LOG_DIR}
 mkdir -p ${LOG_DIR}/autotuner
 chmod -R 777 ${LOG_DIR}
 
-cat ${TC_PREFIX}/examples/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i echo {} > ${LOG_DIR}/COMMAND
-cat ${TC_PREFIX}/examples/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}"
+cat ${TC_PREFIX}/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i echo {} > ${LOG_DIR}/COMMAND
+cat ${TC_PREFIX}/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}"
 
 # Run with:
-# sbatch --array=1-14 ./examples/scripts/autotuner_parallel.sh
+# sbatch --array=1-13 ./benchmarks/scripts/autotuner_parallel.sh
diff --git a/benchmarks/tmm.cc b/benchmarks/tmm.cc
@@ -28,7 +28,7 @@
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
-#include "example_fixture.h"
+#include "benchmark_fixture.h"
 
 #include "tc/c2/context.h"
 #include "tc/core/cuda/cuda.h"
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -4,64 +4,27 @@ include_directories(../../include)
 
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
-# Needed to ensure reproducibility (proper cublas version) via call to
-# cublasGetVersion_v2.
-find_library(CUDA_CUBLAS_LIBRARIES cublas
-  PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-  PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
-find_library(CUDA_CUDNN_LIBRARIES cudnn
-  PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
-  PATH_SUFFIXES lib lib64)
-
 ################################################################################
 # Don't ask ... TODO: cleanup gtest
 ################################################################################
 include_directories(${PROJECT_SOURCE_DIR}/third-party/googletest/googletest/include)
 set(GTEST_LIBS gtest gtest_main)
 
-################################################################################
-# Simple Example
-################################################################################
-set(EXAMPLES_FILES
-  example_tensordot
-)
-foreach(i ${EXAMPLES_FILES})
-  add_executable(${i} ${i}.cc)
-  add_test(${i} ${i})
-  target_link_libraries(
-     ${i}
-
-     tc_autotuner
-     tc_core
-
-     ${GTEST_LIBS}
-     ${GFLAGS_LIBRARIES}
-     ${GLOG_LIBRARIES}
-
-     ${ATEN_LIBRARIES}
-  )
-endforeach()
-
 ################################################################################
 # Examples
 ################################################################################
 set(EXAMPLES_FILES
-  example_batchmatmul
-  example_group_convolution
-  example_tmm
-  example_MLP_model
+  tensordot
 )
 foreach(i ${EXAMPLES_FILES})
   add_executable(${i} ${i}.cc)
   add_test(${i} ${i})
   target_link_libraries(
      ${i}
+
      tc_autotuner
      tc_core
-     tc_c2
 
-     ${CUDA_CUBLAS_LIBRARIES}
-     ${CUDA_CUDNN_LIBRARIES}
      ${GTEST_LIBS}
      ${GFLAGS_LIBRARIES}
      ${GLOG_LIBRARIES}
diff --git a/examples/scripts/AUTOTUNER_COMMANDS b/examples/scripts/AUTOTUNER_COMMANDS
diff --git a/examples/tensordot.cc b/examples/tensordot.cc