diff --git a/tc/core/cuda/cuda_rtc.cc b/tc/core/cuda/cuda_rtc.cc
index 9121704da..0a6eaf0cb 100644
--- a/tc/core/cuda/cuda_rtc.cc
+++ b/tc/core/cuda/cuda_rtc.cc
@@ -29,6 +29,7 @@
 #include "tc/core/cuda/cuda_rtc.h"
 #include "tc/core/flags.h"
 #include "tc/core/scope_guard.h"
+#include "tc/core/utils/system.h"
 
 namespace tc {
 std::mutex nvrtc_mutex;
@@ -65,17 +66,6 @@ void checkOrCreateContext() {
 }
 
 namespace {
-static void checkedSystemCall(
-    const std::string& cmd,
-    const std::vector<std::string>& args) {
-  std::stringstream command;
-  command << cmd << " ";
-  for (const auto& s : args) {
-    command << s << " ";
-  }
-  TC_CHECK_EQ(std::system(command.str().c_str()), 0) << command.str();
-}
-
 static std::tuple<int, int, int> getCudaArchitecture() {
   int device, major, minor;
   CUdevice deviceHandle;
@@ -119,7 +109,7 @@ static std::string llvmCompile(
   });
 
   // Compile
-  checkedSystemCall(
+  utils::checkedSystemCall(
       std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/clang++",
       {"-x cuda " + inputFileName,
        "--cuda-device-only",
@@ -134,7 +124,7 @@ static std::string llvmCompile(
        "-o " + outputClangFile});
 
   // Link libdevice before opt
-  checkedSystemCall(
+  utils::checkedSystemCall(
       std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/llvm-link ",
       {outputClangFile,
        std::string(TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR)) +
@@ -143,7 +133,7 @@ static std::string llvmCompile(
        "-o " + outputLinkFile});
 
   // Opt
-  checkedSystemCall(
+  utils::checkedSystemCall(
       std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/opt",
       {"-internalize",
        std::string("-internalize-public-api-list=") + name,
@@ -154,7 +144,7 @@ static std::string llvmCompile(
        std::string("-o ") + outputOptFile});
 
   // Ptx
-  checkedSystemCall(
+  utils::checkedSystemCall(
       std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/llc",
       {std::string("-mcpu=") + arch,
        outputOptFile,
@@ -188,7 +178,7 @@ static std::string nvccCompile(
   // cstdio's std::remove to delete files
   tc::ScopeGuard sgo([&]() { std::remove(outputPtxFile.c_str()); });
 
-  checkedSystemCall(
+  utils::checkedSystemCall(
       std::string(TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR)) + "/bin/nvcc",
       {"-x cu",
        inputFileName,
diff --git a/tc/core/flags.cc b/tc/core/flags.cc
index a5dc9e438..c1645ec30 100644
--- a/tc/core/flags.cc
+++ b/tc/core/flags.cc
@@ -53,8 +53,14 @@ DEFINE_string(
     "compiler flags to set when nvcc is used");
 
 // CPU codegen options
+DEFINE_string(mcpu, "", "see llvm's --mcpu");
 DEFINE_bool(llvm_dump_before_opt, false, "Print IR before optimization");
 DEFINE_bool(llvm_dump_after_opt, false, "Print IR after optimization");
+DEFINE_bool(llvm_dump_asm, false, "Print asm");
+DEFINE_string(
+    llvm_dump_asm_options,
+    "-filetype=asm",
+    "Options used when dumping asm");
 
 DEFINE_uint32(
     benchmark_warmup,
diff --git a/tc/core/flags.h b/tc/core/flags.h
index 38739a2da..0646a14e3 100644
--- a/tc/core/flags.h
+++ b/tc/core/flags.h
@@ -36,9 +36,12 @@ DECLARE_string(cuda_compiler);
 DECLARE_string(llvm_flags);
 DECLARE_string(nvcc_flags);
 
-// llvm codegen
+// CPU codegen options
+DECLARE_string(mcpu);
 DECLARE_bool(llvm_dump_before_opt);
 DECLARE_bool(llvm_dump_after_opt);
+DECLARE_bool(llvm_dump_asm);
+DECLARE_string(llvm_dump_asm_options);
 
 // Used in benchmarking and autotuning
 DECLARE_uint32(benchmark_warmup);
diff --git a/tc/core/polyhedral/codegen_llvm.cc b/tc/core/polyhedral/codegen_llvm.cc
index 4e9e264e1..2389658dc 100644
--- a/tc/core/polyhedral/codegen_llvm.cc
+++ b/tc/core/polyhedral/codegen_llvm.cc
@@ -15,6 +15,11 @@
  */
 #include "tc/core/polyhedral/codegen_llvm.h"
 
+#include <unistd.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
 #include <sstream>
 #include <vector>
 
@@ -26,6 +27,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
@@ -41,7 +43,10 @@
 #include "tc/core/polyhedral/schedule_isl_conversion.h"
 #include "tc/core/polyhedral/scop.h"
 #include "tc/core/scope_guard.h"
+#include "tc/core/utils/cpu.h"
+#include "tc/core/utils/system.h"
 #include "tc/external/isl.h"
+#include "tc/tc_config.h"
 
 #ifndef LLVM_VERSION_MAJOR
 #error LLVM_VERSION_MAJOR not set
@@ -50,9 +55,8 @@
 using namespace Halide;
 
 namespace tc {
-
 namespace polyhedral {
-
+namespace {
 using IteratorMapType = std::unordered_map<std::string, isl::ast_expr>;
 using IteratorMapsType =
     std::unordered_map<isl::id, IteratorMapType, isl::IslIdIslHash>;
@@ -60,7 +64,78 @@ using IteratorMapsType =
 using StmtSubscriptExprMapType =
     std::unordered_map<isl::id, std::vector<isl::ast_expr>, isl::IslIdIslHash>;
 
-namespace {
+// Everything the isl AST generation step hands over to the LLVM emitter.
+struct IslCodegenRes {
+  // Per statement id: iterator name -> AST expression computing its value.
+  IteratorMapsType iteratorMaps;
+  // Per statement id: one AST expression per argument of its Provide.
+  StmtSubscriptExprMapType stmtSubscripts;
+  // The node returned by isl::ast_build::node_from for the schedule.
+  isl::ast_node astNode;
+};
+
+// at_each_domain callback: records, for the statement instance behind `node`,
+// the iterator expressions (into `iteratorMaps`) and the subscript expressions
+// of its Provide (into `stmtSubscripts`), then annotates the node with the
+// statement id. Trips a TC_CHECK if `node` is not a user node, if the
+// statement was already recorded, if its Halide statement is not a Provide or
+// if a subscript does not consist of exactly one piece.
+isl::ast_node collectIteratorMaps(
+    isl::ast_node node,
+    isl::ast_build build,
+    IteratorMapsType& iteratorMaps,
+    const Scop& scop,
+    StmtSubscriptExprMapType& stmtSubscripts) {
+  auto user = node.as<isl::ast_node_user>();
+  TC_CHECK(user);
+  auto expr = user.get_expr().as<isl::ast_expr_op>();
+  auto schedule = build.get_schedule();
+  auto scheduleMap = isl::map::from_union_map(schedule);
+
+  auto stmtId = expr.get_arg(0).as<isl::ast_expr_id>().get_id();
+  TC_CHECK_EQ(0u, iteratorMaps.count(stmtId)) << "entry exists: " << stmtId;
+  // Reverse of the schedule map, as a piecewise multi-affine expression.
+  auto iteratorMap = isl::pw_multi_aff(scheduleMap.reverse());
+  auto tuple = scop.halide.domains.at(stmtId).tuple;
+  auto& stmtIteratorMap = iteratorMaps[stmtId];
+  for (int i = 0; i < tuple.size(); ++i) {
+    auto iteratorExpr = build.expr_from(iteratorMap.get_pw_aff(i));
+    stmtIteratorMap.emplace(tuple.get_id(i).get_name(), iteratorExpr);
+  }
+  auto& subscripts = stmtSubscripts[stmtId];
+  auto provide =
+      scop.halide.statements.at(stmtId).as<Halide::Internal::Provide>();
+  // as<> yields a null pointer for any other statement kind; fail loudly
+  // instead of dereferencing it below.
+  TC_CHECK(provide) << "statement is not a Provide: " << stmtId;
+  for (const auto& e : provide->args) {
+    auto aff = scop.makeIslAffFromStmtExpr(stmtId, e);
+    auto pulled = isl::pw_aff(aff).pullback(iteratorMap);
+    TC_CHECK_EQ(pulled.n_piece(), 1);
+    subscripts.push_back(build.expr_from(pulled));
+  }
+  return node.set_annotation(stmtId);
+}
+
+// Generates the isl AST for the schedule of `scop` and gathers the iterator
+// and subscript expressions of every statement along the way.
+IslCodegenRes codegenISL(const Scop& scop) {
+  IteratorMapsType iteratorMaps;
+  StmtSubscriptExprMapType stmtSubscripts;
+  auto collect = [&iteratorMaps, &scop, &stmtSubscripts](
+                     isl::ast_node n, isl::ast_build b) -> isl::ast_node {
+    return collectIteratorMaps(n, b, iteratorMaps, scop, stmtSubscripts);
+  };
+
+  auto schedule = detail::toIslSchedule(scop.scheduleRoot());
+  auto astBuild = isl::ast_build(schedule.get_ctx());
+  astBuild = astBuild.set_at_each_domain(collect);
+  auto root = scop.scheduleRoot();
+  astBuild = astBuild.set_iterators(Codegen::makeLoopIterators(root));
+  auto astNode = astBuild.node_from(schedule);
+  return {
+      std::move(iteratorMaps), std::move(stmtSubscripts), std::move(astNode)};
+}
 
 thread_local llvm::LLVMContext llvmCtx;
 
@@ -95,6 +156,43 @@ std::vector<int64_t> getTensorSizesWithoutLeadingDim(
   return sizes;
 }
 
+// Target options for LLVM code generation; the values were taken over from
+// Halide. Both switches are fixed: hard float ABI and per-instruction fast
+// math flags, so the four FP math options derived from the latter are false.
+static llvm::TargetOptions makeTargetOptions() {
+  constexpr bool kSoftFloatAbi = false;
+  constexpr bool kPerInstructionFastMath = true;
+  constexpr bool kGlobalFastMath = !kPerInstructionFastMath;
+
+  llvm::TargetOptions opts;
+
+  // Floating point.
+  if (kPerInstructionFastMath) {
+    opts.AllowFPOpFusion = llvm::FPOpFusion::Strict;
+  } else {
+    opts.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  }
+  opts.UnsafeFPMath = kGlobalFastMath;
+  opts.NoInfsFPMath = kGlobalFastMath;
+  opts.NoNaNsFPMath = kGlobalFastMath;
+  opts.HonorSignDependentRoundingFPMathOption = kGlobalFastMath;
+  if (kSoftFloatAbi) {
+    opts.FloatABIType = llvm::FloatABI::Soft;
+  } else {
+    opts.FloatABIType = llvm::FloatABI::Hard;
+  }
+
+  // Everything else.
+  opts.NoZerosInBSS = false;
+  opts.GuaranteedTailCallOpt = false;
+  opts.StackAlignmentOverride = 0;
+  opts.FunctionSections = true;
+  opts.UseInitArray = false;
+  opts.RelaxELFRelocations = false;
+
+  return opts;
+}
+
 static constexpr int kOptLevel = 3;
 
 class CodeGen_TC : public Halide::Internal::CodeGen_X86 {
@@ -112,6 +199,7 @@ class CodeGen_TC : public Halide::Internal::CodeGen_X86 {
     const char* llvm_args[] = {"tc (LLVM argument parsing)", nullptr};
     llvm::cl::ParseCommandLineOptions(
         sizeof(llvm_args) / sizeof(*llvm_args) - 1, llvm_args);
+
     init_context();
     module =
         llvm::make_unique<llvm::Module>("TensorComprehensionsModule", *context);
@@ -194,33 +282,45 @@ class CodeGen_TC : public Halide::Internal::CodeGen_X86 {
   }
 
  public:
-  void optimize_module() {
+  // Runs the function and module optimization pipelines on `module`, set up
+  // through llvm::PassManagerBuilder at kOptLevel with loop and SLP
+  // vectorization enabled. The target information comes from a new target
+  // machine created for the triple, CPU and features of `targetMachine` with
+  // the options of makeTargetOptions().
+  void optimize_module(const llvm::TargetMachine& targetMachine) {
     LOG_IF(INFO, FLAGS_llvm_dump_before_opt)
         << "[LLVM-IR] Before optimization:\n"
         << toString(module.get());
 
-    llvm::legacy::FunctionPassManager functionPassManager(module.get());
-    llvm::legacy::PassManager modulePassManager;
+    std::unique_ptr<llvm::TargetMachine> targetMachineWithOptions(
+        targetMachine.getTarget().createTargetMachine(
+            targetMachine.getTargetTriple().str(),
+            targetMachine.getTargetCPU(),
+            targetMachine.getTargetFeatureString(),
+            makeTargetOptions(),
+            llvm::Reloc::PIC_,
+            llvm::CodeModel::Small,
+            llvm::CodeGenOpt::Aggressive));
+    // createTargetMachine hands back a null pointer when it cannot build a
+    // machine for the target; the unconditional uses below need a valid one.
+    TC_CHECK(targetMachineWithOptions != nullptr)
+        << "Failed to create an llvm::TargetMachine for "
+        << targetMachine.getTargetTriple().str();
 
-    std::unique_ptr<llvm::TargetMachine> targetMachine =
-        Halide::Internal::make_target_machine(*module);
+    llvm::legacy::PassManager modulePassManager;
     modulePassManager.add(llvm::createTargetTransformInfoWrapperPass(
-        targetMachine ? targetMachine->getTargetIRAnalysis()
-                      : llvm::TargetIRAnalysis()));
+        targetMachineWithOptions->getTargetIRAnalysis()));
+
+    llvm::legacy::FunctionPassManager functionPassManager(module.get());
     functionPassManager.add(llvm::createTargetTransformInfoWrapperPass(
-        targetMachine ? targetMachine->getTargetIRAnalysis()
-                      : llvm::TargetIRAnalysis()));
+        targetMachineWithOptions->getTargetIRAnalysis()));
 
     llvm::PassManagerBuilder b;
     b.OptLevel = kOptLevel;
     b.Inliner = llvm::createFunctionInliningPass(b.OptLevel, 0, false);
     b.LoopVectorize = true;
     b.SLPVectorize = true;
-
-    if (targetMachine) {
-      targetMachine->adjustPassManager(b);
-    }
-
+    targetMachineWithOptions->adjustPassManager(b);
     b.populateFunctionPassManager(functionPassManager);
     b.populateModulePassManager(modulePassManager);
 
@@ -229,7 +319,6 @@ class CodeGen_TC : public Halide::Internal::CodeGen_X86 {
     for (llvm::Module::iterator i = module->begin(); i != module->end(); i++) {
       functionPassManager.run(*i);
     }
-
     functionPassManager.doFinalization();
     modulePassManager.run(*module);
 
@@ -291,6 +380,92 @@ Halide::Expr CodeGen_TC::makeHalideExpr(isl::ast_expr expr) {
 }
 
 class LLVMCodegen {
+ public:
+  // Builds, verifies and optimizes the LLVM module computing `scop`.
+  //
+  // The isl AST is generated from `scop`, the kernel signature is created by
+  // createSignature() under the name `specializedName` and the AST is lowered
+  // by emitAst(). Data layout and target triple of the module come from
+  // `targetMachine`, which is also passed to optimize_module(). A module that
+  // fails llvm::verifyModule trips a TC_CHECK.
+  //
+  // With --llvm_dump_asm, the assembly llc generates from the optimized
+  // module is logged as well.
+  LLVMCodegen(
+      const std::string& specializedName,
+      const Scop& scop,
+      const llvm::TargetMachine& targetMachine)
+      : scop_(scop),
+        islCg_(codegenISL(scop_)),
+        iteratorMaps_(islCg_.iteratorMaps),
+        stmtSubscripts_(islCg_.stmtSubscripts),
+        targetMachine(targetMachine),
+        // We don't use Halide to tinker with llvm::Module optimization, so
+        // the Halide target can be whatever.
+        halide_cg(Halide::get_host_target()) {
+    halide_cg.set_context(llvmCtx);
+    halide_cg.init_module();
+    halide_cg.get_module()->setDataLayout(targetMachine.createDataLayout());
+    halide_cg.get_module()->setTargetTriple(
+        targetMachine.getTargetTriple().str());
+    auto entry = createSignature(
+        scop.halide.inputs,
+        scop.halide.outputs,
+        scop.halide.params,
+        specializedName);
+    auto exit = emitAst(islCg_.astNode, entry);
+    halide_cg.get_builder().SetInsertPoint(exit);
+    halide_cg.get_builder().CreateRetVoid();
+
+    TC_CHECK(!llvm::verifyModule(*halide_cg.get_module()))
+        << "LLVM generated module is invalid." << str().c_str();
+
+    halide_cg.optimize_module(targetMachine);
+
+    if (FLAGS_llvm_dump_asm) {
+      dumpAsm();
+    }
+  }
+
+ private:
+  // Logs the assembly that llc produces for the current state of the module.
+  //
+  // The IR is written to <tmp>-opt.ll and compiled into <tmp>.s, <tmp> being
+  // a unique name reserved with mkstemp. All three files are handed to a
+  // tc::ScopeGuard that deletes them. A failing llc invocation trips the
+  // TC_CHECK in utils::checkedSystemCall.
+  void dumpAsm() {
+    // mkstemp overwrites the XXXXXX suffix in place and needs a
+    // NUL-terminated buffer, which &fileName[0] provides.
+    std::string fileName("/tmp/tcXXXXXX");
+    const int fd = mkstemp(&fileName[0]);
+    TC_CHECK_GE(fd, 0);
+    // Only the unique name is of interest: release the descriptor at once.
+    ::close(fd);
+    const std::string optFile = fileName + "-opt.ll";
+    const std::string asmFile = fileName + ".s";
+    // cstdio's std::remove to delete files
+    tc::ScopeGuard sgi([&]() {
+      std::remove(fileName.c_str());
+      std::remove(optFile.c_str());
+      std::remove(asmFile.c_str());
+    });
+    {
+      std::ofstream ostream(optFile, std::ios::binary);
+      ostream << str();
+    }
+    const std::string llcFlags = utils::CPUID::llcFlags();
+    utils::checkedSystemCall(
+        std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/llc",
+        {FLAGS_llvm_dump_asm_options, llcFlags, optFile, "-o " + asmFile});
+
+    std::ifstream is(asmFile);
+    const std::string asmText(
+        (std::istreambuf_iterator<char>(is)),
+        std::istreambuf_iterator<char>());
+    LOG(INFO) << "Dumping asm for: " << llcFlags << "\n" << asmText;
+  }
+
   void collectTensor(const Halide::OutputImageParam& t) {
     auto sizes = getTensorSizesWithoutLeadingDim(t, scop_.context());
     if (not sizes.empty()) {
@@ -321,21 +475,16 @@ class LLVMCodegen {
     }
   }
 
- public:
-  LLVMCodegen(
-      const Scop& scop,
-      const IteratorMapsType& iteratorMaps,
-      const StmtSubscriptExprMapType& stmtSubscripts)
-      : scop_(scop),
-        iteratorMaps_(iteratorMaps),
-        stmtSubscripts_(stmtSubscripts),
-        halide_cg(Halide::Target(
-            Halide::Target::OSUnknown,
-            Halide::Target::X86,
-            64)) {
-    halide_cg.set_context(llvmCtx);
-
-    halide_cg.init_module();
+  llvm::Type* makePtrToArrayType(
+      llvm::Type* baseTy,
+      const std::vector<int64_t>& sizes) {
+    TC_CHECK_GE(sizes.size(), 1u);
+    TC_CHECK(baseTy);
+    llvm::Type* arrTy = llvm::ArrayType::get(baseTy, sizes.back());
+    for (auto s = sizes.rbegin() + 1; s != sizes.rend(); ++s) {
+      arrTy = llvm::ArrayType::get(arrTy, *s);
+    }
+    return arrTy->getPointerTo();
   }
 
   // This creates a signature of the form:
@@ -416,19 +565,6 @@ class LLVMCodegen {
     return nullptr;
   }
 
- private:
-  llvm::Type* makePtrToArrayType(
-      llvm::Type* baseTy,
-      const std::vector<int64_t>& sizes) {
-    TC_CHECK_GE(sizes.size(), 1u);
-    TC_CHECK(baseTy);
-    llvm::Type* arrTy = llvm::ArrayType::get(baseTy, sizes.back());
-    for (auto s = sizes.rbegin() + 1; s != sizes.rend(); ++s) {
-      arrTy = llvm::ArrayType::get(arrTy, *s);
-    }
-    return arrTy->getPointerTo();
-  }
-
   llvm::BasicBlock* emitIf(
       isl::ast_node_if node,
       llvm::BasicBlock* insertionPoint) {
@@ -547,6 +683,7 @@ class LLVMCodegen {
 
  private:
   const Scop& scop_;
+  const IslCodegenRes islCg_;
   const IteratorMapsType& iteratorMaps_;
   const StmtSubscriptExprMapType& stmtSubscripts_;
 
@@ -554,92 +691,16 @@ class LLVMCodegen {
   std::vector<std::string> argNames_;
 
  public:
+  const llvm::TargetMachine& targetMachine;
   CodeGen_TC halide_cg;
 };
-
-struct IslCodegenRes {
-  IteratorMapsType iteratorMaps;
-  StmtSubscriptExprMapType stmtSubscripts;
-  isl::ast_node astNode;
-};
-
-isl::ast_node collectIteratorMaps(
-    isl::ast_node node,
-    isl::ast_build build,
-    IteratorMapsType& iteratorMaps,
-    const Scop& scop,
-    StmtSubscriptExprMapType& stmtSubscripts) {
-  auto user = node.as<isl::ast_node_user>();
-  TC_CHECK(user);
-  auto expr = user.get_expr().as<isl::ast_expr_op>();
-  auto schedule = build.get_schedule();
-  auto scheduleMap = isl::map::from_union_map(schedule);
-
-  auto stmtId = expr.get_arg(0).as<isl::ast_expr_id>().get_id();
-  TC_CHECK_EQ(0u, iteratorMaps.count(stmtId)) << "entry exists: " << stmtId;
-  auto iteratorMap = isl::pw_multi_aff(scheduleMap.reverse());
-  auto tuple = scop.halide.domains.at(stmtId).tuple;
-  auto& stmtIteratorMap = iteratorMaps[stmtId];
-  for (int i = 0; i < tuple.size(); ++i) {
-    auto expr = build.expr_from(iteratorMap.get_pw_aff(i));
-    stmtIteratorMap.emplace(tuple.get_id(i).get_name(), expr);
-  }
-  auto& subscripts = stmtSubscripts[stmtId];
-  auto provide =
-      scop.halide.statements.at(stmtId).as<Halide::Internal::Provide>();
-  for (auto e : provide->args) {
-    const auto& map = iteratorMap;
-    auto aff = scop.makeIslAffFromStmtExpr(stmtId, e);
-    auto pulled = isl::pw_aff(aff).pullback(map);
-    TC_CHECK_EQ(pulled.n_piece(), 1);
-    subscripts.push_back(build.expr_from(pulled));
-  }
-  return node.set_annotation(stmtId);
-}
-
-IslCodegenRes codegenISL(const Scop& scop) {
-  IteratorMapsType iteratorMaps;
-  StmtSubscriptExprMapType stmtSubscripts;
-  auto collect = [&iteratorMaps, &scop, &stmtSubscripts](
-                     isl::ast_node n, isl::ast_build b) -> isl::ast_node {
-    auto& uv = iteratorMaps;
-    return collectIteratorMaps(n, b, uv, scop, stmtSubscripts);
-  };
-
-  auto schedule = detail::toIslSchedule(scop.scheduleRoot());
-  auto astBuild = isl::ast_build(schedule.get_ctx());
-  astBuild = astBuild.set_at_each_domain(collect);
-  auto root = scop.scheduleRoot();
-  astBuild = astBuild.set_iterators(Codegen::makeLoopIterators(root));
-  auto astNode = astBuild.node_from(schedule);
-  return {
-      std::move(iteratorMaps), std::move(stmtSubscripts), std::move(astNode)};
-}
-
 } // namespace
 
 std::unique_ptr<llvm::Module> emitLLVMKernel(
     const std::string& specializedName,
     const Scop& scop,
-    const llvm::DataLayout& dataLayout) {
-  auto islCg = codegenISL(scop);
-  LLVMCodegen cg(scop, islCg.iteratorMaps, islCg.stmtSubscripts);
-  cg.halide_cg.get_module()->setDataLayout(dataLayout);
-  cg.halide_cg.get_module()->setTargetTriple(
-      llvm::EngineBuilder().selectTarget()->getTargetTriple().str());
-  auto entry = cg.createSignature(
-      scop.halide.inputs,
-      scop.halide.outputs,
-      scop.halide.params,
-      specializedName);
-  auto exit = cg.emitAst(islCg.astNode, entry);
-  cg.halide_cg.get_builder().SetInsertPoint(exit);
-  cg.halide_cg.get_builder().CreateRetVoid();
-
-  TC_CHECK(!llvm::verifyModule(*cg.halide_cg.get_module()))
-      << "LLVM generated module is invalid." << cg.str().c_str();
-
-  cg.halide_cg.optimize_module();
+    const llvm::TargetMachine& targetMachine) {
+  LLVMCodegen cg(specializedName, scop, targetMachine);
   return cg.halide_cg.move_module();
 }
 
diff --git a/tc/core/polyhedral/codegen_llvm.h b/tc/core/polyhedral/codegen_llvm.h
index 08c5a9d31..3e84b7aee 100644
--- a/tc/core/polyhedral/codegen_llvm.h
+++ b/tc/core/polyhedral/codegen_llvm.h
@@ -55,7 +55,7 @@ struct Scop;
 std::unique_ptr<llvm::Module> emitLLVMKernel(
     const std::string& specializedName,
     const Scop& scop,
-    const llvm::DataLayout& dataLayout);
+    const llvm::TargetMachine& targetMachine);
 
 // TODO: I want to do something like the following, but compilation was unhappy
 //  using initialize_llvm = Halide::Internal::CodeGen_LLVM::initialize_llvm;
diff --git a/tc/core/polyhedral/llvm_jit.cc b/tc/core/polyhedral/llvm_jit.cc
index ef80fb6dc..00f3f20ec 100644
--- a/tc/core/polyhedral/llvm_jit.cc
+++ b/tc/core/polyhedral/llvm_jit.cc
@@ -31,11 +31,11 @@
 #include "tc/core/check.h"
 #include "tc/core/flags.h"
 #include "tc/core/polyhedral/codegen_llvm.h"
+#include "tc/core/utils/cpu.h"
 
 using namespace llvm;
 
 namespace tc {
-
 Jit::Jit()
     : ES(),
       Resolver(llvm::orc::createLegacyLookupResolver(
@@ -51,7 +51,7 @@ Jit::Jit()
             return nullptr;
           },
           [](Error err) { throw std::runtime_error("Lookup failed!"); })),
-      TM_(EngineBuilder().selectTarget()),
+      TM_(EngineBuilder().setMCPU(utils::CPUID::mcpu()).selectTarget()),
       DL_(TM_->createDataLayout()),
       objectLayer_(
           ES,
@@ -71,8 +71,8 @@ void Jit::addModule(std::shared_ptr<Module> M) {
 std::shared_ptr<Module> Jit::codegenScop(
     const std::string& specializedName,
     const polyhedral::Scop& scop) {
-  std::shared_ptr<Module> mod = emitLLVMKernel(
-      specializedName, scop, getTargetMachine().createDataLayout());
+  std::shared_ptr<Module> mod =
+      emitLLVMKernel(specializedName, scop, getTargetMachine());
   addModule(mod);
   return mod;
 }
diff --git a/tc/core/polyhedral/llvm_jit.h b/tc/core/polyhedral/llvm_jit.h
index 75031d106..843f30b95 100644
--- a/tc/core/polyhedral/llvm_jit.h
+++ b/tc/core/polyhedral/llvm_jit.h
@@ -16,15 +16,12 @@
 #pragma once
 
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/Target/TargetMachine.h"
 
-#if LLVM_VERSION_MAJOR > 6
-#include "llvm/ExecutionEngine/Orc/Core.h"
-#endif
-
 namespace tc {
 
 namespace polyhedral {
@@ -33,10 +30,8 @@ class Scop;
 
 class Jit {
  private:
-#if LLVM_VERSION_MAJOR > 6
   llvm::orc::ExecutionSession ES;
   std::shared_ptr<llvm::orc::SymbolResolver> Resolver;
-#endif
   std::unique_ptr<llvm::TargetMachine> TM_;
   const llvm::DataLayout DL_;
   llvm::orc::RTDyldObjectLinkingLayer objectLayer_;
diff --git a/tc/core/utils/cpu.h b/tc/core/utils/cpu.h
new file mode 100644
index 000000000..81a7e65dd
--- /dev/null
+++ b/tc/core/utils/cpu.h
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cpuid.h>
+
+#include <string>
+#include <tuple>
+#include <unordered_map>
+
+#include "tc/core/check.h"
+#include "tc/core/flags.h"
+
+namespace tc {
+namespace utils {
+
+#define INTEL_ebx 0x756e6547
+#define INTEL_ecx 0x6c65746e
+#define INTEL_edx 0x49656e69
+
+/**
+ * Host CPU identification through the CPUID instruction, used to choose the
+ * CPU name handed to LLVM (-mcpu).
+ *
+ * We start with a reasonable subset of the processors listed in the result
+ * of running the command:
+ *    llvm-as < /dev/null | llc -march=x86-64 -mcpu=help
+ */
+struct CPUID {
+ public:
+  // Reads CPUID leaf 1 into eax/ebx/ecx/edx, which start out zeroed.
+  CPUID() : eax(0), ebx(0), ecx(0), edx(0) {
+    __get_cpuid(1, &eax, &ebx, &ecx, &edx);
+  }
+
+  // True when CPUID leaf 0 reports the INTEL_ebx/ecx/edx vendor words.
+  static bool isIntel() {
+    // Zero-initialized and guarded by the return value: the registers must
+    // not be read when __get_cpuid reports the leaf as unavailable.
+    unsigned int a = 0, b = 0, c = 0, d = 0;
+    return __get_cpuid(0, &a, &b, &c, &d) != 0 && b == INTEL_ebx &&
+        c == INTEL_ecx && d == INTEL_edx;
+  }
+
+  using Stepping = unsigned char;
+  using Model = unsigned char;
+  using Family = unsigned char;
+  using ProcessorType = unsigned char;
+  using ExtendedModel = unsigned char;
+  using ExtendedFamily = unsigned short;
+
+  // Model number combined with the extended model: (em << 4) + m.
+  struct FullModel {
+    FullModel(Model m, ExtendedModel em) {
+      val = (em << 4) + m;
+    }
+    operator unsigned short() {
+      return val;
+    }
+    operator const unsigned short&() const {
+      return val;
+    }
+    unsigned short val;
+  };
+
+  // LLVM CPU names for the known family 6 / extended family 0 models, keyed
+  // by FullModel value.
+  static const std::unordered_map<unsigned short, std::string>&
+  intelFamily6ExtendedFamily0() {
+    static const std::unordered_map<unsigned short, std::string> m{
+        {FullModel(0xD, 0x3), "broadwell"}, // client
+        {FullModel(0x7, 0x4), "broadwell"}, // client
+        {FullModel(0xF, 0x4), "broadwell"}, // server
+        {FullModel(0x6, 0x5), "broadwell"}, // server
+        {FullModel(0x6, 0x6), "cannonlake"}, // client
+        {FullModel(0x6, 0x4), "haswell"}, // client
+        {FullModel(0x5, 0x4), "haswell"}, // client
+        {FullModel(0xC, 0x3), "haswell"}, // client
+        {FullModel(0xF, 0x3), "haswell"}, // server
+        {FullModel(0xA, 0x3), "ivybridge"}, // client
+        {FullModel(0xE, 0x3), "ivybridge"}, // server
+        {FullModel(0xA, 0x2), "sandybridge"}, // client
+        {FullModel(0xD, 0x2), "sandybridge"}, // server
+        {FullModel(0xE, 0x4), "skylake"}, // client
+        {FullModel(0xE, 0x5), "skylake"}, // client
+        {FullModel(0x5, 0x5), "skylake-avx512"}, // server
+        {FullModel(0x5, 0x2), "westmere"}, // client
+        {FullModel(0xC, 0x2), "westmere"}, // server
+        {FullModel(0xF, 0x2), "westmere"}, // server
+    };
+    return m;
+  }
+
+  // Splits eax of CPUID leaf 1 into its bit fields (bit ranges on each line).
+  static std::tuple<
+      Stepping,
+      Model,
+      Family,
+      ProcessorType,
+      ExtendedModel,
+      ExtendedFamily>
+  parseCPU() {
+    CPUID id;
+    return std::make_tuple(
+        static_cast<Stepping>(id.eax & 0x0000000F), // 3:0
+        static_cast<Model>((id.eax >> 4) & 0x0000000F), // 7:4
+        static_cast<Family>((id.eax >> 8) & 0x0000000F), // 11:8
+        static_cast<ProcessorType>((id.eax >> 12) & 0x00000003), // 13:12
+        static_cast<ExtendedModel>((id.eax >> 16) & 0x0000000F), // 19:16
+        static_cast<ExtendedFamily>((id.eax >> 20) & 0x000000FF) // 27:20
+    );
+  }
+
+#define INTEL_FAMILY_6 0x6
+#define INTEL_EXTENDED_FAMILY_0 0x0
+  // Returns the CPU name to pass to LLVM: --mcpu when that flag is set,
+  // otherwise the name detected for the host. Detection only handles Intel
+  // family 6 / extended family 0 (anything else fails a TC_CHECK); models
+  // missing from the table fall back to the generic "x86-64".
+  static std::string mcpu() {
+    if (FLAGS_mcpu.size() > 0) {
+      return FLAGS_mcpu;
+    }
+
+    TC_CHECK(CPUID::isIntel())
+        << "CPU detection only supports Intel; set --mcpu explicitly";
+    const auto parsedCPU = CPUID::parseCPU();
+    const auto model = std::get<1>(parsedCPU);
+    const auto family = std::get<2>(parsedCPU);
+    const auto extendedModel = std::get<4>(parsedCPU);
+    const auto extendedFamily = std::get<5>(parsedCPU);
+    if (family == INTEL_FAMILY_6 && extendedFamily == INTEL_EXTENDED_FAMILY_0) {
+      const unsigned short fullModel = FullModel(model, extendedModel).val;
+      const auto& knownModels = intelFamily6ExtendedFamily0();
+      const auto it = knownModels.find(fullModel);
+      if (it != knownModels.end()) {
+        return it->second;
+      }
+      LOG(ERROR) << "FullModel: " << fullModel << " -> unspecified x86-64";
+      return "x86-64";
+    }
+    // The fields are unsigned char typed: widen them so that they are
+    // streamed as numbers rather than as raw characters.
+    TC_CHECK(false) << "Unsupported family/model/extendedmodel: "
+                    << static_cast<unsigned>(family) << "/"
+                    << static_cast<unsigned>(model) << "/"
+                    << static_cast<unsigned>(extendedModel);
+    return "";
+  }
+
+  // Flags for llc: x86-64 architecture plus the CPU returned by mcpu().
+  static std::string llcFlags() {
+    return std::string("-march=x86-64 -mcpu=") + CPUID::mcpu();
+  }
+
+ public:
+  unsigned int eax;
+  unsigned int ebx;
+  unsigned int ecx;
+  unsigned int edx;
+};
+} // namespace utils
+} // namespace tc
diff --git a/tc/core/utils/system.h b/tc/core/utils/system.h
new file mode 100644
index 000000000..2f72a5268
--- /dev/null
+++ b/tc/core/utils/system.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdlib>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "tc/core/check.h"
+
+namespace tc {
+namespace utils {
+/**
+ * Runs `cmd` followed by `args` (each followed by a single space) through
+ * std::system and fails a TC_CHECK_EQ, reporting the full command line, unless
+ * the call returns 0.
+ *
+ * The command line is handed to std::system verbatim: nothing is quoted or
+ * escaped, so callers must only pass trusted strings.
+ */
+inline void checkedSystemCall(
+    const std::string& cmd,
+    const std::vector<std::string>& args) {
+  std::stringstream command;
+  command << cmd << " ";
+  for (const auto& s : args) {
+    command << s << " ";
+  }
+  const std::string commandLine = command.str();
+  TC_CHECK_EQ(std::system(commandLine.c_str()), 0) << commandLine;
+}
+} // namespace utils
+} // namespace tc