
Commit 523341a

[AOTI] Pass device explicitly (pytorch#858)
Follow-up to pytorch/torchchat#815 to unblock migration to a newer version of PyTorch, where AOTI seems to have lost the ability to error out when one attempts to load a CPU model on GPU; see https://github.com/pytorch/torchchat/actions/runs/9391753397/job/25913830802 for an example. Work around this by adding a `-d ${DEVICE}` option to `aoti_run`.
1 parent 5ff3222

3 files changed: +20 -5 lines


.github/workflows/hqq-dtype.yml

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ jobs:
     python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
     .ci/scripts/check_gibberish ./output_aoti

-   ./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
+   ./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -d ${DEVICE} -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
    cat ./output_runner_aoti
    # .ci/scripts/check_gibberish ./output_runner_aoti --no-extract
@@ -77,7 +77,7 @@ jobs:
     python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
     .ci/scripts/check_gibberish ./output_aoti

-   ./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
+   ./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -d ${DEVICE} -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
    cat ./output_runner_aoti
    # .ci/scripts/check_gibberish ./output_runner_aoti --no-extract

.github/workflows/runner-cuda-dtype.yml

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ jobs:
     python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so

-   ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
+   ./cmake-out/aoti_run /tmp/model.so -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"

    echo "**********************************************"
    echo "******** INT4 HQQ group-wise quantized *******"

runner/run.cpp

Lines changed: 17 additions & 2 deletions
@@ -136,10 +136,10 @@ void build_transformer(

 #ifdef __AOTI_MODEL__
 #ifdef USE_CUDA
-  try {
+  if (aoti_device.type() == torch::kCUDA) {
     t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
     aoti_device = torch::Device(torch::kCUDA);
-  } catch (std::runtime_error& e) {
+  } else {
 #else
   {
 #endif
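
Since the hunk leaves the CPU branch and the `aoti_device` global off-screen, here is a sketch of how the selection logic in `build_transformer` reads after this change. The global's default and the CPU fallback body are assumptions drawn from the surrounding run.cpp, not part of the diff:

    // Assumed pre-existing global that the new -d flag mutates; when -d is
    // omitted it presumably keeps its CPU default.
    torch::Device aoti_device = torch::Device(torch::kCPU);

    #ifdef __AOTI_MODEL__
    #ifdef USE_CUDA
      // Before this change the CUDA runner was constructed inside a try block,
      // and the std::runtime_error thrown when a CPU-exported model was loaded
      // on GPU fell through to the CPU path. Newer PyTorch no longer throws in
      // that case, so the requested device is checked explicitly instead.
      if (aoti_device.type() == torch::kCUDA) {
        t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
        aoti_device = torch::Device(torch::kCUDA);
      } else {
    #else
      {
    #endif
        // CPU fallback: assumed from the surrounding file, not shown in the hunk.
        t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
      }
    #endif // __AOTI_MODEL__

The net effect is that device selection is driven by the caller rather than by whether the CUDA load happens to fail.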
@@ -811,6 +811,7 @@ void error_usage() {
       " -v <int> (optional) vocab size, default is model-specific.\n");
   fprintf(
       stderr, " -l <int> (optional) llama version (2 or 3), default 2.\n");
+  fprintf(stderr, " -d <string> (optional) device(CUDA or CPU) model was exported for\n");
   exit(EXIT_FAILURE);
 }

@@ -880,6 +881,20 @@ int main(int argc, char* argv[]) {
       system_prompt = argv[i + 1];
     } else if (argv[i][1] == 'l') {
       llama_ver = atoi(argv[i + 1]);
+#ifdef __AOTI_MODEL__
+    } else if (argv[i][1] == 'd') {
+#ifdef USE_CUDA
+      if (strcasecmp(argv[i + 1], "CUDA") == 0) {
+        aoti_device = torch::Device(torch::kCUDA);
+      } else
+#endif
+          if (strcasecmp(argv[i + 1], "CPU") == 0) {
+        aoti_device = torch::Device(torch::kCPU);
+      } else {
+        fprintf(stderr, "Unknown device %s", argv[i + 1]);
+        exit(1);
+      }
+#endif
     } else {
       error_usage();
     }
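
The `else` that chains across the `#endif` in this hunk is easy to misread, so the same `-d` handler is spelled out below with annotations; `strcasecmp` is the POSIX case-insensitive comparison from <strings.h>, and `aoti_device` is the global consulted by `build_transformer`:

    } else if (argv[i][1] == 'd') {
    #ifdef USE_CUDA
      // In a CUDA build this branch expands to if / else-if / else; in a
      // CPU-only build the CUDA test drops out and only "CPU" is accepted.
      if (strcasecmp(argv[i + 1], "CUDA") == 0) {  // case-insensitive: "cuda" works too
        aoti_device = torch::Device(torch::kCUDA);
      } else  // deliberately brace-less: chains into the if below when the CUDA test fails
    #endif
      if (strcasecmp(argv[i + 1], "CPU") == 0) {
        aoti_device = torch::Device(torch::kCPU);
      } else {
        fprintf(stderr, "Unknown device %s", argv[i + 1]);
        exit(1);
      }

This is also why the workflow changes above pass `-d ${DEVICE}` (or `-d CUDA`) explicitly: with the try/catch gone, nothing else tells the runner which device the model was exported for.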
