From fb8e8e268245b89480d2f67b932163fb35a0615f Mon Sep 17 00:00:00 2001
From: Adam Niederer
Date: Sun, 9 Jun 2024 23:28:11 -0400
Subject: [PATCH 1/2] Fix two warnings in build script

---
 llama-cpp-sys-2/build.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 010e7fed..07e0e4ff 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -507,8 +507,7 @@ fn compile_metal(cx: &mut Build, cxx: &mut Build) {
     let common = LLAMA_PATH.join("ggml-common.h");
 
     let input_file = File::open(ggml_metal_shader_path).expect("Failed to open input file");
-    let mut output_file =
-        File::create(&ggml_metal_shader_out_path).expect("Failed to create output file");
+    let output_file = File::create(&ggml_metal_shader_out_path).expect("Failed to create output file");
 
     let output = Command::new("sed")
         .arg("-e")
@@ -656,7 +655,7 @@ fn main() {
     push_warn_flags(&mut cx, &mut cxx);
     push_feature_flags(&mut cx, &mut cxx);
 
-    let feat_lib = if cfg!(feature = "vulkan") {
+    let _feat_lib = if cfg!(feature = "vulkan") {
         Some(compile_vulkan(&mut cx, &mut cxx))
     } else if cfg!(feature = "cuda") {
         Some(compile_cuda(&mut cx, &mut cxx, featless_cxx))

From cd982221da251dc3a9e1c1dccc8cc0dc09a79fc3 Mon Sep 17 00:00:00 2001
From: Adam Niederer
Date: Sun, 9 Jun 2024 23:35:14 -0400
Subject: [PATCH 2/2] Add hipBLAS feature and fix build script

Attempts to mirror the CMake build steps as closely as possible. The
GPU is detected at runtime, but on my setup it appears to segfault when
loading the model. I'm at a bit of a loss on the segfault, but this is
still strictly better than before.

---
 Cargo.lock                 |  1 +
 llama-cpp-2/Cargo.toml     |  3 ++-
 llama-cpp-2/src/lib.rs     |  1 +
 llama-cpp-sys-2/Cargo.toml |  3 ++-
 llama-cpp-sys-2/build.rs   | 48 +++++++++++++++++++++++---------------
 simple/src/main.rs         |  8 +++----
 6 files changed, 39 insertions(+), 25 deletions(-)
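[Note for testers: text between the diffstat and the first diff is
ignored by git am. Something along these lines should exercise the new
backend; the exact invocation is illustrative, and it assumes a working
ROCm toolchain at /opt/rocm, the default the build script falls back to
when ROCM_PATH is unset:

    ROCM_PATH=/opt/rocm cargo build --features hipblas

Running the simple example with the feature enabled and a model should
then offload layers to the GPU unless --disable-gpu is passed.]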
diff --git a/Cargo.lock b/Cargo.lock
index efa53644..8f839616 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -494,6 +494,7 @@ version = "0.1.56"
 dependencies = [
  "bindgen",
  "cc",
+ "glob",
  "once_cell",
 ]
 
diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index 454276d1..c3b2e940 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -16,9 +16,10 @@ tracing = { workspace = true }
 [features]
 cuda = ["llama-cpp-sys-2/cuda"]
 metal = ["llama-cpp-sys-2/metal"]
+hipblas = ["llama-cpp-sys-2/hipblas"]
 sampler = []
 
-[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies] 
+[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
 llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", features=["metal"], version = "0.1.48" }
 
 [lints]
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
index 52d63c7f..049922fb 100644
--- a/llama-cpp-2/src/lib.rs
+++ b/llama-cpp-2/src/lib.rs
@@ -12,6 +12,7 @@
 //! # Feature Flags
 //!
 //! - `cuda` enables CUDA gpu support.
+//! - `hipblas` enables hipBLAS (ROCm) gpu support (experimental).
 //! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
 use std::ffi::NulError;
 use std::fmt::Debug;
diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 0dae9980..e6323465 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -51,8 +51,9 @@ include = [
 bindgen = { workspace = true }
 cc = { workspace = true, features = ["parallel"] }
 once_cell = "1.19.0"
+glob = "0.3.1"
 
 [features]
 cuda = []
 metal = []
-
+hipblas = []
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 07e0e4ff..720ad9e3 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -1,11 +1,13 @@
-use std::env;
+use std::env::{self, VarError};
 use std::fs::{read_dir, File};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::Command;
+use std::str::FromStr;
 
 use cc::Build;
 use once_cell::sync::Lazy;
+use glob::glob;
 
 // This build file is based on:
 // https://github.com/mdrokz/rust-llama.cpp/blob/master/build.rs
@@ -365,23 +367,16 @@ fn compile_blis(cx: &mut Build) {
 }
 
 fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static str {
-    const DEFAULT_ROCM_PATH_STR: &str = "/opt/rocm/";
+    let rocm_path_str = env::var("ROCM_PATH").or(Ok::<String, VarError>(String::from_str("/opt/rocm/").unwrap())).unwrap();
 
-    let rocm_path_str = env::var("ROCM_PATH")
-        .map_err(|_| DEFAULT_ROCM_PATH_STR.to_string())
-        .unwrap();
-    println!("Compiling HIPBLAS GGML. Using ROCm from {rocm_path_str}");
+    println!("Compiling hipBLAS GGML. Using ROCm from {rocm_path_str}");
 
     let rocm_path = PathBuf::from(rocm_path_str);
     let rocm_include = rocm_path.join("include");
     let rocm_lib = rocm_path.join("lib");
     let rocm_hip_bin = rocm_path.join("bin/hipcc");
 
-    let cuda_lib = "ggml-cuda";
-    let cuda_file = cuda_lib.to_string() + ".cu";
-    let cuda_header = cuda_lib.to_string() + ".h";
-
-    let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUBLAS"];
+    let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUDA"];
     for def in defines {
         cx.define(def, None);
         cxx.define(def, None);
@@ -390,24 +385,39 @@ fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static
     cx.include(&rocm_include);
     cxx.include(&rocm_include);
 
+    let ggml_cuda = glob(LLAMA_PATH.join("ggml-cuda").join("*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_fattn = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("fattn-vec*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_wmma = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("fattn-wmma*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_mmq = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("mmq*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+
     hip.compiler(rocm_hip_bin)
         .std("c++11")
-        .file(LLAMA_PATH.join(cuda_file))
-        .include(LLAMA_PATH.join(cuda_header))
+        .define("LLAMA_CUDA_DMMV_X", Some("32"))
+        .define("LLAMA_CUDA_MMV_Y", Some("1"))
+        .define("LLAMA_CUDA_KQUANTS_ITER", Some("2"))
+        .file(LLAMA_PATH.join("ggml-cuda.cu"))
+        .files(ggml_cuda)
+        .files(ggml_template_fattn)
+        .files(ggml_template_wmma)
+        .files(ggml_template_mmq)
+        .include(LLAMA_PATH.join(""))
+        .include(LLAMA_PATH.join("ggml-cuda"))
         .define("GGML_USE_HIPBLAS", None)
-        .compile(cuda_lib);
+        .define("GGML_USE_CUDA", None)
+        .compile("ggml-cuda");
 
-    println!(
-        "cargo:rustc-link-search=native={}",
-        rocm_lib.to_string_lossy()
-    );
+    println!("cargo:rustc-link-search=native={}", rocm_lib.to_string_lossy());
 
     let rocm_libs = ["hipblas", "rocblas", "amdhip64"];
"rocblas", "amdhip64"]; for lib in rocm_libs { println!("cargo:rustc-link-lib={lib}"); } - cuda_lib + "ggml-cuda" } fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str { diff --git a/simple/src/main.rs b/simple/src/main.rs index 8e6700c8..2fc4d817 100644 --- a/simple/src/main.rs +++ b/simple/src/main.rs @@ -44,7 +44,7 @@ struct Args { #[arg(short = 'o', value_parser = parse_key_val)] key_value_overrides: Vec<(String, ParamOverrideValue)>, /// Disable offloading layers to the gpu - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hipblas"))] #[clap(long)] disable_gpu: bool, #[arg(short = 's', long, help = "RNG seed (default: 1234)")] @@ -124,7 +124,7 @@ fn main() -> Result<()> { model, prompt, file, - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hipblas"))] disable_gpu, key_value_overrides, seed, @@ -138,13 +138,13 @@ fn main() -> Result<()> { // offload all layers to the gpu let model_params = { - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hipblas"))] if !disable_gpu { LlamaModelParams::default().with_n_gpu_layers(1000) } else { LlamaModelParams::default() } - #[cfg(not(feature = "cuda"))] + #[cfg(not(any(feature = "cuda", feature = "hipblas")))] LlamaModelParams::default() };